author	Patrick Mooney <pmooney@pfmooney.com>	2019-06-06 20:17:45 +0000
committer	Patrick Mooney <pmooney@pfmooney.com>	2019-07-19 15:33:23 +0000
commit	9e88ade90f654d7c2cdfcec90cface22eaa124c7 (patch)
tree	f23752c2ec4204ecc1e3ba6d4b9a0f1dc5b0021e /usr/src/uts/i86pc
parent	62ae06fb599ccdbf3a97e6c584e0a055e763e2e9 (diff)
download	illumos-joyent-9e88ade90f654d7c2cdfcec90cface22eaa124c7.tar.gz
OS-7843 viona could be split up
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Approved by: Ryan Zezeski <rpz@joyent.com>
Diffstat (limited to 'usr/src/uts/i86pc')
-rw-r--r--	usr/src/uts/i86pc/Makefile.files	6
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona.c	3631
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_hook.c	438
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_impl.h	325
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_main.c	985
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_ring.c	636
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_rx.c	747
-rw-r--r--	usr/src/uts/i86pc/io/viona/viona_tx.c	755
-rw-r--r--	usr/src/uts/i86pc/sys/vmm_drv.h	3
9 files changed, 3894 insertions, 3632 deletions
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index d541e92bf3..a0509bf21d 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -285,7 +285,11 @@ VMM_OBJS += vmm.o \
vmm_support.o \
vmm_zsd.o
-VIONA_OBJS += viona.o
+VIONA_OBJS += viona_main.o \
+ viona_ring.o \
+ viona_rx.o \
+ viona_tx.o \
+ viona_hook.o \
PPT_OBJS += ppt.o
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
deleted file mode 100644
index 80b5b07aaa..0000000000
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ /dev/null
@@ -1,3631 +0,0 @@
-/*
- * Copyright (c) 2013 Chris Torek <torek @ torek net>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * Copyright 2015 Pluribus Networks Inc.
- * Copyright 2019 Joyent, Inc.
- */
-
-/*
- * viona - VirtIO-Net, Accelerated
- *
- * The purpose of viona is to provide high performance virtio-net devices to
- * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
- * DLS/DLD stack.
- *
- * --------------------
- * General Architecture
- * --------------------
- *
- * A single viona instance is comprised of a "link" handle and two "rings".
- * After opening the viona device, it must be associated with a MAC network
- * interface and a bhyve (vmm) instance to form its link resource. This is
- * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
- * passed in to perform the initialization. With the MAC client opened, and a
- * driver handle to the vmm instance established, the device is ready to be
- * configured by the guest.
- *
- * The userspace portion of bhyve, which interfaces with the PCI device
- * emulation framework, is meant to stay out of the datapath if at all
- * possible. Configuration changes made via PCI are mapped to actions which
- * will steer the operation of the in-kernel logic.
- *
- *
- * -----------
- * Ring Basics
- * -----------
- *
- * Each viona link has two viona_vring_t entities, RX and TX, for handling data
- * transfers to and from the guest. They represent an interface to the
- * standard virtio ring structures. When initialized and active, each ring is
- * backed by a kernel worker thread (parented to the bhyve process for the
- * instance) which handles ring events. The RX worker has the simple task of
- * watching for ring shutdown conditions. The TX worker does that in addition
- * to processing all requests to transmit data. Data destined for the guest is
- * delivered directly by MAC to viona_rx() when the ring is active.
- *
- *
- * -----------
- * Ring States
- * -----------
- *
- * The viona_vring_t instances follow a simple path through the possible state
- * values represented in virtio_vring_t`vr_state:
- *
- * +<--------------------------------------------+
- * | |
- * V ^
- * +-----------+ This is the initial state when a link is created or
- * | VRS_RESET | when the ring has been explicitly reset.
- * +-----------+
- * | ^
- * |---* ioctl(VNA_IOC_RING_INIT) issued |
- * | |
- * | ^
- * V
- * +-----------+ The ring parameters (size, guest physical addresses)
- * | VRS_SETUP | have been set and start-up of the ring worker thread
- * +-----------+ has begun.
- * | ^
- * | |
- * |---* ring worker thread begins execution |
- * | |
- * +-------------------------------------------->+
- * | | ^
- * | |
- * | * If ring shutdown is requested (by ioctl or impending
- * | bhyve process death) while the worker thread is
- * | starting, the worker will transition the ring to
- * | VRS_RESET and exit.
- * | ^
- * | |
- * | ^
- * V
- * +-----------+ The worker thread associated with the ring has started
- * | VRS_INIT | executing. It has allocated any extra resources needed
- * +-----------+ for the ring to operate.
- * | ^
- * | |
- * +-------------------------------------------->+
- * | | ^
- * | |
- * | * If ring shutdown is requested while the worker is
- * | waiting in VRS_INIT, it will free any extra resources
- * | and transition to VRS_RESET.
- * | ^
- * | |
- * |--* ioctl(VNA_IOC_RING_KICK) issued |
- * | ^
- * V
- * +-----------+ The worker thread associated with the ring is executing
- * | VRS_RUN | workload specific to that ring.
- * +-----------+
- * | ^
- * |---* ioctl(VNA_IOC_RING_RESET) issued |
- * | (or bhyve process begins exit) |
- * V |
- * +-------------------------------------------->+
- *
- *
- * While the worker thread is not running, changes to vr_state are only made by
- * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
- * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
- * has been started, only it may perform ring state transitions (still under
- * the protection of vr_lock), when requested by outside consumers via
- * vr_state_flags or when the containing bhyve process initiates an exit.
- *
- *
- * ----------------------------
- * Transmission mblk_t Handling
- * ----------------------------
- *
- * For incoming frames destined for a bhyve guest, the data must first land in
- * a host OS buffer from the physical NIC before it is copied into the awaiting
- * guest buffer(s). Outbound frames transmitted by the guest are not bound by
- * this limitation and can avoid extra copying before the buffers are accessed
- * directly by the NIC. When a guest designates buffers to be transmitted,
- * viona translates the guest-physical addresses contained in the ring
- * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is
- * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
- * Doing so increments vr_xfer_outstanding, preventing the ring from being
- * reset (allowing the link to drop its vmm handle to the guest) until all
- * transmit mblks referencing guest memory have been processed. Allocation of
- * the viona_desb_t entries is done during the VRS_INIT stage of the ring
- * worker thread. The ring size informs that allocation as the number of
- * concurrent transmissions is limited by the number of descriptors in the
- * ring. This minimizes allocation in the transmit hot-path by acquiring those
- * fixed-size resources during initialization.
- *
- * This optimization depends on the underlying NIC driver freeing the mblks in
- * a timely manner after they have been transmitted by the hardware. Some
- * drivers have been found to flush TX descriptors only when new transmissions
- * are initiated. This means that there is no upper bound to the time needed
- * for an mblk to be flushed and can stall bhyve guests from shutting down
- * since their memory must be free of viona TX references prior to clean-up.
- *
- * This expectation of deterministic mblk_t processing is likely the reason
- * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
- * loaded will copy transmit data into fresh buffers rather than passing up
- * zero-copy mblks. It is a hold-over from the original viona sources provided
- * by Pluribus and its continued necessity has not been confirmed.
- *
- *
- * ----------------------------
- * Ring Notification Fast-paths
- * ----------------------------
- *
- * Device operation for viona requires that notifications flow to and from the
- * guest to indicate certain ring conditions. In order to minimize latency and
- * processing overhead, the notification procedures are kept in-kernel whenever
- * possible.
- *
- * Guest-to-host notifications, when new available descriptors have been placed
- * in the ring, are posted via the 'queue notify' address in the virtio BAR.
- * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
- * install a callback hook on an ioport address. Guest exits for accesses to
- * viona-hooked ioport addresses will result in direct calls to notify the
- * appropriate ring worker without a trip to userland.
- *
- * Host-to-guest notifications in the form of interrupts enjoy similar
- * acceleration. Each viona ring can be configured to send MSI notifications
- * to the guest as virtio conditions dictate. This in-kernel interrupt
- * configuration is kept synchronized through viona ioctls which are utilized
- * during writes to the associated PCI config registers or MSI-X BAR.
- *
- * Guests which do not utilize MSI-X will result in viona falling back to the
- * slow path for interrupts. It will poll(2) the viona handle, receiving
- * notification when ring events necessitate the assertion of an interrupt.
- *
- *
- * ---------------
- * Nethook Support
- * ---------------
- *
- * Viona provides four nethook events that consumers (e.g. ipf) can hook into
- * to intercept packets as they go up or down the stack. Unfortunately,
- * the nethook framework does not understand raw packets, so we can only
- * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
- * we register callbacks with the neti (netinfo) module that will be invoked
- * for each netstack already present, as well as for any additional netstack
- * instances created as the system operates. These callbacks will
- * register/unregister the hooks with the nethook framework for each
- * netstack instance. This registration occurs prior to creating any
- * viona instances for a given netstack, and the unregistration for a netstack
- * instance occurs after all viona instances of the netstack instance have
- * been deleted.
- */
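A minimal sketch of the link lifecycle described above, as a userspace consumer might drive it. This is illustrative only; the vioc_create_t and vioc_ring_init_t field names, the /dev/viona path, and the header list are assumptions drawn from their use in viona_ioc_create() and viona_ioc_ring_init() later in this file, not a verified bhyve code path.

#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/viona_io.h>

/*
 * Illustrative sketch only: create a viona link atop an existing datalink
 * and bhyve (vmm) instance, then initialize and kick the TX ring.  Error
 * handling is omitted; real ring parameters come from the guest's virtio
 * configuration writes.
 */
int
viona_setup_example(uint32_t linkid, int vmfd, uint64_t tx_ring_gpa)
{
	int fd = open("/dev/viona", O_RDWR);

	vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vmfd };
	(void) ioctl(fd, VNA_IOC_CREATE, &vc);

	vioc_ring_init_t ri = {
		.ri_index = VIONA_VQ_TX,
		.ri_qsize = 1024,	/* power of two, <= VRING_MAX_LEN */
		.ri_qaddr = tx_ring_gpa, /* guest-physical ring base */
	};
	(void) ioctl(fd, VNA_IOC_RING_INIT, &ri);

	/* Wake the TX worker; equivalent to a guest 'queue notify' write. */
	(void) ioctl(fd, VNA_IOC_RING_KICK, VIONA_VQ_TX);
	return (fd);
}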
-
-#include <sys/conf.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/ddi.h>
-#include <sys/disp.h>
-#include <sys/sunddi.h>
-#include <sys/sunndi.h>
-#include <sys/sysmacros.h>
-#include <sys/strsubr.h>
-#include <sys/strsun.h>
-#include <vm/seg_kmem.h>
-#include <sys/smt.h>
-
-#include <sys/pattr.h>
-#include <sys/dls.h>
-#include <sys/dlpi.h>
-#include <sys/hook.h>
-#include <sys/hook_event.h>
-#include <sys/list.h>
-#include <sys/mac_client.h>
-#include <sys/mac_provider.h>
-#include <sys/mac_client_priv.h>
-#include <sys/neti.h>
-#include <sys/vlan.h>
-#include <inet/ip.h>
-#include <inet/ip_impl.h>
-#include <inet/tcp.h>
-
-#include <sys/vmm_drv.h>
-#include <sys/viona_io.h>
-
-/* Min. octets in an ethernet frame minus FCS */
-#define MIN_BUF_SIZE 60
-#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ)
-
-#define VIONA_NAME "Virtio Network Accelerator"
-#define VIONA_CTL_MINOR 0
-#define VIONA_CLI_NAME "viona" /* MAC client name */
-#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \
- IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH)
-
-#define VTNET_MAXSEGS 32
-
-#define VRING_ALIGN 4096
-#define VRING_MAX_LEN 32768
-
-#define VRING_DESC_F_NEXT (1 << 0)
-#define VRING_DESC_F_WRITE (1 << 1)
-#define VRING_DESC_F_INDIRECT (1 << 2)
-
-#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0)
-#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1)
-
-#define VIRTIO_NET_HDR_GSO_NONE 0
-#define VIRTIO_NET_HDR_GSO_TCPV4 1
-
-#define VRING_AVAIL_F_NO_INTERRUPT 1
-
-#define VRING_USED_F_NO_NOTIFY 1
-
-#define BNXE_NIC_DRIVER "bnxe"
-
-/*
- * Feature bits. See section 5.1.3 of the VIRTIO 1.0 spec.
- */
-#define VIRTIO_NET_F_CSUM (1 << 0)
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24)
-#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28)
-#define VIRTIO_F_RING_EVENT_IDX (1 << 29)
-
-/*
- * Host capabilities.
- */
-#define VIONA_S_HOSTCAPS ( \
- VIRTIO_NET_F_GUEST_CSUM | \
- VIRTIO_NET_F_MAC | \
- VIRTIO_NET_F_GUEST_TSO4 | \
- VIRTIO_NET_F_MRG_RXBUF | \
- VIRTIO_NET_F_STATUS | \
- VIRTIO_F_RING_NOTIFY_ON_EMPTY | \
- VIRTIO_F_RING_INDIRECT_DESC)
-
-/* MAC_CAPAB_HCKSUM specifics of interest */
-#define VIONA_CAP_HCKSUM_INTEREST \
- (HCKSUM_INET_PARTIAL | \
- HCKSUM_INET_FULL_V4 | \
- HCKSUM_INET_FULL_V6)
-
-
-#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name)
-#define VIONA_PROBE1(name, arg1, arg2) \
- DTRACE_PROBE1(viona__##name, arg1, arg2)
-#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \
- DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4)
-#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \
- DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6)
-#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \
- arg9, arg10) \
- DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
- arg8, arg9, arg10)
-#define VIONA_PROBE_BAD_RING_ADDR(r, a) \
- VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a))
-
-#define VIONA_RING_STAT_INCR(r, name) \
- (((r)->vr_stats.rs_ ## name)++)
-
-#pragma pack(1)
-struct virtio_desc {
- uint64_t vd_addr;
- uint32_t vd_len;
- uint16_t vd_flags;
- uint16_t vd_next;
-};
-#pragma pack()
-
-#pragma pack(1)
-struct virtio_used {
- uint32_t vu_idx;
- uint32_t vu_tlen;
-};
-#pragma pack()
-
-#pragma pack(1)
-struct virtio_net_mrgrxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-};
-struct virtio_net_hdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
-};
-#pragma pack()
-
-struct viona_link;
-typedef struct viona_link viona_link_t;
-struct viona_desb;
-typedef struct viona_desb viona_desb_t;
-struct viona_neti;
-typedef struct viona_neti viona_neti_t;
-
-enum viona_ring_state {
- VRS_RESET = 0x0, /* just allocated or reset */
- VRS_SETUP = 0x1, /* addrs setup and starting worker thread */
- VRS_INIT = 0x2, /* worker thread started & waiting to run */
- VRS_RUN = 0x3, /* running work routine */
-};
-enum viona_ring_state_flags {
- VRSF_REQ_START = 0x1, /* start running from INIT state */
- VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */
- VRSF_RENEW = 0x4, /* ring renewing lease */
-};
-
-#define VRING_NEED_BAIL(ring, proc) \
- (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \
- ((proc)->p_flag & SEXITING) != 0)
-
-#define VNETHOOK_INTERESTED_IN(neti) \
- (neti)->vni_nethook.vnh_event_in.he_interested
-#define VNETHOOK_INTERESTED_OUT(neti) \
- (neti)->vni_nethook.vnh_event_out.he_interested
-
-typedef struct viona_vring {
- viona_link_t *vr_link;
-
- kmutex_t vr_lock;
- kcondvar_t vr_cv;
- uint16_t vr_state;
- uint16_t vr_state_flags;
- uint_t vr_xfer_outstanding;
- kthread_t *vr_worker_thread;
- vmm_lease_t *vr_lease;
-
- /* ring-sized resources for TX activity */
- viona_desb_t *vr_txdesb;
- struct iovec *vr_txiov;
-
- uint_t vr_intr_enabled;
- uint64_t vr_msi_addr;
- uint64_t vr_msi_msg;
-
- /* Internal ring-related state */
- kmutex_t vr_a_mutex; /* sync consumers of 'avail' */
- kmutex_t vr_u_mutex; /* sync consumers of 'used' */
- uint64_t vr_pa;
- uint16_t vr_size;
- uint16_t vr_mask; /* cached from vr_size */
- uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */
-
- /* Host-context pointers to the queue */
- volatile struct virtio_desc *vr_descr;
-
- volatile uint16_t *vr_avail_flags;
- volatile uint16_t *vr_avail_idx;
- volatile uint16_t *vr_avail_ring;
- volatile uint16_t *vr_avail_used_event;
-
- volatile uint16_t *vr_used_flags;
- volatile uint16_t *vr_used_idx;
- volatile struct virtio_used *vr_used_ring;
- volatile uint16_t *vr_used_avail_event;
-
- /* Per-ring error condition statistics */
- struct viona_ring_stats {
- uint64_t rs_ndesc_too_high;
- uint64_t rs_bad_idx;
- uint64_t rs_indir_bad_len;
- uint64_t rs_indir_bad_nest;
- uint64_t rs_indir_bad_next;
- uint64_t rs_no_space;
- uint64_t rs_too_many_desc;
- uint64_t rs_desc_bad_len;
-
- uint64_t rs_bad_ring_addr;
-
- uint64_t rs_fail_hcksum;
- uint64_t rs_fail_hcksum6;
- uint64_t rs_fail_hcksum_proto;
-
- uint64_t rs_bad_rx_frame;
- uint64_t rs_rx_merge_overrun;
- uint64_t rs_rx_merge_underrun;
- uint64_t rs_rx_pad_short;
- uint64_t rs_rx_mcast_check;
- uint64_t rs_too_short;
- uint64_t rs_tx_absent;
-
- uint64_t rs_rx_hookdrop;
- uint64_t rs_tx_hookdrop;
- } vr_stats;
-} viona_vring_t;
-
-struct viona_link {
- vmm_hold_t *l_vm_hold;
- boolean_t l_destroyed;
-
- viona_vring_t l_vrings[VIONA_VQ_MAX];
-
- uint32_t l_features;
- uint32_t l_features_hw;
- uint32_t l_cap_csum;
- boolean_t l_force_tx_copy;
-
- uintptr_t l_notify_ioport;
- void *l_notify_cookie;
-
- datalink_id_t l_linkid;
- mac_handle_t l_mh;
- mac_client_handle_t l_mch;
- mac_promisc_handle_t l_mph;
-
- pollhead_t l_pollhead;
-
- viona_neti_t *l_neti;
-};
-
-typedef struct viona_nethook {
- net_handle_t vnh_neti;
- hook_family_t vnh_family;
- hook_event_t vnh_event_in;
- hook_event_t vnh_event_out;
- hook_event_token_t vnh_token_in;
- hook_event_token_t vnh_token_out;
- boolean_t vnh_hooked;
-} viona_nethook_t;
-
-struct viona_neti {
- list_node_t vni_node;
-
- netid_t vni_netid;
- zoneid_t vni_zid;
-
- viona_nethook_t vni_nethook;
-
- kmutex_t vni_lock; /* Protects remaining members */
- kcondvar_t vni_ref_change; /* Protected by vni_lock */
- uint_t vni_ref; /* Protected by vni_lock */
- list_t vni_dev_list; /* Protected by vni_lock */
-};
-
-struct viona_desb {
- frtn_t d_frtn;
- viona_vring_t *d_ring;
- uint_t d_ref;
- uint32_t d_len;
- uint16_t d_cookie;
- uchar_t *d_headers;
-};
-
-typedef struct viona_soft_state {
- kmutex_t ss_lock;
- viona_link_t *ss_link;
- list_node_t ss_node;
-} viona_soft_state_t;
-
-typedef struct used_elem {
- uint16_t id;
- uint32_t len;
-} used_elem_t;
-
-static void *viona_state;
-static dev_info_t *viona_dip;
-static id_space_t *viona_minors;
-static mblk_t *viona_vlan_pad_mp;
-
-/*
- * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock
- */
-static kmutex_t viona_neti_lock;
-static list_t viona_neti_list;
-
-/*
- * viona_neti is allocated and initialized during attach, and read-only
- * until detach (where it's also freed)
- */
-static net_instance_t *viona_neti;
-
-/*
- * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
- * transmission to free resources.
- */
-static kmutex_t viona_force_copy_lock;
-static enum viona_force_copy {
- VFC_UNINITALIZED = 0,
- VFC_COPY_UNEEDED = 1,
- VFC_COPY_REQUIRED = 2,
-} viona_force_copy_state = VFC_UNINITALIZED;
-
-static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
- void **result);
-static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
-static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
-static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
-static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
-static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
- cred_t *credp, int *rval);
-static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
- struct pollhead **phpp);
-
-static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
-static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
-
-static void *viona_gpa2kva(viona_vring_t *, uint64_t, size_t);
-
-static void viona_ring_alloc(viona_link_t *, viona_vring_t *);
-static void viona_ring_free(viona_vring_t *);
-static int viona_ring_reset(viona_vring_t *, boolean_t);
-static kthread_t *viona_create_worker(viona_vring_t *);
-static boolean_t viona_ring_map(viona_vring_t *);
-static void viona_ring_unmap(viona_vring_t *);
-
-static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t);
-static int viona_ioc_ring_init(viona_link_t *, void *, int);
-static int viona_ioc_ring_reset(viona_link_t *, uint_t);
-static int viona_ioc_ring_kick(viona_link_t *, uint_t);
-static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
-static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
-static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
-
-static void viona_intr_ring(viona_vring_t *);
-
-static void viona_desb_release(viona_desb_t *);
-static void viona_rx_classified(void *, mac_resource_handle_t, mblk_t *,
- boolean_t);
-static void viona_rx_mcast(void *, mac_resource_handle_t, mblk_t *, boolean_t);
-static void viona_tx_wait_outstanding(viona_vring_t *);
-static void viona_tx(viona_link_t *, viona_vring_t *);
-
-static viona_neti_t *viona_neti_lookup_by_zid(zoneid_t);
-static void viona_neti_rele(viona_neti_t *);
-
-static void *viona_neti_create(const netid_t);
-static void viona_neti_shutdown(const netid_t, void *);
-static void viona_neti_destroy(const netid_t, void *);
-
-static int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t);
-
-static struct cb_ops viona_cb_ops = {
- viona_open,
- viona_close,
- nodev,
- nodev,
- nodev,
- nodev,
- nodev,
- viona_ioctl,
- nodev,
- nodev,
- nodev,
- viona_chpoll,
- ddi_prop_op,
- 0,
- D_MP | D_NEW | D_HOTPLUG,
- CB_REV,
- nodev,
- nodev
-};
-
-static struct dev_ops viona_ops = {
- DEVO_REV,
- 0,
- viona_info,
- nulldev,
- nulldev,
- viona_attach,
- viona_detach,
- nodev,
- &viona_cb_ops,
- NULL,
- ddi_power,
- ddi_quiesce_not_needed
-};
-
-static struct modldrv modldrv = {
- &mod_driverops,
- VIONA_NAME,
- &viona_ops,
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1, &modldrv, NULL
-};
-
-int
-_init(void)
-{
- int ret;
-
- ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
- if (ret != 0)
- return (ret);
-
- ret = mod_install(&modlinkage);
- if (ret != 0) {
- ddi_soft_state_fini(&viona_state);
- return (ret);
- }
-
- return (ret);
-}
-
-int
-_fini(void)
-{
- int ret;
-
- ret = mod_remove(&modlinkage);
- if (ret == 0) {
- ddi_soft_state_fini(&viona_state);
- }
-
- return (ret);
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
-
-/*
- * Check if full TX packet copying is needed. This should not be called from
- * viona attach()/detach() context.
- */
-static boolean_t
-viona_tx_copy_needed()
-{
- boolean_t result;
-
- mutex_enter(&viona_force_copy_lock);
- if (viona_force_copy_state == VFC_UNINITALIZED) {
- major_t bnxe_major;
-
- /*
- * The original code for viona featured an explicit check for
- * the bnxe driver which, when found present, necessitated that
- * all transmissions be copied into their own mblks instead of
- * passing guest memory to the underlying device.
- *
- * The motivations for this are unclear, but until it can be
- * proven unnecessary, the check lives on.
- */
- viona_force_copy_state = VFC_COPY_UNEEDED;
- if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
- != DDI_MAJOR_T_NONE) {
- if (ddi_hold_installed_driver(bnxe_major) != NULL) {
- viona_force_copy_state = VFC_COPY_REQUIRED;
- ddi_rele_driver(bnxe_major);
- }
- }
- }
- result = (viona_force_copy_state == VFC_COPY_REQUIRED);
- mutex_exit(&viona_force_copy_lock);
-
- return (result);
-}
-
-/* ARGSUSED */
-static int
-viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
-{
- int error;
-
- switch (cmd) {
- case DDI_INFO_DEVT2DEVINFO:
- *result = (void *)viona_dip;
- error = DDI_SUCCESS;
- break;
- case DDI_INFO_DEVT2INSTANCE:
- *result = (void *)0;
- error = DDI_SUCCESS;
- break;
- default:
- error = DDI_FAILURE;
- break;
- }
- return (error);
-}
-
-static int
-viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
-{
- mblk_t *mp;
-
- if (cmd != DDI_ATTACH) {
- return (DDI_FAILURE);
- }
-
- if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
- DDI_PSEUDO, 0) != DDI_SUCCESS) {
- return (DDI_FAILURE);
- }
-
- viona_minors = id_space_create("viona_minors",
- VIONA_CTL_MINOR + 1, UINT16_MAX);
-
- mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
-
- /* Create mblk for padding when VLAN tags are stripped */
- mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL);
- bzero(mp->b_rptr, VLAN_TAGSZ);
- mp->b_wptr += VLAN_TAGSZ;
- viona_vlan_pad_mp = mp;
-
- viona_dip = dip;
- ddi_report_dev(viona_dip);
-
- mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL);
- list_create(&viona_neti_list, sizeof (viona_neti_t),
- offsetof(viona_neti_t, vni_node));
-
- /* This can only fail if NETINFO_VERSION is wrong */
- viona_neti = net_instance_alloc(NETINFO_VERSION);
- VERIFY(viona_neti != NULL);
-
- viona_neti->nin_name = "viona";
- viona_neti->nin_create = viona_neti_create;
- viona_neti->nin_shutdown = viona_neti_shutdown;
- viona_neti->nin_destroy = viona_neti_destroy;
- /* This can only fail if we've registered ourselves multiple times */
- VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS);
-
- return (DDI_SUCCESS);
-}
-
-static int
-viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
-{
- mblk_t *mp;
-
- if (cmd != DDI_DETACH) {
- return (DDI_FAILURE);
- }
-
- /* Clean up the VLAN padding mblk */
- mp = viona_vlan_pad_mp;
- viona_vlan_pad_mp = NULL;
- VERIFY(mp != NULL && mp->b_cont == NULL);
- freemsg(mp);
-
- id_space_destroy(viona_minors);
- ddi_remove_minor_node(viona_dip, NULL);
- viona_dip = NULL;
-
- /* This can only fail if we've not registered previously */
- VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS);
- net_instance_free(viona_neti);
- viona_neti = NULL;
-
- list_destroy(&viona_neti_list);
- mutex_destroy(&viona_neti_lock);
-
- return (DDI_SUCCESS);
-}
-
-static int
-viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
-{
- int minor;
- viona_soft_state_t *ss;
-
- if (otype != OTYP_CHR) {
- return (EINVAL);
- }
-#if 0
- /*
- * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
- * Should the check be at open() or ioctl()?
- */
- if (drv_priv(credp) != 0) {
- return (EPERM);
- }
-#endif
- if (getminor(*devp) != VIONA_CTL_MINOR) {
- return (ENXIO);
- }
-
- minor = id_alloc_nosleep(viona_minors);
- if (minor == 0) {
- /* All minors are busy */
- return (EBUSY);
- }
- if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
- id_free(viona_minors, minor);
- return (ENOMEM);
- }
-
- ss = ddi_get_soft_state(viona_state, minor);
- mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
- *devp = makedevice(getmajor(*devp), minor);
-
- return (0);
-}
-
-static int
-viona_close(dev_t dev, int flag, int otype, cred_t *credp)
-{
- int minor;
- viona_soft_state_t *ss;
-
- if (otype != OTYP_CHR) {
- return (EINVAL);
- }
-
- minor = getminor(dev);
-
- ss = ddi_get_soft_state(viona_state, minor);
- if (ss == NULL) {
- return (ENXIO);
- }
-
- VERIFY0(viona_ioc_delete(ss, B_TRUE));
- VERIFY(!list_link_active(&ss->ss_node));
- ddi_soft_state_free(viona_state, minor);
- id_free(viona_minors, minor);
-
- return (0);
-}
-
-static int
-viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
-{
- viona_soft_state_t *ss;
- void *dptr = (void *)data;
- int err = 0, val;
- viona_link_t *link;
-
- ss = ddi_get_soft_state(viona_state, getminor(dev));
- if (ss == NULL) {
- return (ENXIO);
- }
-
- switch (cmd) {
- case VNA_IOC_CREATE:
- return (viona_ioc_create(ss, dptr, md, cr));
- case VNA_IOC_DELETE:
- return (viona_ioc_delete(ss, B_FALSE));
- default:
- break;
- }
-
- mutex_enter(&ss->ss_lock);
- if ((link = ss->ss_link) == NULL || link->l_destroyed ||
- vmm_drv_release_reqd(link->l_vm_hold)) {
- mutex_exit(&ss->ss_lock);
- return (ENXIO);
- }
-
- switch (cmd) {
- case VNA_IOC_GET_FEATURES:
- val = VIONA_S_HOSTCAPS | link->l_features_hw;
- if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
- err = EFAULT;
- }
- break;
- case VNA_IOC_SET_FEATURES:
- if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
- err = EFAULT;
- break;
- }
- val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
-
- if ((val & VIRTIO_NET_F_CSUM) == 0)
- val &= ~VIRTIO_NET_F_HOST_TSO4;
-
- if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
- val &= ~VIRTIO_NET_F_GUEST_TSO4;
-
- link->l_features = val;
- break;
- case VNA_IOC_RING_INIT:
- err = viona_ioc_ring_init(link, dptr, md);
- break;
- case VNA_IOC_RING_RESET:
- err = viona_ioc_ring_reset(link, (uint_t)data);
- break;
- case VNA_IOC_RING_KICK:
- err = viona_ioc_ring_kick(link, (uint_t)data);
- break;
- case VNA_IOC_RING_SET_MSI:
- err = viona_ioc_ring_set_msi(link, dptr, md);
- break;
- case VNA_IOC_RING_INTR_CLR:
- err = viona_ioc_ring_intr_clear(link, (uint_t)data);
- break;
- case VNA_IOC_INTR_POLL:
- err = viona_ioc_intr_poll(link, dptr, md, rv);
- break;
- case VNA_IOC_SET_NOTIFY_IOP:
- err = viona_ioc_set_notify_ioport(link, (uint_t)data);
- break;
- default:
- err = ENOTTY;
- break;
- }
-
- mutex_exit(&ss->ss_lock);
- return (err);
-}
-
-static int
-viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
- struct pollhead **phpp)
-{
- viona_soft_state_t *ss;
- viona_link_t *link;
-
- ss = ddi_get_soft_state(viona_state, getminor(dev));
- if (ss == NULL) {
- return (ENXIO);
- }
-
- mutex_enter(&ss->ss_lock);
- if ((link = ss->ss_link) == NULL || link->l_destroyed) {
- mutex_exit(&ss->ss_lock);
- return (ENXIO);
- }
-
- *reventsp = 0;
- if ((events & POLLRDBAND) != 0) {
- for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
- if (link->l_vrings[i].vr_intr_enabled != 0) {
- *reventsp |= POLLRDBAND;
- break;
- }
- }
- }
- if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
- *phpp = &link->l_pollhead;
- }
- mutex_exit(&ss->ss_lock);
-
- return (0);
-}
-
-static void
-viona_get_mac_capab(viona_link_t *link)
-{
- mac_handle_t mh = link->l_mh;
- uint32_t cap = 0;
- mac_capab_lso_t lso_cap;
-
- link->l_features_hw = 0;
- if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
- /*
- * Only report HW checksum ability if the underlying MAC
- * resource is capable of populating the L4 header.
- */
- if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
- link->l_features_hw |= VIRTIO_NET_F_CSUM;
- }
- link->l_cap_csum = cap;
- }
-
- if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
- mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
- /*
- * Virtio doesn't allow for negotiating a maximum LSO
- * packet size. We have to assume that the guest may
- * send a maximum length IP packet. Make sure the
- * underlying MAC can handle an LSO of this size.
- */
- if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
- lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
- link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
- }
-}
-
-static int
-viona_rx_set(viona_link_t *link)
-{
- viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX];
- int err;
-
- mac_rx_set(link->l_mch, viona_rx_classified, ring);
- err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI,
- viona_rx_mcast, ring, &link->l_mph,
- MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
- if (err != 0) {
- mac_rx_clear(link->l_mch);
- }
-
- return (err);
-}
-
-static void
-viona_rx_clear(viona_link_t *link)
-{
- mac_promisc_remove(link->l_mph);
- mac_rx_clear(link->l_mch);
-}
-
-static int
-viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
-{
- vioc_create_t kvc;
- viona_link_t *link = NULL;
- char cli_name[MAXNAMELEN];
- int err = 0;
- file_t *fp;
- vmm_hold_t *hold = NULL;
- viona_neti_t *nip = NULL;
- zoneid_t zid;
-
- ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
-
- if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
- return (EFAULT);
- }
-
- zid = crgetzoneid(cr);
- nip = viona_neti_lookup_by_zid(zid);
- if (nip == NULL) {
- return (EIO);
- }
-
- if (!nip->vni_nethook.vnh_hooked) {
- viona_neti_rele(nip);
- return (EIO);
- }
-
- mutex_enter(&ss->ss_lock);
- if (ss->ss_link != NULL) {
- mutex_exit(&ss->ss_lock);
- viona_neti_rele(nip);
- return (EEXIST);
- }
-
- if ((fp = getf(kvc.c_vmfd)) == NULL) {
- err = EBADF;
- goto bail;
- }
- err = vmm_drv_hold(fp, cr, &hold);
- releasef(kvc.c_vmfd);
- if (err != 0) {
- goto bail;
- }
-
- link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
- link->l_linkid = kvc.c_linkid;
- link->l_vm_hold = hold;
- link->l_force_tx_copy = viona_tx_copy_needed();
-
- err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
- if (err != 0) {
- goto bail;
- }
-
- viona_get_mac_capab(link);
-
- (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
- link->l_linkid);
- err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
- if (err != 0) {
- goto bail;
- }
-
- viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
- viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
-
- if ((err = viona_rx_set(link)) != 0) {
- viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
- viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
- goto bail;
- }
-
- link->l_neti = nip;
- ss->ss_link = link;
- mutex_exit(&ss->ss_lock);
-
- mutex_enter(&nip->vni_lock);
- list_insert_tail(&nip->vni_dev_list, ss);
- mutex_exit(&nip->vni_lock);
-
- return (0);
-
-bail:
- if (link != NULL) {
- if (link->l_mch != NULL) {
- mac_client_close(link->l_mch, 0);
- }
- if (link->l_mh != NULL) {
- mac_close(link->l_mh);
- }
- kmem_free(link, sizeof (viona_link_t));
- }
- if (hold != NULL) {
- vmm_drv_rele(hold);
- }
- viona_neti_rele(nip);
-
- mutex_exit(&ss->ss_lock);
- return (err);
-}
-
-static int
-viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
-{
- viona_link_t *link;
- viona_neti_t *nip = NULL;
-
- mutex_enter(&ss->ss_lock);
- if ((link = ss->ss_link) == NULL) {
- /* Link destruction already complete */
- mutex_exit(&ss->ss_lock);
- return (0);
- }
-
- if (link->l_destroyed) {
- /*
- * Link destruction has been started by another thread, but has
- * not completed. This condition should be impossible to
- * encounter when performing the on-close destroy of the link,
- * since racing ioctl accessors must necessarily be absent.
- */
- VERIFY(!on_close);
- mutex_exit(&ss->ss_lock);
- return (EAGAIN);
- }
- /*
- * The link deletion cannot fail after this point, continuing until its
- * successful completion is reached.
- */
- link->l_destroyed = B_TRUE;
-
- /*
- * Tear down the IO port hook so it cannot be used to kick any of the
- * rings which are about to be reset and stopped.
- */
- VERIFY0(viona_ioc_set_notify_ioport(link, 0));
- mutex_exit(&ss->ss_lock);
-
- /*
- * Return the rings to their reset state, ignoring any possible
- * interruptions from signals.
- */
- VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
- VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));
-
- mutex_enter(&ss->ss_lock);
- if (link->l_mch != NULL) {
- /* Unhook the receive callbacks and close out the client */
- viona_rx_clear(link);
- mac_client_close(link->l_mch, 0);
- }
- if (link->l_mh != NULL) {
- mac_close(link->l_mh);
- }
- if (link->l_vm_hold != NULL) {
- vmm_drv_rele(link->l_vm_hold);
- link->l_vm_hold = NULL;
- }
-
- nip = link->l_neti;
- link->l_neti = NULL;
-
- viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
- viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
- pollhead_clean(&link->l_pollhead);
- ss->ss_link = NULL;
- mutex_exit(&ss->ss_lock);
-
- mutex_enter(&nip->vni_lock);
- list_remove(&nip->vni_dev_list, ss);
- mutex_exit(&nip->vni_lock);
-
- viona_neti_rele(nip);
-
- kmem_free(link, sizeof (viona_link_t));
- return (0);
-}
-
-/*
- * Translate a guest physical address into a kernel virtual address.
- */
-static void *
-viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len)
-{
- ASSERT3P(ring->vr_lease, !=, NULL);
-
- return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len));
-}
-
-static boolean_t
-viona_ring_lease_expire_cb(void *arg)
-{
- viona_vring_t *ring = arg;
-
- cv_broadcast(&ring->vr_cv);
-
- /* The lease will be broken asynchronously. */
- return (B_FALSE);
-}
-
-static void
-viona_ring_lease_drop(viona_vring_t *ring)
-{
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- if (ring->vr_lease != NULL) {
- vmm_hold_t *hold = ring->vr_link->l_vm_hold;
-
- ASSERT(hold != NULL);
-
- /*
- * Without an active lease, the ring mappings cannot be
- * considered valid.
- */
- viona_ring_unmap(ring);
-
- vmm_drv_lease_break(hold, ring->vr_lease);
- ring->vr_lease = NULL;
- }
-}
-
-static boolean_t
-viona_ring_lease_renew(viona_vring_t *ring)
-{
- vmm_hold_t *hold = ring->vr_link->l_vm_hold;
-
- ASSERT(hold != NULL);
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- viona_ring_lease_drop(ring);
-
- /*
- * Lease renewal will fail if the VM has requested that all holds be
- * cleaned up.
- */
- ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
- ring);
- if (ring->vr_lease != NULL) {
- /* A ring undergoing renewal will need valid guest mappings */
- if (ring->vr_pa != 0 && ring->vr_size != 0) {
- /*
- * If new mappings cannot be established, consider the
- * lease renewal a failure.
- */
- if (!viona_ring_map(ring)) {
- viona_ring_lease_drop(ring);
- return (B_FALSE);
- }
- }
- }
- return (ring->vr_lease != NULL);
-}
-
-static void
-viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
-{
- ring->vr_link = link;
- mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
- mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
-}
-
-static void
-viona_ring_misc_free(viona_vring_t *ring)
-{
- const uint_t cnt = ring->vr_size;
-
- if (ring->vr_txdesb != NULL) {
- viona_desb_t *dp = ring->vr_txdesb;
-
- for (uint_t i = 0; i < cnt; i++, dp++) {
- kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
- }
- kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * cnt);
- ring->vr_txdesb = NULL;
- }
-
- if (ring->vr_txiov != NULL) {
- kmem_free(ring->vr_txiov, sizeof (struct iovec) * cnt);
- ring->vr_txiov = NULL;
- }
-}
-
-static void
-viona_ring_free(viona_vring_t *ring)
-{
- mutex_destroy(&ring->vr_lock);
- cv_destroy(&ring->vr_cv);
- mutex_destroy(&ring->vr_a_mutex);
- mutex_destroy(&ring->vr_u_mutex);
- ring->vr_link = NULL;
-}
-
-static int
-viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
-{
- mutex_enter(&ring->vr_lock);
- if (ring->vr_state == VRS_RESET) {
- mutex_exit(&ring->vr_lock);
- return (0);
- }
-
- if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
- ring->vr_state_flags |= VRSF_REQ_STOP;
- cv_broadcast(&ring->vr_cv);
- }
- while (ring->vr_state != VRS_RESET) {
- if (!heed_signals) {
- cv_wait(&ring->vr_cv, &ring->vr_lock);
- } else {
- int rs;
-
- rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
- if (rs <= 0 && ring->vr_state != VRS_RESET) {
- mutex_exit(&ring->vr_lock);
- return (EINTR);
- }
- }
- }
- viona_ring_lease_drop(ring);
- mutex_exit(&ring->vr_lock);
- return (0);
-}
-
-static boolean_t
-viona_ring_map(viona_vring_t *ring)
-{
- uint64_t pos = ring->vr_pa;
- const uint16_t qsz = ring->vr_size;
-
- ASSERT3U(qsz, !=, 0);
- ASSERT3U(pos, !=, 0);
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- const size_t desc_sz = qsz * sizeof (struct virtio_desc);
- ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz);
- if (ring->vr_descr == NULL) {
- goto fail;
- }
- pos += desc_sz;
-
- const size_t avail_sz = (qsz + 3) * sizeof (uint16_t);
- ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz);
- if (ring->vr_avail_flags == NULL) {
- goto fail;
- }
- ring->vr_avail_idx = ring->vr_avail_flags + 1;
- ring->vr_avail_ring = ring->vr_avail_flags + 2;
- ring->vr_avail_used_event = ring->vr_avail_ring + qsz;
- pos += avail_sz;
-
- const size_t used_sz = (qsz * sizeof (struct virtio_used)) +
- (sizeof (uint16_t) * 3);
- pos = P2ROUNDUP(pos, VRING_ALIGN);
- ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz);
- if (ring->vr_used_flags == NULL) {
- goto fail;
- }
- ring->vr_used_idx = ring->vr_used_flags + 1;
- ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2);
- ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz);
-
- return (B_TRUE);
-
-fail:
- viona_ring_unmap(ring);
- return (B_FALSE);
-}
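To make the mapped layout concrete, a worked example for a 256-entry ring, assuming the guest placed it at a page-aligned vr_pa (standard virtio 1.0 split-ring layout with VRING_ALIGN = 4096):

	descriptor table:  256 * 16 bytes      = 4096 bytes at vr_pa
	avail ring:        (256 + 3) * 2 bytes =  518 bytes at vr_pa + 4096
	used ring:         256 * 8 + 3 * 2     = 2054 bytes at
	                   P2ROUNDUP(vr_pa + 4614, 4096) = vr_pa + 8192

If any of those guest-physical spans cannot be translated, the whole mapping attempt fails and the ring is unmapped.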
-
-static void
-viona_ring_unmap(viona_vring_t *ring)
-{
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- ring->vr_descr = NULL;
- ring->vr_avail_flags = NULL;
- ring->vr_avail_idx = NULL;
- ring->vr_avail_ring = NULL;
- ring->vr_avail_used_event = NULL;
- ring->vr_used_flags = NULL;
- ring->vr_used_idx = NULL;
- ring->vr_used_ring = NULL;
- ring->vr_used_avail_event = NULL;
-}
-
-static int
-viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
-{
- vioc_ring_init_t kri;
- viona_vring_t *ring;
- kthread_t *t;
- int err = 0;
-
- if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
- return (EFAULT);
- }
-
- if (kri.ri_index >= VIONA_VQ_MAX) {
- return (EINVAL);
- }
- const uint16_t qsz = kri.ri_qsize;
- if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
- return (EINVAL);
- }
-
- ring = &link->l_vrings[kri.ri_index];
- mutex_enter(&ring->vr_lock);
- if (ring->vr_state != VRS_RESET) {
- mutex_exit(&ring->vr_lock);
- return (EBUSY);
- }
- VERIFY(ring->vr_state_flags == 0);
-
- ring->vr_lease = NULL;
- if (!viona_ring_lease_renew(ring)) {
- err = EBUSY;
- goto fail;
- }
-
- ring->vr_size = qsz;
- ring->vr_mask = (ring->vr_size - 1);
- ring->vr_pa = kri.ri_qaddr;
- if (!viona_ring_map(ring)) {
- err = EINVAL;
- goto fail;
- }
-
- /* Initialize queue indexes */
- ring->vr_cur_aidx = 0;
-
- /* Allocate desb handles for TX ring if zero-copy is not disabled */
- if (kri.ri_index == VIONA_VQ_TX && !link->l_force_tx_copy) {
- viona_desb_t *dp;
-
- dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
- ring->vr_txdesb = dp;
- for (uint_t i = 0; i < qsz; i++, dp++) {
- dp->d_frtn.free_func = viona_desb_release;
- dp->d_frtn.free_arg = (void *)dp;
- dp->d_ring = ring;
- dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
- KM_SLEEP);
- }
- }
-
- /* Allocate ring-sized iovec buffers for TX */
- if (kri.ri_index == VIONA_VQ_TX) {
- ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz,
- KM_SLEEP);
- }
-
- /* Zero out MSI-X configuration */
- ring->vr_msi_addr = 0;
- ring->vr_msi_msg = 0;
-
- /* Clear the stats */
- bzero(&ring->vr_stats, sizeof (ring->vr_stats));
-
- t = viona_create_worker(ring);
- if (t == NULL) {
- err = ENOMEM;
- goto fail;
- }
- ring->vr_worker_thread = t;
- ring->vr_state = VRS_SETUP;
- cv_broadcast(&ring->vr_cv);
- mutex_exit(&ring->vr_lock);
- return (0);
-
-fail:
- viona_ring_lease_drop(ring);
- viona_ring_misc_free(ring);
- ring->vr_size = 0;
- ring->vr_mask = 0;
- mutex_exit(&ring->vr_lock);
- return (err);
-}
-
-static int
-viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
-{
- viona_vring_t *ring;
-
- if (idx >= VIONA_VQ_MAX) {
- return (EINVAL);
- }
- ring = &link->l_vrings[idx];
-
- return (viona_ring_reset(ring, B_TRUE));
-}
-
-static int
-viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
-{
- viona_vring_t *ring;
- int err;
-
- if (idx >= VIONA_VQ_MAX) {
- return (EINVAL);
- }
- ring = &link->l_vrings[idx];
-
- mutex_enter(&ring->vr_lock);
- switch (ring->vr_state) {
- case VRS_SETUP:
- /*
- * An early kick to a ring which is starting its worker thread
- * is fine. Once that thread is active, it will process the
- * start-up request immediately.
- */
- /* FALLTHROUGH */
- case VRS_INIT:
- ring->vr_state_flags |= VRSF_REQ_START;
- /* FALLTHROUGH */
- case VRS_RUN:
- cv_broadcast(&ring->vr_cv);
- err = 0;
- break;
- default:
- err = EBUSY;
- break;
- }
- mutex_exit(&ring->vr_lock);
-
- return (err);
-}
-
-static int
-viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
-{
- vioc_ring_msi_t vrm;
- viona_vring_t *ring;
-
- if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
- return (EFAULT);
- }
- if (vrm.rm_index >= VIONA_VQ_MAX) {
- return (EINVAL);
- }
-
- ring = &link->l_vrings[vrm.rm_index];
- mutex_enter(&ring->vr_lock);
- ring->vr_msi_addr = vrm.rm_addr;
- ring->vr_msi_msg = vrm.rm_msg;
- mutex_exit(&ring->vr_lock);
-
- return (0);
-}
-
-static int
-viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val)
-{
- viona_link_t *link = (viona_link_t *)arg;
- uint16_t vq = (uint16_t)val;
-
- if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) {
- return (EINVAL);
- }
- return (viona_ioc_ring_kick(link, vq));
-}
-
-static int
-viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport)
-{
- int err = 0;
-
- if (link->l_notify_ioport != 0) {
- vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
- link->l_notify_ioport = 0;
- }
-
- if (ioport != 0) {
- err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL,
- viona_notify_wcb, (void *)link, &link->l_notify_cookie);
- if (err == 0) {
- link->l_notify_ioport = ioport;
- }
- }
- return (err);
-}
-
-/*
- * Return the number of available descriptors in the vring taking care of the
- * 16-bit index wraparound.
- *
- * Note: If the number of apparently available descriptors is larger than the
- * ring size (due to guest misbehavior), this check will still report the
- * positive count of descriptors.
- */
-static inline int
-viona_vr_num_avail(viona_vring_t *ring)
-{
- uint16_t ndesc;
-
- /*
- * We're just computing (a-b) mod 2^16, i.e. with 16-bit wraparound.
- *
- * The only glitch here is that in standard C, uint16_t promotes to
- * (signed) int when int has more than 16 bits (almost always now).
- * A cast back to unsigned is necessary for proper operation.
- */
- ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
-
- return (ndesc);
-}
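A quick worked example of the wraparound, written as a standalone userland sketch rather than kernel code:

#include <assert.h>
#include <stdint.h>

/* Illustrative only: the same subtraction performed by viona_vr_num_avail(). */
static void
wraparound_example(void)
{
	uint16_t avail_idx = 2;		/* guest index has wrapped past 65535 */
	uint16_t cur_aidx = 65534;	/* host consumption trails the wrap */

	/* (2u - 65534u) truncated to 16 bits is 4: four new descriptors. */
	assert((uint16_t)((unsigned)avail_idx - (unsigned)cur_aidx) == 4);
}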
-
-static void
-viona_worker_rx(viona_vring_t *ring, viona_link_t *link)
-{
- proc_t *p = ttoproc(curthread);
-
- (void) thread_vsetname(curthread, "viona_rx_%p", ring);
-
- ASSERT(MUTEX_HELD(&ring->vr_lock));
- ASSERT3U(ring->vr_state, ==, VRS_RUN);
-
- *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
-
- do {
- if (vmm_drv_lease_expired(ring->vr_lease)) {
- /*
- * Set the renewal flag, causing incoming traffic to be
- * dropped, and issue an RX barrier to ensure any
- * threads in the RX callbacks will have finished.
- * The vr_lock cannot be held across the barrier as it
- * poses a deadlock risk.
- */
- ring->vr_state_flags |= VRSF_RENEW;
- mutex_exit(&ring->vr_lock);
- mac_rx_barrier(link->l_mch);
- mutex_enter(&ring->vr_lock);
-
- if (!viona_ring_lease_renew(ring)) {
- break;
- }
- ring->vr_state_flags &= ~VRSF_RENEW;
- }
-
- /*
- * For now, there is little to do in the RX worker as inbound
- * data is delivered by MAC via the RX callbacks. If tap-like
- * functionality is added later, this would be a convenient
- * place to inject frames into the guest.
- */
- (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
- } while (!VRING_NEED_BAIL(ring, p));
-
- *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
-}
-
-static void
-viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
-{
- proc_t *p = ttoproc(curthread);
-
- (void) thread_vsetname(curthread, "viona_tx_%p", ring);
-
- ASSERT(MUTEX_HELD(&ring->vr_lock));
- ASSERT3U(ring->vr_state, ==, VRS_RUN);
-
- mutex_exit(&ring->vr_lock);
-
- for (;;) {
- boolean_t bail = B_FALSE;
- boolean_t renew = B_FALSE;
- uint_t ntx = 0;
-
- *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
- while (viona_vr_num_avail(ring)) {
- viona_tx(link, ring);
-
- /*
- * It is advantageous for throughput to keep this
- * transmission loop tight, but periodic breaks to
- * check for other events are of value too.
- */
- if (ntx++ >= ring->vr_size)
- break;
- }
- *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
-
- VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
-
- /*
- * Check for available descriptors on the ring once more in
- * case a late addition raced with the NO_NOTIFY flag toggle.
- *
- * The barrier ensures that visibility of the vr_used_flags
- * store does not cross the viona_vr_num_avail() check below.
- */
- membar_enter();
- bail = VRING_NEED_BAIL(ring, p);
- renew = vmm_drv_lease_expired(ring->vr_lease);
- if (!bail && !renew && viona_vr_num_avail(ring)) {
- continue;
- }
-
- if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
- viona_intr_ring(ring);
- }
-
- mutex_enter(&ring->vr_lock);
-
- while (!bail && !renew && !viona_vr_num_avail(ring)) {
- (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
- bail = VRING_NEED_BAIL(ring, p);
- renew = vmm_drv_lease_expired(ring->vr_lease);
- }
-
- if (bail) {
- break;
- } else if (renew) {
- ring->vr_state_flags |= VRSF_RENEW;
- /*
- * When renewing the lease for the ring, no TX
- * frames may be outstanding, as they contain
- * references to guest memory.
- */
- viona_tx_wait_outstanding(ring);
-
- if (!viona_ring_lease_renew(ring)) {
- break;
- }
- ring->vr_state_flags &= ~VRSF_RENEW;
- }
- mutex_exit(&ring->vr_lock);
- }
-
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- viona_tx_wait_outstanding(ring);
-}
-
-static void
-viona_worker(void *arg)
-{
- viona_vring_t *ring = (viona_vring_t *)arg;
- viona_link_t *link = ring->vr_link;
- proc_t *p = ttoproc(curthread);
-
- mutex_enter(&ring->vr_lock);
- VERIFY3U(ring->vr_state, ==, VRS_SETUP);
-
- /* Bail immediately if ring shutdown or process exit was requested */
- if (VRING_NEED_BAIL(ring, p)) {
- goto cleanup;
- }
-
- /* Report worker thread as alive and notify creator */
- ring->vr_state = VRS_INIT;
- cv_broadcast(&ring->vr_cv);
-
- while (ring->vr_state_flags == 0) {
- /*
- * Keeping lease renewals timely while waiting for the ring to
- * be started is important for avoiding deadlocks.
- */
- if (vmm_drv_lease_expired(ring->vr_lease)) {
- if (!viona_ring_lease_renew(ring)) {
- goto cleanup;
- }
- }
-
- (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
-
- if (VRING_NEED_BAIL(ring, p)) {
- goto cleanup;
- }
- }
-
- ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
- ring->vr_state = VRS_RUN;
- ring->vr_state_flags &= ~VRSF_REQ_START;
-
- /* Ensure ring lease is valid first */
- if (vmm_drv_lease_expired(ring->vr_lease)) {
- if (!viona_ring_lease_renew(ring)) {
- goto cleanup;
- }
- }
-
- /* Process actual work */
- if (ring == &link->l_vrings[VIONA_VQ_RX]) {
- viona_worker_rx(ring, link);
- } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
- viona_worker_tx(ring, link);
- } else {
- panic("unexpected ring: %p", (void *)ring);
- }
-
-cleanup:
- if (ring->vr_txdesb != NULL) {
- /*
- * Transmit activity must be entirely concluded before the
- * associated descriptors can be cleaned up.
- */
- VERIFY(ring->vr_xfer_outstanding == 0);
- }
- viona_ring_misc_free(ring);
-
- viona_ring_lease_drop(ring);
- ring->vr_cur_aidx = 0;
- ring->vr_state = VRS_RESET;
- ring->vr_state_flags = 0;
- ring->vr_worker_thread = NULL;
- cv_broadcast(&ring->vr_cv);
- mutex_exit(&ring->vr_lock);
-
- mutex_enter(&ttoproc(curthread)->p_lock);
- lwp_exit();
-}
-
-static kthread_t *
-viona_create_worker(viona_vring_t *ring)
-{
- k_sigset_t hold_set;
- proc_t *p = curproc;
- kthread_t *t;
- klwp_t *lwp;
-
- ASSERT(MUTEX_HELD(&ring->vr_lock));
- ASSERT(ring->vr_state == VRS_RESET);
-
- sigfillset(&hold_set);
- lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
- minclsyspri - 1, &hold_set, curthread->t_cid, 0);
- if (lwp == NULL) {
- return (NULL);
- }
-
- t = lwptot(lwp);
- mutex_enter(&p->p_lock);
- t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
- lwp_create_done(t);
- mutex_exit(&p->p_lock);
-
- return (t);
-}
-
-static int
-viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
-{
- if (idx >= VIONA_VQ_MAX) {
- return (EINVAL);
- }
-
- link->l_vrings[idx].vr_intr_enabled = 0;
- return (0);
-}
-
-static int
-viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
-{
- uint_t cnt = 0;
- vioc_intr_poll_t vip;
-
- for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
- uint_t val = link->l_vrings[i].vr_intr_enabled;
-
- vip.vip_status[i] = val;
- if (val != 0) {
- cnt++;
- }
- }
-
- if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
- return (EFAULT);
- }
- *rv = (int)cnt;
- return (0);
-}
-
-static int
-vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie)
-{
- uint_t i, ndesc, idx, head, next;
- struct virtio_desc vdir;
- void *buf;
-
- ASSERT(iov != NULL);
- ASSERT(niov > 0);
-
- mutex_enter(&ring->vr_a_mutex);
- idx = ring->vr_cur_aidx;
- ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx);
-
- if (ndesc == 0) {
- mutex_exit(&ring->vr_a_mutex);
- return (0);
- }
- if (ndesc > ring->vr_size) {
- /*
- * Despite the fact that the guest has provided an 'avail_idx'
- * which indicates that an impossible number of descriptors are
- * available, continue on and attempt to process the next one.
- *
- * The transgression will not escape the probe or stats though.
- */
- VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
- uint16_t, ndesc);
- VIONA_RING_STAT_INCR(ring, ndesc_too_high);
- }
-
- head = ring->vr_avail_ring[idx & ring->vr_mask];
- next = head;
-
- for (i = 0; i < niov; next = vdir.vd_next) {
- if (next >= ring->vr_size) {
- VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
- uint16_t, next);
- VIONA_RING_STAT_INCR(ring, bad_idx);
- goto bail;
- }
-
- vdir = ring->vr_descr[next];
- if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
- if (vdir.vd_len == 0) {
- VIONA_PROBE2(desc_bad_len,
- viona_vring_t *, ring,
- uint32_t, vdir.vd_len);
- VIONA_RING_STAT_INCR(ring, desc_bad_len);
- goto bail;
- }
- buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
- if (buf == NULL) {
- VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
- VIONA_RING_STAT_INCR(ring, bad_ring_addr);
- goto bail;
- }
- iov[i].iov_base = buf;
- iov[i].iov_len = vdir.vd_len;
- i++;
- } else {
- const uint_t nindir = vdir.vd_len / 16;
- volatile struct virtio_desc *vindir;
-
- if ((vdir.vd_len & 0xf) || nindir == 0) {
- VIONA_PROBE2(indir_bad_len,
- viona_vring_t *, ring,
- uint32_t, vdir.vd_len);
- VIONA_RING_STAT_INCR(ring, indir_bad_len);
- goto bail;
- }
- vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
- if (vindir == NULL) {
- VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
- VIONA_RING_STAT_INCR(ring, bad_ring_addr);
- goto bail;
- }
- next = 0;
- for (;;) {
- struct virtio_desc vp;
-
- /*
- * A copy of the indirect descriptor is made
- * here, rather than simply using a reference
- * pointer. This prevents malicious or
- * erroneous guest writes to the descriptor
- * from fooling the flags/bounds verification
- * through a race.
- */
- vp = vindir[next];
- if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
- VIONA_PROBE1(indir_bad_nest,
- viona_vring_t *, ring);
- VIONA_RING_STAT_INCR(ring,
- indir_bad_nest);
- goto bail;
- } else if (vp.vd_len == 0) {
- VIONA_PROBE2(desc_bad_len,
- viona_vring_t *, ring,
- uint32_t, vp.vd_len);
- VIONA_RING_STAT_INCR(ring,
- desc_bad_len);
- goto bail;
- }
- buf = viona_gpa2kva(ring, vp.vd_addr,
- vp.vd_len);
- if (buf == NULL) {
- VIONA_PROBE_BAD_RING_ADDR(ring,
- vp.vd_addr);
- VIONA_RING_STAT_INCR(ring,
- bad_ring_addr);
- goto bail;
- }
- iov[i].iov_base = buf;
- iov[i].iov_len = vp.vd_len;
- i++;
-
- if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0)
- break;
- if (i >= niov) {
- goto loopy;
- }
-
- next = vp.vd_next;
- if (next >= nindir) {
- VIONA_PROBE3(indir_bad_next,
- viona_vring_t *, ring,
- uint16_t, next,
- uint_t, nindir);
- VIONA_RING_STAT_INCR(ring,
- indir_bad_next);
- goto bail;
- }
- }
- }
- if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
- *cookie = head;
- ring->vr_cur_aidx++;
- mutex_exit(&ring->vr_a_mutex);
- return (i);
- }
- }
-
-loopy:
- VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
- VIONA_RING_STAT_INCR(ring, too_many_desc);
-bail:
- mutex_exit(&ring->vr_a_mutex);
- return (-1);
-}
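
The descriptor-count math above depends on the 'avail' index being a free-running 16-bit counter that is only masked down to a ring slot when the ring is actually indexed. A standalone sketch of that wrap-safe arithmetic, using hypothetical sizes and index values (not driver code):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical ring of 256 descriptors (always a power of two). */
	const uint16_t size = 256;
	const uint16_t mask = size - 1;

	/* Free-running 16-bit counters; wrap-around is intentional. */
	uint16_t avail_idx = 3;		/* guest has published this far */
	uint16_t cur_aidx = 65534;	/* host has consumed up to here */

	/* Number of unconsumed entries, correct across the 16-bit wrap. */
	uint16_t ndesc = (uint16_t)((unsigned)avail_idx - (unsigned)cur_aidx);

	/* Slot in the avail ring holding the next chain's head index. */
	uint16_t slot = cur_aidx & mask;

	(void) printf("ndesc=%u next-slot=%u\n", ndesc, slot);	/* 5, 254 */
	return (0);
}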
-
-static void
-vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
-{
- volatile struct virtio_used *vu;
- uint_t uidx;
-
- mutex_enter(&ring->vr_u_mutex);
-
- uidx = *ring->vr_used_idx;
- vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
- vu->vu_idx = cookie;
- vu->vu_tlen = len;
- membar_producer();
- *ring->vr_used_idx = uidx;
-
- mutex_exit(&ring->vr_u_mutex);
-}
-
-static void
-vq_pushchain_mrgrx(viona_vring_t *ring, int num_bufs, used_elem_t *elem)
-{
- volatile struct virtio_used *vu;
- uint_t uidx, i;
-
- mutex_enter(&ring->vr_u_mutex);
-
- uidx = *ring->vr_used_idx;
- if (num_bufs == 1) {
- vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
- vu->vu_idx = elem[0].id;
- vu->vu_tlen = elem[0].len;
- } else {
- for (i = 0; i < num_bufs; i++) {
- vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask];
- vu->vu_idx = elem[i].id;
- vu->vu_tlen = elem[i].len;
- }
- uidx = uidx + num_bufs;
- }
- membar_producer();
- *ring->vr_used_idx = uidx;
-
- mutex_exit(&ring->vr_u_mutex);
-}
-
-static void
-viona_intr_ring(viona_vring_t *ring)
-{
- uint64_t addr;
-
- mutex_enter(&ring->vr_lock);
- /* Deliver the interrupt directly, if so configured. */
- if ((addr = ring->vr_msi_addr) != 0) {
- uint64_t msg = ring->vr_msi_msg;
-
- mutex_exit(&ring->vr_lock);
- (void) vmm_drv_msi(ring->vr_lease, addr, msg);
- return;
- }
- mutex_exit(&ring->vr_lock);
-
- if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
- pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
- }
-}
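
When MSI is not configured, the fallback path above arms vr_intr_enabled with a compare-and-swap so that only the first pending interrupt wakes userspace via pollwakeup(); the flag stays set until cleared through viona_ioc_ring_intr_clear() shown earlier. A reduced, standalone model of that gate using C11 atomics (not the driver's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Reduced model of vr_intr_enabled; illustration only. */
static atomic_uint intr_pending;

static bool
ring_intr_post(void)
{
	unsigned expected = 0;

	/* True only on the 0 -> 1 edge, i.e. when a wakeup is needed. */
	return (atomic_compare_exchange_strong(&intr_pending, &expected, 1));
}

static void
ring_intr_clear(void)
{
	/* Userspace acknowledges the interrupt and re-arms the gate. */
	atomic_store(&intr_pending, 0);
}

int
main(void)
{
	(void) printf("%d %d\n", ring_intr_post(), ring_intr_post()); /* 1 0 */
	ring_intr_clear();
	(void) printf("%d\n", ring_intr_post());		      /* 1 */
	return (0);
}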
-
-static size_t
-viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len,
- boolean_t *end)
-{
- size_t copied = 0;
- size_t off = 0;
-
- /* Seek past already-consumed data */
- while (seek > 0 && mp != NULL) {
- const size_t chunk = MBLKL(mp);
-
- if (chunk > seek) {
- off = seek;
- break;
- }
- mp = mp->b_cont;
- seek -= chunk;
- }
-
- while (mp != NULL) {
- const size_t chunk = MBLKL(mp) - off;
- const size_t to_copy = MIN(chunk, len);
-
- bcopy(mp->b_rptr + off, buf, to_copy);
- copied += to_copy;
- buf += to_copy;
- len -= to_copy;
-
- /*
- * If all the remaining data in the mblk_t was copied, move on
- * to the next one in the chain. Any seek offset applied to
- * the first mblk copy is zeroed out for subsequent operations.
- */
- if (chunk == to_copy) {
- mp = mp->b_cont;
- off = 0;
- }
-#ifdef DEBUG
- else {
- /*
- * The only valid reason for the copy to consume less
- * than the entire contents of the mblk_t is because
-			 * than the entire contents of the mblk_t is that
- */
- ASSERT0(len);
- }
-#endif
-
- /* Go no further if the buffer has been filled */
- if (len == 0) {
- break;
- }
-
- }
- *end = (mp == NULL);
- return (copied);
-}
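
viona_copy_mblk() is written so its return value can be fed back in as the next call's seek offset, letting the receive paths below spread one message across several guest buffers. A simplified userspace analogue of that seek-and-copy pattern, with a plain linked buffer standing in for mblk_t (illustration only):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for an mblk_t chain. */
typedef struct seg {
	const char	*data;
	size_t		len;
	struct seg	*next;
} seg_t;

static size_t
chain_copy(const seg_t *sp, size_t seek, char *out, size_t len, bool *end)
{
	size_t copied = 0;

	/* Seek past already-consumed data. */
	while (sp != NULL && seek >= sp->len) {
		seek -= sp->len;
		sp = sp->next;
	}

	while (sp != NULL && len > 0) {
		const size_t avail = sp->len - seek;
		const size_t chunk = (avail < len) ? avail : len;

		(void) memcpy(out + copied, sp->data + seek, chunk);
		copied += chunk;
		len -= chunk;
		if (chunk < avail)
			break;		/* output buffer filled mid-segment */
		sp = sp->next;
		seek = 0;
	}
	*end = (sp == NULL);
	return (copied);
}

int
main(void)
{
	seg_t b = { "world", 5, NULL };
	seg_t a = { "hello ", 6, &b };
	char first[4], second[16];
	bool end;
	size_t copied = 0;

	/* The running total doubles as the next call's seek offset. */
	copied += chain_copy(&a, copied, first, sizeof (first), &end);
	copied += chain_copy(&a, copied, second, sizeof (second), &end);
	(void) printf("copied=%zu end=%d\n", copied, end);	/* 11, 1 */
	return (0);
}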
-
-static int
-viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
-{
- struct iovec iov[VTNET_MAXSEGS];
- uint16_t cookie;
- int n;
- const size_t hdr_sz = sizeof (struct virtio_net_hdr);
- struct virtio_net_hdr *hdr;
- size_t len, copied = 0;
- caddr_t buf = NULL;
- boolean_t end = B_FALSE;
- const uint32_t features = ring->vr_link->l_features;
-
- ASSERT(msz >= MIN_BUF_SIZE);
-
- n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
- if (n <= 0) {
- /* Without available buffers, the frame must be dropped. */
- return (ENOSPC);
- }
- if (iov[0].iov_len < hdr_sz) {
- /*
- * There is little to do if there is not even space available
- * for the sole header. Zero the buffer and bail out as a last
- * act of desperation.
- */
- bzero(iov[0].iov_base, iov[0].iov_len);
- goto bad_frame;
- }
-
- /* Grab the address of the header before anything else */
- hdr = (struct virtio_net_hdr *)iov[0].iov_base;
-
- /*
- * If there is any space remaining in the first buffer after writing
- * the header, fill it with frame data.
- */
- if (iov[0].iov_len > hdr_sz) {
- buf = (caddr_t)iov[0].iov_base + hdr_sz;
- len = iov[0].iov_len - hdr_sz;
-
- copied += viona_copy_mblk(mp, copied, buf, len, &end);
- }
-
- /* Copy any remaining data into subsequent buffers, if present */
- for (int i = 1; i < n && !end; i++) {
- buf = (caddr_t)iov[i].iov_base;
- len = iov[i].iov_len;
-
- copied += viona_copy_mblk(mp, copied, buf, len, &end);
- }
-
- /* Was the expected amount of data copied? */
- if (copied != msz) {
- VIONA_PROBE5(too_short, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp, size_t, copied,
- size_t, msz);
- VIONA_RING_STAT_INCR(ring, too_short);
- goto bad_frame;
- }
-
- /* Populate (read: zero) the header and account for it in the size */
- bzero(hdr, hdr_sz);
- copied += hdr_sz;
-
- /* Add chksum bits, if needed */
- if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
- uint32_t cksum_flags;
-
- if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
- ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
- hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
- hdr->vrh_gso_size = DB_LSOMSS(mp);
- }
-
- mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
- &cksum_flags);
- if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
- hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
- }
- }
-
- /* Release this chain */
- vq_pushchain(ring, copied, cookie);
- return (0);
-
-bad_frame:
- VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie,
- mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, bad_rx_frame);
-
- vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie);
- return (EINVAL);
-}
-
-static int
-viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
-{
- struct iovec iov[VTNET_MAXSEGS];
- used_elem_t uelem[VTNET_MAXSEGS];
- int n, i = 0, buf_idx = 0, err = 0;
- uint16_t cookie;
- caddr_t buf;
- size_t len, copied = 0, chunk = 0;
- struct virtio_net_mrgrxhdr *hdr = NULL;
- const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr);
- boolean_t end = B_FALSE;
- const uint32_t features = ring->vr_link->l_features;
-
- ASSERT(msz >= MIN_BUF_SIZE);
-
- n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
- if (n <= 0) {
- /* Without available buffers, the frame must be dropped. */
- VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, no_space);
- return (ENOSPC);
- }
- if (iov[0].iov_len < hdr_sz) {
- /*
- * There is little to do if there is not even space available
- * for the sole header. Zero the buffer and bail out as a last
- * act of desperation.
- */
- bzero(iov[0].iov_base, iov[0].iov_len);
- uelem[0].id = cookie;
- uelem[0].len = iov[0].iov_len;
- err = EINVAL;
- goto done;
- }
-
- /* Grab the address of the header and do initial population */
- hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
- bzero(hdr, hdr_sz);
- hdr->vrh_bufs = 1;
-
- /*
- * If there is any space remaining in the first buffer after writing
- * the header, fill it with frame data.
- */
- if (iov[0].iov_len > hdr_sz) {
- buf = iov[0].iov_base + hdr_sz;
- len = iov[0].iov_len - hdr_sz;
-
- chunk += viona_copy_mblk(mp, copied, buf, len, &end);
- copied += chunk;
- }
- i = 1;
-
- do {
- while (i < n && !end) {
- buf = iov[i].iov_base;
- len = iov[i].iov_len;
-
- chunk += viona_copy_mblk(mp, copied, buf, len, &end);
- copied += chunk;
- i++;
- }
-
- uelem[buf_idx].id = cookie;
- uelem[buf_idx].len = chunk;
-
- /*
- * Try to grab another buffer from the ring if the mblk has not
- * yet been entirely copied out.
- */
- if (!end) {
- if (buf_idx == (VTNET_MAXSEGS - 1)) {
- /*
- * Our arbitrary limit on the number of buffers
- * to offer for merge has already been reached.
- */
- err = EOVERFLOW;
- break;
- }
- n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
- if (n <= 0) {
- /*
- * Without more immediate space to perform the
- * copying, there is little choice left but to
- * drop the packet.
- */
- err = EMSGSIZE;
- break;
- }
- chunk = 0;
- i = 0;
- buf_idx++;
- /*
- * Keep the header up-to-date with the number of
- * buffers, but never reference its value since the
- * guest could meddle with it.
- */
- hdr->vrh_bufs++;
- }
- } while (!end && copied < msz);
-
- /* Account for the header size in the first buffer */
- uelem[0].len += hdr_sz;
-
- /*
-	 * If no other errors were encountered during the copy, was the
-	 * expected amount of data transferred?
- */
- if (err == 0 && copied != msz) {
- VIONA_PROBE5(too_short, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp, size_t, copied,
- size_t, msz);
- VIONA_RING_STAT_INCR(ring, too_short);
- err = EINVAL;
- }
-
- /* Add chksum bits, if needed */
- if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
- uint32_t cksum_flags;
-
- if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
- ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
- hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
- hdr->vrh_gso_size = DB_LSOMSS(mp);
- }
-
- mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
- &cksum_flags);
- if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
- hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
- }
- }
-
-done:
- switch (err) {
- case 0:
- /* Success can fall right through to ring delivery */
- break;
-
- case EMSGSIZE:
- VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
- break;
-
- case EOVERFLOW:
- VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
- break;
-
- default:
- VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, bad_rx_frame);
- }
- vq_pushchain_mrgrx(ring, buf_idx + 1, uelem);
- return (err);
-}
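
Both receive paths above write a virtio-net header at the front of the first guest buffer; the merged variant differs only in carrying a trailing buffer count. For orientation, a sketch of the legacy header layouts as described by the virtio spec (generic field names; the driver uses its own vrh_-prefixed definitions):

#include <stdint.h>

/*
 * Legacy (pre-1.0) virtio-net header, 10 bytes, stored in guest-native
 * (little-endian on x86) byte order. Illustration only.
 */
#pragma pack(1)
struct vnet_hdr {
	uint8_t		flags;		/* e.g. NEEDS_CSUM, DATA_VALID */
	uint8_t		gso_type;	/* e.g. GSO_TCPV4 */
	uint16_t	hdr_len;	/* advisory only; not trusted here */
	uint16_t	gso_size;	/* MSS for LSO frames */
	uint16_t	csum_start;	/* start of region to checksum */
	uint16_t	csum_offset;	/* where to store the checksum */
};

/*
 * With VIRTIO_NET_F_MRG_RXBUF negotiated, the header grows to 12 bytes,
 * adding a count of how many descriptor chains carry the frame (the
 * driver's vrh_bufs).
 */
struct vnet_mrgrx_hdr {
	struct vnet_hdr	base;
	uint16_t	num_buffers;
};
#pragma pack()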
-
-static void
-viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
-{
- viona_link_t *link = ring->vr_link;
- mblk_t *mprx = NULL, **mprx_prevp = &mprx;
- mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
- const boolean_t do_merge =
- ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
- const boolean_t guest_csum =
- ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0);
- const boolean_t guest_tso4 =
- ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0);
-
- size_t nrx = 0, ndrop = 0;
-
- /*
- * The mac_hw_emul() function, by design, doesn't predicate on
- * HW_LOCAL_MAC. Since we are in Rx context we know that any
- * LSO packet must also be from a same-machine sender. We take
-	 * advantage of that and forgo writing a manual loop to
- * predicate on HW_LOCAL_MAC.
- *
- * For checksum emulation we need to predicate on HW_LOCAL_MAC
- * to avoid calling mac_hw_emul() on packets that don't need
- * it (thanks to the fact that HCK_IPV4_HDRCKSUM and
- * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do
- * the checksum emulation in the second loop.
- */
- if (!guest_tso4)
- mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
-
- while (mp != NULL) {
- mblk_t *next, *pad = NULL;
- size_t size;
- int err = 0;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
- /*
- * The VIRTIO_NET_HDR_F_DATA_VALID flag only
- * covers the ULP checksum -- so we still have
- * to populate the IP header checksum.
- */
- if (guest_csum) {
- mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL);
- } else {
- mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
- }
-
- if (mp == NULL) {
- mp = next;
- continue;
- }
- }
-
- size = msgsize(mp);
-
- /*
- * We treat both a 'drop' response and errors the same here
- * and put the packet on the drop chain. As packets may be
- * subject to different actions in ipf (which do not all
- * return the same set of error values), an error processing
- * one packet doesn't mean the next packet will also generate
- * an error.
- */
- if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
- viona_hook(link, ring, &mp, B_FALSE) != 0) {
- if (mp != NULL) {
- *mpdrop_prevp = mp;
- mpdrop_prevp = &mp->b_next;
- } else {
- /*
- * If the hook consumer (e.g. ipf) already
- * freed the mblk_t, update the drop count now.
- */
- ndrop++;
- }
- mp = next;
- continue;
- }
-
- /*
- * Ethernet frames are expected to be padded out in order to
- * meet the minimum size.
- *
- * A special case is made for frames which are short by
- * VLAN_TAGSZ, having been stripped of their VLAN tag while
- * traversing MAC. A preallocated (and recycled) mblk is used
- * for that specific condition.
- *
-		 * All other frames that fall short on length will have
-		 * custom-allocated zero-padding appended to them.
- */
- if (size == NEED_VLAN_PAD_SIZE) {
- ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ);
- ASSERT(viona_vlan_pad_mp->b_cont == NULL);
-
- for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont)
- ;
-
- pad->b_cont = viona_vlan_pad_mp;
- size += VLAN_TAGSZ;
- } else if (size < MIN_BUF_SIZE) {
- const size_t pad_size = MIN_BUF_SIZE - size;
- mblk_t *zero_mp;
-
- zero_mp = allocb(pad_size, BPRI_MED);
- if (zero_mp == NULL) {
- err = ENOMEM;
- goto pad_drop;
- }
-
- VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring,
- mblk_t *, mp, size_t, pad_size);
- VIONA_RING_STAT_INCR(ring, rx_pad_short);
- zero_mp->b_wptr += pad_size;
- bzero(zero_mp->b_rptr, pad_size);
- linkb(mp, zero_mp);
- size += pad_size;
- }
-
- if (do_merge) {
- err = viona_recv_merged(ring, mp, size);
- } else {
- err = viona_recv_plain(ring, mp, size);
- }
-
- /*
- * The VLAN padding mblk is meant for continual reuse, so
- * remove it from the chain to prevent it from being freed.
- *
- * Custom allocated padding does not require this treatment and
- * is freed normally.
- */
- if (pad != NULL) {
- pad->b_cont = NULL;
- }
-
-pad_drop:
- /*
-		 * An error during rx processing (viona_recv_{merged,plain})
-		 * does not free mp, but hook processing might or might not
-		 * have freed it. Handle either scenario -- if mp has not yet
-		 * been freed, it is queued up and freed after the guest has
-		 * been notified.  If mp is already NULL, just proceed on.
- */
- if (err != 0) {
- *mpdrop_prevp = mp;
- mpdrop_prevp = &mp->b_next;
-
- /*
- * If the available ring is empty, do not bother
- * attempting to deliver any more frames. Count the
- * rest as dropped too.
- */
- if (err == ENOSPC) {
- mp->b_next = next;
- break;
- }
- } else {
- /* Chain successful mblks to be freed later */
- *mprx_prevp = mp;
- mprx_prevp = &mp->b_next;
- nrx++;
- }
- mp = next;
- }
-
- membar_enter();
- if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- viona_intr_ring(ring);
- }
-
- /* Free successfully received frames */
- if (mprx != NULL) {
- freemsgchain(mprx);
- }
-
- /* Free dropped frames, also tallying them */
- mp = mpdrop;
- while (mp != NULL) {
- mblk_t *next = mp->b_next;
-
- mp->b_next = NULL;
- freemsg(mp);
- mp = next;
- ndrop++;
- }
- VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop);
-}
-
-static void
-viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
- boolean_t is_loopback)
-{
- viona_vring_t *ring = (viona_vring_t *)arg;
-
- /* Drop traffic if ring is inactive or renewing its lease */
- if (ring->vr_state != VRS_RUN ||
- (ring->vr_state_flags & VRSF_RENEW) != 0) {
- freemsgchain(mp);
- return;
- }
-
- viona_rx_common(ring, mp, is_loopback);
-}
-
-static void
-viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
- boolean_t is_loopback)
-{
- viona_vring_t *ring = (viona_vring_t *)arg;
- mac_handle_t mh = ring->vr_link->l_mh;
- mblk_t *mp_mcast_only = NULL;
- mblk_t **mpp = &mp_mcast_only;
-
- /* Drop traffic if ring is inactive or renewing its lease */
- if (ring->vr_state != VRS_RUN ||
- (ring->vr_state_flags & VRSF_RENEW) != 0) {
- freemsgchain(mp);
- return;
- }
-
- /*
- * In addition to multicast traffic, broadcast packets will also arrive
- * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback
- * for fully-classified traffic has already delivered that broadcast
- * traffic, so it should be suppressed here, rather than duplicating it
- * to the guest.
- */
- while (mp != NULL) {
- mblk_t *mp_next;
- mac_header_info_t mhi;
- int err;
-
- mp_next = mp->b_next;
- mp->b_next = NULL;
-
- /* Determine the packet type */
- err = mac_vlan_header_info(mh, mp, &mhi);
- if (err != 0) {
- mblk_t *pull;
-
- /*
- * It is possible that gathering of the header
- * information was impeded by a leading mblk_t which
- * was of inadequate length to reference the needed
- * fields. Try again, in case that could be solved
- * with a pull-up.
- */
- pull = msgpullup(mp, sizeof (struct ether_vlan_header));
- if (pull == NULL) {
- err = ENOMEM;
- } else {
- err = mac_vlan_header_info(mh, pull, &mhi);
- freemsg(pull);
- }
-
- if (err != 0) {
- VIONA_RING_STAT_INCR(ring, rx_mcast_check);
- }
- }
-
- /* Chain up matching packets while discarding others */
- if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) {
- *mpp = mp;
- mpp = &mp->b_next;
- } else {
- freemsg(mp);
- }
-
- mp = mp_next;
- }
-
- if (mp_mcast_only != NULL) {
- viona_rx_common(ring, mp_mcast_only, is_loopback);
- }
-}
-
-static void
-viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
-{
- vq_pushchain(ring, len, cookie);
-
- membar_enter();
- if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- viona_intr_ring(ring);
- }
-}
-
-static void
-viona_desb_release(viona_desb_t *dp)
-{
- viona_vring_t *ring = dp->d_ring;
- uint_t ref;
- uint32_t len;
- uint16_t cookie;
-
- ref = atomic_dec_uint_nv(&dp->d_ref);
- if (ref > 1) {
- return;
- }
-
- /*
- * The desb corresponding to this index must be ready for reuse before
- * the descriptor is returned to the guest via the 'used' ring.
- */
- len = dp->d_len;
- cookie = dp->d_cookie;
- dp->d_len = 0;
- dp->d_cookie = 0;
- dp->d_ref = 0;
-
- viona_tx_done(ring, len, cookie);
-
- mutex_enter(&ring->vr_lock);
- if ((--ring->vr_xfer_outstanding) == 0) {
- cv_broadcast(&ring->vr_cv);
- }
- mutex_exit(&ring->vr_lock);
-}
-
-static void
-viona_tx_wait_outstanding(viona_vring_t *ring)
-{
- ASSERT(MUTEX_HELD(&ring->vr_lock));
-
- while (ring->vr_xfer_outstanding != 0) {
- /*
- * Paying heed to signals is counterproductive here. This is a
- * very tight loop if pending transfers take an extended amount
- * of time to be reclaimed while the host process is exiting.
- */
- cv_wait(&ring->vr_cv, &ring->vr_lock);
- }
-}
-
-static boolean_t
-viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
- mblk_t *mp, uint32_t len)
-{
- viona_link_t *link = ring->vr_link;
- const struct ether_header *eth;
- uint_t eth_len = sizeof (struct ether_header);
- ushort_t ftype;
- ipha_t *ipha = NULL;
- uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
- uint16_t flags = 0;
- const uint_t csum_start = hdr->vrh_csum_start;
- const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;
-
- /*
- * Validate that the checksum offsets provided by the guest are within
- * the bounds of the packet. Additionally, ensure that the checksum
- * contents field is within the headers mblk copied by viona_tx().
- */
- if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
- (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
- VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, fail_hcksum);
- return (B_FALSE);
- }
-
- /*
- * This is guaranteed to be safe thanks to the header copying
- * done in viona_tx().
- */
- eth = (const struct ether_header *)mp->b_rptr;
- ftype = ntohs(eth->ether_type);
-
- if (ftype == ETHERTYPE_VLAN) {
- const struct ether_vlan_header *veth;
-
- /* punt on QinQ for now */
- eth_len = sizeof (struct ether_vlan_header);
- veth = (const struct ether_vlan_header *)eth;
- ftype = ntohs(veth->ether_type);
- }
-
- if (ftype == ETHERTYPE_IP) {
- ipha = (ipha_t *)(mp->b_rptr + eth_len);
-
- ipproto = ipha->ipha_protocol;
- } else if (ftype == ETHERTYPE_IPV6) {
- ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
-
- ipproto = ip6h->ip6_nxt;
- }
-
- /*
- * We ignore hdr_len because the spec says it can't be
- * trusted. Besides, our own stack will determine the header
- * boundary.
- */
- if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
- (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
- ftype == ETHERTYPE_IP) {
- uint16_t *cksump;
- uint32_t cksum;
- ipaddr_t src = ipha->ipha_src;
- ipaddr_t dst = ipha->ipha_dst;
-
- /*
- * Our native IP stack doesn't set the L4 length field
- * of the pseudo header when LSO is in play. Other IP
- * stacks, e.g. Linux, do include the length field.
- * This is a problem because the hardware expects that
- * the length field is not set. When it is set it will
- * cause an incorrect TCP checksum to be generated.
- * The reason this works in Linux is because Linux
- * corrects the pseudo-header checksum in the driver
- * code. In order to get the correct HW checksum we
- * need to assume the guest's IP stack gave us a bogus
- * TCP partial checksum and calculate it ourselves.
- */
- cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
- cksum = IP_TCP_CSUM_COMP;
- cksum += (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
- *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
-
- /*
- * Since viona is a "legacy device", the data stored
- * by the driver will be in the guest's native endian
- * format (see sections 2.4.3 and 5.1.6.1 of the
- * VIRTIO 1.0 spec for more info). At this time the
- * only guests using viona are x86 and we can assume
- * little-endian.
- */
- lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
-
- /*
- * Hardware, like ixgbe, expects the client to request
- * IP header checksum offload if it's sending LSO (see
- * ixgbe_get_context()). Unfortunately, virtio makes
- * no allowances for negotiating IP header checksum
- * and HW offload, only TCP checksum. We add the flag
- * and zero-out the checksum field. This mirrors the
- * behavior of our native IP stack (which does this in
- * the interest of HW that expects the field to be
- * zero).
- */
- flags |= HCK_IPV4_HDRCKSUM;
- ipha->ipha_hdr_checksum = 0;
- }
-
- /*
- * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
- * HW_LSO, if present, is not lost.
- */
- flags |= DB_CKSUMFLAGS(mp);
-
- /*
- * Partial checksum support from the NIC is ideal, since it most
- * closely maps to the interface defined by virtio.
- */
- if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- /*
- * MAC expects these offsets to be relative to the
- * start of the L3 header rather than the L2 frame.
- */
- flags |= HCK_PARTIALCKSUM;
- mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
- len - eth_len, 0, flags);
- return (B_TRUE);
- }
-
- /*
- * Without partial checksum support, look to the L3/L4 protocol
- * information to see if the NIC can handle it. If not, the
-	 * checksum will need to be calculated inline.
- */
- if (ftype == ETHERTYPE_IP) {
- if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
- *csump = 0;
- flags |= HCK_FULLCKSUM;
- mac_hcksum_set(mp, 0, 0, 0, 0, flags);
- return (B_TRUE);
- }
-
- /* XXX: Implement manual fallback checksumming? */
- VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, fail_hcksum);
- return (B_FALSE);
- } else if (ftype == ETHERTYPE_IPV6) {
- if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
- *csump = 0;
- flags |= HCK_FULLCKSUM;
- mac_hcksum_set(mp, 0, 0, 0, 0, flags);
- return (B_TRUE);
- }
-
- /* XXX: Implement manual fallback checksumming? */
- VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, fail_hcksum6);
- return (B_FALSE);
- }
-
- /* Cannot even emulate hcksum for unrecognized protocols */
- VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
- return (B_FALSE);
-}
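
The LSO-specific fixup above reduces to a ones-complement sum over the IPv4 addresses and a protocol constant, with the L4 length deliberately omitted. A standalone sketch of that folding arithmetic (hypothetical helper, not the driver's code):

#include <stdint.h>

/*
 * Fold a partial IPv4 pseudo-header checksum from source/destination
 * addresses and a protocol constant analogous to IP_TCP_CSUM_COMP,
 * intentionally leaving out the L4 length as explained in viona_tx_csum().
 * Illustration only.
 */
uint16_t
pseudo_csum_no_len(uint32_t src, uint32_t dst, uint32_t proto_comp)
{
	uint32_t sum = proto_comp;

	sum += (src >> 16) + (src & 0xFFFF);
	sum += (dst >> 16) + (dst & 0xFFFF);
	sum = (sum & 0xFFFF) + (sum >> 16);		/* fold carries once */
	return ((uint16_t)((sum & 0xFFFF) + (sum >> 16))); /* and again */
}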
-
-static void
-viona_tx(viona_link_t *link, viona_vring_t *ring)
-{
- struct iovec *iov = ring->vr_txiov;
- const uint_t max_segs = ring->vr_size;
- uint16_t cookie;
- int i, n;
- uint32_t len, base_off = 0;
- uint32_t min_copy = VIONA_MAX_HDRS_LEN;
- mblk_t *mp_head, *mp_tail, *mp;
- viona_desb_t *dp = NULL;
- mac_client_handle_t link_mch = link->l_mch;
- const struct virtio_net_hdr *hdr;
-
- mp_head = mp_tail = NULL;
-
- ASSERT(iov != NULL);
-
- n = vq_popchain(ring, iov, max_segs, &cookie);
- if (n == 0) {
- VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
- VIONA_RING_STAT_INCR(ring, tx_absent);
- return;
- } else if (n < 0) {
- /*
- * Any error encountered in vq_popchain has already resulted in
- * specific probe and statistic handling. Further action here
- * is unnecessary.
- */
- return;
- }
-
- /* Grab the header and ensure it is of adequate length */
- hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
- len = iov[0].iov_len;
- if (len < sizeof (struct virtio_net_hdr)) {
- goto drop_fail;
- }
-
- /* Make sure the packet headers are always in the first mblk. */
- if (ring->vr_txdesb != NULL) {
- dp = &ring->vr_txdesb[cookie];
-
- /*
- * If the guest driver is operating properly, each desb slot
- * should be available for use when processing a TX descriptor
- * from the 'avail' ring. In the case of drivers that reuse a
- * descriptor before it has been posted to the 'used' ring, the
- * data is simply dropped.
- */
- if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
- dp = NULL;
- goto drop_fail;
- }
-
- dp->d_cookie = cookie;
- mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
- &dp->d_frtn);
-
- /* Account for the successful desballoc. */
- if (mp_head != NULL)
- dp->d_ref++;
- } else {
- mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
- }
-
- if (mp_head == NULL)
- goto drop_fail;
-
- mp_tail = mp_head;
-
- /*
- * We always copy enough of the guest data to cover the
- * headers. This protects us from TOCTOU attacks and allows
- * message block length assumptions to be made in subsequent
- * code. In many cases, this means copying more data than
- * strictly necessary. That's okay, as it is the larger packets
- * (such as LSO) that really benefit from desballoc().
- */
- for (i = 1; i < n; i++) {
- const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);
-
- bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
- mp_head->b_wptr += to_copy;
- len += to_copy;
- min_copy -= to_copy;
-
- /*
- * We've met the minimum copy requirement. The rest of
- * the guest data can be referenced.
- */
- if (min_copy == 0) {
- /*
- * If we copied all contents of this
- * descriptor then move onto the next one.
- * Otherwise, record how far we are into the
- * current descriptor.
- */
- if (iov[i].iov_len == to_copy)
- i++;
- else
- base_off = to_copy;
-
- break;
- }
- }
-
- ASSERT3P(mp_head, !=, NULL);
- ASSERT3P(mp_tail, !=, NULL);
-
- for (; i < n; i++) {
- uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
- uint32_t chunk = iov[i].iov_len - base_off;
-
- ASSERT3U(base_off, <, iov[i].iov_len);
- ASSERT3U(chunk, >, 0);
-
- if (dp != NULL) {
- mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
- if (mp == NULL) {
- goto drop_fail;
- }
- dp->d_ref++;
- } else {
- mp = allocb(chunk, BPRI_MED);
- if (mp == NULL) {
- goto drop_fail;
- }
- bcopy((uchar_t *)base, mp->b_wptr, chunk);
- }
-
- base_off = 0;
- len += chunk;
- mp->b_wptr += chunk;
- mp_tail->b_cont = mp;
- mp_tail = mp;
- }
-
- if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
- /*
- * The hook consumer may elect to free the mblk_t and set
- * our mblk_t ** to NULL. When using a viona_desb_t
- * (dp != NULL), we do not want the corresponding cleanup to
- * occur during the viona_hook() call. We instead want to
- * reset and recycle dp for future use. To prevent cleanup
- * during the viona_hook() call, we take a ref on dp (if being
- * used), and release it on success. On failure, the
- * freemsgchain() call will release all the refs taken earlier
- * in viona_tx() (aside from the initial ref and the one we
- * take), and drop_hook will reset dp for reuse.
- */
- if (dp != NULL)
- dp->d_ref++;
-
- /*
- * Pass &mp instead of &mp_head so we don't lose track of
- * mp_head if the hook consumer (i.e. ipf) elects to free mp
- * and set mp to NULL.
- */
- mp = mp_head;
- if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
- if (mp != NULL)
- freemsgchain(mp);
- goto drop_hook;
- }
-
- if (dp != NULL) {
- dp->d_ref--;
-
- /*
- * It is possible that the hook(s) accepted the packet,
- * but as part of its processing, it issued a pull-up
- * which released all references to the desb. In that
- * case, go back to acting like the packet is entirely
- * copied (which it is).
- */
- if (dp->d_ref == 1) {
- dp->d_cookie = 0;
- dp->d_ref = 0;
- dp = NULL;
- }
- }
- }
-
- /*
- * Request hardware checksumming, if necessary. If the guest
- * sent an LSO packet then it must have also negotiated and
- * requested partial checksum; therefore the LSO logic is
- * contained within viona_tx_csum().
- */
- if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
- (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
- if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
- goto drop_fail;
- }
- }
-
- if (dp != NULL) {
- dp->d_len = len;
- mutex_enter(&ring->vr_lock);
- ring->vr_xfer_outstanding++;
- mutex_exit(&ring->vr_lock);
- } else {
- /*
- * If the data was cloned out of the ring, the descriptors can
- * be marked as 'used' now, rather than deferring that action
- * until after successful packet transmission.
- */
- viona_tx_done(ring, len, cookie);
- }
-
- /*
- * We're potentially going deep into the networking layer; make sure the
- * guest can't run concurrently.
- */
- smt_begin_unsafe();
- mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
- smt_end_unsafe();
- return;
-
-drop_fail:
- /*
- * On the off chance that memory is not available via the desballoc or
-	 * allocb calls, there are few options left besides failing and
-	 * dropping the frame on the floor.
- */
-
- if (dp != NULL) {
- /*
- * Take an additional reference on the desb handle (if present)
- * so any desballoc-sourced mblks can release their hold on it
- * without the handle reaching its final state and executing
- * its clean-up logic.
- */
- dp->d_ref++;
- }
-
- /*
- * Free any already-allocated blocks and sum up the total length of the
- * dropped data to be released to the used ring.
- */
- freemsgchain(mp_head);
-
-drop_hook:
- len = 0;
- for (uint_t i = 0; i < n; i++) {
- len += iov[i].iov_len;
- }
-
- if (dp != NULL) {
- VERIFY(dp->d_ref == 2);
-
- /* Clean up the desb handle, releasing the extra hold. */
- dp->d_len = 0;
- dp->d_cookie = 0;
- dp->d_ref = 0;
- }
-
- VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
- uint16_t, cookie);
- viona_tx_done(ring, len, cookie);
-}
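
The desballoc()-based zero-copy path above hinges on viona_desb_t reference counting: the slot is claimed by moving d_ref from 0 to 1, every loaned-out mblk adds a reference, and the final release (back down to the bare claim) is what returns the chain to the used ring. A reduced standalone model of that lifecycle, with hypothetical types and no atomics or locking:

#include <stdbool.h>
#include <stdint.h>

/* Reduced model of viona_desb_t reference accounting; illustration only. */
typedef struct desb_model {
	unsigned	ref;	/* 0 = free, 1 = claimed, >1 = buffers loaned */
	uint16_t	cookie;	/* descriptor chain to complete on last drop */
} desb_model_t;

/* Claim a slot for a new TX; fails if a prior TX still holds it. */
static bool
desb_claim(desb_model_t *dp, uint16_t cookie)
{
	if (dp->ref != 0)
		return (false);
	dp->ref = 1;
	dp->cookie = cookie;
	return (true);
}

/* Called once per loaned buffer as the network stack frees it. */
static bool
desb_release(desb_model_t *dp)
{
	if (--dp->ref > 1)
		return (false);	/* other loaned buffers still outstanding */

	/*
	 * Only the claim itself remains: this is the point at which the
	 * driver pushes the chain onto the used ring and resets the slot.
	 */
	dp->ref = 0;
	return (true);
}

int
main(void)
{
	desb_model_t d = { 0, 0 };

	(void) desb_claim(&d, 42);
	d.ref += 2;			/* two mblks loaned out via desballoc */
	(void) desb_release(&d);	/* first free: still outstanding */
	return (desb_release(&d) ? 0 : 1); /* final free: slot reusable */
}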
-
-/*
- * Generate a hook event for the packet in *mpp headed in the direction
- * indicated by 'out'. If the packet is accepted, 0 is returned. If the
- * packet is rejected, an error is returned. The hook function may or may not
- * alter or even free *mpp. The caller is expected to deal with either
- * situation.
- */
-static int
-viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out)
-{
- viona_neti_t *nip = link->l_neti;
- viona_nethook_t *vnh = &nip->vni_nethook;
- hook_pkt_event_t info;
- hook_event_t he;
- hook_event_token_t het;
- int ret;
-
- he = out ? vnh->vnh_event_out : vnh->vnh_event_in;
- het = out ? vnh->vnh_token_out : vnh->vnh_token_in;
-
- if (!he.he_interested)
- return (0);
-
- info.hpe_protocol = vnh->vnh_neti;
- info.hpe_ifp = (phy_if_t)link;
- info.hpe_ofp = (phy_if_t)link;
- info.hpe_mp = mpp;
- info.hpe_flags = 0;
-
- ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info);
- if (ret == 0)
- return (0);
-
- if (out) {
- VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring,
- mblk_t *, *mpp, int, ret);
- VIONA_RING_STAT_INCR(ring, tx_hookdrop);
- } else {
- VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring,
- mblk_t *, *mpp, int, ret);
- VIONA_RING_STAT_INCR(ring, rx_hookdrop);
- }
- return (ret);
-}
-
-/*
- * netinfo stubs - required by the nethook framework, but otherwise unused
- *
- * Currently, all ipf rules are applied against all interfaces in a given
- * netstack (e.g. all interfaces in a zone). In the future, if we want to
- * support applying different rules to different interfaces, we would likely
- * need to implement some of these stubs to map an interface name in a rule
- * (e.g. 'net0') back to an index or viona_link_t.
- */
-static int
-viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused,
- char *buf __unused, const size_t len __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused,
- lif_if_t ifdata __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_getptmue(net_handle_t neti __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused,
- lif_if_t ifdata __unused, size_t nelem __unused,
- net_ifaddr_t type[] __unused, void *storage __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused,
- lif_if_t ifdata __unused, zoneid_t *zid __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused,
- lif_if_t ifdata __unused, uint64_t *flags __unused)
-{
- return (-1);
-}
-
-static phy_if_t
-viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused)
-{
- return ((phy_if_t)-1);
-}
-
-static phy_if_t
-viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused)
-{
- return ((phy_if_t)-1);
-}
-
-static lif_if_t
-viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused,
- lif_if_t ifdata __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_inject(net_handle_t neti __unused, inject_t style __unused,
- net_inject_t *packet __unused)
-{
- return (-1);
-}
-
-static phy_if_t
-viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused,
- struct sockaddr *next __unused)
-{
- return ((phy_if_t)-1);
-}
-
-static int
-viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused)
-{
- return (-1);
-}
-
-static int
-viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused)
-{
- return (-1);
-}
-
-static net_protocol_t viona_netinfo = {
- NETINFO_VERSION,
- NHF_VIONA,
- viona_neti_getifname,
- viona_neti_getmtu,
- viona_neti_getptmue,
- viona_neti_getlifaddr,
- viona_neti_getlifzone,
- viona_neti_getlifflags,
- viona_neti_phygetnext,
- viona_neti_phylookup,
- viona_neti_lifgetnext,
- viona_neti_inject,
- viona_neti_route,
- viona_neti_ispchksum,
- viona_neti_isvchksum
-};
-
-/*
- * Create/register our nethooks
- */
-static int
-viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name,
- net_protocol_t *netip)
-{
- int ret;
-
- if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) {
- cmn_err(CE_NOTE, "%s: net_protocol_register failed "
- "(netid=%d name=%s)", __func__, nid, nh_name);
- goto fail_init_proto;
- }
-
- HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name);
- if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) {
- cmn_err(CE_NOTE, "%s: net_family_register failed "
- "(netid=%d name=%s err=%d)", __func__,
- nid, nh_name, ret);
- goto fail_init_family;
- }
-
- HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN);
- if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti,
- &vnh->vnh_event_in)) == NULL) {
- cmn_err(CE_NOTE, "%s: net_event_register %s failed "
- "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid,
- nh_name);
- goto fail_init_event_in;
- }
-
- HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT);
- if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti,
- &vnh->vnh_event_out)) == NULL) {
- cmn_err(CE_NOTE, "%s: net_event_register %s failed "
- "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid,
- nh_name);
- goto fail_init_event_out;
- }
- return (0);
-
- /*
- * On failure, we undo all the steps that succeeded in the
- * reverse order of initialization, starting at the last
- * successful step (the labels denoting the failing step).
- */
-fail_init_event_out:
- VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
- VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
- vnh->vnh_token_in = NULL;
-
-fail_init_event_in:
- VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
- VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
-
-fail_init_family:
- VERIFY0(net_protocol_unregister(vnh->vnh_neti));
- vnh->vnh_neti = NULL;
-
-fail_init_proto:
- return (1);
-}
-
-/*
- * Shutdown the nethooks for a protocol family. This triggers notification
- * callbacks to anything that has registered interest to allow hook consumers
- * to unhook prior to the removal of the hooks as well as makes them unavailable
- * to any future consumers as the first step of removal.
- */
-static void
-viona_nethook_shutdown(viona_nethook_t *vnh)
-{
- VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out));
- VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
- VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
-}
-
-/*
- * Remove the nethooks for a protocol family.
- */
-static void
-viona_nethook_fini(viona_nethook_t *vnh)
-{
- VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out));
- VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
- VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
- VERIFY0(net_protocol_unregister(vnh->vnh_neti));
- vnh->vnh_neti = NULL;
-}
-
-/*
- * Callback invoked by the neti module. This creates/registers our hooks
- * {IPv4,IPv6}{in,out} with the nethook framework so they are available to
- * interested consumers (e.g. ipf).
- *
- * During attach, viona_neti_create is called once for every netstack
- * present on the system at the time of attach. Thereafter, it is called
- * during the creation of additional netstack instances (i.e. zone boot). As a
- * result, the viona_neti_t that is created during this call always occurs
- * prior to any viona instances that will use it to send hook events.
- *
- * It should never return NULL. If we cannot register our hooks, we do not
- * set vnh_hooked of the respective protocol family, which will prevent the
- * creation of any viona instances on this netstack (see viona_ioc_create).
- * This can only occur if we are trying to create a new instance after a
- * shutdown event (which means destruction is imminent).
- */
-static void *
-viona_neti_create(const netid_t netid)
-{
- viona_neti_t *nip;
-
- VERIFY(netid != -1);
-
- nip = kmem_zalloc(sizeof (*nip), KM_SLEEP);
- nip->vni_netid = netid;
- nip->vni_zid = net_getzoneidbynetid(netid);
- mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL);
- list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t),
- offsetof(viona_soft_state_t, ss_node));
-
- if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA,
- &viona_netinfo) == 0)
- nip->vni_nethook.vnh_hooked = B_TRUE;
-
- mutex_enter(&viona_neti_lock);
- list_insert_tail(&viona_neti_list, nip);
- mutex_exit(&viona_neti_lock);
-
- return (nip);
-}
-
-/*
- * Called during netstack teardown by the neti module. During teardown, all
- * the shutdown callbacks are invoked, allowing consumers to release any holds
- * and otherwise quiesce themselves prior to destruction, followed by the
- * actual destruction callbacks.
- */
-static void
-viona_neti_shutdown(netid_t nid, void *arg)
-{
- viona_neti_t *nip = arg;
-
- ASSERT(nip != NULL);
- VERIFY(nid == nip->vni_netid);
-
- mutex_enter(&viona_neti_lock);
- list_remove(&viona_neti_list, nip);
- mutex_exit(&viona_neti_lock);
-
- if (nip->vni_nethook.vnh_hooked)
- viona_nethook_shutdown(&nip->vni_nethook);
-}
-
-/*
- * Called during netstack teardown by the neti module. Destroys the viona
- * netinst data. This is invoked after all the netstack and neti shutdown
- * callbacks have been invoked.
- */
-static void
-viona_neti_destroy(netid_t nid, void *arg)
-{
- viona_neti_t *nip = arg;
-
- ASSERT(nip != NULL);
- VERIFY(nid == nip->vni_netid);
-
- mutex_enter(&nip->vni_lock);
- while (nip->vni_ref != 0)
- cv_wait(&nip->vni_ref_change, &nip->vni_lock);
- mutex_exit(&nip->vni_lock);
-
- VERIFY(!list_link_active(&nip->vni_node));
-
- if (nip->vni_nethook.vnh_hooked)
- viona_nethook_fini(&nip->vni_nethook);
-
- mutex_destroy(&nip->vni_lock);
- list_destroy(&nip->vni_dev_list);
- kmem_free(nip, sizeof (*nip));
-}
-
-/*
- * Find the viona netinst data by zone id. This is only used during
- * viona instance creation (and thus is only called by a zone that is running).
- */
-static viona_neti_t *
-viona_neti_lookup_by_zid(zoneid_t zid)
-{
- viona_neti_t *nip;
-
- mutex_enter(&viona_neti_lock);
- for (nip = list_head(&viona_neti_list); nip != NULL;
- nip = list_next(&viona_neti_list, nip)) {
- if (nip->vni_zid == zid) {
- mutex_enter(&nip->vni_lock);
- nip->vni_ref++;
- mutex_exit(&nip->vni_lock);
- mutex_exit(&viona_neti_lock);
- return (nip);
- }
- }
- mutex_exit(&viona_neti_lock);
- return (NULL);
-}
-
-static void
-viona_neti_rele(viona_neti_t *nip)
-{
- mutex_enter(&nip->vni_lock);
- VERIFY3S(nip->vni_ref, >, 0);
- nip->vni_ref--;
- mutex_exit(&nip->vni_lock);
- cv_broadcast(&nip->vni_ref_change);
-}
diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c
new file mode 100644
index 0000000000..4520be04b0
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_hook.c
@@ -0,0 +1,438 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+
+#include "viona_impl.h"
+
+
+/*
+ * Global linked list of viona_neti_t entries; access is protected by
+ * viona_neti_lock.
+ */
+static list_t viona_neti_list;
+static kmutex_t viona_neti_lock;
+
+/*
+ * viona_neti is allocated and initialized during attach, and read-only
+ * until detach (where it is also freed).
+ */
+static net_instance_t *viona_neti;
+
+
+/*
+ * Generate a hook event for the packet in *mpp headed in the direction
+ * indicated by 'out'. If the packet is accepted, 0 is returned. If the
+ * packet is rejected, an error is returned. The hook function may or may not
+ * alter or even free *mpp. The caller is expected to deal with either
+ * situation.
+ */
+int
+viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out)
+{
+ viona_neti_t *nip = link->l_neti;
+ viona_nethook_t *vnh = &nip->vni_nethook;
+ hook_pkt_event_t info;
+ hook_event_t he;
+ hook_event_token_t het;
+ int ret;
+
+ he = out ? vnh->vnh_event_out : vnh->vnh_event_in;
+ het = out ? vnh->vnh_token_out : vnh->vnh_token_in;
+
+ if (!he.he_interested)
+ return (0);
+
+ info.hpe_protocol = vnh->vnh_neti;
+ info.hpe_ifp = (phy_if_t)link;
+ info.hpe_ofp = (phy_if_t)link;
+ info.hpe_mp = mpp;
+ info.hpe_flags = 0;
+
+ ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info);
+ if (ret == 0)
+ return (0);
+
+ if (out) {
+ VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring,
+ mblk_t *, *mpp, int, ret);
+ VIONA_RING_STAT_INCR(ring, tx_hookdrop);
+ } else {
+ VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring,
+ mblk_t *, *mpp, int, ret);
+ VIONA_RING_STAT_INCR(ring, rx_hookdrop);
+ }
+ return (ret);
+}
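
As the block comment notes, a rejecting hook consumer may or may not have freed the mblk, so callers must be prepared for both outcomes. A minimal caller-side fragment mirroring how the RX path handles the return value (kernel context assumed, not a standalone example):

	if (viona_hook(link, ring, &mp, B_FALSE) != 0) {
		if (mp != NULL)
			freemsg(mp);	/* rejected, but left for us to free */
		/* else: the consumer (e.g. ipf) already freed it */
		return;
	}
	/* accepted: mp may have been altered (e.g. pulled up), keep using it */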
+
+/*
+ * netinfo stubs - required by the nethook framework, but otherwise unused
+ *
+ * Currently, all ipf rules are applied against all interfaces in a given
+ * netstack (e.g. all interfaces in a zone). In the future, if we want to
+ * support applying different rules to different interfaces, we would likely
+ * need to implement some of these stubs to map an interface name in a rule
+ * (e.g. 'net0') back to an index or viona_link_t.
+ */
+static int
+viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused,
+ char *buf __unused, const size_t len __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getptmue(net_handle_t neti __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, size_t nelem __unused,
+ net_ifaddr_t type[] __unused, void *storage __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, zoneid_t *zid __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, uint64_t *flags __unused)
+{
+ return (-1);
+}
+
+static phy_if_t
+viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static phy_if_t
+viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static lif_if_t
+viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_inject(net_handle_t neti __unused, inject_t style __unused,
+ net_inject_t *packet __unused)
+{
+ return (-1);
+}
+
+static phy_if_t
+viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused,
+ struct sockaddr *next __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static int
+viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+ return (-1);
+}
+
+static net_protocol_t viona_netinfo = {
+ NETINFO_VERSION,
+ NHF_VIONA,
+ viona_neti_getifname,
+ viona_neti_getmtu,
+ viona_neti_getptmue,
+ viona_neti_getlifaddr,
+ viona_neti_getlifzone,
+ viona_neti_getlifflags,
+ viona_neti_phygetnext,
+ viona_neti_phylookup,
+ viona_neti_lifgetnext,
+ viona_neti_inject,
+ viona_neti_route,
+ viona_neti_ispchksum,
+ viona_neti_isvchksum
+};
+
+/*
+ * Create/register our nethooks
+ */
+static int
+viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name,
+ net_protocol_t *netip)
+{
+ int ret;
+
+ if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_protocol_register failed "
+ "(netid=%d name=%s)", __func__, nid, nh_name);
+ goto fail_init_proto;
+ }
+
+ HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name);
+ if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) {
+ cmn_err(CE_NOTE, "%s: net_family_register failed "
+ "(netid=%d name=%s err=%d)", __func__,
+ nid, nh_name, ret);
+ goto fail_init_family;
+ }
+
+ HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN);
+ if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti,
+ &vnh->vnh_event_in)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+ "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid,
+ nh_name);
+ goto fail_init_event_in;
+ }
+
+ HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT);
+ if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti,
+ &vnh->vnh_event_out)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+ "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid,
+ nh_name);
+ goto fail_init_event_out;
+ }
+ return (0);
+
+ /*
+ * On failure, we undo all the steps that succeeded in the
+ * reverse order of initialization, starting at the last
+ * successful step (the labels denoting the failing step).
+ */
+fail_init_event_out:
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+ vnh->vnh_token_in = NULL;
+
+fail_init_event_in:
+ VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+ VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+
+fail_init_family:
+ VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+ vnh->vnh_neti = NULL;
+
+fail_init_proto:
+ return (1);
+}
+
+/*
+ * Shut down the nethooks for a protocol family. This triggers notification
+ * callbacks to anything that has registered interest, allowing hook consumers
+ * to unhook prior to the removal of the hooks, and makes them unavailable to
+ * any future consumers as the first step of removal.
+ */
+static void
+viona_nethook_shutdown(viona_nethook_t *vnh)
+{
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out));
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+}
+
+/*
+ * Remove the nethooks for a protocol family.
+ */
+static void
+viona_nethook_fini(viona_nethook_t *vnh)
+{
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out));
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+ VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+ vnh->vnh_neti = NULL;
+}
+
+/*
+ * Callback invoked by the neti module. This creates/registers our hooks
+ * {IPv4,IPv6}{in,out} with the nethook framework so they are available to
+ * interested consumers (e.g. ipf).
+ *
+ * During attach, viona_neti_create is called once for every netstack
+ * present on the system at the time of attach. Thereafter, it is called
+ * during the creation of additional netstack instances (i.e. zone boot). As a
+ * result, the viona_neti_t that is created during this call always occurs
+ * prior to any viona instances that will use it to send hook events.
+ *
+ * It should never return NULL. If we cannot register our hooks, we do not
+ * set vnh_hooked of the respective protocol family, which will prevent the
+ * creation of any viona instances on this netstack (see viona_ioc_create).
+ * This can only occur if we are trying to create a new instance after a
+ * shutdown event (which means destruction is imminent).
+ */
+static void *
+viona_neti_create(const netid_t netid)
+{
+ viona_neti_t *nip;
+
+ VERIFY(netid != -1);
+
+ nip = kmem_zalloc(sizeof (*nip), KM_SLEEP);
+ nip->vni_netid = netid;
+ nip->vni_zid = net_getzoneidbynetid(netid);
+ mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t),
+ offsetof(viona_soft_state_t, ss_node));
+
+ if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA,
+ &viona_netinfo) == 0)
+ nip->vni_nethook.vnh_hooked = B_TRUE;
+
+ mutex_enter(&viona_neti_lock);
+ list_insert_tail(&viona_neti_list, nip);
+ mutex_exit(&viona_neti_lock);
+
+ return (nip);
+}
+
+/*
+ * Called during netstack teardown by the neti module. During teardown, all
+ * the shutdown callbacks are invoked, allowing consumers to release any holds
+ * and otherwise quiesce themselves prior to destruction, followed by the
+ * actual destruction callbacks.
+ */
+static void
+viona_neti_shutdown(netid_t nid, void *arg)
+{
+ viona_neti_t *nip = arg;
+
+ ASSERT(nip != NULL);
+ VERIFY(nid == nip->vni_netid);
+
+ mutex_enter(&viona_neti_lock);
+ list_remove(&viona_neti_list, nip);
+ mutex_exit(&viona_neti_lock);
+
+ if (nip->vni_nethook.vnh_hooked)
+ viona_nethook_shutdown(&nip->vni_nethook);
+}
+
+/*
+ * Called during netstack teardown by the neti module. Destroys the viona
+ * netinst data. This is invoked after all the netstack and neti shutdown
+ * callbacks have been invoked.
+ */
+static void
+viona_neti_destroy(netid_t nid, void *arg)
+{
+ viona_neti_t *nip = arg;
+
+ ASSERT(nip != NULL);
+ VERIFY(nid == nip->vni_netid);
+
+ mutex_enter(&nip->vni_lock);
+ while (nip->vni_ref != 0)
+ cv_wait(&nip->vni_ref_change, &nip->vni_lock);
+ mutex_exit(&nip->vni_lock);
+
+ VERIFY(!list_link_active(&nip->vni_node));
+
+ if (nip->vni_nethook.vnh_hooked)
+ viona_nethook_fini(&nip->vni_nethook);
+
+ mutex_destroy(&nip->vni_lock);
+ list_destroy(&nip->vni_dev_list);
+ kmem_free(nip, sizeof (*nip));
+}
+
+/*
+ * Find the viona netinst data by zone id. This is only used during
+ * viona instance creation (and thus is only called by a zone that is running).
+ */
+viona_neti_t *
+viona_neti_lookup_by_zid(zoneid_t zid)
+{
+ viona_neti_t *nip;
+
+ mutex_enter(&viona_neti_lock);
+ for (nip = list_head(&viona_neti_list); nip != NULL;
+ nip = list_next(&viona_neti_list, nip)) {
+ if (nip->vni_zid == zid) {
+ mutex_enter(&nip->vni_lock);
+ nip->vni_ref++;
+ mutex_exit(&nip->vni_lock);
+ mutex_exit(&viona_neti_lock);
+ return (nip);
+ }
+ }
+ mutex_exit(&viona_neti_lock);
+ return (NULL);
+}
+
+void
+viona_neti_rele(viona_neti_t *nip)
+{
+ mutex_enter(&nip->vni_lock);
+ VERIFY3S(nip->vni_ref, >, 0);
+ nip->vni_ref--;
+ mutex_exit(&nip->vni_lock);
+ cv_broadcast(&nip->vni_ref_change);
+}
+
+void
+viona_neti_attach(void)
+{
+ mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&viona_neti_list, sizeof (viona_neti_t),
+ offsetof(viona_neti_t, vni_node));
+
+ /* This can only fail if NETINFO_VERSION is wrong */
+ viona_neti = net_instance_alloc(NETINFO_VERSION);
+ VERIFY(viona_neti != NULL);
+
+ viona_neti->nin_name = "viona";
+ viona_neti->nin_create = viona_neti_create;
+ viona_neti->nin_shutdown = viona_neti_shutdown;
+ viona_neti->nin_destroy = viona_neti_destroy;
+ /* This can only fail if we've registered ourselves multiple times */
+ VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS);
+}
+
+void
+viona_neti_detach(void)
+{
+ /* This can only fail if we've not registered previously */
+ VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS);
+ net_instance_free(viona_neti);
+ viona_neti = NULL;
+
+ list_destroy(&viona_neti_list);
+ mutex_destroy(&viona_neti_lock);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h
new file mode 100644
index 0000000000..ee31c4d4ce
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_impl.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VIONA_IMPL_H
+#define _VIONA_IMPL_H
+
+#include <sys/ddi.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/uio.h>
+
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/neti.h>
+#include <inet/ip.h>
+#include <inet/tcp.h>
+
+#include <sys/vmm_drv.h>
+#include <sys/viona_io.h>
+
+struct viona_link;
+typedef struct viona_link viona_link_t;
+struct viona_desb;
+typedef struct viona_desb viona_desb_t;
+struct viona_neti;
+typedef struct viona_neti viona_neti_t;
+
+enum viona_ring_state {
+ VRS_RESET = 0x0, /* just allocated or reset */
+ VRS_SETUP = 0x1, /* addrs setup and starting worker thread */
+ VRS_INIT = 0x2, /* worker thread started & waiting to run */
+ VRS_RUN = 0x3, /* running work routine */
+};
+enum viona_ring_state_flags {
+ VRSF_REQ_START = 0x1, /* start running from INIT state */
+ VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */
+ VRSF_RENEW = 0x4, /* ring renewing lease */
+};
+
+typedef struct viona_vring {
+ viona_link_t *vr_link;
+
+ kmutex_t vr_lock;
+ kcondvar_t vr_cv;
+ uint16_t vr_state;
+ uint16_t vr_state_flags;
+ uint_t vr_xfer_outstanding;
+ kthread_t *vr_worker_thread;
+ vmm_lease_t *vr_lease;
+
+ /* ring-sized resources for TX activity */
+ viona_desb_t *vr_txdesb;
+ struct iovec *vr_txiov;
+
+ uint_t vr_intr_enabled;
+ uint64_t vr_msi_addr;
+ uint64_t vr_msi_msg;
+
+ /* Internal ring-related state */
+ kmutex_t vr_a_mutex; /* sync consumers of 'avail' */
+ kmutex_t vr_u_mutex; /* sync consumers of 'used' */
+ uint64_t vr_pa;
+ uint16_t vr_size;
+ uint16_t vr_mask; /* cached from vr_size */
+ uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ volatile struct virtio_desc *vr_descr;
+
+ volatile uint16_t *vr_avail_flags;
+ volatile uint16_t *vr_avail_idx;
+ volatile uint16_t *vr_avail_ring;
+ volatile uint16_t *vr_avail_used_event;
+
+ volatile uint16_t *vr_used_flags;
+ volatile uint16_t *vr_used_idx;
+ volatile struct virtio_used *vr_used_ring;
+ volatile uint16_t *vr_used_avail_event;
+
+ /* Per-ring error condition statistics */
+ struct viona_ring_stats {
+ uint64_t rs_ndesc_too_high;
+ uint64_t rs_bad_idx;
+ uint64_t rs_indir_bad_len;
+ uint64_t rs_indir_bad_nest;
+ uint64_t rs_indir_bad_next;
+ uint64_t rs_no_space;
+ uint64_t rs_too_many_desc;
+ uint64_t rs_desc_bad_len;
+
+ uint64_t rs_bad_ring_addr;
+
+ uint64_t rs_fail_hcksum;
+ uint64_t rs_fail_hcksum6;
+ uint64_t rs_fail_hcksum_proto;
+
+ uint64_t rs_bad_rx_frame;
+ uint64_t rs_rx_merge_overrun;
+ uint64_t rs_rx_merge_underrun;
+ uint64_t rs_rx_pad_short;
+ uint64_t rs_rx_mcast_check;
+ uint64_t rs_too_short;
+ uint64_t rs_tx_absent;
+
+ uint64_t rs_rx_hookdrop;
+ uint64_t rs_tx_hookdrop;
+ } vr_stats;
+} viona_vring_t;
+
+struct viona_link {
+ vmm_hold_t *l_vm_hold;
+ boolean_t l_destroyed;
+
+ viona_vring_t l_vrings[VIONA_VQ_MAX];
+
+ uint32_t l_features;
+ uint32_t l_features_hw;
+ uint32_t l_cap_csum;
+
+ uintptr_t l_notify_ioport;
+ void *l_notify_cookie;
+
+ datalink_id_t l_linkid;
+ mac_handle_t l_mh;
+ mac_client_handle_t l_mch;
+ mac_promisc_handle_t l_mph;
+
+ pollhead_t l_pollhead;
+
+ viona_neti_t *l_neti;
+};
+
+typedef struct viona_nethook {
+ net_handle_t vnh_neti;
+ hook_family_t vnh_family;
+ hook_event_t vnh_event_in;
+ hook_event_t vnh_event_out;
+ hook_event_token_t vnh_token_in;
+ hook_event_token_t vnh_token_out;
+ boolean_t vnh_hooked;
+} viona_nethook_t;
+
+struct viona_neti {
+ list_node_t vni_node;
+
+ netid_t vni_netid;
+ zoneid_t vni_zid;
+
+ viona_nethook_t vni_nethook;
+
+ kmutex_t vni_lock; /* Protects remaining members */
+ kcondvar_t vni_ref_change; /* Protected by vni_lock */
+ uint_t vni_ref; /* Protected by vni_lock */
+ list_t vni_dev_list; /* Protected by vni_lock */
+};
+
+typedef struct used_elem {
+ uint16_t id;
+ uint32_t len;
+} used_elem_t;
+
+typedef struct viona_soft_state {
+ kmutex_t ss_lock;
+ viona_link_t *ss_link;
+ list_node_t ss_node;
+} viona_soft_state_t;
+
+#pragma pack(1)
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+};
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+};
+
+struct virtio_net_mrgrxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+};
+
+struct virtio_net_hdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+};
+#pragma pack()
+
+#define VRING_NEED_BAIL(ring, proc) \
+ (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \
+ ((proc)->p_flag & SEXITING) != 0)
+
+
+#define VNETHOOK_INTERESTED_IN(neti) \
+ (neti)->vni_nethook.vnh_event_in.he_interested
+#define VNETHOOK_INTERESTED_OUT(neti) \
+ (neti)->vni_nethook.vnh_event_out.he_interested
+
+
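+/*
+ * DTrace SDT probe shorthand: as with the kernel DTRACE_PROBEn() macros, each
+ * probe argument is passed as a (type, value) pair, so VIONA_PROBE2() takes
+ * four arguments beyond the probe name, e.g.
+ * VIONA_PROBE2(bad_idx, viona_vring_t *, ring, uint16_t, next).
+ */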
+#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name)
+#define VIONA_PROBE1(name, arg1, arg2) \
+ DTRACE_PROBE1(viona__##name, arg1, arg2)
+#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \
+ DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4)
+#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \
+ DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6)
+#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \
+ arg9, arg10) \
+ DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
+ arg8, arg9, arg10)
+#define VIONA_PROBE_BAD_RING_ADDR(r, a) \
+ VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a))
+
+#define VIONA_RING_STAT_INCR(r, name) \
+ (((r)->vr_stats.rs_ ## name)++)
+
+
+#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \
+ IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+#define VRING_USED_F_NO_NOTIFY 1
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0)
+#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1)
+
+#define VIRTIO_NET_HDR_GSO_NONE 0
+#define VIRTIO_NET_HDR_GSO_TCPV4 1
+
+#define VIRTIO_NET_F_CSUM (1 << 0)
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */
+#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24)
+#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28)
+#define VIRTIO_F_RING_EVENT_IDX (1 << 29)
+
+
+void viona_ring_alloc(viona_link_t *, viona_vring_t *);
+void viona_ring_free(viona_vring_t *);
+int viona_ring_reset(viona_vring_t *, boolean_t);
+int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t);
+boolean_t viona_ring_lease_renew(viona_vring_t *);
+int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *);
+void vq_pushchain(viona_vring_t *, uint32_t, uint16_t);
+void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *);
+void viona_intr_ring(viona_vring_t *ring);
+
+void viona_rx_init(void);
+void viona_rx_fini(void);
+int viona_rx_set(viona_link_t *);
+void viona_rx_clear(viona_link_t *);
+void viona_worker_rx(viona_vring_t *, viona_link_t *);
+
+extern kmutex_t viona_force_copy_lock;
+void viona_worker_tx(viona_vring_t *, viona_link_t *);
+void viona_tx_ring_alloc(viona_vring_t *, const uint16_t);
+void viona_tx_ring_free(viona_vring_t *, const uint16_t);
+
+void viona_neti_attach(void);
+void viona_neti_detach(void);
+viona_neti_t *viona_neti_lookup_by_zid(zoneid_t);
+void viona_neti_rele(viona_neti_t *);
+int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t);
+
+#endif /* _VIONA_IMPL_H */
diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c
new file mode 100644
index 0000000000..e3c9b90a57
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_main.c
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * viona - VirtIO-Net, Accelerated
+ *
+ * The purpose of viona is to provide high performance virtio-net devices to
+ * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
+ * DLS/DLD stack.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * A single viona instance is composed of a "link" handle and two "rings".
+ * After opening the viona device, it must be associated with a MAC network
+ * interface and a bhyve (vmm) instance to form its link resource. This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization. With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible. Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest. They represent an interface to the
+ * standard virtio ring structures. When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events. The RX worker has the simple task of
+ * watching for ring shutdown conditions. The TX worker does that in addition
+ * to processing all requests to transmit data. Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in virtio_vring_t`vr_state:
+ *
+ * +<--------------------------------------------+
+ * | |
+ * V ^
+ * +-----------+ This is the initial state when a link is created or
+ * | VRS_RESET | when the ring has been explicitly reset.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_INIT) issued |
+ * | |
+ * | ^
+ * V
+ * +-----------+ The ring parameters (size, guest physical addresses)
+ * | VRS_SETUP | have been set and start-up of the ring worker thread
+ * +-----------+ has begun.
+ * | ^
+ * | |
+ * |---* ring worker thread begins execution |
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested (by ioctl or impending
+ * | bhyve process death) while the worker thread is
+ * | starting, the worker will transition the ring to
+ * | VRS_RESET and exit.
+ * | ^
+ * | |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring has started
+ * | VRS_INIT | executing. It has allocated any extra resources needed
+ * +-----------+ for the ring to operate.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested while the worker is
+ * | waiting in VRS_INIT, it will free any extra resources
+ * | and transition to VRS_RESET.
+ * | ^
+ * | |
+ * |--* ioctl(VNA_IOC_RING_KICK) issued |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring is executing
+ * | VRS_RUN | workload specific to that ring.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_RESET) issued |
+ * | (or bhyve process begins exit) |
+ * V |
+ * +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s). Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC. When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is
+ * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
+ * Doing so increments vr_xfer_outstanding, preventing the ring from being
+ * reset (allowing the link to drop its vmm handle to the guest) until all
+ * transmit mblks referencing guest memory have been processed. Allocation of
+ * the viona_desb_t entries is done during the VRS_INIT stage of the ring
+ * worker thread. The ring size informs that allocation as the number of
+ * concurrent transmissions is limited by the number of descriptors in the
+ * ring. This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware. Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated. This means that there is no upper bound on the time needed
+ * for an mblk to be flushed, which can stall bhyve guests from shutting down,
+ * since their memory must be free of viona TX references prior to clean-up.
+ *
+ * This expectation of deterministic mblk_t processing is likely the reason
+ * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
+ * loaded will copy transmit data into fresh buffers rather than passing up
+ * zero-copy mblks. It is a hold-over from the original viona sources provided
+ * by Pluribus and its continued necessity has not been confirmed.
+ *
+ *
+ * ----------------------------
+ * Ring Notification Fast-paths
+ * ----------------------------
+ *
+ * Device operation for viona requires that notifications flow to and from the
+ * guest to indicate certain ring conditions. In order to minimize latency and
+ * processing overhead, the notification procedures are kept in-kernel whenever
+ * possible.
+ *
+ * Guest-to-host notifications, when new available descriptors have been placed
+ * in the ring, are posted via the 'queue notify' address in the virtio BAR.
+ * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
+ * install a callback hook on an ioport address. Guest exits for accesses to
+ * viona-hooked ioport addresses will result in direct calls to notify the
+ * appropriate ring worker without a trip to userland.
+ *
+ * Host-to-guest notifications in the form of interrupts enjoy similar
+ * acceleration. Each viona ring can be configured to send MSI notifications
+ * to the guest as virtio conditions dictate. This in-kernel interrupt
+ * configuration is kept synchronized through viona ioctls which are utilized
+ * during writes to the associated PCI config registers or MSI-X BAR.
+ *
+ * Guests which do not utilize MSI-X cause viona to fall back to the slow path
+ * for interrupts: the userspace portion of bhyve will poll(2) the viona
+ * handle, receiving notification when ring events necessitate the assertion
+ * of an interrupt.
+ *
+ *
+ * ---------------
+ * Nethook Support
+ * ---------------
+ *
+ * Viona provides four nethook events that consumers (e.g. ipf) can hook into
+ * to intercept packets as they go up or down the stack. Unfortunately,
+ * the nethook framework does not understand raw packets, so we can only
+ * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
+ * we register callbacks with the neti (netinfo) module that will be invoked
+ * for each netstack already present, as well as for any additional netstack
+ * instances created as the system operates. These callbacks will
+ * register/unregister the hooks with the nethook framework for each
+ * netstack instance. This registration occurs prior to creating any
+ * viona instances for a given netstack, and the unregistration for a netstack
+ * instance occurs after all viona instances of the netstack instance have
+ * been deleted.
+ */
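+
+/*
+ * As an illustrative (non-normative) sketch of the control surface described
+ * above, a userspace consumer might drive a link through its lifecycle
+ * roughly as follows. The vioc_create_t and vioc_ring_init_t layouts live in
+ * sys/viona_io.h (not shown here); the field names below follow their usage
+ * in this file, while the device path and surrounding details are
+ * assumptions:
+ *
+ *	int fd = open("/dev/viona", O_RDWR);
+ *	vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vmfd };
+ *	ioctl(fd, VNA_IOC_CREATE, &vc);
+ *	vioc_ring_init_t ri = {
+ *		.ri_index = VIONA_VQ_RX, .ri_qsize = qsz, .ri_qaddr = gpa
+ *	};
+ *	ioctl(fd, VNA_IOC_RING_INIT, &ri);
+ *	ioctl(fd, VNA_IOC_RING_KICK, VIONA_VQ_RX);
+ *	...
+ *	ioctl(fd, VNA_IOC_DELETE);	(or simply close(fd))
+ */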
+
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+
+#include <sys/dlpi.h>
+
+#include "viona_impl.h"
+
+
+#define VIONA_NAME "Virtio Network Accelerator"
+#define VIONA_CTL_MINOR 0
+#define VIONA_CLI_NAME "viona" /* MAC client name */
+
+
+/*
+ * Host capabilities.
+ */
+#define VIONA_S_HOSTCAPS ( \
+ VIRTIO_NET_F_GUEST_CSUM | \
+ VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_NET_F_STATUS | \
+ VIRTIO_F_RING_NOTIFY_ON_EMPTY | \
+ VIRTIO_F_RING_INDIRECT_DESC)
+
+/* MAC_CAPAB_HCKSUM specifics of interest */
+#define VIONA_CAP_HCKSUM_INTEREST \
+ (HCKSUM_INET_PARTIAL | \
+ HCKSUM_INET_FULL_V4 | \
+ HCKSUM_INET_FULL_V6)
+
+static void *viona_state;
+static dev_info_t *viona_dip;
+static id_space_t *viona_minors;
+
+
+static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
+ void **result);
+static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
+static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
+static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
+ cred_t *credp, int *rval);
+static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp);
+
+static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
+static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
+
+static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t);
+static int viona_ioc_ring_init(viona_link_t *, void *, int);
+static int viona_ioc_ring_reset(viona_link_t *, uint_t);
+static int viona_ioc_ring_kick(viona_link_t *, uint_t);
+static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
+static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
+static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
+
+static struct cb_ops viona_cb_ops = {
+ viona_open,
+ viona_close,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ viona_ioctl,
+ nodev,
+ nodev,
+ nodev,
+ viona_chpoll,
+ ddi_prop_op,
+ 0,
+ D_MP | D_NEW | D_HOTPLUG,
+ CB_REV,
+ nodev,
+ nodev
+};
+
+static struct dev_ops viona_ops = {
+ DEVO_REV,
+ 0,
+ viona_info,
+ nulldev,
+ nulldev,
+ viona_attach,
+ viona_detach,
+ nodev,
+ &viona_cb_ops,
+ NULL,
+ ddi_power,
+ ddi_quiesce_not_needed
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ VIONA_NAME,
+ &viona_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modldrv, NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ viona_minors = id_space_create("viona_minors",
+ VIONA_CTL_MINOR + 1, UINT16_MAX);
+ viona_rx_init();
+ mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
+
+ ret = mod_install(&modlinkage);
+ if (ret != 0) {
+ ddi_soft_state_fini(&viona_state);
+ id_space_destroy(viona_minors);
+ viona_rx_fini();
+ mutex_destroy(&viona_force_copy_lock);
+ }
+
+ return (ret);
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ ret = mod_remove(&modlinkage);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ ddi_soft_state_fini(&viona_state);
+ id_space_destroy(viona_minors);
+ viona_rx_fini();
+ mutex_destroy(&viona_force_copy_lock);
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/* ARGSUSED */
+static int
+viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)viona_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+ return (error);
+}
+
+static int
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
+ DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ viona_neti_attach();
+
+ viona_dip = dip;
+ ddi_report_dev(viona_dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ dev_info_t *old_dip = viona_dip;
+
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
+
+ VERIFY(old_dip != NULL);
+
+ viona_neti_detach();
+ viona_dip = NULL;
+ ddi_remove_minor_node(old_dip, NULL);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
+{
+ int minor;
+ viona_soft_state_t *ss;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+#if 0
+ /*
+ * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
+ * Should the check be at open() or ioctl()?
+ */
+ if (drv_priv(credp) != 0) {
+ return (EPERM);
+ }
+#endif
+ if (getminor(*devp) != VIONA_CTL_MINOR) {
+ return (ENXIO);
+ }
+
+ minor = id_alloc_nosleep(viona_minors);
+ if (minor == -1) {
+ /* All minors are busy */
+ return (EBUSY);
+ }
+ if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
+ id_free(viona_minors, minor);
+ return (ENOMEM);
+ }
+
+ ss = ddi_get_soft_state(viona_state, minor);
+ mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
+ *devp = makedevice(getmajor(*devp), minor);
+
+ return (0);
+}
+
+static int
+viona_close(dev_t dev, int flag, int otype, cred_t *credp)
+{
+ int minor;
+ viona_soft_state_t *ss;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+
+ minor = getminor(dev);
+
+ ss = ddi_get_soft_state(viona_state, minor);
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ VERIFY0(viona_ioc_delete(ss, B_TRUE));
+ VERIFY(!list_link_active(&ss->ss_node));
+ ddi_soft_state_free(viona_state, minor);
+ id_free(viona_minors, minor);
+
+ return (0);
+}
+
+static int
+viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
+{
+ viona_soft_state_t *ss;
+ void *dptr = (void *)data;
+ int err = 0, val;
+ viona_link_t *link;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case VNA_IOC_CREATE:
+ return (viona_ioc_create(ss, dptr, md, cr));
+ case VNA_IOC_DELETE:
+ return (viona_ioc_delete(ss, B_FALSE));
+ default:
+ break;
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL || link->l_destroyed ||
+ vmm_drv_release_reqd(link->l_vm_hold)) {
+ mutex_exit(&ss->ss_lock);
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case VNA_IOC_GET_FEATURES:
+ val = VIONA_S_HOSTCAPS | link->l_features_hw;
+ if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
+ err = EFAULT;
+ }
+ break;
+ case VNA_IOC_SET_FEATURES:
+ if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
+ err = EFAULT;
+ break;
+ }
+ val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
+
+ if ((val & VIRTIO_NET_F_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_HOST_TSO4;
+
+ if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_GUEST_TSO4;
+
+ link->l_features = val;
+ break;
+ case VNA_IOC_RING_INIT:
+ err = viona_ioc_ring_init(link, dptr, md);
+ break;
+ case VNA_IOC_RING_RESET:
+ err = viona_ioc_ring_reset(link, (uint_t)data);
+ break;
+ case VNA_IOC_RING_KICK:
+ err = viona_ioc_ring_kick(link, (uint_t)data);
+ break;
+ case VNA_IOC_RING_SET_MSI:
+ err = viona_ioc_ring_set_msi(link, dptr, md);
+ break;
+ case VNA_IOC_RING_INTR_CLR:
+ err = viona_ioc_ring_intr_clear(link, (uint_t)data);
+ break;
+ case VNA_IOC_INTR_POLL:
+ err = viona_ioc_intr_poll(link, dptr, md, rv);
+ break;
+ case VNA_IOC_SET_NOTIFY_IOP:
+ err = viona_ioc_set_notify_ioport(link, (uint_t)data);
+ break;
+ default:
+ err = ENOTTY;
+ break;
+ }
+
+ mutex_exit(&ss->ss_lock);
+ return (err);
+}
+
+static int
+viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ viona_soft_state_t *ss;
+ viona_link_t *link;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL || link->l_destroyed) {
+ mutex_exit(&ss->ss_lock);
+ return (ENXIO);
+ }
+
+ *reventsp = 0;
+ if ((events & POLLRDBAND) != 0) {
+ for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
+ if (link->l_vrings[i].vr_intr_enabled != 0) {
+ *reventsp |= POLLRDBAND;
+ break;
+ }
+ }
+ }
+ if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &link->l_pollhead;
+ }
+ mutex_exit(&ss->ss_lock);
+
+ return (0);
+}
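+
+/*
+ * Illustrative sketch (not part of the driver): when MSI-X is unavailable,
+ * the userspace consumer is expected to service interrupts via this slow
+ * path, polling for POLLRDBAND and then fetching per-ring interrupt state:
+ *
+ *	struct pollfd pfd = { .fd = viona_fd, .events = POLLRDBAND };
+ *	(void) poll(&pfd, 1, -1);
+ *	vioc_intr_poll_t vip;
+ *	ioctl(viona_fd, VNA_IOC_INTR_POLL, &vip);
+ *	for each ring i with vip.vip_status[i] != 0: assert the interrupt,
+ *	then ioctl(viona_fd, VNA_IOC_RING_INTR_CLR, i) to re-arm notification.
+ */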
+
+static void
+viona_get_mac_capab(viona_link_t *link)
+{
+ mac_handle_t mh = link->l_mh;
+ uint32_t cap = 0;
+ mac_capab_lso_t lso_cap;
+
+ link->l_features_hw = 0;
+ if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
+ /*
+ * Only report HW checksum ability if the underlying MAC
+ * resource is capable of populating the L4 header.
+ */
+ if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
+ link->l_features_hw |= VIRTIO_NET_F_CSUM;
+ }
+ link->l_cap_csum = cap;
+ }
+
+ if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
+ mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
+ /*
+ * Virtio doesn't allow for negotiating a maximum LSO
+ * packet size. We have to assume that the guest may
+ * send a maximum length IP packet. Make sure the
+ * underlying MAC can handle an LSO of this size.
+ */
+ if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
+ lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
+ link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
+ }
+}
+
+static int
+viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
+{
+ vioc_create_t kvc;
+ viona_link_t *link = NULL;
+ char cli_name[MAXNAMELEN];
+ int err = 0;
+ file_t *fp;
+ vmm_hold_t *hold = NULL;
+ viona_neti_t *nip = NULL;
+ zoneid_t zid;
+
+ ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
+
+ if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
+ return (EFAULT);
+ }
+
+ zid = crgetzoneid(cr);
+ nip = viona_neti_lookup_by_zid(zid);
+ if (nip == NULL) {
+ return (EIO);
+ }
+
+ if (!nip->vni_nethook.vnh_hooked) {
+ viona_neti_rele(nip);
+ return (EIO);
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if (ss->ss_link != NULL) {
+ mutex_exit(&ss->ss_lock);
+ viona_neti_rele(nip);
+ return (EEXIST);
+ }
+
+ if ((fp = getf(kvc.c_vmfd)) == NULL) {
+ err = EBADF;
+ goto bail;
+ }
+ err = vmm_drv_hold(fp, cr, &hold);
+ releasef(kvc.c_vmfd);
+ if (err != 0) {
+ goto bail;
+ }
+
+ link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
+ link->l_linkid = kvc.c_linkid;
+ link->l_vm_hold = hold;
+
+ err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
+ if (err != 0) {
+ goto bail;
+ }
+
+ viona_get_mac_capab(link);
+
+ (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
+ link->l_linkid);
+ err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
+ if (err != 0) {
+ goto bail;
+ }
+
+ viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
+
+ if ((err = viona_rx_set(link)) != 0) {
+ viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
+ goto bail;
+ }
+
+ link->l_neti = nip;
+ ss->ss_link = link;
+ mutex_exit(&ss->ss_lock);
+
+ mutex_enter(&nip->vni_lock);
+ list_insert_tail(&nip->vni_dev_list, ss);
+ mutex_exit(&nip->vni_lock);
+
+ return (0);
+
+bail:
+ if (link != NULL) {
+ if (link->l_mch != NULL) {
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+ kmem_free(link, sizeof (viona_link_t));
+ }
+ if (hold != NULL) {
+ vmm_drv_rele(hold);
+ }
+ viona_neti_rele(nip);
+
+ mutex_exit(&ss->ss_lock);
+ return (err);
+}
+
+static int
+viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
+{
+ viona_link_t *link;
+ viona_neti_t *nip = NULL;
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL) {
+ /* Link destruction already complete */
+ mutex_exit(&ss->ss_lock);
+ return (0);
+ }
+
+ if (link->l_destroyed) {
+ /*
+ * Link destruction has been started by another thread, but has
+ * not completed. This condition should be impossible to
+ * encounter when performing the on-close destroy of the link,
+ * since racing ioctl accessors must necessarily be absent.
+ */
+ VERIFY(!on_close);
+ mutex_exit(&ss->ss_lock);
+ return (EAGAIN);
+ }
+ /*
+ * The link deletion cannot fail after this point, continuing until its
+ * successful completion is reached.
+ */
+ link->l_destroyed = B_TRUE;
+
+ /*
+ * Tear down the IO port hook so it cannot be used to kick any of the
+ * rings which are about to be reset and stopped.
+ */
+ VERIFY0(viona_ioc_set_notify_ioport(link, 0));
+ mutex_exit(&ss->ss_lock);
+
+ /*
+ * Return the rings to their reset state, ignoring any possible
+ * interruptions from signals.
+ */
+ VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
+ VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));
+
+ mutex_enter(&ss->ss_lock);
+ if (link->l_mch != NULL) {
+ /* Unhook the receive callbacks and close out the client */
+ viona_rx_clear(link);
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+ if (link->l_vm_hold != NULL) {
+ vmm_drv_rele(link->l_vm_hold);
+ link->l_vm_hold = NULL;
+ }
+
+ nip = link->l_neti;
+ link->l_neti = NULL;
+
+ viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
+ pollhead_clean(&link->l_pollhead);
+ ss->ss_link = NULL;
+ mutex_exit(&ss->ss_lock);
+
+ mutex_enter(&nip->vni_lock);
+ list_remove(&nip->vni_dev_list, ss);
+ mutex_exit(&nip->vni_lock);
+
+ viona_neti_rele(nip);
+
+ kmem_free(link, sizeof (viona_link_t));
+ return (0);
+}
+
+static int
+viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
+{
+ vioc_ring_init_t kri;
+ int err;
+
+ if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
+ return (EFAULT);
+ }
+
+ err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);
+
+ return (err);
+}
+
+static int
+viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
+{
+ viona_vring_t *ring;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+ ring = &link->l_vrings[idx];
+
+ return (viona_ring_reset(ring, B_TRUE));
+}
+
+static int
+viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
+{
+ viona_vring_t *ring;
+ int err;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+ ring = &link->l_vrings[idx];
+
+ mutex_enter(&ring->vr_lock);
+ switch (ring->vr_state) {
+ case VRS_SETUP:
+ /*
+ * An early kick to a ring which is starting its worker thread
+ * is fine. Once that thread is active, it will process the
+ * start-up request immediately.
+ */
+ /* FALLTHROUGH */
+ case VRS_INIT:
+ ring->vr_state_flags |= VRSF_REQ_START;
+ /* FALLTHROUGH */
+ case VRS_RUN:
+ cv_broadcast(&ring->vr_cv);
+ err = 0;
+ break;
+ default:
+ err = EBUSY;
+ break;
+ }
+ mutex_exit(&ring->vr_lock);
+
+ return (err);
+}
+
+static int
+viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
+{
+ vioc_ring_msi_t vrm;
+ viona_vring_t *ring;
+
+ if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
+ return (EFAULT);
+ }
+ if (vrm.rm_index >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+
+ ring = &link->l_vrings[vrm.rm_index];
+ mutex_enter(&ring->vr_lock);
+ ring->vr_msi_addr = vrm.rm_addr;
+ ring->vr_msi_msg = vrm.rm_msg;
+ mutex_exit(&ring->vr_lock);
+
+ return (0);
+}
+
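+/*
+ * Write callback installed via vmm_drv_ioport_hook(): a guest write of a
+ * queue index to the notify ioport is translated directly into a ring kick,
+ * without a trip to userland.
+ */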
+static int
+viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val)
+{
+ viona_link_t *link = (viona_link_t *)arg;
+ uint16_t vq = (uint16_t)val;
+
+ if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) {
+ return (EINVAL);
+ }
+ return (viona_ioc_ring_kick(link, vq));
+}
+
+static int
+viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport)
+{
+ int err = 0;
+
+ if (link->l_notify_ioport != 0) {
+ vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
+ link->l_notify_ioport = 0;
+ }
+
+ if (ioport != 0) {
+ err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL,
+ viona_notify_wcb, (void *)link, &link->l_notify_cookie);
+ if (err == 0) {
+ link->l_notify_ioport = ioport;
+ }
+ }
+ return (err);
+}
+
+static int
+viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
+{
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+
+ link->l_vrings[idx].vr_intr_enabled = 0;
+ return (0);
+}
+
+static int
+viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
+{
+ uint_t cnt = 0;
+ vioc_intr_poll_t vip;
+
+ for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
+ uint_t val = link->l_vrings[i].vr_intr_enabled;
+
+ vip.vip_status[i] = val;
+ if (val != 0) {
+ cnt++;
+ }
+ }
+
+ if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
+ return (EFAULT);
+ }
+ *rv = (int)cnt;
+ return (0);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c
new file mode 100644
index 0000000000..e535bfaa1a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_ring.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include <sys/disp.h>
+
+#include "viona_impl.h"
+
+#define VRING_ALIGN 4096
+#define VRING_MAX_LEN 32768
+
+static boolean_t viona_ring_map(viona_vring_t *);
+static void viona_ring_unmap(viona_vring_t *);
+static kthread_t *viona_create_worker(viona_vring_t *);
+
+static void *
+viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len)
+{
+ ASSERT3P(ring->vr_lease, !=, NULL);
+
+ return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len));
+}
+
+static boolean_t
+viona_ring_lease_expire_cb(void *arg)
+{
+ viona_vring_t *ring = arg;
+
+ cv_broadcast(&ring->vr_cv);
+
+ /* The lease will be broken asynchronously. */
+ return (B_FALSE);
+}
+
+static void
+viona_ring_lease_drop(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ if (ring->vr_lease != NULL) {
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+
+ /*
+ * Without an active lease, the ring mappings cannot be
+ * considered valid.
+ */
+ viona_ring_unmap(ring);
+
+ vmm_drv_lease_break(hold, ring->vr_lease);
+ ring->vr_lease = NULL;
+ }
+}
+
+boolean_t
+viona_ring_lease_renew(viona_vring_t *ring)
+{
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ viona_ring_lease_drop(ring);
+
+ /*
+ * Lease renewal will fail if the VM has requested that all holds be
+ * cleaned up.
+ */
+ ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
+ ring);
+ if (ring->vr_lease != NULL) {
+ /* A ring undergoing renewal will need valid guest mappings */
+ if (ring->vr_pa != 0 && ring->vr_size != 0) {
+ /*
+ * If new mappings cannot be established, consider the
+ * lease renewal a failure.
+ */
+ if (!viona_ring_map(ring)) {
+ viona_ring_lease_drop(ring);
+ return (B_FALSE);
+ }
+ }
+ }
+ return (ring->vr_lease != NULL);
+}
+
+void
+viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
+{
+ ring->vr_link = link;
+ mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
+}
+
+static void
+viona_ring_misc_free(viona_vring_t *ring)
+{
+ const uint_t qsz = ring->vr_size;
+
+ viona_tx_ring_free(ring, qsz);
+}
+
+void
+viona_ring_free(viona_vring_t *ring)
+{
+ mutex_destroy(&ring->vr_lock);
+ cv_destroy(&ring->vr_cv);
+ mutex_destroy(&ring->vr_a_mutex);
+ mutex_destroy(&ring->vr_u_mutex);
+ ring->vr_link = NULL;
+}
+
+int
+viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa)
+{
+ viona_vring_t *ring;
+ kthread_t *t;
+ int err = 0;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
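+ /* Queue size must be a non-zero power of 2, no larger than VRING_MAX_LEN */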
+ if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
+ return (EINVAL);
+ }
+
+ ring = &link->l_vrings[idx];
+ mutex_enter(&ring->vr_lock);
+ if (ring->vr_state != VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (EBUSY);
+ }
+ VERIFY(ring->vr_state_flags == 0);
+
+ ring->vr_lease = NULL;
+ if (!viona_ring_lease_renew(ring)) {
+ err = EBUSY;
+ goto fail;
+ }
+
+ ring->vr_size = qsz;
+ ring->vr_mask = (ring->vr_size - 1);
+ ring->vr_pa = pa;
+ if (!viona_ring_map(ring)) {
+ err = EINVAL;
+ goto fail;
+ }
+
+ /* Initialize queue indexes */
+ ring->vr_cur_aidx = 0;
+
+ if (idx == VIONA_VQ_TX) {
+ viona_tx_ring_alloc(ring, qsz);
+ }
+
+ /* Zero out MSI-X configuration */
+ ring->vr_msi_addr = 0;
+ ring->vr_msi_msg = 0;
+
+ /* Clear the stats */
+ bzero(&ring->vr_stats, sizeof (ring->vr_stats));
+
+ t = viona_create_worker(ring);
+ if (t == NULL) {
+ err = ENOMEM;
+ goto fail;
+ }
+ ring->vr_worker_thread = t;
+ ring->vr_state = VRS_SETUP;
+ cv_broadcast(&ring->vr_cv);
+ mutex_exit(&ring->vr_lock);
+ return (0);
+
+fail:
+ viona_ring_lease_drop(ring);
+ viona_ring_misc_free(ring);
+ ring->vr_size = 0;
+ ring->vr_mask = 0;
+ mutex_exit(&ring->vr_lock);
+ return (err);
+}
+
+int
+viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
+{
+ mutex_enter(&ring->vr_lock);
+ if (ring->vr_state == VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (0);
+ }
+
+ if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
+ ring->vr_state_flags |= VRSF_REQ_STOP;
+ cv_broadcast(&ring->vr_cv);
+ }
+ while (ring->vr_state != VRS_RESET) {
+ if (!heed_signals) {
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ } else {
+ int rs;
+
+ rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ if (rs <= 0 && ring->vr_state != VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (EINTR);
+ }
+ }
+ }
+ viona_ring_lease_drop(ring);
+ mutex_exit(&ring->vr_lock);
+ return (0);
+}
+
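+/*
+ * Establish host mappings for the three split-ring regions, which the guest
+ * lays out contiguously starting at vr_pa: the descriptor table, the avail
+ * ring, and (aligned up to VRING_ALIGN) the used ring. As a worked example,
+ * for a queue size of 1024 the descriptor table occupies 1024 * 16 = 16384
+ * bytes, the avail ring (flags, idx, 1024 entries, used_event) occupies
+ * (1024 + 3) * 2 = 2054 bytes immediately after it, and the used ring
+ * (flags, idx, 1024 8-byte entries, avail_event) begins at the next 4096-byte
+ * boundary.
+ */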
+static boolean_t
+viona_ring_map(viona_vring_t *ring)
+{
+ uint64_t pos = ring->vr_pa;
+ const uint16_t qsz = ring->vr_size;
+
+ ASSERT3U(qsz, !=, 0);
+ ASSERT3U(pos, !=, 0);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ const size_t desc_sz = qsz * sizeof (struct virtio_desc);
+ ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz);
+ if (ring->vr_descr == NULL) {
+ goto fail;
+ }
+ pos += desc_sz;
+
+ const size_t avail_sz = (qsz + 3) * sizeof (uint16_t);
+ ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz);
+ if (ring->vr_avail_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_avail_idx = ring->vr_avail_flags + 1;
+ ring->vr_avail_ring = ring->vr_avail_flags + 2;
+ ring->vr_avail_used_event = ring->vr_avail_ring + qsz;
+ pos += avail_sz;
+
+ const size_t used_sz = (qsz * sizeof (struct virtio_used)) +
+ (sizeof (uint16_t) * 3);
+ pos = P2ROUNDUP(pos, VRING_ALIGN);
+ ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz);
+ if (ring->vr_used_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_used_idx = ring->vr_used_flags + 1;
+ ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2);
+ ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz);
+
+ return (B_TRUE);
+
+fail:
+ viona_ring_unmap(ring);
+ return (B_FALSE);
+}
+
+static void
+viona_ring_unmap(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ ring->vr_descr = NULL;
+ ring->vr_avail_flags = NULL;
+ ring->vr_avail_idx = NULL;
+ ring->vr_avail_ring = NULL;
+ ring->vr_avail_used_event = NULL;
+ ring->vr_used_flags = NULL;
+ ring->vr_used_idx = NULL;
+ ring->vr_used_ring = NULL;
+ ring->vr_used_avail_event = NULL;
+}
+
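+/*
+ * Notify the guest of activity on this ring: deliver an MSI directly if one
+ * has been configured, otherwise flag the ring and wake any poll(2) waiters
+ * so that userspace can assert the interrupt.
+ */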
+void
+viona_intr_ring(viona_vring_t *ring)
+{
+ uint64_t addr;
+
+ mutex_enter(&ring->vr_lock);
+ /* Deliver the interrupt directly, if so configured. */
+ if ((addr = ring->vr_msi_addr) != 0) {
+ uint64_t msg = ring->vr_msi_msg;
+
+ mutex_exit(&ring->vr_lock);
+ (void) vmm_drv_msi(ring->vr_lease, addr, msg);
+ return;
+ }
+ mutex_exit(&ring->vr_lock);
+
+ if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
+ pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
+ }
+}
+
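+/*
+ * Entry point for the per-ring worker LWP: advances the ring from VRS_SETUP
+ * through VRS_INIT to VRS_RUN, dispatches to the RX or TX work routine, and
+ * returns the ring to VRS_RESET on shutdown.
+ */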
+static void
+viona_worker(void *arg)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+ viona_link_t *link = ring->vr_link;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&ring->vr_lock);
+ VERIFY3U(ring->vr_state, ==, VRS_SETUP);
+
+ /* Bail immediately if ring shutdown or process exit was requested */
+ if (VRING_NEED_BAIL(ring, p)) {
+ goto cleanup;
+ }
+
+ /* Report worker thread as alive and notify creator */
+ ring->vr_state = VRS_INIT;
+ cv_broadcast(&ring->vr_cv);
+
+ while (ring->vr_state_flags == 0) {
+ /*
+ * Keeping lease renewals timely while waiting for the ring to
+ * be started is important for avoiding deadlocks.
+ */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+
+ if (VRING_NEED_BAIL(ring, p)) {
+ goto cleanup;
+ }
+ }
+
+ ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
+ ring->vr_state = VRS_RUN;
+ ring->vr_state_flags &= ~VRSF_REQ_START;
+
+ /* Ensure ring lease is valid first */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
+ /* Process actual work */
+ if (ring == &link->l_vrings[VIONA_VQ_RX]) {
+ viona_worker_rx(ring, link);
+ } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
+ viona_worker_tx(ring, link);
+ } else {
+ panic("unexpected ring: %p", (void *)ring);
+ }
+
+cleanup:
+ if (ring->vr_txdesb != NULL) {
+ /*
+ * Transmit activity must be entirely concluded before the
+ * associated descriptors can be cleaned up.
+ */
+ VERIFY(ring->vr_xfer_outstanding == 0);
+ }
+ viona_ring_misc_free(ring);
+
+ viona_ring_lease_drop(ring);
+ ring->vr_cur_aidx = 0;
+ ring->vr_state = VRS_RESET;
+ ring->vr_state_flags = 0;
+ ring->vr_worker_thread = NULL;
+ cv_broadcast(&ring->vr_cv);
+ mutex_exit(&ring->vr_lock);
+
+ mutex_enter(&ttoproc(curthread)->p_lock);
+ lwp_exit();
+}
+
+static kthread_t *
+viona_create_worker(viona_vring_t *ring)
+{
+ k_sigset_t hold_set;
+ proc_t *p = curproc;
+ kthread_t *t;
+ klwp_t *lwp;
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT(ring->vr_state == VRS_RESET);
+
+ sigfillset(&hold_set);
+ lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
+ minclsyspri - 1, &hold_set, curthread->t_cid, 0);
+ if (lwp == NULL) {
+ return (NULL);
+ }
+
+ t = lwptot(lwp);
+ mutex_enter(&p->p_lock);
+ t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
+ lwp_create_done(t);
+ mutex_exit(&p->p_lock);
+
+ return (t);
+}
+
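+/*
+ * Pull the next chain of descriptors off the avail ring, translating each
+ * guest buffer into a host-virtual iovec. Returns the number of iovec
+ * entries filled (with *cookie set to the head descriptor index for a later
+ * vq_pushchain()), 0 if no descriptors are available, or -1 if the chain is
+ * malformed or exceeds niov entries.
+ */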
+int
+vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
+ uint16_t *cookie)
+{
+ uint_t i, ndesc, idx, head, next;
+ struct virtio_desc vdir;
+ void *buf;
+
+ ASSERT(iov != NULL);
+ ASSERT(niov > 0 && niov < INT_MAX);
+
+ mutex_enter(&ring->vr_a_mutex);
+ idx = ring->vr_cur_aidx;
+ ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx);
+
+ if (ndesc == 0) {
+ mutex_exit(&ring->vr_a_mutex);
+ return (0);
+ }
+ if (ndesc > ring->vr_size) {
+ /*
+ * Despite the fact that the guest has provided an 'avail_idx'
+ * which indicates that an impossible number of descriptors are
+ * available, continue on and attempt to process the next one.
+ *
+ * The transgression will not escape the probe or stats though.
+ */
+ VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
+ uint16_t, ndesc);
+ VIONA_RING_STAT_INCR(ring, ndesc_too_high);
+ }
+
+ head = ring->vr_avail_ring[idx & ring->vr_mask];
+ next = head;
+
+ for (i = 0; i < niov; next = vdir.vd_next) {
+ if (next >= ring->vr_size) {
+ VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
+ uint16_t, next);
+ VIONA_RING_STAT_INCR(ring, bad_idx);
+ goto bail;
+ }
+
+ vdir = ring->vr_descr[next];
+ if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+ if (vdir.vd_len == 0) {
+ VIONA_PROBE2(desc_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vdir.vd_len);
+ VIONA_RING_STAT_INCR(ring, desc_bad_len);
+ goto bail;
+ }
+ buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
+ if (buf == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
+ VIONA_RING_STAT_INCR(ring, bad_ring_addr);
+ goto bail;
+ }
+ iov[i].iov_base = buf;
+ iov[i].iov_len = vdir.vd_len;
+ i++;
+ } else {
+ const uint_t nindir = vdir.vd_len / 16;
+ volatile struct virtio_desc *vindir;
+
+ if ((vdir.vd_len & 0xf) || nindir == 0) {
+ VIONA_PROBE2(indir_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vdir.vd_len);
+ VIONA_RING_STAT_INCR(ring, indir_bad_len);
+ goto bail;
+ }
+ vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
+ if (vindir == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
+ VIONA_RING_STAT_INCR(ring, bad_ring_addr);
+ goto bail;
+ }
+ next = 0;
+ for (;;) {
+ struct virtio_desc vp;
+
+ /*
+ * A copy of the indirect descriptor is made
+ * here, rather than simply using a reference
+ * pointer. This prevents malicious or
+ * erroneous guest writes to the descriptor
+ * from fooling the flags/bounds verification
+ * through a race.
+ */
+ vp = vindir[next];
+ if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
+ VIONA_PROBE1(indir_bad_nest,
+ viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring,
+ indir_bad_nest);
+ goto bail;
+ } else if (vp.vd_len == 0) {
+ VIONA_PROBE2(desc_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vp.vd_len);
+ VIONA_RING_STAT_INCR(ring,
+ desc_bad_len);
+ goto bail;
+ }
+ buf = viona_gpa2kva(ring, vp.vd_addr,
+ vp.vd_len);
+ if (buf == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring,
+ vp.vd_addr);
+ VIONA_RING_STAT_INCR(ring,
+ bad_ring_addr);
+ goto bail;
+ }
+ iov[i].iov_base = buf;
+ iov[i].iov_len = vp.vd_len;
+ i++;
+
+ if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ if (i >= niov) {
+ goto loopy;
+ }
+
+ next = vp.vd_next;
+ if (next >= nindir) {
+ VIONA_PROBE3(indir_bad_next,
+ viona_vring_t *, ring,
+ uint16_t, next,
+ uint_t, nindir);
+ VIONA_RING_STAT_INCR(ring,
+ indir_bad_next);
+ goto bail;
+ }
+ }
+ }
+ if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
+ *cookie = head;
+ ring->vr_cur_aidx++;
+ mutex_exit(&ring->vr_a_mutex);
+ return (i);
+ }
+ }
+
+loopy:
+ VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring, too_many_desc);
+bail:
+ mutex_exit(&ring->vr_a_mutex);
+ return (-1);
+}
+
+void
+vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+ volatile struct virtio_used *vu;
+ uint_t uidx;
+
+ mutex_enter(&ring->vr_u_mutex);
+
+ uidx = *ring->vr_used_idx;
+ vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
+ vu->vu_idx = cookie;
+ vu->vu_tlen = len;
+ membar_producer();
+ *ring->vr_used_idx = uidx;
+
+ mutex_exit(&ring->vr_u_mutex);
+}
+
+void
+vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
+{
+ volatile struct virtio_used *vu;
+ uint_t uidx, i;
+
+ mutex_enter(&ring->vr_u_mutex);
+
+ uidx = *ring->vr_used_idx;
+ if (num_bufs == 1) {
+ vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
+ vu->vu_idx = elem[0].id;
+ vu->vu_tlen = elem[0].len;
+ } else {
+ for (i = 0; i < num_bufs; i++) {
+ vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask];
+ vu->vu_idx = elem[i].id;
+ vu->vu_tlen = elem[i].len;
+ }
+ uidx = uidx + num_bufs;
+ }
+ membar_producer();
+ *ring->vr_used_idx = uidx;
+
+ mutex_exit(&ring->vr_u_mutex);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c
new file mode 100644
index 0000000000..b354b201cb
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_rx.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/strsubr.h>
+
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
+#include <sys/vlan.h>
+
+#include "viona_impl.h"
+
+
+
+#define VTNET_MAXSEGS 32
+
+/* Min. octets in an ethernet frame minus FCS */
+#define MIN_BUF_SIZE 60
+#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ)
+
+static mblk_t *viona_vlan_pad_mp;
+
+void
+viona_rx_init(void)
+{
+ mblk_t *mp;
+
+ ASSERT(viona_vlan_pad_mp == NULL);
+
+ /* Create mblk for padding when VLAN tags are stripped */
+ mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL);
+ bzero(mp->b_rptr, VLAN_TAGSZ);
+ mp->b_wptr += VLAN_TAGSZ;
+ viona_vlan_pad_mp = mp;
+}
+
+void
+viona_rx_fini(void)
+{
+ mblk_t *mp;
+
+ /* Clean up the VLAN padding mblk */
+ mp = viona_vlan_pad_mp;
+ viona_vlan_pad_mp = NULL;
+ VERIFY(mp != NULL && mp->b_cont == NULL);
+ freemsg(mp);
+}
+
+void
+viona_worker_rx(viona_vring_t *ring, viona_link_t *link)
+{
+ proc_t *p = ttoproc(curthread);
+
+ (void) thread_vsetname(curthread, "viona_rx_%p", ring);
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+ *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+
+ do {
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ /*
+ * Set the renewal flag, causing incoming traffic to be
+ * dropped, and issue an RX barrier to ensure any
+ * threads in the RX callbacks will have finished.
+ * The vr_lock cannot be held across the barrier as it
+ * poses a deadlock risk.
+ */
+ ring->vr_state_flags |= VRSF_RENEW;
+ mutex_exit(&ring->vr_lock);
+ mac_rx_barrier(link->l_mch);
+ mutex_enter(&ring->vr_lock);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
+ }
+
+ /*
+ * For now, there is little to do in the RX worker as inbound
+ * data is delivered by MAC via the RX callbacks. If tap-like
+ * functionality is added later, this would be a convenient
+ * place to inject frames into the guest.
+ */
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ } while (!VRING_NEED_BAIL(ring, p));
+
+ *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+}
+
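+/*
+ * Copy up to 'len' bytes from the mblk chain into 'buf', skipping the first
+ * 'seek' bytes of the chain. Returns the number of bytes copied and sets
+ * *end when the end of the chain has been reached.
+ */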
+static size_t
+viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len,
+ boolean_t *end)
+{
+ size_t copied = 0;
+ size_t off = 0;
+
+ /* Seek past already-consumed data */
+ while (seek > 0 && mp != NULL) {
+ const size_t chunk = MBLKL(mp);
+
+ if (chunk > seek) {
+ off = seek;
+ break;
+ }
+ mp = mp->b_cont;
+ seek -= chunk;
+ }
+
+ while (mp != NULL) {
+ const size_t chunk = MBLKL(mp) - off;
+ const size_t to_copy = MIN(chunk, len);
+
+ bcopy(mp->b_rptr + off, buf, to_copy);
+ copied += to_copy;
+ buf += to_copy;
+ len -= to_copy;
+
+ /*
+ * If all the remaining data in the mblk_t was copied, move on
+ * to the next one in the chain. Any seek offset applied to
+ * the first mblk copy is zeroed out for subsequent operations.
+ */
+ if (chunk == to_copy) {
+ mp = mp->b_cont;
+ off = 0;
+ }
+#ifdef DEBUG
+ else {
+ /*
+ * The only valid reason for the copy to consume less
+ * than the entire contents of the mblk_t is because
+ * the output buffer has been filled.
+ */
+ ASSERT0(len);
+ }
+#endif
+
+ /* Go no further if the buffer has been filled */
+ if (len == 0) {
+ break;
+ }
+ }
+ *end = (mp == NULL);
+ return (copied);
+}
+
+static int
+viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
+{
+ struct iovec iov[VTNET_MAXSEGS];
+ uint16_t cookie;
+ int n;
+ const size_t hdr_sz = sizeof (struct virtio_net_hdr);
+ struct virtio_net_hdr *hdr;
+ size_t len, copied = 0;
+ caddr_t buf = NULL;
+ boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
+
+ ASSERT(msz >= MIN_BUF_SIZE);
+
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /* Without available buffers, the frame must be dropped. */
+ return (ENOSPC);
+ }
+ if (iov[0].iov_len < hdr_sz) {
+ /*
+ * There is little to do if there is not even space available
+ * for the sole header. Zero the buffer and bail out as a last
+ * act of desperation.
+ */
+ bzero(iov[0].iov_base, iov[0].iov_len);
+ goto bad_frame;
+ }
+
+ /* Grab the address of the header before anything else */
+ hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+
+ /*
+ * If there is any space remaining in the first buffer after writing
+ * the header, fill it with frame data.
+ */
+ if (iov[0].iov_len > hdr_sz) {
+ buf = (caddr_t)iov[0].iov_base + hdr_sz;
+ len = iov[0].iov_len - hdr_sz;
+
+ copied += viona_copy_mblk(mp, copied, buf, len, &end);
+ }
+
+ /* Copy any remaining data into subsequent buffers, if present */
+ for (int i = 1; i < n && !end; i++) {
+ buf = (caddr_t)iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ copied += viona_copy_mblk(mp, copied, buf, len, &end);
+ }
+
+ /* Was the expected amount of data copied? */
+ if (copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ goto bad_frame;
+ }
+
+ /* Populate (read: zero) the header and account for it in the size */
+ bzero(hdr, hdr_sz);
+ copied += hdr_sz;
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+ /* Release this chain */
+ vq_pushchain(ring, copied, cookie);
+ return (0);
+
+bad_frame:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie,
+ mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+
+ vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie);
+ return (EINVAL);
+}
+
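+/*
+ * Deliver a frame to the guest using as many descriptor chains as required,
+ * recording the buffer count in the leading virtio_net_mrgrxhdr.  Used when
+ * VIRTIO_NET_F_MRG_RXBUF has been negotiated.
+ */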
+static int
+viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
+{
+ struct iovec iov[VTNET_MAXSEGS];
+ used_elem_t uelem[VTNET_MAXSEGS];
+ int n, i = 0, buf_idx = 0, err = 0;
+ uint16_t cookie;
+ caddr_t buf;
+ size_t len, copied = 0, chunk = 0;
+ struct virtio_net_mrgrxhdr *hdr = NULL;
+ const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr);
+ boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
+
+ ASSERT(msz >= MIN_BUF_SIZE);
+
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /* Without available buffers, the frame must be dropped. */
+ VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, no_space);
+ return (ENOSPC);
+ }
+ if (iov[0].iov_len < hdr_sz) {
+ /*
+ * There is little to do if there is not even space available
+ * for the sole header. Zero the buffer and bail out as a last
+ * act of desperation.
+ */
+ bzero(iov[0].iov_base, iov[0].iov_len);
+ uelem[0].id = cookie;
+ uelem[0].len = iov[0].iov_len;
+ err = EINVAL;
+ goto done;
+ }
+
+ /* Grab the address of the header and do initial population */
+ hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
+ bzero(hdr, hdr_sz);
+ hdr->vrh_bufs = 1;
+
+ /*
+ * If there is any space remaining in the first buffer after writing
+ * the header, fill it with frame data.
+ */
+ if (iov[0].iov_len > hdr_sz) {
+ buf = iov[0].iov_base + hdr_sz;
+ len = iov[0].iov_len - hdr_sz;
+
+ chunk += viona_copy_mblk(mp, copied, buf, len, &end);
+ copied += chunk;
+ }
+ i = 1;
+
+ do {
+ while (i < n && !end) {
+ buf = iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ chunk += viona_copy_mblk(mp, copied, buf, len, &end);
+ copied += chunk;
+ i++;
+ }
+
+ uelem[buf_idx].id = cookie;
+ uelem[buf_idx].len = chunk;
+
+ /*
+ * Try to grab another buffer from the ring if the mblk has not
+ * yet been entirely copied out.
+ */
+ if (!end) {
+ if (buf_idx == (VTNET_MAXSEGS - 1)) {
+ /*
+ * Our arbitrary limit on the number of buffers
+ * to offer for merge has already been reached.
+ */
+ err = EOVERFLOW;
+ break;
+ }
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /*
+ * Without more immediate space to perform the
+ * copying, there is little choice left but to
+ * drop the packet.
+ */
+ err = EMSGSIZE;
+ break;
+ }
+ chunk = 0;
+ i = 0;
+ buf_idx++;
+ /*
+ * Keep the header up-to-date with the number of
+ * buffers, but never reference its value since the
+ * guest could meddle with it.
+ */
+ hdr->vrh_bufs++;
+ }
+ } while (!end && copied < msz);
+
+ /* Account for the header size in the first buffer */
+ uelem[0].len += hdr_sz;
+
+ /*
+	 * If no other errors were encountered during the copy, was the
+	 * expected amount of data transferred?
+ */
+ if (err == 0 && copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ err = EINVAL;
+ }
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+done:
+ switch (err) {
+ case 0:
+ /* Success can fall right through to ring delivery */
+ break;
+
+ case EMSGSIZE:
+ VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
+ break;
+
+ case EOVERFLOW:
+ VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
+ break;
+
+ default:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+ }
+ vq_pushchain_many(ring, buf_idx + 1, uelem);
+ return (err);
+}
+
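+/*
+ * Common RX delivery path: perform any required LSO/checksum emulation, run
+ * frames through the hook framework, pad short frames up to the ethernet
+ * minimum, copy them into the guest ring, and notify the guest when done.
+ */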
+static void
+viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
+{
+ viona_link_t *link = ring->vr_link;
+ mblk_t *mprx = NULL, **mprx_prevp = &mprx;
+ mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
+ const boolean_t do_merge =
+ ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+ const boolean_t guest_csum =
+ ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0);
+ const boolean_t guest_tso4 =
+ ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0);
+
+ size_t nrx = 0, ndrop = 0;
+
+ /*
+ * The mac_hw_emul() function, by design, doesn't predicate on
+ * HW_LOCAL_MAC. Since we are in Rx context we know that any
+ * LSO packet must also be from a same-machine sender. We take
+	 * advantage of that and forgo writing a manual loop to
+ * predicate on HW_LOCAL_MAC.
+ *
+ * For checksum emulation we need to predicate on HW_LOCAL_MAC
+ * to avoid calling mac_hw_emul() on packets that don't need
+ * it (thanks to the fact that HCK_IPV4_HDRCKSUM and
+ * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do
+ * the checksum emulation in the second loop.
+ */
+ if (!guest_tso4)
+ mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
+
+ while (mp != NULL) {
+ mblk_t *next, *pad = NULL;
+ size_t size;
+ int err = 0;
+
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ /*
+ * The VIRTIO_NET_HDR_F_DATA_VALID flag only
+ * covers the ULP checksum -- so we still have
+ * to populate the IP header checksum.
+ */
+ if (guest_csum) {
+ mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL);
+ } else {
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ }
+
+ if (mp == NULL) {
+ mp = next;
+ continue;
+ }
+ }
+
+ size = msgsize(mp);
+
+ /*
+ * We treat both a 'drop' response and errors the same here
+ * and put the packet on the drop chain. As packets may be
+ * subject to different actions in ipf (which do not all
+ * return the same set of error values), an error processing
+ * one packet doesn't mean the next packet will also generate
+ * an error.
+ */
+ if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
+ viona_hook(link, ring, &mp, B_FALSE) != 0) {
+ if (mp != NULL) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+ } else {
+ /*
+ * If the hook consumer (e.g. ipf) already
+ * freed the mblk_t, update the drop count now.
+ */
+ ndrop++;
+ }
+ mp = next;
+ continue;
+ }
+
+ /*
+ * Ethernet frames are expected to be padded out in order to
+ * meet the minimum size.
+ *
+ * A special case is made for frames which are short by
+ * VLAN_TAGSZ, having been stripped of their VLAN tag while
+ * traversing MAC. A preallocated (and recycled) mblk is used
+ * for that specific condition.
+ *
+	 * All other frames that fall short on length will have
+	 * custom-allocated zero-padding appended to them.
+ */
+ if (size == NEED_VLAN_PAD_SIZE) {
+ ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ);
+ ASSERT(viona_vlan_pad_mp->b_cont == NULL);
+
+ for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont)
+ ;
+
+ pad->b_cont = viona_vlan_pad_mp;
+ size += VLAN_TAGSZ;
+ } else if (size < MIN_BUF_SIZE) {
+ const size_t pad_size = MIN_BUF_SIZE - size;
+ mblk_t *zero_mp;
+
+ zero_mp = allocb(pad_size, BPRI_MED);
+ if (zero_mp == NULL) {
+ err = ENOMEM;
+ goto pad_drop;
+ }
+
+ VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring,
+ mblk_t *, mp, size_t, pad_size);
+ VIONA_RING_STAT_INCR(ring, rx_pad_short);
+ zero_mp->b_wptr += pad_size;
+ bzero(zero_mp->b_rptr, pad_size);
+ linkb(mp, zero_mp);
+ size += pad_size;
+ }
+
+ if (do_merge) {
+ err = viona_recv_merged(ring, mp, size);
+ } else {
+ err = viona_recv_plain(ring, mp, size);
+ }
+
+ /*
+ * The VLAN padding mblk is meant for continual reuse, so
+ * remove it from the chain to prevent it from being freed.
+ *
+ * Custom allocated padding does not require this treatment and
+ * is freed normally.
+ */
+ if (pad != NULL) {
+ pad->b_cont = NULL;
+ }
+
+pad_drop:
+ /*
+	 * While RX processing (viona_recv_{merged,plain}) never frees mp on
+	 * error, hook processing might or might not free mp. Handle either
+ * scenario -- if mp is not yet free, it is queued up and
+ * freed after the guest has been notified. If mp is
+ * already NULL, just proceed on.
+ */
+ if (err != 0) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+
+ /*
+ * If the available ring is empty, do not bother
+ * attempting to deliver any more frames. Count the
+ * rest as dropped too.
+ */
+ if (err == ENOSPC) {
+ mp->b_next = next;
+ break;
+ }
+ } else {
+ /* Chain successful mblks to be freed later */
+ *mprx_prevp = mp;
+ mprx_prevp = &mp->b_next;
+ nrx++;
+ }
+ mp = next;
+ }
+
+ membar_enter();
+ if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ viona_intr_ring(ring);
+ }
+
+ /* Free successfully received frames */
+ if (mprx != NULL) {
+ freemsgchain(mprx);
+ }
+
+ /* Free dropped frames, also tallying them */
+ mp = mpdrop;
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+
+ mp->b_next = NULL;
+ freemsg(mp);
+ mp = next;
+ ndrop++;
+ }
+ VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop);
+}
+
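+/*
+ * MAC RX callback for fully-classified traffic destined for the guest.
+ */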
+static void
+viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t is_loopback)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
+ freemsgchain(mp);
+ return;
+ }
+
+ viona_rx_common(ring, mp, is_loopback);
+}
+
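+/*
+ * MAC promiscuous (MAC_CLIENT_PROMISC_MULTI) callback used to capture
+ * multicast traffic for the guest.
+ */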
+static void
+viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t is_loopback)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+ mac_handle_t mh = ring->vr_link->l_mh;
+ mblk_t *mp_mcast_only = NULL;
+ mblk_t **mpp = &mp_mcast_only;
+
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
+ freemsgchain(mp);
+ return;
+ }
+
+ /*
+ * In addition to multicast traffic, broadcast packets will also arrive
+ * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback
+ * for fully-classified traffic has already delivered that broadcast
+ * traffic, so it should be suppressed here, rather than duplicating it
+ * to the guest.
+ */
+ while (mp != NULL) {
+ mblk_t *mp_next;
+ mac_header_info_t mhi;
+ int err;
+
+ mp_next = mp->b_next;
+ mp->b_next = NULL;
+
+ /* Determine the packet type */
+ err = mac_vlan_header_info(mh, mp, &mhi);
+ if (err != 0) {
+ mblk_t *pull;
+
+ /*
+ * It is possible that gathering of the header
+ * information was impeded by a leading mblk_t which
+ * was of inadequate length to reference the needed
+ * fields. Try again, in case that could be solved
+ * with a pull-up.
+ */
+ pull = msgpullup(mp, sizeof (struct ether_vlan_header));
+ if (pull == NULL) {
+ err = ENOMEM;
+ } else {
+ err = mac_vlan_header_info(mh, pull, &mhi);
+ freemsg(pull);
+ }
+
+ if (err != 0) {
+ VIONA_RING_STAT_INCR(ring, rx_mcast_check);
+ }
+ }
+
+ /* Chain up matching packets while discarding others */
+ if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) {
+ *mpp = mp;
+ mpp = &mp->b_next;
+ } else {
+ freemsg(mp);
+ }
+
+ mp = mp_next;
+ }
+
+ if (mp_mcast_only != NULL) {
+ viona_rx_common(ring, mp_mcast_only, is_loopback);
+ }
+}
+
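+/*
+ * Attach RX handlers to the MAC client: fully-classified traffic via
+ * mac_rx_set() and multicast via a MAC_CLIENT_PROMISC_MULTI callback.
+ */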
+int
+viona_rx_set(viona_link_t *link)
+{
+ viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX];
+ int err;
+
+ mac_rx_set(link->l_mch, viona_rx_classified, ring);
+ err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI,
+ viona_rx_mcast, ring, &link->l_mph,
+ MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
+ if (err != 0) {
+ mac_rx_clear(link->l_mch);
+ }
+
+ return (err);
+}
+
+void
+viona_rx_clear(viona_link_t *link)
+{
+ mac_promisc_remove(link->l_mph);
+ mac_rx_clear(link->l_mch);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c
new file mode 100644
index 0000000000..843435c67d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_tx.c
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include <sys/types.h>
+#include <sys/smt.h>
+#include <sys/strsubr.h>
+
+#include <sys/pattr.h>
+#include <sys/dlpi.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+
+#include "viona_impl.h"
+
+#define BNXE_NIC_DRIVER "bnxe"
+
+/*
+ * Copy TX mblks from the virtio ring instead of loaning guest memory, so
+ * that descriptors need not wait on packet transmission before being
+ * reclaimed.
+ */
+kmutex_t viona_force_copy_lock;
+static enum viona_force_copy {
+ VFC_UNINITALIZED = 0,
+ VFC_COPY_UNEEDED = 1,
+ VFC_COPY_REQUIRED = 2,
+} viona_force_copy_state = VFC_UNINITALIZED;
+
+struct viona_desb {
+ frtn_t d_frtn;
+ viona_vring_t *d_ring;
+ uint_t d_ref;
+ uint32_t d_len;
+ uint16_t d_cookie;
+ uchar_t *d_headers;
+};
+
+static void viona_tx(viona_link_t *, viona_vring_t *);
+static void viona_desb_release(viona_desb_t *);
+
+/*
+ * Return the number of available descriptors in the vring taking care of the
+ * 16-bit index wraparound.
+ *
+ * Note: If the number of apparently available descriptors is larger than the
+ * ring size (due to guest misbehavior), this check will still report the
+ * positive count of descriptors.
+ */
+static inline uint_t
+viona_vr_num_avail(viona_vring_t *ring)
+{
+ uint16_t ndesc;
+
+ /*
+	 * We're just computing (a-b) in GF(2^16).
+ *
+ * The only glitch here is that in standard C, uint16_t promotes to
+ * (signed) int when int has more than 16 bits (almost always now).
+ * A cast back to unsigned is necessary for proper operation.
+ */
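+	/*
+	 * Example (values chosen for illustration): with *vr_avail_idx == 2
+	 * and vr_cur_aidx == 0xfffe, the unsigned 16-bit subtraction yields
+	 * 4 available descriptors despite the index wraparound.
+	 */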
+ ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
+
+ return (ndesc);
+}
+
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ while (ring->vr_xfer_outstanding != 0) {
+ /*
+ * Paying heed to signals is counterproductive here. This is a
+ * very tight loop if pending transfers take an extended amount
+ * of time to be reclaimed while the host process is exiting.
+ */
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ }
+}
+
+/*
+ * Check if full TX packet copying is needed. This should not be called from
+ * viona attach()/detach() context.
+ */
+static boolean_t
+viona_tx_copy_needed(void)
+{
+ boolean_t result;
+
+ mutex_enter(&viona_force_copy_lock);
+ if (viona_force_copy_state == VFC_UNINITALIZED) {
+ major_t bnxe_major;
+
+ /*
+ * The original code for viona featured an explicit check for
+ * the bnxe driver which, when found present, necessitated that
+ * all transmissions be copied into their own mblks instead of
+ * passing guest memory to the underlying device.
+ *
+ * The motivations for this are unclear, but until it can be
+ * proven unnecessary, the check lives on.
+ */
+ viona_force_copy_state = VFC_COPY_UNEEDED;
+ if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
+ != DDI_MAJOR_T_NONE) {
+ if (ddi_hold_installed_driver(bnxe_major) != NULL) {
+ viona_force_copy_state = VFC_COPY_REQUIRED;
+ ddi_rele_driver(bnxe_major);
+ }
+ }
+ }
+ result = (viona_force_copy_state == VFC_COPY_REQUIRED);
+ mutex_exit(&viona_force_copy_lock);
+
+ return (result);
+}
+
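+/*
+ * Allocate per-ring TX resources: an array of desb handles (when guest
+ * buffers may be loaned to MAC rather than copied) and a ring-sized iovec
+ * array for vq_popchain().
+ */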
+void
+viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
+{
+	/* Allocate desb handles for TX unless full packet copying is required */
+ if (!viona_tx_copy_needed()) {
+ viona_desb_t *dp;
+
+ dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
+ ring->vr_txdesb = dp;
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ dp->d_frtn.free_func = viona_desb_release;
+ dp->d_frtn.free_arg = (void *)dp;
+ dp->d_ring = ring;
+ dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
+ KM_SLEEP);
+ }
+ }
+
+ /* Allocate ring-sized iovec buffers for TX */
+ ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
+}
+
+void
+viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
+{
+ if (ring->vr_txdesb != NULL) {
+ viona_desb_t *dp = ring->vr_txdesb;
+
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
+ }
+ kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
+ ring->vr_txdesb = NULL;
+ }
+
+ if (ring->vr_txiov != NULL) {
+ kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
+ ring->vr_txiov = NULL;
+ }
+}
+
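+/*
+ * Return a completed TX descriptor chain to the used ring and interrupt the
+ * guest unless it has suppressed notifications.
+ */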
+static void
+viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+ vq_pushchain(ring, len, cookie);
+
+ membar_enter();
+ if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ viona_intr_ring(ring);
+ }
+}
+
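+/*
+ * Worker thread body for the TX ring: transmit available descriptor chains
+ * via viona_tx() with guest notifications suppressed, then sleep on the ring
+ * cv until kicked again, renewing the vmm lease or bailing out as needed.
+ */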
+void
+viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
+{
+ proc_t *p = ttoproc(curthread);
+
+ (void) thread_vsetname(curthread, "viona_tx_%p", ring);
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+ mutex_exit(&ring->vr_lock);
+
+ for (;;) {
+ boolean_t bail = B_FALSE;
+ boolean_t renew = B_FALSE;
+ uint_t ntx = 0;
+
+ *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+ while (viona_vr_num_avail(ring)) {
+ viona_tx(link, ring);
+
+ /*
+ * It is advantageous for throughput to keep this
+ * transmission loop tight, but periodic breaks to
+ * check for other events are of value too.
+ */
+ if (ntx++ >= ring->vr_size)
+ break;
+ }
+ *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
+
+ /*
+ * Check for available descriptors on the ring once more in
+ * case a late addition raced with the NO_NOTIFY flag toggle.
+ *
+ * The barrier ensures that visibility of the vr_used_flags
+ * store does not cross the viona_vr_num_avail() check below.
+ */
+ membar_enter();
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ if (!bail && !renew && viona_vr_num_avail(ring)) {
+ continue;
+ }
+
+ if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
+ viona_intr_ring(ring);
+ }
+
+ mutex_enter(&ring->vr_lock);
+
+ while (!bail && !renew && !viona_vr_num_avail(ring)) {
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ }
+
+ if (bail) {
+ break;
+ } else if (renew) {
+ ring->vr_state_flags |= VRSF_RENEW;
+ /*
+ * When renewing the lease for the ring, no TX
+ * frames may be outstanding, as they contain
+ * references to guest memory.
+ */
+ viona_tx_wait_outstanding(ring);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
+ }
+ mutex_exit(&ring->vr_lock);
+ }
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ viona_tx_wait_outstanding(ring);
+}
+
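+/*
+ * Free routine for desballoc()ed TX mblks which reference guest memory.
+ * Once only the initial hold from viona_tx() remains, the handle is reset,
+ * the chain is returned to the guest, and the count of outstanding transfers
+ * is dropped, waking any waiters once it reaches zero.
+ */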
+static void
+viona_desb_release(viona_desb_t *dp)
+{
+ viona_vring_t *ring = dp->d_ring;
+ uint_t ref;
+ uint32_t len;
+ uint16_t cookie;
+
+ ref = atomic_dec_uint_nv(&dp->d_ref);
+ if (ref > 1) {
+ return;
+ }
+
+ /*
+ * The desb corresponding to this index must be ready for reuse before
+ * the descriptor is returned to the guest via the 'used' ring.
+ */
+ len = dp->d_len;
+ cookie = dp->d_cookie;
+ dp->d_len = 0;
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+
+ viona_tx_done(ring, len, cookie);
+
+ mutex_enter(&ring->vr_lock);
+ if ((--ring->vr_xfer_outstanding) == 0) {
+ cv_broadcast(&ring->vr_cv);
+ }
+ mutex_exit(&ring->vr_lock);
+}
+
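+/*
+ * Validate the checksum offsets supplied by the guest and translate its
+ * virtio checksum/LSO request into MAC hardware-offload flags on the mblk.
+ * Returns B_FALSE if the request cannot be satisfied by the underlying link.
+ */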
+static boolean_t
+viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
+ mblk_t *mp, uint32_t len)
+{
+ viona_link_t *link = ring->vr_link;
+ const struct ether_header *eth;
+ uint_t eth_len = sizeof (struct ether_header);
+ ushort_t ftype;
+ ipha_t *ipha = NULL;
+ uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
+ uint16_t flags = 0;
+ const uint_t csum_start = hdr->vrh_csum_start;
+ const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;
+
+ /*
+ * Validate that the checksum offsets provided by the guest are within
+ * the bounds of the packet. Additionally, ensure that the checksum
+ * contents field is within the headers mblk copied by viona_tx().
+ */
+ if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
+ (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
+ VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum);
+ return (B_FALSE);
+ }
+
+ /*
+ * This is guaranteed to be safe thanks to the header copying
+ * done in viona_tx().
+ */
+ eth = (const struct ether_header *)mp->b_rptr;
+ ftype = ntohs(eth->ether_type);
+
+ if (ftype == ETHERTYPE_VLAN) {
+ const struct ether_vlan_header *veth;
+
+ /* punt on QinQ for now */
+ eth_len = sizeof (struct ether_vlan_header);
+ veth = (const struct ether_vlan_header *)eth;
+ ftype = ntohs(veth->ether_type);
+ }
+
+ if (ftype == ETHERTYPE_IP) {
+ ipha = (ipha_t *)(mp->b_rptr + eth_len);
+
+ ipproto = ipha->ipha_protocol;
+ } else if (ftype == ETHERTYPE_IPV6) {
+ ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
+
+ ipproto = ip6h->ip6_nxt;
+ }
+
+ /*
+ * We ignore hdr_len because the spec says it can't be
+ * trusted. Besides, our own stack will determine the header
+ * boundary.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
+ ftype == ETHERTYPE_IP) {
+ uint16_t *cksump;
+ uint32_t cksum;
+ ipaddr_t src = ipha->ipha_src;
+ ipaddr_t dst = ipha->ipha_dst;
+
+ /*
+ * Our native IP stack doesn't set the L4 length field
+ * of the pseudo header when LSO is in play. Other IP
+ * stacks, e.g. Linux, do include the length field.
+ * This is a problem because the hardware expects that
+ * the length field is not set. When it is set it will
+ * cause an incorrect TCP checksum to be generated.
+ * The reason this works in Linux is because Linux
+ * corrects the pseudo-header checksum in the driver
+ * code. In order to get the correct HW checksum we
+ * need to assume the guest's IP stack gave us a bogus
+ * TCP partial checksum and calculate it ourselves.
+ */
+ cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
+ cksum = IP_TCP_CSUM_COMP;
+ cksum += (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+ /*
+ * Since viona is a "legacy device", the data stored
+ * by the driver will be in the guest's native endian
+ * format (see sections 2.4.3 and 5.1.6.1 of the
+ * VIRTIO 1.0 spec for more info). At this time the
+ * only guests using viona are x86 and we can assume
+ * little-endian.
+ */
+ lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
+
+ /*
+ * Hardware, like ixgbe, expects the client to request
+ * IP header checksum offload if it's sending LSO (see
+ * ixgbe_get_context()). Unfortunately, virtio makes
+ * no allowances for negotiating IP header checksum
+ * and HW offload, only TCP checksum. We add the flag
+ * and zero-out the checksum field. This mirrors the
+ * behavior of our native IP stack (which does this in
+ * the interest of HW that expects the field to be
+ * zero).
+ */
+ flags |= HCK_IPV4_HDRCKSUM;
+ ipha->ipha_hdr_checksum = 0;
+ }
+
+ /*
+ * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
+ * HW_LSO, if present, is not lost.
+ */
+ flags |= DB_CKSUMFLAGS(mp);
+
+ /*
+ * Partial checksum support from the NIC is ideal, since it most
+ * closely maps to the interface defined by virtio.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ /*
+ * MAC expects these offsets to be relative to the
+ * start of the L3 header rather than the L2 frame.
+ */
+ flags |= HCK_PARTIALCKSUM;
+ mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
+ len - eth_len, 0, flags);
+ return (B_TRUE);
+ }
+
+ /*
+ * Without partial checksum support, look to the L3/L4 protocol
+ * information to see if the NIC can handle it. If not, the
+ * checksum will need to be calculated inline.
+ */
+ if (ftype == ETHERTYPE_IP) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming? */
+ VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum);
+ return (B_FALSE);
+ } else if (ftype == ETHERTYPE_IPV6) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming? */
+ VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum6);
+ return (B_FALSE);
+ }
+
+ /* Cannot even emulate hcksum for unrecognized protocols */
+ VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
+ return (B_FALSE);
+}
+
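+/*
+ * Transmit a single descriptor chain from the guest: copy at least the
+ * packet headers into a local mblk, loan (via desballoc) or copy the
+ * remaining guest buffers, apply hook and checksum processing, and hand the
+ * result to mac_tx().
+ */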
+static void
+viona_tx(viona_link_t *link, viona_vring_t *ring)
+{
+ struct iovec *iov = ring->vr_txiov;
+ const uint_t max_segs = ring->vr_size;
+ uint16_t cookie;
+ int i, n;
+ uint32_t len, base_off = 0;
+ uint32_t min_copy = VIONA_MAX_HDRS_LEN;
+ mblk_t *mp_head, *mp_tail, *mp;
+ viona_desb_t *dp = NULL;
+ mac_client_handle_t link_mch = link->l_mch;
+ const struct virtio_net_hdr *hdr;
+
+ mp_head = mp_tail = NULL;
+
+ ASSERT(iov != NULL);
+
+ n = vq_popchain(ring, iov, max_segs, &cookie);
+ if (n == 0) {
+ VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring, tx_absent);
+ return;
+ } else if (n < 0) {
+ /*
+ * Any error encountered in vq_popchain has already resulted in
+ * specific probe and statistic handling. Further action here
+ * is unnecessary.
+ */
+ return;
+ }
+
+ /* Grab the header and ensure it is of adequate length */
+ hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
+ len = iov[0].iov_len;
+ if (len < sizeof (struct virtio_net_hdr)) {
+ goto drop_fail;
+ }
+
+ /* Make sure the packet headers are always in the first mblk. */
+ if (ring->vr_txdesb != NULL) {
+ dp = &ring->vr_txdesb[cookie];
+
+ /*
+ * If the guest driver is operating properly, each desb slot
+ * should be available for use when processing a TX descriptor
+ * from the 'avail' ring. In the case of drivers that reuse a
+ * descriptor before it has been posted to the 'used' ring, the
+ * data is simply dropped.
+ */
+ if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
+ dp = NULL;
+ goto drop_fail;
+ }
+
+ dp->d_cookie = cookie;
+ mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
+ &dp->d_frtn);
+
+ /* Account for the successful desballoc. */
+ if (mp_head != NULL)
+ dp->d_ref++;
+ } else {
+ mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
+ }
+
+ if (mp_head == NULL)
+ goto drop_fail;
+
+ mp_tail = mp_head;
+
+ /*
+ * We always copy enough of the guest data to cover the
+ * headers. This protects us from TOCTOU attacks and allows
+ * message block length assumptions to be made in subsequent
+ * code. In many cases, this means copying more data than
+ * strictly necessary. That's okay, as it is the larger packets
+ * (such as LSO) that really benefit from desballoc().
+ */
+ for (i = 1; i < n; i++) {
+ const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);
+
+ bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
+ mp_head->b_wptr += to_copy;
+ len += to_copy;
+ min_copy -= to_copy;
+
+ /*
+ * We've met the minimum copy requirement. The rest of
+ * the guest data can be referenced.
+ */
+ if (min_copy == 0) {
+ /*
+ * If we copied all contents of this
+ * descriptor then move onto the next one.
+ * Otherwise, record how far we are into the
+ * current descriptor.
+ */
+ if (iov[i].iov_len == to_copy)
+ i++;
+ else
+ base_off = to_copy;
+
+ break;
+ }
+ }
+
+ ASSERT3P(mp_head, !=, NULL);
+ ASSERT3P(mp_tail, !=, NULL);
+
+ for (; i < n; i++) {
+ uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
+ uint32_t chunk = iov[i].iov_len - base_off;
+
+ ASSERT3U(base_off, <, iov[i].iov_len);
+ ASSERT3U(chunk, >, 0);
+
+ if (dp != NULL) {
+ mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
+ if (mp == NULL) {
+ goto drop_fail;
+ }
+ dp->d_ref++;
+ } else {
+ mp = allocb(chunk, BPRI_MED);
+ if (mp == NULL) {
+ goto drop_fail;
+ }
+ bcopy((uchar_t *)base, mp->b_wptr, chunk);
+ }
+
+ base_off = 0;
+ len += chunk;
+ mp->b_wptr += chunk;
+ mp_tail->b_cont = mp;
+ mp_tail = mp;
+ }
+
+ if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
+ /*
+ * The hook consumer may elect to free the mblk_t and set
+ * our mblk_t ** to NULL. When using a viona_desb_t
+ * (dp != NULL), we do not want the corresponding cleanup to
+ * occur during the viona_hook() call. We instead want to
+ * reset and recycle dp for future use. To prevent cleanup
+ * during the viona_hook() call, we take a ref on dp (if being
+ * used), and release it on success. On failure, the
+ * freemsgchain() call will release all the refs taken earlier
+ * in viona_tx() (aside from the initial ref and the one we
+ * take), and drop_hook will reset dp for reuse.
+ */
+ if (dp != NULL)
+ dp->d_ref++;
+
+ /*
+ * Pass &mp instead of &mp_head so we don't lose track of
+ * mp_head if the hook consumer (i.e. ipf) elects to free mp
+ * and set mp to NULL.
+ */
+ mp = mp_head;
+ if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
+ if (mp != NULL)
+ freemsgchain(mp);
+ goto drop_hook;
+ }
+
+ if (dp != NULL) {
+ dp->d_ref--;
+
+ /*
+ * It is possible that the hook(s) accepted the packet,
+ * but as part of its processing, it issued a pull-up
+ * which released all references to the desb. In that
+ * case, go back to acting like the packet is entirely
+ * copied (which it is).
+ */
+ if (dp->d_ref == 1) {
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+ dp = NULL;
+ }
+ }
+ }
+
+ /*
+ * Request hardware checksumming, if necessary. If the guest
+ * sent an LSO packet then it must have also negotiated and
+ * requested partial checksum; therefore the LSO logic is
+ * contained within viona_tx_csum().
+ */
+ if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
+ (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
+ if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
+ goto drop_fail;
+ }
+ }
+
+ if (dp != NULL) {
+ dp->d_len = len;
+ mutex_enter(&ring->vr_lock);
+ ring->vr_xfer_outstanding++;
+ mutex_exit(&ring->vr_lock);
+ } else {
+ /*
+ * If the data was cloned out of the ring, the descriptors can
+ * be marked as 'used' now, rather than deferring that action
+ * until after successful packet transmission.
+ */
+ viona_tx_done(ring, len, cookie);
+ }
+
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ smt_begin_unsafe();
+ mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
+ smt_end_unsafe();
+ return;
+
+drop_fail:
+ /*
+ * On the off chance that memory is not available via the desballoc or
+ * allocb calls, there are few options left but to fail and drop
+ * the frame on the floor.
+ */
+
+ if (dp != NULL) {
+ /*
+ * Take an additional reference on the desb handle (if present)
+ * so any desballoc-sourced mblks can release their hold on it
+ * without the handle reaching its final state and executing
+ * its clean-up logic.
+ */
+ dp->d_ref++;
+ }
+
+ /*
+ * Free any already-allocated blocks and sum up the total length of the
+ * dropped data to be released to the used ring.
+ */
+ freemsgchain(mp_head);
+
+drop_hook:
+ len = 0;
+ for (uint_t i = 0; i < n; i++) {
+ len += iov[i].iov_len;
+ }
+
+ if (dp != NULL) {
+ VERIFY(dp->d_ref == 2);
+
+ /* Clean up the desb handle, releasing the extra hold. */
+ dp->d_len = 0;
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+ }
+
+ VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
+ uint16_t, cookie);
+ viona_tx_done(ring, len, cookie);
+}
diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h
index 33fefc10ea..856b75e5cc 100644
--- a/usr/src/uts/i86pc/sys/vmm_drv.h
+++ b/usr/src/uts/i86pc/sys/vmm_drv.h
@@ -17,6 +17,9 @@
#define _VMM_DRV_H_
#ifdef _KERNEL
+
+#include <sys/file.h>
+
struct vmm_hold;
typedef struct vmm_hold vmm_hold_t;