author | Patrick Mooney <pmooney@pfmooney.com> | 2019-06-06 20:17:45 +0000
committer | Patrick Mooney <pmooney@pfmooney.com> | 2019-07-19 15:33:23 +0000
commit | 9e88ade90f654d7c2cdfcec90cface22eaa124c7 (patch)
tree | f23752c2ec4204ecc1e3ba6d4b9a0f1dc5b0021e /usr/src/uts/i86pc
parent | 62ae06fb599ccdbf3a97e6c584e0a055e763e2e9 (diff)
download | illumos-joyent-9e88ade90f654d7c2cdfcec90cface22eaa124c7.tar.gz
OS-7843 viona could be split up
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Approved by: Ryan Zezeski <rpz@joyent.com>
Diffstat (limited to 'usr/src/uts/i86pc')
-rw-r--r-- | usr/src/uts/i86pc/Makefile.files | 6
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona.c | 3631
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_hook.c | 438
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_impl.h | 325
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_main.c | 985
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_ring.c | 636
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_rx.c | 747
-rw-r--r-- | usr/src/uts/i86pc/io/viona/viona_tx.c | 755
-rw-r--r-- | usr/src/uts/i86pc/sys/vmm_drv.h | 3
9 files changed, 3894 insertions, 3632 deletions
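
The diffstat shows the shape of the split: the monolithic viona.c gives way to viona_main.c (driver plumbing and ioctls), viona_ring.c (vring mapping and worker lifecycle), viona_rx.c and viona_tx.c (the two datapaths), and viona_hook.c (nethook integration), with the new private header viona_impl.h carrying their shared declarations. As a rough, hedged sketch of what such a shared header looks like — only the type and function names are taken from the code in the diff below; the actual contents of viona_impl.h are not reproduced in this excerpt — consider:

```c
/*
 * Hypothetical sketch of a shared private header for the split modules.
 * The real usr/src/uts/i86pc/io/viona/viona_impl.h may differ; only the
 * names below are taken from the diff itself.
 */
#ifndef	_VIONA_IMPL_H
#define	_VIONA_IMPL_H

#include <sys/stream.h>
#include <sys/mac_client.h>
#include <sys/viona_io.h>

typedef struct viona_link viona_link_t;
typedef struct viona_vring viona_vring_t;

/* viona_ring.c: ring lifecycle, consumed by the ioctl paths in viona_main.c */
void viona_ring_alloc(viona_link_t *, viona_vring_t *);
void viona_ring_free(viona_vring_t *);
int viona_ring_reset(viona_vring_t *, boolean_t);

/* viona_rx.c / viona_tx.c: datapath entry points driven by the ring workers */
void viona_rx_classified(void *, mac_resource_handle_t, mblk_t *, boolean_t);
void viona_tx(viona_link_t *, viona_vring_t *);
void viona_tx_wait_outstanding(viona_vring_t *);

/* viona_hook.c: nethook dispatch for IPv4/IPv6 frames */
int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t);

#endif	/* _VIONA_IMPL_H */
```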
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index d541e92bf3..a0509bf21d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -285,7 +285,11 @@ VMM_OBJS += vmm.o \ vmm_support.o \ vmm_zsd.o -VIONA_OBJS += viona.o +VIONA_OBJS += viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o \ PPT_OBJS += ppt.o diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c deleted file mode 100644 index 80b5b07aaa..0000000000 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ /dev/null @@ -1,3631 +0,0 @@ -/* - * Copyright (c) 2013 Chris Torek <torek @ torek net> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2015 Pluribus Networks Inc. - * Copyright 2019 Joyent, Inc. - */ - -/* - * viona - VirtIO-Net, Accelerated - * - * The purpose of viona is to provide high performance virtio-net devices to - * bhyve guests. It does so by sitting directly atop MAC, skipping all of the - * DLS/DLD stack. - * - * -------------------- - * General Architecture - * -------------------- - * - * A single viona instance is comprised of a "link" handle and two "rings". - * After opening the viona device, it must be associated with a MAC network - * interface and a bhyve (vmm) instance to form its link resource. This is - * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are - * passed in to perform the initialization. With the MAC client opened, and a - * driver handle to the vmm instance established, the device is ready to be - * configured by the guest. - * - * The userspace portion of bhyve, which interfaces with the PCI device - * emulation framework, is meant to stay out of the datapath if at all - * possible. 
Configuration changes made via PCI are mapped to actions which - * will steer the operation of the in-kernel logic. - * - * - * ----------- - * Ring Basics - * ----------- - * - * Each viona link has two viona_vring_t entities, RX and TX, for handling data - * transfers to and from the guest. They represent an interface to the - * standard virtio ring structures. When intiailized and active, each ring is - * backed by a kernel worker thread (parented to the bhyve process for the - * instance) which handles ring events. The RX worker has the simple task of - * watching for ring shutdown conditions. The TX worker does that in addition - * to processing all requests to transmit data. Data destined for the guest is - * delivered directly by MAC to viona_rx() when the ring is active. - * - * - * ----------- - * Ring States - * ----------- - * - * The viona_vring_t instances follow a simple path through the possible state - * values represented in virtio_vring_t`vr_state: - * - * +<--------------------------------------------+ - * | | - * V ^ - * +-----------+ This is the initial state when a link is created or - * | VRS_RESET | when the ring has been explicitly reset. - * +-----------+ - * | ^ - * |---* ioctl(VNA_IOC_RING_INIT) issued | - * | | - * | ^ - * V - * +-----------+ The ring parameters (size, guest physical addresses) - * | VRS_SETUP | have been set and start-up of the ring worker thread - * +-----------+ has begun. - * | ^ - * | | - * |---* ring worker thread begins execution | - * | | - * +-------------------------------------------->+ - * | | ^ - * | | - * | * If ring shutdown is requested (by ioctl or impending - * | bhyve process death) while the worker thread is - * | starting, the worker will transition the ring to - * | VRS_RESET and exit. - * | ^ - * | | - * | ^ - * V - * +-----------+ The worker thread associated with the ring has started - * | VRS_INIT | executing. It has allocated any extra resources needed - * +-----------+ for the ring to operate. - * | ^ - * | | - * +-------------------------------------------->+ - * | | ^ - * | | - * | * If ring shutdown is requested while the worker is - * | waiting in VRS_INIT, it will free any extra resources - * | and transition to VRS_RESET. - * | ^ - * | | - * |--* ioctl(VNA_IOC_RING_KICK) issued | - * | ^ - * V - * +-----------+ The worker thread associated with the ring is executing - * | VRS_RUN | workload specific to that ring. - * +-----------+ - * | ^ - * |---* ioctl(VNA_IOC_RING_RESET) issued | - * | (or bhyve process begins exit) | - * V | - * +-------------------------------------------->+ - * - * - * While the worker thread is not running, changes to vr_state are only made by - * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts - * the worker, and sets the ring state to VRS_SETUP. Once the worker thread - * has been started, only it may perform ring state transitions (still under - * the protection of vr_lock), when requested by outside consumers via - * vr_state_flags or when the containing bhyve process initiates an exit. - * - * - * ---------------------------- - * Transmission mblk_t Handling - * ---------------------------- - * - * For incoming frames destined for a bhyve guest, the data must first land in - * a host OS buffer from the physical NIC before it is copied into the awaiting - * guest buffer(s). Outbound frames transmitted by the guest are not bound by - * this limitation and can avoid extra copying before the buffers are accessed - * directly by the NIC. 
When a guest designates buffers to be transmitted, - * viona translates the guest-physical addresses contained in the ring - * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is - * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc(). - * Doing so increments vr_xfer_outstanding, preventing the ring from being - * reset (allowing the link to drop its vmm handle to the guest) until all - * transmit mblks referencing guest memory have been processed. Allocation of - * the viona_desb_t entries is done during the VRS_INIT stage of the ring - * worker thread. The ring size informs that allocation as the number of - * concurrent transmissions is limited by the number of descriptors in the - * ring. This minimizes allocation in the transmit hot-path by acquiring those - * fixed-size resources during initialization. - * - * This optimization depends on the underlying NIC driver freeing the mblks in - * a timely manner after they have been transmitted by the hardware. Some - * drivers have been found to flush TX descriptors only when new transmissions - * are initiated. This means that there is no upper bound to the time needed - * for an mblk to be flushed and can stall bhyve guests from shutting down - * since their memory must be free of viona TX references prior to clean-up. - * - * This expectation of deterministic mblk_t processing is likely the reason - * behind the notable exception to the zero-copy TX path: systems with 'bnxe' - * loaded will copy transmit data into fresh buffers rather than passing up - * zero-copy mblks. It is a hold-over from the original viona sources provided - * by Pluribus and its continued necessity has not been confirmed. - * - * - * ---------------------------- - * Ring Notification Fast-paths - * ---------------------------- - * - * Device operation for viona requires that notifications flow to and from the - * guest to indicate certain ring conditions. In order to minimize latency and - * processing overhead, the notification procedures are kept in-kernel whenever - * possible. - * - * Guest-to-host notifications, when new available descriptors have been placed - * in the ring, are posted via the 'queue notify' address in the virtio BAR. - * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to - * install a callback hook on an ioport address. Guest exits for accesses to - * viona-hooked ioport addresses will result in direct calls to notify the - * appropriate ring worker without a trip to userland. - * - * Host-to-guest notifications in the form of interrupts enjoy similar - * acceleration. Each viona ring can be configured to send MSI notifications - * to the guest as virtio conditions dictate. This in-kernel interrupt - * configuration is kept synchronized through viona ioctls which are utilized - * during writes to the associated PCI config registers or MSI-X BAR. - * - * Guests which do not utilize MSI-X will cause viona to fall back to the - * slow path for interrupts. It will poll(2) the viona handle, receiving - * notification when ring events necessitate the assertion of an interrupt. - * - * - * --------------- - * Nethook Support - * --------------- - * - * Viona provides four nethook events that consumers (e.g. ipf) can hook into - * to intercept packets as they go up or down the stack. Unfortunately, - * the nethook framework does not understand raw packets, so we can only - * generate events (in, out) for IPv4 and IPv6 packets. 
At driver attach, - * we register callbacks with the neti (netinfo) module that will be invoked - * for each netstack already present, as well as for any additional netstack - * instances created as the system operates. These callbacks will - * register/unregister the hooks with the nethook framework for each - * netstack instance. This registration occurs prior to creating any - * viona instances for a given netstack, and the unregistration for a netstack - * instance occurs after all viona instances of the netstack instance have - * been deleted. - */ - -#include <sys/conf.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/ddi.h> -#include <sys/disp.h> -#include <sys/sunddi.h> -#include <sys/sunndi.h> -#include <sys/sysmacros.h> -#include <sys/strsubr.h> -#include <sys/strsun.h> -#include <vm/seg_kmem.h> -#include <sys/smt.h> - -#include <sys/pattr.h> -#include <sys/dls.h> -#include <sys/dlpi.h> -#include <sys/hook.h> -#include <sys/hook_event.h> -#include <sys/list.h> -#include <sys/mac_client.h> -#include <sys/mac_provider.h> -#include <sys/mac_client_priv.h> -#include <sys/neti.h> -#include <sys/vlan.h> -#include <inet/ip.h> -#include <inet/ip_impl.h> -#include <inet/tcp.h> - -#include <sys/vmm_drv.h> -#include <sys/viona_io.h> - -/* Min. octets in an ethernet frame minus FCS */ -#define MIN_BUF_SIZE 60 -#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) - -#define VIONA_NAME "Virtio Network Accelerator" -#define VIONA_CTL_MINOR 0 -#define VIONA_CLI_NAME "viona" /* MAC client name */ -#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ - IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) - -#define VTNET_MAXSEGS 32 - -#define VRING_ALIGN 4096 -#define VRING_MAX_LEN 32768 - -#define VRING_DESC_F_NEXT (1 << 0) -#define VRING_DESC_F_WRITE (1 << 1) -#define VRING_DESC_F_INDIRECT (1 << 2) - -#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) -#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) - -#define VIRTIO_NET_HDR_GSO_NONE 0 -#define VIRTIO_NET_HDR_GSO_TCPV4 1 - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - -#define VRING_USED_F_NO_NOTIFY 1 - -#define BNXE_NIC_DRIVER "bnxe" - -/* - * Feature bits. See section 5.1.3 of the VIRTIO 1.0 spec. - */ -#define VIRTIO_NET_F_CSUM (1 << 0) -#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ -#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ -#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) -#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) -#define VIRTIO_F_RING_EVENT_IDX (1 << 29) - -/* - * Host capabilities. 
- */ -#define VIONA_S_HOSTCAPS ( \ - VIRTIO_NET_F_GUEST_CSUM | \ - VIRTIO_NET_F_MAC | \ - VIRTIO_NET_F_GUEST_TSO4 | \ - VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS | \ - VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ - VIRTIO_F_RING_INDIRECT_DESC) - -/* MAC_CAPAB_HCKSUM specifics of interest */ -#define VIONA_CAP_HCKSUM_INTEREST \ - (HCKSUM_INET_PARTIAL | \ - HCKSUM_INET_FULL_V4 | \ - HCKSUM_INET_FULL_V6) - - -#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) -#define VIONA_PROBE1(name, arg1, arg2) \ - DTRACE_PROBE1(viona__##name, arg1, arg2) -#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ - DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) -#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ - DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) -#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ - arg9, arg10) \ - DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ - arg8, arg9, arg10) -#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ - VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) - -#define VIONA_RING_STAT_INCR(r, name) \ - (((r)->vr_stats.rs_ ## name)++) - -#pragma pack(1) -struct virtio_desc { - uint64_t vd_addr; - uint32_t vd_len; - uint16_t vd_flags; - uint16_t vd_next; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_used { - uint32_t vu_idx; - uint32_t vu_tlen; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_net_mrgrxhdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; - uint16_t vrh_bufs; -}; -struct virtio_net_hdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; -}; -#pragma pack() - -struct viona_link; -typedef struct viona_link viona_link_t; -struct viona_desb; -typedef struct viona_desb viona_desb_t; -struct viona_net; -typedef struct viona_neti viona_neti_t; - -enum viona_ring_state { - VRS_RESET = 0x0, /* just allocated or reset */ - VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ - VRS_INIT = 0x2, /* worker thread started & waiting to run */ - VRS_RUN = 0x3, /* running work routine */ -}; -enum viona_ring_state_flags { - VRSF_REQ_START = 0x1, /* start running from INIT state */ - VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ - VRSF_RENEW = 0x4, /* ring renewing lease */ -}; - -#define VRING_NEED_BAIL(ring, proc) \ - (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ - ((proc)->p_flag & SEXITING) != 0) - -#define VNETHOOK_INTERESTED_IN(neti) \ - (neti)->vni_nethook.vnh_event_in.he_interested -#define VNETHOOK_INTERESTED_OUT(neti) \ - (neti)->vni_nethook.vnh_event_out.he_interested - -typedef struct viona_vring { - viona_link_t *vr_link; - - kmutex_t vr_lock; - kcondvar_t vr_cv; - uint16_t vr_state; - uint16_t vr_state_flags; - uint_t vr_xfer_outstanding; - kthread_t *vr_worker_thread; - vmm_lease_t *vr_lease; - - /* ring-sized resources for TX activity */ - viona_desb_t *vr_txdesb; - struct iovec *vr_txiov; - - uint_t vr_intr_enabled; - uint64_t vr_msi_addr; - uint64_t vr_msi_msg; - - /* Internal ring-related state */ - kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ - kmutex_t vr_u_mutex; /* sync consumers of 'used' */ - uint64_t vr_pa; - uint16_t vr_size; - uint16_t vr_mask; /* cached from vr_size */ - uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - volatile struct 
virtio_desc *vr_descr; - - volatile uint16_t *vr_avail_flags; - volatile uint16_t *vr_avail_idx; - volatile uint16_t *vr_avail_ring; - volatile uint16_t *vr_avail_used_event; - - volatile uint16_t *vr_used_flags; - volatile uint16_t *vr_used_idx; - volatile struct virtio_used *vr_used_ring; - volatile uint16_t *vr_used_avail_event; - - /* Per-ring error condition statistics */ - struct viona_ring_stats { - uint64_t rs_ndesc_too_high; - uint64_t rs_bad_idx; - uint64_t rs_indir_bad_len; - uint64_t rs_indir_bad_nest; - uint64_t rs_indir_bad_next; - uint64_t rs_no_space; - uint64_t rs_too_many_desc; - uint64_t rs_desc_bad_len; - - uint64_t rs_bad_ring_addr; - - uint64_t rs_fail_hcksum; - uint64_t rs_fail_hcksum6; - uint64_t rs_fail_hcksum_proto; - - uint64_t rs_bad_rx_frame; - uint64_t rs_rx_merge_overrun; - uint64_t rs_rx_merge_underrun; - uint64_t rs_rx_pad_short; - uint64_t rs_rx_mcast_check; - uint64_t rs_too_short; - uint64_t rs_tx_absent; - - uint64_t rs_rx_hookdrop; - uint64_t rs_tx_hookdrop; - } vr_stats; -} viona_vring_t; - -struct viona_link { - vmm_hold_t *l_vm_hold; - boolean_t l_destroyed; - - viona_vring_t l_vrings[VIONA_VQ_MAX]; - - uint32_t l_features; - uint32_t l_features_hw; - uint32_t l_cap_csum; - boolean_t l_force_tx_copy; - - uintptr_t l_notify_ioport; - void *l_notify_cookie; - - datalink_id_t l_linkid; - mac_handle_t l_mh; - mac_client_handle_t l_mch; - mac_promisc_handle_t l_mph; - - pollhead_t l_pollhead; - - viona_neti_t *l_neti; -}; - -typedef struct viona_nethook { - net_handle_t vnh_neti; - hook_family_t vnh_family; - hook_event_t vnh_event_in; - hook_event_t vnh_event_out; - hook_event_token_t vnh_token_in; - hook_event_token_t vnh_token_out; - boolean_t vnh_hooked; -} viona_nethook_t; - -struct viona_neti { - list_node_t vni_node; - - netid_t vni_netid; - zoneid_t vni_zid; - - viona_nethook_t vni_nethook; - - kmutex_t vni_lock; /* Protects remaining members */ - kcondvar_t vni_ref_change; /* Protected by vni_lock */ - uint_t vni_ref; /* Protected by vni_lock */ - list_t vni_dev_list; /* Protected by vni_lock */ -}; - -struct viona_desb { - frtn_t d_frtn; - viona_vring_t *d_ring; - uint_t d_ref; - uint32_t d_len; - uint16_t d_cookie; - uchar_t *d_headers; -}; - -typedef struct viona_soft_state { - kmutex_t ss_lock; - viona_link_t *ss_link; - list_node_t ss_node; -} viona_soft_state_t; - -typedef struct used_elem { - uint16_t id; - uint32_t len; -} used_elem_t; - -static void *viona_state; -static dev_info_t *viona_dip; -static id_space_t *viona_minors; -static mblk_t *viona_vlan_pad_mp; - -/* - * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock - */ -static kmutex_t viona_neti_lock; -static list_t viona_neti_list; - -/* - * viona_neti is allocated and initialized during attach, and read-only - * until detach (where it's also freed) - */ -static net_instance_t *viona_neti; - -/* - * copy tx mbufs from virtio ring to avoid necessitating a wait for packet - * transmission to free resources. 
- */ -static kmutex_t viona_force_copy_lock; -static enum viona_force_copy { - VFC_UNINITALIZED = 0, - VFC_COPY_UNEEDED = 1, - VFC_COPY_REQUIRED = 2, -} viona_force_copy_state = VFC_UNINITALIZED; - -static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, - void **result); -static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); -static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); -static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); -static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval); -static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp); - -static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); -static int viona_ioc_delete(viona_soft_state_t *, boolean_t); - -static void *viona_gpa2kva(viona_vring_t *, uint64_t, size_t); - -static void viona_ring_alloc(viona_link_t *, viona_vring_t *); -static void viona_ring_free(viona_vring_t *); -static int viona_ring_reset(viona_vring_t *, boolean_t); -static kthread_t *viona_create_worker(viona_vring_t *); -static boolean_t viona_ring_map(viona_vring_t *); -static void viona_ring_unmap(viona_vring_t *); - -static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); -static int viona_ioc_ring_init(viona_link_t *, void *, int); -static int viona_ioc_ring_reset(viona_link_t *, uint_t); -static int viona_ioc_ring_kick(viona_link_t *, uint_t); -static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); -static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); -static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); - -static void viona_intr_ring(viona_vring_t *); - -static void viona_desb_release(viona_desb_t *); -static void viona_rx_classified(void *, mac_resource_handle_t, mblk_t *, - boolean_t); -static void viona_rx_mcast(void *, mac_resource_handle_t, mblk_t *, boolean_t); -static void viona_tx_wait_outstanding(viona_vring_t *); -static void viona_tx(viona_link_t *, viona_vring_t *); - -static viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); -static void viona_neti_rele(viona_neti_t *); - -static void *viona_neti_create(const netid_t); -static void viona_neti_shutdown(const netid_t, void *); -static void viona_neti_destroy(const netid_t, void *); - -static int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); - -static struct cb_ops viona_cb_ops = { - viona_open, - viona_close, - nodev, - nodev, - nodev, - nodev, - nodev, - viona_ioctl, - nodev, - nodev, - nodev, - viona_chpoll, - ddi_prop_op, - 0, - D_MP | D_NEW | D_HOTPLUG, - CB_REV, - nodev, - nodev -}; - -static struct dev_ops viona_ops = { - DEVO_REV, - 0, - viona_info, - nulldev, - nulldev, - viona_attach, - viona_detach, - nodev, - &viona_cb_ops, - NULL, - ddi_power, - ddi_quiesce_not_needed -}; - -static struct modldrv modldrv = { - &mod_driverops, - VIONA_NAME, - &viona_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -int -_init(void) -{ - int ret; - - ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); - if (ret != 0) - return (ret); - - ret = mod_install(&modlinkage); - if (ret != 0) { - ddi_soft_state_fini(&viona_state); - return (ret); - } - - return (ret); -} - -int -_fini(void) -{ - int ret; - - ret = mod_remove(&modlinkage); - if (ret == 0) { - ddi_soft_state_fini(&viona_state); - } - - return (ret); -} - -int -_info(struct modinfo *modinfop) -{ - 
return (mod_info(&modlinkage, modinfop)); -} - -/* - * Check if full TX packet copying is needed. This should not be called from - * viona attach()/detach() context. - */ -static boolean_t -viona_tx_copy_needed() -{ - boolean_t result; - - mutex_enter(&viona_force_copy_lock); - if (viona_force_copy_state == VFC_UNINITALIZED) { - major_t bnxe_major; - - /* - * The original code for viona featured an explicit check for - * the bnxe driver which, when found present, necessitated that - * all transmissions be copied into their own mblks instead of - * passing guest memory to the underlying device. - * - * The motivations for this are unclear, but until it can be - * proven unnecessary, the check lives on. - */ - viona_force_copy_state = VFC_COPY_UNEEDED; - if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) - != DDI_MAJOR_T_NONE) { - if (ddi_hold_installed_driver(bnxe_major) != NULL) { - viona_force_copy_state = VFC_COPY_REQUIRED; - ddi_rele_driver(bnxe_major); - } - } - } - result = (viona_force_copy_state == VFC_COPY_REQUIRED); - mutex_exit(&viona_force_copy_lock); - - return (result); -} - -/* ARGSUSED */ -static int -viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) -{ - int error; - - switch (cmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = (void *)viona_dip; - error = DDI_SUCCESS; - break; - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - error = DDI_SUCCESS; - break; - default: - error = DDI_FAILURE; - break; - } - return (error); -} - -static int -viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - mblk_t *mp; - - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, - DDI_PSEUDO, 0) != DDI_SUCCESS) { - return (DDI_FAILURE); - } - - viona_minors = id_space_create("viona_minors", - VIONA_CTL_MINOR + 1, UINT16_MAX); - - mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); - - /* Create mblk for padding when VLAN tags are stripped */ - mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); - bzero(mp->b_rptr, VLAN_TAGSZ); - mp->b_wptr += VLAN_TAGSZ; - viona_vlan_pad_mp = mp; - - viona_dip = dip; - ddi_report_dev(viona_dip); - - mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); - list_create(&viona_neti_list, sizeof (viona_neti_t), - offsetof(viona_neti_t, vni_node)); - - /* This can only fail if NETINFO_VERSION is wrong */ - viona_neti = net_instance_alloc(NETINFO_VERSION); - VERIFY(viona_neti != NULL); - - viona_neti->nin_name = "viona"; - viona_neti->nin_create = viona_neti_create; - viona_neti->nin_shutdown = viona_neti_shutdown; - viona_neti->nin_destroy = viona_neti_destroy; - /* This can only fail if we've registered ourselves multiple times */ - VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); - - return (DDI_SUCCESS); -} - -static int -viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - mblk_t *mp; - - if (cmd != DDI_DETACH) { - return (DDI_FAILURE); - } - - /* Clean up the VLAN padding mblk */ - mp = viona_vlan_pad_mp; - viona_vlan_pad_mp = NULL; - VERIFY(mp != NULL && mp->b_cont == NULL); - freemsg(mp); - - id_space_destroy(viona_minors); - ddi_remove_minor_node(viona_dip, NULL); - viona_dip = NULL; - - /* This can only fail if we've not registered previously */ - VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); - net_instance_free(viona_neti); - viona_neti = NULL; - - list_destroy(&viona_neti_list); - mutex_destroy(&viona_neti_lock); - - return (DDI_SUCCESS); -} - -static int -viona_open(dev_t *devp, int flag, int otype, 
cred_t *credp) -{ - int minor; - viona_soft_state_t *ss; - - if (otype != OTYP_CHR) { - return (EINVAL); - } -#if 0 - /* - * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. - * Should the check be at open() or ioctl()? - */ - if (drv_priv(credp) != 0) { - return (EPERM); - } -#endif - if (getminor(*devp) != VIONA_CTL_MINOR) { - return (ENXIO); - } - - minor = id_alloc_nosleep(viona_minors); - if (minor == 0) { - /* All minors are busy */ - return (EBUSY); - } - if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { - id_free(viona_minors, minor); - return (ENOMEM); - } - - ss = ddi_get_soft_state(viona_state, minor); - mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); - *devp = makedevice(getmajor(*devp), minor); - - return (0); -} - -static int -viona_close(dev_t dev, int flag, int otype, cred_t *credp) -{ - int minor; - viona_soft_state_t *ss; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - minor = getminor(dev); - - ss = ddi_get_soft_state(viona_state, minor); - if (ss == NULL) { - return (ENXIO); - } - - VERIFY0(viona_ioc_delete(ss, B_TRUE)); - VERIFY(!list_link_active(&ss->ss_node)); - ddi_soft_state_free(viona_state, minor); - id_free(viona_minors, minor); - - return (0); -} - -static int -viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) -{ - viona_soft_state_t *ss; - void *dptr = (void *)data; - int err = 0, val; - viona_link_t *link; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL) { - return (ENXIO); - } - - switch (cmd) { - case VNA_IOC_CREATE: - return (viona_ioc_create(ss, dptr, md, cr)); - case VNA_IOC_DELETE: - return (viona_ioc_delete(ss, B_FALSE)); - default: - break; - } - - mutex_enter(&ss->ss_lock); - if ((link = ss->ss_link) == NULL || link->l_destroyed || - vmm_drv_release_reqd(link->l_vm_hold)) { - mutex_exit(&ss->ss_lock); - return (ENXIO); - } - - switch (cmd) { - case VNA_IOC_GET_FEATURES: - val = VIONA_S_HOSTCAPS | link->l_features_hw; - if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { - err = EFAULT; - } - break; - case VNA_IOC_SET_FEATURES: - if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { - err = EFAULT; - break; - } - val &= (VIONA_S_HOSTCAPS | link->l_features_hw); - - if ((val & VIRTIO_NET_F_CSUM) == 0) - val &= ~VIRTIO_NET_F_HOST_TSO4; - - if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) - val &= ~VIRTIO_NET_F_GUEST_TSO4; - - link->l_features = val; - break; - case VNA_IOC_RING_INIT: - err = viona_ioc_ring_init(link, dptr, md); - break; - case VNA_IOC_RING_RESET: - err = viona_ioc_ring_reset(link, (uint_t)data); - break; - case VNA_IOC_RING_KICK: - err = viona_ioc_ring_kick(link, (uint_t)data); - break; - case VNA_IOC_RING_SET_MSI: - err = viona_ioc_ring_set_msi(link, dptr, md); - break; - case VNA_IOC_RING_INTR_CLR: - err = viona_ioc_ring_intr_clear(link, (uint_t)data); - break; - case VNA_IOC_INTR_POLL: - err = viona_ioc_intr_poll(link, dptr, md, rv); - break; - case VNA_IOC_SET_NOTIFY_IOP: - err = viona_ioc_set_notify_ioport(link, (uint_t)data); - break; - default: - err = ENOTTY; - break; - } - - mutex_exit(&ss->ss_lock); - return (err); -} - -static int -viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp) -{ - viona_soft_state_t *ss; - viona_link_t *link; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL) { - return (ENXIO); - } - - mutex_enter(&ss->ss_lock); - if ((link = ss->ss_link) == NULL || link->l_destroyed) { - mutex_exit(&ss->ss_lock); - return (ENXIO); - } - - *reventsp = 0; - if ((events & 
POLLRDBAND) != 0) { - for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { - if (link->l_vrings[i].vr_intr_enabled != 0) { - *reventsp |= POLLRDBAND; - break; - } - } - } - if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { - *phpp = &link->l_pollhead; - } - mutex_exit(&ss->ss_lock); - - return (0); -} - -static void -viona_get_mac_capab(viona_link_t *link) -{ - mac_handle_t mh = link->l_mh; - uint32_t cap = 0; - mac_capab_lso_t lso_cap; - - link->l_features_hw = 0; - if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { - /* - * Only report HW checksum ability if the underlying MAC - * resource is capable of populating the L4 header. - */ - if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { - link->l_features_hw |= VIRTIO_NET_F_CSUM; - } - link->l_cap_csum = cap; - } - - if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && - mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { - /* - * Virtio doesn't allow for negotiating a maximum LSO - * packet size. We have to assume that the guest may - * send a maximum length IP packet. Make sure the - * underlying MAC can handle an LSO of this size. - */ - if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && - lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) - link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; - } -} - -static int -viona_rx_set(viona_link_t *link) -{ - viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; - int err; - - mac_rx_set(link->l_mch, viona_rx_classified, ring); - err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, - viona_rx_mcast, ring, &link->l_mph, - MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); - if (err != 0) { - mac_rx_clear(link->l_mch); - } - - return (err); -} - -static void -viona_rx_clear(viona_link_t *link) -{ - mac_promisc_remove(link->l_mph); - mac_rx_clear(link->l_mch); -} - -static int -viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) -{ - vioc_create_t kvc; - viona_link_t *link = NULL; - char cli_name[MAXNAMELEN]; - int err = 0; - file_t *fp; - vmm_hold_t *hold = NULL; - viona_neti_t *nip = NULL; - zoneid_t zid; - - ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); - - if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { - return (EFAULT); - } - - zid = crgetzoneid(cr); - nip = viona_neti_lookup_by_zid(zid); - if (nip == NULL) { - return (EIO); - } - - if (!nip->vni_nethook.vnh_hooked) { - viona_neti_rele(nip); - return (EIO); - } - - mutex_enter(&ss->ss_lock); - if (ss->ss_link != NULL) { - mutex_exit(&ss->ss_lock); - viona_neti_rele(nip); - return (EEXIST); - } - - if ((fp = getf(kvc.c_vmfd)) == NULL) { - err = EBADF; - goto bail; - } - err = vmm_drv_hold(fp, cr, &hold); - releasef(kvc.c_vmfd); - if (err != 0) { - goto bail; - } - - link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); - link->l_linkid = kvc.c_linkid; - link->l_vm_hold = hold; - link->l_force_tx_copy = viona_tx_copy_needed(); - - err = mac_open_by_linkid(link->l_linkid, &link->l_mh); - if (err != 0) { - goto bail; - } - - viona_get_mac_capab(link); - - (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, - link->l_linkid); - err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); - if (err != 0) { - goto bail; - } - - viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); - viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); - - if ((err = viona_rx_set(link)) != 0) { - viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); - viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); - goto bail; - } - - link->l_neti = nip; - ss->ss_link = link; - mutex_exit(&ss->ss_lock); - - mutex_enter(&nip->vni_lock); - 
list_insert_tail(&nip->vni_dev_list, ss); - mutex_exit(&nip->vni_lock); - - return (0); - -bail: - if (link != NULL) { - if (link->l_mch != NULL) { - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - kmem_free(link, sizeof (viona_link_t)); - } - if (hold != NULL) { - vmm_drv_rele(hold); - } - viona_neti_rele(nip); - - mutex_exit(&ss->ss_lock); - return (err); -} - -static int -viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) -{ - viona_link_t *link; - viona_neti_t *nip = NULL; - - mutex_enter(&ss->ss_lock); - if ((link = ss->ss_link) == NULL) { - /* Link destruction already complete */ - mutex_exit(&ss->ss_lock); - return (0); - } - - if (link->l_destroyed) { - /* - * Link destruction has been started by another thread, but has - * not completed. This condition should be impossible to - * encounter when performing the on-close destroy of the link, - * since racing ioctl accessors must necessarily be absent. - */ - VERIFY(!on_close); - mutex_exit(&ss->ss_lock); - return (EAGAIN); - } - /* - * The link deletion cannot fail after this point, continuing until its - * successful completion is reached. - */ - link->l_destroyed = B_TRUE; - - /* - * Tear down the IO port hook so it cannot be used to kick any of the - * rings which are about to be reset and stopped. - */ - VERIFY0(viona_ioc_set_notify_ioport(link, 0)); - mutex_exit(&ss->ss_lock); - - /* - * Return the rings to their reset state, ignoring any possible - * interruptions from signals. - */ - VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); - VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); - - mutex_enter(&ss->ss_lock); - if (link->l_mch != NULL) { - /* Unhook the receive callbacks and close out the client */ - viona_rx_clear(link); - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - if (link->l_vm_hold != NULL) { - vmm_drv_rele(link->l_vm_hold); - link->l_vm_hold = NULL; - } - - nip = link->l_neti; - link->l_neti = NULL; - - viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); - viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); - pollhead_clean(&link->l_pollhead); - ss->ss_link = NULL; - mutex_exit(&ss->ss_lock); - - mutex_enter(&nip->vni_lock); - list_remove(&nip->vni_dev_list, ss); - mutex_exit(&nip->vni_lock); - - viona_neti_rele(nip); - - kmem_free(link, sizeof (viona_link_t)); - return (0); -} - -/* - * Translate a guest physical address into a kernel virtual address. - */ -static void * -viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) -{ - ASSERT3P(ring->vr_lease, !=, NULL); - - return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); -} - -static boolean_t -viona_ring_lease_expire_cb(void *arg) -{ - viona_vring_t *ring = arg; - - cv_broadcast(&ring->vr_cv); - - /* The lease will be broken asynchronously. */ - return (B_FALSE); -} - -static void -viona_ring_lease_drop(viona_vring_t *ring) -{ - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - if (ring->vr_lease != NULL) { - vmm_hold_t *hold = ring->vr_link->l_vm_hold; - - ASSERT(hold != NULL); - - /* - * Without an active lease, the ring mappings cannot be - * considered valid. 
- */ - viona_ring_unmap(ring); - - vmm_drv_lease_break(hold, ring->vr_lease); - ring->vr_lease = NULL; - } -} - -static boolean_t -viona_ring_lease_renew(viona_vring_t *ring) -{ - vmm_hold_t *hold = ring->vr_link->l_vm_hold; - - ASSERT(hold != NULL); - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - viona_ring_lease_drop(ring); - - /* - * Lease renewal will fail if the VM has requested that all holds be - * cleaned up. - */ - ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, - ring); - if (ring->vr_lease != NULL) { - /* A ring undergoing renewal will need valid guest mappings */ - if (ring->vr_pa != 0 && ring->vr_size != 0) { - /* - * If new mappings cannot be established, consider the - * lease renewal a failure. - */ - if (!viona_ring_map(ring)) { - viona_ring_lease_drop(ring); - return (B_FALSE); - } - } - } - return (ring->vr_lease != NULL); -} - -static void -viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) -{ - ring->vr_link = link; - mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); - mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); -} - -static void -viona_ring_misc_free(viona_vring_t *ring) -{ - const uint_t cnt = ring->vr_size; - - if (ring->vr_txdesb != NULL) { - viona_desb_t *dp = ring->vr_txdesb; - - for (uint_t i = 0; i < cnt; i++, dp++) { - kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); - } - kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * cnt); - ring->vr_txdesb = NULL; - } - - if (ring->vr_txiov != NULL) { - kmem_free(ring->vr_txiov, sizeof (struct iovec) * cnt); - ring->vr_txiov = NULL; - } -} - -static void -viona_ring_free(viona_vring_t *ring) -{ - mutex_destroy(&ring->vr_lock); - cv_destroy(&ring->vr_cv); - mutex_destroy(&ring->vr_a_mutex); - mutex_destroy(&ring->vr_u_mutex); - ring->vr_link = NULL; -} - -static int -viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) -{ - mutex_enter(&ring->vr_lock); - if (ring->vr_state == VRS_RESET) { - mutex_exit(&ring->vr_lock); - return (0); - } - - if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { - ring->vr_state_flags |= VRSF_REQ_STOP; - cv_broadcast(&ring->vr_cv); - } - while (ring->vr_state != VRS_RESET) { - if (!heed_signals) { - cv_wait(&ring->vr_cv, &ring->vr_lock); - } else { - int rs; - - rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); - if (rs <= 0 && ring->vr_state != VRS_RESET) { - mutex_exit(&ring->vr_lock); - return (EINTR); - } - } - } - viona_ring_lease_drop(ring); - mutex_exit(&ring->vr_lock); - return (0); -} - -static boolean_t -viona_ring_map(viona_vring_t *ring) -{ - uint64_t pos = ring->vr_pa; - const uint16_t qsz = ring->vr_size; - - ASSERT3U(qsz, !=, 0); - ASSERT3U(pos, !=, 0); - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - const size_t desc_sz = qsz * sizeof (struct virtio_desc); - ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); - if (ring->vr_descr == NULL) { - goto fail; - } - pos += desc_sz; - - const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); - ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); - if (ring->vr_avail_flags == NULL) { - goto fail; - } - ring->vr_avail_idx = ring->vr_avail_flags + 1; - ring->vr_avail_ring = ring->vr_avail_flags + 2; - ring->vr_avail_used_event = ring->vr_avail_ring + qsz; - pos += avail_sz; - - const size_t used_sz = (qsz * sizeof (struct virtio_used)) + - (sizeof (uint16_t) * 3); - pos = P2ROUNDUP(pos, VRING_ALIGN); - ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); - if (ring->vr_used_flags == 
NULL) { - goto fail; - } - ring->vr_used_idx = ring->vr_used_flags + 1; - ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); - ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); - - return (B_TRUE); - -fail: - viona_ring_unmap(ring); - return (B_FALSE); -} - -static void -viona_ring_unmap(viona_vring_t *ring) -{ - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - ring->vr_descr = NULL; - ring->vr_avail_flags = NULL; - ring->vr_avail_idx = NULL; - ring->vr_avail_ring = NULL; - ring->vr_avail_used_event = NULL; - ring->vr_used_flags = NULL; - ring->vr_used_idx = NULL; - ring->vr_used_ring = NULL; - ring->vr_used_avail_event = NULL; -} - -static int -viona_ioc_ring_init(viona_link_t *link, void *udata, int md) -{ - vioc_ring_init_t kri; - viona_vring_t *ring; - kthread_t *t; - int err = 0; - - if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { - return (EFAULT); - } - - if (kri.ri_index >= VIONA_VQ_MAX) { - return (EINVAL); - } - const uint16_t qsz = kri.ri_qsize; - if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { - return (EINVAL); - } - - ring = &link->l_vrings[kri.ri_index]; - mutex_enter(&ring->vr_lock); - if (ring->vr_state != VRS_RESET) { - mutex_exit(&ring->vr_lock); - return (EBUSY); - } - VERIFY(ring->vr_state_flags == 0); - - ring->vr_lease = NULL; - if (!viona_ring_lease_renew(ring)) { - err = EBUSY; - goto fail; - } - - ring->vr_size = qsz; - ring->vr_mask = (ring->vr_size - 1); - ring->vr_pa = kri.ri_qaddr; - if (!viona_ring_map(ring)) { - err = EINVAL; - goto fail; - } - - /* Initialize queue indexes */ - ring->vr_cur_aidx = 0; - - /* Allocate desb handles for TX ring if packet copying not disabled */ - if (kri.ri_index == VIONA_VQ_TX && !link->l_force_tx_copy) { - viona_desb_t *dp; - - dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); - ring->vr_txdesb = dp; - for (uint_t i = 0; i < qsz; i++, dp++) { - dp->d_frtn.free_func = viona_desb_release; - dp->d_frtn.free_arg = (void *)dp; - dp->d_ring = ring; - dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, - KM_SLEEP); - } - } - - /* Allocate ring-sized iovec buffers for TX */ - if (kri.ri_index == VIONA_VQ_TX) { - ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, - KM_SLEEP); - } - - /* Zero out MSI-X configuration */ - ring->vr_msi_addr = 0; - ring->vr_msi_msg = 0; - - /* Clear the stats */ - bzero(&ring->vr_stats, sizeof (ring->vr_stats)); - - t = viona_create_worker(ring); - if (t == NULL) { - err = ENOMEM; - goto fail; - } - ring->vr_worker_thread = t; - ring->vr_state = VRS_SETUP; - cv_broadcast(&ring->vr_cv); - mutex_exit(&ring->vr_lock); - return (0); - -fail: - viona_ring_lease_drop(ring); - viona_ring_misc_free(ring); - ring->vr_size = 0; - ring->vr_mask = 0; - mutex_exit(&ring->vr_lock); - return (err); -} - -static int -viona_ioc_ring_reset(viona_link_t *link, uint_t idx) -{ - viona_vring_t *ring; - - if (idx >= VIONA_VQ_MAX) { - return (EINVAL); - } - ring = &link->l_vrings[idx]; - - return (viona_ring_reset(ring, B_TRUE)); -} - -static int -viona_ioc_ring_kick(viona_link_t *link, uint_t idx) -{ - viona_vring_t *ring; - int err; - - if (idx >= VIONA_VQ_MAX) { - return (EINVAL); - } - ring = &link->l_vrings[idx]; - - mutex_enter(&ring->vr_lock); - switch (ring->vr_state) { - case VRS_SETUP: - /* - * An early kick to a ring which is starting its worker thread - * is fine. Once that thread is active, it will process the - * start-up request immediately. 
*/ - /* FALLTHROUGH */ - case VRS_INIT: - ring->vr_state_flags |= VRSF_REQ_START; - /* FALLTHROUGH */ - case VRS_RUN: - cv_broadcast(&ring->vr_cv); - err = 0; - break; - default: - err = EBUSY; - break; - } - mutex_exit(&ring->vr_lock); - - return (err); -} - -static int -viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) -{ - vioc_ring_msi_t vrm; - viona_vring_t *ring; - - if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { - return (EFAULT); - } - if (vrm.rm_index >= VIONA_VQ_MAX) { - return (EINVAL); - } - - ring = &link->l_vrings[vrm.rm_index]; - mutex_enter(&ring->vr_lock); - ring->vr_msi_addr = vrm.rm_addr; - ring->vr_msi_msg = vrm.rm_msg; - mutex_exit(&ring->vr_lock); - - return (0); -} - -static int -viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) -{ - viona_link_t *link = (viona_link_t *)arg; - uint16_t vq = (uint16_t)val; - - if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { - return (EINVAL); - } - return (viona_ioc_ring_kick(link, vq)); -} - -static int -viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) -{ - int err = 0; - - if (link->l_notify_ioport != 0) { - vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); - link->l_notify_ioport = 0; - } - - if (ioport != 0) { - err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, - viona_notify_wcb, (void *)link, &link->l_notify_cookie); - if (err == 0) { - link->l_notify_ioport = ioport; - } - } - return (err); -} - -/* - * Return the number of available descriptors in the vring taking care of the - * 16-bit index wraparound. - * - * Note: If the number of apparently available descriptors is larger than the - * ring size (due to guest misbehavior), this check will still report the - * positive count of descriptors. - */ -static inline int -viona_vr_num_avail(viona_vring_t *ring) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) in GF(2^16). - * - * The only glitch here is that in standard C, uint16_t promotes to - * (signed) int when int has more than 16 bits (almost always now). - * A cast back to unsigned is necessary for proper operation. - */ - ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx; - - return (ndesc); -} - -static void -viona_worker_rx(viona_vring_t *ring, viona_link_t *link) -{ - proc_t *p = ttoproc(curthread); - - (void) thread_vsetname(curthread, "viona_rx_%p", ring); - - ASSERT(MUTEX_HELD(&ring->vr_lock)); - ASSERT3U(ring->vr_state, ==, VRS_RUN); - - *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; - - do { - if (vmm_drv_lease_expired(ring->vr_lease)) { - /* - * Set the renewal flag, causing incoming traffic to be - * dropped, and issue an RX barrier to ensure any - * threads in the RX callbacks will have finished. - * The vr_lock cannot be held across the barrier as it - * poses a deadlock risk. - */ - ring->vr_state_flags |= VRSF_RENEW; - mutex_exit(&ring->vr_lock); - mac_rx_barrier(link->l_mch); - mutex_enter(&ring->vr_lock); - - if (!viona_ring_lease_renew(ring)) { - break; - } - ring->vr_state_flags &= ~VRSF_RENEW; - } - - /* - * For now, there is little to do in the RX worker as inbound - * data is delivered by MAC via the RX callbacks. If tap-like - * functionality is added later, this would be a convenient - * place to inject frames into the guest. 
- */ - (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); - } while (!VRING_NEED_BAIL(ring, p)); - - *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; -} - -static void -viona_worker_tx(viona_vring_t *ring, viona_link_t *link) -{ - proc_t *p = ttoproc(curthread); - - (void) thread_vsetname(curthread, "viona_tx_%p", ring); - - ASSERT(MUTEX_HELD(&ring->vr_lock)); - ASSERT3U(ring->vr_state, ==, VRS_RUN); - - mutex_exit(&ring->vr_lock); - - for (;;) { - boolean_t bail = B_FALSE; - boolean_t renew = B_FALSE; - uint_t ntx = 0; - - *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; - while (viona_vr_num_avail(ring)) { - viona_tx(link, ring); - - /* - * It is advantageous for throughput to keep this - * transmission loop tight, but periodic breaks to - * check for other events are of value too. - */ - if (ntx++ >= ring->vr_size) - break; - } - *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; - - VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); - - /* - * Check for available descriptors on the ring once more in - * case a late addition raced with the NO_NOTIFY flag toggle. - * - * The barrier ensures that visibility of the vr_used_flags - * store does not cross the viona_vr_num_avail() check below. - */ - membar_enter(); - bail = VRING_NEED_BAIL(ring, p); - renew = vmm_drv_lease_expired(ring->vr_lease); - if (!bail && !renew && viona_vr_num_avail(ring)) { - continue; - } - - if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { - viona_intr_ring(ring); - } - - mutex_enter(&ring->vr_lock); - - while (!bail && !renew && !viona_vr_num_avail(ring)) { - (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); - bail = VRING_NEED_BAIL(ring, p); - renew = vmm_drv_lease_expired(ring->vr_lease); - } - - if (bail) { - break; - } else if (renew) { - ring->vr_state_flags |= VRSF_RENEW; - /* - * When renewing the lease for the ring, no TX - * frames may be outstanding, as they contain - * references to guest memory. - */ - viona_tx_wait_outstanding(ring); - - if (!viona_ring_lease_renew(ring)) { - break; - } - ring->vr_state_flags &= ~VRSF_RENEW; - } - mutex_exit(&ring->vr_lock); - } - - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - viona_tx_wait_outstanding(ring); -} - -static void -viona_worker(void *arg) -{ - viona_vring_t *ring = (viona_vring_t *)arg; - viona_link_t *link = ring->vr_link; - proc_t *p = ttoproc(curthread); - - mutex_enter(&ring->vr_lock); - VERIFY3U(ring->vr_state, ==, VRS_SETUP); - - /* Bail immediately if ring shutdown or process exit was requested */ - if (VRING_NEED_BAIL(ring, p)) { - goto cleanup; - } - - /* Report worker thread as alive and notify creator */ - ring->vr_state = VRS_INIT; - cv_broadcast(&ring->vr_cv); - - while (ring->vr_state_flags == 0) { - /* - * Keeping lease renewals timely while waiting for the ring to - * be started is important for avoiding deadlocks. 
- */ - if (vmm_drv_lease_expired(ring->vr_lease)) { - if (!viona_ring_lease_renew(ring)) { - goto cleanup; - } - } - - (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); - - if (VRING_NEED_BAIL(ring, p)) { - goto cleanup; - } - } - - ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); - ring->vr_state = VRS_RUN; - ring->vr_state_flags &= ~VRSF_REQ_START; - - /* Ensure ring lease is valid first */ - if (vmm_drv_lease_expired(ring->vr_lease)) { - if (!viona_ring_lease_renew(ring)) { - goto cleanup; - } - } - - /* Process actual work */ - if (ring == &link->l_vrings[VIONA_VQ_RX]) { - viona_worker_rx(ring, link); - } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { - viona_worker_tx(ring, link); - } else { - panic("unexpected ring: %p", (void *)ring); - } - -cleanup: - if (ring->vr_txdesb != NULL) { - /* - * Transmit activity must be entirely concluded before the - * associated descriptors can be cleaned up. - */ - VERIFY(ring->vr_xfer_outstanding == 0); - } - viona_ring_misc_free(ring); - - viona_ring_lease_drop(ring); - ring->vr_cur_aidx = 0; - ring->vr_state = VRS_RESET; - ring->vr_state_flags = 0; - ring->vr_worker_thread = NULL; - cv_broadcast(&ring->vr_cv); - mutex_exit(&ring->vr_lock); - - mutex_enter(&ttoproc(curthread)->p_lock); - lwp_exit(); -} - -static kthread_t * -viona_create_worker(viona_vring_t *ring) -{ - k_sigset_t hold_set; - proc_t *p = curproc; - kthread_t *t; - klwp_t *lwp; - - ASSERT(MUTEX_HELD(&ring->vr_lock)); - ASSERT(ring->vr_state == VRS_RESET); - - sigfillset(&hold_set); - lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, - minclsyspri - 1, &hold_set, curthread->t_cid, 0); - if (lwp == NULL) { - return (NULL); - } - - t = lwptot(lwp); - mutex_enter(&p->p_lock); - t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; - lwp_create_done(t); - mutex_exit(&p->p_lock); - - return (t); -} - -static int -viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) -{ - if (idx >= VIONA_VQ_MAX) { - return (EINVAL); - } - - link->l_vrings[idx].vr_intr_enabled = 0; - return (0); -} - -static int -viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) -{ - uint_t cnt = 0; - vioc_intr_poll_t vip; - - for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { - uint_t val = link->l_vrings[i].vr_intr_enabled; - - vip.vip_status[i] = val; - if (val != 0) { - cnt++; - } - } - - if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { - return (EFAULT); - } - *rv = (int)cnt; - return (0); -} - -static int -vq_popchain(viona_vring_t *ring, struct iovec *iov, int niov, uint16_t *cookie) -{ - uint_t i, ndesc, idx, head, next; - struct virtio_desc vdir; - void *buf; - - ASSERT(iov != NULL); - ASSERT(niov > 0); - - mutex_enter(&ring->vr_a_mutex); - idx = ring->vr_cur_aidx; - ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); - - if (ndesc == 0) { - mutex_exit(&ring->vr_a_mutex); - return (0); - } - if (ndesc > ring->vr_size) { - /* - * Despite the fact that the guest has provided an 'avail_idx' - * which indicates that an impossible number of descriptors are - * available, continue on and attempt to process the next one. - * - * The transgression will not escape the probe or stats though. 
- */ - VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, - uint16_t, ndesc); - VIONA_RING_STAT_INCR(ring, ndesc_too_high); - } - - head = ring->vr_avail_ring[idx & ring->vr_mask]; - next = head; - - for (i = 0; i < niov; next = vdir.vd_next) { - if (next >= ring->vr_size) { - VIONA_PROBE2(bad_idx, viona_vring_t *, ring, - uint16_t, next); - VIONA_RING_STAT_INCR(ring, bad_idx); - goto bail; - } - - vdir = ring->vr_descr[next]; - if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { - if (vdir.vd_len == 0) { - VIONA_PROBE2(desc_bad_len, - viona_vring_t *, ring, - uint32_t, vdir.vd_len); - VIONA_RING_STAT_INCR(ring, desc_bad_len); - goto bail; - } - buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); - if (buf == NULL) { - VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); - VIONA_RING_STAT_INCR(ring, bad_ring_addr); - goto bail; - } - iov[i].iov_base = buf; - iov[i].iov_len = vdir.vd_len; - i++; - } else { - const uint_t nindir = vdir.vd_len / 16; - volatile struct virtio_desc *vindir; - - if ((vdir.vd_len & 0xf) || nindir == 0) { - VIONA_PROBE2(indir_bad_len, - viona_vring_t *, ring, - uint32_t, vdir.vd_len); - VIONA_RING_STAT_INCR(ring, indir_bad_len); - goto bail; - } - vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); - if (vindir == NULL) { - VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); - VIONA_RING_STAT_INCR(ring, bad_ring_addr); - goto bail; - } - next = 0; - for (;;) { - struct virtio_desc vp; - - /* - * A copy of the indirect descriptor is made - * here, rather than simply using a reference - * pointer. This prevents malicious or - * erroneous guest writes to the descriptor - * from fooling the flags/bounds verification - * through a race. - */ - vp = vindir[next]; - if (vp.vd_flags & VRING_DESC_F_INDIRECT) { - VIONA_PROBE1(indir_bad_nest, - viona_vring_t *, ring); - VIONA_RING_STAT_INCR(ring, - indir_bad_nest); - goto bail; - } else if (vp.vd_len == 0) { - VIONA_PROBE2(desc_bad_len, - viona_vring_t *, ring, - uint32_t, vp.vd_len); - VIONA_RING_STAT_INCR(ring, - desc_bad_len); - goto bail; - } - buf = viona_gpa2kva(ring, vp.vd_addr, - vp.vd_len); - if (buf == NULL) { - VIONA_PROBE_BAD_RING_ADDR(ring, - vp.vd_addr); - VIONA_RING_STAT_INCR(ring, - bad_ring_addr); - goto bail; - } - iov[i].iov_base = buf; - iov[i].iov_len = vp.vd_len; - i++; - - if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) - break; - if (i >= niov) { - goto loopy; - } - - next = vp.vd_next; - if (next >= nindir) { - VIONA_PROBE3(indir_bad_next, - viona_vring_t *, ring, - uint16_t, next, - uint_t, nindir); - VIONA_RING_STAT_INCR(ring, - indir_bad_next); - goto bail; - } - } - } - if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { - *cookie = head; - ring->vr_cur_aidx++; - mutex_exit(&ring->vr_a_mutex); - return (i); - } - } - -loopy: - VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); - VIONA_RING_STAT_INCR(ring, too_many_desc); -bail: - mutex_exit(&ring->vr_a_mutex); - return (-1); -} - -static void -vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) -{ - volatile struct virtio_used *vu; - uint_t uidx; - - mutex_enter(&ring->vr_u_mutex); - - uidx = *ring->vr_used_idx; - vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; - vu->vu_idx = cookie; - vu->vu_tlen = len; - membar_producer(); - *ring->vr_used_idx = uidx; - - mutex_exit(&ring->vr_u_mutex); -} - -static void -vq_pushchain_mrgrx(viona_vring_t *ring, int num_bufs, used_elem_t *elem) -{ - volatile struct virtio_used *vu; - uint_t uidx, i; - - mutex_enter(&ring->vr_u_mutex); - - uidx = *ring->vr_used_idx; - if (num_bufs == 1) { - vu = 
&ring->vr_used_ring[uidx++ & ring->vr_mask]; - vu->vu_idx = elem[0].id; - vu->vu_tlen = elem[0].len; - } else { - for (i = 0; i < num_bufs; i++) { - vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; - vu->vu_idx = elem[i].id; - vu->vu_tlen = elem[i].len; - } - uidx = uidx + num_bufs; - } - membar_producer(); - *ring->vr_used_idx = uidx; - - mutex_exit(&ring->vr_u_mutex); -} - -static void -viona_intr_ring(viona_vring_t *ring) -{ - uint64_t addr; - - mutex_enter(&ring->vr_lock); - /* Deliver the interrupt directly, if so configured. */ - if ((addr = ring->vr_msi_addr) != 0) { - uint64_t msg = ring->vr_msi_msg; - - mutex_exit(&ring->vr_lock); - (void) vmm_drv_msi(ring->vr_lease, addr, msg); - return; - } - mutex_exit(&ring->vr_lock); - - if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { - pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); - } -} - -static size_t -viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, - boolean_t *end) -{ - size_t copied = 0; - size_t off = 0; - - /* Seek past already-consumed data */ - while (seek > 0 && mp != NULL) { - const size_t chunk = MBLKL(mp); - - if (chunk > seek) { - off = seek; - break; - } - mp = mp->b_cont; - seek -= chunk; - } - - while (mp != NULL) { - const size_t chunk = MBLKL(mp) - off; - const size_t to_copy = MIN(chunk, len); - - bcopy(mp->b_rptr + off, buf, to_copy); - copied += to_copy; - buf += to_copy; - len -= to_copy; - - /* - * If all the remaining data in the mblk_t was copied, move on - * to the next one in the chain. Any seek offset applied to - * the first mblk copy is zeroed out for subsequent operations. - */ - if (chunk == to_copy) { - mp = mp->b_cont; - off = 0; - } -#ifdef DEBUG - else { - /* - * The only valid reason for the copy to consume less - * than the entire contents of the mblk_t is because - * the output buffer has been filled. - */ - ASSERT0(len); - } -#endif - - /* Go no further if the buffer has been filled */ - if (len == 0) { - break; - } - - } - *end = (mp == NULL); - return (copied); -} - -static int -viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) -{ - struct iovec iov[VTNET_MAXSEGS]; - uint16_t cookie; - int n; - const size_t hdr_sz = sizeof (struct virtio_net_hdr); - struct virtio_net_hdr *hdr; - size_t len, copied = 0; - caddr_t buf = NULL; - boolean_t end = B_FALSE; - const uint32_t features = ring->vr_link->l_features; - - ASSERT(msz >= MIN_BUF_SIZE); - - n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); - if (n <= 0) { - /* Without available buffers, the frame must be dropped. */ - return (ENOSPC); - } - if (iov[0].iov_len < hdr_sz) { - /* - * There is little to do if there is not even space available - * for the sole header. Zero the buffer and bail out as a last - * act of desperation. - */ - bzero(iov[0].iov_base, iov[0].iov_len); - goto bad_frame; - } - - /* Grab the address of the header before anything else */ - hdr = (struct virtio_net_hdr *)iov[0].iov_base; - - /* - * If there is any space remaining in the first buffer after writing - * the header, fill it with frame data. - */ - if (iov[0].iov_len > hdr_sz) { - buf = (caddr_t)iov[0].iov_base + hdr_sz; - len = iov[0].iov_len - hdr_sz; - - copied += viona_copy_mblk(mp, copied, buf, len, &end); - } - - /* Copy any remaining data into subsequent buffers, if present */ - for (int i = 1; i < n && !end; i++) { - buf = (caddr_t)iov[i].iov_base; - len = iov[i].iov_len; - - copied += viona_copy_mblk(mp, copied, buf, len, &end); - } - - /* Was the expected amount of data copied? 
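vq_pushchain() above stores the used element first and advances the shared used index only after a membar_producer(), so a guest polling the index never observes it before the element it describes. A hedged sketch of the same publish pattern using C11 atomics (stand-in types and sizes; the driver itself uses illumos membar primitives over guest-shared mappings):

#include <stdatomic.h>
#include <stdint.h>

struct used_entry {
	uint32_t id;
	uint32_t len;
};

struct used_ring {
	struct used_entry ents[256];
	_Atomic uint16_t idx;	/* the guest reads this to find new entries */
};

void
push_used(struct used_ring *r, uint16_t cookie, uint32_t len)
{
	uint16_t slot = atomic_load_explicit(&r->idx, memory_order_relaxed);

	r->ents[slot & 255].id = cookie;
	r->ents[slot & 255].len = len;
	/*
	 * Release ordering plays the role of membar_producer(): the entry
	 * stores above must be visible before the index store below.
	 */
	atomic_store_explicit(&r->idx, (uint16_t)(slot + 1),
	    memory_order_release);
}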
*/ - if (copied != msz) { - VIONA_PROBE5(too_short, viona_vring_t *, ring, - uint16_t, cookie, mblk_t *, mp, size_t, copied, - size_t, msz); - VIONA_RING_STAT_INCR(ring, too_short); - goto bad_frame; - } - - /* Populate (read: zero) the header and account for it in the size */ - bzero(hdr, hdr_sz); - copied += hdr_sz; - - /* Add chksum bits, if needed */ - if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { - uint32_t cksum_flags; - - if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && - ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { - hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; - hdr->vrh_gso_size = DB_LSOMSS(mp); - } - - mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, - &cksum_flags); - if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { - hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; - } - } - - /* Release this chain */ - vq_pushchain(ring, copied, cookie); - return (0); - -bad_frame: - VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, - mblk_t *, mp); - VIONA_RING_STAT_INCR(ring, bad_rx_frame); - - vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); - return (EINVAL); -} - -static int -viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) -{ - struct iovec iov[VTNET_MAXSEGS]; - used_elem_t uelem[VTNET_MAXSEGS]; - int n, i = 0, buf_idx = 0, err = 0; - uint16_t cookie; - caddr_t buf; - size_t len, copied = 0, chunk = 0; - struct virtio_net_mrgrxhdr *hdr = NULL; - const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); - boolean_t end = B_FALSE; - const uint32_t features = ring->vr_link->l_features; - - ASSERT(msz >= MIN_BUF_SIZE); - - n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); - if (n <= 0) { - /* Without available buffers, the frame must be dropped. */ - VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); - VIONA_RING_STAT_INCR(ring, no_space); - return (ENOSPC); - } - if (iov[0].iov_len < hdr_sz) { - /* - * There is little to do if there is not even space available - * for the sole header. Zero the buffer and bail out as a last - * act of desperation. - */ - bzero(iov[0].iov_base, iov[0].iov_len); - uelem[0].id = cookie; - uelem[0].len = iov[0].iov_len; - err = EINVAL; - goto done; - } - - /* Grab the address of the header and do initial population */ - hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; - bzero(hdr, hdr_sz); - hdr->vrh_bufs = 1; - - /* - * If there is any space remaining in the first buffer after writing - * the header, fill it with frame data. - */ - if (iov[0].iov_len > hdr_sz) { - buf = iov[0].iov_base + hdr_sz; - len = iov[0].iov_len - hdr_sz; - - chunk += viona_copy_mblk(mp, copied, buf, len, &end); - copied += chunk; - } - i = 1; - - do { - while (i < n && !end) { - buf = iov[i].iov_base; - len = iov[i].iov_len; - - chunk += viona_copy_mblk(mp, copied, buf, len, &end); - copied += chunk; - i++; - } - - uelem[buf_idx].id = cookie; - uelem[buf_idx].len = chunk; - - /* - * Try to grab another buffer from the ring if the mblk has not - * yet been entirely copied out. - */ - if (!end) { - if (buf_idx == (VTNET_MAXSEGS - 1)) { - /* - * Our arbitrary limit on the number of buffers - * to offer for merge has already been reached. - */ - err = EOVERFLOW; - break; - } - n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); - if (n <= 0) { - /* - * Without more immediate space to perform the - * copying, there is little choice left but to - * drop the packet. 
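viona_recv_plain() above writes a virtio_net_hdr at the head of the first guest buffer and then spills frame data across the remainder of the chain. The layout logic, reduced to a hypothetical standalone routine over an iovec array (HDR_SZ assumes the 10-byte legacy virtio_net_hdr defined later in this diff):

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

#define HDR_SZ	10	/* legacy struct virtio_net_hdr */

/* Returns bytes of payload placed, or -1 if iov[0] cannot hold the header. */
ssize_t
fill_chain(struct iovec *iov, int niov, const char *frame, size_t len)
{
	size_t copied = 0;

	if (niov < 1 || iov[0].iov_len < HDR_SZ)
		return (-1);
	memset(iov[0].iov_base, 0, HDR_SZ);	/* header is zeroed */

	/* Use the remainder of iov[0] first, then subsequent buffers. */
	for (int i = 0; i < niov && copied < len; i++) {
		char *dst = (char *)iov[i].iov_base + (i == 0 ? HDR_SZ : 0);
		size_t room = iov[i].iov_len - (i == 0 ? HDR_SZ : 0);
		size_t chunk = (len - copied < room) ? len - copied : room;

		memcpy(dst, frame + copied, chunk);
		copied += chunk;
	}
	return ((ssize_t)copied);
}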
- */
- err = EMSGSIZE;
- break;
- }
- chunk = 0;
- i = 0;
- buf_idx++;
- /*
- * Keep the header up-to-date with the number of
- * buffers, but never reference its value since the
- * guest could meddle with it.
- */
- hdr->vrh_bufs++;
- }
- } while (!end && copied < msz);
-
- /* Account for the header size in the first buffer */
- uelem[0].len += hdr_sz;
-
- /*
- * If no other errors were encountered during the copy, was the
- * expected amount of data transferred?
- */
- if (err == 0 && copied != msz) {
- VIONA_PROBE5(too_short, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp, size_t, copied,
- size_t, msz);
- VIONA_RING_STAT_INCR(ring, too_short);
- err = EINVAL;
- }
-
- /* Add chksum bits, if needed */
- if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
- uint32_t cksum_flags;
-
- if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
- ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
- hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
- hdr->vrh_gso_size = DB_LSOMSS(mp);
- }
-
- mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
- &cksum_flags);
- if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
- hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
- }
- }
-
-done:
- switch (err) {
- case 0:
- /* Success can fall right through to ring delivery */
- break;
-
- case EMSGSIZE:
- VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
- break;
-
- case EOVERFLOW:
- VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
- break;
-
- default:
- VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
- uint16_t, cookie, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, bad_rx_frame);
- }
- vq_pushchain_mrgrx(ring, buf_idx + 1, uelem);
- return (err);
-}
-
-static void
-viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
-{
- viona_link_t *link = ring->vr_link;
- mblk_t *mprx = NULL, **mprx_prevp = &mprx;
- mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
- const boolean_t do_merge =
- ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
- const boolean_t guest_csum =
- ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0);
- const boolean_t guest_tso4 =
- ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0);
-
- size_t nrx = 0, ndrop = 0;
-
- /*
- * The mac_hw_emul() function, by design, doesn't predicate on
- * HW_LOCAL_MAC. Since we are in Rx context we know that any
- * LSO packet must also be from a same-machine sender. We take
- * advantage of that and forgo writing a manual loop to
- * predicate on HW_LOCAL_MAC.
- *
- * For checksum emulation we need to predicate on HW_LOCAL_MAC
- * to avoid calling mac_hw_emul() on packets that don't need
- * it (thanks to the fact that HCK_IPV4_HDRCKSUM and
- * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do
- * the checksum emulation in the second loop.
- */
- if (!guest_tso4)
- mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
-
- while (mp != NULL) {
- mblk_t *next, *pad = NULL;
- size_t size;
- int err = 0;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
- /*
- * The VIRTIO_NET_HDR_F_DATA_VALID flag only
- * covers the ULP checksum -- so we still have
- * to populate the IP header checksum.
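The checksum block above is the whole of viona's RX offload story: a same-machine mblk that arrived with a verified ULP checksum gets VIRTIO_NET_HDR_F_DATA_VALID, and an LSO mblk is described to the guest as TCPv4 GSO. A reduced sketch of that mapping, with stand-in input flags in place of the mblk accessors (the virtio constants match those defined in viona_impl.h later in this diff):

#include <stdint.h>

#define IN_HW_LSO		0x01	/* stand-in for HW_LSO */
#define IN_FULLCKSUM_OK		0x02	/* stand-in for HCK_FULLCKSUM_OK */
#define GSO_TCPV4		1	/* VIRTIO_NET_HDR_GSO_TCPV4 */
#define F_DATA_VALID		(1 << 1) /* VIRTIO_NET_HDR_F_DATA_VALID */

struct vhdr {
	uint8_t flags;
	uint8_t gso_type;
	uint16_t gso_size;
};

void
map_rx_offloads(struct vhdr *h, uint32_t in_flags, uint16_t lso_mss,
    int guest_csum, int guest_tso4)
{
	if (!guest_csum)
		return;
	if (guest_tso4 && (in_flags & IN_HW_LSO) != 0) {
		h->gso_type |= GSO_TCPV4;
		h->gso_size = lso_mss;
	}
	if ((in_flags & IN_FULLCKSUM_OK) != 0)
		h->flags |= F_DATA_VALID;
}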
- */ - if (guest_csum) { - mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL); - } else { - mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); - } - - if (mp == NULL) { - mp = next; - continue; - } - } - - size = msgsize(mp); - - /* - * We treat both a 'drop' response and errors the same here - * and put the packet on the drop chain. As packets may be - * subject to different actions in ipf (which do not all - * return the same set of error values), an error processing - * one packet doesn't mean the next packet will also generate - * an error. - */ - if (VNETHOOK_INTERESTED_IN(link->l_neti) && - viona_hook(link, ring, &mp, B_FALSE) != 0) { - if (mp != NULL) { - *mpdrop_prevp = mp; - mpdrop_prevp = &mp->b_next; - } else { - /* - * If the hook consumer (e.g. ipf) already - * freed the mblk_t, update the drop count now. - */ - ndrop++; - } - mp = next; - continue; - } - - /* - * Ethernet frames are expected to be padded out in order to - * meet the minimum size. - * - * A special case is made for frames which are short by - * VLAN_TAGSZ, having been stripped of their VLAN tag while - * traversing MAC. A preallocated (and recycled) mblk is used - * for that specific condition. - * - * All other frames that fall short on length will have custom - * zero-padding allocated appended to them. - */ - if (size == NEED_VLAN_PAD_SIZE) { - ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); - ASSERT(viona_vlan_pad_mp->b_cont == NULL); - - for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) - ; - - pad->b_cont = viona_vlan_pad_mp; - size += VLAN_TAGSZ; - } else if (size < MIN_BUF_SIZE) { - const size_t pad_size = MIN_BUF_SIZE - size; - mblk_t *zero_mp; - - zero_mp = allocb(pad_size, BPRI_MED); - if (zero_mp == NULL) { - err = ENOMEM; - goto pad_drop; - } - - VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, - mblk_t *, mp, size_t, pad_size); - VIONA_RING_STAT_INCR(ring, rx_pad_short); - zero_mp->b_wptr += pad_size; - bzero(zero_mp->b_rptr, pad_size); - linkb(mp, zero_mp); - size += pad_size; - } - - if (do_merge) { - err = viona_recv_merged(ring, mp, size); - } else { - err = viona_recv_plain(ring, mp, size); - } - - /* - * The VLAN padding mblk is meant for continual reuse, so - * remove it from the chain to prevent it from being freed. - * - * Custom allocated padding does not require this treatment and - * is freed normally. - */ - if (pad != NULL) { - pad->b_cont = NULL; - } - -pad_drop: - /* - * While an error during rx processing - * (viona_recv_{merged,plain}) does not free mp on error, - * hook processing might or might not free mp. Handle either - * scenario -- if mp is not yet free, it is queued up and - * freed after the guest has been notified. If mp is - * already NULL, just proceed on. - */ - if (err != 0) { - *mpdrop_prevp = mp; - mpdrop_prevp = &mp->b_next; - - /* - * If the available ring is empty, do not bother - * attempting to deliver any more frames. Count the - * rest as dropped too. 
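The padding logic above guarantees every frame handed to the guest meets the minimum Ethernet frame length (MIN_BUF_SIZE), special-casing frames short by exactly one stripped VLAN tag. The general case, as a hypothetical standalone computation (the 60-byte value for MIN_BUF_SIZE is an assumption):

#include <string.h>
#include <stddef.h>

#define ETH_MIN		60	/* assumed value of MIN_BUF_SIZE */
#define VLAN_TAGSZ	4	/* short-by-a-tag frames reuse a shared pad */

/*
 * Compute the zero padding required for a short frame. 'pad' must have
 * room for ETH_MIN bytes; returns the padded frame length.
 */
size_t
pad_frame(size_t framelen, unsigned char *pad, size_t *padlen)
{
	if (framelen >= ETH_MIN) {
		*padlen = 0;
		return (framelen);
	}
	*padlen = ETH_MIN - framelen;
	memset(pad, 0, *padlen);
	return (ETH_MIN);
}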
- */ - if (err == ENOSPC) { - mp->b_next = next; - break; - } - } else { - /* Chain successful mblks to be freed later */ - *mprx_prevp = mp; - mprx_prevp = &mp->b_next; - nrx++; - } - mp = next; - } - - membar_enter(); - if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - viona_intr_ring(ring); - } - - /* Free successfully received frames */ - if (mprx != NULL) { - freemsgchain(mprx); - } - - /* Free dropped frames, also tallying them */ - mp = mpdrop; - while (mp != NULL) { - mblk_t *next = mp->b_next; - - mp->b_next = NULL; - freemsg(mp); - mp = next; - ndrop++; - } - VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); -} - -static void -viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t is_loopback) -{ - viona_vring_t *ring = (viona_vring_t *)arg; - - /* Drop traffic if ring is inactive or renewing its lease */ - if (ring->vr_state != VRS_RUN || - (ring->vr_state_flags & VRSF_RENEW) != 0) { - freemsgchain(mp); - return; - } - - viona_rx_common(ring, mp, is_loopback); -} - -static void -viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t is_loopback) -{ - viona_vring_t *ring = (viona_vring_t *)arg; - mac_handle_t mh = ring->vr_link->l_mh; - mblk_t *mp_mcast_only = NULL; - mblk_t **mpp = &mp_mcast_only; - - /* Drop traffic if ring is inactive or renewing its lease */ - if (ring->vr_state != VRS_RUN || - (ring->vr_state_flags & VRSF_RENEW) != 0) { - freemsgchain(mp); - return; - } - - /* - * In addition to multicast traffic, broadcast packets will also arrive - * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback - * for fully-classified traffic has already delivered that broadcast - * traffic, so it should be suppressed here, rather than duplicating it - * to the guest. - */ - while (mp != NULL) { - mblk_t *mp_next; - mac_header_info_t mhi; - int err; - - mp_next = mp->b_next; - mp->b_next = NULL; - - /* Determine the packet type */ - err = mac_vlan_header_info(mh, mp, &mhi); - if (err != 0) { - mblk_t *pull; - - /* - * It is possible that gathering of the header - * information was impeded by a leading mblk_t which - * was of inadequate length to reference the needed - * fields. Try again, in case that could be solved - * with a pull-up. - */ - pull = msgpullup(mp, sizeof (struct ether_vlan_header)); - if (pull == NULL) { - err = ENOMEM; - } else { - err = mac_vlan_header_info(mh, pull, &mhi); - freemsg(pull); - } - - if (err != 0) { - VIONA_RING_STAT_INCR(ring, rx_mcast_check); - } - } - - /* Chain up matching packets while discarding others */ - if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { - *mpp = mp; - mpp = &mp->b_next; - } else { - freemsg(mp); - } - - mp = mp_next; - } - - if (mp_mcast_only != NULL) { - viona_rx_common(ring, mp_mcast_only, is_loopback); - } -} - -static void -viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) -{ - vq_pushchain(ring, len, cookie); - - membar_enter(); - if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - viona_intr_ring(ring); - } -} - -static void -viona_desb_release(viona_desb_t *dp) -{ - viona_vring_t *ring = dp->d_ring; - uint_t ref; - uint32_t len; - uint16_t cookie; - - ref = atomic_dec_uint_nv(&dp->d_ref); - if (ref > 1) { - return; - } - - /* - * The desb corresponding to this index must be ready for reuse before - * the descriptor is returned to the guest via the 'used' ring. 
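viona_desb_release() above is the tail end of a loaned-buffer scheme: every desballoc'd mblk holds a reference on the viona_desb_t, and only when the last block is freed may the descriptor chain be pushed to the used ring. The counting pattern, as a generic sketch with C11 atomics (stand-in names; the driver uses illumos atomic_dec_uint_nv()):

#include <stdatomic.h>
#include <stdint.h>

struct loan {
	_Atomic unsigned refs;	/* 1 base ref + 1 per outstanding block */
	uint16_t cookie;
	uint32_t len;
};

/*
 * Called from each block's free routine; returns 1 when the loan is
 * fully reclaimed and the descriptor may be returned to the guest.
 */
int
loan_release(struct loan *lp)
{
	unsigned nv = atomic_fetch_sub(&lp->refs, 1) - 1;

	if (nv > 1)
		return (0);	/* other blocks still reference the loan */
	/* Only the base reference remains; reset the slot for reuse. */
	atomic_store(&lp->refs, 0);
	return (1);
}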
- */ - len = dp->d_len; - cookie = dp->d_cookie; - dp->d_len = 0; - dp->d_cookie = 0; - dp->d_ref = 0; - - viona_tx_done(ring, len, cookie); - - mutex_enter(&ring->vr_lock); - if ((--ring->vr_xfer_outstanding) == 0) { - cv_broadcast(&ring->vr_cv); - } - mutex_exit(&ring->vr_lock); -} - -static void -viona_tx_wait_outstanding(viona_vring_t *ring) -{ - ASSERT(MUTEX_HELD(&ring->vr_lock)); - - while (ring->vr_xfer_outstanding != 0) { - /* - * Paying heed to signals is counterproductive here. This is a - * very tight loop if pending transfers take an extended amount - * of time to be reclaimed while the host process is exiting. - */ - cv_wait(&ring->vr_cv, &ring->vr_lock); - } -} - -static boolean_t -viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, - mblk_t *mp, uint32_t len) -{ - viona_link_t *link = ring->vr_link; - const struct ether_header *eth; - uint_t eth_len = sizeof (struct ether_header); - ushort_t ftype; - ipha_t *ipha = NULL; - uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ - uint16_t flags = 0; - const uint_t csum_start = hdr->vrh_csum_start; - const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; - - /* - * Validate that the checksum offsets provided by the guest are within - * the bounds of the packet. Additionally, ensure that the checksum - * contents field is within the headers mblk copied by viona_tx(). - */ - if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || - (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { - VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); - VIONA_RING_STAT_INCR(ring, fail_hcksum); - return (B_FALSE); - } - - /* - * This is guaranteed to be safe thanks to the header copying - * done in viona_tx(). - */ - eth = (const struct ether_header *)mp->b_rptr; - ftype = ntohs(eth->ether_type); - - if (ftype == ETHERTYPE_VLAN) { - const struct ether_vlan_header *veth; - - /* punt on QinQ for now */ - eth_len = sizeof (struct ether_vlan_header); - veth = (const struct ether_vlan_header *)eth; - ftype = ntohs(veth->ether_type); - } - - if (ftype == ETHERTYPE_IP) { - ipha = (ipha_t *)(mp->b_rptr + eth_len); - - ipproto = ipha->ipha_protocol; - } else if (ftype == ETHERTYPE_IPV6) { - ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); - - ipproto = ip6h->ip6_nxt; - } - - /* - * We ignore hdr_len because the spec says it can't be - * trusted. Besides, our own stack will determine the header - * boundary. - */ - if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && - (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && - ftype == ETHERTYPE_IP) { - uint16_t *cksump; - uint32_t cksum; - ipaddr_t src = ipha->ipha_src; - ipaddr_t dst = ipha->ipha_dst; - - /* - * Our native IP stack doesn't set the L4 length field - * of the pseudo header when LSO is in play. Other IP - * stacks, e.g. Linux, do include the length field. - * This is a problem because the hardware expects that - * the length field is not set. When it is set it will - * cause an incorrect TCP checksum to be generated. - * The reason this works in Linux is because Linux - * corrects the pseudo-header checksum in the driver - * code. In order to get the correct HW checksum we - * need to assume the guest's IP stack gave us a bogus - * TCP partial checksum and calculate it ourselves. 
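The LSO fix-up above recomputes the TCP pseudo-header checksum without the L4 length term, since that is what the hardware expects. The arithmetic is ordinary ones'-complement folding; a self-contained sketch (the protocol constant is a stand-in for IP_TCP_CSUM_COMP):

#include <stdint.h>

/* Fold a 32-bit ones'-complement accumulator down to 16 bits. */
uint16_t
csum_fold(uint32_t sum)
{
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	return ((uint16_t)sum);
}

/*
 * Pseudo-header partial sum over source/destination addresses plus a
 * protocol constant, deliberately omitting the L4 length (see the
 * comment above on why the length term must be absent for LSO).
 */
uint16_t
pseudo_partial(uint32_t src, uint32_t dst, uint16_t proto_comp)
{
	uint32_t sum = proto_comp;

	sum += (src >> 16) + (src & 0xFFFF);
	sum += (dst >> 16) + (dst & 0xFFFF);
	return (csum_fold(sum));
}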
- */
- cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
- cksum = IP_TCP_CSUM_COMP;
- cksum += (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
- *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
-
- /*
- * Since viona is a "legacy device", the data stored
- * by the driver will be in the guest's native endian
- * format (see sections 2.4.3 and 5.1.6.1 of the
- * VIRTIO 1.0 spec for more info). At this time the
- * only guests using viona are x86 and we can assume
- * little-endian.
- */
- lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
-
- /*
- * Hardware, like ixgbe, expects the client to request
- * IP header checksum offload if it's sending LSO (see
- * ixgbe_get_context()). Unfortunately, virtio makes
- * no allowances for negotiating IP header checksum
- * and HW offload, only TCP checksum. We add the flag
- * and zero-out the checksum field. This mirrors the
- * behavior of our native IP stack (which does this in
- * the interest of HW that expects the field to be
- * zero).
- */
- flags |= HCK_IPV4_HDRCKSUM;
- ipha->ipha_hdr_checksum = 0;
- }
-
- /*
- * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
- * HW_LSO, if present, is not lost.
- */
- flags |= DB_CKSUMFLAGS(mp);
-
- /*
- * Partial checksum support from the NIC is ideal, since it most
- * closely maps to the interface defined by virtio.
- */
- if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- /*
- * MAC expects these offsets to be relative to the
- * start of the L3 header rather than the L2 frame.
- */
- flags |= HCK_PARTIALCKSUM;
- mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
- len - eth_len, 0, flags);
- return (B_TRUE);
- }
-
- /*
- * Without partial checksum support, look to the L3/L4 protocol
- * information to see if the NIC can handle it. If not, the
- * checksum will need to be calculated inline.
- */
- if (ftype == ETHERTYPE_IP) {
- if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
- *csump = 0;
- flags |= HCK_FULLCKSUM;
- mac_hcksum_set(mp, 0, 0, 0, 0, flags);
- return (B_TRUE);
- }
-
- /* XXX: Implement manual fallback checksumming? */
- VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
- VIONA_RING_STAT_INCR(ring, fail_hcksum);
- return (B_FALSE);
- } else if (ftype == ETHERTYPE_IPV6) {
- if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
- (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
- *csump = 0;
- flags |= HCK_FULLCKSUM;
- mac_hcksum_set(mp, 0, 0, 0, 0, flags);
- return (B_TRUE);
- }
-
- /* XXX: Implement manual fallback checksumming?
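The partial-checksum path above performs one subtle translation: virtio's csum_start/csum_offset are relative to the start of the Ethernet frame, while mac_hcksum_set() expects offsets relative to the start of the L3 header. A sketch of the rebasing, with a worked TCP-over-IPv4 case (eth_len is 14, or 18 with a VLAN tag):

#include <stdint.h>
#include <assert.h>

struct l3_offsets {
	uint32_t start;	/* first byte covered by the checksum */
	uint32_t stuff;	/* where the checksum value is stored */
	uint32_t end;	/* one past the last byte covered */
};

static struct l3_offsets
rebase(uint32_t csum_start, uint32_t csum_stuff, uint32_t framelen,
    uint32_t eth_len)
{
	struct l3_offsets o = {
		.start = csum_start - eth_len,
		.stuff = csum_stuff - eth_len,
		.end = framelen - eth_len,
	};
	return (o);
}

int
main(void)
{
	/* TCP/IPv4, no VLAN: csum_start = 14 + 20, csum_offset = 16 */
	struct l3_offsets o = rebase(34, 50, 1514, 14);

	assert(o.start == 20 && o.stuff == 36 && o.end == 1500);
	return (0);
}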
*/ - VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); - VIONA_RING_STAT_INCR(ring, fail_hcksum6); - return (B_FALSE); - } - - /* Cannot even emulate hcksum for unrecognized protocols */ - VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); - VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); - return (B_FALSE); -} - -static void -viona_tx(viona_link_t *link, viona_vring_t *ring) -{ - struct iovec *iov = ring->vr_txiov; - const uint_t max_segs = ring->vr_size; - uint16_t cookie; - int i, n; - uint32_t len, base_off = 0; - uint32_t min_copy = VIONA_MAX_HDRS_LEN; - mblk_t *mp_head, *mp_tail, *mp; - viona_desb_t *dp = NULL; - mac_client_handle_t link_mch = link->l_mch; - const struct virtio_net_hdr *hdr; - - mp_head = mp_tail = NULL; - - ASSERT(iov != NULL); - - n = vq_popchain(ring, iov, max_segs, &cookie); - if (n == 0) { - VIONA_PROBE1(tx_absent, viona_vring_t *, ring); - VIONA_RING_STAT_INCR(ring, tx_absent); - return; - } else if (n < 0) { - /* - * Any error encountered in vq_popchain has already resulted in - * specific probe and statistic handling. Further action here - * is unnecessary. - */ - return; - } - - /* Grab the header and ensure it is of adequate length */ - hdr = (const struct virtio_net_hdr *)iov[0].iov_base; - len = iov[0].iov_len; - if (len < sizeof (struct virtio_net_hdr)) { - goto drop_fail; - } - - /* Make sure the packet headers are always in the first mblk. */ - if (ring->vr_txdesb != NULL) { - dp = &ring->vr_txdesb[cookie]; - - /* - * If the guest driver is operating properly, each desb slot - * should be available for use when processing a TX descriptor - * from the 'avail' ring. In the case of drivers that reuse a - * descriptor before it has been posted to the 'used' ring, the - * data is simply dropped. - */ - if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { - dp = NULL; - goto drop_fail; - } - - dp->d_cookie = cookie; - mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, - &dp->d_frtn); - - /* Account for the successful desballoc. */ - if (mp_head != NULL) - dp->d_ref++; - } else { - mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); - } - - if (mp_head == NULL) - goto drop_fail; - - mp_tail = mp_head; - - /* - * We always copy enough of the guest data to cover the - * headers. This protects us from TOCTOU attacks and allows - * message block length assumptions to be made in subsequent - * code. In many cases, this means copying more data than - * strictly necessary. That's okay, as it is the larger packets - * (such as LSO) that really benefit from desballoc(). - */ - for (i = 1; i < n; i++) { - const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); - - bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); - mp_head->b_wptr += to_copy; - len += to_copy; - min_copy -= to_copy; - - /* - * We've met the minimum copy requirement. The rest of - * the guest data can be referenced. - */ - if (min_copy == 0) { - /* - * If we copied all contents of this - * descriptor then move onto the next one. - * Otherwise, record how far we are into the - * current descriptor. 
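The copy loop above guarantees that at least VIONA_MAX_HDRS_LEN bytes land contiguously in the first mblk, recording base_off when it stops partway through a descriptor so the zero-copy loop that follows can resume there. The bookkeeping, reduced to a hypothetical standalone form:

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

/*
 * Copy up to 'want' header bytes from iov[1..niov) into 'dst'. On return,
 * *next is the first unconsumed iovec and *off the offset into it.
 */
size_t
copy_headers(const struct iovec *iov, int niov, char *dst, size_t want,
    int *next, size_t *off)
{
	size_t copied = 0;
	int i;

	*off = 0;
	for (i = 1; i < niov && copied < want; i++) {
		size_t chunk = iov[i].iov_len;

		if (chunk > want - copied)
			chunk = want - copied;
		memcpy(dst + copied, iov[i].iov_base, chunk);
		copied += chunk;
		if (chunk < iov[i].iov_len) {
			/* Stopped mid-descriptor; remember how far in. */
			*off = chunk;
			break;
		}
	}
	*next = i;
	return (copied);
}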
- */ - if (iov[i].iov_len == to_copy) - i++; - else - base_off = to_copy; - - break; - } - } - - ASSERT3P(mp_head, !=, NULL); - ASSERT3P(mp_tail, !=, NULL); - - for (; i < n; i++) { - uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; - uint32_t chunk = iov[i].iov_len - base_off; - - ASSERT3U(base_off, <, iov[i].iov_len); - ASSERT3U(chunk, >, 0); - - if (dp != NULL) { - mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); - if (mp == NULL) { - goto drop_fail; - } - dp->d_ref++; - } else { - mp = allocb(chunk, BPRI_MED); - if (mp == NULL) { - goto drop_fail; - } - bcopy((uchar_t *)base, mp->b_wptr, chunk); - } - - base_off = 0; - len += chunk; - mp->b_wptr += chunk; - mp_tail->b_cont = mp; - mp_tail = mp; - } - - if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { - /* - * The hook consumer may elect to free the mblk_t and set - * our mblk_t ** to NULL. When using a viona_desb_t - * (dp != NULL), we do not want the corresponding cleanup to - * occur during the viona_hook() call. We instead want to - * reset and recycle dp for future use. To prevent cleanup - * during the viona_hook() call, we take a ref on dp (if being - * used), and release it on success. On failure, the - * freemsgchain() call will release all the refs taken earlier - * in viona_tx() (aside from the initial ref and the one we - * take), and drop_hook will reset dp for reuse. - */ - if (dp != NULL) - dp->d_ref++; - - /* - * Pass &mp instead of &mp_head so we don't lose track of - * mp_head if the hook consumer (i.e. ipf) elects to free mp - * and set mp to NULL. - */ - mp = mp_head; - if (viona_hook(link, ring, &mp, B_TRUE) != 0) { - if (mp != NULL) - freemsgchain(mp); - goto drop_hook; - } - - if (dp != NULL) { - dp->d_ref--; - - /* - * It is possible that the hook(s) accepted the packet, - * but as part of its processing, it issued a pull-up - * which released all references to the desb. In that - * case, go back to acting like the packet is entirely - * copied (which it is). - */ - if (dp->d_ref == 1) { - dp->d_cookie = 0; - dp->d_ref = 0; - dp = NULL; - } - } - } - - /* - * Request hardware checksumming, if necessary. If the guest - * sent an LSO packet then it must have also negotiated and - * requested partial checksum; therefore the LSO logic is - * contained within viona_tx_csum(). - */ - if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && - (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { - if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { - goto drop_fail; - } - } - - if (dp != NULL) { - dp->d_len = len; - mutex_enter(&ring->vr_lock); - ring->vr_xfer_outstanding++; - mutex_exit(&ring->vr_lock); - } else { - /* - * If the data was cloned out of the ring, the descriptors can - * be marked as 'used' now, rather than deferring that action - * until after successful packet transmission. - */ - viona_tx_done(ring, len, cookie); - } - - /* - * We're potentially going deep into the networking layer; make sure the - * guest can't run concurrently. - */ - smt_begin_unsafe(); - mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); - smt_end_unsafe(); - return; - -drop_fail: - /* - * On the off chance that memory is not available via the desballoc or - * allocb calls, there are few options left besides to fail and drop - * the frame on the floor. - */ - - if (dp != NULL) { - /* - * Take an additional reference on the desb handle (if present) - * so any desballoc-sourced mblks can release their hold on it - * without the handle reaching its final state and executing - * its clean-up logic. 
- */ - dp->d_ref++; - } - - /* - * Free any already-allocated blocks and sum up the total length of the - * dropped data to be released to the used ring. - */ - freemsgchain(mp_head); - -drop_hook: - len = 0; - for (uint_t i = 0; i < n; i++) { - len += iov[i].iov_len; - } - - if (dp != NULL) { - VERIFY(dp->d_ref == 2); - - /* Clean up the desb handle, releasing the extra hold. */ - dp->d_len = 0; - dp->d_cookie = 0; - dp->d_ref = 0; - } - - VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, - uint16_t, cookie); - viona_tx_done(ring, len, cookie); -} - -/* - * Generate a hook event for the packet in *mpp headed in the direction - * indicated by 'out'. If the packet is accepted, 0 is returned. If the - * packet is rejected, an error is returned. The hook function may or may not - * alter or even free *mpp. The caller is expected to deal with either - * situation. - */ -static int -viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) -{ - viona_neti_t *nip = link->l_neti; - viona_nethook_t *vnh = &nip->vni_nethook; - hook_pkt_event_t info; - hook_event_t he; - hook_event_token_t het; - int ret; - - he = out ? vnh->vnh_event_out : vnh->vnh_event_in; - het = out ? vnh->vnh_token_out : vnh->vnh_token_in; - - if (!he.he_interested) - return (0); - - info.hpe_protocol = vnh->vnh_neti; - info.hpe_ifp = (phy_if_t)link; - info.hpe_ofp = (phy_if_t)link; - info.hpe_mp = mpp; - info.hpe_flags = 0; - - ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); - if (ret == 0) - return (0); - - if (out) { - VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, - mblk_t *, *mpp, int, ret); - VIONA_RING_STAT_INCR(ring, tx_hookdrop); - } else { - VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, - mblk_t *, *mpp, int, ret); - VIONA_RING_STAT_INCR(ring, rx_hookdrop); - } - return (ret); -} - -/* - * netinfo stubs - required by the nethook framework, but otherwise unused - * - * Currently, all ipf rules are applied against all interfaces in a given - * netstack (e.g. all interfaces in a zone). In the future if we want to - * support being able to apply different rules to different interfaces, I - * believe we would need to implement some of these stubs to map an interface - * name in a rule (e.g. 
'net0', back to an index or viona_link_t); - */ -static int -viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, - char *buf __unused, const size_t len __unused) -{ - return (-1); -} - -static int -viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, - lif_if_t ifdata __unused) -{ - return (-1); -} - -static int -viona_neti_getptmue(net_handle_t neti __unused) -{ - return (-1); -} - -static int -viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, - lif_if_t ifdata __unused, size_t nelem __unused, - net_ifaddr_t type[] __unused, void *storage __unused) -{ - return (-1); -} - -static int -viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, - lif_if_t ifdata __unused, zoneid_t *zid __unused) -{ - return (-1); -} - -static int -viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, - lif_if_t ifdata __unused, uint64_t *flags __unused) -{ - return (-1); -} - -static phy_if_t -viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) -{ - return ((phy_if_t)-1); -} - -static phy_if_t -viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) -{ - return ((phy_if_t)-1); -} - -static lif_if_t -viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, - lif_if_t ifdata __unused) -{ - return (-1); -} - -static int -viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, - net_inject_t *packet __unused) -{ - return (-1); -} - -static phy_if_t -viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, - struct sockaddr *next __unused) -{ - return ((phy_if_t)-1); -} - -static int -viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) -{ - return (-1); -} - -static int -viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) -{ - return (-1); -} - -static net_protocol_t viona_netinfo = { - NETINFO_VERSION, - NHF_VIONA, - viona_neti_getifname, - viona_neti_getmtu, - viona_neti_getptmue, - viona_neti_getlifaddr, - viona_neti_getlifzone, - viona_neti_getlifflags, - viona_neti_phygetnext, - viona_neti_phylookup, - viona_neti_lifgetnext, - viona_neti_inject, - viona_neti_route, - viona_neti_ispchksum, - viona_neti_isvchksum -}; - -/* - * Create/register our nethooks - */ -static int -viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, - net_protocol_t *netip) -{ - int ret; - - if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) { - cmn_err(CE_NOTE, "%s: net_protocol_register failed " - "(netid=%d name=%s)", __func__, nid, nh_name); - goto fail_init_proto; - } - - HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); - if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { - cmn_err(CE_NOTE, "%s: net_family_register failed " - "(netid=%d name=%s err=%d)", __func__, - nid, nh_name, ret); - goto fail_init_family; - } - - HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); - if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, - &vnh->vnh_event_in)) == NULL) { - cmn_err(CE_NOTE, "%s: net_event_register %s failed " - "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, - nh_name); - goto fail_init_event_in; - } - - HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); - if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, - &vnh->vnh_event_out)) == NULL) { - cmn_err(CE_NOTE, "%s: net_event_register %s failed " - "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, - nh_name); - goto fail_init_event_out; - } - return (0); - - /* - * On failure, we undo 
all the steps that succeeded in the - * reverse order of initialization, starting at the last - * successful step (the labels denoting the failing step). - */ -fail_init_event_out: - VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); - VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); - vnh->vnh_token_in = NULL; - -fail_init_event_in: - VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); - VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); - -fail_init_family: - VERIFY0(net_protocol_unregister(vnh->vnh_neti)); - vnh->vnh_neti = NULL; - -fail_init_proto: - return (1); -} - -/* - * Shutdown the nethooks for a protocol family. This triggers notification - * callbacks to anything that has registered interest to allow hook consumers - * to unhook prior to the removal of the hooks as well as makes them unavailable - * to any future consumers as the first step of removal. - */ -static void -viona_nethook_shutdown(viona_nethook_t *vnh) -{ - VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); - VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); - VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); -} - -/* - * Remove the nethooks for a protocol family. - */ -static void -viona_nethook_fini(viona_nethook_t *vnh) -{ - VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); - VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); - VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); - VERIFY0(net_protocol_unregister(vnh->vnh_neti)); - vnh->vnh_neti = NULL; -} - -/* - * Callback invoked by the neti module. This creates/registers our hooks - * {IPv4,IPv6}{in,out} with the nethook framework so they are available to - * interested consumers (e.g. ipf). - * - * During attach, viona_neti_create is called once for every netstack - * present on the system at the time of attach. Thereafter, it is called - * during the creation of additional netstack instances (i.e. zone boot). As a - * result, the viona_neti_t that is created during this call always occurs - * prior to any viona instances that will use it to send hook events. - * - * It should never return NULL. If we cannot register our hooks, we do not - * set vnh_hooked of the respective protocol family, which will prevent the - * creation of any viona instances on this netstack (see viona_ioc_create). - * This can only occur if after a shutdown event (which means destruction is - * imminent) we are trying to create a new instance. - */ -static void * -viona_neti_create(const netid_t netid) -{ - viona_neti_t *nip; - - VERIFY(netid != -1); - - nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); - nip->vni_netid = netid; - nip->vni_zid = net_getzoneidbynetid(netid); - mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); - list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), - offsetof(viona_soft_state_t, ss_node)); - - if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, - &viona_netinfo) == 0) - nip->vni_nethook.vnh_hooked = B_TRUE; - - mutex_enter(&viona_neti_lock); - list_insert_tail(&viona_neti_list, nip); - mutex_exit(&viona_neti_lock); - - return (nip); -} - -/* - * Called during netstack teardown by the neti module. During teardown, all - * the shutdown callbacks are invoked, allowing consumers to release any holds - * and otherwise quiesce themselves prior to destruction, followed by the - * actual destruction callbacks. 
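viona_nethook_init() above uses the classic goto-label unwind: each failure jumps to a label that tears down only the steps that already succeeded, in reverse order, with each label falling through to the next. The shape of the pattern, as a generic sketch:

#include <stdio.h>

static int step_a(void) { return (0); }		/* 0 == success */
static int step_b(void) { return (0); }
static int step_c(void) { return (-1); }	/* pretend this fails */
static void undo_a(void) { (void) puts("undo a"); }
static void undo_b(void) { (void) puts("undo b"); }

static int
init_all(void)
{
	if (step_a() != 0)
		goto fail_a;
	if (step_b() != 0)
		goto fail_b;
	if (step_c() != 0)
		goto fail_c;
	return (0);

	/* Unwind in reverse order of initialization. */
fail_c:
	undo_b();
fail_b:
	undo_a();
fail_a:
	return (1);
}

int
main(void)
{
	return (init_all());
}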
- */ -static void -viona_neti_shutdown(netid_t nid, void *arg) -{ - viona_neti_t *nip = arg; - - ASSERT(nip != NULL); - VERIFY(nid == nip->vni_netid); - - mutex_enter(&viona_neti_lock); - list_remove(&viona_neti_list, nip); - mutex_exit(&viona_neti_lock); - - if (nip->vni_nethook.vnh_hooked) - viona_nethook_shutdown(&nip->vni_nethook); -} - -/* - * Called during netstack teardown by the neti module. Destroys the viona - * netinst data. This is invoked after all the netstack and neti shutdown - * callbacks have been invoked. - */ -static void -viona_neti_destroy(netid_t nid, void *arg) -{ - viona_neti_t *nip = arg; - - ASSERT(nip != NULL); - VERIFY(nid == nip->vni_netid); - - mutex_enter(&nip->vni_lock); - while (nip->vni_ref != 0) - cv_wait(&nip->vni_ref_change, &nip->vni_lock); - mutex_exit(&nip->vni_lock); - - VERIFY(!list_link_active(&nip->vni_node)); - - if (nip->vni_nethook.vnh_hooked) - viona_nethook_fini(&nip->vni_nethook); - - mutex_destroy(&nip->vni_lock); - list_destroy(&nip->vni_dev_list); - kmem_free(nip, sizeof (*nip)); -} - -/* - * Find the viona netinst data by zone id. This is only used during - * viona instance creation (and thus is only called by a zone that is running). - */ -static viona_neti_t * -viona_neti_lookup_by_zid(zoneid_t zid) -{ - viona_neti_t *nip; - - mutex_enter(&viona_neti_lock); - for (nip = list_head(&viona_neti_list); nip != NULL; - nip = list_next(&viona_neti_list, nip)) { - if (nip->vni_zid == zid) { - mutex_enter(&nip->vni_lock); - nip->vni_ref++; - mutex_exit(&nip->vni_lock); - mutex_exit(&viona_neti_lock); - return (nip); - } - } - mutex_exit(&viona_neti_lock); - return (NULL); -} - -static void -viona_neti_rele(viona_neti_t *nip) -{ - mutex_enter(&nip->vni_lock); - VERIFY3S(nip->vni_ref, >, 0); - nip->vni_ref--; - mutex_exit(&nip->vni_lock); - cv_broadcast(&nip->vni_ref_change); -} diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. + */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? 
vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? vnh->vnh_token_out : vnh->vnh_token_in; + + if (!he.he_interested) + return (0); + + info.hpe_protocol = vnh->vnh_neti; + info.hpe_ifp = (phy_if_t)link; + info.hpe_ofp = (phy_if_t)link; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); + if (ret == 0) + return (0); + + if (out) { + VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, tx_hookdrop); + } else { + VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, rx_hookdrop); + } + return (ret); +} + +/* + * netinfo stubs - required by the nethook framework, but otherwise unused + * + * Currently, all ipf rules are applied against all interfaces in a given + * netstack (e.g. all interfaces in a zone). In the future if we want to + * support being able to apply different rules to different interfaces, I + * believe we would need to implement some of these stubs to map an interface + * name in a rule (e.g. 'net0', back to an index or viona_link_t); + */ +static int +viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, + char *buf __unused, const size_t len __unused) +{ + return (-1); +} + +static int +viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_getptmue(net_handle_t neti __unused) +{ + return (-1); +} + +static int +viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, size_t nelem __unused, + net_ifaddr_t type[] __unused, void *storage __unused) +{ + return (-1); +} + +static int +viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, zoneid_t *zid __unused) +{ + return (-1); +} + +static int +viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, uint64_t *flags __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) +{ + return ((phy_if_t)-1); +} + +static phy_if_t +viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) +{ + return ((phy_if_t)-1); +} + +static lif_if_t +viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, + net_inject_t *packet __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, + struct sockaddr *next __unused) +{ + return ((phy_if_t)-1); +} + +static int +viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static int +viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static net_protocol_t viona_netinfo = { + NETINFO_VERSION, + NHF_VIONA, + viona_neti_getifname, + viona_neti_getmtu, + viona_neti_getptmue, + viona_neti_getlifaddr, + viona_neti_getlifzone, + viona_neti_getlifflags, + viona_neti_phygetnext, + viona_neti_phylookup, + viona_neti_lifgetnext, + viona_neti_inject, + viona_neti_route, + viona_neti_ispchksum, + viona_neti_isvchksum +}; + +/* + * Create/register our nethooks + */ +static int +viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, + net_protocol_t *netip) +{ + int ret; + + if ((vnh->vnh_neti = 
net_protocol_register(nid, netip)) == NULL) { + cmn_err(CE_NOTE, "%s: net_protocol_register failed " + "(netid=%d name=%s)", __func__, nid, nh_name); + goto fail_init_proto; + } + + HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); + if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { + cmn_err(CE_NOTE, "%s: net_family_register failed " + "(netid=%d name=%s err=%d)", __func__, + nid, nh_name, ret); + goto fail_init_family; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); + if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_in)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, + nh_name); + goto fail_init_event_in; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); + if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_out)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, + nh_name); + goto fail_init_event_out; + } + return (0); + + /* + * On failure, we undo all the steps that succeeded in the + * reverse order of initialization, starting at the last + * successful step (the labels denoting the failing step). + */ +fail_init_event_out: + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + vnh->vnh_token_in = NULL; + +fail_init_event_in: + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + +fail_init_family: + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; + +fail_init_proto: + return (1); +} + +/* + * Shutdown the nethooks for a protocol family. This triggers notification + * callbacks to anything that has registered interest to allow hook consumers + * to unhook prior to the removal of the hooks as well as makes them unavailable + * to any future consumers as the first step of removal. + */ +static void +viona_nethook_shutdown(viona_nethook_t *vnh) +{ + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); +} + +/* + * Remove the nethooks for a protocol family. + */ +static void +viona_nethook_fini(viona_nethook_t *vnh) +{ + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; +} + +/* + * Callback invoked by the neti module. This creates/registers our hooks + * {IPv4,IPv6}{in,out} with the nethook framework so they are available to + * interested consumers (e.g. ipf). + * + * During attach, viona_neti_create is called once for every netstack + * present on the system at the time of attach. Thereafter, it is called + * during the creation of additional netstack instances (i.e. zone boot). As a + * result, the viona_neti_t that is created during this call always occurs + * prior to any viona instances that will use it to send hook events. + * + * It should never return NULL. If we cannot register our hooks, we do not + * set vnh_hooked of the respective protocol family, which will prevent the + * creation of any viona instances on this netstack (see viona_ioc_create). 
+ * This can only occur if after a shutdown event (which means destruction is + * imminent) we are trying to create a new instance. + */ +static void * +viona_neti_create(const netid_t netid) +{ + viona_neti_t *nip; + + VERIFY(netid != -1); + + nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); + nip->vni_netid = netid; + nip->vni_zid = net_getzoneidbynetid(netid); + mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), + offsetof(viona_soft_state_t, ss_node)); + + if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, + &viona_netinfo) == 0) + nip->vni_nethook.vnh_hooked = B_TRUE; + + mutex_enter(&viona_neti_lock); + list_insert_tail(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + return (nip); +} + +/* + * Called during netstack teardown by the neti module. During teardown, all + * the shutdown callbacks are invoked, allowing consumers to release any holds + * and otherwise quiesce themselves prior to destruction, followed by the + * actual destruction callbacks. + */ +static void +viona_neti_shutdown(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&viona_neti_lock); + list_remove(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_shutdown(&nip->vni_nethook); +} + +/* + * Called during netstack teardown by the neti module. Destroys the viona + * netinst data. This is invoked after all the netstack and neti shutdown + * callbacks have been invoked. + */ +static void +viona_neti_destroy(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&nip->vni_lock); + while (nip->vni_ref != 0) + cv_wait(&nip->vni_ref_change, &nip->vni_lock); + mutex_exit(&nip->vni_lock); + + VERIFY(!list_link_active(&nip->vni_node)); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_fini(&nip->vni_nethook); + + mutex_destroy(&nip->vni_lock); + list_destroy(&nip->vni_dev_list); + kmem_free(nip, sizeof (*nip)); +} + +/* + * Find the viona netinst data by zone id. This is only used during + * viona instance creation (and thus is only called by a zone that is running). 
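viona_neti_destroy() above must not free the netinst while viona instances still hold references, so it blocks on a condition variable until vni_ref drains to zero; viona_neti_rele() broadcasts on every drop. The same rendezvous in portable form (pthreads standing in for the kernel's kmutex/kcondvar):

#include <pthread.h>

struct netinst {
	pthread_mutex_t lock;
	pthread_cond_t ref_change;
	unsigned ref;
};

void
netinst_rele(struct netinst *nip)
{
	(void) pthread_mutex_lock(&nip->lock);
	nip->ref--;
	(void) pthread_mutex_unlock(&nip->lock);
	/* Wake anyone waiting for the count to drain. */
	(void) pthread_cond_broadcast(&nip->ref_change);
}

void
netinst_destroy_wait(struct netinst *nip)
{
	(void) pthread_mutex_lock(&nip->lock);
	while (nip->ref != 0)
		(void) pthread_cond_wait(&nip->ref_change, &nip->lock);
	(void) pthread_mutex_unlock(&nip->lock);
	/* Safe to tear down: no holders remain. */
}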
+ */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h new file mode 100644 index 0000000000..ee31c4d4ce --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_impl.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include <sys/ddi.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/uio.h> + +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/neti.h> +#include <inet/ip.h> +#include <inet/tcp.h> + +#include <sys/vmm_drv.h> +#include <sys/viona_io.h> + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + volatile struct virtio_desc *vr_descr; + + volatile uint16_t *vr_avail_flags; + volatile uint16_t *vr_avail_idx; + volatile uint16_t *vr_avail_ring; + volatile uint16_t *vr_avail_used_event; + + volatile uint16_t *vr_used_flags; + volatile uint16_t *vr_used_idx; + volatile struct virtio_used *vr_used_ring; + volatile uint16_t *vr_used_avail_event; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uintptr_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + net_handle_t vnh_neti; + hook_family_t 
vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; +} viona_nethook_t; + +struct viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void viona_ring_free(viona_vring_t *); +int 
viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); +int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); +void viona_intr_ring(viona_vring_t *ring); + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c new file mode 100644 index 0000000000..e3c9b90a57 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_main.c @@ -0,0 +1,985 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +/* + * viona - VirtIO-Net, Accelerated + * + * The purpose of viona is to provide high performance virtio-net devices to + * bhyve guests. It does so by sitting directly atop MAC, skipping all of the + * DLS/DLD stack. + * + * -------------------- + * General Architecture + * -------------------- + * + * A single viona instance is comprised of a "link" handle and two "rings". 
+ * After opening the viona device, it must be associated with a MAC network
+ * interface and a bhyve (vmm) instance to form its link resource.  This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization.  With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible.  Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest.  They represent an interface to the
+ * standard virtio ring structures.  When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events.  The RX worker has the simple task of
+ * watching for ring shutdown conditions.  The TX worker does that in addition
+ * to processing all requests to transmit data.  Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in virtio_vring_t`vr_state:
+ *
+ *      +<--------------------------------------------+
+ *      |                                             |
+ *      V                                             ^
+ * +-----------+  This is the initial state when a link is created or
+ * | VRS_RESET |  when the ring has been explicitly reset.
+ * +-----------+
+ *      |                                             ^
+ *      |---* ioctl(VNA_IOC_RING_INIT) issued         |
+ *      |                                             |
+ *      |                                             ^
+ *      V
+ * +-----------+  The ring parameters (size, guest physical addresses)
+ * | VRS_SETUP |  have been set and start-up of the ring worker thread
+ * +-----------+  has begun.
+ *      |                                             ^
+ *      |                                             |
+ *      |---* ring worker thread begins execution     |
+ *      |                                             |
+ *      +-------------------------------------------->+
+ *      |             |                               ^
+ *      |             |
+ *      |             *  If ring shutdown is requested (by ioctl or impending
+ *      |                bhyve process death) while the worker thread is
+ *      |                starting, the worker will transition the ring to
+ *      |                VRS_RESET and exit.
+ *      |                                             ^
+ *      |                                             |
+ *      |                                             ^
+ *      V
+ * +-----------+  The worker thread associated with the ring has started
+ * | VRS_INIT  |  executing.  It has allocated any extra resources needed
+ * +-----------+  for the ring to operate.
+ *      |                                             ^
+ *      |                                             |
+ *      +-------------------------------------------->+
+ *      |             |                               ^
+ *      |             |
+ *      |             *  If ring shutdown is requested while the worker is
+ *      |                waiting in VRS_INIT, it will free any extra resources
+ *      |                and transition to VRS_RESET.
+ *      |                                             ^
+ *      |                                             |
+ *      |--* ioctl(VNA_IOC_RING_KICK) issued          |
+ *      |                                             ^
+ *      V
+ * +-----------+  The worker thread associated with the ring is executing
+ * | VRS_RUN   |  workload specific to that ring.
+ * +-----------+
+ *      |                                             ^
+ *      |---* ioctl(VNA_IOC_RING_RESET) issued        |
+ *      |     (or bhyve process begins exit)          |
+ *      V                                             |
+ *      +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP. 
Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC.  When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva().  That pointer
+ * is wrapped in an mblk_t using a preallocated viona_desb_t for the
+ * desballoc().  Doing so increments vr_xfer_outstanding, preventing the ring
+ * from being reset (allowing the link to drop its vmm handle to the guest)
+ * until all transmit mblks referencing guest memory have been processed.
+ * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
+ * the ring worker thread.  The ring size informs that allocation as the number
+ * of concurrent transmissions is limited by the number of descriptors in the
+ * ring.  This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware.  Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated.  This means that there is no upper bound to the time needed
+ * for an mblk to be flushed and can stall bhyve guests from shutting down
+ * since their memory must be free of viona TX references prior to clean-up.
+ *
+ * This expectation of deterministic mblk_t processing is likely the reason
+ * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
+ * loaded will copy transmit data into fresh buffers rather than passing up
+ * zero-copy mblks.  It is a hold-over from the original viona sources provided
+ * by Pluribus and its continued necessity has not been confirmed.
+ *
+ *
+ * ----------------------------
+ * Ring Notification Fast-paths
+ * ----------------------------
+ *
+ * Device operation for viona requires that notifications flow to and from the
+ * guest to indicate certain ring conditions.  In order to minimize latency and
+ * processing overhead, the notification procedures are kept in-kernel whenever
+ * possible.
+ *
+ * Guest-to-host notifications, when new available descriptors have been placed
+ * in the ring, are posted via the 'queue notify' address in the virtio BAR.
+ * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
+ * install a callback hook on an ioport address.  Guest exits for accesses to
+ * viona-hooked ioport addresses will result in direct calls to notify the
+ * appropriate ring worker without a trip to userland.
+ *
+ * Host-to-guest notifications in the form of interrupts enjoy similar
+ * acceleration.  Each viona ring can be configured to send MSI notifications
+ * to the guest as virtio conditions dictate. 
This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/stat.h> + +#include <sys/dlpi.h> + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. + */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + 
DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? 
+ */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == 0) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, (uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + err = viona_ioc_set_notify_ioport(link, (uint_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = 
&link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. + */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. 
This condition should be impossible to + * encounter when performing the on-close destroy of the link, + * since racing ioctl accessors must necessarily be absent. + */ + VERIFY(!on_close); + mutex_exit(&ss->ss_lock); + return (EAGAIN); + } + /* + * The link deletion cannot fail after this point, continuing until its + * successful completion is reached. + */ + link->l_destroyed = B_TRUE; + + /* + * Tear down the IO port hook so it cannot be used to kick any of the + * rings which are about to be reset and stopped. + */ + VERIFY0(viona_ioc_set_notify_ioport(link, 0)); + mutex_exit(&ss->ss_lock); + + /* + * Return the rings to their reset state, ignoring any possible + * interruptions from signals. + */ + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); + + mutex_enter(&ss->ss_lock); + if (link->l_mch != NULL) { + /* Unhook the receive callbacks and close out the client */ + viona_rx_clear(link); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + if (link->l_vm_hold != NULL) { + vmm_drv_rele(link->l_vm_hold); + link->l_vm_hold = NULL; + } + + nip = link->l_neti; + link->l_neti = NULL; + + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + pollhead_clean(&link->l_pollhead); + ss->ss_link = NULL; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_remove(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + viona_neti_rele(nip); + + kmem_free(link, sizeof (viona_link_t)); + return (0); +} + +static int +viona_ioc_ring_init(viona_link_t *link, void *udata, int md) +{ + vioc_ring_init_t kri; + int err; + + if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { + return (EFAULT); + } + + err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr); + + return (err); +} + +static int +viona_ioc_ring_reset(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + return (viona_ring_reset(ring, B_TRUE)); +} + +static int +viona_ioc_ring_kick(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + int err; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + mutex_enter(&ring->vr_lock); + switch (ring->vr_state) { + case VRS_SETUP: + /* + * An early kick to a ring which is starting its worker thread + * is fine. Once that thread is active, it will process the + * start-up request immediately. 
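+		 *
+		 * As a sketch of the expected consumer sequence (the fd and
+		 * variable names here are illustrative only):
+		 *
+		 *	ioctl(vna_fd, VNA_IOC_RING_INIT, &ri);	-> VRS_SETUP
+		 *	ioctl(vna_fd, VNA_IOC_RING_KICK, idx);	-> VRSF_REQ_START
+		 *
+		 * The flag persists in vr_state_flags, so a kick arriving
+		 * before the worker reaches VRS_INIT is acted upon as soon as
+		 * the worker begins checking those flags.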
+ */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = (uint16_t)val; + + if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, + viona_notify_wcb, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c new file mode 100644 index 0000000000..e535bfaa1a --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_ring.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/disp.h> + +#include "viona_impl.h" + +#define VRING_ALIGN 4096 +#define VRING_MAX_LEN 32768 + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static void * +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
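+			 *
+			 * Dropping the lease here leaves vr_lease NULL, so
+			 * the caller observes the failed renewal via the
+			 * return value below, rather than holding a lease
+			 * which lacks usable guest mappings.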
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + viona_ring_lease_drop(ring); + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 
2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + +void +viona_intr_ring(viona_vring_t *ring) +{ + uint64_t addr; + + mutex_enter(&ring->vr_lock); + /* Deliver the interrupt directly, if so configured. */ + if ((addr = ring->vr_msi_addr) != 0) { + uint64_t msg = ring->vr_msi_msg; + + mutex_exit(&ring->vr_lock); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + return; + } + mutex_exit(&ring->vr_lock); + + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
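+		 *
+		 * That quiescence is driven by the desballoc() free
+		 * callbacks on the in-flight mblks, which decrement
+		 * vr_xfer_outstanding as downstream consumers free them;
+		 * the TX worker waits for the count to reach zero before
+		 * arriving here.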
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie) +{ + uint_t i, ndesc, idx, head, next; + struct virtio_desc vdir; + void *buf; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = ring->vr_avail_ring[idx & ring->vr_mask]; + next = head; + + for (i = 0; i < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + goto bail; + } + + vdir = ring->vr_descr[next]; + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vdir.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vdir.vd_len; + i++; + } else { + const uint_t nindir = vdir.vd_len / 16; + volatile struct virtio_desc *vindir; + + if ((vdir.vd_len & 0xf) || nindir == 0) { + VIONA_PROBE2(indir_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + goto bail; + } + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (vindir == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + next = 0; + for (;;) { + struct virtio_desc vp; + + /* + * A copy of the indirect descriptor is made + * here, rather than simply using a reference + * pointer. This prevents malicious or + * erroneous guest writes to the descriptor + * from fooling the flags/bounds verification + * through a race. 
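+				 *
+				 * A hypothetical interleaving of the race
+				 * being defended against:
+				 *
+				 *	host:  validate vindir[next] flags/len
+				 *	guest: rewrite vindir[next].vd_addr
+				 *	host:  translate the unchecked address
+				 *
+				 * With the snapshot in 'vp', the values which
+				 * were checked are the very ones used for the
+				 * gpa2kva translation below.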
+ */ + vp = vindir[next]; + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, + viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, + indir_bad_nest); + goto bail; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, + desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vp.vd_addr, + vp.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, + vp.vd_addr); + VIONA_RING_STAT_INCR(ring, + bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vp.vd_len; + i++; + + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) + break; + if (i >= niov) { + goto loopy; + } + + next = vp.vd_next; + if (next >= nindir) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, + uint_t, nindir); + VIONA_RING_STAT_INCR(ring, + indir_bad_next); + goto bail; + } + } + } + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + return (i); + } + } + +loopy: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); +bail: + mutex_exit(&ring->vr_a_mutex); + return (-1); +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + volatile struct virtio_used *vu; + uint_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + volatile struct virtio_used *vu; + uint_t uidx, i; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + if (num_bufs == 1) { + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c new file mode 100644 index 0000000000..b354b201cb --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_rx.c @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/strsubr.h> + +#include <sys/dlpi.h> +#include <sys/pattr.h> +#include <sys/vlan.h> + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. 
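+		 *
+		 * Until then, the worker simply parks in cv_wait_sig()
+		 * below, re-checking the bail conditions (ring reset request
+		 * or process exit) each time it wakes.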
+ */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. + */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? 
*/ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. 
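+				 *
+				 * The buffers consumed so far are still
+				 * returned to the guest via uelem and
+				 * vq_pushchain_many() below; only the
+				 * remainder of the frame is lost.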
+ */
+ err = EMSGSIZE;
+ break;
+ }
+ chunk = 0;
+ i = 0;
+ buf_idx++;
+ /*
+ * Keep the header up-to-date with the number of
+ * buffers, but never reference its value since the
+ * guest could meddle with it.
+ */
+ hdr->vrh_bufs++;
+ }
+ } while (!end && copied < msz);
+
+ /* Account for the header size in the first buffer */
+ uelem[0].len += hdr_sz;
+
+ /*
+ * If no other errors were encountered during the copy, was the expected
+ * amount of data transferred?
+ */
+ if (err == 0 && copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ err = EINVAL;
+ }
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+done:
+ switch (err) {
+ case 0:
+ /* Success can fall right through to ring delivery */
+ break;
+
+ case EMSGSIZE:
+ VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
+ break;
+
+ case EOVERFLOW:
+ VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
+ break;
+
+ default:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+ }
+ vq_pushchain_many(ring, buf_idx + 1, uelem);
+ return (err);
+}
+
+static void
+viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
+{
+ viona_link_t *link = ring->vr_link;
+ mblk_t *mprx = NULL, **mprx_prevp = &mprx;
+ mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
+ const boolean_t do_merge =
+ ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+ const boolean_t guest_csum =
+ ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0);
+ const boolean_t guest_tso4 =
+ ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0);
+
+ size_t nrx = 0, ndrop = 0;
+
+ /*
+ * The mac_hw_emul() function, by design, doesn't predicate on
+ * HW_LOCAL_MAC. Since we are in Rx context we know that any
+ * LSO packet must also be from a same-machine sender. We take
+ * advantage of that and forgo writing a manual loop to
+ * predicate on HW_LOCAL_MAC.
+ *
+ * For checksum emulation we need to predicate on HW_LOCAL_MAC
+ * to avoid calling mac_hw_emul() on packets that don't need
+ * it (thanks to the fact that HCK_IPV4_HDRCKSUM and
+ * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do
+ * the checksum emulation in the second loop.
+ */
+ if (!guest_tso4)
+ mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
+
+ while (mp != NULL) {
+ mblk_t *next, *pad = NULL;
+ size_t size;
+ int err = 0;
+
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ /*
+ * The VIRTIO_NET_HDR_F_DATA_VALID flag only
+ * covers the ULP checksum -- so we still have
+ * to populate the IP header checksum.
+ */
+ if (guest_csum) {
+ mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL);
+ } else {
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ }
+
+ if (mp == NULL) {
+ mp = next;
+ continue;
+ }
+ }
+
+ size = msgsize(mp);
+
+ /*
+ * We treat both a 'drop' response and errors the same here
+ * and put the packet on the drop chain. As packets may be
+ * subject to different actions in ipf (which do not all
+ * return the same set of error values), an error processing
+ * one packet doesn't mean the next packet will also generate
+ * an error.
+ */
+ if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
+ viona_hook(link, ring, &mp, B_FALSE) != 0) {
+ if (mp != NULL) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+ } else {
+ /*
+ * If the hook consumer (e.g. ipf) already
+ * freed the mblk_t, update the drop count now.
+ */
+ ndrop++;
+ }
+ mp = next;
+ continue;
+ }
+
+ /*
+ * Ethernet frames are expected to be padded out in order to
+ * meet the minimum size.
+ *
+ * A special case is made for frames which are short by
+ * VLAN_TAGSZ, having been stripped of their VLAN tag while
+ * traversing MAC. A preallocated (and recycled) mblk is used
+ * for that specific condition.
+ *
+ * All other frames that fall short on length will have custom
+ * zero-padding allocated and appended to them.
+ */
+ if (size == NEED_VLAN_PAD_SIZE) {
+ ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ);
+ ASSERT(viona_vlan_pad_mp->b_cont == NULL);
+
+ for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont)
+ ;
+
+ pad->b_cont = viona_vlan_pad_mp;
+ size += VLAN_TAGSZ;
+ } else if (size < MIN_BUF_SIZE) {
+ const size_t pad_size = MIN_BUF_SIZE - size;
+ mblk_t *zero_mp;
+
+ zero_mp = allocb(pad_size, BPRI_MED);
+ if (zero_mp == NULL) {
+ err = ENOMEM;
+ goto pad_drop;
+ }
+
+ VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring,
+ mblk_t *, mp, size_t, pad_size);
+ VIONA_RING_STAT_INCR(ring, rx_pad_short);
+ zero_mp->b_wptr += pad_size;
+ bzero(zero_mp->b_rptr, pad_size);
+ linkb(mp, zero_mp);
+ size += pad_size;
+ }
+
+ if (do_merge) {
+ err = viona_recv_merged(ring, mp, size);
+ } else {
+ err = viona_recv_plain(ring, mp, size);
+ }
+
+ /*
+ * The VLAN padding mblk is meant for continual reuse, so
+ * remove it from the chain to prevent it from being freed.
+ *
+ * Custom allocated padding does not require this treatment and
+ * is freed normally.
+ */
+ if (pad != NULL) {
+ pad->b_cont = NULL;
+ }
+
+pad_drop:
+ /*
+ * While rx processing (viona_recv_{merged,plain}) does not
+ * free mp on error, hook processing might or might not free
+ * mp. Handle either scenario -- if mp is not yet free, it is
+ * queued up and freed after the guest has been notified. If
+ * mp is already NULL, just proceed on.
+ */
+ if (err != 0) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+
+ /*
+ * If the available ring is empty, do not bother
+ * attempting to deliver any more frames. Count the
+ * rest as dropped too.
+ */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. + */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. 
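+ * (For instance, a leading mblk_t holding only the 6-byte
+ * destination address is too short for classification, while a
+ * msgpullup() of sizeof (struct ether_vlan_header) -- 18 bytes --
+ * yields one contiguous block to retry against; the pulled-up
+ * copy serves only this check and is freed immediately.)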
+ */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c new file mode 100644 index 0000000000..843435c67d --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_tx.c @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/types.h> +#include <sys/smt.h> +#include <sys/strsubr.h> + +#include <sys/pattr.h> +#include <sys/dlpi.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> + +#include "viona_impl.h" + +#define BNXE_NIC_DRIVER "bnxe" + +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. 
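+ * (Concretely: a loaned-memory mblk_t cannot be reclaimed until the
+ * underlying NIC driver frees it, so a driver which sits on blocks
+ * indefinitely would leave viona_tx_wait_outstanding() blocked;
+ * copying the frame up front trades some throughput for that
+ * independence.)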
+ */
+kmutex_t viona_force_copy_lock;
+static enum viona_force_copy {
+ VFC_UNINITALIZED = 0,
+ VFC_COPY_UNEEDED = 1,
+ VFC_COPY_REQUIRED = 2,
+} viona_force_copy_state = VFC_UNINITALIZED;
+
+struct viona_desb {
+ frtn_t d_frtn;
+ viona_vring_t *d_ring;
+ uint_t d_ref;
+ uint32_t d_len;
+ uint16_t d_cookie;
+ uchar_t *d_headers;
+};
+
+static void viona_tx(viona_link_t *, viona_vring_t *);
+static void viona_desb_release(viona_desb_t *);
+
+/*
+ * Return the number of available descriptors in the vring taking care of the
+ * 16-bit index wraparound.
+ *
+ * Note: If the number of apparently available descriptors is larger than the
+ * ring size (due to guest misbehavior), this check will still report the
+ * positive count of descriptors.
+ */
+static inline uint_t
+viona_vr_num_avail(viona_vring_t *ring)
+{
+ uint16_t ndesc;
+
+ /*
+ * We're just computing (a-b) in GF(2^16).
+ *
+ * The only glitch here is that in standard C, uint16_t promotes to
+ * (signed) int when int has more than 16 bits (almost always now).
+ * A cast back to unsigned is necessary for proper operation.
+ */
+ ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
+
+ return (ndesc);
+}
+
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ while (ring->vr_xfer_outstanding != 0) {
+ /*
+ * Paying heed to signals is counterproductive here. This is a
+ * very tight loop if pending transfers take an extended amount
+ * of time to be reclaimed while the host process is exiting.
+ */
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ }
+}
+
+/*
+ * Check if full TX packet copying is needed. This should not be called from
+ * viona attach()/detach() context.
+ */
+static boolean_t
+viona_tx_copy_needed(void)
+{
+ boolean_t result;
+
+ mutex_enter(&viona_force_copy_lock);
+ if (viona_force_copy_state == VFC_UNINITALIZED) {
+ major_t bnxe_major;
+
+ /*
+ * The original code for viona featured an explicit check for
+ * the bnxe driver which, when found present, necessitated that
+ * all transmissions be copied into their own mblks instead of
+ * passing guest memory to the underlying device.
+ *
+ * The motivations for this are unclear, but until it can be
+ * proven unnecessary, the check lives on.
+ */
+ viona_force_copy_state = VFC_COPY_UNEEDED;
+ if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
+ != DDI_MAJOR_T_NONE) {
+ if (ddi_hold_installed_driver(bnxe_major) != NULL) {
+ viona_force_copy_state = VFC_COPY_REQUIRED;
+ ddi_rele_driver(bnxe_major);
+ }
+ }
+ }
+ result = (viona_force_copy_state == VFC_COPY_REQUIRED);
+ mutex_exit(&viona_force_copy_lock);
+
+ return (result);
+}
+
+void
+viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
+{
+ /* Allocate desb handles for TX ring if packet copying not forced */
+ if (!viona_tx_copy_needed()) {
+ viona_desb_t *dp;
+
+ dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
+ ring->vr_txdesb = dp;
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ dp->d_frtn.free_func = viona_desb_release;
+ dp->d_frtn.free_arg = (void *)dp;
+ dp->d_ring = ring;
+ dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
+ KM_SLEEP);
+ }
+ }
+
+ /* Allocate ring-sized iovec buffers for TX */
+ ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
+}
+
+void
+viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
+{
+ if (ring->vr_txdesb != NULL) {
+ viona_desb_t *dp = ring->vr_txdesb;
+
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
+ }
+ kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
+ ring->vr_txdesb = NULL;
+ }
+
+ if (ring->vr_txiov != NULL) {
+ kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
+ ring->vr_txiov = NULL;
+ }
+}
+
+static void
+viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+ vq_pushchain(ring, len, cookie);
+
+ membar_enter();
+ if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ viona_intr_ring(ring);
+ }
+}
+
+void
+viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
+{
+ proc_t *p = ttoproc(curthread);
+
+ (void) thread_vsetname(curthread, "viona_tx_%p", ring);
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+ mutex_exit(&ring->vr_lock);
+
+ for (;;) {
+ boolean_t bail = B_FALSE;
+ boolean_t renew = B_FALSE;
+ uint_t ntx = 0;
+
+ *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+ while (viona_vr_num_avail(ring)) {
+ viona_tx(link, ring);
+
+ /*
+ * It is advantageous for throughput to keep this
+ * transmission loop tight, but periodic breaks to
+ * check for other events are of value too.
+ */
+ if (ntx++ >= ring->vr_size)
+ break;
+ }
+ *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
+
+ /*
+ * Check for available descriptors on the ring once more in
+ * case a late addition raced with the NO_NOTIFY flag toggle.
+ *
+ * The barrier ensures that visibility of the vr_used_flags
+ * store does not cross the viona_vr_num_avail() check below.
+ */
+ membar_enter();
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ if (!bail && !renew && viona_vr_num_avail(ring)) {
+ continue;
+ }
+
+ if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
+ viona_intr_ring(ring);
+ }
+
+ mutex_enter(&ring->vr_lock);
+
+ while (!bail && !renew && !viona_vr_num_avail(ring)) {
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ }
+
+ if (bail) {
+ break;
+ } else if (renew) {
+ ring->vr_state_flags |= VRSF_RENEW;
+ /*
+ * When renewing the lease for the ring, no TX
+ * frames may be outstanding, as they contain
+ * references to guest memory.
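+ * (That is: any desballoc()-loaned block still held by the
+ * stack points into guest pages backing the expiring lease,
+ * so the wait below lets vr_xfer_outstanding drain to zero
+ * before viona_ring_lease_renew() is attempted.)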
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. + * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. 
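+ * As a worked sketch of the folding below (assuming
+ * IP_TCP_CSUM_COMP carries the TCP protocol value, 6):
+ * for 32-bit address words 0x0a000001 and 0x0a000002,
+ * the sum is 6 + 0x0a00 + 0x0001 + 0x0a00 + 0x0002 =
+ * 0x1409; no carry is produced, so 0x1409 is stored as
+ * the seed for the hardware to complete, with the
+ * length term deliberately omitted as described.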
+ */
+ cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
+ cksum = IP_TCP_CSUM_COMP;
+ cksum += (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+ /*
+ * Since viona is a "legacy device", the data stored
+ * by the driver will be in the guest's native endian
+ * format (see sections 2.4.3 and 5.1.6.1 of the
+ * VIRTIO 1.0 spec for more info). At this time the
+ * only guests using viona are x86 and we can assume
+ * little-endian.
+ */
+ lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
+
+ /*
+ * Hardware, like ixgbe, expects the client to request
+ * IP header checksum offload if it's sending LSO (see
+ * ixgbe_get_context()). Unfortunately, virtio makes
+ * no allowances for negotiating IP header checksum
+ * and HW offload, only TCP checksum. We add the flag
+ * and zero-out the checksum field. This mirrors the
+ * behavior of our native IP stack (which does this in
+ * the interest of HW that expects the field to be
+ * zero).
+ */
+ flags |= HCK_IPV4_HDRCKSUM;
+ ipha->ipha_hdr_checksum = 0;
+ }
+
+ /*
+ * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
+ * HW_LSO, if present, is not lost.
+ */
+ flags |= DB_CKSUMFLAGS(mp);
+
+ /*
+ * Partial checksum support from the NIC is ideal, since it most
+ * closely maps to the interface defined by virtio.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ /*
+ * MAC expects these offsets to be relative to the
+ * start of the L3 header rather than the L2 frame.
+ */
+ flags |= HCK_PARTIALCKSUM;
+ mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
+ len - eth_len, 0, flags);
+ return (B_TRUE);
+ }
+
+ /*
+ * Without partial checksum support, look to the L3/L4 protocol
+ * information to see if the NIC can handle it. If not, the
+ * checksum will need to be calculated inline.
+ */
+ if (ftype == ETHERTYPE_IP) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming? */
+ VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum);
+ return (B_FALSE);
+ } else if (ftype == ETHERTYPE_IPV6) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming?
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
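+ * (A worked pass, assuming for illustration that
+ * VIONA_MAX_HDRS_LEN is 256: if iov[1] holds 100 bytes, all
+ * of it is copied and min_copy falls to 156; if iov[2] then
+ * holds 500 bytes, only 156 are copied, base_off records 156,
+ * and the remaining 344 bytes are chained by reference in the
+ * loop that follows.)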
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. + */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. 
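+ * (Tracing the counts: with a header block and two loaned
+ * chunks, d_ref stands at 4 -- the initial hold plus three
+ * desballoc holds; this bump raises it to 5, freemsgchain()
+ * walks it back down to 2, and the VERIFY below checks that
+ * exactly the initial hold and this extra one remain.)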
+ */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h index 33fefc10ea..856b75e5cc 100644 --- a/usr/src/uts/i86pc/sys/vmm_drv.h +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -17,6 +17,9 @@ #define _VMM_DRV_H_ #ifdef _KERNEL + +#include <sys/file.h> + struct vmm_hold; typedef struct vmm_hold vmm_hold_t; |