From 57f6b2a25cc23b5c46f9c6de51402167ec6754cc Mon Sep 17 00:00:00 2001
From: Robert Mustacchi
Date: Mon, 4 May 2020 23:25:27 +0000
Subject: 12719 ioctl(9E) should mention what to return for an unknown command

Reviewed by: Gergő Doma
Reviewed by: Yuri Pankov
Reviewed by: Patrick Mooney
Reviewed by: Juraj Lutter
Approved by: Joshua M. Clulow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 usr/src/man/man9e/ioctl.9e | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

(limited to 'usr/src/man/man9e')

diff --git a/usr/src/man/man9e/ioctl.9e b/usr/src/man/man9e/ioctl.9e
index afb43407d0..796d92a37f 100644
--- a/usr/src/man/man9e/ioctl.9e
+++ b/usr/src/man/man9e/ioctl.9e
@@ -3,11 +3,10 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH IOCTL 9E "Dec 3, 1996"
+.TH IOCTL 9E "May 6, 2020"
.SH NAME
ioctl \- control a character device
.SH SYNOPSIS
-.LP
.nf
#include <sys/cred.h>
#include <sys/file.h>
@@ -23,11 +22,8 @@ ioctl \- control a character device
.fi

.SH INTERFACE LEVEL
-.sp
-.LP
Architecture independent level 1 (DDI/DKI). This entry point is
\fBoptional\fR.
.SH ARGUMENTS
-.sp
.ne 2
.na
\fB\fIdev\fR\fR
@@ -112,8 +108,6 @@ value which is valid only if the \fBioctl()\fR succeeds.
.RE

.SH DESCRIPTION
-.sp
-.LP
\fBioctl()\fR provides character-access drivers with an alternate entry point
that can be used for almost any operation other than a simple transfer of
characters in and out of buffers. Most often, \fBioctl()\fR is used to control
@@ -132,7 +126,10 @@ I/O control commands are used to implement the terminal settings passed from
\fBttymon\fR(1M) and \fBstty\fR(1), to format disk devices, to implement a
trace driver for debugging, and to clean up character queues. Since the kernel
does not interpret the command type that defines the operation, a driver is
-free to define its own commands.
+free to define its own commands. Drivers must be prepared to receive commands
+that they do not recognize, or that arrive in contexts that they do not
+expect. In the case where \fIcmd\fR is unknown, it is recommended that the
+driver return \fBENOTTY\fR.
.sp
.LP
Drivers that use an \fBioctl()\fR routine typically have a command to ``read''
@@ -205,13 +202,10 @@ action that should be taken. However, the command passed to the driver by the
user process is an integer value associated with the command name in the
header.
.SH RETURN VALUES
-.sp
-.LP
\fBioctl()\fR should return \fB0\fR on success, or the appropriate error
number. The driver may also set the value returned to the calling process
through \fIrval_p\fR.
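A minimal sketch of the recommended pattern (the xx prefix and the
XX_GETCOUNT command are invented for illustration; they are not part of the
change):

	#include <sys/types.h>
	#include <sys/errno.h>
	#include <sys/cred.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>

	#define	XX_GETCOUNT	(('x' << 8) | 1)	/* hypothetical command */

	static int
	xxioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
	    int *rval_p)
	{
		switch (cmd) {
		case XX_GETCOUNT:
			/* Device-defined command; report a result via *rval_p. */
			*rval_p = 0;
			return (0);
		default:
			/* Unknown command: return ENOTTY, as recommended above. */
			return (ENOTTY);
		}
	}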
.SH EXAMPLES
-.LP
\fBExample 1 \fR\fBioctl()\fR entry point
.sp
.LP
@@ -263,8 +257,6 @@ xxioctl(dev_t dev, int cmd, intptr_t arg, int mode,
.in -2

.SH SEE ALSO
-.sp
-.LP
\fBstty\fR(1), \fBttymon\fR(1M), \fBdkio\fR(7I), \fBfbio\fR(7I),
\fBtermio\fR(7I), \fBopen\fR(9E), \fBput\fR(9E), \fBsrv\fR(9E),
\fBcopyin\fR(9F), \fBcopyout\fR(9F), \fBddi_copyin\fR(9F),
@@ -273,7 +265,6 @@ xxioctl(dev_t dev, int cmd, intptr_t arg, int mode,
.LP
\fIWriting Device Drivers\fR
.SH WARNINGS
-.sp
.LP
Non-STREAMS driver \fBioctl()\fR routines must make sure that user data is
copied into or out of the kernel address space explicitly using
@@ -288,8 +279,6 @@
even when in user context. Failure to use the appropriate copying routines
can result in panics under load on some platforms, and reproducible panics on
others.
.SH NOTES
-.sp
-.LP
STREAMS drivers do not have \fBioctl()\fR routines. The stream head converts
I/O control commands to \fBM_IOCTL\fR messages, which are handled by the
driver's \fBput\fR(9E) or \fBsrv\fR(9E) routine.
--
cgit v1.2.3


From 91a7ed35601b9dac7c03b16025b401dc9f7a6bbd Mon Sep 17 00:00:00 2001
From: Robert Mustacchi
Date: Mon, 4 May 2020 23:26:10 +0000
Subject: 12720 _fini(9E) could note it's not called when _init fails

Reviewed by: Gergő Doma
Reviewed by: Yuri Pankov
Reviewed by: Patrick Mooney
Reviewed by: Juraj Lutter
Approved by: Dan McDonald
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 usr/src/man/man9e/_fini.9e | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

(limited to 'usr/src/man/man9e')

diff --git a/usr/src/man/man9e/_fini.9e b/usr/src/man/man9e/_fini.9e
index 19cd7b665f..72bfbb8979 100644
--- a/usr/src/man/man9e/_fini.9e
+++ b/usr/src/man/man9e/_fini.9e
@@ -3,11 +3,10 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH _FINI 9E "Jan 22, 2002"
+.TH _FINI 9E "May 06, 2020"
.SH NAME
_fini, _info, _init \- loadable module configuration entry points
.SH SYNOPSIS
-.LP
.nf
#include <sys/modctl.h>

@@ -27,13 +26,10 @@ _fini, _info, _init \- loadable module configuration entry points
.fi

.SH INTERFACE LEVEL
-.sp
-.LP
Solaris DDI specific (Solaris DDI). These entry points are required. You must
write them.
.SH PARAMETERS
.SS "_info(\|)"
-.sp
.ne 2
.na
\fB\fImodinfop\fR \fR
@@ -43,8 +39,6 @@
A pointer to an opaque \fBmodinfo\fR structure.
.RE

.SH DESCRIPTION
-.sp
-.LP
\fB_init()\fR initializes a loadable module. It is called before any other
routine in a loadable module. \fB_init()\fR returns the value returned by
\fBmod_install\fR(9F). The module may optionally perform some other work before
@@ -61,10 +55,10 @@ returns the value returned by \fBmod_info\fR(9F).
system wants to unload a module. If the module determines that it can be
unloaded, then \fB_fini()\fR returns the value returned by
\fBmod_remove\fR(9F).
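The rule this change documents has a companion obligation in \fB_init\fR(9E);
a minimal sketch (xx_lock and xx_modlinkage are invented stand-ins for a real
module's objects):

	#include <sys/modctl.h>
	#include <sys/ksynch.h>

	static kmutex_t xx_lock;
	static struct modlinkage xx_modlinkage;	/* assumed filled in elsewhere */

	int
	_init(void)
	{
		int error;

		mutex_init(&xx_lock, NULL, MUTEX_DRIVER, NULL);
		if ((error = mod_install(&xx_modlinkage)) != 0) {
			/*
			 * _fini() will never be called after a failed
			 * _init(), so undo all initialization here.
			 */
			mutex_destroy(&xx_lock);
		}
		return (error);
	}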
Upon successful return from \fB_fini()\fR no other -routine in the module will be called before \fB_init()\fR is called. +routine in the module will be called before \fB_init()\fR is called. If +\fB_init()\fR did not successfully complete, \fB_fini()\fR will not be +called. .SH RETURN VALUES -.sp -.LP \fB_init()\fR should return the appropriate error number if there is an error, otherwise it should return the return value from \fBmod_install\fR(9F). .sp @@ -79,7 +73,6 @@ resources, such as mutexes and calls to \fBddi_soft_state_fini\fR(9F), should only be destroyed in \fB_fini()\fR after \fBmod_remove()\fR returns successfully. .SH EXAMPLES -.LP \fBExample 1 \fRInitializing and Freeing a Mutex .sp .LP @@ -158,8 +151,6 @@ _fini(void) .in -2 .SH SEE ALSO -.sp -.LP \fBadd_drv\fR(1M), \fBmod_info\fR(9F), \fBmod_install\fR(9F), \fBmod_remove\fR(9F), \fBmutex\fR(9F), \fBmodldrv\fR(9S), \fBmodlinkage\fR(9S), \fBmodlstrmod\fR(9S) @@ -167,17 +158,11 @@ _fini(void) .LP \fIWriting Device Drivers\fR .SH WARNINGS -.sp -.LP Do not change the structures referred to by the \fBmodlinkage\fR structure after the call to \fBmod_install()\fR, as the system may copy or change them. .SH NOTES -.sp -.LP Even though the identifiers \fB_fini()\fR, \fB_info()\fR, and \fB_init()\fR appear to be declared as globals, their scope is restricted by the kernel to the module that they are defined in. .SH BUGS -.sp -.LP On some implementations \fB_info()\fR may be called before \fB_init()\fR. -- cgit v1.2.3 From d77e6e0f12d19668c0e9068c0fcd7a2123da5373 Mon Sep 17 00:00:00 2001 From: Paul Winder Date: Tue, 12 May 2020 12:26:12 +0100 Subject: 12693 Enable Forward Error Correction (FEC) configuration via dladm Reviewed by: Garrett D'Amore Reviewed by: Robert Mustacchi Approved by: Dan McDonald --- usr/src/lib/libdladm/common/linkprop.c | 80 +++++++++++++- usr/src/man/man1m/dladm.1m | 85 ++++++++++++--- usr/src/man/man9e/mac.9e | 54 +++++++++- usr/src/uts/common/io/cxgbe/common/common.h | 17 ++- usr/src/uts/common/io/cxgbe/common/t4_hw.c | 85 ++++++++++++--- .../uts/common/io/cxgbe/firmware/t4fw_interface.h | 20 +++- usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c | 112 ++++++++++++++++++- usr/src/uts/common/io/mac/mac.c | 27 +++++ usr/src/uts/common/io/mac/mac_provider.c | 17 +++ usr/src/uts/common/io/mlxcx/mlxcx.c | 5 + usr/src/uts/common/io/mlxcx/mlxcx.h | 7 ++ usr/src/uts/common/io/mlxcx/mlxcx_cmd.c | 101 +++++++++++++++++ usr/src/uts/common/io/mlxcx/mlxcx_gld.c | 119 +++++++++++++++++++++ usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 1 + usr/src/uts/common/io/mlxcx/mlxcx_reg.h | 55 ++++++++++ usr/src/uts/common/sys/mac.h | 10 ++ usr/src/uts/common/sys/mac_provider.h | 3 + 17 files changed, 758 insertions(+), 40 deletions(-) (limited to 'usr/src/man/man9e') diff --git a/usr/src/lib/libdladm/common/linkprop.c b/usr/src/lib/libdladm/common/linkprop.c index c33268c8f8..90edf1293f 100644 --- a/usr/src/lib/libdladm/common/linkprop.c +++ b/usr/src/lib/libdladm/common/linkprop.c @@ -22,6 +22,7 @@ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright 2015 Garrett D'Amore + * Copyright 2020 RackTop Systems, Inc. 
*/ #include @@ -154,13 +155,13 @@ static pd_getf_t get_zone, get_autopush, get_rate_mod, get_rate, get_bridge_pvid, get_protection, get_rxrings, get_txrings, get_cntavail, get_secondary_macs, get_allowedips, get_allowedcids, get_pool, - get_rings_range, get_linkmode_prop, + get_rings_range, get_linkmode_prop, get_bits, get_promisc_filtered; static pd_setf_t set_zone, set_rate, set_powermode, set_radio, set_public_prop, set_resource, set_stp_prop, set_bridge_forward, set_bridge_pvid, set_secondary_macs, - set_promisc_filtered; + set_promisc_filtered, set_public_bitprop; static pd_checkf_t check_zone, check_autopush, check_rate, check_hoplimit, check_encaplim, check_uint32, check_maxbw, check_cpus, @@ -255,6 +256,10 @@ static link_attr_t link_attr[] = { { MAC_PROP_FLOWCTRL, sizeof (link_flowctrl_t), "flowctrl"}, + { MAC_PROP_ADV_FEC_CAP, sizeof (link_fec_t), "adv_fec_cap"}, + + { MAC_PROP_EN_FEC_CAP, sizeof (link_fec_t), "en_fec_cap"}, + { MAC_PROP_ZONE, sizeof (dld_ioc_zid_t), "zone"}, { MAC_PROP_AUTOPUSH, sizeof (struct dlautopush), "autopush"}, @@ -433,6 +438,12 @@ static val_desc_t link_flow_vals[] = { { "rx", LINK_FLOWCTRL_RX }, { "bi", LINK_FLOWCTRL_BI } }; +static val_desc_t link_fec_vals[] = { + { "none", LINK_FEC_NONE }, + { "auto", LINK_FEC_AUTO }, + { "rs", LINK_FEC_RS }, + { "base-r", LINK_FEC_BASE_R } +}; static val_desc_t link_priority_vals[] = { { "low", MPL_LOW }, { "medium", MPL_MEDIUM }, @@ -551,6 +562,16 @@ static prop_desc_t prop_table[] = { set_public_prop, NULL, get_flowctl, NULL, 0, DATALINK_CLASS_PHYS, DL_ETHER }, + { "adv_fec_cap", { "", LINK_FEC_AUTO }, + link_fec_vals, VALCNT(link_fec_vals), + NULL, NULL, get_bits, NULL, + 0, DATALINK_CLASS_PHYS, DL_ETHER }, + + { "en_fec_cap", { "", LINK_FEC_AUTO }, + link_fec_vals, VALCNT(link_fec_vals), + set_public_bitprop, NULL, get_bits, NULL, + 0, DATALINK_CLASS_PHYS, DL_ETHER }, + { "secondary-macs", { "--", 0 }, NULL, 0, set_secondary_macs, NULL, get_secondary_macs, check_secondary_macs, PD_CHECK_ALLOC, @@ -3846,6 +3867,33 @@ done: return (status); } +static dladm_status_t +set_public_bitprop(dladm_handle_t handle, prop_desc_t *pdp, + datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, + datalink_media_t media) +{ + uint_t i, j; + val_desc_t vd = { 0 }; + + if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0) + return (DLADM_STATUS_BADARG); + + for (i = 0; i < val_cnt; i++) { + for (j = 0; j < pdp->pd_noptval; j++) { + if (strcasecmp(vdp[i].vd_name, + pdp->pd_optval[j].vd_name) == 0) { + vd.vd_val |= pdp->pd_optval[j].vd_val; + break; + } + } + } + + if (vd.vd_val == 0) + return (DLADM_STATUS_BADARG); + + return (set_public_prop(handle, pdp, linkid, &vd, 1, flags, media)); +} + dladm_status_t i_dladm_macprop(dladm_handle_t handle, void *dip, boolean_t set) { @@ -4158,6 +4206,34 @@ get_flowctl(dladm_handle_t handle, prop_desc_t *pdp, return (DLADM_STATUS_OK); } +static dladm_status_t +get_bits(dladm_handle_t handle, prop_desc_t *pdp, + datalink_id_t linkid, char **prop_val, uint_t *val_cnt, + datalink_media_t media, uint_t flags, uint_t *perm_flags) +{ + uint32_t v; + dladm_status_t status; + uint_t i, cnt; + + status = i_dladm_get_public_prop(handle, linkid, pdp->pd_name, flags, + perm_flags, &v, sizeof (v)); + if (status != DLADM_STATUS_OK) + return (status); + + cnt = 0; + for (i = 0; cnt < *val_cnt && i < pdp->pd_noptval; i++) { + if ((v & pdp->pd_optval[i].vd_val) != 0) { + (void) snprintf(prop_val[cnt++], DLADM_STRSIZE, + pdp->pd_optval[i].vd_name); + } + } + + if (i < pdp->pd_noptval) + return 
(DLADM_STATUS_BADVALCNT); + + *val_cnt = cnt; + return (DLADM_STATUS_OK); +} /* ARGSUSED */ static dladm_status_t diff --git a/usr/src/man/man1m/dladm.1m b/usr/src/man/man1m/dladm.1m index f84c147caf..d6050c5114 100644 --- a/usr/src/man/man1m/dladm.1m +++ b/usr/src/man/man1m/dladm.1m @@ -42,12 +42,12 @@ .\" .\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved .\" Copyright 2016 Joyent, Inc. +.\" Copyright 2020 RackTop Systems, Inc. .\" -.TH DLADM 1M "Dec 16, 2016" +.TH DLADM 1M "May 4, 2020" .SH NAME dladm \- administer data links .SH SYNOPSIS -.LP .nf \fBdladm show-link\fR [\fB-P\fR] [\fB-s\fR [\fB-i\fR \fIinterval\fR]] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIlink\fR] \fBdladm rename-link\fR [\fB-R\fR \fIroot-dir\fR] \fIlink\fR \fInew-link\fR @@ -179,7 +179,6 @@ dladm \- administer data links .fi .SH DESCRIPTION -.LP The \fBdladm\fR command is used to administer data-links. A data-link is represented in the system as a \fBSTREAMS DLPI\fR (v2) interface which can be plumbed under protocol stacks such as \fBTCP/IP\fR. Each data-link relies on @@ -332,7 +331,6 @@ characters. .RE .SS "Options" -.LP Each \fBdladm\fR subcommand has its own set of options. However, many of the subcommands have the following as a common option: .sp @@ -347,7 +345,6 @@ deletion, or renaming-should apply. .RE .SS "SUBCOMMANDS" -.LP The following subcommands are supported: .sp .ne 2 @@ -4438,7 +4435,6 @@ display network usage for all links. .RE .SS "Parsable Output Format" -.LP Many \fBdladm\fR subcommands have an option that displays output in a machine-parsable format. The output format is one or more lines of colon (\fB:\fR) delimited fields. The fields displayed are specific to the subcommand @@ -4454,7 +4450,6 @@ by using shell \fBread\fR(1) functions with the environment variable \fBIFS=:\fR (see \fBEXAMPLES\fR, below). Note that escaping is not done when you request only a single field. .SS "General Link Properties" -.LP The following general link properties are supported: .sp .ne 2 @@ -4725,7 +4720,6 @@ currently running on the system. By default, the zone binding is as per .RE .SS "Wifi Link Properties" -.LP The following \fBWiFi\fR link properties are supported. Note that the ability to set a given property to a given value depends on the driver and hardware. .sp @@ -4777,7 +4771,6 @@ is no fixed speed. .RE .SS "Ethernet Link Properties" -.LP The following MII Properties, as documented in \fBieee802.3\fR(5), are supported in read-only mode: .RS +4 @@ -4935,6 +4928,75 @@ Note that the actual settings for this value are constrained by the capabilities allowed by the device and the link partner. .RE +.sp +.ne 2 +.na +\fB\fBen_fec_cap\fR\fR +.ad +.sp .6 +.RS 4n +Sets the Forward Error Correct (FEC) code(s) to be advertised by the +device. +Valid values are: +.sp +.ne 2 +.na +\fB\fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Allow the device not to use FEC. +.RE + +.sp +.ne 2 +.na +\fB\fBauto\fR\fR +.ad +.sp .6 +.RS 4n +The device will automatically decide which FEC code to use. +.RE + +.sp +.ne 2 +.na +\fB\fBrs\fR\fR +.ad +.sp .6 +.RS 4n +Allow Reed-Solomon FEC code. +.RE + +.sp +.ne 2 +.na +\fB\fBbase-r\fR\fR +.ad +.sp .6 +.RS 4n +Allow Base-R (also known as FireCode) code. +.RE + +Valid input is either \fBauto\fR as a single value, or a comma separated +combination of \fBnone\fR, \fBrs\fR and \fBbase-r\fR. +The default value is \fBauto\fR. +.sp +.LP +Note the actual FEC settings and combinations are constrained by the +capabilities allowed by the device and the link partner. 
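+.sp
+.LP
+For example, the following command (the link name is illustrative) would
+allow either Reed-Solomon FEC or no FEC on the link:
+.sp
+.in +2
+.nf
+# dladm set-linkprop -p en_fec_cap=rs,none mlxcx0
+.fi
+.in -2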
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBadv_fec_cap\fR\fR
+.ad
+.sp .6
+.RS 4n
+(read only) The currently negotiated Forward Error Correction code.
+.RE
+
.sp
.ne 2
.na
@@ -4992,7 +5054,6 @@ The default value is \fBvlanonly\fR.
.RE

.SS "IP Tunnel Link Properties"
-.LP
The following IP tunnel link properties are supported.
.sp
.ne 2
@@ -5019,7 +5080,6 @@ default value is 4. A value of 0 disables the encapsulation limit.
.RE

.SH EXAMPLES
-.LP
\fBExample 1 \fRConfiguring an Aggregation
.sp
.LP
@@ -5494,7 +5554,6 @@ interface. See \fBifconfig\fR(1M) for a description of how IPv6 addresses are
configured on 6to4 tunnel links.
.SH ATTRIBUTES
-.LP
See \fBattributes\fR(5) for descriptions of the following attributes:
.sp
.LP
@@ -5527,12 +5586,10 @@ Interface Stability	Committed
.TE

.SH SEE ALSO
-.LP
\fBacctadm\fR(1M), \fBautopush\fR(1M), \fBifconfig\fR(1M), \fBipsecconf\fR(1M),
\fBndd\fR(1M), \fBpsrset\fR(1M), \fBwpad\fR(1M), \fBzonecfg\fR(1M),
\fBattributes\fR(5), \fBieee802.3\fR(5), \fBdlpi\fR(7P)
.SH NOTES
-.LP
The preferred method of referring to an aggregation in the aggregation
subcommands is by its link name. Referring to an aggregation by its integer
\fIkey\fR is supported for backward compatibility, but is not necessary. When
diff --git a/usr/src/man/man9e/mac.9e b/usr/src/man/man9e/mac.9e
index ffeea417ca..3a3f2ae90a 100644
--- a/usr/src/man/man9e/mac.9e
+++ b/usr/src/man/man9e/mac.9e
@@ -10,8 +10,9 @@
.\"
.\"
.\" Copyright 2019 Joyent, Inc.
+.\" Copyright 2020 RackTop Systems, Inc.
.\"
-.Dd July 22, 2019
+.Dd May 11, 2020
.Dt MAC 9E
.Os
.Sh NAME
@@ -890,6 +891,57 @@ it has configured the device, not what the device has
actually negotiated.
When setting the property, it should update the hardware and allow the link
to potentially perform auto-negotiation again.
+.It Sy MAC_PROP_EN_FEC_CAP
+.Bd -filled -compact
+Type:
+.Sy link_fec_t |
+Permissions:
+.Sy Read/Write
+.Ed
+.Pp
+The
+.Sy MAC_PROP_EN_FEC_CAP
+property indicates which Forward Error Correction (FEC) code is advertised
+by the device.
+.Pp
+The
+.Sy link_fec_t
+is an enumeration that may be a combination of the following bit values:
+.Bl -tag -width Ds
+.It Sy LINK_FEC_NONE
+No FEC over the link.
+.It Sy LINK_FEC_AUTO
+The FEC coding to use is auto-negotiated.
+.Sy LINK_FEC_AUTO
+cannot be set along with any of the other values.
+This is the default setting the device driver should use.
+.It Sy LINK_FEC_RS
+The link may use Reed-Solomon FEC coding.
+.It Sy LINK_FEC_BASE_R
+The link may use Base-R coding, also commonly referred to as FireCode.
+.El
+.Pp
+When setting the property, the driver should update the hardware with the
+requested coding or combination of codings.
+If a particular combination of codings is not supported by the hardware,
+the device driver should return
+.Er EINVAL .
+When retrieving this property, the device driver should return the current
+value of the property.
+.It Sy MAC_PROP_ADV_FEC_CAP
+.Bd -filled -compact
+Type:
+.Sy link_fec_t |
+Permissions:
+.Sy Read-Only
+.Ed
+.Pp
+The
+.Sy MAC_PROP_ADV_FEC_CAP
+property has the same values as
+.Sy MAC_PROP_EN_FEC_CAP .
+The property indicates which Forward Error Correction (FEC) code has been
+negotiated over the link.
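+.Pp
+As an illustrative sketch only (the hardware-capability check is a
+hypothetical driver function, not part of this interface), an
+.Fn mc_setprop
+handler for these properties might validate its input as follows:
+.Bd -literal -offset indent
+link_fec_t fec;
+
+bcopy(pr_val, &fec, sizeof (fec));
+/* LINK_FEC_AUTO must be used on its own. */
+if ((fec & LINK_FEC_AUTO) != 0 && fec != LINK_FEC_AUTO)
+	return (EINVAL);
+/* Reject combinations the device cannot honor. */
+if (!xx_hw_supports_fec(fec))
+	return (EINVAL);
+.Ed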
.El .Pp The remaining properties are all about various auto-negotiation link diff --git a/usr/src/uts/common/io/cxgbe/common/common.h b/usr/src/uts/common/io/cxgbe/common/common.h index c7de2c4ebf..b8d77ebda3 100644 --- a/usr/src/uts/common/io/cxgbe/common/common.h +++ b/usr/src/uts/common/io/cxgbe/common/common.h @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. + */ + #ifndef __CHELSIO_COMMON_H #define __CHELSIO_COMMON_H @@ -103,9 +107,16 @@ enum { typedef unsigned char cc_pause_t; enum { - FEC_AUTO = 1 << 0, /* IEEE 802.3 "automatic" */ - FEC_RS = 1 << 1, /* Reed-Solomon */ - FEC_BASER_RS = 1 << 2, /* BaseR/Reed-Solomon */ + FEC_RS = 1 << 0, /* Reed-Solomon */ + FEC_BASER_RS = 1 << 1, /* Base-R, aka Firecode */ + FEC_NONE = 1 << 2, /* no FEC */ + + /* + * Pseudo FECs that translate to real FECs. The firmware knows nothing + * about these and they start at M_FW_PORT_CAP32_FEC + 1. AUTO should + * be set all by itself. + */ + FEC_AUTO = 1 << 5, }; typedef unsigned char cc_fec_t; diff --git a/usr/src/uts/common/io/cxgbe/common/t4_hw.c b/usr/src/uts/common/io/cxgbe/common/t4_hw.c index ae88f36f15..4bb48f1b3a 100644 --- a/usr/src/uts/common/io/cxgbe/common/t4_hw.c +++ b/usr/src/uts/common/io/cxgbe/common/t4_hw.c @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. + */ + #include "common.h" #include "t4_regs.h" #include "t4_regs_values.h" @@ -4645,20 +4649,57 @@ static inline cc_fec_t fwcap_to_cc_fec(fw_port_cap32_t fw_fec) if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS) cc_fec |= FEC_BASER_RS; - return cc_fec; + if (cc_fec == 0) + cc_fec = FEC_NONE; + + return (cc_fec); } /* Translate Common Code Forward Error Correction specification to Firmware */ -static inline fw_port_cap32_t cc_to_fwcap_fec(cc_fec_t cc_fec) +static inline boolean_t +cc_to_fwcap_fec(fw_port_cap32_t *fw_fecp, cc_fec_t cc_fec, + struct link_config *lc) { fw_port_cap32_t fw_fec = 0; - if (cc_fec & FEC_RS) + if ((cc_fec & FEC_AUTO) != 0) { + if ((lc->pcaps & FW_PORT_CAP32_SPEED_100G) == 0) + fw_fec |= FW_PORT_CAP32_FEC_BASER_RS; + + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) + fw_fec |= FW_PORT_CAP32_FEC_NO_FEC; + + fw_fec |= FW_PORT_CAP32_FEC_RS; + + *fw_fecp = fw_fec; + return (B_TRUE); + } + + if ((cc_fec & FEC_RS) != 0) fw_fec |= FW_PORT_CAP32_FEC_RS; - if (cc_fec & FEC_BASER_RS) + + if ((cc_fec & FEC_BASER_RS) != 0 && + (lc->pcaps & FW_PORT_CAP32_SPEED_100G) == 0) fw_fec |= FW_PORT_CAP32_FEC_BASER_RS; - return fw_fec; + if ((cc_fec & FEC_NONE) != 0) { + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) { + fw_fec |= FW_PORT_CAP32_FORCE_FEC; + fw_fec |= FW_PORT_CAP32_FEC_NO_FEC; + } + + *fw_fecp = fw_fec; + return (B_TRUE); + } + + if (fw_fec == 0) + return (B_FALSE); + + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) + fw_fec |= FW_PORT_CAP32_FORCE_FEC; + + *fw_fecp = fw_fec; + return (B_TRUE); } /** @@ -4692,11 +4733,18 @@ fw_port_cap32_t t4_link_acaps(struct adapter *adapter, unsigned int port, * the Transceiver Module EPROM FEC parameters. Otherwise we * use whatever is in the current Requested FEC settings. 
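+ * With this change the translation into firmware capabilities can fail
+ * for combinations the port cannot honor, in which case t4_link_acaps()
+ * reports no capabilities at all (see cc_to_fwcap_fec() above).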
*/ - if (lc->requested_fec & FEC_AUTO) - cc_fec = fwcap_to_cc_fec(lc->def_acaps); - else - cc_fec = lc->requested_fec; - fw_fec = cc_to_fwcap_fec(cc_fec); + if (fec_supported(lc->pcaps)) { + if (lc->requested_fec & FEC_AUTO) + cc_fec = fwcap_to_cc_fec(lc->def_acaps); + else + cc_fec = lc->requested_fec; + + if (!cc_to_fwcap_fec(&fw_fec, cc_fec, lc)) + return (0); + } else { + fw_fec = 0; + cc_fec = FEC_NONE; + } /* Figure out what our Requested Port Capabilities are going to be. * Note parallel structure in t4_handle_get_port_info() and @@ -9641,12 +9689,17 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps, lc->speed = 0; lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX; - /* - * For Forward Error Control, we default to whatever the Firmware - * tells us the Link is currently advertising. - */ - lc->requested_fec = FEC_AUTO; - lc->fec = fwcap_to_cc_fec(lc->def_acaps); + if (fec_supported(pcaps)) { + /* + * For Forward Error Control, we default to whatever the Firmware + * tells us the Link is currently advertising. + */ + lc->requested_fec = FEC_AUTO; + lc->fec = fwcap_to_cc_fec(lc->def_acaps); + } else { + lc->requested_fec = FEC_NONE; + lc->fec = FEC_NONE; + } /* If the Port is capable of Auto-Negtotiation, initialize it as * "enabled" and copy over all of the Physical Port Capabilities diff --git a/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h b/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h index d705c73891..b998e85bae 100644 --- a/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h +++ b/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h @@ -11,6 +11,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. + */ + #ifndef _T4FW_INTERFACE_H_ #define _T4FW_INTERFACE_H_ @@ -7204,11 +7208,12 @@ enum fw_port_mdi { #define FW_PORT_CAP32_MDISTRAIGHT 0x00400000UL #define FW_PORT_CAP32_FEC_RS 0x00800000UL #define FW_PORT_CAP32_FEC_BASER_RS 0x01000000UL -#define FW_PORT_CAP32_FEC_RESERVED1 0x02000000UL +#define FW_PORT_CAP32_FEC_NO_FEC 0x02000000UL #define FW_PORT_CAP32_FEC_RESERVED2 0x04000000UL #define FW_PORT_CAP32_FEC_RESERVED3 0x08000000UL #define FW_PORT_CAP32_FORCE_PAUSE 0x10000000UL -#define FW_PORT_CAP32_RESERVED2 0xe0000000UL +#define FW_PORT_CAP32_FORCE_FEC 0x20000000UL +#define FW_PORT_CAP32_RESERVED2 0xc0000000UL #define S_FW_PORT_CAP32_SPEED 0 #define M_FW_PORT_CAP32_SPEED 0xfff @@ -7254,7 +7259,7 @@ enum fw_port_mdi32 { (((x) >> S_FW_PORT_CAP32_MDI) & M_FW_PORT_CAP32_MDI) #define S_FW_PORT_CAP32_FEC 23 -#define M_FW_PORT_CAP32_FEC 0x1f +#define M_FW_PORT_CAP32_FEC 0x5f #define V_FW_PORT_CAP32_FEC(x) ((x) << S_FW_PORT_CAP32_FEC) #define G_FW_PORT_CAP32_FEC(x) \ (((x) >> S_FW_PORT_CAP32_FEC) & M_FW_PORT_CAP32_FEC) @@ -7269,6 +7274,15 @@ enum fw_port_mdi32 { #define CAP32_FC(__cap32) \ (V_FW_PORT_CAP32_FC(M_FW_PORT_CAP32_FC) & __cap32) +#ifdef _KERNEL +static inline boolean_t +fec_supported(uint32_t caps) +{ + return ((caps & (FW_PORT_CAP32_SPEED_25G | FW_PORT_CAP32_SPEED_50G | + FW_PORT_CAP32_SPEED_100G)) != 0); +} +#endif + enum fw_port_action { FW_PORT_ACTION_L1_CFG = 0x0001, FW_PORT_ACTION_L2_CFG = 0x0002, diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c index 59c0ddde8d..9b4ffd8325 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. 
+ */ + #include #include #include @@ -930,6 +934,62 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) return (status); } +static link_fec_t +fec_to_link_fec(cc_fec_t cc_fec) +{ + link_fec_t link_fec = 0; + + if ((cc_fec & (FEC_RS | FEC_BASER_RS)) == (FEC_RS | FEC_BASER_RS)) + return (LINK_FEC_AUTO); + + if ((cc_fec & FEC_NONE) != 0) + link_fec |= LINK_FEC_NONE; + + if ((cc_fec & FEC_AUTO) != 0) + link_fec |= LINK_FEC_AUTO; + + if ((cc_fec & FEC_RS) != 0) + link_fec |= LINK_FEC_RS; + + if ((cc_fec & FEC_BASER_RS) != 0) + link_fec |= LINK_FEC_BASE_R; + + return (link_fec); +} + +static int +link_fec_to_fec(int v) +{ + int fec = 0; + + if ((v & LINK_FEC_AUTO) != 0) { + fec = FEC_AUTO; + v &= ~LINK_FEC_AUTO; + } else { + if ((v & LINK_FEC_NONE) != 0) { + fec = FEC_NONE; + v &= ~LINK_FEC_NONE; + } + + if ((v & LINK_FEC_RS) != 0) { + fec |= FEC_RS; + v &= ~LINK_FEC_RS; + } + + if ((v & LINK_FEC_BASE_R) != 0) { + fec |= FEC_BASER_RS; + v &= ~LINK_FEC_BASE_R; + } + } + + if (v != 0) + return (-1); + + ASSERT3S(fec, !=, 0); + + return (fec); +} + /* ARGSUSED */ static int t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, @@ -941,7 +1001,9 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, uint8_t v8 = *(uint8_t *)val; uint32_t v32 = *(uint32_t *)val; int old, new = 0, relink = 0, rx_mode = 0, rc = 0; + boolean_t down_link = B_TRUE; link_flowctrl_t fc; + link_fec_t fec; /* * Save a copy of link_config. This can be used to restore link_config @@ -1009,6 +1071,30 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, } break; + case MAC_PROP_EN_FEC_CAP: + if (!fec_supported(lc->pcaps)) { + rc = ENOTSUP; + break; + } + + fec = *(link_fec_t *)val; + new = link_fec_to_fec(fec); + if (new < 0) { + rc = EINVAL; + } else if (new != lc->requested_fec) { + lc->requested_fec = new; + relink = 1; + /* + * For fec, do not preemptively force the link + * down. If changing fec causes the link state + * to transition, then appropriate asynchronous + * events are generated which correctly reflect + * the link state. 
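+			 * Skipping the preemptive t4_os_link_changed()
+			 * call below avoids reporting a spurious
+			 * link-down to MAC for a FEC-only change.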
+ */ + down_link = B_FALSE; + } + break; + case MAC_PROP_EN_10GFDX_CAP: if (lc->pcaps & FW_PORT_CAP32_ANEG && is_10G_port(pi)) { old = lc->acaps & FW_PORT_CAP32_SPEED_10G; @@ -1062,7 +1148,8 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, if (isset(&sc->open_device_map, pi->port_id) != 0) { if (relink != 0) { - t4_os_link_changed(pi->adapter, pi->port_id, 0); + if (down_link) + t4_os_link_changed(pi->adapter, pi->port_id, 0); rc = begin_synchronized_op(pi, 1, 1); if (rc != 0) return (rc); @@ -1143,6 +1230,20 @@ t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, *(link_flowctrl_t *)val = LINK_FLOWCTRL_NONE; break; + case MAC_PROP_ADV_FEC_CAP: + if (!fec_supported(lc->pcaps)) + return (ENOTSUP); + + *(link_fec_t *)val = fec_to_link_fec(lc->fec); + break; + + case MAC_PROP_EN_FEC_CAP: + if (!fec_supported(lc->pcaps)) + return (ENOTSUP); + + *(link_fec_t *)val = fec_to_link_fec(lc->requested_fec); + break; + case MAC_PROP_ADV_100GFDX_CAP: case MAC_PROP_EN_100GFDX_CAP: *u = !!(lc->acaps & FW_PORT_CAP32_SPEED_100G); @@ -1212,6 +1313,15 @@ t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id, mac_prop_info_set_default_link_flowctrl(ph, LINK_FLOWCTRL_BI); break; + case MAC_PROP_EN_FEC_CAP: + mac_prop_info_set_default_fec(ph, LINK_FEC_AUTO); + break; + + case MAC_PROP_ADV_FEC_CAP: + mac_prop_info_set_perm(ph, MAC_PROP_PERM_READ); + mac_prop_info_set_default_fec(ph, LINK_FEC_AUTO); + break; + case MAC_PROP_EN_10GFDX_CAP: if (lc->pcaps & FW_PORT_CAP32_ANEG && lc->pcaps & FW_PORT_CAP32_SPEED_10G) diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 0a52043a15..707fca24d0 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. * Copyright 2015 Garrett D'Amore + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -3336,6 +3337,10 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_FLOWCTRL: minsize = sizeof (link_flowctrl_t); break; + case MAC_PROP_ADV_FEC_CAP: + case MAC_PROP_EN_FEC_CAP: + minsize = sizeof (link_fec_t); + break; case MAC_PROP_ADV_5000FDX_CAP: case MAC_PROP_EN_5000FDX_CAP: case MAC_PROP_ADV_2500FDX_CAP: @@ -3524,6 +3529,28 @@ mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val, break; } + case MAC_PROP_ADV_FEC_CAP: + case MAC_PROP_EN_FEC_CAP: { + link_fec_t fec; + + ASSERT(valsize >= sizeof (link_fec_t)); + + /* + * fec cannot be zero, and auto must be set exclusively. + */ + bcopy(val, &fec, sizeof (link_fec_t)); + if (fec == 0) + return (EINVAL); + if ((fec & LINK_FEC_AUTO) != 0 && (fec & ~LINK_FEC_AUTO) != 0) + return (EINVAL); + + if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { + err = mip->mi_callbacks->mc_setprop(mip->mi_driver, + name, id, valsize, val); + } + break; + } + default: /* For other driver properties, call driver's callback */ if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index ce986fd4bf..bfaf232d25 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -23,6 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. 
*/ #include @@ -1526,6 +1527,22 @@ mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph, pr->pr_flags |= MAC_PROP_INFO_DEFAULT; } +void +mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val) +{ + mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; + + /* nothing to do if the caller doesn't want the default value */ + if (pr->pr_default == NULL) + return; + + ASSERT(pr->pr_default_size >= sizeof (link_fec_t)); + + bcopy(&val, pr->pr_default, sizeof (val)); + + pr->pr_flags |= MAC_PROP_INFO_DEFAULT; +} + void mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min, uint32_t max) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 9fae7c5f77..2aefac33db 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -1756,6 +1756,11 @@ mlxcx_setup_ports(mlxcx_t *mlxp) mutex_exit(&p->mlp_mtx); goto err; } + if (!mlxcx_cmd_query_port_fec(mlxp, p)) { + mutex_exit(&p->mlp_mtx); + goto err; + } + p->mlp_fec_requested = LINK_FEC_AUTO; mutex_exit(&p->mlp_mtx); } diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 52240df3a3..06277d033c 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -346,6 +346,8 @@ typedef struct mlxcx_port { mlxcx_eth_proto_t mlp_max_proto; mlxcx_eth_proto_t mlp_admin_proto; mlxcx_eth_proto_t mlp_oper_proto; + mlxcx_pplm_fec_active_t mlp_fec_active; + link_fec_t mlp_fec_requested; mlxcx_eth_inline_mode_t mlp_wqe_min_inline; @@ -1320,7 +1322,12 @@ extern boolean_t mlxcx_cmd_access_register(mlxcx_t *, mlxcx_cmd_reg_opmod_t, mlxcx_register_id_t, mlxcx_register_data_t *); extern boolean_t mlxcx_cmd_query_port_mtu(mlxcx_t *, mlxcx_port_t *); extern boolean_t mlxcx_cmd_query_port_status(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_modify_port_status(mlxcx_t *, mlxcx_port_t *, + mlxcx_port_status_t); extern boolean_t mlxcx_cmd_query_port_speed(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_query_port_fec(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_modify_port_fec(mlxcx_t *, mlxcx_port_t *, + mlxcx_pplm_fec_caps_t); extern boolean_t mlxcx_cmd_set_port_mtu(mlxcx_t *, mlxcx_port_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index 30fb7ca8ef..f059b856a6 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. 
*/ /* @@ -1594,6 +1595,8 @@ mlxcx_reg_name(mlxcx_register_id_t rid) return ("MCIA"); case MLXCX_REG_PPCNT: return ("PPCNT"); + case MLXCX_REG_PPLM: + return ("PPLM"); default: return ("???"); } @@ -1640,6 +1643,9 @@ mlxcx_cmd_access_register(mlxcx_t *mlxp, mlxcx_cmd_reg_opmod_t opmod, case MLXCX_REG_PPCNT: dsize = sizeof (mlxcx_reg_ppcnt_t); break; + case MLXCX_REG_PPLM: + dsize = sizeof (mlxcx_reg_pplm_t); + break; default: dsize = 0; VERIFY(0); @@ -1775,6 +1781,25 @@ mlxcx_cmd_query_port_status(mlxcx_t *mlxp, mlxcx_port_t *mlp) return (ret); } +boolean_t +mlxcx_cmd_modify_port_status(mlxcx_t *mlxp, mlxcx_port_t *mlp, + mlxcx_port_status_t status) +{ + mlxcx_register_data_t data; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data, sizeof (data)); + data.mlrd_paos.mlrd_paos_local_port = mlp->mlp_num + 1; + data.mlrd_paos.mlrd_paos_admin_status = status; + set_bit32(&data.mlrd_paos.mlrd_paos_flags, MLXCX_PAOS_ADMIN_ST_EN); + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_WRITE, + MLXCX_REG_PAOS, &data); + + return (ret); +} + boolean_t mlxcx_cmd_query_port_speed(mlxcx_t *mlxp, mlxcx_port_t *mlp) { @@ -1808,6 +1833,82 @@ mlxcx_cmd_query_port_speed(mlxcx_t *mlxp, mlxcx_port_t *mlp) return (ret); } +boolean_t +mlxcx_cmd_query_port_fec(mlxcx_t *mlxp, mlxcx_port_t *mlp) +{ + mlxcx_register_data_t data; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data, sizeof (data)); + data.mlrd_pplm.mlrd_pplm_local_port = mlp->mlp_num + 1; + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ, + MLXCX_REG_PPLM, &data); + + if (ret) { + mlp->mlp_fec_active = + from_be24(data.mlrd_pplm.mlrd_pplm_fec_mode_active); + } + + return (ret); +} + +boolean_t +mlxcx_cmd_modify_port_fec(mlxcx_t *mlxp, mlxcx_port_t *mlp, + mlxcx_pplm_fec_caps_t fec) +{ + mlxcx_register_data_t data_in, data_out; + mlxcx_pplm_fec_caps_t caps; + mlxcx_reg_pplm_t *pplm_in, *pplm_out; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data_in, sizeof (data_in)); + pplm_in = &data_in.mlrd_pplm; + pplm_in->mlrd_pplm_local_port = mlp->mlp_num + 1; + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ, + MLXCX_REG_PPLM, &data_in); + + if (!ret) + return (B_FALSE); + + bzero(&data_out, sizeof (data_out)); + pplm_out = &data_out.mlrd_pplm; + pplm_out->mlrd_pplm_local_port = mlp->mlp_num + 1; + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_56G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_56G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_100G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_100G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_50G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_50G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_25G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_25G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_10_40G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_10_40G, fec & caps); + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_WRITE, + MLXCX_REG_PPLM, &data_out); + + return (ret); +} + boolean_t mlxcx_cmd_modify_nic_vport_ctx(mlxcx_t *mlxp, mlxcx_port_t *mlp, mlxcx_modify_nic_vport_ctx_fields_t fields) diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c 
b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index 5d15ec1fbb..2521641a00 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -80,6 +80,53 @@ mlxcx_speed_to_bits(mlxcx_eth_proto_t v)
	}
}

+static link_fec_t
+mlxcx_fec_to_link_fec(mlxcx_pplm_fec_active_t mlxcx_fec)
+{
+	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_NONE) != 0)
+		return (LINK_FEC_NONE);
+
+	if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_FIRECODE) != 0)
+		return (LINK_FEC_BASE_R);
+
+	if ((mlxcx_fec & (MLXCX_PPLM_FEC_ACTIVE_RS528 |
+	    MLXCX_PPLM_FEC_ACTIVE_RS271 | MLXCX_PPLM_FEC_ACTIVE_RS544 |
+	    MLXCX_PPLM_FEC_ACTIVE_RS272)) != 0)
+		return (LINK_FEC_RS);
+
+	return (LINK_FEC_NONE);
+}
+
+static boolean_t
+mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp)
+{
+	mlxcx_pplm_fec_caps_t pplm_fec = 0;
+
+	if ((fec & LINK_FEC_AUTO) != 0) {
+		pplm_fec = MLXCX_PPLM_FEC_CAP_AUTO;
+		fec &= ~LINK_FEC_AUTO;
+	} else if ((fec & LINK_FEC_NONE) != 0) {
+		pplm_fec = MLXCX_PPLM_FEC_CAP_NONE;
+		fec &= ~LINK_FEC_NONE;
+	} else if ((fec & LINK_FEC_RS) != 0) {
+		pplm_fec |= MLXCX_PPLM_FEC_CAP_RS;
+		fec &= ~LINK_FEC_RS;
+	} else if ((fec & LINK_FEC_BASE_R) != 0) {
+		pplm_fec |= MLXCX_PPLM_FEC_CAP_FIRECODE;
+		fec &= ~LINK_FEC_BASE_R;
+	}
+
+	/*
+	 * Only one fec option is allowed.
+	 */
+	if (fec != 0)
+		return (B_FALSE);
+
+	*pfecp = pplm_fec;
+
+	return (B_TRUE);
+}
+
 static int
 mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat,
     uint64_t *val)
@@ -1091,6 +1138,14 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
		mac_prop_info_set_default_uint8(prh, 1);
		break;
+	case MAC_PROP_ADV_FEC_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
+		break;
+	case MAC_PROP_EN_FEC_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO);
+		break;
	case MAC_PROP_ADV_100GFDX_CAP:
	case MAC_PROP_EN_100GFDX_CAP:
		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
@@ -1150,6 +1205,9 @@ mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
	uint32_t new_mtu, new_hw_mtu, old_mtu;
	mlxcx_buf_shard_t *sh;
	boolean_t allocd = B_FALSE;
+	boolean_t relink = B_FALSE;
+	link_fec_t fec;
+	mlxcx_pplm_fec_caps_t cap_fec;

	mutex_enter(&port->mlp_mtx);

@@ -1198,11 +1256,57 @@ mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
			break;
		}
		break;
+
+	case MAC_PROP_EN_FEC_CAP:
+		bcopy(pr_val, &fec, sizeof (fec));
+		if (!mlxcx_link_fec_cap(fec, &cap_fec)) {
+			ret = EINVAL;
+			break;
+		}
+
+		/*
+		 * Don't change the FEC if it is already at the requested
+		 * setting AND the port is up.
+		 * When the port is down, always set the FEC and attempt
+		 * to retrain the link.
+		 */
+		if (fec == port->mlp_fec_requested &&
+		    fec == mlxcx_fec_to_link_fec(port->mlp_fec_active) &&
+		    port->mlp_oper_status != MLXCX_PORT_STATUS_DOWN)
+			break;
+
+		/*
+		 * The most likely cause of this failing is an invalid
+		 * or unsupported fec option.
+		 */
+		if (!mlxcx_cmd_modify_port_fec(mlxp, port, cap_fec)) {
+			ret = EINVAL;
+			break;
+		}
+
+		port->mlp_fec_requested = fec;
+
+		/*
+		 * For FEC to become effective, the link needs to go back
+		 * to the training and negotiation state. This happens when
+		 * the link transitions from down to up; force a relink.
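+		 * The relink itself happens after this switch statement,
+		 * by toggling the port administrative status (PAOS) down
+		 * and back up.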
+ */ + relink = B_TRUE; + break; + default: ret = ENOTSUP; break; } + if (relink) { + if (!mlxcx_cmd_modify_port_status(mlxp, port, + MLXCX_PORT_STATUS_DOWN) || + !mlxcx_cmd_modify_port_status(mlxp, port, + MLXCX_PORT_STATUS_UP)) { + ret = EIO; + } + } mutex_exit(&port->mlp_mtx); return (ret); @@ -1260,6 +1364,21 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, } *(uint8_t *)pr_val = port->mlp_autoneg; break; + case MAC_PROP_ADV_FEC_CAP: + if (pr_valsize < sizeof (link_fec_t)) { + ret = EOVERFLOW; + break; + } + *(link_fec_t *)pr_val = + mlxcx_fec_to_link_fec(port->mlp_fec_active); + break; + case MAC_PROP_EN_FEC_CAP: + if (pr_valsize < sizeof (link_fec_t)) { + ret = EOVERFLOW; + break; + } + *(link_fec_t *)pr_val = port->mlp_fec_requested; + break; case MAC_PROP_MTU: if (pr_valsize < sizeof (uint32_t)) { ret = EOVERFLOW; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 4dc4291b08..aed691897b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -355,6 +355,7 @@ mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port) mutex_enter(&port->mlp_mtx); (void) mlxcx_cmd_query_port_status(mlxp, port); (void) mlxcx_cmd_query_port_speed(mlxp, port); + (void) mlxcx_cmd_query_port_fec(mlxp, port); switch (port->mlp_oper_status) { case MLXCX_PORT_STATUS_UP: diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 6d09abea5c..abd717842d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -2463,6 +2463,59 @@ typedef struct { }; } mlxcx_reg_ppcnt_t; +typedef enum { + MLXCX_PPLM_FEC_CAP_AUTO = 0, + MLXCX_PPLM_FEC_CAP_NONE = (1 << 0), + MLXCX_PPLM_FEC_CAP_FIRECODE = (1 << 1), + MLXCX_PPLM_FEC_CAP_RS = (1 << 2), +} mlxcx_pplm_fec_caps_t; + +typedef enum { + MLXCX_PPLM_FEC_ACTIVE_NONE = (1 << 0), + MLXCX_PPLM_FEC_ACTIVE_FIRECODE = (1 << 1), + MLXCX_PPLM_FEC_ACTIVE_RS528 = (1 << 2), + MLXCX_PPLM_FEC_ACTIVE_RS271 = (1 << 3), + MLXCX_PPLM_FEC_ACTIVE_RS544 = (1 << 7), + MLXCX_PPLM_FEC_ACTIVE_RS272 = (1 << 9), +} mlxcx_pplm_fec_active_t; + +/* CSTYLED */ +#define MLXCX_PPLM_CAP_56G (bitdef_t){ 16, 0x000f0000 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_100G (bitdef_t){ 12, 0x0000f000 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_50G (bitdef_t){ 8, 0x00000f00 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_25G (bitdef_t){ 4, 0x000000f0 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_10_40G (bitdef_t){ 0, 0x0000000f } + +typedef struct { + uint8_t mlrd_pplm_rsvd; + uint8_t mlrd_pplm_local_port; + uint8_t mlrd_pplm_rsvd1[11]; + uint24be_t mlrd_pplm_fec_mode_active; + bits32_t mlrd_pplm_fec_override_cap; + bits32_t mlrd_pplm_fec_override_admin; + uint16be_t mlrd_pplm_fec_override_cap_400g_8x; + uint16be_t mlrd_pplm_fec_override_cap_200g_4x; + uint16be_t mlrd_pplm_fec_override_cap_100g_2x; + uint16be_t mlrd_pplm_fec_override_cap_50g_1x; + uint16be_t mlrd_pplm_fec_override_admin_400g_8x; + uint16be_t mlrd_pplm_fec_override_admin_200g_4x; + uint16be_t mlrd_pplm_fec_override_admin_100g_2x; + uint16be_t mlrd_pplm_fec_override_admin_50g_1x; + uint8_t mlrd_pplm_rsvd2[8]; + uint16be_t mlrd_pplm_fec_override_cap_hdr; + uint16be_t mlrd_pplm_fec_override_cap_edr; + uint16be_t mlrd_pplm_fec_override_cap_fdr; + uint16be_t mlrd_pplm_fec_override_cap_fdr10; + uint16be_t mlrd_pplm_fec_override_admin_hdr; + uint16be_t mlrd_pplm_fec_override_admin_edr; + uint16be_t mlrd_pplm_fec_override_admin_fdr; + uint16be_t 
mlrd_pplm_fec_override_admin_fdr10; +} mlxcx_reg_pplm_t; + typedef enum { MLXCX_REG_PMTU = 0x5003, MLXCX_REG_PTYS = 0x5004, @@ -2472,6 +2525,7 @@ typedef enum { MLXCX_REG_MLCR = 0x902B, MLXCX_REG_MCIA = 0x9014, MLXCX_REG_PPCNT = 0x5008, + MLXCX_REG_PPLM = 0x5023, } mlxcx_register_id_t; typedef union { @@ -2482,6 +2536,7 @@ typedef union { mlxcx_reg_pmaos_t mlrd_pmaos; mlxcx_reg_mcia_t mlrd_mcia; mlxcx_reg_ppcnt_t mlrd_ppcnt; + mlxcx_reg_pplm_t mlrd_pplm; } mlxcx_register_data_t; typedef enum { diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 2ce448fc3d..00d9901719 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore + * Copyright 2020 RackTop Systems, Inc. */ #ifndef _SYS_MAC_H @@ -87,6 +88,13 @@ typedef enum { LINK_FLOWCTRL_BI } link_flowctrl_t; +typedef enum { + LINK_FEC_NONE = 1 << 0, + LINK_FEC_AUTO = 1 << 1, + LINK_FEC_RS = 1 << 2, + LINK_FEC_BASE_R = 1 << 3 +} link_fec_t; + typedef enum { LINK_TAGMODE_VLANONLY = 0, LINK_TAGMODE_NORMAL @@ -230,6 +238,8 @@ typedef enum { MAC_PROP_EN_25GFDX_CAP, MAC_PROP_ADV_50GFDX_CAP, MAC_PROP_EN_50GFDX_CAP, + MAC_PROP_EN_FEC_CAP, + MAC_PROP_ADV_FEC_CAP, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 9cc5c1ad5c..2cb326814a 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ #ifndef _SYS_MAC_PROVIDER_H @@ -630,6 +631,8 @@ extern void mac_prop_info_set_default_uint32( mac_prop_info_handle_t, uint32_t); extern void mac_prop_info_set_default_link_flowctrl( mac_prop_info_handle_t, link_flowctrl_t); +extern void mac_prop_info_set_default_fec( + mac_prop_info_handle_t, link_fec_t); extern void mac_prop_info_set_range_uint32( mac_prop_info_handle_t, uint32_t, uint32_t); -- cgit v1.2.3 From b22a70abf81f995ecc990b8444e63308bc389d5c Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Wed, 3 Jan 2018 21:11:35 +0000 Subject: 12679 want viona driver for bhyve Portions contributed by: Ryan Zezeski Portions contributed by: John Levon Portions contributed by: Jason King Portions contributed by: Robert Mustacchi Portions contributed by: Bryan Cantrill Reviewed by: Ryan Zezeski Approved by: Dan McDonald --- usr/src/cmd/bhyve/Makefile | 3 +- usr/src/cmd/bhyve/pci_emul.c | 5 + usr/src/cmd/bhyve/pci_emul.h | 7 + usr/src/cmd/bhyve/pci_virtio_viona.c | 494 +++++--- usr/src/cmd/devfsadm/i386/misc_link_i386.c | 6 + usr/src/man/man9e/mac.9e | 22 +- usr/src/pkg/manifests/system-bhyve.mf | 3 + usr/src/uts/common/inet/ip/ip6_output.c | 13 +- usr/src/uts/common/inet/ip/ip_output.c | 8 + usr/src/uts/common/inet/ipf/ip_fil_solaris.c | 335 +++++- usr/src/uts/common/inet/ipf/netinet/ipf_stack.h | 16 +- usr/src/uts/common/io/hook.c | 2 +- usr/src/uts/common/sys/dlpi.h | 7 +- usr/src/uts/common/sys/hook_impl.h | 4 +- usr/src/uts/common/sys/neti.h | 5 +- usr/src/uts/i86pc/Makefile.files | 6 +- usr/src/uts/i86pc/Makefile.i86pc | 1 + usr/src/uts/i86pc/io/viona/viona.c | 1409 ----------------------- usr/src/uts/i86pc/io/viona/viona.mapfile | 41 + usr/src/uts/i86pc/io/viona/viona_hook.c | 438 +++++++ usr/src/uts/i86pc/io/viona/viona_impl.h | 326 ++++++ 
usr/src/uts/i86pc/io/viona/viona_main.c | 991 ++++++++++++++++ usr/src/uts/i86pc/io/viona/viona_ring.c | 638 ++++++++++ usr/src/uts/i86pc/io/viona/viona_rx.c | 718 ++++++++++++ usr/src/uts/i86pc/io/viona/viona_tx.c | 756 ++++++++++++ usr/src/uts/i86pc/sys/viona_io.h | 49 +- usr/src/uts/i86pc/sys/vmm_drv.h | 3 + usr/src/uts/i86pc/viona/Makefile | 13 +- usr/src/uts/intel/ipf/ipf.global-objs.debug64 | 9 +- 29 files changed, 4689 insertions(+), 1639 deletions(-) delete mode 100644 usr/src/uts/i86pc/io/viona/viona.c create mode 100644 usr/src/uts/i86pc/io/viona/viona.mapfile create mode 100644 usr/src/uts/i86pc/io/viona/viona_hook.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_impl.h create mode 100644 usr/src/uts/i86pc/io/viona/viona_main.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_ring.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_rx.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_tx.c (limited to 'usr/src/man/man9e') diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index e96868e006..2301e6c8a6 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -58,6 +58,7 @@ SRCS = acpi.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ + pci_virtio_viona.c \ pci_xhci.c \ pm.c \ post.c \ @@ -120,7 +121,7 @@ CSTD= $(CSTD_GNU99) C99MODE= -xc99=%all C99LMODE= -Xc99=%all -$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz $(MEVENT_TEST_PROG) := LDLIBS += -lsocket .KEEP_STATE: diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 5118b31534..a71cc528aa 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1597,6 +1597,11 @@ pci_lintr_update(struct pci_devinst *pi) pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); +#ifndef __FreeBSD__ + if (pi->pi_d->pe_lintrupdate != NULL) { + pi->pi_d->pe_lintrupdate(pi); + } +#endif /* __FreeBSD__ */ } int diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 853badaadb..0053caed99 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -27,6 +27,9 @@ * * $FreeBSD$ */ +/* + * Copyright 2018 Joyent, Inc. + */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ @@ -71,6 +74,10 @@ struct pci_devemu { uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + +#ifndef __FreeBSD__ + void (*pe_lintrupdate)(struct pci_devinst *pi); +#endif /* __FreeBSD__ */ }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index e5a5cb584f..9cafa7b111 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,7 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -84,18 +84,6 @@ #define VIONA_REGSZ VIONA_R_MAX+1 -/* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | \ - VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - /* * Queue definitions. 
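+ * (RX and TX rings plus a control-queue slot; the control queue is
+ * still unimplemented -- see pci_viona_qsize() below.)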
*/ @@ -108,7 +96,7 @@ /* * Debug printf */ -static int pci_viona_debug; +static volatile int pci_viona_debug; #define DPRINTF(params) if (pci_viona_debug) printf params #define WPRINTF(params) printf params @@ -124,26 +112,20 @@ struct pci_viona_softc { int vsc_isr; datalink_id_t vsc_linkid; - char vsc_linkname[MAXLINKNAMELEN]; int vsc_vnafd; + /* Configurable parameters */ + char vsc_linkname[MAXLINKNAMELEN]; + uint32_t vsc_feature_mask; + uint16_t vsc_vq_size; + uint32_t vsc_features; uint8_t vsc_macaddr[6]; uint64_t vsc_pfn[VIONA_MAXQ]; uint16_t vsc_msix_table_idx[VIONA_MAXQ]; - /* - * Flag to see if host is already sending data out. - * If it is, no need to wait for lock and send interrupt to host - * for new data. - */ - boolean_t vsc_tx_kick_lock_held; - - pthread_t tx_tid; - pthread_mutex_t tx_mtx; - pthread_cond_t tx_cond; + boolean_t vsc_msix_active; }; -#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx) /* * Return the size of IO BAR that maps virtio header and device specific @@ -160,47 +142,44 @@ pci_viona_iosize(struct pci_devinst *pi) } static uint16_t -pci_viona_qsize(int qnum) +pci_viona_qsize(struct pci_viona_softc *sc, int qnum) { /* XXX no ctl queue currently */ if (qnum == VIONA_CTLQ) { return (0); } - /* XXX fixed currently. Maybe different for tx/rx/ctl */ - return (VIONA_RINGSZ); + return (sc->vsc_vq_size); } static void pci_viona_ring_reset(struct pci_viona_softc *sc, int ring) { - int error; - assert(ring < VIONA_MAXQ); switch (ring) { case VIONA_RXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona rx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_RXQ] = 0; - } - break; case VIONA_TXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona tx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_TXQ] = 0; - } break; case VIONA_CTLQ: default: - break; + return; + } + + for (;;) { + int res; + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring); + if (res == 0) { + break; + } else if (errno != EINTR) { + WPRINTF(("ioctl viona ring %d reset failed %d\n", + ring, errno)); + return; + } } + + sc->vsc_pfn[ring] = 0; } static void @@ -220,11 +199,11 @@ static void * pci_viona_poll_thread(void *param) { struct pci_viona_softc *sc = param; - pollfd_t pollset; - int error; + pollfd_t pollset; + const int fd = sc->vsc_vnafd; - pollset.fd = sc->vsc_vnafd; - pollset.events = POLLIN | POLLOUT; + pollset.fd = fd; + pollset.events = POLLRDBAND; for (;;) { if (poll(&pollset, 1, -1) < 0) { @@ -236,23 +215,35 @@ pci_viona_poll_thread(void *param) break; } } - if (pollset.revents & POLLIN) { - pci_generate_msix(sc->vsc_pi, - sc->vsc_msix_table_idx[VIONA_RXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona rx intr clear failed" - " %d\n", error)); + if (pollset.revents & POLLRDBAND) { + vioc_intr_poll_t vip; + uint_t i; + int res; + boolean_t assert_lintr = B_FALSE; + const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi); + + res = ioctl(fd, VNA_IOC_INTR_POLL, &vip); + for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) { + if (vip.vip_status[i] == 0) { + continue; + } + if (do_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[i]); + } else { + assert_lintr = B_TRUE; + } + res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i); + if (res != 0) { + WPRINTF(("ioctl viona vq %d intr " + "clear failed %d\n", i, errno)); + } } - } - - if (pollset.revents & POLLOUT) { - pci_generate_msix(sc->vsc_pi, - 
sc->vsc_msix_table_idx[VIONA_TXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona tx intr clear failed" - " %d\n", error)); + if (assert_lintr) { + pthread_mutex_lock(&sc->vsc_mtx); + sc->vsc_isr |= VTCFG_ISR_QUEUES; + pci_lintr_assert(sc->vsc_pi); + pthread_mutex_unlock(&sc->vsc_mtx); } } } @@ -260,57 +251,6 @@ pci_viona_poll_thread(void *param) pthread_exit(NULL); } -static void -pci_viona_ping_rxq(struct pci_viona_softc *sc) -{ - int error; - - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK); - if (error != 0) { - WPRINTF(("ioctl viona rx ring kick failed %d\n", error)); - } -} - -static void * -pci_viona_tx_thread(void *param) -{ - struct pci_viona_softc *sc = (struct pci_viona_softc *)param; - int error; - - pthread_mutex_lock(&sc->tx_mtx); - for (;;) { - error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); - assert(error == 0); - sc->vsc_tx_kick_lock_held = B_TRUE; - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK); - if (error != 0) { - WPRINTF(("ioctl viona tx ring kick failed %d\n", - error)); - } - sc->vsc_tx_kick_lock_held = B_FALSE; - } - pthread_mutex_unlock(&sc->tx_mtx); - - return (NULL); -} - -static void -pci_viona_ping_txq(struct pci_viona_softc *sc) -{ - /* Signal the tx thread for processing */ - if (sc->vsc_tx_kick_lock_held) - return; - pthread_mutex_lock(&sc->tx_mtx); - pthread_cond_signal(&sc->tx_cond); - pthread_mutex_unlock(&sc->tx_mtx); -} - -static void -pci_viona_ping_ctlq(struct pci_viona_softc *sc) -{ - DPRINTF(("viona: control qnotify!\n\r")); -} - static void pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) { @@ -320,29 +260,19 @@ pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) assert(qnum < VIONA_MAXQ); + if (qnum == VIONA_CTLQ) { + return; + } + sc->vsc_pfn[qnum] = (pfn << VRING_PFN); - vna_ri.ri_qsize = pci_viona_qsize(qnum); + vna_ri.ri_index = qnum; + vna_ri.ri_qsize = pci_viona_qsize(sc, qnum); vna_ri.ri_qaddr = (pfn << VRING_PFN); + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri); - switch (qnum) { - case VIONA_RXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri); - if (error != 0) { - WPRINTF(("ioctl viona rx ring init failed %d\n", - error)); - } - break; - case VIONA_TXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri); - if (error != 0) { - WPRINTF(("ioctl viona tx ring init failed %d\n", - error)); - } - break; - case VIONA_CTLQ: - default: - break; + if (error != 0) { + WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno)); } } @@ -350,36 +280,110 @@ static int pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) { vioc_create_t vna_create; -#if notyet - char devname[MAXNAMELEN]; - int ctlfd; -#endif int error; - sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); + sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL); if (sc->vsc_vnafd == -1) { - WPRINTF(("open viona ctl failed\n")); + WPRINTF(("open viona ctl failed: %d\n", errno)); return (-1); } vna_create.c_linkid = sc->vsc_linkid; - strlcpy(vna_create.c_vmname, vmname, - sizeof (vna_create.c_vmname)); -#if notyet - vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, - NULL); - vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), - &vna_create.c_himem_size, NULL); -#endif + vna_create.c_vmfd = vm_get_device_fd(ctx); error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); if (error != 0) { - WPRINTF(("ioctl viona create failed %d\n", error)); + (void) close(sc->vsc_vnafd); + WPRINTF(("ioctl viona create failed 
%d\n", errno));
 		return (-1);
 	}
 
 	return (0);
 }
 
+static int
+pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts)
+{
+	char *next, *cp, *vnic = NULL;
+	int err = 0;
+
+	sc->vsc_vq_size = VIONA_RINGSZ;
+	sc->vsc_feature_mask = 0;
+
+	for (; opts != NULL && *opts != '\0'; opts = next) {
+		char *val;
+
+		if ((cp = strchr(opts, ',')) != NULL) {
+			*cp = '\0';
+			next = cp + 1;
+		} else {
+			next = NULL;
+		}
+
+		if ((cp = strchr(opts, '=')) == NULL) {
+			/* vnic chosen with bare name */
+			if (vnic != NULL) {
+				fprintf(stderr,
+				    "viona: unexpected vnic name '%s'", opts);
+				err = -1;
+			} else {
+				vnic = opts;
+			}
+			continue;
+		}
+
+		/* key=value handling */
+		val = cp + 1;
+		*cp = '\0';
+		if (strcmp(opts, "feature_mask") == 0) {
+			long num;
+
+			errno = 0;
+			num = strtol(val, NULL, 0);
+			if (errno != 0 || num < 0) {
+				fprintf(stderr,
+				    "viona: invalid mask '%s'", val);
+			} else {
+				sc->vsc_feature_mask = num;
+			}
+		} else if (strcmp(opts, "vqsize") == 0) {
+			long num;
+
+			errno = 0;
+			num = strtol(val, NULL, 0);
+			if (errno != 0) {
+				fprintf(stderr,
+				    "viona: invalid vqsize '%s'", val);
+				err = -1;
+			} else if (num <= 2 || num > 32768) {
+				fprintf(stderr,
+				    "viona: vqsize %ld out of range", num);
+				err = -1;
+			} else if ((1 << (ffs(num) - 1)) != num) {
+				fprintf(stderr,
+				    "viona: vqsize %ld must be a power of 2",
+				    num);
+				err = -1;
+			} else {
+				sc->vsc_vq_size = num;
+			}
+		} else {
+			fprintf(stderr,
+			    "viona: unrecognized option '%s'", opts);
+			err = -1;
+		}
+	}
+	if (vnic == NULL) {
+		fprintf(stderr, "viona: vnic name required");
+		sc->vsc_linkname[0] = '\0';
+		err = -1;
+	} else {
+		(void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN);
+	}
+
+	DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc,
+	    sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask));
+	return (err);
+}
+
 static int
 pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
@@ -387,9 +391,9 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	dladm_status_t	status;
 	dladm_vnic_attr_t	attr;
 	char	errmsg[DLADM_STRSIZE];
-	int error;
+	int	error, i;
 	struct pci_viona_softc *sc;
-	int i;
+	uint64_t	ioport;
 
 	if (opts == NULL) {
 		printf("virtio-viona: vnic required\n");
@@ -404,7 +408,10 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
-	strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN);
+	if (pci_viona_parse_opts(sc, opts) != 0) {
+		free(sc);
+		return (1);
+	}
 
 	if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
 		WPRINTF(("could not open /dev/dld"));
@@ -430,7 +437,6 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 		return (1);
 	}
 
-	sc->vsc_tx_kick_lock_held = B_FALSE;
 	memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
 
 	dladm_close(handle);
@@ -449,42 +455,44 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	/* MSI-X support */
 	for (i = 0; i < VIONA_MAXQ; i++)
 		sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
 
-	/*
-	 * BAR 1 used to map MSI-X table and PBA
-	 */
+	/* BAR 1 used to map MSI-X table and PBA */
 	if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) {
 		free(sc);
 		return (1);
 	}
 
-	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+	/* BAR 0 for legacy-style virtio register access. */
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+	if (error != 0) {
+		WPRINTF(("could not allocate virtio BAR\n"));
+		free(sc);
+		return (1);
+	}
+
+	/* Install ioport hook for virtqueue notification */
+	ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY;
+	error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport);
+	if (error != 0) {
+		WPRINTF(("could not install ioport hook at %lx\n", ioport));
+		free(sc);
+		return (1);
+	}
 
 	/*
-	 * Initialize tx semaphore & spawn TX processing thread
-	 * As of now, only one thread for TX desc processing is
-	 * spawned.
+	 * Need a legacy interrupt for virtio compliance, even though MSI-X
+	 * operation is _strongly_ suggested for adequate performance.
 	 */
-	pthread_mutex_init(&sc->tx_mtx, NULL);
-	pthread_cond_init(&sc->tx_cond, NULL);
-	pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc);
+	pci_lintr_request(pi);
 
 	return (0);
 }
 
-/*
- * Function pointer array to handle queue notifications
- */
-static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = {
-	pci_viona_ping_rxq,
-	pci_viona_ping_txq,
-	pci_viona_ping_ctlq
-};
-
 static uint64_t
 viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
 {
@@ -500,6 +508,109 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
 	return (offset);
 }
 
+static void
+pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	struct msix_table_entry mte;
+	uint16_t tab_index;
+	vioc_ring_msi_t vrm;
+	int res;
+
+	assert(ring <= VIONA_VQ_TX);
+
+	vrm.rm_index = ring;
+	vrm.rm_addr = 0;
+	vrm.rm_msg = 0;
+	tab_index = sc->vsc_msix_table_idx[ring];
+
+	if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) {
+		mte = pi->pi_msix.table[tab_index];
+		if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+			vrm.rm_addr = mte.addr;
+			vrm.rm_msg = mte.msg_data;
+		}
+	}
+
+	res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm);
+	if (res != 0) {
+		WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno));
+	}
+}
+
+static void
+pci_viona_lintrupdate(struct pci_devinst *pi)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	boolean_t msix_on = B_FALSE;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+	msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0);
+	if ((sc->vsc_msix_active && !msix_on) ||
+	    (msix_on && !sc->vsc_msix_active)) {
+		uint_t i;
+
+		sc->vsc_msix_active = msix_on;
+		/* Update in-kernel ring configs */
+		for (i = 0; i <= VIONA_VQ_TX; i++) {
+			pci_viona_ring_set_msix(pi, i);
+		}
+	}
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset)
+{
+	struct pci_viona_softc *sc = pi->pi_arg;
+	uint_t tab_index, i;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+	if (!sc->vsc_msix_active) {
+		pthread_mutex_unlock(&sc->vsc_mtx);
+		return;
+	}
+
+	/*
+	 * Rather than update every possible MSI-X vector, cheat and use the
+	 * offset to calculate the entry within the table. Since this should
+	 * only be called when a write to the table succeeds, the index should
+	 * be valid.
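+	 * Each MSI-X table entry is MSIX_TABLE_ENTRY_SIZE (16) bytes wide,
+	 * so dividing the byte offset of the write by the entry size yields
+	 * the index of the entry that changed.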
+ */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + for (i = 0; i <= VIONA_VQ_TX; i++) { + if (sc->vsc_msix_table_idx[i] != tab_index) { + continue; + } + pci_viona_ring_set_msix(pi, i); + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_qnotify(struct pci_viona_softc *sc, int ring) +{ + int error; + + switch (ring) { + case VIONA_TXQ: + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring); + if (error != 0) { + WPRINTF(("ioctl viona ring %d kick failed %d\n", + ring, errno)); + } + break; + case VIONA_CTLQ: + DPRINTF(("viona: control qnotify!\n")); + break; + default: + break; + } +} + static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) @@ -510,7 +621,9 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { - pci_emul_msix_twrite(pi, offset, size, value); + if (pci_emul_msix_twrite(pi, offset, size, value) == 0) { + pci_viona_msix_update(pi, offset); + } return; } @@ -529,10 +642,14 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, switch (offset) { case VTCFG_R_GUESTCAP: assert(size == 4); + value &= ~(sc->vsc_feature_mask); err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl feature negotiation returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } else { + sc->vsc_features = value; + } break; case VTCFG_R_PFN: assert(size == 4); @@ -546,7 +663,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_QNOTIFY: assert(size == 2); assert(value < VIONA_MAXQ); - (*pci_viona_qnotify[value])(sc); + pci_viona_qnotify(sc, value); break; case VTCFG_R_STATUS: assert(size == 1); @@ -560,6 +677,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 2); assert(sc->vsc_curq != VIONA_CTLQ); sc->vsc_msix_table_idx[sc->vsc_curq] = value; + pci_viona_ring_set_msix(pi, sc->vsc_curq); break; case VIONA_R_CFG0: case VIONA_R_CFG1: @@ -597,7 +715,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, pthread_mutex_unlock(&sc->vsc_mtx); } -uint64_t +static uint64_t pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { @@ -627,9 +745,11 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_HOSTCAP: assert(size == 4); err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl get host features returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } + value &= ~sc->vsc_feature_mask; break; case VTCFG_R_GUESTCAP: assert(size == 4); @@ -641,7 +761,7 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, break; case VTCFG_R_QNUM: assert(size == 2); - value = pci_viona_qsize(sc->vsc_curq); + value = pci_viona_qsize(sc, sc->vsc_curq); break; case VTCFG_R_QSEL: assert(size == 2); @@ -659,6 +779,9 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 1); value = sc->vsc_isr; sc->vsc_isr = 0; /* a read clears this flag */ + if (value != 0) { + pci_lintr_deassert(pi); + } break; case VTCFG_R_CFGVEC: assert(size == 2); @@ -705,9 +828,10 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, } struct pci_devemu pci_de_viona = { - .pe_emu = "virtio-net-viona", + .pe_emu = "virtio-net-viona", .pe_init = pci_viona_init, .pe_barwrite = 
pci_viona_write, - .pe_barread = pci_viona_read + .pe_barread = pci_viona_read, + .pe_lintrupdate = pci_viona_lintrupdate }; PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 4aeea7d294..0f8e64551d 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -85,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "viona", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, + }, { "pseudo", "ddi_pseudo", "vmm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, } @@ -114,6 +117,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all }, + { "pseudo", "^viona$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all + }, { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, ILEVEL_0, devfsadm_rm_all } diff --git a/usr/src/man/man9e/mac.9e b/usr/src/man/man9e/mac.9e index 3a3f2ae90a..d3d066a564 100644 --- a/usr/src/man/man9e/mac.9e +++ b/usr/src/man/man9e/mac.9e @@ -570,24 +570,28 @@ The following set of flags may be combined through a bitwise inclusive OR: .Bl -tag -width Ds .It Sy HCKSUM_INET_PARTIAL This indicates that the hardware can calculate a partial checksum for -both IPv4 and IPv6; however, it requires the pseudo-header checksum be -calculated for it. +both IPv4 and IPv6 UDP and TCP packets; however, it requires the pseudo-header +checksum be calculated for it. The pseudo-header checksum will be available for the mblk_t when calling .Xr mac_hcksum_get 9F . -Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +Note this does not imply that the hardware is capable of calculating +the partial checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM flag. .It Sy HCKSUM_INET_FULL_V4 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv4 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv4 UDP or TCP packets only, and does not require a pseudo-header +checksum. Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM . .It Sy HCKSUM_INET_FULL_V6 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv6 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv6 UDP or TCP packets only, and does not require a pseudo-header +checksum. +Note this does not imply that the hardware is capable of calculating the +checksum for any other L4 protocols. .It Sy HCKSUM_IPHDRCKSUM This indicates that the hardware supports calculating the checksum for the IPv4 header itself. 
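For illustration of the clarified flag semantics above: a MAC provider reports
these flags when the framework queries the MAC_CAPAB_HCKSUM capability, by
having its mc_getcapab(9E) entry point fill in a uint32_t with the HCKSUM_*
values it supports. A minimal sketch follows, assuming a hypothetical driver
prefix "xx"; it is not part of this change.

#include <sys/types.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

/*
 * Hypothetical capability callback for a device that fully checksums
 * TCP and UDP over both IPv4 and IPv6 and can also compute the IPv4
 * header checksum. No partial (pseudo-header assisted) mode is claimed.
 */
static boolean_t
xx_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
	switch (cap) {
	case MAC_CAPAB_HCKSUM: {
		uint32_t *txflags = cap_data;

		*txflags = HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 |
		    HCKSUM_IPHDRCKSUM;
		return (B_TRUE);
	}
	default:
		/* Any capability not understood must be declined. */
		return (B_FALSE);
	}
}

A device limited to partial checksums would instead advertise
HCKSUM_INET_PARTIAL and rely on the stack-supplied pseudo-header checksum
described above.
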
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf index 2a51d4fc22..7fdeb81254 100644 --- a/usr/src/pkg/manifests/system-bhyve.mf +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -35,8 +35,11 @@ dir path=usr group=sys dir path=usr/kernel/drv group=sys dir path=usr/kernel/drv/$(ARCH64) group=sys dir path=usr/sbin +driver name=viona driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/viona file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/viona.conf file path=usr/kernel/drv/vmm.conf file path=usr/sbin/bhyve mode=0555 file path=usr/sbin/bhyvectl mode=0555 diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c index 6c5868ddde..143077ed32 100644 --- a/usr/src/uts/common/inet/ip/ip6_output.c +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { - cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); - cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + /* + * Currently we assume no HW support for ICMP checksum calc. + * + * When HW support is advertised for ICMP, we'll want the + * following to be set: + * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum + */ + + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } else { ip_hdr_cksum: /* No IP header checksum for IPv6 */ diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c index 1017240521..a0157d3c48 100644 --- a/usr/src/uts/common/inet/ip/ip_output.c +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1738,6 +1739,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); goto ip_hdr_cksum; + } else if (protocol == IPPROTO_ICMP) { + /* + * Note that we always calculate a SW checksum for ICMP. In the + * future, if HW support for ICMP is advertised, we can change + * this. 
+ */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); } else { ip_hdr_cksum: /* Calculate IPv4 header checksum */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index b80cf53882..2e55e6fab8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -22,6 +22,7 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21 #include #include #include +#include #include #include #include @@ -84,9 +85,19 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); + +static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); +static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, + void *)); + extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); +static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); +static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); + #if SOLARIS2 < 10 #if SOLARIS2 >= 7 u_int *ip_ttl_ptr = NULL; @@ -153,6 +164,12 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* viona hook names */ +char *hook_viona_in = "ipfilter_hookviona_in"; +char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; +char *hook_viona_out = "ipfilter_hookviona_out"; +char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -249,8 +266,40 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove notification of viona hooks + */ + net_instance_notify_unregister(ifs->ifs_netid, + ipf_hook_instance_notify); + #undef UNDO_HOOK + /* + * Normally, viona will unregister itself before ipldetach() is called, + * so these will be no-ops, but out of caution, we try to make sure + * we've removed any of our references. + */ + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_IN); + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_OUT); + + { + char netidstr[12]; /* Large enough for INT_MAX + NUL */ + (void) snprintf(netidstr, sizeof (netidstr), "%d", + ifs->ifs_netid); + + /* + * The notify callbacks expect the netid value passed as a + * string in the third argument. To prevent confusion if + * traced, we pass the same value the nethook framework would + * pass, even though the callback does not currently use the + * value. + */ + (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, + NULL, Hn_VIONA); + } + #ifdef IPFDEBUG cmn_err(CE_CONT, "ipldetach()\n"); #endif @@ -445,6 +494,21 @@ ipf_stack_t *ifs; goto hookup_failed; } + /* + * VIONA INET hooks. While the nethook framework allows us to register + * hooks for events that haven't been registered yet, we instead + * register and unregister our hooks in response to notifications + * about the viona hooks from the nethook framework. This prevents + * problems when the viona module gets unloaded while the ipf module + * does not. 
If we do not unregister our hooks after the viona module
+	 * is unloaded, the viona module cannot later re-register them if it
+	 * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded
+	 * even on DEBUG kernels, they do not experience this issue.
+	 */
+	if (net_instance_notify_register(id, ipf_hook_instance_notify,
+	    ifs) != 0)
+		goto hookup_failed;
+
 	/*
 	 * Reacquire ipf_global, now it is safe.
 	 */
@@ -508,6 +572,155 @@ hookup_failed:
 	return -1;
 }
 
+/* ------------------------------------------------------------------------ */
+/*
+ * Called whenever a nethook protocol is registered or unregistered. Currently
+ * only used to add or remove the hooks for viona.
+ *
+ * While the function signature requires returning int, nothing
+ * in usr/src/uts/common/io/hook.c that invokes the callbacks
+ * captures the return value (nor is there currently any documentation
+ * on what return values should be). For now at least, we'll return 0
+ * on success (or 'not applicable') or an error value. Even if the
+ * nethook framework doesn't use the return value, it can be observed via
+ * dtrace if needed.
+ */
+static int
+ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg,
+    const char *name, const char *dummy __unused, const char *he_name)
+{
+	ipf_stack_t *ifs = arg;
+	hook_t **hookpp;
+	char *hook_name, *hint_name;
+	hook_func_t hookfn;
+	boolean_t *hookedp;
+	hook_hint_t hint;
+	boolean_t out;
+	int ret = 0;
+
+	const boolean_t gz = ifs->ifs_gz_controlled;
+
+	/* We currently only care about viona hook notifications */
+	if (strcmp(name, Hn_VIONA) != 0)
+		return (0);
+
+	if (strcmp(he_name, NH_PHYSICAL_IN) == 0) {
+		out = B_FALSE;
+	} else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) {
+		out = B_TRUE;
+	} else {
+		/*
+		 * If we've added more hook events to viona, we must add
+		 * the corresponding handling here (even if it's just to
+		 * ignore it) to prevent the firewall from not working as
+		 * intended.
+		 */
+		cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__,
+		    he_name);
+
+		return (0);
+	}
+
+	if (out) {
+		hookpp = &ifs->ifs_ipfhookviona_out;
+		hookfn = ipf_hookviona_out;
+		hookedp = &ifs->ifs_hookviona_physical_out;
+		name = gz ? hook_viona_out_gz : hook_viona_out;
+		hint = gz ? HH_AFTER : HH_BEFORE;
+		hint_name = gz ? hook_viona_out : hook_viona_out_gz;
+	} else {
+		hookpp = &ifs->ifs_ipfhookviona_in;
+		hookfn = ipf_hookviona_in;
+		hookedp = &ifs->ifs_hookviona_physical_in;
+		name = gz ? hook_viona_in_gz : hook_viona_in;
+		hint = gz ? HH_BEFORE : HH_AFTER;
+		hint_name = gz ? hook_viona_in : hook_viona_in_gz;
+	}
+
+	switch (command) {
+	default:
+	case HN_NONE:
+		break;
+	case HN_REGISTER:
+		HOOK_INIT(*hookpp, hookfn, (char *)name, ifs);
+		(*hookpp)->h_hint = hint;
+		(*hookpp)->h_hintvalue = (uintptr_t)hint_name;
+		ret = net_hook_register(ifs->ifs_ipf_viona,
+		    (char *)he_name, *hookpp);
+		if (ret != 0) {
+			cmn_err(CE_NOTE, "%s: could not register hook "
+			    "(hook family=%s hook=%s) err=%d", __func__,
+			    name, he_name, ret);
+			*hookedp = B_FALSE;
+			return (ret);
+		}
+		*hookedp = B_TRUE;
+		break;
+	case HN_UNREGISTER:
+		if (ifs->ifs_ipf_viona == NULL)
+			break;
+
+		ret = *hookedp ? net_hook_unregister(ifs->ifs_ipf_viona,
+		    (char *)he_name, *hookpp) : 0;
+		if ((ret == 0 || ret == ENXIO)) {
+			if (*hookpp != NULL) {
+				hook_free(*hookpp);
+				*hookpp = NULL;
+			}
+			*hookedp = B_FALSE;
+		}
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * Called whenever a new nethook instance is created. Currently only used
+ * with the Hn_VIONA nethooks.
Similar to ipf_hook_protocol_notify, the
+ * function signature must return an int, though the result is never used.
+ * We elect to return 0 on success (or not applicable) or a non-zero value
+ * on error.
+ */
+static int
+ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
+    const char *netid, const char *dummy __unused, const char *instance)
+{
+	ipf_stack_t *ifs = arg;
+	int ret = 0;
+
+	/* We currently only care about viona hooks */
+	if (strcmp(instance, Hn_VIONA) != 0)
+		return (0);
+
+	switch (command) {
+	case HN_NONE:
+	default:
+		return (0);
+	case HN_REGISTER:
+		ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid,
+		    NHF_VIONA);
+
+		if (ifs->ifs_ipf_viona == NULL)
+			return (EPROTONOSUPPORT);
+
+		ret = net_protocol_notify_register(ifs->ifs_ipf_viona,
+		    ipf_hook_protocol_notify, ifs);
+		VERIFY(ret == 0 || ret == ESHUTDOWN);
+		break;
+	case HN_UNREGISTER:
+		if (ifs->ifs_ipf_viona == NULL)
+			break;
+		VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona,
+		    ipf_hook_protocol_notify));
+		VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
+		ifs->ifs_ipf_viona = NULL;
+		break;
+	}
+
+	return (ret);
+}
+
 static int fr_setipfloopback(set, ifs)
 int set;
 ipf_stack_t *ifs;
@@ -2043,6 +2256,124 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
 	return ipf_hook6(info, 1, FI_NOCKSUM, arg);
 }
 
+/* Static constants used by ipf_hook_ether */
+static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hook_ether */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: token(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The ipf_hook_ether hook is currently private to illumos. It represents */
+/* a layer 2 datapath generally used by virtual machines. Currently the */
+/* hook is only used by the viona driver to pass along L2 frames for */
+/* inspection. It requires that the L2 ethernet header is contained within */
+/* a single dblk_t (however layers above the L2 header have no restrictions */
+/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */
+/* filtering on a MAC address or ethertype), however virtual machines do */
+/* not have native IP stack instances where ipf traditionally hooks in. */
+/* Instead this entry point is used to determine if the packet is unicast, */
+/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */
+/* traditional ip hooks for filtering. Non-IPv4 or non-IPv6 packets are */
+/* not subject to examination. */
+/* ------------------------------------------------------------------------ */
+int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
+    boolean_t out)
+{
+	struct ether_header *ethp;
+	hook_pkt_event_t *hpe = (hook_pkt_event_t *)info;
+	mblk_t *mp;
+	size_t offset, len;
+	uint16_t etype;
+	boolean_t v6;
+
+	/*
+	 * viona will only pass us mblks with the L2 header contained in a
+	 * single data block.
+	 */
+	mp = *hpe->hpe_mp;
+	len = MBLKL(mp);
+
+	VERIFY3S(len, >=, sizeof (struct ether_header));
+
+	ethp = (struct ether_header *)mp->b_rptr;
+	if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) {
+		struct ether_vlan_header *evh =
+		    (struct ether_vlan_header *)ethp;
+
+		VERIFY3S(len, >=, sizeof (struct ether_vlan_header));
+
+		etype = ntohs(evh->ether_type);
+		offset = sizeof (*evh);
+	} else {
+		offset = sizeof (*ethp);
+	}
+
+	/*
+	 * ipf only supports filtering IPv4 and IPv6. Ignore other types.
+	 */
+	if (etype == ETHERTYPE_IP)
+		v6 = B_FALSE;
+	else if (etype == ETHERTYPE_IPV6)
+		v6 = B_TRUE;
+	else
+		return (0);
+
+	if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0)
+		hpe->hpe_flags |= HPE_BROADCAST;
+	else if (bcmp(ipf_eth_ipv4_mcast, ethp,
+	    sizeof (ipf_eth_ipv4_mcast)) == 0)
+		hpe->hpe_flags |= HPE_MULTICAST;
+	else if (bcmp(ipf_eth_ipv6_mcast, ethp,
+	    sizeof (ipf_eth_ipv6_mcast)) == 0)
+		hpe->hpe_flags |= HPE_MULTICAST;
+
+	/* Find the start of the IPv4 or IPv6 header */
+	for (; offset >= len; len = MBLKL(mp)) {
+		offset -= len;
+		mp = mp->b_cont;
+		if (mp == NULL) {
+			freemsg(*hpe->hpe_mp);
+			*hpe->hpe_mp = NULL;
+			return (-1);
+		}
+	}
+	hpe->hpe_mb = mp;
+	hpe->hpe_hdr = mp->b_rptr + offset;
+
+	return (v6 ? ipf_hook6(info, out, 0, arg) :
+	    ipf_hook(info, out, 0, arg));
+}
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookviona_{in,out} */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The viona hooks are private hooks to illumos. They represent a layer 2 */
+/* datapath generally used to implement virtual machines, passing */
+/* along L2 packets. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+int
+ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return (ipf_hook_ether(token, info, arg, B_FALSE));
+}
+
+int
+ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return (ipf_hook_ether(token, info, arg, B_TRUE));
+}
+
 /* ------------------------------------------------------------------------ */
 /* Function: ipf_hook4_loop_in */
 /* Returns: int - 0 == packet ok, else problem, free packet if not done */
@@ -2386,7 +2717,7 @@ fr_info_t *fin;
 #ifdef USE_INET6
 	struct in6_addr tmp_src6;
 #endif
-
+
 	ASSERT(fin->fin_p == IPPROTO_TCP);
 
 	/*
@@ -2428,7 +2759,7 @@ fr_info_t *fin;
 
 	if (tcp != NULL) {
-		/*
+		/*
 		 * Adjust TCP header:
 		 *	swap ports,
 		 *	set flags,
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..0ceea1e921 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -6,7 +6,7 @@
  * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/ #ifndef __IPF_STACK_H__ @@ -87,8 +87,8 @@ struct ipf_stack { #endif int ifs_ipf_locks_done; - ipftoken_t *ifs_ipftokenhead; - ipftoken_t **ifs_ipftokentail; + ipftoken_t *ifs_ipftokenhead; + ipftoken_t **ifs_ipftokentail; ipfmutex_t ifs_ipl_mutex; ipfmutex_t ifs_ipf_authmx; @@ -126,6 +126,9 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookviona_in; + hook_t *ifs_ipfhookviona_out; + /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; boolean_t ifs_hook4_physical_out; @@ -137,10 +140,13 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookviona_physical_in; + boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_viona; /* ip_auth.c */ int ifs_fr_authsize; @@ -167,8 +173,8 @@ struct ipf_stack { ipfr_t **ifs_ipfr_nattail; ipfr_t **ifs_ipfr_nattab; - ipfr_t *ifs_ipfr_ipidlist; - ipfr_t **ifs_ipfr_ipidtail; + ipfr_t *ifs_ipfr_ipidlist; + ipfr_t **ifs_ipfr_ipidtail; ipfr_t **ifs_ipfr_ipidtab; ipfrstat_t ifs_ipfr_stats; diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index eb139a37e2..44af26e7c4 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..54aad9307a 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -673,11 +674,11 @@ typedef struct { #define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */ /* capability */ #define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */ - /* ability */ + /* ability for TCP/UDP packets. */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ - /* ability for IPv4 packets. */ + /* ability for IPv4 TCP/UDP packets. */ #define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ - /* ability for IPv6 packets. */ + /* ability for IPv6 TCP/UDP packets. */ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018, Joyent, Inc. 
*/ /* @@ -171,7 +172,7 @@ typedef struct hook_family_int { cvwaitlock_t hfi_lock; SLIST_ENTRY(hook_family_int) hfi_entry; hook_event_int_head_t hfi_head; - hook_family_t hfi_family; + hook_family_t hfi_family; kstat_t *hfi_kstat; struct hook_stack *hfi_stack; hook_notify_head_t hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t; #define Hn_ARP "arp" #define Hn_IPV4 "inet" #define Hn_IPV6 "inet6" +#define Hn_VIONA "viona_inet" extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t); extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..e7027f8ece 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_NETI_H @@ -46,6 +48,7 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VIONA "NHF_VIONA" /* * Event identification @@ -61,7 +64,7 @@ struct msgb; /* avoiding sys/stream.h here */ /* * Network NIC hardware checksum capability */ -#define NET_HCK_NONE 0x00 +#define NET_HCK_NONE 0x00 #define NET_HCK_L3_FULL 0x01 #define NET_HCK_L3_PART 0x02 #define NET_HCK_L4_FULL 0x10 diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index ca4ae0cd65..312c0f233d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -276,7 +276,11 @@ VMM_OBJS += vmm.o \ vmm_support.o \ vmm_zsd.o -VIONA_OBJS += viona.o +VIONA_OBJS += viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o \ # # Build up defines and paths. diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index b66b0ca2da..b60d24d82c 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -247,6 +247,7 @@ DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub DRV_KMODS += vmm +DRV_KMODS += viona DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c deleted file mode 100644 index 2371a2f3ae..0000000000 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2013 Chris Torek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define MB (1024UL * 1024) -#define GB (1024UL * MB) - -/* - * Min. octets in an ethernet frame minus FCS - */ -#define MIN_BUF_SIZE 60 - -#define VIONA_NAME "Virtio Network Accelerator" - -#define VIONA_CTL_MINOR 0 -#define VIONA_CTL_NODE_NAME "ctl" - -#define VIONA_CLI_NAME "viona" - -#define VTNET_MAXSEGS 32 - -#define VRING_ALIGN 4096 - -#define VRING_DESC_F_NEXT (1 << 0) -#define VRING_DESC_F_WRITE (1 << 1) -#define VRING_DESC_F_INDIRECT (1 << 2) - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - -#define VRING_USED_F_NO_NOTIFY 1 - -#define BCM_NIC_DRIVER "bnxe" -/* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - -#pragma pack(1) -struct virtio_desc { - uint64_t vd_addr; - uint32_t vd_len; - uint16_t vd_flags; - uint16_t vd_next; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_used { - uint32_t vu_idx; - uint32_t vu_tlen; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_net_mrgrxhdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; - uint16_t vrh_bufs; -}; -struct virtio_net_hdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; -}; -#pragma pack() - -typedef struct viona_vring_hqueue { - /* Internal state */ - uint16_t hq_size; - kmutex_t hq_a_mutex; - kmutex_t hq_u_mutex; - uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - caddr_t hq_baseaddr; - uint16_t *hq_avail_flags; - uint16_t *hq_avail_idx; /* monotonically increasing */ - uint16_t *hq_avail_ring; - - uint16_t *hq_used_flags; - uint16_t *hq_used_idx; /* monotonically increasing */ - struct virtio_used *hq_used_ring; -} viona_vring_hqueue_t; - - -typedef struct viona_link { - datalink_id_t l_linkid; - - struct vm *l_vm; - size_t l_vm_lomemsize; - caddr_t l_vm_lomemaddr; - size_t l_vm_himemsize; - caddr_t l_vm_himemaddr; - - mac_handle_t l_mh; - mac_client_handle_t l_mch; - - kmem_cache_t *l_desb_kmc; - - pollhead_t l_pollhead; - - 
viona_vring_hqueue_t l_rx_vring; - uint_t l_rx_intr; - - viona_vring_hqueue_t l_tx_vring; - kcondvar_t l_tx_cv; - uint_t l_tx_intr; - kmutex_t l_tx_mutex; - int l_tx_outstanding; - uint32_t l_features; -} viona_link_t; - -typedef struct { - frtn_t d_frtn; - viona_link_t *d_link; - uint_t d_ref; - uint16_t d_cookie; - int d_len; -} viona_desb_t; - -typedef struct viona_soft_state { - viona_link_t *ss_link; -} viona_soft_state_t; - -typedef struct used_elem { - uint16_t id; - uint32_t len; -} used_elem_t; - -static void *viona_state; -static dev_info_t *viona_dip; -static id_space_t *viona_minor_ids; -/* - * copy tx mbufs from virtio ring to avoid necessitating a wait for packet - * transmission to free resources. - */ -static boolean_t copy_tx_mblks = B_TRUE; - -extern struct vm *vm_lookup_by_name(char *name); -extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len); - -static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); -static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); -static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); -static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval); -static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp); - -static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create); -static int viona_ioc_delete(viona_soft_state_t *ss); - -static int viona_vm_map(viona_link_t *link); -static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa); -static void viona_vm_unmap(viona_link_t *link); - -static int viona_ioc_rx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_tx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_rx_ring_reset(viona_link_t *link); -static int viona_ioc_tx_ring_reset(viona_link_t *link); -static void viona_ioc_rx_ring_kick(viona_link_t *link); -static void viona_ioc_tx_ring_kick(viona_link_t *link); -static int viona_ioc_rx_intr_clear(viona_link_t *link); -static int viona_ioc_tx_intr_clear(viona_link_t *link); - -static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback); -static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq); - -static struct cb_ops viona_cb_ops = { - viona_open, - viona_close, - nodev, - nodev, - nodev, - nodev, - nodev, - viona_ioctl, - nodev, - nodev, - nodev, - viona_chpoll, - ddi_prop_op, - 0, - D_MP | D_NEW | D_HOTPLUG, - CB_REV, - nodev, - nodev -}; - -static struct dev_ops viona_ops = { - DEVO_REV, - 0, - nodev, - nulldev, - nulldev, - viona_attach, - viona_detach, - nodev, - &viona_cb_ops, - NULL, - ddi_power, - ddi_quiesce_not_needed -}; - -static struct modldrv modldrv = { - &mod_driverops, - VIONA_NAME, - &viona_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -int -_init(void) -{ - int ret; - - ret = ddi_soft_state_init(&viona_state, - sizeof (viona_soft_state_t), 0); - if (ret == 0) { - ret = mod_install(&modlinkage); - if (ret != 0) { - ddi_soft_state_fini(&viona_state); - return (ret); - } - } - - return (ret); -} - -int -_fini(void) -{ - int ret; - - ret = mod_remove(&modlinkage); - if (ret == 0) { - ddi_soft_state_fini(&viona_state); - } - - return (ret); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static void -set_viona_tx_mode() -{ - major_t bcm_nic_major; - if ((bcm_nic_major = 
ddi_name_to_major(BCM_NIC_DRIVER)) - != DDI_MAJOR_T_NONE) { - if (ddi_hold_installed_driver(bcm_nic_major) != NULL) { - copy_tx_mblks = B_FALSE; - ddi_rele_driver(bcm_nic_major); - } - } -} - -static int -viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - viona_minor_ids = id_space_create("viona_minor_id", - VIONA_CTL_MINOR + 1, UINT16_MAX); - - if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME, - S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) { - return (DDI_FAILURE); - } - - viona_dip = dip; - - set_viona_tx_mode(); - ddi_report_dev(viona_dip); - - return (DDI_SUCCESS); -} - -static int -viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) { - return (DDI_FAILURE); - } - - id_space_destroy(viona_minor_ids); - - ddi_remove_minor_node(viona_dip, NULL); - - viona_dip = NULL; - - return (DDI_SUCCESS); -} - -static int -viona_open(dev_t *devp, int flag, int otype, cred_t *credp) -{ - int minor; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - if (getminor(*devp) != VIONA_CTL_MINOR) { - return (ENXIO); - } - - minor = id_alloc(viona_minor_ids); - if (minor == 0) { - /* All minors are busy */ - return (EBUSY); - } - - if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { - id_free(viona_minor_ids, minor); - } - - *devp = makedevice(getmajor(*devp), minor); - - return (0); -} - -static int -viona_close(dev_t dev, int flag, int otype, cred_t *credp) -{ - int minor; - viona_soft_state_t *ss; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - minor = getminor(dev); - - ss = ddi_get_soft_state(viona_state, minor); - if (ss == NULL) { - return (ENXIO); - } - - viona_ioc_delete(ss); - - ddi_soft_state_free(viona_state, minor); - - id_free(viona_minor_ids, minor); - - return (0); -} - -static int -viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval) -{ - viona_soft_state_t *ss; - int err = 0; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL) { - return (ENXIO); - } - - switch (cmd) { - case VNA_IOC_CREATE: - err = viona_ioc_create(ss, (vioc_create_t *)data); - break; - case VNA_IOC_DELETE: - err = viona_ioc_delete(ss); - break; - case VNA_IOC_SET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS; - break; - case VNA_IOC_GET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - *(int *)data = VIONA_S_HOSTCAPS; - break; - case VNA_IOC_RX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_RX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_reset(ss->ss_link); - break; - case VNA_IOC_RX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_rx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_TX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_TX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_reset(ss->ss_link); - break; - case VNA_IOC_TX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_tx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_RX_INTR_CLR: - if (ss->ss_link == NULL) { - 
return (ENOSYS); - } - err = viona_ioc_rx_intr_clear(ss->ss_link); - break; - case VNA_IOC_TX_INTR_CLR: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_intr_clear(ss->ss_link); - break; - default: - err = ENOTTY; - break; - } - - return (err); -} - -static int -viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp) -{ - viona_soft_state_t *ss; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL || ss->ss_link == NULL) { - return (ENXIO); - } - - *reventsp = 0; - - if (ss->ss_link->l_rx_intr && (events & POLLIN)) { - *reventsp |= POLLIN; - } - - if (ss->ss_link->l_tx_intr && (events & POLLOUT)) { - *reventsp |= POLLOUT; - } - - if (*reventsp == 0 && !anyyet) { - *phpp = &ss->ss_link->l_pollhead; - } - - return (0); -} - -static int -viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create) -{ - vioc_create_t k_create; - viona_link_t *link; - char cli_name[MAXNAMELEN]; - int err; - - if (ss->ss_link != NULL) { - return (ENOSYS); - } - if (copyin(u_create, &k_create, sizeof (k_create)) != 0) { - return (EFAULT); - } - - link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); - - link->l_linkid = k_create.c_linkid; - link->l_vm = vm_lookup_by_name(k_create.c_vmname); - if (link->l_vm == NULL) { - err = ENXIO; - goto bail; - } - - link->l_vm_lomemsize = k_create.c_lomem_size; - link->l_vm_himemsize = k_create.c_himem_size; - err = viona_vm_map(link); - if (err != 0) { - goto bail; - } - - err = mac_open_by_linkid(link->l_linkid, &link->l_mh); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_open_by_linkid" - " returned %d\n", err); - goto bail; - } - - snprintf(cli_name, sizeof (cli_name), "%s-%d", - VIONA_CLI_NAME, link->l_linkid); - err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_client_open" - " returned %d\n", err); - goto bail; - } - - link->l_features = VIONA_S_HOSTCAPS; - link->l_desb_kmc = kmem_cache_create(cli_name, - sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - if (copy_tx_mblks) { - mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL); - cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL); - } - ss->ss_link = link; - - return (0); - -bail: - if (link->l_mch != NULL) { - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - kmem_free(link, sizeof (viona_link_t)); - - return (err); -} - -static int -viona_ioc_delete(viona_soft_state_t *ss) -{ - viona_link_t *link; - - link = ss->ss_link; - if (link == NULL) { - return (ENOSYS); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - while (link->l_tx_outstanding != 0) { - cv_wait(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - if (link->l_mch != NULL) { - mac_rx_clear(link->l_mch); - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - viona_vm_unmap(link); - mutex_destroy(&link->l_tx_vring.hq_a_mutex); - mutex_destroy(&link->l_tx_vring.hq_u_mutex); - mutex_destroy(&link->l_rx_vring.hq_a_mutex); - mutex_destroy(&link->l_rx_vring.hq_u_mutex); - if (copy_tx_mblks) { - mutex_destroy(&link->l_tx_mutex); - cv_destroy(&link->l_tx_cv); - } - - 
kmem_cache_destroy(link->l_desb_kmc); - - kmem_free(link, sizeof (viona_link_t)); - - ss->ss_link = NULL; - - return (0); -} - -static caddr_t -viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len) -{ - caddr_t addr; - size_t offset; - pfn_t pfnum; - - if (len == 0) - return (NULL); - - addr = vmem_alloc(heap_arena, len, VM_SLEEP); - if (addr == NULL) - return (NULL); - - for (offset = 0; offset < len; offset += PAGESIZE) { - pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE)); - ASSERT(pfnum); - hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum, - PROT_READ | PROT_WRITE, HAT_LOAD_LOCK); - } - - return (addr); -} - -/* - * Map the guest physical address space into the kernel virtual address space. - */ -static int -viona_vm_map(viona_link_t *link) -{ - link->l_vm_lomemaddr = viona_mapin_vm_chunk(link, - 0, link->l_vm_lomemsize); - if (link->l_vm_lomemaddr == NULL) - return (-1); - link->l_vm_himemaddr = viona_mapin_vm_chunk(link, - 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize); - if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL) - return (-1); - - return (0); -} - -/* - * Translate a guest physical address into a kernel virtual address. - */ -static caddr_t -viona_gpa2kva(viona_link_t *link, uint64_t gpa) -{ - if (gpa < link->l_vm_lomemsize) - return (link->l_vm_lomemaddr + gpa); - - gpa -= (4 * GB); - if (gpa < link->l_vm_himemsize) - return (link->l_vm_himemaddr + gpa); - - return (NULL); -} - -static void -viona_vm_unmap(viona_link_t *link) -{ - if (link->l_vm_lomemaddr) { - hat_unload(kas.a_hat, link->l_vm_lomemaddr, - link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_lomemaddr, - link->l_vm_lomemsize); - } - if (link->l_vm_himemaddr) { - hat_unload(kas.a_hat, link->l_vm_himemaddr, - link->l_vm_himemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_himemaddr, - link->l_vm_himemsize); - } -} - -static int -viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq, - vioc_ring_init_t *u_ri) -{ - vioc_ring_init_t k_ri; - - if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) { - return (EFAULT); - } - - hq->hq_size = k_ri.ri_qsize; - hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr); - if (hq->hq_baseaddr == NULL) - return (EINVAL); - - hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link, - k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc))); - if (hq->hq_avail_flags == NULL) - return (EINVAL); - hq->hq_avail_idx = hq->hq_avail_flags + 1; - hq->hq_avail_ring = hq->hq_avail_flags + 2; - - hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link, - P2ROUNDUP(k_ri.ri_qaddr + - hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN))); - if (hq->hq_used_flags == NULL) - return (EINVAL); - hq->hq_used_idx = hq->hq_used_flags + 1; - hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); - - /* - * Initialize queue indexes - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int -viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - int rval; - - hq = &link->l_rx_vring; - - rval = viona_ioc_ring_init_common(link, hq, u_ri); - if (rval != 0) { - return (rval); - } - - return (0); -} - -static int -viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return (viona_ioc_ring_init_common(link, hq, u_ri)); -} - -static int -viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq) -{ - /* - * Reset all soft state - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int 
-viona_ioc_rx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - mac_rx_clear(link->l_mch); - - hq = &link->l_rx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static int -viona_ioc_tx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static void -viona_ioc_rx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_rx_vring; - - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - - mac_rx_set(link->l_mch, viona_rx, link); -} - -/* - * Return the number of available descriptors in the vring taking care - * of the 16-bit index wraparound. - */ -static inline int -viona_hq_num_avail(viona_vring_hqueue_t *hq) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) in GF(216). - * - * The only glitch here is that in standard C, - * uint16_t promotes to (signed) int when int has - * more than 16 bits (pretty much always now), so - * we have to force it back to unsigned. - */ - ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; - - ASSERT(ndesc <= hq->hq_size); - - return (ndesc); -} - -static void -viona_ioc_tx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_tx_vring; - - do { - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - while (viona_hq_num_avail(hq)) { - viona_tx(link, hq); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (link->l_tx_outstanding != 0) { - cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY); - } while (viona_hq_num_avail(hq)); -} - -static int -viona_ioc_rx_intr_clear(viona_link_t *link) -{ - link->l_rx_intr = 0; - - return (0); -} - -static int -viona_ioc_tx_intr_clear(viona_link_t *link) -{ - link->l_tx_intr = 0; - - return (0); -} -#define VQ_MAX_DESCRIPTORS 512 - -static int -vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, - int n_iov, uint16_t *cookie) -{ - int i; - int ndesc, nindir; - int idx, head, next; - struct virtio_desc *vdir, *vindir, *vp; - - idx = hq->hq_cur_aidx; - ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx); - - if (ndesc == 0) - return (0); - if (ndesc > hq->hq_size) { - cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc); - return (-1); - } - - head = hq->hq_avail_ring[idx & (hq->hq_size - 1)]; - next = head; - - for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { - if (next >= hq->hq_size) { - cmn_err(CE_NOTE, "descriptor index (%d)" - "out of range\n", next); - return (-1); - } - - vdir = (struct virtio_desc *)(hq->hq_baseaddr + - next * sizeof (struct virtio_desc)); - if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { - if (i > n_iov) - return (-1); - iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - iov[i++].iov_len = vdir->vd_len; - } else { - nindir = vdir->vd_len / 16; - if ((vdir->vd_len & 0xf) || nindir == 0) { - cmn_err(CE_NOTE, "invalid indir len 0x%x\n", - vdir->vd_len); - return (-1); - } - vindir = (struct virtio_desc *) - viona_gpa2kva(link, vdir->vd_addr); - if (vindir == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - next = 0; - for (;;) { - vp = &vindir[next]; - if (vp->vd_flags & VRING_DESC_F_INDIRECT) { - cmn_err(CE_NOTE, "indirect desc" - " has INDIR flag\n"); - 
return (-1); - } - if (i > n_iov) - return (-1); - iov[i].iov_base = - viona_gpa2kva(link, vp->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest" - " physical address 0x%"PRIx64"\n", - vp->vd_addr); - return (-1); - } - iov[i++].iov_len = vp->vd_len; - - if (i > VQ_MAX_DESCRIPTORS) - goto loopy; - if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) - break; - - next = vp->vd_next; - if (next >= nindir) { - cmn_err(CE_NOTE, "invalid next" - " %d > %d\n", next, nindir); - return (-1); - } - } - } - if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) { - *cookie = head; - hq->hq_cur_aidx++; - return (i); - } - } - -loopy: - cmn_err(CE_NOTE, "%d > descriptor loop count\n", i); - - return (-1); -} - -static void -vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie) -{ - struct virtio_used *vu; - int uidx; - - uidx = *hq->hq_used_idx; - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = cookie; - vu->vu_tlen = len; - membar_producer(); - *hq->hq_used_idx = uidx; -} - -static void -vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem) -{ - struct virtio_used *vu; - int uidx; - int i; - - uidx = *hq->hq_used_idx; - if (num_bufs == 1) { - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = elem[0].id; - vu->vu_tlen = elem[0].len; - } else { - for (i = 0; i < num_bufs; i++) { - vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)]; - vu->vu_idx = elem[i].id; - vu->vu_tlen = elem[i].len; - } - uidx = uidx + num_bufs; - } - membar_producer(); - *hq->hq_used_idx = uidx; -} - -/* - * Copy bytes from mp to iov. - * copied_buf: Total num_bytes copied from mblk to iov array. - * buf: pointer to iov_base. - * i: index of iov array. Mainly used to identify if we are - * dealing with first iov array element. - * rxhdr_size: Virtio header size. Two possibilities in case - * of MRGRX buf, header has 2 additional bytes. - * In case of mrgrx, virtio header should be part of iov[0]. - * In case of non-mrgrx, virtio header may or may not be part - * of iov[0]. - */ -static int -copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov, - int i, int rxhdr_size) -{ - int copied_chunk = 0; - mblk_t *ml; - int total_buf_len = iov->iov_len; - /* - * iov[0] might have header, adjust - * total_buf_len accordingly - */ - if (i == 0) { - total_buf_len = iov->iov_len - rxhdr_size; - } - for (ml = mp; ml != NULL; ml = ml->b_cont) { - size_t chunk = MBLKL(ml); - /* - * If chunk is less than - * copied_buf we should move - * to correct msgblk - */ - if (copied_buf != 0) { - if (copied_buf < chunk) { - chunk -= copied_buf; - } else { - copied_buf -= chunk; - continue; - } - } - /* - * iov[0] already has virtio header. - * and if copied chunk is length of iov_len break - */ - if (copied_chunk == total_buf_len) { - break; - } - /* - * Sometimes chunk is total mblk len, sometimes mblk is - * divided into multiple chunks. 
- */ - if (chunk > copied_buf) { - if (chunk > copied_chunk) { - if ((chunk + copied_chunk) > total_buf_len) - chunk = (size_t)total_buf_len - - copied_chunk; - } else { - if (chunk > (total_buf_len - copied_chunk)) - chunk = (size_t)((total_buf_len - - copied_chunk) - chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } else { - if (chunk > (total_buf_len - copied_chunk)) { - chunk = (size_t)(total_buf_len - copied_chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } - buf += chunk; - copied_chunk += chunk; - } - return (copied_chunk); -} - -static void -viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) -{ - viona_link_t *link = arg; - viona_vring_hqueue_t *hq = &link->l_rx_vring; - mblk_t *mp0 = mp; - - while (viona_hq_num_avail(hq)) { - struct iovec iov[VTNET_MAXSEGS]; - size_t mblklen; - int n, i = 0; - uint16_t cookie; - struct virtio_net_hdr *vrx = NULL; - struct virtio_net_mrgrxhdr *vmrgrx = NULL; -#if notyet - mblk_t *ml; -#endif - caddr_t buf = NULL; - int total_len = 0; - int copied_buf = 0; - int num_bufs = 0; - int num_pops = 0; - used_elem_t uelem[VTNET_MAXSEGS]; - - if (mp == NULL) { - break; - } - mblklen = msgsize(mp); - if (mblklen == 0) { - break; - } - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - break; - } - num_pops++; - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - int total_n = n; - int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; - if (n == 1) { - buf = iov[0].iov_base + mrgrxhdr_size; - } - while (mblklen > copied_buf) { - if (total_n == i) { - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, &iov[i], - VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - freemsgchain(mp0); - return; - } - num_pops++; - total_n += n; - } - if (total_n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], i, - mrgrxhdr_size); - copied_buf += copied_chunk; - uelem[i].id = cookie; - uelem[i].len = copied_chunk; - if (i == 0) { - uelem[i].len += mrgrxhdr_size; - } - } - num_bufs++; - i++; - } - } else { - boolean_t virt_hdr_incl_iov = B_FALSE; - int rxhdr_size = sizeof (struct virtio_net_hdr); - /* First element is header */ - vrx = (struct virtio_net_hdr *)iov[0].iov_base; - if (n == 1 || iov[0].iov_len > rxhdr_size) { - buf = iov[0].iov_base + rxhdr_size; - virt_hdr_incl_iov = B_TRUE; - total_len += rxhdr_size; - if (iov[0].iov_len < rxhdr_size) { - // Buff too small to fit pkt. Drop it. - freemsgchain(mp0); - return; - } - } else { - total_len = iov[0].iov_len; - } - if (iov[0].iov_len == rxhdr_size) - i++; - while (mblklen > copied_buf) { - if (n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - /* - * In case of non-mrgrx buf, first - * descriptor always has header and - * rest of the descriptors have data. - * But it is not guaranteed that first - * descriptor will only have virtio - * header. It might also have data. 
- */ - if (virt_hdr_incl_iov) { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, rxhdr_size); - } else { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, 0); - } - copied_buf += copied_chunk; - total_len += copied_chunk; - } else { - /* - * Drop packet as it cant fit - * in buf provided by guest. - */ - freemsgchain(mp0); - return; - } - i++; - } - } - /* - * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. - */ - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr)); - vmrgrx->vrh_bufs = num_bufs; - /* - * Make sure iov[0].iov_len >= MIN_BUF_SIZE - * otherwise guest will consider it as invalid frame. - */ - if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) { - uelem[0].len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain_mrgrx(hq, num_pops, uelem); - mutex_exit(&hq->hq_u_mutex); - } else { - memset(vrx, 0, sizeof (struct virtio_net_hdr)); - if (total_len < MIN_BUF_SIZE) { - total_len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, total_len, cookie); - mutex_exit(&hq->hq_u_mutex); - } - - mp = mp->b_next; - } - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLIN); - } - } - - freemsgchain(mp0); -} - -static void -viona_desb_free(viona_desb_t *dp) -{ - viona_link_t *link; - viona_vring_hqueue_t *hq; -#if notyet - struct virtio_used *vu; - int uidx; -#endif - uint_t ref; - - ref = atomic_dec_uint_nv(&dp->d_ref); - if (ref != 0) - return; - - link = dp->d_link; - hq = &link->l_tx_vring; - - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, dp->d_len, dp->d_cookie); - mutex_exit(&hq->hq_u_mutex); - - kmem_cache_free(link->l_desb_kmc, dp); - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLOUT); - } - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (--link->l_tx_outstanding == 0) { - cv_broadcast(&link->l_tx_cv); - } - mutex_exit(&link->l_tx_mutex); - } -} - -static void -viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq) -{ - struct iovec iov[VTNET_MAXSEGS]; - uint16_t cookie; - int i, n; - mblk_t *mp_head, *mp_tail, *mp; - viona_desb_t *dp; - mac_client_handle_t link_mch = link->l_mch; - - mp_head = mp_tail = NULL; - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - ASSERT(n != 0); - - dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP); - dp->d_frtn.free_func = viona_desb_free; - dp->d_frtn.free_arg = (void *)dp; - dp->d_link = link; - dp->d_cookie = cookie; - - dp->d_ref = 0; - dp->d_len = iov[0].iov_len; - - for (i = 1; i < n; i++) { - dp->d_ref++; - dp->d_len += iov[i].iov_len; - if (copy_tx_mblks) { - mp = desballoc((uchar_t *)iov[i].iov_base, - iov[i].iov_len, BPRI_MED, &dp->d_frtn); - ASSERT(mp); - } else { - mp = allocb(iov[i].iov_len, BPRI_MED); - ASSERT(mp); - bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr, - iov[i].iov_len); - } - mp->b_wptr += iov[i].iov_len; - if (mp_head == NULL) { - ASSERT(mp_tail == NULL); - mp_head = mp; - } else { - ASSERT(mp_tail != NULL); - mp_tail->b_cont = mp; - } - mp_tail = mp; - } - if (copy_tx_mblks == B_FALSE) { - viona_desb_free(dp); - } - if 
(copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - link->l_tx_outstanding++; - mutex_exit(&link->l_tx_mutex); - } - mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); -} diff --git a/usr/src/uts/i86pc/io/viona/viona.mapfile b/usr/src/uts/i86pc/io/viona/viona.mapfile new file mode 100644 index 0000000000..cece86348c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. + */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? 
vnh->vnh_token_out : vnh->vnh_token_in;
+
+	if (!he.he_interested)
+		return (0);
+
+	info.hpe_protocol = vnh->vnh_neti;
+	info.hpe_ifp = (phy_if_t)link;
+	info.hpe_ofp = (phy_if_t)link;
+	info.hpe_mp = mpp;
+	info.hpe_flags = 0;
+
+	ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info);
+	if (ret == 0)
+		return (0);
+
+	if (out) {
+		VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring,
+		    mblk_t *, *mpp, int, ret);
+		VIONA_RING_STAT_INCR(ring, tx_hookdrop);
+	} else {
+		VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring,
+		    mblk_t *, *mpp, int, ret);
+		VIONA_RING_STAT_INCR(ring, rx_hookdrop);
+	}
+	return (ret);
+}
+
+/*
+ * netinfo stubs - required by the nethook framework, but otherwise unused
+ *
+ * Currently, all ipf rules are applied against all interfaces in a given
+ * netstack (e.g. all interfaces in a zone). In the future if we want to
+ * support being able to apply different rules to different interfaces, I
+ * believe we would need to implement some of these stubs to map an interface
+ * name in a rule (e.g. 'net0') back to an index or viona_link_t.
+ */
+static int
+viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused,
+    char *buf __unused, const size_t len __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused,
+    lif_if_t ifdata __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_getptmue(net_handle_t neti __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused,
+    lif_if_t ifdata __unused, size_t nelem __unused,
+    net_ifaddr_t type[] __unused, void *storage __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused,
+    lif_if_t ifdata __unused, zoneid_t *zid __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused,
+    lif_if_t ifdata __unused, uint64_t *flags __unused)
+{
+	return (-1);
+}
+
+static phy_if_t
+viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused)
+{
+	return ((phy_if_t)-1);
+}
+
+static phy_if_t
+viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused)
+{
+	return ((phy_if_t)-1);
+}
+
+static lif_if_t
+viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused,
+    lif_if_t ifdata __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_inject(net_handle_t neti __unused, inject_t style __unused,
+    net_inject_t *packet __unused)
+{
+	return (-1);
+}
+
+static phy_if_t
+viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused,
+    struct sockaddr *next __unused)
+{
+	return ((phy_if_t)-1);
+}
+
+static int
+viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+	return (-1);
+}
+
+static int
+viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+	return (-1);
+}
+
+static net_protocol_t viona_netinfo = {
+	NETINFO_VERSION,
+	NHF_VIONA,
+	viona_neti_getifname,
+	viona_neti_getmtu,
+	viona_neti_getptmue,
+	viona_neti_getlifaddr,
+	viona_neti_getlifzone,
+	viona_neti_getlifflags,
+	viona_neti_phygetnext,
+	viona_neti_phylookup,
+	viona_neti_lifgetnext,
+	viona_neti_inject,
+	viona_neti_route,
+	viona_neti_ispchksum,
+	viona_neti_isvchksum
+};
+
+/*
+ * Create/register our nethooks
+ */
+static int
+viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name,
+    net_protocol_t *netip)
+{
+	int ret;
+
+	if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) {
+		cmn_err(CE_NOTE, "%s: net_protocol_register failed "
+		    "(netid=%d name=%s)", __func__, nid, nh_name);
+		goto fail_init_proto;
+	}
+
+	HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name);
+	if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) {
+		cmn_err(CE_NOTE, "%s: net_family_register failed "
+		    "(netid=%d name=%s err=%d)", __func__,
+		    nid, nh_name, ret);
+		goto fail_init_family;
+	}
+
+	HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN);
+	if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti,
+	    &vnh->vnh_event_in)) == NULL) {
+		cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+		    "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid,
+		    nh_name);
+		goto fail_init_event_in;
+	}
+
+	HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT);
+	if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti,
+	    &vnh->vnh_event_out)) == NULL) {
+		cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+		    "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid,
+		    nh_name);
+		goto fail_init_event_out;
+	}
+	return (0);
+
+	/*
+	 * On failure, we undo all the steps that succeeded in the
+	 * reverse order of initialization, starting at the last
+	 * successful step (the labels denoting the failing step).
+	 */
+fail_init_event_out:
+	VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+	VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+	vnh->vnh_token_in = NULL;
+
+fail_init_event_in:
+	VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+	VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+
+fail_init_family:
+	VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+	vnh->vnh_neti = NULL;
+
+fail_init_proto:
+	return (1);
+}
+
+/*
+ * Shutdown the nethooks for a protocol family. This triggers notification
+ * callbacks to anything that has registered interest, allowing hook consumers
+ * to unhook prior to the removal of the hooks, and makes them unavailable to
+ * any future consumers as the first step of removal.
+ */
+static void
+viona_nethook_shutdown(viona_nethook_t *vnh)
+{
+	VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out));
+	VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+	VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+}
+
+/*
+ * Remove the nethooks for a protocol family.
+ */
+static void
+viona_nethook_fini(viona_nethook_t *vnh)
+{
+	VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out));
+	VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+	VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+	VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+	vnh->vnh_neti = NULL;
+}
+
+/*
+ * Callback invoked by the neti module. This creates/registers our hooks
+ * {IPv4,IPv6}{in,out} with the nethook framework so they are available to
+ * interested consumers (e.g. ipf).
+ *
+ * During attach, viona_neti_create is called once for every netstack
+ * present on the system at the time of attach. Thereafter, it is called
+ * during the creation of additional netstack instances (i.e. zone boot). As a
+ * result, the viona_neti_t that is created during this call always occurs
+ * prior to any viona instances that will use it to send hook events.
+ *
+ * It should never return NULL. If we cannot register our hooks, we do not
+ * set vnh_hooked of the respective protocol family, which will prevent the
+ * creation of any viona instances on this netstack (see viona_ioc_create).
+ * This can only occur if we are trying to create a new instance after a
+ * shutdown event (which means destruction is imminent).
+ */
+static void *
+viona_neti_create(const netid_t netid)
+{
+	viona_neti_t *nip;
+
+	VERIFY(netid != -1);
+
+	nip = kmem_zalloc(sizeof (*nip), KM_SLEEP);
+	nip->vni_netid = netid;
+	nip->vni_zid = net_getzoneidbynetid(netid);
+	mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t),
+	    offsetof(viona_soft_state_t, ss_node));
+
+	if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA,
+	    &viona_netinfo) == 0)
+		nip->vni_nethook.vnh_hooked = B_TRUE;
+
+	mutex_enter(&viona_neti_lock);
+	list_insert_tail(&viona_neti_list, nip);
+	mutex_exit(&viona_neti_lock);
+
+	return (nip);
+}
+
+/*
+ * Called during netstack teardown by the neti module. During teardown, all
+ * the shutdown callbacks are invoked, allowing consumers to release any holds
+ * and otherwise quiesce themselves prior to destruction, followed by the
+ * actual destruction callbacks.
+ */
+static void
+viona_neti_shutdown(netid_t nid, void *arg)
+{
+	viona_neti_t *nip = arg;
+
+	ASSERT(nip != NULL);
+	VERIFY(nid == nip->vni_netid);
+
+	mutex_enter(&viona_neti_lock);
+	list_remove(&viona_neti_list, nip);
+	mutex_exit(&viona_neti_lock);
+
+	if (nip->vni_nethook.vnh_hooked)
+		viona_nethook_shutdown(&nip->vni_nethook);
+}
+
+/*
+ * Called during netstack teardown by the neti module. Destroys the viona
+ * netinst data. This is invoked after all the netstack and neti shutdown
+ * callbacks have been invoked.
+ */
+static void
+viona_neti_destroy(netid_t nid, void *arg)
+{
+	viona_neti_t *nip = arg;
+
+	ASSERT(nip != NULL);
+	VERIFY(nid == nip->vni_netid);
+
+	mutex_enter(&nip->vni_lock);
+	while (nip->vni_ref != 0)
+		cv_wait(&nip->vni_ref_change, &nip->vni_lock);
+	mutex_exit(&nip->vni_lock);
+
+	VERIFY(!list_link_active(&nip->vni_node));
+
+	if (nip->vni_nethook.vnh_hooked)
+		viona_nethook_fini(&nip->vni_nethook);
+
+	mutex_destroy(&nip->vni_lock);
+	list_destroy(&nip->vni_dev_list);
+	kmem_free(nip, sizeof (*nip));
+}
+
+/*
+ * Find the viona netinst data by zone id. This is only used during
+ * viona instance creation (and thus is only called by a zone that is running).
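+ *
+ * As an illustrative sketch of the expected usage (an annotation here, not
+ * normative interface documentation), a caller takes a reference via the
+ * lookup and must drop it with viona_neti_rele() once finished, mirroring
+ * what viona_ioc_create() does during instance creation:
+ *
+ *	viona_neti_t *nip = viona_neti_lookup_by_zid(crgetzoneid(cr));
+ *	if (nip == NULL)
+ *		return (EIO);
+ *	<... use nip, e.g. check nip->vni_nethook.vnh_hooked ...>
+ *	viona_neti_rele(nip);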
+ */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h new file mode 100644 index 0000000000..5471b611a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_impl.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ + VRS_STOP = 0x4, /* worker is exiting */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + volatile struct virtio_desc *vr_descr; + + volatile uint16_t *vr_avail_flags; + volatile uint16_t *vr_avail_idx; + volatile uint16_t *vr_avail_ring; + volatile uint16_t *vr_avail_used_event; + + volatile uint16_t *vr_used_flags; + volatile uint16_t *vr_used_idx; + volatile struct virtio_used *vr_used_ring; + volatile uint16_t *vr_used_avail_event; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uintptr_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + net_handle_t vnh_neti; + hook_family_t vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; +} viona_nethook_t; + +struct 
viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void viona_ring_free(viona_vring_t *); +int viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); +int vq_popchain(viona_vring_t *, struct iovec 
*, uint_t, uint16_t *); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); +void viona_intr_ring(viona_vring_t *ring); + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c new file mode 100644 index 0000000000..f51a1f9b12 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_main.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +/* + * viona - VirtIO-Net, Accelerated + * + * The purpose of viona is to provide high performance virtio-net devices to + * bhyve guests. It does so by sitting directly atop MAC, skipping all of the + * DLS/DLD stack. + * + * -------------------- + * General Architecture + * -------------------- + * + * A single viona instance is comprised of a "link" handle and two "rings". + * After opening the viona device, it must be associated with a MAC network + * interface and a bhyve (vmm) instance to form its link resource. 
This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization. With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible. Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest. They represent an interface to the
+ * standard virtio ring structures. When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events. The RX worker has the simple task of
+ * watching for ring shutdown conditions. The TX worker does that in addition
+ * to processing all requests to transmit data. Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in viona_vring_t`vr_state:
+ *
+ *      +<--------------------------------------------+
+ *      |                                              |
+ *      V                                              ^
+ *  +-----------+  This is the initial state when a link is created or
+ *  | VRS_RESET |  when the ring has been explicitly reset.
+ *  +-----------+
+ *      |                                              ^
+ *      |---* ioctl(VNA_IOC_RING_INIT) issued          |
+ *      |                                              |
+ *      |                                              ^
+ *      V
+ *  +-----------+  The ring parameters (size, guest physical addresses)
+ *  | VRS_SETUP |  have been set and start-up of the ring worker thread
+ *  +-----------+  has begun.
+ *      |                                              ^
+ *      |                                              |
+ *      |---* ring worker thread begins execution      |
+ *      |                                              |
+ *      +-------------------------------------------->+
+ *      |                                              |  ^
+ *      |                                              |
+ *      |    * If ring shutdown is requested (by ioctl or impending
+ *      |      bhyve process death) while the worker thread is
+ *      |      starting, the worker will transition the ring to
+ *      |      VRS_RESET and exit.
+ *      |                                              ^
+ *      |                                              |
+ *      |                                              ^
+ *      V
+ *  +-----------+  The worker thread associated with the ring has started
+ *  | VRS_INIT  |  executing. It has allocated any extra resources needed
+ *  +-----------+  for the ring to operate.
+ *      |                                              ^
+ *      |                                              |
+ *      +-------------------------------------------->+
+ *      |                                              |  ^
+ *      |                                              |
+ *      |    * If ring shutdown is requested while the worker is
+ *      |      waiting in VRS_INIT, it will free any extra resources
+ *      |      and transition to VRS_RESET.
+ *      |                                              ^
+ *      |                                              |
+ *      |--* ioctl(VNA_IOC_RING_KICK) issued           |
+ *      |                                              ^
+ *      V
+ *  +-----------+  The worker thread associated with the ring is executing
+ *  |  VRS_RUN  |  workload specific to that ring.
+ *  +-----------+
+ *      |                                              ^
+ *      |---* ioctl(VNA_IOC_RING_RESET) issued         |
+ *      |    (or bhyve process begins exit)            ^
+ *      |
+ *  +-----------+  The worker thread associated with the ring is in the
+ *  | VRS_STOP  |  process of exiting. All outstanding TX and RX
+ *  +-----------+  requests are allowed to complete, but new requests
+ *      |          must be ignored.
+ *      |                                              ^
+ *      |                                              |
+ *      +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s). Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC. When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer
+ * is wrapped in an mblk_t using a preallocated viona_desb_t for the
+ * desballoc(). Doing so increments vr_xfer_outstanding, preventing the ring
+ * from being reset (allowing the link to drop its vmm handle to the guest)
+ * until all transmit mblks referencing guest memory have been processed.
+ * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
+ * the ring worker thread. The ring size informs that allocation as the number
+ * of concurrent transmissions is limited by the number of descriptors in the
+ * ring. This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware. Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated. This means that there is no upper bound on the time needed
+ * for an mblk to be flushed, which can stall bhyve guests from shutting down
+ * since their memory must be free of viona TX references prior to clean-up.
+ *
+ * This expectation of deterministic mblk_t processing is likely the reason
+ * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
+ * loaded will copy transmit data into fresh buffers rather than passing up
+ * zero-copy mblks. It is a hold-over from the original viona sources provided
+ * by Pluribus, and its continued necessity has not been confirmed.
+ *
+ *
+ * ----------------------------
+ * Ring Notification Fast-paths
+ * ----------------------------
+ *
+ * Device operation for viona requires that notifications flow to and from the
+ * guest to indicate certain ring conditions. In order to minimize latency and
+ * processing overhead, the notification procedures are kept in-kernel whenever
+ * possible.
+ *
+ * Guest-to-host notifications, when new available descriptors have been placed
+ * in the ring, are posted via the 'queue notify' address in the virtio BAR.
+ * The vmm_drv_ioport_hook() interface was added to bhyve to allow viona to
+ * install a callback hook on an ioport address. Guest exits for accesses to
+ * viona-hooked ioport addresses will result in direct calls to notify the
+ * appropriate ring worker without a trip to userland.
+ *
+ * Host-to-guest notifications in the form of interrupts enjoy similar
+ * acceleration. Each viona ring can be configured to send MSI notifications
+ * to the guest as virtio conditions dictate.
This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include +#include +#include + +#include + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. + */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + 
viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? 
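+	 *
+	 * With this block compiled out, open() performs no privilege
+	 * check of its own; any gating is left to the individual ioctls.
+	 * VNA_IOC_CREATE, for example, requires a usable vmm fd, which
+	 * must satisfy vmm_drv_hold() and thus presumably whatever checks
+	 * the vmm driver applies to the caller's credentials.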
+ */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == -1) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, (uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + err = viona_ioc_set_notify_ioport(link, (uint_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = 
&link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. + */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. 
This condition should be impossible to
+		 * encounter when performing the on-close destroy of the
+		 * link, since racing ioctl accessors must necessarily be
+		 * absent.
+		 */
+		VERIFY(!on_close);
+		mutex_exit(&ss->ss_lock);
+		return (EAGAIN);
+	}
+	/*
+	 * The link deletion cannot fail after this point; it proceeds
+	 * until successful completion is reached.
+	 */
+	link->l_destroyed = B_TRUE;
+
+	/*
+	 * Tear down the IO port hook so it cannot be used to kick any of the
+	 * rings which are about to be reset and stopped.
+	 */
+	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
+	mutex_exit(&ss->ss_lock);
+
+	/*
+	 * Return the rings to their reset state, ignoring any possible
+	 * interruptions from signals.
+	 */
+	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
+	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));
+
+	mutex_enter(&ss->ss_lock);
+	if (link->l_mch != NULL) {
+		/* Unhook the receive callbacks and close out the client */
+		viona_rx_clear(link);
+		mac_client_close(link->l_mch, 0);
+	}
+	if (link->l_mh != NULL) {
+		mac_close(link->l_mh);
+	}
+	if (link->l_vm_hold != NULL) {
+		vmm_drv_rele(link->l_vm_hold);
+		link->l_vm_hold = NULL;
+	}
+
+	nip = link->l_neti;
+	link->l_neti = NULL;
+
+	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
+	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
+	pollhead_clean(&link->l_pollhead);
+	ss->ss_link = NULL;
+	mutex_exit(&ss->ss_lock);
+
+	mutex_enter(&nip->vni_lock);
+	list_remove(&nip->vni_dev_list, ss);
+	mutex_exit(&nip->vni_lock);
+
+	viona_neti_rele(nip);
+
+	kmem_free(link, sizeof (viona_link_t));
+	return (0);
+}
+
+static int
+viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
+{
+	vioc_ring_init_t kri;
+	int err;
+
+	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
+		return (EFAULT);
+	}
+
+	err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);
+
+	return (err);
+}
+
+static int
+viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
+{
+	viona_vring_t *ring;
+
+	if (idx >= VIONA_VQ_MAX) {
+		return (EINVAL);
+	}
+	ring = &link->l_vrings[idx];
+
+	return (viona_ring_reset(ring, B_TRUE));
+}
+
+static int
+viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
+{
+	viona_vring_t *ring;
+	int err;
+
+	if (idx >= VIONA_VQ_MAX) {
+		return (EINVAL);
+	}
+	ring = &link->l_vrings[idx];
+
+	mutex_enter(&ring->vr_lock);
+	switch (ring->vr_state) {
+	case VRS_SETUP:
+		/*
+		 * An early kick to a ring which is starting its worker
+		 * thread is fine. Once that thread is active, it will
+		 * process the start-up request immediately.
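+		 *
+		 * The cases below deliberately cascade: a kick arriving in
+		 * VRS_SETUP or VRS_INIT records the request by setting
+		 * VRSF_REQ_START, and all three runnable states wake the
+		 * worker thread via cv_broadcast().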
+ */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = (uint16_t)val; + + if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, + viona_notify_wcb, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c new file mode 100644 index 0000000000..5ba6fad963 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_ring.c @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include + +#include "viona_impl.h" + +#define VRING_ALIGN 4096 +#define VRING_MAX_LEN 32768 + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static void * +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + viona_ring_lease_drop(ring); + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 
2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + +void +viona_intr_ring(viona_vring_t *ring) +{ + uint64_t addr; + + mutex_enter(&ring->vr_lock); + /* Deliver the interrupt directly, if so configured. */ + if ((addr = ring->vr_msi_addr) != 0) { + uint64_t msg = ring->vr_msi_msg; + + mutex_exit(&ring->vr_lock); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + return; + } + mutex_exit(&ring->vr_lock); + + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + + VERIFY3U(ring->vr_state, ==, VRS_STOP); + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
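
The mapping logic in viona_ring_map() above encodes the legacy virtio split-ring layout: the descriptor table, then the avail ring, then the used ring rounded up to VRING_ALIGN. A standalone sketch of the same offset arithmetic, with the 16-byte descriptor and 8-byte used-element sizes written as constants and P2ROUNDUP reimplemented for userspace:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define	VRING_ALIGN	4096
#define	ROUNDUP(x, a)	(((x) + ((uint64_t)(a) - 1)) & ~((uint64_t)(a) - 1))

int
main(void)
{
	const uint64_t qsz = 256;	/* example ring size */
	uint64_t pos = 0;

	const uint64_t desc_off = pos;	/* qsz 16-byte descriptors */
	pos += qsz * 16;
	const uint64_t avail_off = pos;	/* flags, idx, ring[qsz], used_event */
	pos += (qsz + 3) * sizeof (uint16_t);
	const uint64_t used_off = ROUNDUP(pos, VRING_ALIGN);
	const uint64_t end = used_off + (qsz * 8) + (3 * sizeof (uint16_t));

	(void) printf("desc=0x%" PRIx64 " avail=0x%" PRIx64
	    " used=0x%" PRIx64 " end=0x%" PRIx64 "\n",
	    desc_off, avail_off, used_off, end);
	return (0);
}
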
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie) +{ + uint_t i, ndesc, idx, head, next; + struct virtio_desc vdir; + void *buf; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = ring->vr_avail_ring[idx & ring->vr_mask]; + next = head; + + for (i = 0; i < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + goto bail; + } + + vdir = ring->vr_descr[next]; + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vdir.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vdir.vd_len; + i++; + } else { + const uint_t nindir = vdir.vd_len / 16; + volatile struct virtio_desc *vindir; + + if ((vdir.vd_len & 0xf) || nindir == 0) { + VIONA_PROBE2(indir_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + goto bail; + } + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (vindir == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + next = 0; + for (;;) { + struct virtio_desc vp; + + /* + * A copy of the indirect descriptor is made + * here, rather than simply using a reference + * pointer. This prevents malicious or + * erroneous guest writes to the descriptor + * from fooling the flags/bounds verification + * through a race. 
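
The comment above is worth generalizing: any descriptor that lives in guest-shared memory must be snapshotted into a local copy before validation, or the guest can rewrite it between the check and the use. A minimal sketch of the idiom, using a hypothetical descriptor type rather than the driver's own:

#include <stdint.h>

struct gdesc {
	uint64_t	gd_addr;
	uint32_t	gd_len;
	uint16_t	gd_flags;
	uint16_t	gd_next;
};

static int
snapshot_desc(volatile const struct gdesc *shared, uint16_t slot,
    uint16_t qsz, struct gdesc *out)
{
	if (slot >= qsz)
		return (-1);

	/* One local copy: later guest writes cannot undo the checks below */
	*out = shared[slot];

	if (out->gd_len == 0 || out->gd_next >= qsz)
		return (-1);
	return (0);
}
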
+ */ + vp = vindir[next]; + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, + viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, + indir_bad_nest); + goto bail; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, + desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vp.vd_addr, + vp.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, + vp.vd_addr); + VIONA_RING_STAT_INCR(ring, + bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vp.vd_len; + i++; + + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) + break; + if (i >= niov) { + goto loopy; + } + + next = vp.vd_next; + if (next >= nindir) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, + uint_t, nindir); + VIONA_RING_STAT_INCR(ring, + indir_bad_next); + goto bail; + } + } + } + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + return (i); + } + } + +loopy: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); +bail: + mutex_exit(&ring->vr_a_mutex); + return (-1); +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + volatile struct virtio_used *vu; + uint_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + volatile struct virtio_used *vu; + uint_t uidx, i; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + if (num_bufs == 1) { + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c new file mode 100644 index 0000000000..1ccbaa63f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_rx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +#include +#include +#include + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. + */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + ring->vr_state = VRS_STOP; + + /* + * The RX ring is stopping, before we start tearing it down it + * is imperative that we perform an RX barrier so that + * incoming packets are dropped at viona_rx_classified(). 
+ */ + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. + */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? 
*/ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. 
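
viona_copy_mblk() above implements a seek-then-copy over an mblk_t chain. The same logic in a self-contained userspace form, with a toy segment chain standing in for mblk_t:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct seg {
	const char	*s_base;
	size_t		s_len;
	struct seg	*s_next;
};

/* Copy up to 'len' bytes into 'buf', starting 'seek' bytes into the chain */
static size_t
chain_copy(const struct seg *sp, size_t seek, char *buf, size_t len,
    bool *end)
{
	size_t copied = 0, off = 0;

	/* Seek past already-consumed data */
	while (seek > 0 && sp != NULL) {
		if (sp->s_len > seek) {
			off = seek;
			break;
		}
		seek -= sp->s_len;
		sp = sp->s_next;
	}

	while (sp != NULL && len > 0) {
		const size_t chunk = sp->s_len - off;
		const size_t n = (chunk < len) ? chunk : len;

		(void) memcpy(buf, sp->s_base + off, n);
		copied += n;
		buf += n;
		len -= n;
		if (n == chunk) {
			/* Segment fully consumed; the seek offset is spent */
			sp = sp->s_next;
			off = 0;
		}
	}
	*end = (sp == NULL);
	return (copied);
}
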
+			 */
+				err = EMSGSIZE;
+				break;
+			}
+			chunk = 0;
+			i = 0;
+			buf_idx++;
+			/*
+			 * Keep the header up-to-date with the number of
+			 * buffers, but never reference its value since the
+			 * guest could meddle with it.
+			 */
+			hdr->vrh_bufs++;
+		}
+	} while (!end && copied < msz);
+
+	/* Account for the header size in the first buffer */
+	uelem[0].len += hdr_sz;
+
+	/*
+	 * If no other errors were encountered during the copy, was the
+	 * expected amount of data transferred?
+	 */
+	if (err == 0 && copied != msz) {
+		VIONA_PROBE5(too_short, viona_vring_t *, ring,
+		    uint16_t, cookie, mblk_t *, mp, size_t, copied,
+		    size_t, msz);
+		VIONA_RING_STAT_INCR(ring, too_short);
+		err = EINVAL;
+	}
+
+	/* Add chksum bits, if needed */
+	if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+		uint32_t cksum_flags;
+
+		if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+		    ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+			hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+			hdr->vrh_gso_size = DB_LSOMSS(mp);
+		}
+
+		mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+		    &cksum_flags);
+		if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+			hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+		}
+	}
+
+done:
+	switch (err) {
+	case 0:
+		/* Success can fall right through to ring delivery */
+		break;
+
+	case EMSGSIZE:
+		VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
+		    uint16_t, cookie, mblk_t *, mp);
+		VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
+		break;
+
+	case EOVERFLOW:
+		VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
+		    uint16_t, cookie, mblk_t *, mp);
+		VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
+		break;
+
+	default:
+		VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
+		    uint16_t, cookie, mblk_t *, mp);
+		VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+	}
+	vq_pushchain_many(ring, buf_idx + 1, uelem);
+	return (err);
+}
+
+static void
+viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
+{
+	viona_link_t *link = ring->vr_link;
+	mblk_t *mprx = NULL, **mprx_prevp = &mprx;
+	mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
+	const boolean_t do_merge =
+	    ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+
+	size_t nrx = 0, ndrop = 0;
+
+	while (mp != NULL) {
+		mblk_t *next = mp->b_next;
+		mblk_t *pad = NULL;
+		size_t size = msgsize(mp);
+		int err = 0;
+
+		mp->b_next = NULL;
+
+		/*
+		 * We treat both a 'drop' response and errors the same here
+		 * and put the packet on the drop chain.  As packets may be
+		 * subject to different actions in ipf (which do not all
+		 * return the same set of error values), an error processing
+		 * one packet doesn't mean the next packet will also generate
+		 * an error.
+		 */
+		if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
+		    viona_hook(link, ring, &mp, B_FALSE) != 0) {
+			if (mp != NULL) {
+				*mpdrop_prevp = mp;
+				mpdrop_prevp = &mp->b_next;
+			} else {
+				/*
+				 * If the hook consumer (e.g. ipf) already
+				 * freed the mblk_t, update the drop count now.
+				 */
+				ndrop++;
+			}
+			mp = next;
+			continue;
+		}
+
+		/*
+		 * Ethernet frames are expected to be padded out in order to
+		 * meet the minimum size.
+		 *
+		 * A special case is made for frames which are short by
+		 * VLAN_TAGSZ, having been stripped of their VLAN tag while
+		 * traversing MAC.  A preallocated (and recycled) mblk is used
+		 * for that specific condition.
+		 *
+		 * All other frames that fall short on length will have custom
+		 * zero-padding allocated and appended to them.
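
Both receive paths write through struct virtio_net_hdr and struct virtio_net_mrgrxhdr, which this patch defines in viona_impl.h (not shown in this hunk). For reference, a sketch of the mergeable-buffer header consistent with the legacy virtio-net layout in the VIRTIO 1.0 spec; vrh_hdr_len is a guessed name, while the other fields all appear in the code above:

#include <stdint.h>

/* 12 bytes on the wire when VIRTIO_NET_F_MRG_RXBUF has been negotiated */
struct virtio_net_mrgrxhdr_sketch {
	uint8_t		vrh_flags;	/* VIRTIO_NET_HDR_F_* */
	uint8_t		vrh_gso_type;	/* VIRTIO_NET_HDR_GSO_* */
	uint16_t	vrh_hdr_len;	/* untrusted; viona ignores it */
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;	/* descriptor chains merged per frame */
};

Without the merge feature the header simply ends before vrh_bufs (10 bytes), which is why viona_recv_merged() must charge hdr_sz to the first buffer's used length while viona_recv_plain() folds it into its running total.
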
+ */ + if (size == NEED_VLAN_PAD_SIZE) { + ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); + ASSERT(viona_vlan_pad_mp->b_cont == NULL); + + for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) + ; + + pad->b_cont = viona_vlan_pad_mp; + size += VLAN_TAGSZ; + } else if (size < MIN_BUF_SIZE) { + const size_t pad_size = MIN_BUF_SIZE - size; + mblk_t *zero_mp; + + zero_mp = allocb(pad_size, BPRI_MED); + if (zero_mp == NULL) { + err = ENOMEM; + goto pad_drop; + } + + VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, + mblk_t *, mp, size_t, pad_size); + VIONA_RING_STAT_INCR(ring, rx_pad_short); + zero_mp->b_wptr += pad_size; + bzero(zero_mp->b_rptr, pad_size); + linkb(mp, zero_mp); + size += pad_size; + } + + if (do_merge) { + err = viona_recv_merged(ring, mp, size); + } else { + err = viona_recv_plain(ring, mp, size); + } + + /* + * The VLAN padding mblk is meant for continual reuse, so + * remove it from the chain to prevent it from being freed. + * + * Custom allocated padding does not require this treatment and + * is freed normally. + */ + if (pad != NULL) { + pad->b_cont = NULL; + } + +pad_drop: + /* + * While an error during rx processing + * (viona_recv_{merged,plain}) does not free mp on error, + * hook processing might or might not free mp. Handle either + * scenario -- if mp is not yet free, it is queued up and + * freed after the guest has been notified. If mp is + * already NULL, just proceed on. + */ + if (err != 0) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + + /* + * If the available ring is empty, do not bother + * attempting to deliver any more frames. Count the + * rest as dropped too. + */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. 
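
viona_rx_common() above builds its accept and drop chains with tail pointer-to-pointers (mprx_prevp, mpdrop_prevp), which append in constant time without special-casing the empty list. The idiom in isolation:

#include <stdio.h>
#include <stddef.h>

struct node {
	int		n_val;
	struct node	*n_next;
};

int
main(void)
{
	struct node pool[4] = {
		{ 1, NULL }, { 2, NULL }, { 3, NULL }, { 4, NULL }
	};
	struct node *head = NULL, **tailp = &head;

	for (int i = 0; i < 4; i++) {
		*tailp = &pool[i];	/* append without a head/tail branch */
		tailp = &pool[i].n_next;
	}
	for (struct node *n = head; n != NULL; n = n->n_next)
		(void) printf("%d\n", n->n_val);
	return (0);
}
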
+ */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. + */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c new file mode 100644 index 0000000000..5dc645723c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_tx.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "viona_impl.h"
+
+#define	BNXE_NIC_DRIVER	"bnxe"
+
+/*
+ * Copy TX mbufs from the virtio ring to avoid having to wait for packet
+ * transmission before their resources can be freed.
+ */
+kmutex_t viona_force_copy_lock;
+static enum viona_force_copy {
+	VFC_UNINITALIZED = 0,
+	VFC_COPY_UNEEDED = 1,
+	VFC_COPY_REQUIRED = 2,
+} viona_force_copy_state = VFC_UNINITALIZED;
+
+struct viona_desb {
+	frtn_t		d_frtn;
+	viona_vring_t	*d_ring;
+	uint_t		d_ref;
+	uint32_t	d_len;
+	uint16_t	d_cookie;
+	uchar_t		*d_headers;
+};
+
+static void viona_tx(viona_link_t *, viona_vring_t *);
+static void viona_desb_release(viona_desb_t *);
+
+/*
+ * Return the number of available descriptors in the vring, taking care of
+ * the 16-bit index wraparound.
+ *
+ * Note: If the number of apparently available descriptors is larger than the
+ * ring size (due to guest misbehavior), this check will still report the
+ * positive count of descriptors.
+ */
+static inline uint_t
+viona_vr_num_avail(viona_vring_t *ring)
+{
+	uint16_t ndesc;
+
+	/*
+	 * We're just computing (a-b) mod 2^16.
+	 *
+	 * The only glitch here is that in standard C, uint16_t promotes to
+	 * (signed) int when int has more than 16 bits (almost always now).
+	 * A cast back to unsigned is necessary for proper operation.
+	 */
+	ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
+
+	return (ndesc);
+}
+
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+	ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+	while (ring->vr_xfer_outstanding != 0) {
+		/*
+		 * Paying heed to signals is counterproductive here.  This is a
+		 * very tight loop if pending transfers take an extended amount
+		 * of time to be reclaimed while the host process is exiting.
+		 */
+		cv_wait(&ring->vr_cv, &ring->vr_lock);
+	}
+}
+
+/*
+ * Check if full TX packet copying is needed.  This should not be called from
+ * viona attach()/detach() context.
+ */
+static boolean_t
+viona_tx_copy_needed(void)
+{
+	boolean_t result;
+
+	mutex_enter(&viona_force_copy_lock);
+	if (viona_force_copy_state == VFC_UNINITALIZED) {
+		major_t bnxe_major;
+
+		/*
+		 * The original code for viona featured an explicit check for
+		 * the bnxe driver which, when found present, necessitated that
+		 * all transmissions be copied into their own mblks instead of
+		 * passing guest memory to the underlying device.
+		 *
+		 * The motivations for this are unclear, but until it can be
+		 * proven unnecessary, the check lives on.
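
A standalone demonstration of the mod-2^16 arithmetic in viona_vr_num_avail(), including the integer-promotion wrinkle the comment warns about:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t avail_idx = 2;		/* producer has wrapped past 65535 */
	uint16_t cur_aidx = 65530;	/* consumer has not wrapped yet */

	/*
	 * Under default promotion both operands become (signed) int and the
	 * subtraction yields -65528; converting through unsigned and
	 * truncating back to 16 bits recovers the 8 available descriptors.
	 */
	uint16_t ndesc = (uint16_t)((unsigned)avail_idx - (unsigned)cur_aidx);

	(void) printf("%u\n", ndesc);	/* prints 8 */
	return (0);
}

The same arithmetic also explains the "impossible" case handled in vq_popchain(): a misbehaving guest can publish an avail_idx whose distance from the consumer index exceeds the ring size, and the subtraction will faithfully report that oversized count.
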
+		 */
+		viona_force_copy_state = VFC_COPY_UNEEDED;
+		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
+		    != DDI_MAJOR_T_NONE) {
+			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
+				viona_force_copy_state = VFC_COPY_REQUIRED;
+				ddi_rele_driver(bnxe_major);
+			}
+		}
+	}
+	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
+	mutex_exit(&viona_force_copy_lock);
+
+	return (result);
+}
+
+void
+viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
+{
+	/* Allocate desb handles for the TX ring unless copying is required */
+	if (!viona_tx_copy_needed()) {
+		viona_desb_t *dp;
+
+		dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
+		ring->vr_txdesb = dp;
+		for (uint_t i = 0; i < qsz; i++, dp++) {
+			dp->d_frtn.free_func = viona_desb_release;
+			dp->d_frtn.free_arg = (void *)dp;
+			dp->d_ring = ring;
+			dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
+			    KM_SLEEP);
+		}
+	}
+
+	/* Allocate ring-sized iovec buffers for TX */
+	ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
+}
+
+void
+viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
+{
+	if (ring->vr_txdesb != NULL) {
+		viona_desb_t *dp = ring->vr_txdesb;
+
+		for (uint_t i = 0; i < qsz; i++, dp++) {
+			kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
+		}
+		kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
+		ring->vr_txdesb = NULL;
+	}
+
+	if (ring->vr_txiov != NULL) {
+		kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
+		ring->vr_txiov = NULL;
+	}
+}
+
+static void
+viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+	vq_pushchain(ring, len, cookie);
+
+	membar_enter();
+	if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+		viona_intr_ring(ring);
+	}
+}
+
+void
+viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
+{
+	proc_t *p = ttoproc(curthread);
+
+	(void) thread_vsetname(curthread, "viona_tx_%p", ring);
+
+	ASSERT(MUTEX_HELD(&ring->vr_lock));
+	ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+	mutex_exit(&ring->vr_lock);
+
+	for (;;) {
+		boolean_t bail = B_FALSE;
+		boolean_t renew = B_FALSE;
+		uint_t ntx = 0;
+
+		*ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+		while (viona_vr_num_avail(ring)) {
+			viona_tx(link, ring);
+
+			/*
+			 * It is advantageous for throughput to keep this
+			 * transmission loop tight, but periodic breaks to
+			 * check for other events are of value too.
+			 */
+			if (ntx++ >= ring->vr_size)
+				break;
+		}
+		*ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
+
+		/*
+		 * Check for available descriptors on the ring once more in
+		 * case a late addition raced with the NO_NOTIFY flag toggle.
+		 *
+		 * The barrier ensures that visibility of the vr_used_flags
+		 * store does not cross the viona_vr_num_avail() check below.
+		 */
+		membar_enter();
+		bail = VRING_NEED_BAIL(ring, p);
+		renew = vmm_drv_lease_expired(ring->vr_lease);
+		if (!bail && !renew && viona_vr_num_avail(ring)) {
+			continue;
+		}
+
+		if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
+			viona_intr_ring(ring);
+		}
+
+		mutex_enter(&ring->vr_lock);
+
+		while (!bail && !renew && !viona_vr_num_avail(ring)) {
+			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+			bail = VRING_NEED_BAIL(ring, p);
+			renew = vmm_drv_lease_expired(ring->vr_lease);
+		}
+
+		if (bail) {
+			break;
+		} else if (renew) {
+			ring->vr_state_flags |= VRSF_RENEW;
+			/*
+			 * When renewing the lease for the ring, no TX
+			 * frames may be outstanding, as they contain
+			 * references to guest memory.
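
The NO_NOTIFY handling in viona_worker_tx() above is the classic notification-suppression pattern: suppress guest kicks while draining, re-enable them, then re-check behind a barrier so a descriptor posted during the window is not stranded. Schematically, where ring_has_work(), process_one(), and sleep_until_kicked() are hypothetical stand-ins for the driver's own logic:

for (;;) {
	*used_flags |= VRING_USED_F_NO_NOTIFY;	/* guest: no kicks needed */
	while (ring_has_work(ring))
		process_one(ring);
	*used_flags &= ~VRING_USED_F_NO_NOTIFY;	/* kicks welcome again */

	/* The flag store must be visible before work is re-checked */
	membar_enter();
	if (ring_has_work(ring))
		continue;	/* guest raced the toggle and skipped its kick */

	sleep_until_kicked(ring);
}
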
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_state = VRS_STOP; + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. + * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. 
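
viona_desb_release() above is a frtn_t free routine: desballoc(9F) arranges for it to run when the last mblk_t referencing the handle is freed, which is what lets viona defer the 'used' ring update until the NIC is truly done with guest memory. A hedged driver-context sketch of the wiring; the type and function names here are hypothetical:

#include <sys/stream.h>
#include <sys/atomic.h>

typedef struct my_desb {
	frtn_t	md_frtn;
	uint_t	md_ref;
} my_desb_t;

static void
my_desb_free(my_desb_t *dp)
{
	if (atomic_dec_uint_nv(&dp->md_ref) != 0)
		return;
	/* Last mblk reference is gone; the buffer may now be recycled */
}

static mblk_t *
wrap_buffer(my_desb_t *dp, uchar_t *buf, size_t len)
{
	mblk_t *mp;

	dp->md_frtn.free_func = my_desb_free;
	dp->md_frtn.free_arg = (void *)dp;

	/* The mblk references 'buf' directly; no copy is made */
	mp = desballoc(buf, len, BPRI_MED, &dp->md_frtn);
	if (mp != NULL)
		atomic_inc_uint(&dp->md_ref);
	return (mp);
}
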
+ */ + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); + cksum = IP_TCP_CSUM_COMP; + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Since viona is a "legacy device", the data stored + * by the driver will be in the guest's native endian + * format (see sections 2.4.3 and 5.1.6.1 of the + * VIRTIO 1.0 spec for more info). At this time the + * only guests using viona are x86 and we can assume + * little-endian. + */ + lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); + + /* + * Hardware, like ixgbe, expects the client to request + * IP header checksum offload if it's sending LSO (see + * ixgbe_get_context()). Unfortunately, virtio makes + * no allowances for negotiating IP header checksum + * and HW offload, only TCP checksum. We add the flag + * and zero-out the checksum field. This mirrors the + * behavior of our native IP stack (which does this in + * the interest of HW that expects the field to be + * zero). + */ + flags |= HCK_IPV4_HDRCKSUM; + ipha->ipha_hdr_checksum = 0; + } + + /* + * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure + * HW_LSO, if present, is not lost. + */ + flags |= DB_CKSUMFLAGS(mp); + + /* + * Partial checksum support from the NIC is ideal, since it most + * closely maps to the interface defined by virtio. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + /* + * MAC expects these offsets to be relative to the + * start of the L3 header rather than the L2 frame. + */ + flags |= HCK_PARTIALCKSUM; + mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, + len - eth_len, 0, flags); + return (B_TRUE); + } + + /* + * Without partial checksum support, look to the L3/L4 protocol + * information to see if the NIC can handle it. If not, the + * checksum will need to calculated inline. + */ + if (ftype == ETHERTYPE_IP) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? */ + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } else if (ftype == ETHERTYPE_IPV6) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? 
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
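
When the partial-checksum path in viona_tx_csum() above hands a frame to MAC, the offsets must be rebased from the start of the frame to the start of the L3 header. A sketch for plain TCP over IPv4, assuming an untagged 14-byte Ethernet header and a 20-byte IP header with no options (the TCP checksum field sits 16 bytes into the TCP header):

static void
request_partial_csum(mblk_t *mp, uint_t frame_len)
{
	const uint_t eth_len = 14;	/* untagged Ethernet header */
	const uint_t ip_len = 20;	/* assumes no IP options */
	const uint_t csum_start = eth_len + ip_len;	/* TCP header start */
	const uint_t csum_stuff = csum_start + 16;	/* checksum field */

	/* MAC wants these relative to the L3 header, not the frame start */
	mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
	    frame_len - eth_len, 0, HCK_PARTIALCKSUM);
}
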
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. + */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. 
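
The "&mp instead of &mp_head" comment above reflects a general contract for consuming filters: a hook may free the message and NULL out the caller's pointer, so the caller passes an alias it can afford to lose. In miniature:

#include <stdio.h>
#include <stdlib.h>

struct msg {
	int		m_id;
	struct msg	*m_next;
};

/* A filter that may consume (free and NULL out) the message it is given */
static int
filter(struct msg **mpp)
{
	if ((*mpp)->m_id < 0) {
		free(*mpp);
		*mpp = NULL;
		return (-1);
	}
	return (0);
}

int
main(void)
{
	struct msg *head = calloc(1, sizeof (*head));
	struct msg *mp;

	if (head == NULL)
		return (1);
	head->m_id = -1;

	mp = head;	/* disposable alias; 'head' itself is never zeroed */
	if (filter(&mp) != 0) {
		if (mp != NULL)
			free(mp);	/* rejected but not consumed */
		else
			(void) printf("consumed by filter\n");
	}
	return (0);
}
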
+ */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h index a26cc00a55..46cc72eb06 100644 --- a/usr/src/uts/i86pc/sys/viona_io.h +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -11,36 +11,53 @@ /* * Copyright 2013 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _VIONA_IO_H_ #define _VIONA_IO_H_ #define VNA_IOC (('V' << 16)|('C' << 8)) -#define VNA_IOC_CREATE (VNA_IOC | 1) -#define VNA_IOC_DELETE (VNA_IOC | 2) -#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3) -#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4) -#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5) -#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6) -#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7) -#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) -#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) -#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) -#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) -#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) typedef struct vioc_create { datalink_id_t c_linkid; - char c_vmname[64]; - size_t c_lomem_size; - size_t c_himem_size; + int c_vmfd; } vioc_create_t; typedef struct vioc_ring_init { + uint16_t ri_index; uint16_t ri_qsize; uint64_t ri_qaddr; } vioc_ring_init_t; +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + #endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h index 33fefc10ea..856b75e5cc 100644 --- a/usr/src/uts/i86pc/sys/vmm_drv.h +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -17,6 +17,9 @@ #define _VMM_DRV_H_ #ifdef _KERNEL + +#include + struct vmm_hold; typedef struct vmm_hold vmm_hold_t; diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index 4ede5bbd84..dac59c9a45 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,7 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. -# Copyright 2017 Joyent, Inc. +# Copyright 2019 Joyent, Inc. # # @@ -27,6 +27,7 @@ OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona +MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile # # Include common rules. 
@@ -49,8 +50,16 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +# needs work +SMOFF += all_func_returns + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) + CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti +LDFLAGS += -Nmisc/hook +LDFLAGS += -M $(MAPFILE) # # Default build targets. diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 index 663613cee3..846011b4c5 100644 --- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 +++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 @@ -22,13 +22,17 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2013 Joyent, Inc. All rights reserved +# Copyright 2018 Joyent, Inc. All rights reserved # fr_availfuncs fr_features fr_objbytes hdrsizes +hook_viona_in +hook_viona_in_gz +hook_viona_out +hook_viona_out_gz hook4_in hook4_in_gz hook4_loop_in @@ -58,6 +62,9 @@ ip6exthdr ipf_cb_ops ipf_dev_info ipf_devfiles +ipf_eth_bcast_addr +ipf_eth_ipv4_mcast +ipf_eth_ipv6_mcast ipf_kstat_tmp ipf_minor ipf_ops -- cgit v1.2.3
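
Finally, a hedged userspace sketch of the regrouped command space above, polling interrupt status via VNA_IOC_INTR_POLL. It assumes a viona fd on which VNA_IOC_CREATE has already succeeded and that the header is reachable as shown; per the driver code, the count of rings needing attention is returned through rval and so becomes ioctl(2)'s return value:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <sys/viona_io.h>	/* installed header path is an assumption */

static int
poll_ring_interrupts(int viona_fd)
{
	vioc_intr_poll_t vip;
	int nready;

	nready = ioctl(viona_fd, VNA_IOC_INTR_POLL, &vip);
	if (nready < 0) {
		perror("VNA_IOC_INTR_POLL");
		return (-1);
	}

	for (int i = 0; i < VIONA_VQ_MAX; i++) {
		(void) printf("ring %d interrupt pending: %u\n",
		    i, vip.vip_status[i]);
	}
	return (nready);	/* rings with interrupts asserted */
}
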