author     Jerry Jelinek <jerry.jelinek@joyent.com>    2019-08-19 12:06:05 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2019-08-19 12:06:05 +0000
commit     289a9bb49771505b864985403334d2f94f0ca3ec (patch)
tree       2853dbf40fb16b4ea3df020177473835c0641dcb
parent     fb22979c02ec1ab84832084bea882640c366be5b (diff)
parent     2052a1fb16201e50b4c3a91ebcbeeccbc8276644 (diff)
download   illumos-joyent-289a9bb49771505b864985403334d2f94f0ca3ec.tar.gz
[illumos-gate merge]
commit 2052a1fb16201e50b4c3a91ebcbeeccbc8276644
    11568 loader: pxe.c missing initializer
commit 8d94f651a44d41a7147253bb5dad1a53941e8f50
    11031 SMB3 persistent handles
commit 2f57b5e005e6dce9d124b3dbd5fdcad1cc0372d2
    11532 Makefile.master: add gcc9 support flags
commit f8296c60994fb27105f37ac6f75661e4a6bdbab7
    11329 improved Virtio framework
    10012 vioblk should not accept an all-zero serial number
    7366 vioif happily creates rx descriptors until it consumes all memory

Conflicts:
    usr/src/uts/common/io/vioif/vioif.c
-rw-r--r--  exception_lists/packaging | 1
-rw-r--r--  usr/src/Makefile.master | 13
-rw-r--r--  usr/src/boot/sys/boot/i386/libi386/pxe.c | 25
-rw-r--r--  usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c | 34
-rw-r--r--  usr/src/cmd/smbsrv/Makefile | 2
-rw-r--r--  usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c | 4
-rw-r--r--  usr/src/cmd/smbsrv/nvlprint/Makefile | 37
-rw-r--r--  usr/src/cmd/smbsrv/nvlprint/nvlprint.c | 88
-rw-r--r--  usr/src/lib/libfakekernel/common/clock.c | 47
-rw-r--r--  usr/src/lib/libfakekernel/common/kmisc.c | 3
-rw-r--r--  usr/src/lib/libfakekernel/common/mapfile-vers | 3
-rw-r--r--  usr/src/lib/libshare/smb/libshare_smb.c | 4
-rw-r--r--  usr/src/lib/smbsrv/libfksmbsrv/Makefile.com | 3
-rw-r--r--  usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c | 10
-rw-r--r--  usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c | 5
-rw-r--r--  usr/src/lib/smbsrv/libmlsvc/common/smb_share.c | 12
-rwxr-xr-x  usr/src/tools/quick/make-smbsrv | 1
-rw-r--r--  usr/src/uts/common/Makefile.files | 3
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_close.c | 4
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_create.c | 36
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_dispatch.c | 23
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_durable.c | 1241
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_lease.c | 3
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_lock.c | 4
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_negotiate.c | 32
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c | 6
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_common_open.c | 77
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_cred.c | 18
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_fsops.c | 38
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_init.c | 9
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_kshare.c | 102
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_node.c | 12
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_ofile.c | 185
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_pathname.c | 30
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_server.c | 108
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_session.c | 234
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c | 13
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_tree.c | 243
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_user.c | 4
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_vfs.c | 164
-rw-r--r--  usr/src/uts/common/fs/smbsrv/smb_vops.c | 10
-rw-r--r--  usr/src/uts/common/io/vioblk/vioblk.c | 1560
-rw-r--r--  usr/src/uts/common/io/vioblk/vioblk.h | 212
-rw-r--r--  usr/src/uts/common/io/vioif/vioif.c | 2423
-rw-r--r--  usr/src/uts/common/io/vioif/vioif.h | 432
-rw-r--r--  usr/src/uts/common/io/virtio/virtio.c | 1364
-rw-r--r--  usr/src/uts/common/io/virtio/virtio.h | 342
-rw-r--r--  usr/src/uts/common/io/virtio/virtio_dma.c | 295
-rw-r--r--  usr/src/uts/common/io/virtio/virtio_impl.h | 368
-rw-r--r--  usr/src/uts/common/io/virtio/virtio_main.c | 1730
-rw-r--r--  usr/src/uts/common/io/virtio/virtioreg.h | 178
-rw-r--r--  usr/src/uts/common/io/virtio/virtiovar.h | 211
-rw-r--r--  usr/src/uts/common/smbsrv/smb2_kproto.h | 12
-rw-r--r--  usr/src/uts/common/smbsrv/smb_kproto.h | 27
-rw-r--r--  usr/src/uts/common/smbsrv/smb_ktypes.h | 53
-rw-r--r--  usr/src/uts/common/smbsrv/smb_share.h | 5
-rw-r--r--  usr/src/uts/intel/vioblk/Makefile | 80
-rw-r--r--  usr/src/uts/intel/vioif/Makefile | 56
-rw-r--r--  usr/src/uts/intel/virtio/Makefile | 81
59 files changed, 7523 insertions, 4797 deletions
diff --git a/exception_lists/packaging b/exception_lists/packaging
index ce7ebe91a1..c6cb2ccf99 100644
--- a/exception_lists/packaging
+++ b/exception_lists/packaging
@@ -600,6 +600,7 @@ usr/lib/smbsrv/libfksmbsrv.so.1
usr/lib/smbsrv/libmlsvc.so
usr/lib/smbsrv/libsmb.so
usr/lib/smbsrv/libsmbns.so
+usr/lib/smbsrv/nvlprint
usr/lib/smbsrv/test-msgbuf
usr/lib/smbsrv/testoplock
#
diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master
index da8d14c660..e751a9f79f 100644
--- a/usr/src/Makefile.master
+++ b/usr/src/Makefile.master
@@ -372,8 +372,10 @@ CCNOAUTOINLINE= \
-_gcc=-fno-ipa-cp \
-_gcc7=-fno-ipa-icf \
-_gcc8=-fno-ipa-icf \
+ -_gcc9=-fno-ipa-icf \
-_gcc7=-fno-clone-functions \
- -_gcc8=-fno-clone-functions
+ -_gcc8=-fno-clone-functions \
+ -_gcc9=-fno-clone-functions
# GCC may put functions in different named sub-sections of .text based on
# their presumed calling frequency. At least in the kernel, where we actually
@@ -383,7 +385,8 @@ CCNOAUTOINLINE= \
# but the application of this may move into usr/src/uts/ in future.
CCNOREORDER= \
-_gcc7=-fno-reorder-functions \
- -_gcc8=-fno-reorder-functions
+ -_gcc8=-fno-reorder-functions \
+ -_gcc9=-fno-reorder-functions
#
# gcc has a rather aggressive optimization on by default that infers loop
@@ -394,7 +397,8 @@ CCNOREORDER= \
#
CCNOAGGRESSIVELOOPS= \
-_gcc7=-fno-aggressive-loop-optimizations \
- -_gcc8=-fno-aggressive-loop-optimizations
+ -_gcc8=-fno-aggressive-loop-optimizations \
+ -_gcc9=-fno-aggressive-loop-optimizations
# One optimization the compiler might perform is to turn this:
# #pragma weak foo
@@ -472,7 +476,8 @@ CERRWARN += -_gcc=-Wno-array-bounds
# gcc4 lacks -Wno-maybe-uninitialized
CNOWARN_UNINIT = -_gcc4=-Wno-uninitialized \
-_gcc7=-Wno-maybe-uninitialized \
- -_gcc8=-Wno-maybe-uninitialized
+ -_gcc8=-Wno-maybe-uninitialized \
+ -_gcc9=-Wno-maybe-uninitialized
CERRWARN += -_smatch=-p=illumos_user
include $(SRC)/Makefile.smatch
diff --git a/usr/src/boot/sys/boot/i386/libi386/pxe.c b/usr/src/boot/sys/boot/i386/libi386/pxe.c
index 693596559d..821d0f627d 100644
--- a/usr/src/boot/sys/boot/i386/libi386/pxe.c
+++ b/usr/src/boot/sys/boot/i386/libi386/pxe.c
@@ -76,16 +76,21 @@ static ssize_t pxe_netif_put(struct iodesc *desc, void *pkt, size_t len);
static void pxe_netif_end(struct netif *nif);
extern struct netif_stats pxe_st[];
-extern u_int16_t __bangpxeseg;
-extern u_int16_t __bangpxeoff;
+extern uint16_t __bangpxeseg;
+extern uint16_t __bangpxeoff;
extern void __bangpxeentry(void);
-extern u_int16_t __pxenvseg;
-extern u_int16_t __pxenvoff;
+extern uint16_t __pxenvseg;
+extern uint16_t __pxenvoff;
extern void __pxenventry(void);
struct netif_dif pxe_ifs[] = {
-/* dif_unit dif_nsel dif_stats dif_private */
- {0, 1, &pxe_st[0], 0}
+ {
+ .dif_unit = 0,
+ .dif_nsel = 1,
+ .dif_stats = &pxe_st[0],
+ .dif_private = NULL,
+ .dif_used = 0
+ }
};
struct netif_stats pxe_st[nitems(pxe_ifs)];
@@ -218,7 +223,7 @@ pxe_init(void)
pxenv_p->RMEntry.segment, pxenv_p->RMEntry.offset);
}
- gci_p = bio_alloc(sizeof(*gci_p));
+ gci_p = bio_alloc(sizeof (*gci_p));
if (gci_p == NULL) {
pxe_p = NULL;
return (0);
@@ -269,7 +274,7 @@ pxe_cleanup(void)
if (pxe_call == NULL)
return;
- undi_shutdown_p = bio_alloc(sizeof(*undi_shutdown_p));
+ undi_shutdown_p = bio_alloc(sizeof (*undi_shutdown_p));
if (undi_shutdown_p != NULL) {
bzero(undi_shutdown_p, sizeof (*undi_shutdown_p));
pxe_call(PXENV_UNDI_SHUTDOWN, undi_shutdown_p);
@@ -282,7 +287,7 @@ pxe_cleanup(void)
bio_free(undi_shutdown_p, sizeof (*undi_shutdown_p));
}
- unload_stack_p = bio_alloc(sizeof(*unload_stack_p));
+ unload_stack_p = bio_alloc(sizeof (*unload_stack_p));
if (unload_stack_p != NULL) {
bzero(unload_stack_p, sizeof (*unload_stack_p));
pxe_call(PXENV_UNLOAD_STACK, unload_stack_p);
@@ -423,7 +428,7 @@ pxe_netif_init(struct iodesc *desc, void *machdep_hint)
else
desc->xid = 0;
- bio_free(undi_info_p, sizeof(*undi_info_p));
+ bio_free(undi_info_p, sizeof (*undi_info_p));
undi_open_p = bio_alloc(sizeof (*undi_open_p));
if (undi_open_p == NULL)
return;
diff --git a/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c b/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c
index b54549eebb..4195a62149 100644
--- a/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c
+++ b/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c
@@ -1623,6 +1623,9 @@ tree_flag_bits[] = {
{ "FORCE_L2_OPLOCK",
SMB_TREE_FORCE_L2_OPLOCK,
SMB_TREE_FORCE_L2_OPLOCK },
+ { "CA",
+ SMB_TREE_CA,
+ SMB_TREE_CA },
{ NULL, 0, 0 }
};
@@ -2334,17 +2337,26 @@ smb_kshare_walk_step(mdb_walk_state_t *wsp)
* *****************************************************************************
*/
+typedef struct mdb_smb_vfs {
+ list_node_t sv_lnd;
+ uint32_t sv_magic;
+ uint32_t sv_refcnt;
+ vfs_t *sv_vfsp;
+ vnode_t *sv_rootvp;
+} mdb_smb_vfs_t;
+
struct smb_vfs_cb_args {
uint_t opts;
vnode_t vn;
char path[MAXPATHLEN];
};
+/*ARGSUSED*/
static int
smb_vfs_cb(uintptr_t addr, const void *data, void *varg)
{
struct smb_vfs_cb_args *args = varg;
- const smb_vfs_t *sf = data;
+ mdb_smb_vfs_t sf;
if (args->opts & SMB_OPT_VERBOSE) {
mdb_arg_t argv;
@@ -2363,16 +2375,21 @@ smb_vfs_cb(uintptr_t addr, const void *data, void *varg)
*
* Get the vnode v_path string if we can.
*/
+ if (mdb_ctf_vread(&sf, SMBSRV_SCOPE "smb_vfs_t",
+ "mdb_smb_vfs_t", addr, 0) < 0) {
+ mdb_warn("failed to read struct smb_vfs at %p", addr);
+ return (DCMD_ERR);
+ }
strcpy(args->path, "?");
if (mdb_vread(&args->vn, sizeof (args->vn),
- (uintptr_t)sf->sv_rootvp) == sizeof (args->vn))
+ (uintptr_t)sf.sv_rootvp) == sizeof (args->vn))
(void) mdb_readstr(args->path, sizeof (args->path),
(uintptr_t)args->vn.v_path);
mdb_printf("%-?p ", addr);
- mdb_printf("%-10d ", sf->sv_refcnt);
- mdb_printf("%-?p ", sf->sv_vfsp);
- mdb_printf("%-?p ", sf->sv_rootvp);
+ mdb_printf("%-10d ", sf.sv_refcnt);
+ mdb_printf("%-?p ", sf.sv_vfsp);
+ mdb_printf("%-?p ", sf.sv_rootvp);
mdb_printf("%-s\n", args->path);
return (WALK_NEXT);
@@ -2442,7 +2459,12 @@ smb_vfs_walk_init(mdb_walk_state_t *wsp)
* OFFSETOF(smb_server_t, sv_export.e_vfs_list.ll_list);
*/
GET_OFFSET(sv_exp_off, smb_server_t, sv_export);
- GET_OFFSET(ex_vfs_off, smb_export_t, e_vfs_list);
+ /* GET_OFFSET(ex_vfs_off, smb_export_t, e_vfs_list); */
+ ex_vfs_off = mdb_ctf_offsetof_by_name("smb_export_t", "e_vfs_list");
+ if (ex_vfs_off < 0) {
+ mdb_warn("cannot lookup: smb_export_t .e_vfs_list");
+ return (WALK_ERR);
+ }
GET_OFFSET(ll_off, smb_llist_t, ll_list);
wsp->walk_addr += (sv_exp_off + ex_vfs_off + ll_off);
diff --git a/usr/src/cmd/smbsrv/Makefile b/usr/src/cmd/smbsrv/Makefile
index 8e7699c252..85d9ec05f1 100644
--- a/usr/src/cmd/smbsrv/Makefile
+++ b/usr/src/cmd/smbsrv/Makefile
@@ -26,7 +26,7 @@
#
SUBDIRS = smbadm smbd smbstat dtrace fksmbd bind-helper \
- test-msgbuf testoplock
+ nvlprint testoplock test-msgbuf
MSGSUBDIRS = smbadm smbstat
include ../Makefile.cmd
diff --git a/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c b/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c
index 23038f1641..20f1f146b0 100644
--- a/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c
+++ b/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -115,6 +115,8 @@ smb_shr_load(void *args)
*/
new_share("test", "/var/smb/test", "fksmbd test share",
SMB_SHRF_GUEST_OK);
+ new_share("testca", "/var/smb/test", "fksmbd test CA share",
+ SMB_SHRF_CA);
/* Allow creating lots of shares for testing. */
shr_file = getenv("FKSMBD_SHARE_FILE");
diff --git a/usr/src/cmd/smbsrv/nvlprint/Makefile b/usr/src/cmd/smbsrv/nvlprint/Makefile
new file mode 100644
index 0000000000..6e107f4219
--- /dev/null
+++ b/usr/src/cmd/smbsrv/nvlprint/Makefile
@@ -0,0 +1,37 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+#
+
+
+PROG= nvlprint
+
+include ../../Makefile.cmd
+ROOTCMDDIR= $(ROOT)/usr/lib/smbsrv
+
+CFLAGS += $(CCVERBOSE)
+
+CPPFLAGS += -D_FILE_OFFSET_BITS=64
+LDLIBS += -lnvpair
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+install: all $(ROOTCMD)
+
+clean:
+
+lint:
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/smbsrv/nvlprint/nvlprint.c b/usr/src/cmd/smbsrv/nvlprint/nvlprint.c
new file mode 100644
index 0000000000..939cedd933
--- /dev/null
+++ b/usr/src/cmd/smbsrv/nvlprint/nvlprint.c
@@ -0,0 +1,88 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * Print a packed nvlist from a file.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "libnvpair.h"
+
+char buf[65536];
+
+void
+dumpit(FILE *fp)
+{
+ struct stat st;
+ size_t flen;
+ int rlen;
+ nvlist_t *nvl = NULL;
+ int err;
+
+ if (fstat(fileno(fp), &st) < 0) {
+ perror("fstat");
+ return;
+ }
+ flen = (size_t)st.st_size;
+ if (flen > sizeof (buf)) {
+ (void) printf("File too large\n");
+ return;
+ }
+ rlen = fread(buf, 1, flen, fp);
+ if (rlen <= 0) {
+ perror("fread");
+ return;
+ }
+ if (rlen != flen) {
+ (void) printf("Short read %d %d \n", rlen, flen);
+ return;
+ }
+
+ err = nvlist_unpack(buf, flen, &nvl, 0);
+ if (err != 0) {
+ (void) printf("nvlist_unpack, err=%d\n", err);
+ return;
+ }
+
+ nvlist_print(stdout, nvl);
+ nvlist_free(nvl);
+}
+
+int
+main(int argc, char **argv)
+{
+ FILE *fp;
+ int i;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "usage: %s {filename} [filename2...]\n",
+ argv[0]);
+ return (1);
+ }
+ for (i = 1; i < argc; i++) {
+ fp = fopen(argv[i], "r");
+ if (fp == NULL) {
+ perror(argv[i]);
+ return (1);
+ }
+ (void) printf("%s:\n", argv[i]);
+ dumpit(fp);
+ (void) fclose(fp);
+ }
+ return (0);
+}
diff --git a/usr/src/lib/libfakekernel/common/clock.c b/usr/src/lib/libfakekernel/common/clock.c
index 2bee02af2e..deacbd4705 100644
--- a/usr/src/lib/libfakekernel/common/clock.c
+++ b/usr/src/lib/libfakekernel/common/clock.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
@@ -83,3 +83,48 @@ void
scalehrtime(hrtime_t *t)
{
}
+
+/*
+ * These functions are blatantly stolen from the kernel.
+ * See the dissertation in the comments preceding the
+ * hrt2ts() and ts2hrt() functions in:
+ * uts/common/os/timers.c
+ */
+void
+hrt2ts(hrtime_t hrt, timespec_t *tsp)
+{
+ uint32_t sec, nsec, tmp;
+
+ tmp = (uint32_t)(hrt >> 30);
+ sec = tmp - (tmp >> 2);
+ sec = tmp - (sec >> 5);
+ sec = tmp + (sec >> 1);
+ sec = tmp - (sec >> 6) + 7;
+ sec = tmp - (sec >> 3);
+ sec = tmp + (sec >> 1);
+ sec = tmp + (sec >> 3);
+ sec = tmp + (sec >> 4);
+ tmp = (sec << 7) - sec - sec - sec;
+ tmp = (tmp << 7) - tmp - tmp - tmp;
+ tmp = (tmp << 7) - tmp - tmp - tmp;
+ nsec = (uint32_t)hrt - (tmp << 9);
+ while (nsec >= NANOSEC) {
+ nsec -= NANOSEC;
+ sec++;
+ }
+ tsp->tv_sec = (time_t)sec;
+ tsp->tv_nsec = nsec;
+}
+
+hrtime_t
+ts2hrt(const timestruc_t *tsp)
+{
+ hrtime_t hrt;
+
+ hrt = tsp->tv_sec;
+ hrt = (hrt << 7) - hrt - hrt - hrt;
+ hrt = (hrt << 7) - hrt - hrt - hrt;
+ hrt = (hrt << 7) - hrt - hrt - hrt;
+ hrt = (hrt << 9) + tsp->tv_nsec;
+ return (hrt);
+}
diff --git a/usr/src/lib/libfakekernel/common/kmisc.c b/usr/src/lib/libfakekernel/common/kmisc.c
index 15730d6539..70f303e035 100644
--- a/usr/src/lib/libfakekernel/common/kmisc.c
+++ b/usr/src/lib/libfakekernel/common/kmisc.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
* Copyright 2017 RackTop Systems.
*/
@@ -95,6 +95,7 @@ highbit64(uint64_t i)
int
ddi_strtoul(const char *str, char **endp, int base, unsigned long *res)
{
+ errno = 0;
*res = strtoul(str, endp, base);
if (*res == 0)
return (errno);
diff --git a/usr/src/lib/libfakekernel/common/mapfile-vers b/usr/src/lib/libfakekernel/common/mapfile-vers
index 3950ccd4b5..731f6801a5 100644
--- a/usr/src/lib/libfakekernel/common/mapfile-vers
+++ b/usr/src/lib/libfakekernel/common/mapfile-vers
@@ -99,7 +99,7 @@ SYMBOL_VERSION SUNWprivate_1.1 {
highbit;
highbit64;
-
+ hrt2ts;
hz;
issig;
@@ -233,6 +233,7 @@ SYMBOL_VERSION SUNWprivate_1.1 {
tick_per_msec;
timeout;
+ ts2hrt;
tsignal;
uiomove;
uioskip;
diff --git a/usr/src/lib/libshare/smb/libshare_smb.c b/usr/src/lib/libshare/smb/libshare_smb.c
index e15bb26d9a..f567e7818b 100644
--- a/usr/src/lib/libshare/smb/libshare_smb.c
+++ b/usr/src/lib/libshare/smb/libshare_smb.c
@@ -179,6 +179,7 @@ struct option_defs optdefs[] = {
{ SHOPT_GUEST, OPT_TYPE_BOOLEAN },
{ SHOPT_DFSROOT, OPT_TYPE_BOOLEAN },
{ SHOPT_DESCRIPTION, OPT_TYPE_STRING },
+ { SHOPT_CA, OPT_TYPE_BOOLEAN },
{ SHOPT_FSO, OPT_TYPE_BOOLEAN },
{ SHOPT_QUOTAS, OPT_TYPE_BOOLEAN },
{ SHOPT_ENCRYPT, OPT_TYPE_STRING },
@@ -2195,6 +2196,9 @@ smb_build_shareinfo(sa_share_t share, sa_resource_t resource, smb_share_t *si)
if (smb_saprop_getbool(opts, SHOPT_DFSROOT, B_FALSE))
si->shr_flags |= SMB_SHRF_DFSROOT;
+ if (smb_saprop_getbool(opts, SHOPT_CA, B_FALSE))
+ si->shr_flags |= SMB_SHRF_CA;
+
if (smb_saprop_getbool(opts, SHOPT_FSO, B_FALSE))
si->shr_flags |= SMB_SHRF_FSO;
diff --git a/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com b/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com
index 507122dadd..7f29003239 100644
--- a/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com
+++ b/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com
@@ -119,7 +119,6 @@ OBJS_FS_SMBSRV = \
smb_tree_connect.o \
smb_unlock_byte_range.o \
smb_user.o \
- smb_vfs.o \
smb_vops.o \
smb_vss.o \
smb_write.o \
@@ -210,8 +209,10 @@ STRIP_STABS = :
# Note: need our sys includes _before_ ENVCPPFLAGS, proto etc.
+# Also, like Makefile.uts, reset CPPFLAGS
CPPFLAGS.first += -I../../../libfakekernel/common
CPPFLAGS.first += -I../common
+CPPFLAGS = $(CPPFLAGS.first)
INCS += -I$(SRC)/uts/common
INCS += -I$(SRC)/common/smbsrv
diff --git a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c
index 7b2bb93581..030c9c6244 100644
--- a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c
+++ b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -53,6 +53,14 @@ smb_cred_create(smb_token_t *token)
return (cr);
}
+cred_t *
+smb_kcred_create(void)
+{
+ cred_t *cr;
+ cr = CRED();
+ return (cr);
+}
+
void
smb_user_setcred(smb_user_t *user, cred_t *cr, uint32_t privileges)
{
diff --git a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c
index 4f0d6bf299..dc9eff1b44 100644
--- a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c
+++ b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c
@@ -141,9 +141,12 @@ fksmbsrv_drv_open(void)
int
fksmbsrv_drv_close(void)
{
+ smb_server_t *sv;
int rc;
- rc = smb_server_delete();
+ rc = smb_server_lookup(&sv);
+ if (rc == 0)
+ rc = smb_server_delete(sv);
if (g_init_done != 0) {
smb_server_g_fini();
diff --git a/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c b/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c
index ccd5b75c12..8a354a7da0 100644
--- a/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c
+++ b/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c
@@ -770,6 +770,10 @@ smb_shr_modify(smb_share_t *new_si)
si->shr_flags &= ~SMB_SHRF_DFSROOT;
si->shr_flags |= flag;
+ flag = (new_si->shr_flags & SMB_SHRF_CA);
+ si->shr_flags &= ~SMB_SHRF_CA;
+ si->shr_flags |= flag;
+
flag = (new_si->shr_flags & SMB_SHRF_FSO);
si->shr_flags &= ~SMB_SHRF_FSO;
si->shr_flags |= flag;
@@ -1822,6 +1826,12 @@ smb_shr_sa_get(sa_share_t share, sa_resource_t resource, smb_share_t *si)
free(val);
}
+ val = smb_shr_sa_getprop(opts, SHOPT_CA);
+ if (val != NULL) {
+ smb_shr_sa_setflag(val, si, SMB_SHRF_CA);
+ free(val);
+ }
+
val = smb_shr_sa_getprop(opts, SHOPT_FSO);
if (val != NULL) {
smb_shr_sa_setflag(val, si, SMB_SHRF_FSO);
@@ -2611,6 +2621,8 @@ smb_shr_encode(smb_share_t *si, nvlist_t **nvlist)
rc |= nvlist_add_string(smb, SHOPT_GUEST, "true");
if ((si->shr_flags & SMB_SHRF_DFSROOT) != 0)
rc |= nvlist_add_string(smb, SHOPT_DFSROOT, "true");
+ if ((si->shr_flags & SMB_SHRF_CA) != 0)
+ rc |= nvlist_add_string(smb, SHOPT_CA, "true");
if ((si->shr_flags & SMB_SHRF_FSO) != 0)
rc |= nvlist_add_string(smb, SHOPT_FSO, "true");
if ((si->shr_flags & SMB_SHRF_QUOTAS) != 0)
diff --git a/usr/src/tools/quick/make-smbsrv b/usr/src/tools/quick/make-smbsrv
index 9e2381288d..0aabee3812 100755
--- a/usr/src/tools/quick/make-smbsrv
+++ b/usr/src/tools/quick/make-smbsrv
@@ -278,6 +278,7 @@ usr/lib/libmlrpc.so.2
usr/lib/smbsrv/libmlsvc.so.1
usr/lib/smbsrv/libsmb.so.1
usr/lib/smbsrv/libsmbns.so.1
+usr/lib/smbsrv/nvlprint
usr/lib/smbsrv/smbd
usr/sbin/smbadm
usr/sbin/smbstat
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 9d63669f58..0b4426db3a 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1231,7 +1231,6 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \
smb_tree_connect.o \
smb_unlock_byte_range.o \
smb_user.o \
- smb_vfs.o \
smb_vops.o \
smb_vss.o \
smb_write.o \
@@ -2097,7 +2096,7 @@ NXGE_HCALL_OBJS = \
#
# Virtio core
-VIRTIO_OBJS = virtio.o
+VIRTIO_OBJS = virtio_main.o virtio_dma.o
# Virtio block driver
VIOBLK_OBJS = vioblk.o
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_close.c b/usr/src/uts/common/fs/smbsrv/smb2_close.c
index e019a3c3da..bbb000f329 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_close.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_close.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -71,6 +71,8 @@ smb2_close(smb_request_t *sr)
}
}
+ if (of->dh_persist)
+ smb2_dh_setdoc_persistent(of);
smb_ofile_close(of, 0);
errout:
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_create.c b/usr/src/uts/common/fs/smbsrv/smb2_create.c
index 6aab3c5127..582efbae28 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_create.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_create.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -280,7 +280,6 @@ smb2_create(smb_request_t *sr)
* many create context types are ignored too.
*/
op->dh_vers = SMB2_NOT_DURABLE;
- op->dh_v2_flags = 0;
if ((cctx.cc_in_flags &
(CCTX_DH_RECONNECT|CCTX_DH_RECONNECT_V2)) != 0) {
@@ -388,6 +387,9 @@ smb2_create(smb_request_t *sr)
cctx.cc_in_flags &= ~CCTX_REQUEST_LEASE;
}
+ if ((sr->tid_tree->t_flags & SMB_TREE_CA) == 0)
+ op->dh_v2_flags &= ~DH_PERSISTENT;
+
if ((cctx.cc_in_flags &
(CCTX_DH_REQUEST|CCTX_DH_REQUEST_V2)) != 0) {
if ((cctx.cc_in_flags & CCTX_DH_REQUEST_V2) != 0)
@@ -441,15 +443,19 @@ smb2_create(smb_request_t *sr)
* non-durable handles in case we get the ioctl
* to set "resiliency" on this handle.
*/
- if (of->f_ftype == SMB_FTYPE_DISK)
- smb_ofile_set_persistid(of);
+ if (of->f_ftype == SMB_FTYPE_DISK) {
+ if ((op->dh_v2_flags & DH_PERSISTENT) != 0)
+ smb_ofile_set_persistid_ph(of);
+ else
+ smb_ofile_set_persistid_dh(of);
+ }
/*
* [MS-SMB2] 3.3.5.9.8
* Handling the SMB2_CREATE_REQUEST_LEASE Create Context
*/
if ((cctx.cc_in_flags & CCTX_REQUEST_LEASE) != 0) {
- status = smb2_lease_create(sr);
+ status = smb2_lease_create(sr, sr->session->clnt_uuid);
if (status != NT_STATUS_SUCCESS) {
if (op->action_taken == SMB_OACT_CREATED) {
smb_ofile_set_delete_on_close(sr, of);
@@ -479,7 +485,8 @@ smb2_create(smb_request_t *sr)
if ((cctx.cc_in_flags &
(CCTX_DH_REQUEST|CCTX_DH_REQUEST_V2)) != 0 &&
smb_node_is_file(of->f_node) &&
- ((op->op_oplock_level == SMB2_OPLOCK_LEVEL_BATCH) ||
+ ((op->dh_v2_flags & DH_PERSISTENT) != 0 ||
+ (op->op_oplock_level == SMB2_OPLOCK_LEVEL_BATCH) ||
(op->op_oplock_level == SMB2_OPLOCK_LEVEL_LEASE &&
(op->lease_state & OPLOCK_LEVEL_CACHE_HANDLE) != 0))) {
/*
@@ -489,8 +496,13 @@ smb2_create(smb_request_t *sr)
(void) memcpy(of->dh_create_guid,
op->create_guid, UUID_LEN);
- /* no persistent handles yet */
- of->dh_persist = B_FALSE;
+ if ((op->dh_v2_flags & DH_PERSISTENT) != 0) {
+ if (smb2_dh_make_persistent(sr, of) == 0) {
+ of->dh_persist = B_TRUE;
+ } else {
+ op->dh_v2_flags = 0;
+ }
+ }
}
if (op->dh_vers != SMB2_NOT_DURABLE) {
uint32_t msto;
@@ -503,8 +515,11 @@ smb2_create(smb_request_t *sr)
* the default timeout (in mSec.)
*/
msto = op->dh_timeout;
- if (msto == 0)
- msto = smb2_dh_def_timeout;
+ if (msto == 0) {
+ msto = (of->dh_persist) ?
+ smb2_persist_timeout :
+ smb2_dh_def_timeout;
+ }
if (msto > smb2_dh_max_timeout)
msto = smb2_dh_max_timeout;
op->dh_timeout = msto;
@@ -512,6 +527,7 @@ smb2_create(smb_request_t *sr)
}
} else {
op->dh_vers = SMB2_NOT_DURABLE;
+ op->dh_v2_flags = 0;
}
/*
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
index b592dc4c5f..88c4b6d600 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
@@ -979,6 +979,16 @@ cmd_done:
*/
if (!sr->smb2_async && sr->smb2_next_command != 0)
goto cmd_start;
+
+ /*
+ * If we have a durable handle, and this operation updated
+ * the nvlist, write it out (before smb2_send_reply).
+ */
+ if (sr->dh_nvl_dirty) {
+ sr->dh_nvl_dirty = B_FALSE;
+ smb2_dh_update_nvfile(sr);
+ }
+
smb2_send_reply(sr);
if (sr->smb2_async && sr->smb2_next_command != 0) {
MBC_FLUSH(&sr->reply); /* New reply buffer. */
@@ -990,6 +1000,9 @@ cleanup:
if (disconnect)
smb_session_disconnect(session);
+ /*
+ * Do "postwork" for oplock (and maybe other things)
+ */
if (sr->sr_postwork != NULL)
smb2sr_run_postwork(sr);
@@ -1728,6 +1741,16 @@ smb2sr_run_postwork(smb_request_t *top_sr)
default:
ASSERT(0);
}
+
+ /*
+ * If we have a durable handle, and this operation
+ * updated the nvlist, write it out.
+ */
+ if (post_sr->dh_nvl_dirty) {
+ post_sr->dh_nvl_dirty = B_FALSE;
+ smb2_dh_update_nvfile(post_sr);
+ }
+
post_sr->sr_state = SMB_REQ_STATE_COMPLETED;
smb_request_free(post_sr);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_durable.c b/usr/src/uts/common/fs/smbsrv/smb2_durable.c
index 9ba3dd9c07..7b65924ca4 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_durable.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_durable.c
@@ -21,6 +21,7 @@
#include <sys/cmn_err.h>
#include <sys/fcntl.h>
#include <sys/nbmlock.h>
+#include <sys/sid.h>
#include <smbsrv/string.h>
#include <smbsrv/smb_kproto.h>
#include <smbsrv/smb_fsops.h>
@@ -53,6 +54,48 @@ uint32_t smb2_dh_max_timeout = 300 * MILLISEC; /* mSec. */
uint32_t smb2_res_def_timeout = 120 * MILLISEC; /* mSec. */
uint32_t smb2_res_max_timeout = 300 * MILLISEC; /* mSec. */
+uint32_t smb2_persist_timeout = 300 * MILLISEC; /* mSec. */
+
+/* Max. size of the file used to store a CA handle. */
+static uint32_t smb2_dh_max_cah_size = 64 * 1024;
+static uint32_t smb2_ca_info_version = 1;
+
+/*
+ * Want this to have invariant layout on disk, where the
+ * last two uint32_t values are stored as a uint64_t
+ */
+struct nvlk {
+ uint64_t lk_start;
+ uint64_t lk_len;
+ /* (lk_pid << 32) | lk_type */
+#ifdef _BIG_ENDIAN
+ uint32_t lk_pid, lk_type;
+#else
+ uint32_t lk_type, lk_pid;
+#endif
+};
+
+static void smb2_dh_import_share(void *);
+static smb_ofile_t *smb2_dh_import_handle(smb_request_t *, smb_node_t *,
+ uint64_t);
+static int smb2_dh_read_nvlist(smb_request_t *, smb_node_t *, struct nvlist **);
+static int smb2_dh_import_cred(smb_ofile_t *, char *);
+
+#define DH_SN_SIZE 24 /* size of DH stream name buffers */
+/*
+ * Build the stream name used to store a CA handle.
+ * i.e. ":0123456789abcdef:$CA"
+ * Note: smb_fsop_create adds the SUNWsmb prefix,
+ * so we compose the name without the prefix.
+ */
+static inline void
+smb2_dh_make_stream_name(char *buf, size_t buflen, uint64_t id)
+{
+ ASSERT(buflen >= DH_SN_SIZE);
+ (void) snprintf(buf, buflen,
+ ":%016" PRIx64 ":$CA", id);
+}
+
/*
* smb_dh_should_save
*
@@ -80,6 +123,11 @@ uint32_t smb2_res_max_timeout = 300 * MILLISEC; /* mSec. */
* Open.OplockState == Held, and Open.IsDurable is TRUE.
*
* - Open.IsPersistent is TRUE.
+ *
+ * We also deal with some special cases for shutdown of the
+ * server, session, user, tree (in that order). Other than
+ * the cases above, shutdown (or forced termination) should
+ * destroy durable handles.
*/
boolean_t
smb_dh_should_save(smb_ofile_t *of)
@@ -87,12 +135,49 @@ smb_dh_should_save(smb_ofile_t *of)
ASSERT(MUTEX_HELD(&of->f_mutex));
ASSERT(of->dh_vers != SMB2_NOT_DURABLE);
- if (of->f_user->preserve_opens == SMB2_DH_PRESERVE_NONE)
+ /* SMB service shutting down, destroy DH */
+ if (of->f_server->sv_state == SMB_SERVER_STATE_STOPPING)
return (B_FALSE);
- if (of->f_user->preserve_opens == SMB2_DH_PRESERVE_ALL)
+ /*
+ * SMB Session (connection) going away (server up).
+ * If server initiated disconnect, destroy DH
+ * If client initiated disconnect, save all DH.
+ */
+ if (of->f_session->s_state == SMB_SESSION_STATE_TERMINATED)
+ return (B_FALSE);
+ if (of->f_session->s_state == SMB_SESSION_STATE_DISCONNECTED)
return (B_TRUE);
+ /*
+ * SMB User logoff, session still "up".
+ * Action depends on why/how this logoff happened,
+ * determined based on user->preserve_opens
+ */
+ if (of->f_user->u_state == SMB_USER_STATE_LOGGING_OFF) {
+ switch (of->f_user->preserve_opens) {
+ case SMB2_DH_PRESERVE_NONE:
+ /* Server-initiated */
+ return (B_FALSE);
+ case SMB2_DH_PRESERVE_SOME:
+ /* Previous session logoff. */
+ goto preserve_some;
+ case SMB2_DH_PRESERVE_ALL:
+ /* Protocol logoff request */
+ return (B_TRUE);
+ }
+ }
+
+ /*
+ * SMB tree disconnecting (user still logged on)
+ * i.e. when kshare export forces disconnection.
+ */
+ if (of->f_tree->t_state == SMB_TREE_STATE_DISCONNECTING)
+ return (B_FALSE);
+
+preserve_some:
+ /* preserve_opens == SMB2_DH_PRESERVE_SOME */
+
switch (of->dh_vers) {
case SMB2_RESILIENT:
return (B_TRUE);
@@ -116,6 +201,1063 @@ smb_dh_should_save(smb_ofile_t *of)
}
/*
+ * Is this stream name a CA handle? i.e.
+ * ":0123456789abcdef:$CA"
+ */
+static boolean_t
+smb2_dh_match_ca_name(const char *name, uint64_t *idp)
+{
+ static const char suffix[] = ":$CA";
+ u_longlong_t ull;
+ const char *p = name;
+ char *p2 = NULL;
+ int len, rc;
+
+ if (*p++ != ':')
+ return (B_FALSE);
+
+ rc = ddi_strtoull(p, &p2, 16, &ull);
+ if (rc != 0 || p2 != (p + 16))
+ return (B_FALSE);
+ p += 16;
+
+ len = sizeof (suffix) - 1;
+ if (strncmp(p, suffix, len) != 0)
+ return (B_FALSE);
+ p += len;
+
+ if (*p != '\0')
+ return (B_FALSE);
+
+ *idp = (uint64_t)ull;
+ return (B_TRUE);
+}
+
+/*
+ * smb2_dh_new_ca_share
+ *
+ * Called when a new share has ca=true. Find or create the CA dir,
+ * and start a thread to import persistent handles.
+ */
+int
+smb2_dh_new_ca_share(smb_server_t *sv, smb_kshare_t *shr)
+{
+ smb_kshare_t *shr2;
+ smb_request_t *sr;
+
+ ASSERT(STYPE_ISDSK(shr->shr_type));
+
+ /*
+ * Need to lookup the kshare again, to get a hold.
+ * Add a function to just get the hold?
+ */
+ shr2 = smb_kshare_lookup(sv, shr->shr_name);
+ if (shr2 != shr)
+ return (EINVAL);
+
+ sr = smb_request_alloc(sv->sv_session, 0);
+ if (sr == NULL) {
+ /* shutting down? */
+ smb_kshare_release(sv, shr);
+ return (EINTR);
+ }
+ sr->sr_state = SMB_REQ_STATE_SUBMITTED;
+
+ /*
+ * Mark this share as "busy importing persistent handles"
+ * so we can hold off tree connect until that's done.
+ * Will clear and wakeup below.
+ */
+ mutex_enter(&shr->shr_mutex);
+ shr->shr_import_busy = sr;
+ mutex_exit(&shr->shr_mutex);
+
+ /*
+ * Start a taskq job to import any CA handles.
+ * The hold on the kshare is given to this job,
+ * which releases it when it's done.
+ */
+ sr->arg.tcon.si = shr; /* hold from above */
+ (void) taskq_dispatch(
+ sv->sv_worker_pool,
+ smb2_dh_import_share, sr, TQ_SLEEP);
+
+ return (0);
+}
+
+int smb2_dh_import_delay = 0;
+
+static void
+smb2_dh_import_share(void *arg)
+{
+ smb_request_t *sr = arg;
+ smb_kshare_t *shr = sr->arg.tcon.si;
+ smb_node_t *snode;
+ cred_t *kcr = zone_kcred();
+ smb_streaminfo_t *str_info = NULL;
+ uint64_t id;
+ smb_node_t *str_node;
+ smb_odir_t *od = NULL;
+ smb_ofile_t *of;
+ int rc;
+ boolean_t eof;
+
+ sr->sr_state = SMB_REQ_STATE_ACTIVE;
+
+ if (smb2_dh_import_delay > 0)
+ delay(SEC_TO_TICK(smb2_dh_import_delay));
+
+ /*
+ * Borrow the server's "root" user.
+ *
+ * This takes the place of smb_session_lookup_ssnid()
+ * that would happen in smb2_dispatch for a normal SR.
+ * As usual, this hold is released in smb_request_free.
+ */
+ sr->uid_user = sr->sr_server->sv_rootuser;
+ smb_user_hold_internal(sr->uid_user);
+ sr->user_cr = sr->uid_user->u_cred;
+
+ /*
+ * Create a temporary tree connect
+ */
+ sr->arg.tcon.path = shr->shr_name;
+ sr->tid_tree = smb_tree_alloc(sr, shr, shr->shr_root_node,
+ ACE_ALL_PERMS, 0);
+ if (sr->tid_tree == NULL) {
+ cmn_err(CE_NOTE, "smb2_dh_import_share: "
+ "failed connect share <%s>", shr->shr_name);
+ goto out;
+ }
+ snode = sr->tid_tree->t_snode;
+
+ /*
+ * Get the buffers we'll use to read CA handle data.
+ * Stash in sr_request_buf for smb2_dh_import_handle().
+ * Also a buffer for the stream name info.
+ */
+ sr->sr_req_length = smb2_dh_max_cah_size;
+ sr->sr_request_buf = kmem_alloc(sr->sr_req_length, KM_SLEEP);
+ str_info = kmem_alloc(sizeof (smb_streaminfo_t), KM_SLEEP);
+
+ /*
+ * Open the ext. attr dir under the share root and
+ * import CA handles for this share.
+ */
+ if (smb_odir_openat(sr, snode, &od) != 0) {
+ cmn_err(CE_NOTE, "Share [%s] CA import, no xattr dir?",
+ shr->shr_name);
+ goto out;
+ }
+
+ eof = B_FALSE;
+ do {
+ /*
+ * If the kshare gets unshared before we finish,
+ * bail out so we don't hold things up.
+ */
+ if (shr->shr_flags & SMB_SHRF_REMOVED)
+ break;
+
+ /*
+ * Read a stream name and info
+ */
+ rc = smb_odir_read_streaminfo(sr, od, str_info, &eof);
+ if ((rc != 0) || (eof))
+ break;
+
+ /*
+ * Skip anything not a CA handle.
+ */
+ if (!smb2_dh_match_ca_name(str_info->si_name, &id)) {
+ continue;
+ }
+
+ /*
+ * Lookup stream node and import
+ */
+ str_node = NULL;
+ rc = smb_fsop_lookup_name(sr, kcr, SMB_CASE_SENSITIVE,
+ snode, snode, str_info->si_name, &str_node);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "Share [%s] CA import, "
+ "lookup <%s> failed rc=%d",
+ shr->shr_name, str_info->si_name, rc);
+ continue;
+ }
+ of = smb2_dh_import_handle(sr, str_node, id);
+ smb_node_release(str_node);
+ if (of != NULL) {
+ smb_ofile_release(of);
+ of = NULL;
+ }
+ sr->fid_ofile = NULL;
+
+ } while (!eof);
+
+out:
+ if (od != NULL) {
+ smb_odir_close(od);
+ smb_odir_release(od);
+ }
+
+ if (str_info != NULL)
+ kmem_free(str_info, sizeof (smb_streaminfo_t));
+ /* Let smb_request_free clean up sr->sr_request_buf */
+
+ /*
+ * We did a (temporary, internal) tree connect above,
+ * which we need to undo before we return. Note that
+ * smb_request_free will do the final release of
+ * sr->tid_tree, sr->uid_user
+ */
+ if (sr->tid_tree != NULL)
+ smb_tree_disconnect(sr->tid_tree, B_FALSE);
+
+ /*
+ * Wake up any waiting tree connect(s).
+ * See smb_tree_connect_disk().
+ */
+ mutex_enter(&shr->shr_mutex);
+ shr->shr_import_busy = NULL;
+ cv_broadcast(&shr->shr_cv);
+ mutex_exit(&shr->shr_mutex);
+
+ smb_kshare_release(sr->sr_server, shr);
+ smb_request_free(sr);
+}
+
+/*
+ * This returns the new ofile mostly for dtrace.
+ */
+static smb_ofile_t *
+smb2_dh_import_handle(smb_request_t *sr, smb_node_t *str_node,
+ uint64_t persist_id)
+{
+ uint8_t client_uuid[UUID_LEN];
+ smb_tree_t *tree = sr->tid_tree;
+ smb_arg_open_t *op = &sr->arg.open;
+ smb_pathname_t *pn = &op->fqi.fq_path;
+ cred_t *kcr = zone_kcred();
+ struct nvlist *nvl = NULL;
+ char *sidstr = NULL;
+ smb_ofile_t *of = NULL;
+ smb_attr_t *pa;
+ boolean_t did_open = B_FALSE;
+ boolean_t have_lease = B_FALSE;
+ hrtime_t hrt;
+ uint64_t *u64p;
+ uint64_t u64;
+ uint32_t u32;
+ uint32_t status;
+ char *s;
+ uint8_t *u8p;
+ uint_t alen;
+ int rc;
+
+ /*
+ * While we're called with arg.tcon, we now want to use
+ * smb_arg_open for the rest of import, so clear it.
+ */
+ bzero(op, sizeof (*op));
+ op->create_disposition = FILE_OPEN;
+
+ /*
+ * Read and unpack the NVL
+ */
+ rc = smb2_dh_read_nvlist(sr, str_node, &nvl);
+ if (rc != 0)
+ return (NULL);
+
+ /*
+ * Known CA info version?
+ */
+ u32 = 0;
+ rc = nvlist_lookup_uint32(nvl, "info_version", &u32);
+ if (rc != 0 || u32 != smb2_ca_info_version) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) bad vers=%d",
+ tree->t_resource, str_node->od_name, u32);
+ goto errout;
+ }
+
+ /*
+ * The persist ID in the nvlist should match the one
+ * encoded in the file name. (not enforced)
+ */
+ u64 = 0;
+ rc = nvlist_lookup_uint64(nvl, "file_persistid", &u64);
+ if (rc != 0 || u64 != persist_id) {
+ cmn_err(CE_WARN, "CA import (%s/%s) bad id=%016" PRIx64,
+ tree->t_resource, str_node->od_name, u64);
+ /* goto errout? (allow) */
+ }
+
+ /*
+ * Does it belong in the share being imported?
+ */
+ s = NULL;
+ rc = nvlist_lookup_string(nvl, "share_name", &s);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no share_name",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+ if (smb_strcasecmp(s, tree->t_sharename, 0) != 0) {
+ /* Normal (not an error) */
+#ifdef DEBUG
+ cmn_err(CE_NOTE, "CA import (%s/%s) other share",
+ tree->t_resource, str_node->od_name);
+#endif
+ goto errout;
+ }
+
+ /*
+ * Get the path name (for lookup)
+ */
+ rc = nvlist_lookup_string(nvl, "path_name", &pn->pn_path);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no path_name",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * owner sid
+ */
+ rc = nvlist_lookup_string(nvl, "owner_sid", &sidstr);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no owner_sid",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * granted access
+ */
+ rc = nvlist_lookup_uint32(nvl,
+ "granted_access", &op->desired_access);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no granted_access",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * share access
+ */
+ rc = nvlist_lookup_uint32(nvl,
+ "share_access", &op->share_access);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no share_access",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * create options
+ */
+ rc = nvlist_lookup_uint32(nvl,
+ "create_options", &op->create_options);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no create_options",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * create guid (client-assigned)
+ */
+ alen = UUID_LEN;
+ u8p = NULL;
+ rc = nvlist_lookup_uint8_array(nvl, "file_guid", &u8p, &alen);
+ if (rc != 0 || alen != UUID_LEN) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) bad file_guid",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+ bcopy(u8p, op->create_guid, UUID_LEN);
+
+ /*
+ * client uuid (identifies the client)
+ */
+ alen = UUID_LEN;
+ u8p = NULL;
+ rc = nvlist_lookup_uint8_array(nvl, "client_uuid", &u8p, &alen);
+ if (rc != 0 || alen != UUID_LEN) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) no client_uuid",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+ bcopy(u8p, client_uuid, UUID_LEN);
+
+ /*
+ * Lease key (optional)
+ */
+ alen = SMB_LEASE_KEY_SZ;
+ u8p = NULL;
+ rc = nvlist_lookup_uint8_array(nvl, "lease_uuid", &u8p, &alen);
+ if (rc == 0) {
+ bcopy(u8p, op->lease_key, UUID_LEN);
+ (void) nvlist_lookup_uint32(nvl,
+ "lease_state", &op->lease_state);
+ (void) nvlist_lookup_uint16(nvl,
+ "lease_epoch", &op->lease_epoch);
+ (void) nvlist_lookup_uint16(nvl,
+ "lease_version", &op->lease_version);
+ have_lease = B_TRUE;
+ } else {
+ (void) nvlist_lookup_uint32(nvl,
+ "oplock_state", &op->op_oplock_state);
+ }
+
+ /*
+ * Done getting what we need from the NV list.
+ * (re)open the file
+ */
+ status = smb_common_open(sr);
+ if (status != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) open failed 0x%x",
+ tree->t_resource, str_node->od_name, status);
+ (void) smb_node_set_delete_on_close(str_node, kcr, 0);
+ goto errout;
+ }
+ of = sr->fid_ofile;
+ did_open = B_TRUE;
+
+ /*
+ * Now restore the rest of the SMB2 level state.
+ * See smb2_create after smb_common_open
+ */
+
+ /*
+ * Setup of->f_cr with owner SID
+ */
+ rc = smb2_dh_import_cred(of, sidstr);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) import cred failed",
+ tree->t_resource, str_node->od_name);
+ goto errout;
+ }
+
+ /*
+ * Use the persist ID we previously assigned.
+ * Like smb_ofile_set_persistid_ph()
+ */
+ rc = smb_ofile_insert_persistid(of, persist_id);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) "
+ "insert_persistid rc=%d",
+ tree->t_resource, str_node->od_name, rc);
+ goto errout;
+ }
+
+ /*
+ * Like smb2_lease_create()
+ *
+ * Lease state is stored in each persistent handle, but
+ * only one handle has the state we want. As we import
+ * each handle, "upgrade" the lease if the handle we're
+ * importing has a "better" lease state (higher epoch or
+ * more cache rights). After all handles are imported,
+ * that will get the lease to the right state.
+ */
+ if (have_lease) {
+ smb_lease_t *ls;
+ status = smb2_lease_create(sr, client_uuid);
+ if (status != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) get lease 0x%x",
+ tree->t_resource, str_node->od_name, status);
+ goto errout;
+ }
+ ls = of->f_lease;
+
+ /* Use most current "epoch". */
+ mutex_enter(&ls->ls_mutex);
+ if (ls->ls_epoch < op->lease_epoch)
+ ls->ls_epoch = op->lease_epoch;
+ mutex_exit(&ls->ls_mutex);
+
+ /*
+ * Get the lease (and oplock)
+ * uses op->lease_state
+ */
+ op->op_oplock_level = SMB2_OPLOCK_LEVEL_LEASE;
+ smb2_lease_acquire(sr);
+
+ } else {
+ /*
+ * No lease; maybe get an oplock
+ * uses: op->op_oplock_level
+ */
+ if (op->op_oplock_state & OPLOCK_LEVEL_BATCH) {
+ op->op_oplock_level = SMB2_OPLOCK_LEVEL_BATCH;
+ } else if (op->op_oplock_state & OPLOCK_LEVEL_ONE) {
+ op->op_oplock_level = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ } else if (op->op_oplock_state & OPLOCK_LEVEL_TWO) {
+ op->op_oplock_level = SMB2_OPLOCK_LEVEL_II;
+ } else {
+ op->op_oplock_level = SMB2_OPLOCK_LEVEL_NONE;
+ }
+ smb2_oplock_acquire(sr);
+ }
+
+ /*
+ * Byte range locks
+ */
+ alen = 0;
+ u64p = NULL;
+ if (nvlist_lookup_uint64_array(nvl, "locks", &u64p, &alen) == 0) {
+ uint_t i;
+ uint_t nlocks = alen / 3;
+ struct nvlk *nlp;
+
+ nlp = (struct nvlk *)u64p;
+ for (i = 0; i < nlocks; i++) {
+ status = smb_lock_range(
+ sr,
+ nlp->lk_start,
+ nlp->lk_len,
+ nlp->lk_pid,
+ nlp->lk_type,
+ 0);
+ if (status != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) "
+ "get lock %d failed 0x%x",
+ tree->t_resource,
+ str_node->od_name,
+ i, status);
+ }
+ nlp++;
+ }
+ }
+ alen = SMB_OFILE_LSEQ_MAX;
+ u8p = NULL;
+ if (nvlist_lookup_uint8_array(nvl, "lockseq", &u8p, &alen) == 0) {
+ if (alen != SMB_OFILE_LSEQ_MAX) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) "
+ "get lockseq bad len=%d",
+ tree->t_resource,
+ str_node->od_name,
+ alen);
+ } else {
+ mutex_enter(&of->f_mutex);
+ bcopy(u8p, of->f_lock_seq, alen);
+ mutex_exit(&of->f_mutex);
+ }
+ }
+
+ /*
+ * Optional "sticky" times (set pending attributes)
+ */
+ mutex_enter(&of->f_mutex);
+ pa = &of->f_pending_attr;
+ if (nvlist_lookup_hrtime(nvl, "atime", &hrt) == 0) {
+ hrt2ts(hrt, &pa->sa_vattr.va_atime);
+ pa->sa_mask |= SMB_AT_ATIME;
+ }
+ if (nvlist_lookup_hrtime(nvl, "mtime", &hrt) == 0) {
+ hrt2ts(hrt, &pa->sa_vattr.va_mtime);
+ pa->sa_mask |= SMB_AT_MTIME;
+ }
+ if (nvlist_lookup_hrtime(nvl, "ctime", &hrt) == 0) {
+ hrt2ts(hrt, &pa->sa_vattr.va_ctime);
+ pa->sa_mask |= SMB_AT_CTIME;
+ }
+ mutex_exit(&of->f_mutex);
+
+ /*
+ * Make durable and persistent.
+ * See smb2_dh_make_persistent()
+ */
+ of->dh_vers = SMB2_DURABLE_V2;
+ bcopy(op->create_guid, of->dh_create_guid, UUID_LEN);
+ of->dh_persist = B_TRUE;
+ of->dh_nvfile = str_node;
+ smb_node_ref(str_node);
+ of->dh_nvlist = nvl;
+ nvl = NULL;
+
+ /*
+ * Now make it state orphaned...
+ * See smb_ofile_drop(), then
+ * smb_ofile_save_dh()
+ */
+ mutex_enter(&of->f_mutex);
+ of->f_state = SMB_OFILE_STATE_SAVE_DH;
+ of->dh_timeout_offset = MSEC2NSEC(smb2_persist_timeout);
+ mutex_exit(&of->f_mutex);
+
+ /*
+ * Finished!
+ */
+ return (of);
+
+errout:
+ if (did_open) {
+ smb_ofile_close(of, 0);
+ smb_ofile_release(of);
+ } else {
+ ASSERT(of == NULL);
+ }
+
+ if (nvl != NULL)
+ nvlist_free(nvl);
+
+ return (NULL);
+}
+
+static int
+smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node,
+ struct nvlist **nvlpp)
+{
+ smb_attr_t attr;
+ iovec_t iov;
+ uio_t uio;
+ smb_kshare_t *shr = sr->arg.tcon.si;
+ cred_t *kcr = zone_kcred();
+ size_t flen;
+ int rc;
+
+ bzero(&attr, sizeof (attr));
+ attr.sa_mask = SMB_AT_SIZE;
+ rc = smb_node_getattr(NULL, node, kcr, NULL, &attr);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) getattr rc=%d",
+ shr->shr_path, node->od_name, rc);
+ return (rc);
+ }
+
+ if (attr.sa_vattr.va_size < 4 ||
+ attr.sa_vattr.va_size > sr->sr_req_length) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) bad size=%" PRIu64,
+ shr->shr_path, node->od_name,
+ (uint64_t)attr.sa_vattr.va_size);
+ return (EINVAL);
+ }
+ flen = (size_t)attr.sa_vattr.va_size;
+
+ bzero(&uio, sizeof (uio));
+ iov.iov_base = sr->sr_request_buf;
+ iov.iov_len = flen;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_resid = flen;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_extflg = UIO_COPY_DEFAULT;
+ rc = smb_fsop_read(sr, kcr, node, NULL, &uio);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) read, rc=%d",
+ shr->shr_path, node->od_name, rc);
+ return (rc);
+ }
+ if (uio.uio_resid != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) short read",
+ shr->shr_path, node->od_name);
+ return (EIO);
+ }
+
+ rc = nvlist_unpack(sr->sr_request_buf, flen, nvlpp, KM_SLEEP);
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "CA import (%s/%s) unpack, rc=%d",
+ shr->shr_path, node->od_name, rc);
+ return (rc);
+ }
+
+ return (0);
+}
+
+/*
+ * Setup a vestigial credential in of->f_cr just good enough for
+ * smb_is_same_user to determine if the caller owned this ofile.
+ * At reconnect, of->f_cr will be replaced with the caller's.
+ */
+static int
+smb2_dh_import_cred(smb_ofile_t *of, char *sidstr)
+{
+#ifdef _FAKE_KERNEL
+ _NOTE(ARGUNUSED(sidstr))
+ /* fksmbd doesn't have real credentials. */
+ of->f_cr = CRED();
+ crhold(of->f_cr);
+#else
+ char tmpstr[SMB_SID_STRSZ];
+ ksid_t ksid;
+ cred_t *cr, *oldcr;
+ int rc;
+
+ (void) strlcpy(tmpstr, sidstr, sizeof (tmpstr));
+ bzero(&ksid, sizeof (ksid));
+
+ rc = smb_sid_splitstr(tmpstr, &ksid.ks_rid);
+ if (rc != 0)
+ return (rc);
+ cr = crget();
+
+ ksid.ks_domain = ksid_lookupdomain(tmpstr);
+ crsetsid(cr, &ksid, KSID_USER);
+ ksiddomain_hold(ksid.ks_domain);
+ crsetsid(cr, &ksid, KSID_OWNER);
+
+ /*
+ * Just to avoid leaving the KSID_GROUP slot NULL,
+ * put the "everyone" SID there (S-1-1-0).
+ */
+ ksid.ks_domain = ksid_lookupdomain("S-1-1");
+ ksid.ks_rid = 0;
+ crsetsid(cr, &ksid, KSID_GROUP);
+
+ oldcr = of->f_cr;
+ of->f_cr = cr;
+ if (oldcr != NULL)
+ crfree(oldcr);
+#endif
+
+ return (0);
+}
+
+/*
+ * Set Delete-on-Close (DoC) on the persistent state file so it will be
+ * removed when the last ref. goes away (in smb2_dh_close_persistent).
+ *
+ * This is called in just two places:
+ * (1) SMB2_close request -- client tells us to destroy the handle.
+ * (2) smb2_dh_expire -- client has forgotten about this handle.
+ * All other (server-initiated) close calls should leave these
+ * persistent state files in the file system.
+ */
+void
+smb2_dh_setdoc_persistent(smb_ofile_t *of)
+{
+ smb_node_t *strnode;
+ uint32_t status;
+
+ mutex_enter(&of->dh_nvlock);
+ if ((strnode = of->dh_nvfile) != NULL)
+ smb_node_ref(strnode);
+ mutex_exit(&of->dh_nvlock);
+
+ if (strnode != NULL) {
+ status = smb_node_set_delete_on_close(strnode,
+ zone_kcred(), SMB_CASE_SENSITIVE);
+ if (status != 0) {
+ cmn_err(CE_WARN, "Can't set DoC on CA file: %s",
+ strnode->od_name);
+ DTRACE_PROBE1(rm__ca__err, smb_ofile_t *, of);
+ }
+ smb_node_release(strnode);
+ }
+}
+
+/*
+ * During ofile close, free the persistent handle state nvlist and
+ * drop our reference to the state file node (which may unlink it
+ * if smb2_dh_setdoc_persistent was called).
+ */
+void
+smb2_dh_close_persistent(smb_ofile_t *of)
+{
+ smb_node_t *strnode;
+ struct nvlist *nvl;
+
+ /*
+ * Clear out nvlist and stream linkage
+ */
+ mutex_enter(&of->dh_nvlock);
+ strnode = of->dh_nvfile;
+ of->dh_nvfile = NULL;
+ nvl = of->dh_nvlist;
+ of->dh_nvlist = NULL;
+ mutex_exit(&of->dh_nvlock);
+
+ if (nvl != NULL)
+ nvlist_free(nvl);
+
+ if (strnode != NULL)
+ smb_node_release(strnode);
+}
+
+/*
+ * Make this durable handle persistent.
+ * If we succeed, set of->dh_persist = TRUE.
+ */
+int
+smb2_dh_make_persistent(smb_request_t *sr, smb_ofile_t *of)
+{
+ char fname[DH_SN_SIZE];
+ char sidstr[SMB_SID_STRSZ];
+ smb_attr_t attr;
+ smb_arg_open_t *op = &sr->arg.open;
+ cred_t *kcr = zone_kcred();
+ smb_node_t *dnode = of->f_tree->t_snode;
+ smb_node_t *fnode = NULL;
+ ksid_t *ksid;
+ int rc;
+
+ ASSERT(of->dh_nvfile == NULL);
+
+ /*
+ * Create the persistent handle nvlist file.
+ * It's a named stream in the share root.
+ */
+ smb2_dh_make_stream_name(fname, sizeof (fname), of->f_persistid);
+
+ bzero(&attr, sizeof (attr));
+ attr.sa_mask = SMB_AT_TYPE | SMB_AT_MODE | SMB_AT_SIZE;
+ attr.sa_vattr.va_type = VREG;
+ attr.sa_vattr.va_mode = 0640;
+ attr.sa_vattr.va_size = 4;
+ rc = smb_fsop_create(sr, kcr, dnode, fname, &attr, &fnode);
+ if (rc != 0)
+ return (rc);
+
+ mutex_enter(&of->dh_nvlock);
+
+ /* fnode is held. rele in smb2_dh_close_persistent */
+ of->dh_nvfile = fnode;
+ (void) nvlist_alloc(&of->dh_nvlist, NV_UNIQUE_NAME, KM_SLEEP);
+
+ /*
+ * Want the ksid as a string
+ */
+ ksid = crgetsid(of->f_user->u_cred, KSID_USER);
+ (void) snprintf(sidstr, sizeof (sidstr), "%s-%u",
+ ksid->ks_domain->kd_name, ksid->ks_rid);
+
+ /*
+ * Fill in the fixed parts of the nvlist
+ */
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "info_version", smb2_ca_info_version);
+ (void) nvlist_add_string(of->dh_nvlist,
+ "owner_sid", sidstr);
+ (void) nvlist_add_string(of->dh_nvlist,
+ "share_name", of->f_tree->t_sharename);
+ (void) nvlist_add_uint64(of->dh_nvlist,
+ "file_persistid", of->f_persistid);
+ (void) nvlist_add_uint8_array(of->dh_nvlist,
+ "file_guid", of->dh_create_guid, UUID_LEN);
+ (void) nvlist_add_string(of->dh_nvlist,
+ "client_ipaddr", sr->session->ip_addr_str);
+ (void) nvlist_add_uint8_array(of->dh_nvlist,
+ "client_uuid", sr->session->clnt_uuid, UUID_LEN);
+ (void) nvlist_add_string(of->dh_nvlist,
+ "path_name", op->fqi.fq_path.pn_path);
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "granted_access", of->f_granted_access);
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "share_access", of->f_share_access);
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "create_options", of->f_create_options);
+ if (of->f_lease != NULL) {
+ smb_lease_t *ls = of->f_lease;
+ (void) nvlist_add_uint8_array(of->dh_nvlist,
+ "lease_uuid", ls->ls_key, 16);
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "lease_state", ls->ls_state);
+ (void) nvlist_add_uint16(of->dh_nvlist,
+ "lease_epoch", ls->ls_epoch);
+ (void) nvlist_add_uint16(of->dh_nvlist,
+ "lease_version", ls->ls_version);
+ } else {
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "oplock_state", of->f_oplock.og_state);
+ }
+ mutex_exit(&of->dh_nvlock);
+
+ smb2_dh_update_locks(sr, of);
+
+ /* Tell sr update nvlist file */
+ sr->dh_nvl_dirty = B_TRUE;
+
+ return (0);
+}
+
+void
+smb2_dh_update_nvfile(smb_request_t *sr)
+{
+ smb_attr_t attr;
+ iovec_t iov;
+ uio_t uio;
+ smb_ofile_t *of = sr->fid_ofile;
+ cred_t *kcr = zone_kcred();
+ char *buf = NULL;
+ size_t buflen = 0;
+ uint32_t wcnt;
+ int rc;
+
+ if (of == NULL || of->dh_persist == B_FALSE)
+ return;
+
+ mutex_enter(&of->dh_nvlock);
+ if (of->dh_nvlist == NULL || of->dh_nvfile == NULL) {
+ mutex_exit(&of->dh_nvlock);
+ return;
+ }
+
+ rc = nvlist_size(of->dh_nvlist, &buflen, NV_ENCODE_XDR);
+ if (rc != 0)
+ goto out;
+ buf = kmem_zalloc(buflen, KM_SLEEP);
+
+ rc = nvlist_pack(of->dh_nvlist, &buf, &buflen,
+ NV_ENCODE_XDR, KM_SLEEP);
+ if (rc != 0)
+ goto out;
+
+ bzero(&attr, sizeof (attr));
+ attr.sa_mask = SMB_AT_SIZE;
+ attr.sa_vattr.va_size = buflen;
+ rc = smb_node_setattr(sr, of->dh_nvfile, kcr, NULL, &attr);
+ if (rc != 0)
+ goto out;
+
+ bzero(&uio, sizeof (uio));
+ iov.iov_base = (void *) buf;
+ iov.iov_len = buflen;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_resid = buflen;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_extflg = UIO_COPY_DEFAULT;
+ rc = smb_fsop_write(sr, kcr, of->dh_nvfile,
+ NULL, &uio, &wcnt, 0);
+ if (rc == 0 && wcnt != buflen)
+ rc = EIO;
+
+out:
+ mutex_exit(&of->dh_nvlock);
+
+ if (rc != 0) {
+ cmn_err(CE_WARN,
+ "clnt(%s) failed to update persistent handle, rc=%d",
+ sr->session->ip_addr_str, rc);
+ }
+
+ if (buf != NULL) {
+ kmem_free(buf, buflen);
+ }
+}
+
+/*
+ * Called after f_oplock (and lease) changes
+ * If lease, update: lease_state, lease_epoch
+ * else (oplock) update: oplock_state
+ */
+void
+smb2_dh_update_oplock(smb_request_t *sr, smb_ofile_t *of)
+{
+ smb_lease_t *ls;
+
+ mutex_enter(&of->dh_nvlock);
+ if (of->dh_nvlist == NULL) {
+ mutex_exit(&of->dh_nvlock);
+ return;
+ }
+
+ if (of->f_lease != NULL) {
+ ls = of->f_lease;
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "lease_state", ls->ls_state);
+ (void) nvlist_add_uint16(of->dh_nvlist,
+ "lease_epoch", ls->ls_epoch);
+ } else {
+ (void) nvlist_add_uint32(of->dh_nvlist,
+ "oplock_state", of->f_oplock.og_state);
+ }
+ mutex_exit(&of->dh_nvlock);
+
+ sr->dh_nvl_dirty = B_TRUE;
+}
+
+/*
+ * Save locks from this ofile as an array of uint64_t, where the
+ * elements are triplets: (start, length, (pid << 32) | type)
+ * Note pid should always be zero for SMB2, so we could use
+ * that 32-bit spot for something else if needed.
+ */
+void
+smb2_dh_update_locks(smb_request_t *sr, smb_ofile_t *of)
+{
+ uint8_t lseq[SMB_OFILE_LSEQ_MAX];
+ smb_node_t *node = of->f_node;
+ smb_llist_t *llist = &node->n_lock_list;
+ size_t vec_sz; // storage size
+ uint_t my_cnt = 0;
+ uint64_t *vec = NULL;
+ struct nvlk *nlp;
+ smb_lock_t *lock;
+
+ smb_llist_enter(llist, RW_READER);
+ vec_sz = (llist->ll_count + 1) * sizeof (struct nvlk);
+ vec = kmem_alloc(vec_sz, KM_SLEEP);
+ nlp = (struct nvlk *)vec;
+ for (lock = smb_llist_head(llist);
+ lock != NULL;
+ lock = smb_llist_next(llist, lock)) {
+ if (lock->l_file != of)
+ continue;
+ nlp->lk_start = lock->l_start;
+ nlp->lk_len = lock->l_length;
+ nlp->lk_pid = lock->l_pid;
+ nlp->lk_type = lock->l_type;
+ nlp++;
+ my_cnt++;
+ }
+ smb_llist_exit(llist);
+
+ mutex_enter(&of->f_mutex);
+ bcopy(of->f_lock_seq, lseq, sizeof (lseq));
+ mutex_exit(&of->f_mutex);
+
+ mutex_enter(&of->dh_nvlock);
+ if (of->dh_nvlist != NULL) {
+
+ (void) nvlist_add_uint64_array(of->dh_nvlist,
+ "locks", vec, my_cnt * 3);
+
+ (void) nvlist_add_uint8_array(of->dh_nvlist,
+ "lockseq", lseq, sizeof (lseq));
+ }
+ mutex_exit(&of->dh_nvlock);
+
+ kmem_free(vec, vec_sz);
+
+ sr->dh_nvl_dirty = B_TRUE;
+}
+
+/*
+ * Save "sticky" times
+ */
+void
+smb2_dh_update_times(smb_request_t *sr, smb_ofile_t *of, smb_attr_t *attr)
+{
+ hrtime_t t;
+
+ mutex_enter(&of->dh_nvlock);
+ if (of->dh_nvlist == NULL) {
+ mutex_exit(&of->dh_nvlock);
+ return;
+ }
+
+ if (attr->sa_mask & SMB_AT_ATIME) {
+ t = ts2hrt(&attr->sa_vattr.va_atime);
+ (void) nvlist_add_hrtime(of->dh_nvlist, "atime", t);
+ }
+ if (attr->sa_mask & SMB_AT_MTIME) {
+ t = ts2hrt(&attr->sa_vattr.va_mtime);
+ (void) nvlist_add_hrtime(of->dh_nvlist, "mtime", t);
+ }
+ if (attr->sa_mask & SMB_AT_CTIME) {
+ t = ts2hrt(&attr->sa_vattr.va_ctime);
+ (void) nvlist_add_hrtime(of->dh_nvlist, "ctime", t);
+ }
+ mutex_exit(&of->dh_nvlock);
+
+ sr->dh_nvl_dirty = B_TRUE;
+}
+
+
+/*
* Requirements for ofile found during reconnect (MS-SMB2 3.3.5.9.7):
* - security descriptor must match provided descriptor
*
@@ -332,6 +1474,8 @@ smb2_dh_expire(void *arg)
{
smb_ofile_t *of = (smb_ofile_t *)arg;
+ if (of->dh_persist)
+ smb2_dh_setdoc_persistent(of);
smb_ofile_close(of, 0);
smb_ofile_release(of);
}
@@ -383,9 +1527,96 @@ smb2_durable_timers(smb_server_t *sv)
}
/*
+ * This is called when we're about to add a new open to some node.
+ * If we still have orphaned durable handles on this node, let's
+ * assume the client has lost interest in those and close them;
+ * otherwise we might conflict with our own orphaned handles.
+ *
+ * We need this because we import persistent handles "speculatively"
+ * during share import (before the client ever asks for reconnect).
+ * That allows us to avoid any need for a "create blackout" (or
+ * "grace period") because the imported handles prevent unwanted
+ * conflicting opens from other clients. However, if some client
+ * "forgets" about a persistent handle (*cough* Hyper-V) and tries
+ * a new (conflicting) open instead of a reconnect, that might
+ * fail unless we expire our orphaned durable handles first.
+ *
+ * Logic similar to smb_node_open_check()
+ */
+void
+smb2_dh_close_my_orphans(smb_request_t *sr, smb_ofile_t *new_of)
+{
+ smb_node_t *node = new_of->f_node;
+ smb_ofile_t *of;
+
+ SMB_NODE_VALID(node);
+
+ smb_llist_enter(&node->n_ofile_list, RW_READER);
+ for (of = smb_llist_head(&node->n_ofile_list);
+ of != NULL;
+ of = smb_llist_next(&node->n_ofile_list, of)) {
+
+ /* Same client? */
+ if (of->f_lease != NULL &&
+ bcmp(sr->session->clnt_uuid,
+ of->f_lease->ls_clnt, 16) != 0)
+ continue;
+
+ if (!smb_is_same_user(sr->user_cr, of->f_cr))
+ continue;
+
+ mutex_enter(&of->f_mutex);
+ if (of->f_state == SMB_OFILE_STATE_ORPHANED) {
+ of->f_state = SMB_OFILE_STATE_EXPIRED;
+ /* inline smb_ofile_hold_internal() */
+ of->f_refcnt++;
+ smb_llist_post(&node->n_ofile_list,
+ of, smb2_dh_expire);
+ }
+ mutex_exit(&of->f_mutex);
+ }
+
+ smb_llist_exit(&node->n_ofile_list);
+}
+
+/*
+ * Called for each orphaned DH during shutdown.
+ * Clean out any in-memory state, but leave any
+ * on-disk persistent handle state in place.
+ */
+static void
+smb2_dh_cleanup(void *arg)
+{
+ smb_ofile_t *of = (smb_ofile_t *)arg;
+ smb_node_t *strnode;
+ struct nvlist *nvl;
+
+ /*
+ * Intentionally skip smb2_dh_close_persistent by
+ * clearing dh_nvfile before smb_ofile_close().
+ */
+ mutex_enter(&of->dh_nvlock);
+ strnode = of->dh_nvfile;
+ of->dh_nvfile = NULL;
+ nvl = of->dh_nvlist;
+ of->dh_nvlist = NULL;
+ mutex_exit(&of->dh_nvlock);
+
+ if (nvl != NULL)
+ nvlist_free(nvl);
+
+ if (strnode != NULL)
+ smb_node_release(strnode);
+
+ smb_ofile_close(of, 0);
+ smb_ofile_release(of);
+}
+
+/*
* Clean out durable handles during shutdown.
- * Like, smb2_durable_timers but expire all,
- * and make sure the hash buckets are empty.
+ *
+ * Like smb2_durable_timers, but clean up only the in-memory state
+ * and leave any persistent state in place for later reconnect.
*/
void
smb2_dh_shutdown(smb_server_t *sv)
@@ -410,7 +1641,7 @@ smb2_dh_shutdown(smb_server_t *sv)
of->f_state = SMB_OFILE_STATE_EXPIRED;
/* inline smb_ofile_hold_internal() */
of->f_refcnt++;
- smb_llist_post(bucket, of, smb2_dh_expire);
+ smb_llist_post(bucket, of, smb2_dh_cleanup);
break;
default:
break;
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lease.c b/usr/src/uts/common/fs/smbsrv/smb2_lease.c
index d2bf4805b3..95d7d9c7f1 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_lease.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_lease.c
@@ -122,11 +122,10 @@ smb_hash_uuid(const uint8_t *uuid)
* Handling the SMB2_CREATE_REQUEST_LEASE Create Context
*/
uint32_t
-smb2_lease_create(smb_request_t *sr)
+smb2_lease_create(smb_request_t *sr, uint8_t *clnt)
{
smb_arg_open_t *op = &sr->arg.open;
uint8_t *key = op->lease_key;
- uint8_t *clnt = sr->session->clnt_uuid;
smb_ofile_t *of = sr->fid_ofile;
smb_hash_t *ht = sr->sr_server->sv_lease_ht;
smb_llist_t *bucket;
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lock.c b/usr/src/uts/common/fs/smbsrv/smb2_lock.c
index c6e8236cce..cc05f96e75 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_lock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_lock.c
@@ -142,6 +142,10 @@ smb2_lock(smb_request_t *sr)
status = smb2_locks(sr);
}
+ if (sr->fid_ofile->dh_persist) {
+ smb2_dh_update_locks(sr, sr->fid_ofile);
+ }
+
errout:
sr->smb2_status = status;
DTRACE_SMB2_DONE(op__Lock, smb_request_t *, sr);
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
index cbdd5f9fb5..5bc7b01260 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
@@ -26,8 +26,12 @@ uint32_t smb2srv_capabilities =
SMB2_CAP_DFS |
SMB2_CAP_LEASING |
SMB2_CAP_LARGE_MTU |
+ SMB2_CAP_PERSISTENT_HANDLES |
SMB2_CAP_ENCRYPTION;
+/* These are the only capabilities defined for SMB2.X */
+#define SMB_2X_CAPS (SMB2_CAP_DFS | SMB2_CAP_LEASING | SMB2_CAP_LARGE_MTU)
+
/*
* These are not intended as customer tunables, but dev. & test folks
* might want to adjust them (with caution).
@@ -350,16 +354,26 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
/*
* [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request
*
- * Only set CAP_ENCRYPTION if this is 3.0 or 3.0.2 and
- * the client has it set.
+ * The SMB2.x capabilities are returned without regard for
+ * what capabilities the client provided in the request.
+ * The SMB3.x capabilities returned are the traditional
+ * logical AND of server and client capabilities.
+ *
+ * One additional check: If KCF is missing something we
+ * require for encryption, turn off that capability.
*/
-
- if (s->dialect < SMB_VERS_3_0 ||
- !SMB3_CLIENT_ENCRYPTS(sr) ||
- smb3_encrypt_init_mech(s) != 0)
- s->srv_cap = smb2srv_capabilities & ~SMB2_CAP_ENCRYPTION;
- else
- s->srv_cap = smb2srv_capabilities;
+ if (s->dialect < SMB_VERS_3_0) {
+ /* SMB 2.x */
+ s->srv_cap = smb2srv_capabilities & SMB_2X_CAPS;
+ } else {
+ /* SMB 3.0 or later */
+ s->srv_cap = smb2srv_capabilities &
+ (SMB_2X_CAPS | s->capabilities);
+ if ((s->srv_cap & SMB2_CAP_ENCRYPTION) != 0 &&
+ smb3_encrypt_init_mech(s) != 0) {
+ s->srv_cap &= ~SMB2_CAP_ENCRYPTION;
+ }
+ }
/*
* See notes above smb2_max_rwsize, smb2_old_rwsize
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c b/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c
index e11a8855f7..34a74f564b 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c
@@ -19,6 +19,8 @@
#include <smbsrv/smb2_kproto.h>
+#define SMB2_SHARE_CAP_CA SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY
+
smb_sdrc_t
smb2_tree_connect(smb_request_t *sr)
{
@@ -114,6 +116,10 @@ smb2_tree_connect(smb_request_t *sr)
ShareFlags = 0;
Capabilities = 0;
+ if ((tree->t_flags & SMB_TREE_DFSROOT) != 0)
+ Capabilities |= SMB2_SHARE_CAP_DFS;
+ if ((tree->t_flags & SMB_TREE_CA) != 0)
+ Capabilities |= SMB2_SHARE_CAP_CA;
/*
* SMB2 Tree Connect reply
diff --git a/usr/src/uts/common/fs/smbsrv/smb_common_open.c b/usr/src/uts/common/fs/smbsrv/smb_common_open.c
index 161f2790f6..0ef06a3c3e 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_common_open.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_common_open.c
@@ -40,9 +40,6 @@
int smb_session_ofile_max = 32768;
-static volatile uint32_t smb_fids = 0;
-#define SMB_UNIQ_FID() atomic_inc_32_nv(&smb_fids)
-
extern uint32_t smb_is_executable(char *);
static void smb_delete_new_object(smb_request_t *);
static int smb_set_open_attributes(smb_request_t *, smb_ofile_t *);
@@ -280,6 +277,7 @@ smb_common_open(smb_request_t *sr)
boolean_t fnode_shrlk = B_FALSE;
boolean_t did_open = B_FALSE;
boolean_t did_break_handle = B_FALSE;
+ boolean_t did_cleanup_orphans = B_FALSE;
/* Get out now if we've been cancelled. */
mutex_enter(&sr->sr_mutex);
@@ -350,10 +348,9 @@ smb_common_open(smb_request_t *sr)
/*
* Most of IPC open is handled in smb_opipe_open()
*/
- uniq_fid = SMB_UNIQ_FID();
op->create_options = 0;
of = smb_ofile_alloc(sr, op, NULL, SMB_FTYPE_MESG_PIPE,
- tree_fid, uniq_fid);
+ tree_fid);
tree_fid = 0; // given to the ofile
status = smb_opipe_open(sr, of);
smb_threshold_exit(&sv->sv_opipe_ct);
@@ -450,13 +447,6 @@ smb_common_open(smb_request_t *sr)
goto errout;
}
- /*
- * The uniq_fid is a CIFS-server-wide unique identifier for an ofile
- * which is used to uniquely identify open instances for the
- * VFS share reservation and POSIX locks.
- */
- uniq_fid = SMB_UNIQ_FID();
-
if (last_comp_found) {
smb_node_unlock(dnode);
@@ -584,10 +574,14 @@ smb_common_open(smb_request_t *sr)
* affect the sharing checks, and may delete the file due to
* DELETE_ON_CLOSE. This may block, so set the file opening
* count before oplock stuff.
+ *
+ * Need the "proposed" ofile (and its TargetOplockKey) for
+ * correct oplock break semantics.
*/
of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK,
- tree_fid, uniq_fid);
+ tree_fid);
tree_fid = 0; // given to the ofile
+ uniq_fid = of->f_uniqid;
smb_node_inc_opening_count(fnode);
opening_incr = B_TRUE;
@@ -683,6 +677,22 @@ smb_common_open(smb_request_t *sr)
}
/*
+ * If we still have orphaned durable handles on this file,
+ * let's assume the client has lost interest in those and
+ * close them so they don't cause sharing violations.
+ * See longer comment at smb2_dh_close_my_orphans().
+ */
+ if (status == NT_STATUS_SHARING_VIOLATION &&
+ sr->session->dialect >= SMB_VERS_2_BASE &&
+ did_cleanup_orphans == B_FALSE) {
+
+ did_cleanup_orphans = B_TRUE;
+ smb2_dh_close_my_orphans(sr, of);
+
+ goto shrlock_again;
+ }
+
+ /*
* SMB1 expects a 1 sec. delay before returning a
* sharing violation error. If breaking oplocks
* above took less than a sec, wait some more.
@@ -904,27 +914,17 @@ create:
goto errout;
}
+ /* Create done. */
smb_node_unlock(dnode);
dnode_wlock = B_FALSE;
created = B_TRUE;
op->action_taken = SMB_OACT_CREATED;
+ /* Note: hold from create */
fnode = op->fqi.fq_fnode;
fnode_held = B_TRUE;
- smb_node_inc_opening_count(fnode);
- opening_incr = B_TRUE;
-
- smb_node_wrlock(fnode);
- fnode_wlock = B_TRUE;
-
- status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid,
- op->desired_access, op->share_access);
- if (status != 0)
- goto errout;
- fnode_shrlk = B_TRUE;
-
if (max_requested) {
smb_fsop_eaccess(sr, sr->user_cr, fnode, &max_allowed);
op->desired_access |= max_allowed;
@@ -937,6 +937,27 @@ create:
*/
op->desired_access |= (READ_CONTROL | FILE_READ_ATTRIBUTES);
+ /* Allocate the ofile and fill in most of it. */
+ of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK,
+ tree_fid);
+ tree_fid = 0; // given to the ofile
+ uniq_fid = of->f_uniqid;
+
+ smb_node_inc_opening_count(fnode);
+ opening_incr = B_TRUE;
+
+ /*
+ * Share access checks...
+ */
+ smb_node_wrlock(fnode);
+ fnode_wlock = B_TRUE;
+
+ status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid,
+ op->desired_access, op->share_access);
+ if (status != 0)
+ goto errout;
+ fnode_shrlk = B_TRUE;
+
/*
* MS-FSA 2.1.5.1.1
* If the Oplock member of the DirectoryStream in
@@ -951,9 +972,6 @@ create:
*
* The break never blocks, so ignore the return.
*/
- of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK,
- tree_fid, uniq_fid);
- tree_fid = 0; // given to the ofile
(void) smb_oplock_break_PARENT(dnode, of);
}
@@ -1052,8 +1070,9 @@ create:
errout:
if (did_open) {
smb_ofile_close(of, 0);
- /* Don't also ofile_free */
+ /* rele via sr->fid_ofile */
} else if (of != NULL) {
+ /* No other refs possible */
smb_ofile_free(of);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_cred.c b/usr/src/uts/common/fs/smbsrv/smb_cred.c
index f47f5e72a5..8431db4653 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_cred.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_cred.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -172,3 +172,19 @@ smb_cred_set_sidlist(smb_ids_t *token_grps)
return (lp);
}
+
+/*
+ * Special variant of smb_cred_create() used when we need an
+ * SMB kcred (e.g. DH import). The returned cred must be
+ * from crget() so it can be passed to smb_user_setcred().
+ */
+cred_t *
+smb_kcred_create(void)
+{
+ cred_t *cr;
+
+ cr = crget();
+ ASSERT(cr != NULL);
+
+ return (cr);
+}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_fsops.c b/usr/src/uts/common/fs/smbsrv/smb_fsops.c
index 6aa4074221..1b7c3a9fa9 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_fsops.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_fsops.c
@@ -365,6 +365,9 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode,
* because we want to set the UID and GID on the named
* stream in this case for consistency with the (unnamed
* stream) file (see comments for smb_vop_setattr()).
+ *
+ * Note that some stream "types" are "restricted" and only
+ * internal callers (cr == kcred) can create those.
*/
static int
smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
@@ -379,6 +382,9 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
int rc = 0;
boolean_t fcreate = B_FALSE;
+ if (cr != kcr && smb_strname_restricted(sname))
+ return (EACCES);
+
/* Look up / create the unnamed stream, fname */
rc = smb_fsop_lookup(sr, cr, flags | SMB_FOLLOW_LINKS,
sr->tid_tree->t_snode, dnode, fname, &fnode);
@@ -663,6 +669,9 @@ smb_fsop_mkdir(
* It is assumed that a reference exists on snode coming into this routine.
*
* A null smb_request might be passed to this function.
+ *
+ * Note that some stream "types" are "restricted" and only
+ * internal callers (cr == kcred) can remove those.
*/
int
smb_fsop_remove(
@@ -698,6 +707,11 @@ smb_fsop_remove(
sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
if (dnode->flags & NODE_XATTR_DIR) {
+ if (cr != zone_kcred() && smb_strname_restricted(name)) {
+ rc = EACCES;
+ goto out;
+ }
+
fnode = dnode->n_dnode;
rc = smb_vop_stream_remove(fnode->vp, name, flags, cr);
@@ -709,6 +723,11 @@ smb_fsop_remove(
} else if (smb_is_stream_name(name)) {
smb_stream_parse_name(name, fname, sname);
+ if (cr != zone_kcred() && smb_strname_restricted(sname)) {
+ rc = EACCES;
+ goto out;
+ }
+
/*
* Look up the unnamed stream (i.e. fname).
* Unmangle processing will be done on fname
@@ -719,9 +738,7 @@ smb_fsop_remove(
sr->tid_tree->t_snode, dnode, fname, &fnode);
if (rc != 0) {
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
- return (rc);
+ goto out;
}
/*
@@ -744,9 +761,7 @@ smb_fsop_remove(
if (rc == ENOENT) {
if (!SMB_TREE_SUPPORTS_SHORTNAMES(sr) ||
!smb_maybe_mangled(name)) {
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
- return (rc);
+ goto out;
}
longname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
@@ -776,6 +791,7 @@ smb_fsop_remove(
}
}
+out:
kmem_free(fname, MAXNAMELEN);
kmem_free(sname, MAXNAMELEN);
@@ -1609,6 +1625,9 @@ smb_fsop_statfs(
* check is performed on the named stream in case it has been
* quarantined. kcred is used to avoid issues with the permissions
* set on the extended attribute file representing the named stream.
+ *
+ * Note that some stream "types" are "restricted" and only
+ * internal callers (cr == kcred) can access those.
*/
int
smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
@@ -1639,9 +1658,14 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
unnamed_node = SMB_IS_STREAM(snode);
if (unnamed_node) {
+ cred_t *kcr = zone_kcred();
+
ASSERT(unnamed_node->n_magic == SMB_NODE_MAGIC);
ASSERT(unnamed_node->n_state != SMB_NODE_STATE_DESTROYING);
+ if (cr != kcr && smb_strname_restricted(snode->od_name))
+ return (NT_STATUS_ACCESS_DENIED);
+
/*
* Perform VREAD access check on the named stream in case it
* is quarantined. kcred is passed to smb_vop_access so it
@@ -1649,7 +1673,7 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
*/
if (faccess & (FILE_READ_DATA | FILE_EXECUTE)) {
error = smb_vop_access(snode->vp, VREAD,
- 0, NULL, zone_kcred());
+ 0, NULL, kcr);
if (error)
return (NT_STATUS_ACCESS_DENIED);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_init.c b/usr/src/uts/common/fs/smbsrv/smb_init.c
index 88d804723e..f7e1739367 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_init.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_init.c
@@ -247,7 +247,14 @@ smb_drv_open(dev_t *devp, int flag, int otyp, cred_t *cr)
static int
smb_drv_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
- return (smb_server_delete());
+ smb_server_t *sv;
+ int rc;
+
+ rc = smb_server_lookup(&sv);
+ if (rc == 0)
+ rc = smb_server_delete(sv);
+
+ return (rc);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c
index a43c4af02a..5c5458bca5 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c
@@ -26,8 +26,9 @@
*/
#include <smbsrv/smb_door.h>
-#include <smbsrv/smb_kproto.h>
#include <smbsrv/smb_ktypes.h>
+#include <smbsrv/smb2_kproto.h>
+#include <smbsrv/smb_kstat.h>
typedef struct smb_unshare {
list_node_t us_lnd;
@@ -36,7 +37,6 @@ typedef struct smb_unshare {
static kmem_cache_t *smb_kshare_cache_share;
static kmem_cache_t *smb_kshare_cache_unexport;
-kmem_cache_t *smb_kshare_cache_vfs;
static int smb_kshare_cmp(const void *, const void *);
static void smb_kshare_hold(const void *);
@@ -294,7 +294,6 @@ smb_export_stop(smb_server_t *sv)
mutex_exit(&sv->sv_export.e_mutex);
smb_avl_destroy(&sv->sv_export.e_share_avl);
- smb_vfs_rele_all(&sv->sv_export);
}
void
@@ -305,18 +304,12 @@ smb_kshare_g_init(void)
smb_kshare_cache_unexport = kmem_cache_create("smb_unexport_cache",
sizeof (smb_unshare_t), 8, NULL, NULL, NULL, NULL, NULL, 0);
-
- smb_kshare_cache_vfs = kmem_cache_create("smb_vfs_cache",
- sizeof (smb_vfs_t), 8, NULL, NULL, NULL, NULL, NULL, 0);
}
void
smb_kshare_init(smb_server_t *sv)
{
- smb_llist_constructor(&sv->sv_export.e_vfs_list, sizeof (smb_vfs_t),
- offsetof(smb_vfs_t, sv_lnd));
-
smb_slist_constructor(&sv->sv_export.e_unexport_list,
sizeof (smb_unshare_t), offsetof(smb_unshare_t, us_lnd));
}
@@ -348,10 +341,6 @@ smb_kshare_fini(smb_server_t *sv)
kmem_cache_free(smb_kshare_cache_unexport, ux);
}
smb_slist_destructor(&sv->sv_export.e_unexport_list);
-
- smb_vfs_rele_all(&sv->sv_export);
-
- smb_llist_destructor(&sv->sv_export.e_vfs_list);
}
void
@@ -359,7 +348,6 @@ smb_kshare_g_fini(void)
{
kmem_cache_destroy(smb_kshare_cache_unexport);
kmem_cache_destroy(smb_kshare_cache_share);
- kmem_cache_destroy(smb_kshare_cache_vfs);
}
@@ -684,10 +672,8 @@ smb_kshare_release(smb_server_t *sv, smb_kshare_t *shr)
/*
* Add the given share in the specified server.
- * If the share is a disk share, smb_vfs_hold() is
- * invoked to ensure that there is a hold on the
- * corresponding file system before the share is
- * added to shares AVL.
+ * If the share is a disk share, look up the share path
+ * and hold the smb_node_t for the share root.
*
* If the share is an Autohome share and it is
* already in the AVL only a reference count for
@@ -698,7 +684,7 @@ smb_kshare_export(smb_server_t *sv, smb_kshare_t *shr)
{
smb_avl_t *share_avl;
smb_kshare_t *auto_shr;
- vnode_t *vp;
+ smb_node_t *snode = NULL;
int rc = 0;
share_avl = &sv->sv_export.e_share_avl;
@@ -713,36 +699,53 @@ smb_kshare_export(smb_server_t *sv, smb_kshare_t *shr)
}
if ((auto_shr = smb_avl_lookup(share_avl, shr)) != NULL) {
- if ((auto_shr->shr_flags & SMB_SHRF_AUTOHOME) == 0) {
- smb_avl_release(share_avl, auto_shr);
- return (EEXIST);
+ rc = EEXIST;
+ if ((auto_shr->shr_flags & SMB_SHRF_AUTOHOME) != 0) {
+ mutex_enter(&auto_shr->shr_mutex);
+ auto_shr->shr_autocnt++;
+ mutex_exit(&auto_shr->shr_mutex);
+ rc = 0;
}
-
- mutex_enter(&auto_shr->shr_mutex);
- auto_shr->shr_autocnt++;
- mutex_exit(&auto_shr->shr_mutex);
smb_avl_release(share_avl, auto_shr);
- return (0);
+ return (rc);
}
- if ((rc = smb_server_sharevp(sv, shr->shr_path, &vp)) != 0) {
- cmn_err(CE_WARN, "export[%s(%s)]: failed obtaining vnode (%d)",
+ /*
+ * Get the root smb_node_t for this share, held.
+ * This hold is normally released during AVL destroy,
+ * via the element destructor: smb_kshare_destroy
+ */
+ rc = smb_server_share_lookup(sv, shr->shr_path, &snode);
+ if (rc != 0) {
+ cmn_err(CE_WARN, "export[%s(%s)]: lookup failed (%d)",
shr->shr_name, shr->shr_path, rc);
return (rc);
}
- if ((rc = smb_vfs_hold(&sv->sv_export, vp->v_vfsp)) == 0) {
- if ((rc = smb_avl_add(share_avl, shr)) != 0) {
- cmn_err(CE_WARN, "export[%s]: failed caching (%d)",
- shr->shr_name, rc);
- smb_vfs_rele(&sv->sv_export, vp->v_vfsp);
+ shr->shr_root_node = snode;
+ if ((rc = smb_avl_add(share_avl, shr)) != 0) {
+ cmn_err(CE_WARN, "export[%s]: failed caching (%d)",
+ shr->shr_name, rc);
+ shr->shr_root_node = NULL;
+ smb_node_release(snode);
+ return (rc);
+ }
+
+ /*
+ * For CA shares, find or create the CA handle dir,
+ * and (if restarted) import persistent handles.
+ */
+ if ((shr->shr_flags & SMB_SHRF_CA) != 0) {
+ rc = smb2_dh_new_ca_share(sv, shr);
+ if (rc != 0) {
+ /* Just make it a non-CA share. */
+ mutex_enter(&shr->shr_mutex);
+ shr->shr_flags &= ~SMB_SHRF_CA;
+ mutex_exit(&shr->shr_mutex);
+ rc = 0;
}
- } else {
- cmn_err(CE_WARN, "export[%s(%s)]: failed holding VFS (%d)",
- shr->shr_name, shr->shr_path, rc);
}
- VN_RELE(vp);
return (rc);
}
@@ -764,8 +767,6 @@ smb_kshare_unexport(smb_server_t *sv, const char *shrname)
smb_avl_t *share_avl;
smb_kshare_t key;
smb_kshare_t *shr;
- vnode_t *vp;
- int rc;
boolean_t auto_unexport;
share_avl = &sv->sv_export.e_share_avl;
@@ -785,19 +786,12 @@ smb_kshare_unexport(smb_server_t *sv, const char *shrname)
}
}
- if (STYPE_ISDSK(shr->shr_type)) {
- if ((rc = smb_server_sharevp(sv, shr->shr_path, &vp)) != 0) {
- smb_avl_release(share_avl, shr);
- cmn_err(CE_WARN, "unexport[%s]: failed obtaining vnode"
- " (%d)", shrname, rc);
- return (rc);
- }
+ smb_avl_remove(share_avl, shr);
- smb_vfs_rele(&sv->sv_export, vp->v_vfsp);
- VN_RELE(vp);
- }
+ mutex_enter(&shr->shr_mutex);
+ shr->shr_flags |= SMB_SHRF_REMOVED;
+ mutex_exit(&shr->shr_mutex);
- smb_avl_remove(share_avl, shr);
smb_avl_release(share_avl, shr);
return (0);
@@ -892,6 +886,7 @@ smb_kshare_decode(nvlist_t *share)
SMB_SHRF_DFSROOT);
tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_QUOTAS,
SMB_SHRF_QUOTAS);
+ tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_CA, SMB_SHRF_CA);
tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_FSO, SMB_SHRF_FSO);
tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_AUTOHOME,
SMB_SHRF_AUTOHOME);
@@ -1041,6 +1036,11 @@ smb_kshare_destroy(void *p)
ASSERT(shr);
ASSERT(shr->shr_magic == SMB_SHARE_MAGIC);
+ if (shr->shr_ca_dir != NULL)
+ smb_node_release(shr->shr_ca_dir);
+ if (shr->shr_root_node)
+ smb_node_release(shr->shr_root_node);
+
smb_mem_free(shr->shr_name);
smb_mem_free(shr->shr_path);
smb_mem_free(shr->shr_cmnt);
diff --git a/usr/src/uts/common/fs/smbsrv/smb_node.c b/usr/src/uts/common/fs/smbsrv/smb_node.c
index 63756f9037..3e9933d51a 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_node.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_node.c
@@ -88,7 +88,7 @@
* course the state of the node should be tested/updated under the
* protection of the mutex).
*/
-#include <smbsrv/smb_kproto.h>
+#include <smbsrv/smb2_kproto.h>
#include <smbsrv/smb_fsops.h>
#include <smbsrv/smb_kstat.h>
#include <sys/ddi.h>
@@ -1574,10 +1574,20 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node,
attr->sa_crtime;
mutex_exit(&of->f_mutex);
+
/*
* The f_pending_attr times are reapplied in
* smb_ofile_close().
*/
+
+ /*
+ * If this change is coming directly from a client
+ * (sr != NULL) and it's a persistent handle, save
+ * the "sticky times" in the handle.
+ */
+ if (sr != NULL && of->dh_persist) {
+ smb2_dh_update_times(sr, of, attr);
+ }
}
if ((attr->sa_mask & SMB_AT_ALLOCSZ) != 0) {
diff --git a/usr/src/uts/common/fs/smbsrv/smb_ofile.c b/usr/src/uts/common/fs/smbsrv/smb_ofile.c
index 0142bf9164..531ca314fb 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_ofile.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_ofile.c
@@ -280,11 +280,7 @@
#include <smbsrv/smb2_kproto.h>
#include <smbsrv/smb_fsops.h>
#include <sys/time.h>
-
-/* XXX: May need to actually assign GUIDs for these. */
-/* Don't leak object addresses */
-#define SMB_OFILE_PERSISTID(of) \
- ((uintptr_t)&smb_cache_ofile ^ (uintptr_t)(of))
+#include <sys/random.h>
static boolean_t smb_ofile_is_open_locked(smb_ofile_t *);
static void smb_ofile_delete(void *arg);
@@ -296,6 +292,14 @@ static int smb_ofile_netinfo_init(smb_ofile_t *, smb_netfileinfo_t *);
static void smb_ofile_netinfo_fini(smb_netfileinfo_t *);
/*
+ * The uniq_fid is a CIFS-server-wide unique identifier for an ofile
+ * which is used to uniquely identify open instances for the
+ * VFS share reservation and POSIX locks.
+ */
+static volatile uint32_t smb_fids = 0;
+#define SMB_UNIQ_FID() atomic_inc_32_nv(&smb_fids)
+
+/*
* smb_ofile_alloc
 * Allocate an ofile and fill in its "up" pointers, but
* do NOT link it into the tree's list of ofiles or the
@@ -304,6 +308,9 @@ static void smb_ofile_netinfo_fini(smb_netfileinfo_t *);
*
* If we don't get as far as smb_ofile_open with this OF,
* call smb_ofile_free() to free this object.
+ *
+ * Note: The following sr members may be null during
+ * persistent handle import: session, uid_user, tid_tree
*/
smb_ofile_t *
smb_ofile_alloc(
@@ -311,10 +318,10 @@ smb_ofile_alloc(
smb_arg_open_t *op,
smb_node_t *node, /* optional (may be NULL) */
uint16_t ftype,
- uint16_t tree_fid,
- uint32_t uniqid)
+ uint16_t tree_fid)
{
- smb_tree_t *tree = sr->tid_tree;
+ smb_user_t *user = sr->uid_user; /* optional */
+ smb_tree_t *tree = sr->tid_tree; /* optional */
smb_ofile_t *of;
of = kmem_cache_alloc(smb_cache_ofile, KM_SLEEP);
@@ -324,22 +331,28 @@ smb_ofile_alloc(
mutex_init(&of->f_mutex, NULL, MUTEX_DEFAULT, NULL);
list_create(&of->f_notify.nc_waiters, sizeof (smb_request_t),
offsetof(smb_request_t, sr_waiters));
+ mutex_init(&of->dh_nvlock, NULL, MUTEX_DEFAULT, NULL);
of->f_state = SMB_OFILE_STATE_ALLOC;
of->f_refcnt = 1;
of->f_ftype = ftype;
of->f_fid = tree_fid;
/* of->f_persistid see smb2_create */
- of->f_uniqid = uniqid;
+ of->f_uniqid = SMB_UNIQ_FID();
of->f_opened_by_pid = sr->smb_pid;
of->f_granted_access = op->desired_access;
of->f_share_access = op->share_access;
of->f_create_options = op->create_options;
- of->f_cr = (op->create_options & FILE_OPEN_FOR_BACKUP_INTENT) ?
- smb_user_getprivcred(sr->uid_user) : sr->uid_user->u_cred;
- crhold(of->f_cr);
- of->f_server = tree->t_server;
- of->f_session = tree->t_session;
+ if (user != NULL) {
+ if ((op->create_options & FILE_OPEN_FOR_BACKUP_INTENT) != 0)
+ of->f_cr = smb_user_getprivcred(user);
+ else
+ of->f_cr = user->u_cred;
+ crhold(of->f_cr);
+ }
+ of->f_server = sr->sr_server;
+ of->f_session = sr->session; /* may be NULL */
+
(void) memset(of->f_lock_seq, -1, SMB_OFILE_LSEQ_MAX);
of->f_mode = smb_fsop_amask_to_omode(of->f_granted_access);
@@ -361,11 +374,15 @@ smb_ofile_alloc(
* held by our caller, until smb_ofile_open puts this
* ofile on the node ofile list with smb_node_add_ofile.
*/
- smb_user_hold_internal(sr->uid_user);
- smb_tree_hold_internal(tree);
- of->f_user = sr->uid_user;
- of->f_tree = tree;
- of->f_node = node;
+ if (user != NULL) {
+ smb_user_hold_internal(user);
+ of->f_user = user;
+ }
+ if (tree != NULL) {
+ smb_tree_hold_internal(tree);
+ of->f_tree = tree;
+ }
+ of->f_node = node; /* may be NULL */
return (of);
}
@@ -448,6 +465,9 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec)
return;
}
+ /*
+ * Only one thread here (the one that set f_state to closing)
+ */
switch (of->f_ftype) {
case SMB_FTYPE_BYTE_PIPE:
case SMB_FTYPE_MESG_PIPE:
@@ -456,6 +476,8 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec)
break;
case SMB_FTYPE_DISK:
+ if (of->dh_persist)
+ smb2_dh_close_persistent(of);
if (of->f_persistid != 0)
smb_ofile_del_persistid(of);
if (of->f_lease != NULL)
@@ -961,6 +983,9 @@ smb_ofile_lookup_by_persistid(smb_request_t *sr, uint64_t persistid)
smb_ofile_t *of;
uint_t idx;
+ if (persistid == 0)
+ return (NULL);
+
hash = sr->sr_server->sv_persistid_ht;
idx = smb_hash_uint64(hash, persistid);
bucket = &hash->buckets[idx];
@@ -981,28 +1006,132 @@ smb_ofile_lookup_by_persistid(smb_request_t *sr, uint64_t persistid)
}
/*
- * Create a (unique) persistent ID for a new ofile,
- * and add this ofile to the persistid hash table.
+ * Create a (unique) durable/persistent ID for a new ofile,
+ * and add this ofile to the persistid hash table. This ID
+ * is referred to as the persistent ID in the protocol spec,
+ * so that's what we call it too, though the persistence may
+ * vary. "Durable" handles are persistent across reconnects
+ * but not server reboots. Persistent handles are persistent
+ * across server reboots too.
+ *
+ * Note that persistent IDs need to be unique for the lifetime of
+ * any given ofile. For normal (non-persistent) ofiles we can just
+ * use a persistent ID derived from the ofile memory address, as
+ * these don't ever live beyond the current OS boot lifetime.
+ *
+ * Persistent handles are re-imported after server restart, and
+ * generally have a different memory address after import than
+ * they had in the previous OS boot lifetime, so for these we
+ * use a randomly assigned value that won't conflict with any
+ * non-persistent (durable) handles. Ensuring that a randomly
+ * generated ID is unique requires a search of the ofiles in one
+ * hash bucket, which we'd rather avoid for non-persistent opens.
+ *
+ * The solution used here is to divide the persistent ID space
+ * in half (odd and even values) where durable opens use an ID
+ * derived from the ofile address (which is always even), and
+ * persistent opens use an ID generated randomly (always odd).
+ *
+ * smb_ofile_set_persistid_dh() sets a durable handle ID and
+ * smb_ofile_set_persistid_ph() sets a persistent handle ID.
*/
void
-smb_ofile_set_persistid(smb_ofile_t *of)
+smb_ofile_set_persistid_dh(smb_ofile_t *of)
{
smb_hash_t *hash = of->f_server->sv_persistid_ht;
smb_bucket_t *bucket;
smb_llist_t *ll;
+ uint64_t persistid;
uint_t idx;
- of->f_persistid = SMB_OFILE_PERSISTID(of);
+ persistid = (uintptr_t)of;
+ /* Avoid showing object addresses */
+ persistid ^= ((uintptr_t)&smb_cache_ofile);
+ /* make sure it's even */
+ persistid &= ~((uint64_t)1);
- idx = smb_hash_uint64(hash, of->f_persistid);
+ idx = smb_hash_uint64(hash, persistid);
bucket = &hash->buckets[idx];
ll = &bucket->b_list;
smb_llist_enter(ll, RW_WRITER);
- smb_llist_insert_tail(ll, of);
+ if (of->f_persistid == 0) {
+ of->f_persistid = persistid;
+ smb_llist_insert_tail(ll, of);
+ }
smb_llist_exit(ll);
}
void
+smb_ofile_set_persistid_ph(smb_ofile_t *of)
+{
+ uint64_t persistid;
+ int rc;
+
+top:
+ (void) random_get_pseudo_bytes((uint8_t *)&persistid,
+ sizeof (persistid));
+ if (persistid == 0) {
+ cmn_err(CE_NOTE, "random gave all zeros!");
+ goto top;
+ }
+ /* make sure it's odd */
+ persistid |= (uint64_t)1;
+
+ /*
+ * Try inserting with this persistent ID.
+ */
+ rc = smb_ofile_insert_persistid(of, persistid);
+ if (rc == EEXIST)
+ goto top;
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "set persistid rc=%d", rc);
+ }
+}
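
A small user-space sketch of the even/odd persistid convention just shown; the XOR mask and the use of rand() are stand-ins for the kernel's smb_cache_ofile address and random_get_pseudo_bytes(), so treat this as an assumption-laden illustration rather than the committed code.

/* Even IDs: derived from an object address. Odd IDs: randomly generated. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static uint64_t
durable_id(const void *obj, uintptr_t mask)
{
        uint64_t id = (uintptr_t)obj;

        id ^= mask;                     /* avoid exposing raw addresses */
        id &= ~(uint64_t)1;             /* durable handles: always even */
        return (id);
}

static uint64_t
persistent_id(void)
{
        /* rand() is a stand-in; a zero result would be retried, as above. */
        uint64_t id = ((uint64_t)rand() << 32) | (uint64_t)rand();

        id |= 1;                        /* persistent handles: always odd */
        return (id);
}

int
main(void)
{
        int dummy;

        (void) printf("durable    id 0x%016llx\n",
            (unsigned long long)durable_id(&dummy, 0xdeadbeefULL));
        (void) printf("persistent id 0x%016llx\n",
            (unsigned long long)persistent_id());
        return (0);
}
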
+
+/*
+ * Insert an ofile into the persistid hash table.
+ * If the persistent ID is in use, error.
+ */
+int
+smb_ofile_insert_persistid(smb_ofile_t *new_of, uint64_t persistid)
+{
+ smb_hash_t *hash = new_of->f_server->sv_persistid_ht;
+ smb_bucket_t *bucket;
+ smb_llist_t *ll;
+ smb_ofile_t *of;
+ uint_t idx;
+
+ ASSERT(persistid != 0);
+
+ /*
+ * Look to see if this key already exists.
+ */
+ idx = smb_hash_uint64(hash, persistid);
+ bucket = &hash->buckets[idx];
+ ll = &bucket->b_list;
+
+ smb_llist_enter(ll, RW_WRITER);
+ of = smb_llist_head(ll);
+ while (of != NULL) {
+ if (of->f_persistid == persistid) {
+ /* already in use */
+ smb_llist_exit(ll);
+ return (EEXIST);
+ }
+ of = smb_llist_next(ll, of);
+ }
+
+ /* Not found, so OK to insert. */
+ if (new_of->f_persistid == 0) {
+ new_of->f_persistid = persistid;
+ smb_llist_insert_tail(ll, new_of);
+ }
+ smb_llist_exit(ll);
+
+ return (0);
+}
+
+void
smb_ofile_del_persistid(smb_ofile_t *of)
{
smb_hash_t *hash = of->f_server->sv_persistid_ht;
@@ -1014,7 +1143,10 @@ smb_ofile_del_persistid(smb_ofile_t *of)
bucket = &hash->buckets[idx];
ll = &bucket->b_list;
smb_llist_enter(ll, RW_WRITER);
- smb_llist_remove(ll, of);
+ if (of->f_persistid != 0) {
+ smb_llist_remove(ll, of);
+ of->f_persistid = 0;
+ }
smb_llist_exit(ll);
}
@@ -1390,6 +1522,7 @@ smb_ofile_free(smb_ofile_t *of)
of->f_magic = (uint32_t)~SMB_OFILE_MAGIC;
list_destroy(&of->f_notify.nc_waiters);
+ mutex_destroy(&of->dh_nvlock);
mutex_destroy(&of->f_mutex);
kmem_cache_free(smb_cache_ofile, of);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_pathname.c b/usr/src/uts/common/fs/smbsrv/smb_pathname.c
index a8f5ae3aa4..fbf003c7c0 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_pathname.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_pathname.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
#include <smbsrv/smb_kproto.h>
@@ -154,7 +154,7 @@ smb_pathname_reduce(
pathname_t ppn;
char *usepath;
int lookup_flags = FOLLOW;
- int trailing_slash = 0;
+ int trailing_slash = 0;
int err = 0;
int len;
smb_node_t *vss_cur_node;
@@ -423,6 +423,10 @@ smb_pathname(smb_request_t *sr, char *path, int flags,
if ((err = pn_set(&pn, namep)) != 0)
break;
+ /* We want the DOS attributes. */
+ bzero(&attr, sizeof (attr));
+ attr.sa_mask = SMB_AT_DOSATTR;
+
local_flags = flags & FIGNORECASE;
err = smb_pathname_lookup(&pn, &rpn, local_flags,
&vp, rootvp, dnode->vp, &attr, cred);
@@ -1066,6 +1070,27 @@ smb_is_stream_name(char *path)
}
/*
+ * Is this stream node a "restricted" type?
+ */
+boolean_t
+smb_strname_restricted(char *strname)
+{
+ char *stype;
+
+ stype = strrchr(strname, ':');
+ if (stype == NULL)
+ return (B_FALSE);
+
+ /*
+ * Only ":$CA" is restricted (for now).
+ */
+ if (strcmp(stype, ":$CA") == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
* smb_validate_stream_name
*
 * B_FALSE will be returned, and the error status set in the sr, if:
@@ -1079,6 +1104,7 @@ boolean_t
smb_validate_stream_name(smb_request_t *sr, smb_pathname_t *pn)
{
static char *strmtype[] = {
+ "$CA",
"$DATA",
"$INDEX_ALLOCATION"
};
diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c
index 42b6f8defa..6b2390d633 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_server.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_server.c
@@ -20,8 +20,8 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -229,12 +229,12 @@ static void smb_server_fsop_stop(smb_server_t *);
static void smb_event_cancel(smb_server_t *, uint32_t);
static uint32_t smb_event_alloc_txid(void);
-static void smb_server_disconnect_share(smb_llist_t *, const char *);
-static void smb_server_enum_users(smb_llist_t *, smb_svcenum_t *);
-static void smb_server_enum_trees(smb_llist_t *, smb_svcenum_t *);
-static int smb_server_session_disconnect(smb_llist_t *, const char *,
+static void smb_server_disconnect_share(smb_server_t *, const char *);
+static void smb_server_enum_users(smb_server_t *, smb_svcenum_t *);
+static void smb_server_enum_trees(smb_server_t *, smb_svcenum_t *);
+static int smb_server_session_disconnect(smb_server_t *, const char *,
const char *);
-static int smb_server_fclose(smb_llist_t *, uint32_t);
+static int smb_server_fclose(smb_server_t *, uint32_t);
static int smb_server_kstat_update(kstat_t *, int);
static int smb_server_legacy_kstat_update(kstat_t *, int);
static void smb_server_listener_init(smb_server_t *, smb_listener_daemon_t *,
@@ -473,14 +473,8 @@ smb_server_create(void)
* activity associated that server has ceased before destroying it.
*/
int
-smb_server_delete(void)
+smb_server_delete(smb_server_t *sv)
{
- smb_server_t *sv;
- int rc;
-
- rc = smb_server_lookup(&sv);
- if (rc != 0)
- return (rc);
mutex_enter(&sv->sv_mutex);
switch (sv->sv_state) {
@@ -608,6 +602,7 @@ smb_server_start(smb_ioc_start_t *ioc)
int rc = 0;
int family;
smb_server_t *sv;
+ cred_t *ucr;
rc = smb_server_lookup(&sv);
if (rc)
@@ -620,6 +615,31 @@ smb_server_start(smb_ioc_start_t *ioc)
if ((rc = smb_server_fsop_start(sv)) != 0)
break;
+ /*
+ * Note: smb_kshare_start needs sv_session.
+ */
+ sv->sv_session = smb_session_create(NULL, 0, sv, 0);
+ if (sv->sv_session == NULL) {
+ rc = ENOMEM;
+ break;
+ }
+
+ /*
+ * Create a logon on the server session,
+ * used when importing CA shares.
+ */
+ sv->sv_rootuser = smb_user_new(sv->sv_session);
+ ucr = smb_kcred_create();
+ rc = smb_user_logon(sv->sv_rootuser, ucr, "", "root",
+ SMB_USER_FLAG_ADMIN, 0, 0);
+ crfree(ucr);
+ ucr = NULL;
+ if (rc != 0) {
+ cmn_err(CE_NOTE, "smb_server_start: "
+ "failed to create root user");
+ break;
+ }
+
if ((rc = smb_kshare_start(sv)) != 0)
break;
@@ -637,9 +657,8 @@ smb_server_start(smb_ioc_start_t *ioc)
sv->sv_cfg.skc_maxconnections, INT_MAX,
curzone->zone_zsched, TASKQ_DYNAMIC);
- sv->sv_session = smb_session_create(NULL, 0, sv, 0);
-
- if (sv->sv_worker_pool == NULL || sv->sv_session == NULL) {
+ if (sv->sv_worker_pool == NULL ||
+ sv->sv_receiver_pool == NULL) {
rc = ENOMEM;
break;
}
@@ -904,11 +923,11 @@ smb_server_enum(smb_ioc_svcenum_t *ioc)
switch (svcenum->se_type) {
case SMB_SVCENUM_TYPE_USER:
- smb_server_enum_users(&sv->sv_session_list, svcenum);
+ smb_server_enum_users(sv, svcenum);
break;
case SMB_SVCENUM_TYPE_TREE:
case SMB_SVCENUM_TYPE_FILE:
- smb_server_enum_trees(&sv->sv_session_list, svcenum);
+ smb_server_enum_trees(sv, svcenum);
break;
default:
rc = EINVAL;
@@ -924,7 +943,6 @@ smb_server_enum(smb_ioc_svcenum_t *ioc)
int
smb_server_session_close(smb_ioc_session_t *ioc)
{
- smb_llist_t *ll;
smb_server_t *sv;
int cnt;
int rc;
@@ -932,8 +950,7 @@ smb_server_session_close(smb_ioc_session_t *ioc)
if ((rc = smb_server_lookup(&sv)) != 0)
return (rc);
- ll = &sv->sv_session_list;
- cnt = smb_server_session_disconnect(ll, ioc->client, ioc->username);
+ cnt = smb_server_session_disconnect(sv, ioc->client, ioc->username);
smb_server_release(sv);
@@ -949,15 +966,13 @@ int
smb_server_file_close(smb_ioc_fileid_t *ioc)
{
uint32_t uniqid = ioc->uniqid;
- smb_llist_t *ll;
smb_server_t *sv;
int rc;
if ((rc = smb_server_lookup(&sv)) != 0)
return (rc);
- ll = &sv->sv_session_list;
- rc = smb_server_fclose(ll, uniqid);
+ rc = smb_server_fclose(sv, uniqid);
smb_server_release(sv);
return (rc);
@@ -978,17 +993,16 @@ smb_server_get_session_count(smb_server_t *sv)
}
/*
- * Gets the vnode of the specified share path.
- *
- * A hold on the returned vnode pointer is taken so the caller
- * must call VN_RELE.
+ * Gets the smb_node of the specified share path.
+ * Node is returned held (caller must rele.)
*/
int
-smb_server_sharevp(smb_server_t *sv, const char *shr_path, vnode_t **vp)
+smb_server_share_lookup(smb_server_t *sv, const char *shr_path,
+ smb_node_t **nodepp)
{
smb_request_t *sr;
smb_node_t *fnode = NULL;
- smb_node_t *dnode;
+ smb_node_t *dnode = NULL;
char last_comp[MAXNAMELEN];
int rc = 0;
@@ -1025,10 +1039,7 @@ smb_server_sharevp(smb_server_t *sv, const char *shr_path, vnode_t **vp)
ASSERT(fnode->vp && fnode->vp->v_vfsp);
- VN_HOLD(fnode->vp);
- *vp = fnode->vp;
-
- smb_node_release(fnode);
+ *nodepp = fnode;
return (0);
}
@@ -1070,7 +1081,6 @@ int
smb_server_unshare(const char *sharename)
{
smb_server_t *sv;
- smb_llist_t *ll;
int rc;
if ((rc = smb_server_lookup(&sv)))
@@ -1088,8 +1098,7 @@ smb_server_unshare(const char *sharename)
}
mutex_exit(&sv->sv_mutex);
- ll = &sv->sv_session_list;
- smb_server_disconnect_share(ll, sharename);
+ smb_server_disconnect_share(sv, sharename);
smb_server_release(sv);
return (0);
@@ -1100,10 +1109,12 @@ smb_server_unshare(const char *sharename)
* Typically called when a share has been removed.
*/
static void
-smb_server_disconnect_share(smb_llist_t *ll, const char *sharename)
+smb_server_disconnect_share(smb_server_t *sv, const char *sharename)
{
+ smb_llist_t *ll;
smb_session_t *session;
+ ll = &sv->sv_session_list;
smb_llist_enter(ll, RW_READER);
session = smb_llist_head(ll);
@@ -1514,9 +1525,17 @@ smb_server_shutdown(smb_server_t *sv)
* normal sessions, this happens in smb_session_cancel,
* but that's not called for the server session.
*/
+ if (sv->sv_rootuser != NULL) {
+ smb_user_logoff(sv->sv_rootuser);
+ smb_user_release(sv->sv_rootuser);
+ sv->sv_rootuser = NULL;
+ }
if (sv->sv_session != NULL) {
smb_slist_wait_for_empty(&sv->sv_session->s_req_list);
+ /* Just in case import left users and trees */
+ smb_session_logoff(sv->sv_session);
+
smb_session_delete(sv->sv_session);
sv->sv_session = NULL;
}
@@ -1817,8 +1836,9 @@ smb_server_release(smb_server_t *sv)
* Enumerate the users associated with a session list.
*/
static void
-smb_server_enum_users(smb_llist_t *ll, smb_svcenum_t *svcenum)
+smb_server_enum_users(smb_server_t *sv, smb_svcenum_t *svcenum)
{
+ smb_llist_t *ll = &sv->sv_session_list;
smb_session_t *sn;
smb_llist_t *ulist;
smb_user_t *user;
@@ -1859,8 +1879,9 @@ smb_server_enum_users(smb_llist_t *ll, smb_svcenum_t *svcenum)
* Enumerate the trees/files associated with a session list.
*/
static void
-smb_server_enum_trees(smb_llist_t *ll, smb_svcenum_t *svcenum)
+smb_server_enum_trees(smb_server_t *sv, smb_svcenum_t *svcenum)
{
+ smb_llist_t *ll = &sv->sv_session_list;
smb_session_t *sn;
smb_llist_t *tlist;
smb_tree_t *tree;
@@ -1902,9 +1923,10 @@ smb_server_enum_trees(smb_llist_t *ll, smb_svcenum_t *svcenum)
* Empty strings are treated as wildcards.
*/
static int
-smb_server_session_disconnect(smb_llist_t *ll,
+smb_server_session_disconnect(smb_server_t *sv,
const char *client, const char *name)
{
+ smb_llist_t *ll = &sv->sv_session_list;
smb_session_t *sn;
smb_llist_t *ulist;
smb_user_t *user;
@@ -1949,13 +1971,15 @@ smb_server_session_disconnect(smb_llist_t *ll,
* Close a file by its unique id.
*/
static int
-smb_server_fclose(smb_llist_t *ll, uint32_t uniqid)
+smb_server_fclose(smb_server_t *sv, uint32_t uniqid)
{
+ smb_llist_t *ll;
smb_session_t *sn;
smb_llist_t *tlist;
smb_tree_t *tree;
int rc = ENOENT;
+ ll = &sv->sv_session_list;
smb_llist_enter(ll, RW_READER);
sn = smb_llist_head(ll);
diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c
index 205c21179b..2878df28e7 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_session.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_session.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/atomic.h>
@@ -72,8 +72,6 @@ static int smb_session_reader(smb_session_t *);
static int smb_session_xprt_puthdr(smb_session_t *,
uint8_t msg_type, uint32_t msg_len,
uint8_t *dst, size_t dstlen);
-static smb_tree_t *smb_session_get_tree(smb_session_t *, smb_tree_t *);
-static void smb_session_logoff(smb_session_t *);
static void smb_session_disconnect_trees(smb_session_t *);
static void smb_request_init_command_mbuf(smb_request_t *sr);
static void smb_session_genkey(smb_session_t *);
@@ -752,7 +750,22 @@ smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv,
smb_rwx_init(&session->s_lock);
- if (new_so != NULL) {
+ session->s_srqueue = &sv->sv_srqueue;
+ smb_server_get_cfg(sv, &session->s_cfg);
+
+ if (new_so == NULL) {
+ /*
+ * This call is creating the special "server" session,
+ * used for kshare export, oplock breaks, CA import.
+ * CA import creates temporary trees on this session
+ * and those should never get map/unmap up-calls, so
+ * force the map/unmap flags to zero on this session.
+ * Set a "modern" dialect for CA import too, so
+ * pathname parse doesn't do OS/2 stuff, etc.
+ */
+ session->s_cfg.skc_execflags = 0;
+ session->dialect = session->s_cfg.skc_max_protocol;
+ } else {
if (family == AF_INET) {
slen = sizeof (sin);
(void) ksocket_getsockname(new_so,
@@ -794,8 +807,6 @@ smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv,
else
smb_server_inc_tcp_sess(sv);
}
- smb_server_get_cfg(sv, &session->s_cfg);
- session->s_srqueue = &sv->sv_srqueue;
/*
* The initial new request handler is special,
@@ -1006,117 +1017,35 @@ smb_session_lookup_tree(
}
/*
- * Find the first connected tree that matches the specified sharename.
- * If the specified tree is NULL the search starts from the beginning of
- * the user's tree list. If a tree is provided the search starts just
- * after that tree.
- */
-smb_tree_t *
-smb_session_lookup_share(
- smb_session_t *session,
- const char *sharename,
- smb_tree_t *tree)
-{
- SMB_SESSION_VALID(session);
- ASSERT(sharename);
-
- smb_llist_enter(&session->s_tree_list, RW_READER);
-
- if (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- tree = smb_llist_next(&session->s_tree_list, tree);
- } else {
- tree = smb_llist_head(&session->s_tree_list);
- }
-
- while (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- if (smb_strcasecmp(tree->t_sharename, sharename, 0) == 0) {
- if (smb_tree_hold(tree)) {
- smb_llist_exit(&session->s_tree_list);
- return (tree);
- }
- }
- tree = smb_llist_next(&session->s_tree_list, tree);
- }
-
- smb_llist_exit(&session->s_tree_list);
- return (NULL);
-}
-
-/*
- * Find the first connected tree that matches the specified volume name.
- * If the specified tree is NULL the search starts from the beginning of
- * the user's tree list. If a tree is provided the search starts just
- * after that tree.
- */
-smb_tree_t *
-smb_session_lookup_volume(
- smb_session_t *session,
- const char *name,
- smb_tree_t *tree)
-{
- SMB_SESSION_VALID(session);
- ASSERT(name);
-
- smb_llist_enter(&session->s_tree_list, RW_READER);
-
- if (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- tree = smb_llist_next(&session->s_tree_list, tree);
- } else {
- tree = smb_llist_head(&session->s_tree_list);
- }
-
- while (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
-
- if (smb_strcasecmp(tree->t_volume, name, 0) == 0) {
- if (smb_tree_hold(tree)) {
- smb_llist_exit(&session->s_tree_list);
- return (tree);
- }
- }
-
- tree = smb_llist_next(&session->s_tree_list, tree);
- }
-
- smb_llist_exit(&session->s_tree_list);
- return (NULL);
-}
-
-/*
* Disconnect all trees that match the specified client process-id.
+ * Used by the SMB1 "process exit" request.
*/
void
smb_session_close_pid(
smb_session_t *session,
uint32_t pid)
{
+ smb_llist_t *tree_list = &session->s_tree_list;
smb_tree_t *tree;
- SMB_SESSION_VALID(session);
+ smb_llist_enter(tree_list, RW_READER);
- tree = smb_session_get_tree(session, NULL);
+ tree = smb_llist_head(tree_list);
while (tree) {
- smb_tree_t *next;
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- smb_tree_close_pid(tree, pid);
- next = smb_session_get_tree(session, tree);
- smb_tree_release(tree);
- tree = next;
+ if (smb_tree_hold(tree)) {
+ smb_tree_close_pid(tree, pid);
+ smb_tree_release(tree);
+ }
+ tree = smb_llist_next(tree_list, tree);
}
+
+ smb_llist_exit(tree_list);
}
static void
-smb_session_tree_dtor(void *t)
+smb_session_tree_dtor(void *arg)
{
- smb_tree_t *tree = (smb_tree_t *)t;
+ smb_tree_t *tree = arg;
smb_tree_disconnect(tree, B_TRUE);
/* release the ref acquired during the traversal loop */
@@ -1167,84 +1096,76 @@ static void
smb_session_disconnect_trees(
smb_session_t *session)
{
- smb_tree_t *tree, *next_tree;
+ smb_llist_t *tree_list = &session->s_tree_list;
+ smb_tree_t *tree;
- SMB_SESSION_VALID(session);
+ smb_llist_enter(tree_list, RW_READER);
- tree = smb_session_get_tree(session, NULL);
+ tree = smb_llist_head(tree_list);
while (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- smb_tree_disconnect(tree, B_TRUE);
- next_tree = smb_session_get_tree(session, tree);
- smb_tree_release(tree);
- tree = next_tree;
+ if (smb_tree_hold(tree)) {
+ smb_llist_post(tree_list, tree,
+ smb_session_tree_dtor);
+ }
+ tree = smb_llist_next(tree_list, tree);
}
+
+ /* drop the lock and flush the dtor queue */
+ smb_llist_exit(tree_list);
}
/*
- * Disconnect all trees that match the specified share name.
+ * Variant of smb_session_tree_dtor that also
+ * cancels requests using this tree.
*/
-void
-smb_session_disconnect_share(
- smb_session_t *session,
- const char *sharename)
+static void
+smb_session_tree_kill(void *arg)
{
- smb_tree_t *tree;
- smb_tree_t *next;
+ smb_tree_t *tree = arg;
- SMB_SESSION_VALID(session);
+ SMB_TREE_VALID(tree);
- tree = smb_session_lookup_share(session, sharename, NULL);
- while (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- ASSERT(tree->t_session == session);
- smb_tree_disconnect(tree, B_TRUE);
- smb_session_cancel_requests(session, tree, NULL);
- next = smb_session_lookup_share(session, sharename, tree);
- smb_tree_release(tree);
- tree = next;
- }
+ smb_tree_disconnect(tree, B_TRUE);
+ smb_session_cancel_requests(tree->t_session, tree, NULL);
+
+ /* release the ref acquired during the traversal loop */
+ smb_tree_release(tree);
}
/*
- * Get the next connected tree in the list. A reference is taken on
- * the tree, which can be released later with smb_tree_release().
- *
- * If the specified tree is NULL the search starts from the beginning of
- * the tree list. If a tree is provided the search starts just after
- * that tree.
- *
- * Returns NULL if there are no connected trees in the list.
+ * Disconnect all trees that match the specified share name,
+ * and kill requests using those trees.
*/
-static smb_tree_t *
-smb_session_get_tree(
+void
+smb_session_disconnect_share(
smb_session_t *session,
- smb_tree_t *tree)
+ const char *sharename)
{
- smb_llist_t *tree_list;
+ smb_llist_t *ll;
+ smb_tree_t *tree;
SMB_SESSION_VALID(session);
- tree_list = &session->s_tree_list;
- smb_llist_enter(tree_list, RW_READER);
+ ll = &session->s_tree_list;
+ smb_llist_enter(ll, RW_READER);
- if (tree) {
- ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC);
- tree = smb_llist_next(tree_list, tree);
- } else {
- tree = smb_llist_head(tree_list);
- }
+ for (tree = smb_llist_head(ll);
+ tree != NULL;
+ tree = smb_llist_next(ll, tree)) {
- while (tree) {
- if (smb_tree_hold(tree))
- break;
+ SMB_TREE_VALID(tree);
+ ASSERT(tree->t_session == session);
- tree = smb_llist_next(tree_list, tree);
+ if (smb_strcasecmp(tree->t_sharename, sharename, 0) != 0)
+ continue;
+
+ if (smb_tree_hold(tree)) {
+ smb_llist_post(ll, tree,
+ smb_session_tree_kill);
+ }
}
- smb_llist_exit(tree_list);
- return (tree);
+ smb_llist_exit(ll);
}
/*
@@ -1255,7 +1176,7 @@ smb_session_get_tree(
* disconnect (SMB_SESSION_STATE_DISCONNECTED).
* If client-initiated, save durable handles.
*/
-static void
+void
smb_session_logoff(smb_session_t *session)
{
smb_llist_t *ulist;
@@ -1279,9 +1200,6 @@ top:
// smb_user_hold_internal(user);
user->u_refcnt++;
mutex_exit(&user->u_mutex);
- if (user->u_session->s_state ==
- SMB_SESSION_STATE_DISCONNECTED)
- user->preserve_opens = SMB2_DH_PRESERVE_ALL;
smb_user_logoff(user);
smb_user_release(user);
break;
diff --git a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
index 86ce24c0b0..7c4be2f56e 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
@@ -346,6 +346,11 @@ smb_oplock_async_break(void *arg)
break;
}
+ if (sr->dh_nvl_dirty) {
+ sr->dh_nvl_dirty = B_FALSE;
+ smb2_dh_update_nvfile(sr);
+ }
+
sr->sr_state = SMB_REQ_STATE_COMPLETED;
smb_request_free(sr);
}
@@ -444,6 +449,10 @@ smb_oplock_send_brk(smb_request_t *sr)
if (lease != NULL)
lease->ls_state = NewLevel & CACHE_RWH;
ofile->f_oplock.og_state = NewLevel;
+
+ if (ofile->dh_persist) {
+ smb2_dh_update_oplock(sr, ofile);
+ }
}
/*
@@ -583,6 +592,10 @@ smb_oplock_send_brk(smb_request_t *sr)
if (lease != NULL) {
lease->ls_state = NewLevel & CACHE_RWH;
}
+
+ if (ofile->dh_persist) {
+ smb2_dh_update_oplock(sr, ofile);
+ }
}
/*
diff --git a/usr/src/uts/common/fs/smbsrv/smb_tree.c b/usr/src/uts/common/fs/smbsrv/smb_tree.c
index 5020dec794..aedacf2123 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_tree.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_tree.c
@@ -184,8 +184,6 @@ uint32_t smb_tree_connect_core(smb_request_t *);
uint32_t smb_tree_connect_disk(smb_request_t *, smb_arg_tcon_t *);
uint32_t smb_tree_connect_printq(smb_request_t *, smb_arg_tcon_t *);
uint32_t smb_tree_connect_ipc(smb_request_t *, smb_arg_tcon_t *);
-static smb_tree_t *smb_tree_alloc(smb_request_t *, const smb_kshare_t *,
- smb_node_t *, uint32_t, uint32_t);
static void smb_tree_dealloc(void *);
static boolean_t smb_tree_is_connected_locked(smb_tree_t *);
static char *smb_tree_get_sharename(char *);
@@ -193,9 +191,7 @@ static int smb_tree_getattr(const smb_kshare_t *, smb_node_t *, smb_tree_t *);
static void smb_tree_get_volname(vfs_t *, smb_tree_t *);
static void smb_tree_get_flags(const smb_kshare_t *, vfs_t *, smb_tree_t *);
static void smb_tree_log(smb_request_t *, const char *, const char *, ...);
-static void smb_tree_close_odirs(smb_tree_t *, uint16_t);
-static smb_ofile_t *smb_tree_get_ofile(smb_tree_t *, smb_ofile_t *);
-static smb_odir_t *smb_tree_get_odir(smb_tree_t *, smb_odir_t *);
+static void smb_tree_close_odirs(smb_tree_t *, uint32_t);
static void smb_tree_set_execinfo(smb_tree_t *, smb_shr_execinfo_t *, int);
static int smb_tree_enum_private(smb_tree_t *, smb_svcenum_t *);
static int smb_tree_netinfo_encode(smb_tree_t *, uint8_t *, size_t, uint32_t *);
@@ -303,10 +299,13 @@ out:
/*
* Disconnect a tree.
+ *
+ * The "do_exec" arg is obsolete and ignored.
*/
void
smb_tree_disconnect(smb_tree_t *tree, boolean_t do_exec)
{
+ _NOTE(ARGUNUSED(do_exec))
smb_shr_execinfo_t execinfo;
ASSERT(tree->t_magic == SMB_TREE_MAGIC);
@@ -314,34 +313,27 @@ smb_tree_disconnect(smb_tree_t *tree, boolean_t do_exec)
mutex_enter(&tree->t_mutex);
ASSERT(tree->t_refcnt);
- if (smb_tree_is_connected_locked(tree)) {
- /*
- * Indicate that the disconnect process has started.
- */
- tree->t_state = SMB_TREE_STATE_DISCONNECTING;
+ if (!smb_tree_is_connected_locked(tree)) {
mutex_exit(&tree->t_mutex);
-
- if (do_exec) {
- /*
- * The files opened under this tree are closed.
- */
- smb_ofile_close_all(tree, 0);
- /*
- * The directories opened under this tree are closed.
- */
- smb_tree_close_odirs(tree, 0);
- }
-
- mutex_enter(&tree->t_mutex);
- tree->t_state = SMB_TREE_STATE_DISCONNECTED;
- smb_server_dec_trees(tree->t_server);
+ return;
}
+ /*
+ * Indicate that the disconnect process has started.
+ */
+ tree->t_state = SMB_TREE_STATE_DISCONNECTING;
mutex_exit(&tree->t_mutex);
- if (do_exec && (tree->t_state == SMB_TREE_STATE_DISCONNECTED) &&
- (tree->t_execflags & SMB_EXEC_UNMAP)) {
+ /*
+ * The files opened under this tree are closed.
+ */
+ smb_ofile_close_all(tree, 0);
+ /*
+ * The directories opened under this tree are closed.
+ */
+ smb_tree_close_odirs(tree, 0);
+ if ((tree->t_execflags & SMB_EXEC_UNMAP) != 0) {
smb_tree_set_execinfo(tree, &execinfo, SMB_EXEC_UNMAP);
(void) smb_kshare_exec(tree->t_server, &execinfo);
}
@@ -408,7 +400,7 @@ smb_tree_release(
tree->t_refcnt--;
switch (tree->t_state) {
- case SMB_TREE_STATE_DISCONNECTED:
+ case SMB_TREE_STATE_DISCONNECTING:
if (tree->t_refcnt == 0) {
smb_session_t *ssn = tree->t_session;
tree->t_state = SMB_TREE_STATE_DISCONNECTED;
@@ -417,7 +409,6 @@ smb_tree_release(
}
break;
case SMB_TREE_STATE_CONNECTED:
- case SMB_TREE_STATE_DISCONNECTING:
break;
default:
ASSERT(0);
@@ -463,31 +454,29 @@ smb_tree_has_feature(smb_tree_t *tree, uint32_t flags)
int
smb_tree_enum(smb_tree_t *tree, smb_svcenum_t *svcenum)
{
+ smb_llist_t *of_list;
smb_ofile_t *of;
- smb_ofile_t *next;
int rc = 0;
- ASSERT(tree);
- ASSERT(tree->t_magic == SMB_TREE_MAGIC);
-
if (svcenum->se_type == SMB_SVCENUM_TYPE_TREE)
return (smb_tree_enum_private(tree, svcenum));
- of = smb_tree_get_ofile(tree, NULL);
- while (of) {
- ASSERT(of->f_tree == tree);
+ of_list = &tree->t_ofile_list;
+ smb_llist_enter(of_list, RW_READER);
- rc = smb_ofile_enum(of, svcenum);
- if (rc != 0) {
+ of = smb_llist_head(of_list);
+ while (of) {
+ if (smb_ofile_hold(of)) {
+ rc = smb_ofile_enum(of, svcenum);
smb_ofile_release(of);
- break;
}
-
- next = smb_tree_get_ofile(tree, of);
- smb_ofile_release(of);
- of = next;
+ if (rc != 0)
+ break;
+ of = smb_llist_next(of_list, of);
}
+ smb_llist_exit(of_list);
+
return (rc);
}
@@ -662,6 +651,9 @@ smb_tree_chkaccess(smb_request_t *sr, smb_kshare_t *shr, vnode_t *vp)
return (access);
}
+/* How long should tree connect wait for DH import to complete? */
+int smb_tcon_import_wait = 20; /* sec. */
+
/*
* Connect a share for use with files and directories.
*/
@@ -671,16 +663,14 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon)
char *sharename = tcon->path;
const char *any = "?????";
smb_user_t *user = sr->uid_user;
- smb_node_t *dnode = NULL;
smb_node_t *snode = NULL;
smb_kshare_t *si = tcon->si;
char *service = tcon->service;
- char last_component[MAXNAMELEN];
smb_tree_t *tree;
- cred_t *kcr;
int rc;
uint32_t access;
smb_shr_execinfo_t execinfo;
+ clock_t time;
ASSERT(user);
ASSERT(user->u_cred);
@@ -694,34 +684,34 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon)
/*
* Check that the shared directory exists.
- * Client might not have access to the path _leading_ to the share,
- * so we use "kcred" to get to the share root.
*/
- kcr = zone_kcred();
- rc = smb_pathname_reduce(sr, kcr, si->shr_path, 0, 0, &dnode,
- last_component);
- if (rc == 0) {
- rc = smb_fsop_lookup(sr, kcr, SMB_FOLLOW_LINKS,
- sr->sr_server->si_root_smb_node, dnode, last_component,
- &snode);
-
- smb_node_release(dnode);
- }
-
- if (rc) {
- if (snode)
- smb_node_release(snode);
-
+ snode = si->shr_root_node;
+ if (snode == NULL) {
smb_tree_log(sr, sharename, "bad path: %s", si->shr_path);
return (NT_STATUS_BAD_NETWORK_NAME);
}
if ((access = smb_tree_chkaccess(sr, si, snode->vp)) == 0) {
- smb_node_release(snode);
return (NT_STATUS_ACCESS_DENIED);
}
/*
+ * Wait for DH import of persistent handles to finish.
+ * If we time out, it's not clear what status to return,
+ * but as the share is not really available yet, let's
+ * return the status for "no such share".
+ */
+ time = SEC_TO_TICK(smb_tcon_import_wait) + ddi_get_lbolt();
+ mutex_enter(&si->shr_mutex);
+ while (si->shr_import_busy != NULL) {
+ if (cv_timedwait(&si->shr_cv, &si->shr_mutex, time) < 0) {
+ mutex_exit(&si->shr_mutex);
+ return (NT_STATUS_BAD_NETWORK_NAME);
+ }
+ }
+ mutex_exit(&si->shr_mutex);
+
+ /*
* Set up the OptionalSupport for this share.
*/
tcon->optional_support = SMB_SUPPORT_SEARCH_BITS;
@@ -758,8 +748,6 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon)
tree = smb_tree_alloc(sr, si, snode, access, sr->sr_cfg->skc_execflags);
- smb_node_release(snode);
-
if (tree == NULL)
return (NT_STATUS_INSUFF_SERVER_RESOURCES);
@@ -769,7 +757,17 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon)
rc = smb_kshare_exec(tree->t_server, &execinfo);
if ((rc != 0) && (tree->t_execflags & SMB_EXEC_TERM)) {
- smb_tree_disconnect(tree, B_FALSE);
+ /*
+ * Inline parts of smb_tree_disconnect().
+ * Not using smb_tree_disconnect() for cleanup
+ * here because we don't want an exec up-call,
+ * and there can't be any opens, as we never
+ * returned this TID to the client.
+ */
+ mutex_enter(&tree->t_mutex);
+ tree->t_state = SMB_TREE_STATE_DISCONNECTING;
+ mutex_exit(&tree->t_mutex);
+
smb_tree_release(tree);
return (NT_STATUS_ACCESS_DENIED);
}
@@ -901,7 +899,7 @@ smb_tree_connect_ipc(smb_request_t *sr, smb_arg_tcon_t *tcon)
/*
* Allocate a tree.
*/
-static smb_tree_t *
+smb_tree_t *
smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si,
smb_node_t *snode, uint32_t access, uint32_t execflags)
{
@@ -1001,6 +999,8 @@ smb_tree_dealloc(void *arg)
ASSERT(tree->t_state == SMB_TREE_STATE_DISCONNECTED);
ASSERT(tree->t_refcnt == 0);
+ smb_server_dec_trees(tree->t_server);
+
session = tree->t_session;
smb_llist_enter(&session->s_tree_list, RW_WRITER);
smb_llist_remove(&session->s_tree_list, tree);
@@ -1199,6 +1199,9 @@ smb_tree_get_flags(const smb_kshare_t *si, vfs_t *vfsp, smb_tree_t *tree)
if (si->shr_flags & SMB_SHRF_ABE)
flags |= SMB_TREE_ABE;
+ if (si->shr_flags & SMB_SHRF_CA)
+ flags |= SMB_TREE_CA;
+
if (si->shr_flags & SMB_SHRF_FSO)
flags |= SMB_TREE_FORCE_L2_OPLOCK;
@@ -1361,83 +1364,6 @@ smb_tree_is_connected(smb_tree_t *tree)
}
/*
- * Get the next open ofile in the list. A reference is taken on
- * the ofile, which can be released later with smb_ofile_release().
- *
- * If the specified ofile is NULL, search from the beginning of the
- * list. Otherwise, the search starts just after that ofile.
- *
- * Returns NULL if there are no open files in the list.
- */
-static smb_ofile_t *
-smb_tree_get_ofile(smb_tree_t *tree, smb_ofile_t *of)
-{
- smb_llist_t *ofile_list;
-
- ASSERT(tree);
- ASSERT(tree->t_magic == SMB_TREE_MAGIC);
-
- ofile_list = &tree->t_ofile_list;
- smb_llist_enter(ofile_list, RW_READER);
-
- if (of) {
- ASSERT(of->f_magic == SMB_OFILE_MAGIC);
- of = smb_llist_next(ofile_list, of);
- } else {
- of = smb_llist_head(ofile_list);
- }
-
- while (of) {
- if (smb_ofile_hold(of))
- break;
-
- of = smb_llist_next(ofile_list, of);
- }
-
- smb_llist_exit(ofile_list);
- return (of);
-}
-
-/*
- * smb_tree_get_odir
- *
- * Find the next odir in the tree's list of odirs, and obtain a
- * hold on it.
- * If the specified odir is NULL the search starts at the beginning
- * of the tree's odir list, otherwise the search starts after the
- * specified odir.
- */
-static smb_odir_t *
-smb_tree_get_odir(smb_tree_t *tree, smb_odir_t *od)
-{
- smb_llist_t *od_list;
-
- ASSERT(tree);
- ASSERT(tree->t_magic == SMB_TREE_MAGIC);
-
- od_list = &tree->t_odir_list;
- smb_llist_enter(od_list, RW_READER);
-
- if (od) {
- ASSERT(od->d_magic == SMB_ODIR_MAGIC);
- od = smb_llist_next(od_list, od);
- } else {
- od = smb_llist_head(od_list);
- }
-
- while (od) {
- ASSERT(od->d_magic == SMB_ODIR_MAGIC);
-
- if (smb_odir_hold(od))
- break;
- od = smb_llist_next(od_list, od);
- }
-
- smb_llist_exit(od_list);
- return (od);
-}
-
-/*
* smb_tree_close_odirs
*
* Close all open odirs in the tree's list which were opened by
@@ -1445,25 +1371,34 @@ smb_tree_get_odir(smb_tree_t *tree, smb_odir_t *od)
* If pid is zero, close all open odirs in the tree's list.
*/
static void
-smb_tree_close_odirs(smb_tree_t *tree, uint16_t pid)
+smb_tree_close_odirs(smb_tree_t *tree, uint32_t pid)
{
- smb_odir_t *od, *next_od;
+ smb_llist_t *od_list;
+ smb_odir_t *od;
ASSERT(tree);
ASSERT(tree->t_magic == SMB_TREE_MAGIC);
- od = smb_tree_get_odir(tree, NULL);
- while (od) {
+ od_list = &tree->t_odir_list;
+ smb_llist_enter(od_list, RW_READER);
+
+ for (od = smb_llist_head(od_list);
+ od != NULL;
+ od = smb_llist_next(od_list, od)) {
+
ASSERT(od->d_magic == SMB_ODIR_MAGIC);
ASSERT(od->d_tree == tree);
- next_od = smb_tree_get_odir(tree, od);
- if ((pid == 0) || (od->d_opened_by_pid == pid))
- smb_odir_close(od);
- smb_odir_release(od);
+ if (pid != 0 && od->d_opened_by_pid != pid)
+ continue;
- od = next_od;
+ if (smb_odir_hold(od)) {
+ smb_odir_close(od);
+ smb_odir_release(od);
+ }
}
+
+ smb_llist_exit(od_list);
}
static void
diff --git a/usr/src/uts/common/fs/smbsrv/smb_user.c b/usr/src/uts/common/fs/smbsrv/smb_user.c
index 0bfceb4ff4..74bb502c56 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_user.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_user.c
@@ -303,7 +303,6 @@ smb_user_logon(
* we always have an auth. socket to close.
*/
authsock = user->u_authsock;
- ASSERT(authsock != NULL);
user->u_authsock = NULL;
tmo = user->u_auth_tmo;
user->u_auth_tmo = NULL;
@@ -325,7 +324,8 @@ smb_user_logon(
(void) untimeout(tmo);
/* This close can block, so not under the mutex. */
- smb_authsock_close(user, authsock);
+ if (authsock != NULL)
+ smb_authsock_close(user, authsock);
return (0);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_vfs.c b/usr/src/uts/common/fs/smbsrv/smb_vfs.c
deleted file mode 100644
index ae631e4ffa..0000000000
--- a/usr/src/uts/common/fs/smbsrv/smb_vfs.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- */
-
-#include <sys/vfs.h>
-#include <smbsrv/smb_ktypes.h>
-#include <smbsrv/smb_kproto.h>
-
-static smb_vfs_t *smb_vfs_find(smb_export_t *, vfs_t *);
-static void smb_vfs_destroy(smb_vfs_t *);
-
-/*
- * If a hold on the specified VFS has already been taken
- * then only increment the reference count of the corresponding
- * smb_vfs_t structure. If no smb_vfs_t structure has been created
- * yet for the specified VFS then create one and take a hold on
- * the VFS.
- */
-int
-smb_vfs_hold(smb_export_t *se, vfs_t *vfsp)
-{
- smb_vfs_t *smb_vfs;
- vnode_t *rootvp;
- int rc;
-
- if (se == NULL || vfsp == NULL)
- return (EINVAL);
-
- smb_llist_enter(&se->e_vfs_list, RW_WRITER);
-
- if ((smb_vfs = smb_vfs_find(se, vfsp)) != NULL) {
- smb_vfs->sv_refcnt++;
- DTRACE_PROBE1(smb_vfs_hold_hit, smb_vfs_t *, smb_vfs);
- smb_llist_exit(&se->e_vfs_list);
- return (0);
- }
-
- if ((rc = VFS_ROOT(vfsp, &rootvp)) != 0) {
- smb_llist_exit(&se->e_vfs_list);
- return (rc);
- }
-
- smb_vfs = kmem_cache_alloc(smb_kshare_cache_vfs, KM_SLEEP);
-
- bzero(smb_vfs, sizeof (smb_vfs_t));
-
- smb_vfs->sv_magic = SMB_VFS_MAGIC;
- smb_vfs->sv_refcnt = 1;
- smb_vfs->sv_vfsp = vfsp;
- /*
- * We have a hold on the root vnode of the file system
- * from the VFS_ROOT call above.
- */
- smb_vfs->sv_rootvp = rootvp;
-
- smb_llist_insert_head(&se->e_vfs_list, smb_vfs);
- DTRACE_PROBE1(smb_vfs_hold_miss, smb_vfs_t *, smb_vfs);
- smb_llist_exit(&se->e_vfs_list);
-
- return (0);
-}
-
-/*
- * smb_vfs_rele
- *
- * Decrements the reference count of the fs passed in. If the reference count
- * drops to zero the smb_vfs_t structure associated with the fs is freed.
- */
-void
-smb_vfs_rele(smb_export_t *se, vfs_t *vfsp)
-{
- smb_vfs_t *smb_vfs;
-
- ASSERT(vfsp);
-
- smb_llist_enter(&se->e_vfs_list, RW_WRITER);
- smb_vfs = smb_vfs_find(se, vfsp);
- DTRACE_PROBE1(smb_vfs_release, smb_vfs_t *, smb_vfs);
- if (smb_vfs) {
- ASSERT(smb_vfs->sv_refcnt);
- if (--smb_vfs->sv_refcnt == 0) {
- smb_llist_remove(&se->e_vfs_list, smb_vfs);
- smb_llist_exit(&se->e_vfs_list);
- smb_vfs_destroy(smb_vfs);
- return;
- }
- }
- smb_llist_exit(&se->e_vfs_list);
-}
-
-/*
- * smb_vfs_rele_all()
- *
- * Release all holds on root vnodes of file systems which were taken
- * due to the existence of at least one enabled share on the file system.
- * Called at driver close time.
- */
-void
-smb_vfs_rele_all(smb_export_t *se)
-{
- smb_vfs_t *smb_vfs;
-
- smb_llist_enter(&se->e_vfs_list, RW_WRITER);
- while ((smb_vfs = smb_llist_head(&se->e_vfs_list)) != NULL) {
-
- ASSERT(smb_vfs->sv_magic == SMB_VFS_MAGIC);
- DTRACE_PROBE1(smb_vfs_rele_all_hit, smb_vfs_t *, smb_vfs);
- smb_llist_remove(&se->e_vfs_list, smb_vfs);
- smb_vfs_destroy(smb_vfs);
- }
- smb_llist_exit(&se->e_vfs_list);
-}
-
-/*
- * Goes through the list of smb_vfs_t structure and returns the one matching
- * the vnode passed in. If no match is found a NULL pointer is returned.
- *
- * The list of smb_vfs_t structures has to have been entered prior calling
- * this function.
- */
-static smb_vfs_t *
-smb_vfs_find(smb_export_t *se, vfs_t *vfsp)
-{
- smb_vfs_t *smb_vfs;
-
- smb_vfs = smb_llist_head(&se->e_vfs_list);
- while (smb_vfs) {
- ASSERT(smb_vfs->sv_magic == SMB_VFS_MAGIC);
- if (smb_vfs->sv_vfsp == vfsp)
- return (smb_vfs);
- smb_vfs = smb_llist_next(&se->e_vfs_list, smb_vfs);
- }
-
- return (NULL);
-}
-
-static void
-smb_vfs_destroy(smb_vfs_t *smb_vfs)
-{
- VN_RELE(smb_vfs->sv_rootvp);
- smb_vfs->sv_magic = (uint32_t)~SMB_VFS_MAGIC;
- kmem_cache_free(smb_kshare_cache_vfs, smb_vfs);
-}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_vops.c b/usr/src/uts/common/fs/smbsrv/smb_vops.c
index d2f0fd7085..4b0f99839f 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_vops.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_vops.c
@@ -608,8 +608,14 @@ smb_vop_lookup(
char *np = name;
char namebuf[MAXNAMELEN];
- if (*name == '\0')
- return (EINVAL);
+ if (*name == '\0') {
+ /*
+ * This happens creating named streams at the share root.
+ */
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
ASSERT(vpp);
*vpp = NULL;
diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c
index 074d886857..8801a0e760 100644
--- a/usr/src/uts/common/io/vioblk/vioblk.c
+++ b/usr/src/uts/common/io/vioblk/vioblk.c
@@ -22,9 +22,50 @@
/*
* Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
- * Copyright 2017, Joyent Inc.
+ * Copyright 2019 Joyent Inc.
*/
+/*
+ * VIRTIO BLOCK DRIVER
+ *
+ * This driver provides support for Virtio Block devices. Each driver instance
+ * attaches to a single underlying block device.
+ *
+ * REQUEST CHAIN LAYOUT
+ *
+ * Every request chain sent to the I/O queue has the following structure. Each
+ * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
+ * the chain:
+ *
+ * +-0-----------------------------------------+
+ * | struct virtio_blk_hdr |-----------------------\
+ * | (written by driver, read by device) | |
+ * +-1-----------------------------------------+ |
+ * | optional data payload |--\ |
+ * | (written by driver for write requests, | | |
+ * | or by device for read requests) | | |
+ * +-2-----------------------------------------+ | |
+ * | ,~` : |-cookies loaned |
+ * |/ : ,~`| | from blkdev |
+ * : / | | |
+ * +-(N - 1)-----------------------------------+ | |
+ * | ... end of data payload. | | |
+ * | | | |
+ * | |--/ |
+ * +-N-----------------------------------------+ |
+ * | status byte | |
+ * | (written by device, read by driver) |--------------------\ |
+ * +-------------------------------------------+ | |
+ * | |
+ * The memory for the header and status bytes (i.e., 0 and N above) | |
+ * is allocated as a single chunk by vioblk_alloc_reqs(): | |
+ * | |
+ * +-------------------------------------------+ | |
+ * | struct virtio_blk_hdr |<----------------------/
+ * +-------------------------------------------+ |
+ * | status byte |<-------------------/
+ * +-------------------------------------------+
+ */
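A condensed sketch of how the chain above is assembled, pieced together from the virtio_chain_append() calls that appear later in this change (error handling omitted; vic, vbr, xfer and dir are as in the request functions below):

/* Descriptor 0: request header, read by the device. */
(void) virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0),
    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS);

/* Descriptors 1 .. N-1: data payload cookies loaned from blkdev. */
for (uint_t n = 0; n < xfer->x_ndmac; n++) {
	ddi_dma_cookie_t dmac;

	if (n == 0)
		dmac = xfer->x_dmac;
	else
		ddi_dma_nextcookie(xfer->x_dmah, &dmac);

	(void) virtio_chain_append(vic, dmac.dmac_laddress,
	    dmac.dmac_size, dir);
}

/* Descriptor N: status byte, written by the device. */
(void) virtio_chain_append(vic,
    virtio_dma_cookie_pa(vbr->vbr_dma, 0) + sizeof (struct vioblk_req_hdr),
    sizeof (uint8_t), VIRTIO_DIR_DEVICE_WRITES);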
#include <sys/modctl.h>
#include <sys/blkdev.h>
@@ -43,402 +84,429 @@
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
-#include "virtiovar.h"
-#include "virtioreg.h"
-
-/* Feature bits */
-#define VIRTIO_BLK_F_BARRIER (1<<0)
-#define VIRTIO_BLK_F_SIZE_MAX (1<<1)
-#define VIRTIO_BLK_F_SEG_MAX (1<<2)
-#define VIRTIO_BLK_F_GEOMETRY (1<<4)
-#define VIRTIO_BLK_F_RO (1<<5)
-#define VIRTIO_BLK_F_BLK_SIZE (1<<6)
-#define VIRTIO_BLK_F_SCSI (1<<7)
-#define VIRTIO_BLK_F_FLUSH (1<<9)
-#define VIRTIO_BLK_F_TOPOLOGY (1<<10)
-
-/* Configuration registers */
-#define VIRTIO_BLK_CONFIG_CAPACITY 0 /* 64bit */
-#define VIRTIO_BLK_CONFIG_SIZE_MAX 8 /* 32bit */
-#define VIRTIO_BLK_CONFIG_SEG_MAX 12 /* 32bit */
-#define VIRTIO_BLK_CONFIG_GEOMETRY_C 16 /* 16bit */
-#define VIRTIO_BLK_CONFIG_GEOMETRY_H 18 /* 8bit */
-#define VIRTIO_BLK_CONFIG_GEOMETRY_S 19 /* 8bit */
-#define VIRTIO_BLK_CONFIG_BLK_SIZE 20 /* 32bit */
-#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 24 /* 8bit */
-#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 25 /* 8bit */
-#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 26 /* 16bit */
-#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 28 /* 32bit */
-
-/* Command */
-#define VIRTIO_BLK_T_IN 0
-#define VIRTIO_BLK_T_OUT 1
-#define VIRTIO_BLK_T_SCSI_CMD 2
-#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
-#define VIRTIO_BLK_T_FLUSH 4
-#define VIRTIO_BLK_T_FLUSH_OUT 5
-#define VIRTIO_BLK_T_GET_ID 8
-#define VIRTIO_BLK_T_BARRIER 0x80000000
-
-#define VIRTIO_BLK_ID_BYTES 20 /* devid */
-
-/* Statuses */
-#define VIRTIO_BLK_S_OK 0
-#define VIRTIO_BLK_S_IOERR 1
-#define VIRTIO_BLK_S_UNSUPP 2
-
-#define DEF_MAXINDIRECT (128)
-#define DEF_MAXSECTOR (4096)
-
-#define VIOBLK_POISON 0xdead0001dead0001
+#include <sys/ctype.h>
+#include <sys/sysmacros.h>
-/*
- * Static Variables.
- */
-static char vioblk_ident[] = "VirtIO block driver";
+#include "virtio.h"
+#include "vioblk.h"
-/* Request header structure */
-struct vioblk_req_hdr {
- uint32_t type; /* VIRTIO_BLK_T_* */
- uint32_t ioprio;
- uint64_t sector;
-};
-struct vioblk_req {
- struct vioblk_req_hdr hdr;
- uint8_t status;
- uint8_t unused[3];
- unsigned int ndmac;
- ddi_dma_handle_t dmah;
- ddi_dma_handle_t bd_dmah;
- ddi_dma_cookie_t dmac;
- bd_xfer_t *xfer;
-};
+static void vioblk_get_id(vioblk_t *);
+uint_t vioblk_int_handler(caddr_t, caddr_t);
+static uint_t vioblk_poll(vioblk_t *);
+static int vioblk_quiesce(dev_info_t *);
+static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
+static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);
-struct vioblk_stats {
- struct kstat_named sts_rw_outofmemory;
- struct kstat_named sts_rw_badoffset;
- struct kstat_named sts_rw_queuemax;
- struct kstat_named sts_rw_cookiesmax;
- struct kstat_named sts_rw_cacheflush;
- struct kstat_named sts_intr_queuemax;
- struct kstat_named sts_intr_total;
- struct kstat_named sts_io_errors;
- struct kstat_named sts_unsupp_errors;
- struct kstat_named sts_nxio_errors;
+
+static struct dev_ops vioblk_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
+
+ .devo_attach = vioblk_attach,
+ .devo_detach = vioblk_detach,
+ .devo_quiesce = vioblk_quiesce,
+
+ .devo_getinfo = ddi_no_info,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_reset = nodev,
+ .devo_cb_ops = NULL,
+ .devo_bus_ops = NULL,
+ .devo_power = NULL,
};
-struct vioblk_lstats {
- uint64_t rw_cacheflush;
- uint64_t intr_total;
- unsigned int rw_cookiesmax;
- unsigned int intr_queuemax;
- unsigned int io_errors;
- unsigned int unsupp_errors;
- unsigned int nxio_errors;
+static struct modldrv vioblk_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "VIRTIO block driver",
+ .drv_dev_ops = &vioblk_dev_ops
};
-struct vioblk_softc {
- dev_info_t *sc_dev; /* mirrors virtio_softc->sc_dev */
- struct virtio_softc sc_virtio;
- struct virtqueue *sc_vq;
- bd_handle_t bd_h;
- struct vioblk_req *sc_reqs;
- struct vioblk_stats *ks_data;
- kstat_t *sc_intrstat;
- uint64_t sc_capacity;
- uint64_t sc_nblks;
- struct vioblk_lstats sc_stats;
- short sc_blkflags;
- boolean_t sc_in_poll_mode;
- boolean_t sc_readonly;
- int sc_blk_size;
- int sc_pblk_size;
- int sc_seg_max;
- int sc_seg_size_max;
- kmutex_t lock_devid;
- kcondvar_t cv_devid;
- char devid[VIRTIO_BLK_ID_BYTES + 1];
+static struct modlinkage vioblk_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &vioblk_modldrv, NULL }
};
-static int vioblk_get_id(struct vioblk_softc *sc);
-
-static int vioblk_read(void *arg, bd_xfer_t *xfer);
-static int vioblk_write(void *arg, bd_xfer_t *xfer);
-static int vioblk_flush(void *arg, bd_xfer_t *xfer);
-static void vioblk_driveinfo(void *arg, bd_drive_t *drive);
-static int vioblk_mediainfo(void *arg, bd_media_t *media);
-static int vioblk_devid_init(void *, dev_info_t *, ddi_devid_t *);
-uint_t vioblk_int_handler(caddr_t arg1, caddr_t arg2);
-
-static bd_ops_t vioblk_ops = {
- BD_OPS_VERSION_0,
- vioblk_driveinfo,
- vioblk_mediainfo,
- vioblk_devid_init,
- vioblk_flush,
- vioblk_read,
- vioblk_write,
+/*
+ * DMA attribute template for header and status blocks. We also make a
+ * per-instance copy of this template with negotiated sizes from the device for
+ * blkdev.
+ */
+static const ddi_dma_attr_t vioblk_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00000000FFFFFFFF,
+ .dma_attr_align = 1,
+ .dma_attr_burstsizes = 1,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0x00000000FFFFFFFF,
+ .dma_attr_seg = 0x00000000FFFFFFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
-static int vioblk_quiesce(dev_info_t *);
-static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
-static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);
-static struct dev_ops vioblk_dev_ops = {
- DEVO_REV,
- 0,
- ddi_no_info,
- nulldev, /* identify */
- nulldev, /* probe */
- vioblk_attach, /* attach */
- vioblk_detach, /* detach */
- nodev, /* reset */
- NULL, /* cb_ops */
- NULL, /* bus_ops */
- NULL, /* power */
- vioblk_quiesce /* quiesce */
-};
+static vioblk_req_t *
+vioblk_req_alloc(vioblk_t *vib)
+{
+ vioblk_req_t *vbr;
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
+ if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
+ return (NULL);
+ }
+ vib->vib_nreqs_alloc++;
-/* Standard Module linkage initialization for a Streams driver */
-extern struct mod_ops mod_driverops;
+ VERIFY0(vbr->vbr_status);
+ vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;
-static struct modldrv modldrv = {
- &mod_driverops, /* Type of module. This one is a driver */
- vioblk_ident, /* short description */
- &vioblk_dev_ops /* driver specific ops */
-};
+ VERIFY3P(vbr->vbr_xfer, ==, NULL);
+ VERIFY3S(vbr->vbr_error, ==, 0);
-static struct modlinkage modlinkage = {
- MODREV_1,
- {
- (void *)&modldrv,
- NULL,
- },
-};
+ return (vbr);
+}
-ddi_device_acc_attr_t vioblk_attr = {
- DDI_DEVICE_ATTR_V0,
- DDI_NEVERSWAP_ACC, /* virtio is always native byte order */
- DDI_STORECACHING_OK_ACC,
- DDI_DEFAULT_ACC
-};
+static void
+vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
+{
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
-/* DMA attr for the header/status blocks. */
-static ddi_dma_attr_t vioblk_req_dma_attr = {
- DMA_ATTR_V0, /* dma_attr version */
- 0, /* dma_attr_addr_lo */
- 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */
- 0x00000000FFFFFFFFull, /* dma_attr_count_max */
- 1, /* dma_attr_align */
- 1, /* dma_attr_burstsizes */
- 1, /* dma_attr_minxfer */
- 0xFFFFFFFFull, /* dma_attr_maxxfer */
- 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */
- 1, /* dma_attr_sgllen */
- 1, /* dma_attr_granular */
- 0, /* dma_attr_flags */
-};
+ /*
+ * Check that this request was allocated, then zero the status field to
+ * clear all status bits.
+ */
+ VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
+ vbr->vbr_status = 0;
-/* DMA attr for the data blocks. */
-static ddi_dma_attr_t vioblk_bd_dma_attr = {
- DMA_ATTR_V0, /* dma_attr version */
- 0, /* dma_attr_addr_lo */
- 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */
- 0x00000000FFFFFFFFull, /* dma_attr_count_max */
- 1, /* dma_attr_align */
- 1, /* dma_attr_burstsizes */
- 1, /* dma_attr_minxfer */
- 0, /* dma_attr_maxxfer, set in attach */
- 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */
- 0, /* dma_attr_sgllen, set in attach */
- 1, /* dma_attr_granular */
- 0, /* dma_attr_flags */
-};
+ vbr->vbr_xfer = NULL;
+ vbr->vbr_error = 0;
+ vbr->vbr_type = 0;
-static int
-vioblk_rw(struct vioblk_softc *sc, bd_xfer_t *xfer, int type,
- uint32_t len)
+ list_insert_head(&vib->vib_reqs, vbr);
+
+ VERIFY3U(vib->vib_nreqs_alloc, >, 0);
+ vib->vib_nreqs_alloc--;
+}
+
+static void
+vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
- struct vioblk_req *req;
- struct vq_entry *ve_hdr;
- int total_cookies, write;
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
- write = (type == VIRTIO_BLK_T_OUT ||
- type == VIRTIO_BLK_T_FLUSH_OUT) ? 1 : 0;
- total_cookies = 2;
+ VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
+ vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;
- if ((xfer->x_blkno + xfer->x_nblks) > sc->sc_nblks) {
- sc->ks_data->sts_rw_badoffset.value.ui64++;
- return (EINVAL);
+ if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
+ vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
}
- /* allocate top entry */
- ve_hdr = vq_alloc_entry(sc->sc_vq);
- if (!ve_hdr) {
- sc->ks_data->sts_rw_outofmemory.value.ui64++;
- return (ENOMEM);
+ if (vbr->vbr_xfer != NULL) {
+ /*
+ * This is a blkdev framework request.
+ */
+ mutex_exit(&vib->vib_mutex);
+ bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
+ mutex_enter(&vib->vib_mutex);
+ vbr->vbr_xfer = NULL;
}
+}
- /* getting request */
- req = &sc->sc_reqs[ve_hdr->qe_index];
- req->hdr.type = type;
- req->hdr.ioprio = 0;
- req->hdr.sector = xfer->x_blkno;
- req->xfer = xfer;
-
- /* Header */
- virtio_ve_add_indirect_buf(ve_hdr, req->dmac.dmac_laddress,
- sizeof (struct vioblk_req_hdr), B_TRUE);
-
- /* Payload */
- if (len > 0) {
- virtio_ve_add_cookie(ve_hdr, xfer->x_dmah, xfer->x_dmac,
- xfer->x_ndmac, write ? B_TRUE : B_FALSE);
- total_cookies += xfer->x_ndmac;
+static virtio_chain_t *
+vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
+ boolean_t polled)
+{
+ vioblk_req_t *vbr = NULL;
+ virtio_chain_t *vic = NULL;
+
+ if ((vbr = vioblk_req_alloc(vib)) == NULL) {
+ vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
+ return (NULL);
+ }
+ vbr->vbr_type = type;
+
+ if (polled) {
+ /*
+ * Mark this command as polled so that we can wait on it
+ * ourselves.
+ */
+ vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
}
- /* Status */
- virtio_ve_add_indirect_buf(ve_hdr,
- req->dmac.dmac_laddress + sizeof (struct vioblk_req_hdr),
- sizeof (uint8_t), B_FALSE);
+ if ((vic = virtio_chain_alloc(vib->vib_vq, KM_NOSLEEP)) == NULL) {
+ vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
+ goto fail;
+ }
- /* sending the whole chain to the device */
- virtio_push_chain(ve_hdr, B_TRUE);
+ struct vioblk_req_hdr vbh;
+ vbh.vbh_type = type;
+ vbh.vbh_ioprio = 0;
+ vbh.vbh_sector = sector;
+ bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));
- if (sc->sc_stats.rw_cookiesmax < total_cookies)
- sc->sc_stats.rw_cookiesmax = total_cookies;
+ virtio_chain_data_set(vic, vbr);
- return (DDI_SUCCESS);
+ /*
+ * Put the header in the first descriptor. See the block comment at
+ * the top of the file for more details on the chain layout.
+ */
+ if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0),
+ sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
+ DDI_SUCCESS) {
+ goto fail;
+ }
+
+ return (vic);
+
+fail:
+ vbr->vbr_xfer = NULL;
+ vioblk_req_free(vib, vbr);
+ if (vic != NULL) {
+ virtio_chain_free(vic);
+ }
+ return (NULL);
}
-/*
- * Now in polling mode. Interrupts are off, so we
- * 1) poll for the already queued requests to complete.
- * 2) push our request.
- * 3) wait for our request to complete.
- */
static int
-vioblk_rw_poll(struct vioblk_softc *sc, bd_xfer_t *xfer,
- int type, uint32_t len)
+vioblk_common_submit(vioblk_t *vib, virtio_chain_t *vic)
{
- clock_t tmout;
- int ret;
+ int r;
+ vioblk_req_t *vbr = virtio_chain_data(vic);
- ASSERT(xfer->x_flags & BD_XFER_POLL);
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
- /* Prevent a hard hang. */
- tmout = drv_usectohz(30000000);
-
- /* Poll for an empty queue */
- while (vq_num_used(sc->sc_vq)) {
- /* Check if any pending requests completed. */
- ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL);
- if (ret != DDI_INTR_CLAIMED) {
- drv_usecwait(10);
- tmout -= 10;
- return (ETIMEDOUT);
- }
+ /*
+ * The device will write the status byte into this last descriptor.
+ * See the block comment at the top of the file for more details on the
+ * chain layout.
+ */
+ if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
+ sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
+ VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
+ r = ENOMEM;
+ goto out;
}
- ret = vioblk_rw(sc, xfer, type, len);
- if (ret)
- return (ret);
+ virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
+ virtio_chain_submit(vic, B_TRUE);
+
+ if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
+ /*
+ * This is not a polled request. Our request will be freed and
+ * the caller notified later in vioblk_poll().
+ */
+ return (0);
+ }
- tmout = drv_usectohz(30000000);
- /* Poll for an empty queue again. */
- while (vq_num_used(sc->sc_vq)) {
- /* Check if any pending requests completed. */
- ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL);
- if (ret != DDI_INTR_CLAIMED) {
+ /*
+ * This is a polled request. We need to block here and wait for the
+ * device to complete request processing.
+ */
+ while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
+ if (ddi_in_panic()) {
+ /*
+ * When panicking, interrupts are disabled. We must
+ * poll the queue manually.
+ */
drv_usecwait(10);
- tmout -= 10;
- return (ETIMEDOUT);
+ (void) vioblk_poll(vib);
+ continue;
}
+
+ /*
+ * When not panicking, the device will interrupt on command
+ * completion and vioblk_poll() will be called to wake us up.
+ */
+ cv_wait(&vib->vib_cv, &vib->vib_mutex);
}
- return (DDI_SUCCESS);
+ vioblk_complete(vib, vbr);
+ r = vbr->vbr_error;
+
+out:
+ vioblk_req_free(vib, vbr);
+ virtio_chain_free(vic);
+ return (r);
}
static int
-vioblk_read(void *arg, bd_xfer_t *xfer)
+vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
+ uint64_t sector, virtio_direction_t dir)
{
- int ret;
- struct vioblk_softc *sc = (void *)arg;
+ virtio_chain_t *vic;
+ vioblk_req_t *vbr;
+ int r;
- if (xfer->x_flags & BD_XFER_POLL) {
- if (!sc->sc_in_poll_mode) {
- virtio_stop_vq_intr(sc->sc_vq);
- sc->sc_in_poll_mode = 1;
- }
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
- ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_IN,
- xfer->x_nblks * DEV_BSIZE);
- } else {
- if (sc->sc_in_poll_mode) {
- virtio_start_vq_intr(sc->sc_vq);
- sc->sc_in_poll_mode = 0;
- }
+ /*
+ * Allocate a polled request.
+ */
+ if ((vic = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
+ return (ENOMEM);
+ }
+ vbr = virtio_chain_data(vic);
- ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_IN,
- xfer->x_nblks * DEV_BSIZE);
+ /*
+ * If there is a request payload, it goes between the header and the
+ * status byte. See the block comment at the top of the file for more
+ * detail on the chain layout.
+ */
+ if (dma != NULL) {
+ for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
+ if (virtio_chain_append(vic,
+ virtio_dma_cookie_pa(dma, n),
+ virtio_dma_cookie_size(dma, n), dir) !=
+ DDI_SUCCESS) {
+ r = ENOMEM;
+ goto out;
+ }
+ }
}
- return (ret);
+ return (vioblk_common_submit(vib, vic));
+
+out:
+ vioblk_req_free(vib, vbr);
+ virtio_chain_free(vic);
+ return (r);
}
static int
-vioblk_write(void *arg, bd_xfer_t *xfer)
+vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
- int ret;
- struct vioblk_softc *sc = (void *)arg;
+ virtio_chain_t *vic = NULL;
+ vioblk_req_t *vbr = NULL;
+ uint_t total_cookies = 2;
+ boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
+ int r;
- if (xfer->x_flags & BD_XFER_POLL) {
- if (!sc->sc_in_poll_mode) {
- virtio_stop_vq_intr(sc->sc_vq);
- sc->sc_in_poll_mode = 1;
- }
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
+
+ /*
+ * Ensure that this request falls within the advertised size of the
+ * block device. Be careful to avoid overflow.
+ */
+ if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
+ (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
+ vib->vib_stats->vbs_rw_badoffset.value.ui64++;
+ return (EINVAL);
+ }
- ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_OUT,
- xfer->x_nblks * DEV_BSIZE);
- } else {
- if (sc->sc_in_poll_mode) {
- virtio_start_vq_intr(sc->sc_vq);
- sc->sc_in_poll_mode = 0;
+ if ((vic = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
+ NULL) {
+ return (ENOMEM);
+ }
+ vbr = virtio_chain_data(vic);
+ vbr->vbr_xfer = xfer;
+
+ /*
+ * If there is a request payload, it goes between the header and the
+ * status byte. See the block comment at the top of the file for more
+ * detail on the chain layout.
+ */
+ if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
+ xfer->x_nblks > 0) {
+ virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
+ VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
+
+ for (uint_t n = 0; n < xfer->x_ndmac; n++) {
+ ddi_dma_cookie_t dmac;
+
+ if (n == 0) {
+ /*
+ * The first cookie is in the blkdev request.
+ */
+ dmac = xfer->x_dmac;
+ } else {
+ ddi_dma_nextcookie(xfer->x_dmah, &dmac);
+ }
+
+ if (virtio_chain_append(vic, dmac.dmac_laddress,
+ dmac.dmac_size, dir) != DDI_SUCCESS) {
+ r = ENOMEM;
+ goto fail;
+ }
}
- ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_OUT,
- xfer->x_nblks * DEV_BSIZE);
+ total_cookies += xfer->x_ndmac;
+
+ } else if (xfer->x_nblks > 0) {
+ dev_err(vib->vib_dip, CE_PANIC,
+ "request of type %d had payload length of %lu blocks", type,
+ xfer->x_nblks);
+ }
+
+ if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
+ vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
}
- return (ret);
+
+ return (vioblk_common_submit(vib, vic));
+
+fail:
+ vbr->vbr_xfer = NULL;
+ vioblk_req_free(vib, vbr);
+ virtio_chain_free(vic);
+ return (r);
}
static int
-vioblk_flush(void *arg, bd_xfer_t *xfer)
+vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
- int ret;
- struct vioblk_softc *sc = (void *)arg;
+ vioblk_t *vib = arg;
+ int r;
+
+ mutex_enter(&vib->vib_mutex);
+ r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
+ mutex_exit(&vib->vib_mutex);
- ASSERT((xfer->x_flags & BD_XFER_POLL) == 0);
+ return (r);
+}
- ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_FLUSH_OUT,
- xfer->x_nblks * DEV_BSIZE);
+static int
+vioblk_bd_write(void *arg, bd_xfer_t *xfer)
+{
+ vioblk_t *vib = arg;
+ int r;
- if (!ret)
- sc->sc_stats.rw_cacheflush++;
+ mutex_enter(&vib->vib_mutex);
+ r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
+ mutex_exit(&vib->vib_mutex);
- return (ret);
+ return (r);
}
+static int
+vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
+{
+ vioblk_t *vib = arg;
+ int r;
+
+ mutex_enter(&vib->vib_mutex);
+ if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
+ /*
+ * We don't really expect to get here, because if we did not
+ * negotiate the flush feature we would not have installed this
+ * function in the blkdev ops vector.
+ */
+ mutex_exit(&vib->vib_mutex);
+ return (ENOTSUP);
+ }
+
+ r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
+ mutex_exit(&vib->vib_mutex);
+
+ return (r);
+}
static void
-vioblk_driveinfo(void *arg, bd_drive_t *drive)
+vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
- struct vioblk_softc *sc = (void *)arg;
+ vioblk_t *vib = arg;
- drive->d_qsize = sc->sc_vq->vq_num;
+ drive->d_qsize = vib->vib_reqs_capacity;
drive->d_removable = B_FALSE;
drive->d_hotpluggable = B_TRUE;
drive->d_target = 0;
@@ -450,8 +518,7 @@ vioblk_driveinfo(void *arg, bd_drive_t *drive)
drive->d_product = "Block Device";
drive->d_product_len = strlen(drive->d_product);
- (void) vioblk_get_id(sc);
- drive->d_serial = sc->devid;
+ drive->d_serial = vib->vib_devid;
drive->d_serial_len = strlen(drive->d_serial);
drive->d_revision = "0000";
@@ -459,618 +526,501 @@ vioblk_driveinfo(void *arg, bd_drive_t *drive)
}
static int
-vioblk_mediainfo(void *arg, bd_media_t *media)
+vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
- struct vioblk_softc *sc = (void *)arg;
+ vioblk_t *vib = (void *)arg;
- media->m_nblks = sc->sc_nblks;
- media->m_blksize = sc->sc_blk_size;
- media->m_readonly = sc->sc_readonly;
- media->m_pblksize = sc->sc_pblk_size;
+ /*
+ * The device protocol is specified in terms of 512 byte logical
+ * blocks, regardless of the recommended I/O size which might be
+ * larger.
+ */
+ media->m_nblks = vib->vib_nblks;
+ media->m_blksize = DEV_BSIZE;
+
+ media->m_readonly = vib->vib_readonly;
+ media->m_pblksize = vib->vib_pblk_size;
return (0);
}
-static int
-vioblk_get_id(struct vioblk_softc *sc)
+static void
+vioblk_get_id(vioblk_t *vib)
{
- clock_t deadline;
- int ret;
- bd_xfer_t xfer;
-
- deadline = ddi_get_lbolt() + (clock_t)drv_usectohz(3 * 1000000);
- (void) memset(&xfer, 0, sizeof (bd_xfer_t));
- xfer.x_nblks = 1;
-
- ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_bd_dma_attr,
- DDI_DMA_SLEEP, NULL, &xfer.x_dmah);
- if (ret != DDI_SUCCESS)
- goto out_alloc;
-
- ret = ddi_dma_addr_bind_handle(xfer.x_dmah, NULL, (caddr_t)&sc->devid,
- VIRTIO_BLK_ID_BYTES, DDI_DMA_READ | DDI_DMA_CONSISTENT,
- DDI_DMA_SLEEP, NULL, &xfer.x_dmac, &xfer.x_ndmac);
- if (ret != DDI_DMA_MAPPED) {
- ret = DDI_FAILURE;
- goto out_map;
- }
+ virtio_dma_t *dma;
+ int r;
- mutex_enter(&sc->lock_devid);
-
- ret = vioblk_rw(sc, &xfer, VIRTIO_BLK_T_GET_ID,
- VIRTIO_BLK_ID_BYTES);
- if (ret) {
- mutex_exit(&sc->lock_devid);
- goto out_rw;
+ if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
+ &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
+ KM_SLEEP)) == NULL) {
+ return;
}
- /* wait for reply */
- ret = cv_timedwait(&sc->cv_devid, &sc->lock_devid, deadline);
- mutex_exit(&sc->lock_devid);
-
- (void) ddi_dma_unbind_handle(xfer.x_dmah);
- ddi_dma_free_handle(&xfer.x_dmah);
+ mutex_enter(&vib->vib_mutex);
+ if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
+ VIRTIO_DIR_DEVICE_WRITES)) == 0) {
+ const char *b = virtio_dma_va(dma, 0);
+ uint_t pos = 0;
- /* timeout */
- if (ret < 0) {
- dev_err(sc->sc_dev, CE_WARN,
- "Cannot get devid from the device");
- return (DDI_FAILURE);
- }
-
- return (0);
+ /*
+ * Save the entire response for debugging purposes.
+ */
+ bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
+ VIRTIO_BLK_ID_BYTES);
-out_rw:
- (void) ddi_dma_unbind_handle(xfer.x_dmah);
-out_map:
- ddi_dma_free_handle(&xfer.x_dmah);
-out_alloc:
- return (ret);
-}
+ /*
+ * Process the returned ID.
+ */
+ bzero(vib->vib_devid, sizeof (vib->vib_devid));
+ for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
+ if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
+ /*
+ * Accept a subset of printable ASCII
+ * characters.
+ */
+ vib->vib_devid[pos++] = b[n];
+ } else {
+ /*
+ * Stop processing at the first sign of
+ * trouble.
+ */
+ break;
+ }
+ }
-static int
-vioblk_devid_init(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
-{
- struct vioblk_softc *sc = (void *)arg;
- int ret;
-
- ret = vioblk_get_id(sc);
- if (ret != DDI_SUCCESS)
- return (ret);
-
- ret = ddi_devid_init(devinfo, DEVID_ATA_SERIAL,
- VIRTIO_BLK_ID_BYTES, sc->devid, devid);
- if (ret != DDI_SUCCESS) {
- dev_err(devinfo, CE_WARN, "Cannot build devid from the device");
- return (ret);
+ vib->vib_devid_fetched = B_TRUE;
}
+ mutex_exit(&vib->vib_mutex);
- dev_debug(sc->sc_dev, CE_NOTE,
- "devid %x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x",
- sc->devid[0], sc->devid[1], sc->devid[2], sc->devid[3],
- sc->devid[4], sc->devid[5], sc->devid[6], sc->devid[7],
- sc->devid[8], sc->devid[9], sc->devid[10], sc->devid[11],
- sc->devid[12], sc->devid[13], sc->devid[14], sc->devid[15],
- sc->devid[16], sc->devid[17], sc->devid[18], sc->devid[19]);
-
- return (0);
-}
-
-static void
-vioblk_show_features(struct vioblk_softc *sc, const char *prefix,
- uint32_t features)
-{
- char buf[512];
- char *bufp = buf;
- char *bufend = buf + sizeof (buf);
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, prefix);
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += virtio_show_features(features, bufp, bufend - bufp);
-
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "Vioblk ( ");
-
- if (features & VIRTIO_BLK_F_BARRIER)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "BARRIER ");
- if (features & VIRTIO_BLK_F_SIZE_MAX)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "SIZE_MAX ");
- if (features & VIRTIO_BLK_F_SEG_MAX)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "SEG_MAX ");
- if (features & VIRTIO_BLK_F_GEOMETRY)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "GEOMETRY ");
- if (features & VIRTIO_BLK_F_RO)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "RO ");
- if (features & VIRTIO_BLK_F_BLK_SIZE)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "BLK_SIZE ");
- if (features & VIRTIO_BLK_F_SCSI)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "SCSI ");
- if (features & VIRTIO_BLK_F_FLUSH)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "FLUSH ");
- if (features & VIRTIO_BLK_F_TOPOLOGY)
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, "TOPOLOGY ");
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, ")");
- *bufp = '\0';
-
- dev_debug(sc->sc_dev, CE_NOTE, "%s", buf);
+ virtio_dma_free(dma);
}
static int
-vioblk_dev_features(struct vioblk_softc *sc)
+vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
- uint32_t host_features;
-
- host_features = virtio_negotiate_features(&sc->sc_virtio,
- VIRTIO_BLK_F_RO |
- VIRTIO_BLK_F_GEOMETRY |
- VIRTIO_BLK_F_BLK_SIZE |
- VIRTIO_BLK_F_FLUSH |
- VIRTIO_BLK_F_TOPOLOGY |
- VIRTIO_BLK_F_SEG_MAX |
- VIRTIO_BLK_F_SIZE_MAX |
- VIRTIO_F_RING_INDIRECT_DESC);
-
- vioblk_show_features(sc, "Host features: ", host_features);
- vioblk_show_features(sc, "Negotiated features: ",
- sc->sc_virtio.sc_features);
-
- if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) {
- dev_err(sc->sc_dev, CE_NOTE,
- "Host does not support RING_INDIRECT_DESC, bye.");
+ vioblk_t *vib = arg;
+ size_t len;
+
+ if ((len = strlen(vib->vib_devid)) == 0) {
+ /*
+ * The device has no ID.
+ */
return (DDI_FAILURE);
}
- return (DDI_SUCCESS);
+ return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
+ devid));
}
-/* ARGSUSED */
-uint_t
-vioblk_int_handler(caddr_t arg1, caddr_t arg2)
+/*
+ * As the device completes processing of a request, it returns the chain for
+ * that request to our I/O queue. This routine is called in two contexts:
+ * - from the interrupt handler, in response to notification from the device
+ * - synchronously in line with request processing when panicking
+ */
+static uint_t
+vioblk_poll(vioblk_t *vib)
{
- struct virtio_softc *vsc = (void *)arg1;
- struct vioblk_softc *sc = __containerof(vsc,
- struct vioblk_softc, sc_virtio);
- struct vq_entry *ve;
- uint32_t len;
- int i = 0, error;
-
- while ((ve = virtio_pull_chain(sc->sc_vq, &len))) {
- struct vioblk_req *req = &sc->sc_reqs[ve->qe_index];
- bd_xfer_t *xfer = req->xfer;
- uint8_t status = req->status;
- uint32_t type = req->hdr.type;
-
- if (req->xfer == (void *)VIOBLK_POISON) {
- dev_err(sc->sc_dev, CE_WARN, "Poisoned descriptor!");
- virtio_free_chain(ve);
- return (DDI_INTR_CLAIMED);
- }
+ virtio_chain_t *vic;
+ uint_t count = 0;
+ boolean_t wakeup = B_FALSE;
+
+ VERIFY(MUTEX_HELD(&vib->vib_mutex));
- req->xfer = (void *) VIOBLK_POISON;
+ while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
+ vioblk_req_t *vbr = virtio_chain_data(vic);
+ uint8_t status;
- /* Note: blkdev tears down the payload mapping for us. */
- virtio_free_chain(ve);
+ virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);
+
+ bcopy(virtio_dma_va(vbr->vbr_dma,
+ sizeof (struct vioblk_req_hdr)), &status, sizeof (status));
- /* returning payload back to blkdev */
switch (status) {
- case VIRTIO_BLK_S_OK:
- error = 0;
- break;
- case VIRTIO_BLK_S_IOERR:
- error = EIO;
- sc->sc_stats.io_errors++;
- break;
- case VIRTIO_BLK_S_UNSUPP:
- sc->sc_stats.unsupp_errors++;
- error = ENOTTY;
- break;
- default:
- sc->sc_stats.nxio_errors++;
- error = ENXIO;
- break;
+ case VIRTIO_BLK_S_OK:
+ vbr->vbr_error = 0;
+ break;
+ case VIRTIO_BLK_S_IOERR:
+ vbr->vbr_error = EIO;
+ vib->vib_stats->vbs_io_errors.value.ui64++;
+ break;
+ case VIRTIO_BLK_S_UNSUPP:
+ vbr->vbr_error = ENOTTY;
+ vib->vib_stats->vbs_unsupp_errors.value.ui64++;
+ break;
+ default:
+ vbr->vbr_error = ENXIO;
+ vib->vib_stats->vbs_nxio_errors.value.ui64++;
+ break;
+ }
+
+ count++;
+
+ if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
+ /*
+ * This request must not be freed as it is being held
+ * by a call to vioblk_common_submit().
+ */
+ VERIFY(!(vbr->vbr_status &
+ VIOBLK_REQSTAT_POLL_COMPLETE));
+ vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
+ wakeup = B_TRUE;
+ continue;
}
- if (type == VIRTIO_BLK_T_GET_ID) {
- /* notify devid_init */
- mutex_enter(&sc->lock_devid);
- cv_broadcast(&sc->cv_devid);
- mutex_exit(&sc->lock_devid);
- } else
- bd_xfer_done(xfer, error);
+ vioblk_complete(vib, vbr);
- i++;
+ vioblk_req_free(vib, vbr);
+ virtio_chain_free(vic);
}
- /* update stats */
- if (sc->sc_stats.intr_queuemax < i)
- sc->sc_stats.intr_queuemax = i;
- sc->sc_stats.intr_total++;
+ if (wakeup) {
+ /*
+ * Signal anybody waiting for polled command completion.
+ */
+ cv_broadcast(&vib->vib_cv);
+ }
- return (DDI_INTR_CLAIMED);
+ return (count);
}
-/* ARGSUSED */
uint_t
-vioblk_config_handler(caddr_t arg1, caddr_t arg2)
-{
- return (DDI_INTR_CLAIMED);
-}
-
-static int
-vioblk_register_ints(struct vioblk_softc *sc)
+vioblk_int_handler(caddr_t arg0, caddr_t arg1)
{
- int ret;
+ vioblk_t *vib = (vioblk_t *)arg0;
+ uint_t count;
- struct virtio_int_handler vioblk_conf_h = {
- vioblk_config_handler
- };
-
- struct virtio_int_handler vioblk_vq_h[] = {
- { vioblk_int_handler },
- { NULL },
- };
+ mutex_enter(&vib->vib_mutex);
+ if ((count = vioblk_poll(vib)) >
+ vib->vib_stats->vbs_intr_queuemax.value.ui32) {
+ vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
+ }
- ret = virtio_register_ints(&sc->sc_virtio,
- &vioblk_conf_h, vioblk_vq_h);
+ vib->vib_stats->vbs_intr_total.value.ui64++;
+ mutex_exit(&vib->vib_mutex);
- return (ret);
+ return (DDI_INTR_CLAIMED);
}
static void
-vioblk_free_reqs(struct vioblk_softc *sc)
+vioblk_free_reqs(vioblk_t *vib)
{
- int i, qsize;
+ VERIFY3U(vib->vib_nreqs_alloc, ==, 0);
- qsize = sc->sc_vq->vq_num;
+ for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
+ struct vioblk_req *vbr = &vib->vib_reqs_mem[i];
- for (i = 0; i < qsize; i++) {
- struct vioblk_req *req = &sc->sc_reqs[i];
+ VERIFY(list_link_active(&vbr->vbr_link));
+ list_remove(&vib->vib_reqs, vbr);
- if (req->ndmac)
- (void) ddi_dma_unbind_handle(req->dmah);
+ VERIFY0(vbr->vbr_status);
- if (req->dmah)
- ddi_dma_free_handle(&req->dmah);
+ if (vbr->vbr_dma != NULL) {
+ virtio_dma_free(vbr->vbr_dma);
+ vbr->vbr_dma = NULL;
+ }
}
+ VERIFY(list_is_empty(&vib->vib_reqs));
- kmem_free(sc->sc_reqs, sizeof (struct vioblk_req) * qsize);
+ if (vib->vib_reqs_mem != NULL) {
+ kmem_free(vib->vib_reqs_mem,
+ sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
+ vib->vib_reqs_mem = NULL;
+ vib->vib_reqs_capacity = 0;
+ }
}
static int
-vioblk_alloc_reqs(struct vioblk_softc *sc)
+vioblk_alloc_reqs(vioblk_t *vib)
{
- int i, qsize;
- int ret;
-
- qsize = sc->sc_vq->vq_num;
-
- sc->sc_reqs = kmem_zalloc(sizeof (struct vioblk_req) * qsize, KM_SLEEP);
-
- for (i = 0; i < qsize; i++) {
- struct vioblk_req *req = &sc->sc_reqs[i];
-
- ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_req_dma_attr,
- DDI_DMA_SLEEP, NULL, &req->dmah);
- if (ret != DDI_SUCCESS) {
-
- dev_err(sc->sc_dev, CE_WARN,
- "Can't allocate dma handle for req "
- "buffer %d", i);
- goto exit;
- }
+ vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
+ VIRTIO_BLK_REQ_BUFS);
+ vib->vib_reqs_mem = kmem_zalloc(
+ sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
+ vib->vib_nreqs_alloc = 0;
+
+ for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
+ list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
+ }
- ret = ddi_dma_addr_bind_handle(req->dmah, NULL,
- (caddr_t)&req->hdr,
+ for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
+ vbr = list_next(&vib->vib_reqs, vbr)) {
+ if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
- DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP,
- NULL, &req->dmac, &req->ndmac);
- if (ret != DDI_DMA_MAPPED) {
- dev_err(sc->sc_dev, CE_WARN,
- "Can't bind req buffer %d", i);
- goto exit;
+ &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
+ KM_SLEEP)) == NULL) {
+ goto fail;
}
}
return (0);
-exit:
- vioblk_free_reqs(sc);
+fail:
+ vioblk_free_reqs(vib);
return (ENOMEM);
}
-
-static int
-vioblk_ksupdate(kstat_t *ksp, int rw)
-{
- struct vioblk_softc *sc = ksp->ks_private;
-
- if (rw == KSTAT_WRITE)
- return (EACCES);
-
- sc->ks_data->sts_rw_cookiesmax.value.ui32 = sc->sc_stats.rw_cookiesmax;
- sc->ks_data->sts_intr_queuemax.value.ui32 = sc->sc_stats.intr_queuemax;
- sc->ks_data->sts_unsupp_errors.value.ui32 = sc->sc_stats.unsupp_errors;
- sc->ks_data->sts_nxio_errors.value.ui32 = sc->sc_stats.nxio_errors;
- sc->ks_data->sts_io_errors.value.ui32 = sc->sc_stats.io_errors;
- sc->ks_data->sts_rw_cacheflush.value.ui64 = sc->sc_stats.rw_cacheflush;
- sc->ks_data->sts_intr_total.value.ui64 = sc->sc_stats.intr_total;
-
-
- return (0);
-}
-
static int
-vioblk_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
+vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
- int ret = DDI_SUCCESS;
- int instance;
- struct vioblk_softc *sc;
- struct virtio_softc *vsc;
- struct vioblk_stats *ks_data;
+ int instance = ddi_get_instance(dip);
+ vioblk_t *vib;
+ virtio_t *vio;
+ boolean_t did_mutex = B_FALSE;
- instance = ddi_get_instance(devinfo);
-
- switch (cmd) {
- case DDI_ATTACH:
- break;
-
- case DDI_RESUME:
- case DDI_PM_RESUME:
- dev_err(devinfo, CE_WARN, "resume not supported yet");
+ if (cmd != DDI_ATTACH) {
return (DDI_FAILURE);
+ }
- default:
- dev_err(devinfo, CE_WARN, "cmd 0x%x not recognized", cmd);
+ if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
+ NULL) {
+ dev_err(dip, CE_WARN, "failed to start Virtio init");
return (DDI_FAILURE);
}
- sc = kmem_zalloc(sizeof (struct vioblk_softc), KM_SLEEP);
- ddi_set_driver_private(devinfo, sc);
-
- vsc = &sc->sc_virtio;
+ vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
+ vib->vib_dip = dip;
+ vib->vib_virtio = vio;
+ ddi_set_driver_private(dip, vib);
+ list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
+ offsetof(vioblk_req_t, vbr_link));
- /* Duplicate for faster access / less typing */
- sc->sc_dev = devinfo;
- vsc->sc_dev = devinfo;
+ /*
+ * Determine how many scatter-gather entries we can use in a single
+ * request.
+ */
+ vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
+ if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
+ vib->vib_seg_max = virtio_dev_get32(vio,
+ VIRTIO_BLK_CONFIG_SEG_MAX);
- cv_init(&sc->cv_devid, NULL, CV_DRIVER, NULL);
- mutex_init(&sc->lock_devid, NULL, MUTEX_DRIVER, NULL);
+ if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
+ /*
+ * We need to be able to use at least one data segment,
+ * so we'll assume that this device is just poorly
+ * implemented and try for one.
+ */
+ vib->vib_seg_max = 1;
+ }
+ }
/*
- * Initialize interrupt kstat. This should not normally fail, since
- * we don't use a persistent stat. We do it this way to avoid having
- * to test for it at run time on the hot path.
+ * When allocating the request queue, we include two additional
+ * descriptors (beyond those required for request data) to account for
+ * the header and the status byte.
*/
- sc->sc_intrstat = kstat_create("vioblk", instance,
- "intrs", "controller", KSTAT_TYPE_NAMED,
+ if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
+ vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
+ goto fail;
+ }
+
+ if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to complete Virtio init");
+ goto fail;
+ }
+
+ cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
+ did_mutex = B_TRUE;
+
+ if ((vib->vib_kstat = kstat_create("vioblk", instance,
+ "statistics", "controller", KSTAT_TYPE_NAMED,
sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_PERSISTENT);
- if (sc->sc_intrstat == NULL) {
- dev_err(devinfo, CE_WARN, "kstat_create failed");
- goto exit_intrstat;
+ KSTAT_FLAG_PERSISTENT)) == NULL) {
+ dev_err(dip, CE_WARN, "kstat_create failed");
+ goto fail;
}
- ks_data = (struct vioblk_stats *)sc->sc_intrstat->ks_data;
- kstat_named_init(&ks_data->sts_rw_outofmemory,
+ vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
+ kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
"total_rw_outofmemory", KSTAT_DATA_UINT64);
- kstat_named_init(&ks_data->sts_rw_badoffset,
+ kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
"total_rw_badoffset", KSTAT_DATA_UINT64);
- kstat_named_init(&ks_data->sts_intr_total,
+ kstat_named_init(&vib->vib_stats->vbs_intr_total,
"total_intr", KSTAT_DATA_UINT64);
- kstat_named_init(&ks_data->sts_io_errors,
- "total_io_errors", KSTAT_DATA_UINT32);
- kstat_named_init(&ks_data->sts_unsupp_errors,
- "total_unsupp_errors", KSTAT_DATA_UINT32);
- kstat_named_init(&ks_data->sts_nxio_errors,
- "total_nxio_errors", KSTAT_DATA_UINT32);
- kstat_named_init(&ks_data->sts_rw_cacheflush,
+ kstat_named_init(&vib->vib_stats->vbs_io_errors,
+ "total_io_errors", KSTAT_DATA_UINT64);
+ kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
+ "total_unsupp_errors", KSTAT_DATA_UINT64);
+ kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
+ "total_nxio_errors", KSTAT_DATA_UINT64);
+ kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
"total_rw_cacheflush", KSTAT_DATA_UINT64);
- kstat_named_init(&ks_data->sts_rw_cookiesmax,
+ kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
"max_rw_cookies", KSTAT_DATA_UINT32);
- kstat_named_init(&ks_data->sts_intr_queuemax,
+ kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
"max_intr_queue", KSTAT_DATA_UINT32);
- sc->ks_data = ks_data;
- sc->sc_intrstat->ks_private = sc;
- sc->sc_intrstat->ks_update = vioblk_ksupdate;
- kstat_install(sc->sc_intrstat);
-
- /* map BAR0 */
- ret = ddi_regs_map_setup(devinfo, 1,
- (caddr_t *)&sc->sc_virtio.sc_io_addr,
- 0, 0, &vioblk_attr, &sc->sc_virtio.sc_ioh);
- if (ret != DDI_SUCCESS) {
- dev_err(devinfo, CE_WARN, "unable to map bar0: [%d]", ret);
- goto exit_map;
+ kstat_install(vib->vib_kstat);
+
+ vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
+ if ((vib->vib_nblks = virtio_dev_get64(vio,
+ VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
+ dev_err(dip, CE_WARN, "invalid capacity");
+ goto fail;
}
- virtio_device_reset(&sc->sc_virtio);
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK);
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER);
+ /*
+ * Determine the optimal logical block size recommended by the device.
+ * This size is advisory; the protocol always deals in 512 byte blocks.
+ */
+ vib->vib_blk_size = DEV_BSIZE;
+ if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
+ uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);
- if (vioblk_register_ints(sc)) {
- dev_err(devinfo, CE_WARN, "Unable to add interrupt");
- goto exit_int;
+ if (v != 0 && v != PCI_EINVAL32) {
+ vib->vib_blk_size = v;
+ }
}
- ret = vioblk_dev_features(sc);
- if (ret)
- goto exit_features;
+ /*
+ * The device may also provide an advisory physical block size.
+ */
+ vib->vib_pblk_size = vib->vib_blk_size;
+ if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
+ uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);
- if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_RO)
- sc->sc_readonly = B_TRUE;
- else
- sc->sc_readonly = B_FALSE;
+ if (v != PCI_EINVAL8) {
+ vib->vib_pblk_size <<= v;
+ }
+ }
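For example, assuming the default 512-byte logical block size, a device reporting a physical block exponent (VIRTIO_BLK_CONFIG_TOPO_PBEXP) of 3 yields an advertised physical block size of 512 << 3 = 4096 bytes.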
- sc->sc_capacity = virtio_read_device_config_8(&sc->sc_virtio,
- VIRTIO_BLK_CONFIG_CAPACITY);
- sc->sc_nblks = sc->sc_capacity;
+ /*
+ * The maximum size for a cookie in a request.
+ */
+ vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
+ if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
+ uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);
- sc->sc_blk_size = DEV_BSIZE;
- if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_BLK_SIZE) {
- sc->sc_blk_size = virtio_read_device_config_4(&sc->sc_virtio,
- VIRTIO_BLK_CONFIG_BLK_SIZE);
+ if (v != 0 && v != PCI_EINVAL32) {
+ vib->vib_seg_size_max = v;
+ }
}
- sc->sc_pblk_size = sc->sc_blk_size;
- if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_TOPOLOGY) {
- sc->sc_pblk_size <<= virtio_read_device_config_1(&sc->sc_virtio,
- VIRTIO_BLK_CONFIG_TOPO_PBEXP);
+ /*
+ * Set up the DMA attributes for blkdev to use for request data. The
+ * specification is not extremely clear about whether DMA-related
+ * parameters include or exclude the header and status descriptors.
+ * For now, we assume they cover only the request data and not the
+ * headers.
+ */
+ vib->vib_bd_dma_attr = vioblk_dma_attr;
+ vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
+ vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
+ vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
+ vib->vib_seg_size_max;
+
+ if (vioblk_alloc_reqs(vib) != 0) {
+ goto fail;
}
- /* Flushing is not supported. */
- if (!(sc->sc_virtio.sc_features & VIRTIO_BLK_F_FLUSH)) {
- vioblk_ops.o_sync_cache = NULL;
+ /*
+ * The blkdev framework does not provide a way to specify that the
+ * device does not support write cache flushing, except by omitting the
+ * "o_sync_cache" member from the ops vector. As "bd_alloc_handle()"
+ * makes a copy of the ops vector, we can safely assemble one on the
+ * stack based on negotiated features.
+ */
+ bd_ops_t vioblk_bd_ops = {
+ .o_version = BD_OPS_VERSION_0,
+ .o_drive_info = vioblk_bd_driveinfo,
+ .o_media_info = vioblk_bd_mediainfo,
+ .o_devid_init = vioblk_bd_devid,
+ .o_sync_cache = vioblk_bd_flush,
+ .o_read = vioblk_bd_read,
+ .o_write = vioblk_bd_write,
+ };
+ if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
+ vioblk_bd_ops.o_sync_cache = NULL;
}
- sc->sc_seg_max = DEF_MAXINDIRECT;
- /* The max number of segments (cookies) in a request */
- if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SEG_MAX) {
- sc->sc_seg_max = virtio_read_device_config_4(&sc->sc_virtio,
- VIRTIO_BLK_CONFIG_SEG_MAX);
-
- /* That's what Linux does. */
- if (!sc->sc_seg_max)
- sc->sc_seg_max = 1;
+ vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
+ &vib->vib_bd_dma_attr, KM_SLEEP);
- /*
- * SEG_MAX corresponds to the number of _data_
- * blocks in a request
- */
- sc->sc_seg_max += 2;
+ /*
+ * Enable interrupts now so that we can request the device identity.
+ */
+ if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
+ goto fail;
}
- /* 2 descriptors taken for header/status */
- vioblk_bd_dma_attr.dma_attr_sgllen = sc->sc_seg_max - 2;
+ vioblk_get_id(vib);
- /* The maximum size for a cookie in a request. */
- sc->sc_seg_size_max = DEF_MAXSECTOR;
- if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SIZE_MAX) {
- sc->sc_seg_size_max = virtio_read_device_config_4(
- &sc->sc_virtio, VIRTIO_BLK_CONFIG_SIZE_MAX);
+ if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "Failed to attach blkdev");
+ goto fail;
}
- /* The maximum request size */
- vioblk_bd_dma_attr.dma_attr_maxxfer =
- vioblk_bd_dma_attr.dma_attr_sgllen * sc->sc_seg_size_max;
-
- dev_debug(devinfo, CE_NOTE,
- "nblks=%" PRIu64 " blksize=%d (%d) num_seg=%d, "
- "seg_size=%d, maxxfer=%" PRIu64,
- sc->sc_nblks, sc->sc_blk_size, sc->sc_pblk_size,
- vioblk_bd_dma_attr.dma_attr_sgllen,
- sc->sc_seg_size_max,
- vioblk_bd_dma_attr.dma_attr_maxxfer);
-
+ return (DDI_SUCCESS);
- sc->sc_vq = virtio_alloc_vq(&sc->sc_virtio, 0, 0,
- sc->sc_seg_max, "I/O request");
- if (sc->sc_vq == NULL) {
- goto exit_alloc1;
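+
+ /*
+ * Failure path: unwind only the resources that were successfully set
+ * up above before returning.
+ */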
+fail:
+ if (vib->vib_bd_h != NULL) {
+ (void) bd_detach_handle(vib->vib_bd_h);
+ bd_free_handle(vib->vib_bd_h);
}
-
- ret = vioblk_alloc_reqs(sc);
- if (ret) {
- goto exit_alloc2;
+ if (vio != NULL) {
+ (void) virtio_fini(vio, B_TRUE);
}
-
- sc->bd_h = bd_alloc_handle(sc, &vioblk_ops, &vioblk_bd_dma_attr,
- KM_SLEEP);
-
-
- virtio_set_status(&sc->sc_virtio,
- VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK);
- virtio_start_vq_intr(sc->sc_vq);
-
- ret = virtio_enable_ints(&sc->sc_virtio);
- if (ret)
- goto exit_enable_ints;
-
- ret = bd_attach_handle(devinfo, sc->bd_h);
- if (ret != DDI_SUCCESS) {
- dev_err(devinfo, CE_WARN, "Failed to attach blkdev");
- goto exit_attach_bd;
+ if (did_mutex) {
+ mutex_destroy(&vib->vib_mutex);
+ cv_destroy(&vib->vib_cv);
}
-
- return (DDI_SUCCESS);
-
-exit_attach_bd:
- /*
- * There is no virtio_disable_ints(), it's done in virtio_release_ints.
- * If they ever get split, don't forget to add a call here.
- */
-exit_enable_ints:
- virtio_stop_vq_intr(sc->sc_vq);
- bd_free_handle(sc->bd_h);
- vioblk_free_reqs(sc);
-exit_alloc2:
- virtio_free_vq(sc->sc_vq);
-exit_alloc1:
-exit_features:
- virtio_release_ints(&sc->sc_virtio);
-exit_int:
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED);
- ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
-exit_map:
- kstat_delete(sc->sc_intrstat);
-exit_intrstat:
- mutex_destroy(&sc->lock_devid);
- cv_destroy(&sc->cv_devid);
- kmem_free(sc, sizeof (struct vioblk_softc));
+ if (vib->vib_kstat != NULL) {
+ kstat_delete(vib->vib_kstat);
+ }
+ vioblk_free_reqs(vib);
+ kmem_free(vib, sizeof (*vib));
return (DDI_FAILURE);
}
static int
-vioblk_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
+vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
- struct vioblk_softc *sc = ddi_get_driver_private(devinfo);
+ vioblk_t *vib = ddi_get_driver_private(dip);
- switch (cmd) {
- case DDI_DETACH:
- break;
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
- case DDI_PM_SUSPEND:
- cmn_err(CE_WARN, "suspend not supported yet");
+ mutex_enter(&vib->vib_mutex);
+ if (vib->vib_nreqs_alloc > 0) {
+ /*
+ * Cannot detach while there are still outstanding requests.
+ */
+ mutex_exit(&vib->vib_mutex);
return (DDI_FAILURE);
+ }
- default:
- cmn_err(CE_WARN, "cmd 0x%x unrecognized", cmd);
+ if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
+ mutex_exit(&vib->vib_mutex);
return (DDI_FAILURE);
}
- (void) bd_detach_handle(sc->bd_h);
- virtio_stop_vq_intr(sc->sc_vq);
- virtio_release_ints(&sc->sc_virtio);
- vioblk_free_reqs(sc);
- virtio_free_vq(sc->sc_vq);
- virtio_device_reset(&sc->sc_virtio);
- ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
- kstat_delete(sc->sc_intrstat);
- kmem_free(sc, sizeof (struct vioblk_softc));
+ /*
+ * Tear down the Virtio framework before freeing the rest of the
+ * resources. This will ensure the interrupt handlers are no longer
+ * running.
+ */
+ virtio_fini(vib->vib_virtio, B_FALSE);
+
+ vioblk_free_reqs(vib);
+ kstat_delete(vib->vib_kstat);
+
+ mutex_exit(&vib->vib_mutex);
+ mutex_destroy(&vib->vib_mutex);
+
+ kmem_free(vib, sizeof (*vib));
return (DDI_SUCCESS);
}
static int
-vioblk_quiesce(dev_info_t *devinfo)
+vioblk_quiesce(dev_info_t *dip)
{
- struct vioblk_softc *sc = ddi_get_driver_private(devinfo);
+ vioblk_t *vib;
- virtio_stop_vq_intr(sc->sc_vq);
- virtio_device_reset(&sc->sc_virtio);
+ if ((vib = ddi_get_driver_private(dip)) == NULL) {
+ return (DDI_FAILURE);
+ }
- return (DDI_SUCCESS);
+ return (virtio_quiesce(vib->vib_virtio));
}
int
@@ -1080,7 +1030,7 @@ _init(void)
bd_mod_init(&vioblk_dev_ops);
- if ((rv = mod_install(&modlinkage)) != 0) {
+ if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
bd_mod_fini(&vioblk_dev_ops);
}
@@ -1092,7 +1042,7 @@ _fini(void)
{
int rv;
- if ((rv = mod_remove(&modlinkage)) == 0) {
+ if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
bd_mod_fini(&vioblk_dev_ops);
}
@@ -1102,5 +1052,5 @@ _fini(void)
int
_info(struct modinfo *modinfop)
{
- return (mod_info(&modlinkage, modinfop));
+ return (mod_info(&vioblk_modlinkage, modinfop));
}
diff --git a/usr/src/uts/common/io/vioblk/vioblk.h b/usr/src/uts/common/io/vioblk/vioblk.h
new file mode 100644
index 0000000000..e08fc31e8f
--- /dev/null
+++ b/usr/src/uts/common/io/vioblk/vioblk.h
@@ -0,0 +1,212 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * VIRTIO BLOCK DRIVER
+ */
+
+#ifndef _VIOBLK_H
+#define _VIOBLK_H
+
+#include "virtio.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VIRTIO BLOCK CONFIGURATION REGISTERS
+ *
+ * These are offsets into the device-specific configuration space available
+ * through the virtio_dev_*() family of functions.
+ */
+#define VIRTIO_BLK_CONFIG_CAPACITY 0x00 /* 64 R */
+#define VIRTIO_BLK_CONFIG_SIZE_MAX 0x08 /* 32 R */
+#define VIRTIO_BLK_CONFIG_SEG_MAX 0x0C /* 32 R */
+#define VIRTIO_BLK_CONFIG_GEOMETRY_C 0x10 /* 16 R */
+#define VIRTIO_BLK_CONFIG_GEOMETRY_H 0x12 /* 8 R */
+#define VIRTIO_BLK_CONFIG_GEOMETRY_S 0x13 /* 8 R */
+#define VIRTIO_BLK_CONFIG_BLK_SIZE 0x14 /* 32 R */
+#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 0x18 /* 8 R */
+#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 0x19 /* 8 R */
+#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 0x1A /* 16 R */
+#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 0x1C /* 32 R */
+
+/*
+ * VIRTIO BLOCK VIRTQUEUES
+ *
+ * Virtio block devices have just one queue which is used to make the various
+ * supported I/O requests.
+ */
+#define VIRTIO_BLK_VIRTQ_IO 0
+
+/*
+ * VIRTIO BLOCK FEATURE BITS
+ */
+#define VIRTIO_BLK_F_BARRIER (1ULL << 0)
+#define VIRTIO_BLK_F_SIZE_MAX (1ULL << 1)
+#define VIRTIO_BLK_F_SEG_MAX (1ULL << 2)
+#define VIRTIO_BLK_F_GEOMETRY (1ULL << 4)
+#define VIRTIO_BLK_F_RO (1ULL << 5)
+#define VIRTIO_BLK_F_BLK_SIZE (1ULL << 6)
+#define VIRTIO_BLK_F_SCSI (1ULL << 7)
+#define VIRTIO_BLK_F_FLUSH (1ULL << 9)
+#define VIRTIO_BLK_F_TOPOLOGY (1ULL << 10)
+
+/*
+ * These features are supported by the driver and we will request them from the
+ * device.
+ */
+#define VIRTIO_BLK_WANTED_FEATURES (VIRTIO_BLK_F_RO | \
+ VIRTIO_BLK_F_BLK_SIZE | \
+ VIRTIO_BLK_F_FLUSH | \
+ VIRTIO_BLK_F_TOPOLOGY | \
+ VIRTIO_BLK_F_SEG_MAX | \
+ VIRTIO_BLK_F_SIZE_MAX)
+
+/*
+ * VIRTIO BLOCK REQUEST HEADER
+ *
+ * This structure appears at the start of each I/O request buffer. Note that
+ * neither the data payload nor the status byte appears in this structure, as
+ * both are handled in separate descriptor entries.
+ */
+struct vioblk_req_hdr {
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * VIRTIO BLOCK REQUEST HEADER: COMMANDS (vbh_type)
+ *
+ * Each of these is a command type, except for BARRIER which is logically
+ * OR-ed with one of the other types.
+ */
+#define VIRTIO_BLK_T_IN 0
+#define VIRTIO_BLK_T_OUT 1
+#define VIRTIO_BLK_T_SCSI_CMD 2
+#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
+#define VIRTIO_BLK_T_FLUSH 4
+#define VIRTIO_BLK_T_FLUSH_OUT 5
+#define VIRTIO_BLK_T_GET_ID 8
+#define VIRTIO_BLK_T_BARRIER 0x80000000
+
+/*
+ * The GET_ID command type does not appear in the specification, but
+ * implementations in the wild use a 20-byte buffer into which the device will
+ * write an ASCII string. The string should not be assumed to be
+ * NUL-terminated.
+ */
+#define VIRTIO_BLK_ID_BYTES 20
+
+/*
+ * VIRTIO BLOCK REQUEST HEADER: STATUS CODES
+ *
+ * These are returned in the writeable status byte descriptor included at the
+ * end of each request passed to the device.
+ */
+#define VIRTIO_BLK_S_OK 0
+#define VIRTIO_BLK_S_IOERR 1
+#define VIRTIO_BLK_S_UNSUPP 2
+
+/*
+ * DRIVER PARAMETERS
+ */
+
+/*
+ * In the event that the device does not negotiate DMA parameters, we have to
+ * make a best guess.
+ */
+#define VIRTIO_BLK_DEFAULT_MAX_SEG 128
+#define VIRTIO_BLK_DEFAULT_MAX_SIZE 4096
+
+/*
+ * We allocate a fixed number of request buffers in advance and place them in a
+ * per-instance free list.
+ */
+#define VIRTIO_BLK_REQ_BUFS 256
+
+/*
+ * TYPE DEFINITIONS
+ */
+
+typedef enum vioblk_req_status {
+ VIOBLK_REQSTAT_ALLOCATED = (0x1 << 0),
+ VIOBLK_REQSTAT_INFLIGHT = (0x1 << 1),
+ VIOBLK_REQSTAT_COMPLETE = (0x1 << 2),
+ VIOBLK_REQSTAT_POLLED = (0x1 << 3),
+ VIOBLK_REQSTAT_POLL_COMPLETE = (0x1 << 4),
+} vioblk_req_status_t;
+
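+/*
+ * Per-request tracking object.  A fixed number of these are allocated when
+ * the driver attaches (see VIRTIO_BLK_REQ_BUFS) and kept on the per-instance
+ * free list, "vib_reqs".
+ */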
+typedef struct vioblk_req {
+ vioblk_req_status_t vbr_status;
+ uint64_t vbr_seqno;
+ int vbr_type;
+ int vbr_error;
+ virtio_dma_t *vbr_dma;
+ bd_xfer_t *vbr_xfer;
+ list_node_t vbr_link;
+} vioblk_req_t;
+
+typedef struct vioblk_stats {
+ struct kstat_named vbs_rw_outofmemory;
+ struct kstat_named vbs_rw_badoffset;
+ struct kstat_named vbs_rw_queuemax;
+ struct kstat_named vbs_rw_cookiesmax;
+ struct kstat_named vbs_rw_cacheflush;
+ struct kstat_named vbs_intr_queuemax;
+ struct kstat_named vbs_intr_total;
+ struct kstat_named vbs_io_errors;
+ struct kstat_named vbs_unsupp_errors;
+ struct kstat_named vbs_nxio_errors;
+} vioblk_stats_t;
+
+typedef struct vioblk {
+ dev_info_t *vib_dip;
+ virtio_t *vib_virtio;
+ virtio_queue_t *vib_vq;
+
+ kmutex_t vib_mutex;
+ kcondvar_t vib_cv;
+
+ bd_handle_t vib_bd_h;
+ ddi_dma_attr_t vib_bd_dma_attr;
+
+ list_t vib_reqs;
+ uint_t vib_nreqs_alloc;
+ uint_t vib_reqs_capacity;
+ vioblk_req_t *vib_reqs_mem;
+
+ kstat_t *vib_kstat;
+ vioblk_stats_t *vib_stats;
+
+ uint64_t vib_nblks;
+ boolean_t vib_readonly;
+ uint_t vib_blk_size;
+ uint_t vib_pblk_size;
+ uint_t vib_seg_max;
+ uint_t vib_seg_size_max;
+
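+ /*
+ * Device identity string obtained via VIRTIO_BLK_T_GET_ID (see
+ * vioblk_get_id() in vioblk.c).
+ */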
+ boolean_t vib_devid_fetched;
+ char vib_devid[VIRTIO_BLK_ID_BYTES + 1];
+ uint8_t vib_rawid[VIRTIO_BLK_ID_BYTES];
+} vioblk_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIOBLK_H */
diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c
index ec6684f040..201e84e11b 100644
--- a/usr/src/uts/common/io/vioif/vioif.c
+++ b/usr/src/uts/common/io/vioif/vioif.c
@@ -41,6 +41,10 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/*
+ * VIRTIO NETWORK DRIVER
+ */
+
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
@@ -57,6 +61,7 @@
#include <sys/pci.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
+#include <sys/sysmacros.h>
#include <sys/dlpi.h>
#include <sys/taskq.h>
@@ -72,805 +77,625 @@
#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
-#include "virtiovar.h"
-#include "virtioreg.h"
-
-/* Configuration registers */
-#define VIRTIO_NET_CONFIG_MAC 0 /* 8bit x 6byte */
-#define VIRTIO_NET_CONFIG_STATUS 6 /* 16bit */
-
-/* Feature bits */
-#define VIRTIO_NET_F_CSUM (1 << 0) /* Host handles pkts w/ partial csum */
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* Guest handles pkts w/ part csum */
-#define VIRTIO_NET_F_MAC (1 << 5) /* Host has given MAC address. */
-#define VIRTIO_NET_F_GSO (1 << 6) /* Host handles pkts w/ any GSO type */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* Guest can handle TSOv4 in. */
-#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* Guest can handle TSOv6 in. */
-#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* Guest can handle TSO[6] w/ ECN in */
-#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* Guest can handle UFO in. */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* Host can handle TSOv4 in. */
-#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* Host can handle TSOv6 in. */
-#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* Host can handle TSO[6] w/ ECN in */
-#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* Host can handle UFO in. */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* Host can merge receive buffers. */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* Config.status available */
-#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* Control channel available */
-#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* Control channel RX mode support */
-#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* Control channel VLAN filtering */
-#define VIRTIO_NET_F_CTRL_RX_EXTRA (1 << 20) /* Extra RX mode control support */
-
-#define VIRTIO_NET_FEATURE_BITS \
- "\020" \
- "\1CSUM" \
- "\2GUEST_CSUM" \
- "\6MAC" \
- "\7GSO" \
- "\10GUEST_TSO4" \
- "\11GUEST_TSO6" \
- "\12GUEST_ECN" \
- "\13GUEST_UFO" \
- "\14HOST_TSO4" \
- "\15HOST_TSO6" \
- "\16HOST_ECN" \
- "\17HOST_UFO" \
- "\20MRG_RXBUF" \
- "\21STATUS" \
- "\22CTRL_VQ" \
- "\23CTRL_RX" \
- "\24CTRL_VLAN" \
- "\25CTRL_RX_EXTRA"
-
-/* Status */
-#define VIRTIO_NET_S_LINK_UP 1
-
-#pragma pack(1)
-/* Packet header structure */
-struct virtio_net_hdr {
- uint8_t flags;
- uint8_t gso_type;
- uint16_t hdr_len;
- uint16_t gso_size;
- uint16_t csum_start;
- uint16_t csum_offset;
-};
-#pragma pack()
+#include "virtio.h"
+#include "vioif.h"
-#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */
-#define VIRTIO_NET_HDR_GSO_NONE 0 /* gso_type */
-#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* gso_type */
-#define VIRTIO_NET_HDR_GSO_UDP 3 /* gso_type */
-#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* gso_type */
-#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* gso_type, |'ed */
-
-/* Control virtqueue */
-#pragma pack(1)
-struct virtio_net_ctrl_cmd {
- uint8_t class;
- uint8_t command;
+static int vioif_quiesce(dev_info_t *);
+static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
+static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
+static boolean_t vioif_has_feature(vioif_t *, uint32_t);
+static void vioif_reclaim_restart(vioif_t *);
+static int vioif_m_stat(void *, uint_t, uint64_t *);
+static void vioif_m_stop(void *);
+static int vioif_m_start(void *);
+static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
+static int vioif_m_setpromisc(void *, boolean_t);
+static int vioif_m_unicst(void *, const uint8_t *);
+static mblk_t *vioif_m_tx(void *, mblk_t *);
+static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
+ const void *);
+static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
+static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
+ mac_prop_info_handle_t);
+static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
+static uint_t vioif_add_rx(vioif_t *);
+
+
+static struct cb_ops vioif_cb_ops = {
+ .cb_rev = CB_REV,
+ .cb_flag = D_MP | D_NEW,
+
+ .cb_open = nulldev,
+ .cb_close = nulldev,
+ .cb_strategy = nodev,
+ .cb_print = nodev,
+ .cb_dump = nodev,
+ .cb_read = nodev,
+ .cb_write = nodev,
+ .cb_ioctl = nodev,
+ .cb_devmap = nodev,
+ .cb_mmap = nodev,
+ .cb_segmap = nodev,
+ .cb_chpoll = nochpoll,
+ .cb_prop_op = ddi_prop_op,
+ .cb_str = NULL,
+ .cb_aread = nodev,
+ .cb_awrite = nodev,
};
-#pragma pack()
-
-#define VIRTIO_NET_CTRL_RX 0
-#define VIRTIO_NET_CTRL_RX_PROMISC 0
-#define VIRTIO_NET_CTRL_RX_ALLMULTI 1
-#define VIRTIO_NET_CTRL_MAC 1
-#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
+static struct dev_ops vioif_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
-#define VIRTIO_NET_CTRL_VLAN 2
-#define VIRTIO_NET_CTRL_VLAN_ADD 0
-#define VIRTIO_NET_CTRL_VLAN_DEL 1
+ .devo_attach = vioif_attach,
+ .devo_detach = vioif_detach,
+ .devo_quiesce = vioif_quiesce,
-#pragma pack(1)
-struct virtio_net_ctrl_status {
- uint8_t ack;
-};
+ .devo_cb_ops = &vioif_cb_ops,
-struct virtio_net_ctrl_rx {
- uint8_t onoff;
+ .devo_getinfo = NULL,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_reset = nodev,
+ .devo_bus_ops = NULL,
+ .devo_power = NULL,
};
-struct virtio_net_ctrl_mac_tbl {
- uint32_t nentries;
- uint8_t macs[][ETHERADDRL];
+static struct modldrv vioif_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "VIRTIO network driver",
+ .drv_dev_ops = &vioif_dev_ops
};
-struct virtio_net_ctrl_vlan {
- uint16_t id;
-};
-#pragma pack()
-
-static int vioif_quiesce(dev_info_t *);
-static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
-static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
-
-DDI_DEFINE_STREAM_OPS(vioif_ops,
- nulldev, /* identify */
- nulldev, /* probe */
- vioif_attach, /* attach */
- vioif_detach, /* detach */
- nodev, /* reset */
- NULL, /* cb_ops */
- D_MP, /* bus_ops */
- NULL, /* power */
- vioif_quiesce /* quiesce */);
-
-static char vioif_ident[] = "VirtIO ethernet driver";
-
-/* Standard Module linkage initialization for a Streams driver */
-extern struct mod_ops mod_driverops;
-
-static struct modldrv modldrv = {
- &mod_driverops, /* Type of module. This one is a driver */
- vioif_ident, /* short description */
- &vioif_ops /* driver specific ops */
+static struct modlinkage vioif_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &vioif_modldrv, NULL }
};
-static struct modlinkage modlinkage = {
- MODREV_1,
- {
- (void *)&modldrv,
- NULL,
- },
+static mac_callbacks_t vioif_mac_callbacks = {
+ .mc_getstat = vioif_m_stat,
+ .mc_start = vioif_m_start,
+ .mc_stop = vioif_m_stop,
+ .mc_setpromisc = vioif_m_setpromisc,
+ .mc_multicst = vioif_m_multicst,
+ .mc_unicst = vioif_m_unicst,
+ .mc_tx = vioif_m_tx,
+
+ .mc_callbacks = (MC_GETCAPAB | MC_SETPROP |
+ MC_GETPROP | MC_PROPINFO),
+ .mc_getcapab = vioif_m_getcapab,
+ .mc_setprop = vioif_m_setprop,
+ .mc_getprop = vioif_m_getprop,
+ .mc_propinfo = vioif_m_propinfo,
};
-/* Interval for the periodic TX reclaim */
-uint_t vioif_reclaim_ms = 200;
-
-ddi_device_acc_attr_t vioif_attr = {
- DDI_DEVICE_ATTR_V0,
- DDI_NEVERSWAP_ACC, /* virtio is always native byte order */
- DDI_STORECACHING_OK_ACC,
- DDI_DEFAULT_ACC
+static const uchar_t vioif_broadcast[ETHERADDRL] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
/*
- * A mapping represents a binding for a single buffer that is contiguous in the
- * virtual address space.
+ * Interval for the periodic TX reclaim.
*/
-struct vioif_buf_mapping {
- caddr_t vbm_buf;
- ddi_dma_handle_t vbm_dmah;
- ddi_acc_handle_t vbm_acch;
- ddi_dma_cookie_t vbm_dmac;
- unsigned int vbm_ncookies;
-};
+uint_t vioif_reclaim_ms = 200;
/*
- * Rx buffers can be loaned upstream, so the code has
- * to allocate them dynamically.
+ * DMA attribute template for transmit and receive buffers. The SGL entry
+ * count will be modified before using the template. Note that these
+ * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
+ * received frames at the correct offset for the networking stack.
*/
-struct vioif_rx_buf {
- struct vioif_softc *rb_sc;
- frtn_t rb_frtn;
-
- struct vioif_buf_mapping rb_mapping;
+ddi_dma_attr_t vioif_dma_attr_bufs = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00000000FFFFFFFF,
+ .dma_attr_align = VIOIF_HEADER_ALIGN,
+ .dma_attr_burstsizes = 1,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0x00000000FFFFFFFF,
+ .dma_attr_seg = 0x00000000FFFFFFFF,
+ .dma_attr_sgllen = 0,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
/*
- * Tx buffers have two mapping types. One, "inline", is pre-allocated and is
- * used to hold the virtio_net_header. Small packets also get copied there, as
- * it's faster then mapping them. Bigger packets get mapped using the "external"
- * mapping array. An array is used, because a packet may consist of muptiple
- * fragments, so each fragment gets bound to an entry. According to my
- * observations, the number of fragments does not exceed 2, but just in case,
- * a bigger, up to VIOIF_INDIRECT_MAX - 1 array is allocated. To save resources,
- * the dma handles are allocated lazily in the tx path.
+ * DMA attributes for mapping larger transmit buffers from the networking
+ * stack. The requirements are quite loose, but note that the SGL entry length
+ * field is 32-bit.
*/
-struct vioif_tx_buf {
- mblk_t *tb_mp;
-
- /* inline buffer */
- struct vioif_buf_mapping tb_inline_mapping;
-
- /* External buffers */
- struct vioif_buf_mapping *tb_external_mapping;
- unsigned int tb_external_num;
+ddi_dma_attr_t vioif_dma_attr_external = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00000000FFFFFFFF,
+ .dma_attr_align = 1,
+ .dma_attr_burstsizes = 1,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0x00000000FFFFFFFF,
+ .dma_attr_seg = 0x00000000FFFFFFFF,
+ .dma_attr_sgllen = VIOIF_MAX_SEGS - 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
-struct vioif_softc {
- dev_info_t *sc_dev; /* mirrors virtio_softc->sc_dev */
- struct virtio_softc sc_virtio;
-
- mac_handle_t sc_mac_handle;
- mac_register_t *sc_macp;
-
- struct virtqueue *sc_rx_vq;
- struct virtqueue *sc_tx_vq;
- struct virtqueue *sc_ctrl_vq;
-
- /* TX virtqueue management resources */
- kmutex_t sc_tx_lock;
- boolean_t sc_tx_corked;
- boolean_t sc_tx_drain;
- timeout_id_t sc_tx_reclaim_tid;
-
- /* Feature bits. */
- unsigned int sc_rx_csum:1;
- unsigned int sc_tx_csum:1;
- unsigned int sc_tx_tso4:1;
-
- /*
- * For debugging, it is useful to know whether the MAC address we
- * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
- * was otherwise generated or set from within the guest.
- */
- unsigned int sc_mac_from_host:1;
-
- int sc_mtu;
- uint8_t sc_mac[ETHERADDRL];
- /*
- * For rx buffers, we keep a pointer array, because the buffers
- * can be loaned upstream, and we have to repopulate the array with
- * new members.
- */
- struct vioif_rx_buf **sc_rxbufs;
-
- /*
- * For tx, we just allocate an array of buffers. The packet can
- * either be copied into the inline buffer, or the external mapping
- * could be used to map the packet
- */
- struct vioif_tx_buf *sc_txbufs;
-
- kstat_t *sc_intrstat;
- /*
- * We "loan" rx buffers upstream and reuse them after they are
- * freed. This lets us avoid allocations in the hot path.
- */
- kmem_cache_t *sc_rxbuf_cache;
- ulong_t sc_rxloan;
-
- /* Copying small packets turns out to be faster then mapping them. */
- unsigned long sc_rxcopy_thresh;
- unsigned long sc_txcopy_thresh;
-
- /*
- * Statistics visible through mac:
- */
- uint64_t sc_ipackets;
- uint64_t sc_opackets;
- uint64_t sc_rbytes;
- uint64_t sc_obytes;
- uint64_t sc_brdcstxmt;
- uint64_t sc_brdcstrcv;
- uint64_t sc_multixmt;
- uint64_t sc_multircv;
- uint64_t sc_norecvbuf;
- uint64_t sc_notxbuf;
- uint64_t sc_ierrors;
- uint64_t sc_oerrors;
-
- /*
- * Internal debugging statistics:
- */
- uint64_t sc_rxfail_dma_handle;
- uint64_t sc_rxfail_dma_buffer;
- uint64_t sc_rxfail_dma_bind;
- uint64_t sc_rxfail_chain_undersize;
- uint64_t sc_rxfail_no_descriptors;
- uint64_t sc_txfail_dma_handle;
- uint64_t sc_txfail_dma_bind;
- uint64_t sc_txfail_indirect_limit;
-};
-
-#define ETHER_HEADER_LEN sizeof (struct ether_header)
-
-/* MTU + the ethernet header. */
-#define MAX_PAYLOAD 65535
-#define MAX_MTU (MAX_PAYLOAD - ETHER_HEADER_LEN)
-#define DEFAULT_MTU ETHERMTU
/*
- * Yeah, we spend 8M per device. Turns out, there is no point
- * being smart and using merged rx buffers (VIRTIO_NET_F_MRG_RXBUF),
- * because vhost does not support them, and we expect to be used with
- * vhost in production environment.
+ * VIRTIO NET MAC PROPERTIES
*/
-/* The buffer keeps both the packet data and the virtio_net_header. */
-#define VIOIF_RX_SIZE (MAX_PAYLOAD + sizeof (struct virtio_net_hdr))
+#define VIOIF_MACPROP_TXCOPY_THRESH "_txcopy_thresh"
+#define VIOIF_MACPROP_TXCOPY_THRESH_DEF 300
+#define VIOIF_MACPROP_TXCOPY_THRESH_MAX 640
-/*
- * We win a bit on header alignment, but the host wins a lot
- * more on moving aligned buffers. Might need more thought.
- */
-#define VIOIF_IP_ALIGN 0
-
-/* Maximum number of indirect descriptors, somewhat arbitrary. */
-#define VIOIF_INDIRECT_MAX 128
-
-/*
- * We pre-allocate a reasonably large buffer to copy small packets
- * there. Bigger packets are mapped, packets with multiple
- * cookies are mapped as indirect buffers.
- */
-#define VIOIF_TX_INLINE_SIZE 2048
-
-/* Native queue size for all queues */
-#define VIOIF_RX_QLEN 0
-#define VIOIF_TX_QLEN 0
-#define VIOIF_CTRL_QLEN 0
-
-static uchar_t vioif_broadcast[ETHERADDRL] = {
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
-
-#define VIOIF_TX_THRESH_MAX 640
-#define VIOIF_RX_THRESH_MAX 640
-
-#define CACHE_NAME_SIZE 32
-
-static char vioif_txcopy_thresh[] =
- "vioif_txcopy_thresh";
-static char vioif_rxcopy_thresh[] =
- "vioif_rxcopy_thresh";
+#define VIOIF_MACPROP_RXCOPY_THRESH "_rxcopy_thresh"
+#define VIOIF_MACPROP_RXCOPY_THRESH_DEF 300
+#define VIOIF_MACPROP_RXCOPY_THRESH_MAX 640
static char *vioif_priv_props[] = {
- vioif_txcopy_thresh,
- vioif_rxcopy_thresh,
+ VIOIF_MACPROP_TXCOPY_THRESH,
+ VIOIF_MACPROP_RXCOPY_THRESH,
NULL
};
-static void vioif_reclaim_restart(struct vioif_softc *);
-/* Add up to ddi? */
-static ddi_dma_cookie_t *
-vioif_dma_curr_cookie(ddi_dma_handle_t dmah)
+static vioif_txbuf_t *
+vioif_txbuf_alloc(vioif_t *vif)
{
- ddi_dma_impl_t *dmah_impl = (void *) dmah;
- ASSERT(dmah_impl->dmai_cookie);
- return (dmah_impl->dmai_cookie);
-}
+ vioif_txbuf_t *tb;
-static void
-vioif_dma_reset_cookie(ddi_dma_handle_t dmah, ddi_dma_cookie_t *dmac)
-{
- ddi_dma_impl_t *dmah_impl = (void *) dmah;
- dmah_impl->dmai_cookie = dmac;
-}
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
-static link_state_t
-vioif_link_state(struct vioif_softc *sc)
-{
- if (sc->sc_virtio.sc_features & VIRTIO_NET_F_STATUS) {
- if (virtio_read_device_config_2(&sc->sc_virtio,
- VIRTIO_NET_CONFIG_STATUS) & VIRTIO_NET_S_LINK_UP) {
- return (LINK_STATE_UP);
- } else {
- return (LINK_STATE_DOWN);
- }
+ if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
+ vif->vif_ntxbufs_alloc++;
}
- return (LINK_STATE_UP);
+ return (tb);
}
-static ddi_dma_attr_t vioif_inline_buf_dma_attr = {
- DMA_ATTR_V0, /* Version number */
- 0, /* low address */
- 0xFFFFFFFFFFFFFFFF, /* high address */
- 0xFFFFFFFF, /* counter register max */
- 1, /* page alignment */
- 1, /* burst sizes: 1 - 32 */
- 1, /* minimum transfer size */
- 0xFFFFFFFF, /* max transfer size */
- 0xFFFFFFFFFFFFFFF, /* address register max */
- 1, /* scatter-gather capacity */
- 1, /* device operates on bytes */
- 0, /* attr flag: set to 0 */
-};
-
-static ddi_dma_attr_t vioif_mapped_buf_dma_attr = {
- DMA_ATTR_V0, /* Version number */
- 0, /* low address */
- 0xFFFFFFFFFFFFFFFF, /* high address */
- 0xFFFFFFFF, /* counter register max */
- 1, /* page alignment */
- 1, /* burst sizes: 1 - 32 */
- 1, /* minimum transfer size */
- 0xFFFFFFFF, /* max transfer size */
- 0xFFFFFFFFFFFFFFF, /* address register max */
-
- /* One entry is used for the virtio_net_hdr on the tx path */
- VIOIF_INDIRECT_MAX - 1, /* scatter-gather capacity */
- 1, /* device operates on bytes */
- 0, /* attr flag: set to 0 */
-};
-
-static ddi_device_acc_attr_t vioif_bufattr = {
- DDI_DEVICE_ATTR_V0,
- DDI_NEVERSWAP_ACC,
- DDI_STORECACHING_OK_ACC,
- DDI_DEFAULT_ACC
-};
-
static void
-vioif_rx_free(caddr_t free_arg)
+vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
{
- struct vioif_rx_buf *buf = (void *) free_arg;
- struct vioif_softc *sc = buf->rb_sc;
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
+
+ VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
+ vif->vif_ntxbufs_alloc--;
- kmem_cache_free(sc->sc_rxbuf_cache, buf);
- atomic_dec_ulong(&sc->sc_rxloan);
+ virtio_chain_clear(tb->tb_chain);
+ list_insert_head(&vif->vif_txbufs, tb);
}
-static int
-vioif_rx_construct(void *buffer, void *user_arg, int kmflags)
+static vioif_rxbuf_t *
+vioif_rxbuf_alloc(vioif_t *vif)
{
- _NOTE(ARGUNUSED(kmflags));
- struct vioif_softc *sc = user_arg;
- struct vioif_rx_buf *buf = buffer;
- size_t len;
+ vioif_rxbuf_t *rb;
- if (ddi_dma_alloc_handle(sc->sc_dev, &vioif_mapped_buf_dma_attr,
- DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmah)) {
- sc->sc_rxfail_dma_handle++;
- goto exit_handle;
- }
-
- if (ddi_dma_mem_alloc(buf->rb_mapping.vbm_dmah,
- VIOIF_RX_SIZE + sizeof (struct virtio_net_hdr),
- &vioif_bufattr, DDI_DMA_STREAMING, DDI_DMA_SLEEP,
- NULL, &buf->rb_mapping.vbm_buf, &len, &buf->rb_mapping.vbm_acch)) {
- sc->sc_rxfail_dma_buffer++;
- goto exit_alloc;
- }
- ASSERT(len >= VIOIF_RX_SIZE);
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- if (ddi_dma_addr_bind_handle(buf->rb_mapping.vbm_dmah, NULL,
- buf->rb_mapping.vbm_buf, len, DDI_DMA_READ | DDI_DMA_STREAMING,
- DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmac,
- &buf->rb_mapping.vbm_ncookies)) {
- sc->sc_rxfail_dma_bind++;
- goto exit_bind;
+ if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
+ vif->vif_nrxbufs_alloc++;
}
- ASSERT(buf->rb_mapping.vbm_ncookies <= VIOIF_INDIRECT_MAX);
+ return (rb);
+}
- buf->rb_sc = sc;
- buf->rb_frtn.free_arg = (void *) buf;
- buf->rb_frtn.free_func = vioif_rx_free;
+static void
+vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
+{
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- return (0);
-exit_bind:
- ddi_dma_mem_free(&buf->rb_mapping.vbm_acch);
-exit_alloc:
- ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah);
-exit_handle:
+ VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
+ vif->vif_nrxbufs_alloc--;
- return (ENOMEM);
+ virtio_chain_clear(rb->rb_chain);
+ list_insert_head(&vif->vif_rxbufs, rb);
}
static void
-vioif_rx_destruct(void *buffer, void *user_arg)
+vioif_rx_free_callback(caddr_t free_arg)
{
- _NOTE(ARGUNUSED(user_arg));
- struct vioif_rx_buf *buf = buffer;
+ vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
+ vioif_t *vif = rb->rb_vioif;
+
+ mutex_enter(&vif->vif_mutex);
+
+ /*
+ * Return this receive buffer to the free list.
+ */
+ vioif_rxbuf_free(vif, rb);
+
+ VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
+ vif->vif_nrxbufs_onloan--;
- ASSERT(buf->rb_mapping.vbm_acch);
- ASSERT(buf->rb_mapping.vbm_acch);
+ /*
+ * Attempt to replenish the receive queue with at least the buffer we
+ * just freed. There is no great way to deal with failure here, but
+ * because we only loan out at most half of the buffers, some should
+ * always remain available even if this attempt fails.
+ */
+ (void) vioif_add_rx(vif);
- (void) ddi_dma_unbind_handle(buf->rb_mapping.vbm_dmah);
- ddi_dma_mem_free(&buf->rb_mapping.vbm_acch);
- ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah);
+ mutex_exit(&vif->vif_mutex);
}
static void
-vioif_free_mems(struct vioif_softc *sc)
+vioif_free_bufs(vioif_t *vif)
{
- int i;
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- for (i = 0; i < sc->sc_tx_vq->vq_num; i++) {
- struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
- int j;
+ VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
+ for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
+ vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
- /* Tear down the internal mapping. */
+ /*
+ * Ensure that this txbuf is now in the free list:
+ */
+ VERIFY(list_link_active(&tb->tb_link));
+ list_remove(&vif->vif_txbufs, tb);
- ASSERT(buf->tb_inline_mapping.vbm_acch);
- ASSERT(buf->tb_inline_mapping.vbm_dmah);
+ /*
+ * We should not have an mblk chain at this point.
+ */
+ VERIFY3P(tb->tb_mp, ==, NULL);
- (void) ddi_dma_unbind_handle(buf->tb_inline_mapping.vbm_dmah);
- ddi_dma_mem_free(&buf->tb_inline_mapping.vbm_acch);
- ddi_dma_free_handle(&buf->tb_inline_mapping.vbm_dmah);
+ if (tb->tb_dma != NULL) {
+ virtio_dma_free(tb->tb_dma);
+ tb->tb_dma = NULL;
+ }
- /* We should not see any in-flight buffers at this point. */
- ASSERT(!buf->tb_mp);
+ if (tb->tb_chain != NULL) {
+ virtio_chain_free(tb->tb_chain);
+ tb->tb_chain = NULL;
+ }
+
+ if (tb->tb_dmaext != NULL) {
+ for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
+ if (tb->tb_dmaext[j] != NULL) {
+ virtio_dma_free(
+ tb->tb_dmaext[j]);
+ tb->tb_dmaext[j] = NULL;
+ }
+ }
- /* Free all the dma hdnales we allocated lazily. */
- for (j = 0; buf->tb_external_mapping[j].vbm_dmah; j++)
- ddi_dma_free_handle(
- &buf->tb_external_mapping[j].vbm_dmah);
- /* Free the external mapping array. */
- kmem_free(buf->tb_external_mapping,
- sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1);
+ kmem_free(tb->tb_dmaext,
+ sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
+ tb->tb_dmaext = NULL;
+ tb->tb_dmaext_capacity = 0;
+ }
+ }
+ VERIFY(list_is_empty(&vif->vif_txbufs));
+ if (vif->vif_txbufs_mem != NULL) {
+ kmem_free(vif->vif_txbufs_mem,
+ sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
+ vif->vif_txbufs_mem = NULL;
+ vif->vif_txbufs_capacity = 0;
}
- kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) *
- sc->sc_tx_vq->vq_num);
+ VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
+ for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
+ vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
- for (i = 0; i < sc->sc_rx_vq->vq_num; i++) {
- struct vioif_rx_buf *buf = sc->sc_rxbufs[i];
+ /*
+ * Ensure that this rxbuf is now in the free list:
+ */
+ VERIFY(list_link_active(&rb->rb_link));
+ list_remove(&vif->vif_rxbufs, rb);
- if (buf)
- kmem_cache_free(sc->sc_rxbuf_cache, buf);
+ if (rb->rb_dma != NULL) {
+ virtio_dma_free(rb->rb_dma);
+ rb->rb_dma = NULL;
+ }
+
+ if (rb->rb_chain != NULL) {
+ virtio_chain_free(rb->rb_chain);
+ rb->rb_chain = NULL;
+ }
+ }
+ VERIFY(list_is_empty(&vif->vif_rxbufs));
+ if (vif->vif_rxbufs_mem != NULL) {
+ kmem_free(vif->vif_rxbufs_mem,
+ sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
+ vif->vif_rxbufs_mem = NULL;
+ vif->vif_rxbufs_capacity = 0;
}
- kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf *) *
- sc->sc_rx_vq->vq_num);
}
static int
-vioif_alloc_mems(struct vioif_softc *sc)
+vioif_alloc_bufs(vioif_t *vif)
{
- int i, txqsize, rxqsize;
- size_t len;
- unsigned int nsegments;
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- txqsize = sc->sc_tx_vq->vq_num;
- rxqsize = sc->sc_rx_vq->vq_num;
+ /*
+ * Allocate a contiguous chunk of memory for each of the transmit and
+ * receive buffer tracking object arrays. If the ring is unusually
+ * small, we'll reduce our target buffer count accordingly.
+ */
+ vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
+ virtio_queue_size(vif->vif_tx_vq));
+ vif->vif_txbufs_mem = kmem_zalloc(
+ sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
+ list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
+ offsetof(vioif_txbuf_t, tb_link));
+
+ vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
+ virtio_queue_size(vif->vif_rx_vq));
+ vif->vif_rxbufs_mem = kmem_zalloc(
+ sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
+ list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
+ offsetof(vioif_rxbuf_t, rb_link));
- sc->sc_txbufs = kmem_zalloc(sizeof (struct vioif_tx_buf) * txqsize,
- KM_SLEEP);
- if (sc->sc_txbufs == NULL) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate the tx buffers array");
- goto exit_txalloc;
- }
+ /*
+ * Do not loan more than half of our allocated receive buffers into
+ * the networking stack.
+ */
+ vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
/*
- * We don't allocate the rx vioif_bufs, just the pointers, as
- * rx vioif_bufs can be loaned upstream, and we don't know the
- * total number we need.
+ * Put everything in the free list straight away in order to simplify
+ * the use of vioif_free_bufs() for cleanup on allocation failure.
*/
- sc->sc_rxbufs = kmem_zalloc(sizeof (struct vioif_rx_buf *) * rxqsize,
- KM_SLEEP);
- if (sc->sc_rxbufs == NULL) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate the rx buffers pointer array");
- goto exit_rxalloc;
+ for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
+ list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
+ }
+ for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
+ list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
}
- for (i = 0; i < txqsize; i++) {
- struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
-
- /* Allocate and bind an inline mapping. */
-
- if (ddi_dma_alloc_handle(sc->sc_dev,
- &vioif_inline_buf_dma_attr,
- DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_dmah)) {
+ /*
+ * Start from the DMA attribute template common to both transmit and
+ * receive buffers. The SGL entry count will be modified for each
+ * buffer type.
+ */
+ ddi_dma_attr_t attr = vioif_dma_attr_bufs;
- dev_err(sc->sc_dev, CE_WARN,
- "Can't allocate dma handle for tx buffer %d", i);
- goto exit_tx;
+ /*
+ * The transmit inline buffer is small (less than a page), so it's
+ * reasonable to request a single cookie.
+ */
+ attr.dma_attr_sgllen = 1;
+
+ for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
+ tb = list_next(&vif->vif_txbufs, tb)) {
+ if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
+ VIOIF_TX_INLINE_SIZE, &attr,
+ DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
+ goto fail;
}
+ VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
- if (ddi_dma_mem_alloc(buf->tb_inline_mapping.vbm_dmah,
- VIOIF_TX_INLINE_SIZE, &vioif_bufattr, DDI_DMA_STREAMING,
- DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_buf,
- &len, &buf->tb_inline_mapping.vbm_acch)) {
-
- dev_err(sc->sc_dev, CE_WARN,
- "Can't allocate tx buffer %d", i);
- goto exit_tx;
+ if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
+ KM_SLEEP)) == NULL) {
+ goto fail;
}
- ASSERT(len >= VIOIF_TX_INLINE_SIZE);
+ virtio_chain_data_set(tb->tb_chain, tb);
- if (ddi_dma_addr_bind_handle(buf->tb_inline_mapping.vbm_dmah,
- NULL, buf->tb_inline_mapping.vbm_buf, len,
- DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
- &buf->tb_inline_mapping.vbm_dmac, &nsegments)) {
+ tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
+ tb->tb_dmaext = kmem_zalloc(
+ sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
+ KM_SLEEP);
+ }
- dev_err(sc->sc_dev, CE_WARN,
- "Can't bind tx buffer %d", i);
- goto exit_tx;
+ /*
+ * The receive buffers are larger, and we can tolerate a large number
+ * of segments. Adjust the SGL entry count, setting aside one segment
+ * for the virtio net header.
+ */
+ attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
+
+ for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
+ rb = list_next(&vif->vif_rxbufs, rb)) {
+ if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
+ VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
+ KM_SLEEP)) == NULL) {
+ goto fail;
}
- /* We asked for a single segment */
- ASSERT(nsegments == 1);
+ if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
+ KM_SLEEP)) == NULL) {
+ goto fail;
+ }
+ virtio_chain_data_set(rb->rb_chain, rb);
/*
- * We allow up to VIOIF_INDIRECT_MAX - 1 external mappings.
- * In reality, I don't expect more then 2-3 used, but who
- * knows.
+ * Ensure that the first cookie is sufficient to cover the
+ * header skip region plus one byte.
*/
- buf->tb_external_mapping = kmem_zalloc(
- sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1,
- KM_SLEEP);
+ VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
+ VIOIF_HEADER_SKIP + 1);
/*
- * The external mapping's dma handles are allocate lazily,
- * as we don't expect most of them to be used..
+ * Ensure that the frame data begins at a location with a
+ * correctly aligned IP header.
*/
- }
-
- return (0);
-
-exit_tx:
- for (i = 0; i < txqsize; i++) {
- struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
+ VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
+ VIOIF_HEADER_SKIP) % 4, ==, 2);
- if (buf->tb_inline_mapping.vbm_dmah)
- (void) ddi_dma_unbind_handle(
- buf->tb_inline_mapping.vbm_dmah);
-
- if (buf->tb_inline_mapping.vbm_acch)
- ddi_dma_mem_free(
- &buf->tb_inline_mapping.vbm_acch);
-
- if (buf->tb_inline_mapping.vbm_dmah)
- ddi_dma_free_handle(
- &buf->tb_inline_mapping.vbm_dmah);
-
- if (buf->tb_external_mapping)
- kmem_free(buf->tb_external_mapping,
- sizeof (struct vioif_tx_buf) *
- VIOIF_INDIRECT_MAX - 1);
+ rb->rb_vioif = vif;
+ rb->rb_frtn.free_func = vioif_rx_free_callback;
+ rb->rb_frtn.free_arg = (caddr_t)rb;
}
- kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf) * rxqsize);
+ return (0);
-exit_rxalloc:
- kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) * txqsize);
-exit_txalloc:
+fail:
+ vioif_free_bufs(vif);
return (ENOMEM);
}
-/* ARGSUSED */
static int
-vioif_multicst(void *arg, boolean_t add, const uint8_t *macaddr)
+vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
{
- return (DDI_SUCCESS);
+ /*
+ * Even though we currently do not have support for programming
+ * multicast filters, or even enabling promiscuous mode, we return
+ * success here to avoid the networking stack falling back to link
+ * layer broadcast for multicast traffic. Some hypervisors already
+ * pass received multicast frames onto the guest, so at least on those
+ * systems multicast will work as expected anyway.
+ */
+ return (0);
}
-/* ARGSUSED */
static int
-vioif_promisc(void *arg, boolean_t on)
+vioif_m_setpromisc(void *arg, boolean_t on)
{
- return (DDI_SUCCESS);
+ /*
+ * Even though we cannot currently enable promiscuous mode, we return
+ * success here to allow tools like snoop(1M) to continue to function.
+ */
+ return (0);
}
-/* ARGSUSED */
static int
-vioif_unicst(void *arg, const uint8_t *macaddr)
+vioif_m_unicst(void *arg, const uint8_t *mac)
{
- return (DDI_FAILURE);
+ return (ENOTSUP);
}
static uint_t
-vioif_add_rx(struct vioif_softc *sc, int kmflag)
+vioif_add_rx(vioif_t *vif)
{
- uint_t num_added = 0;
- struct vq_entry *ve;
-
- while ((ve = vq_alloc_entry(sc->sc_rx_vq)) != NULL) {
- struct vioif_rx_buf *buf = sc->sc_rxbufs[ve->qe_index];
-
- if (buf == NULL) {
- /* First run, allocate the buffer. */
- buf = kmem_cache_alloc(sc->sc_rxbuf_cache, kmflag);
- sc->sc_rxbufs[ve->qe_index] = buf;
- }
-
- /* Still nothing? Bye. */
- if (buf == NULL) {
- sc->sc_norecvbuf++;
- vq_free_entry(sc->sc_rx_vq, ve);
- break;
- }
-
- ASSERT(buf->rb_mapping.vbm_ncookies >= 1);
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
+ if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
/*
- * For an unknown reason, the virtio_net_hdr must be placed
- * as a separate virtio queue entry.
+ * If the NIC is not running, do not give the device any
+ * receive buffers.
*/
- virtio_ve_add_indirect_buf(ve,
- buf->rb_mapping.vbm_dmac.dmac_laddress,
- sizeof (struct virtio_net_hdr), B_FALSE);
+ return (0);
+ }
- /* Add the rest of the first cookie. */
- virtio_ve_add_indirect_buf(ve,
- buf->rb_mapping.vbm_dmac.dmac_laddress +
- sizeof (struct virtio_net_hdr),
- buf->rb_mapping.vbm_dmac.dmac_size -
- sizeof (struct virtio_net_hdr), B_FALSE);
+ uint_t num_added = 0;
+ vioif_rxbuf_t *rb;
+ while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
/*
- * If the buffer consists of a single cookie (unlikely for a
- * 64-k buffer), we are done. Otherwise, add the rest of the
- * cookies using indirect entries.
+ * For legacy devices, and those that have not negotiated
+ * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
+ * separate descriptor entry from the rest of the buffer.
*/
- if (buf->rb_mapping.vbm_ncookies > 1) {
- ddi_dma_cookie_t *first_extra_dmac;
- ddi_dma_cookie_t dmac;
- first_extra_dmac =
- vioif_dma_curr_cookie(buf->rb_mapping.vbm_dmah);
-
- ddi_dma_nextcookie(buf->rb_mapping.vbm_dmah, &dmac);
- virtio_ve_add_cookie(ve, buf->rb_mapping.vbm_dmah,
- dmac, buf->rb_mapping.vbm_ncookies - 1, B_FALSE);
- vioif_dma_reset_cookie(buf->rb_mapping.vbm_dmah,
- first_extra_dmac);
+ if (virtio_chain_append(rb->rb_chain,
+ virtio_dma_cookie_pa(rb->rb_dma, 0),
+ sizeof (struct virtio_net_hdr),
+ VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
+ goto fail;
}
- virtio_push_chain(ve, B_FALSE);
- num_added++;
- }
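+ /*
+ * Add the receive buffer memory to the chain one cookie at a time,
+ * skipping past the virtio net header region (already added as its
+ * own descriptor above) in the first cookie.
+ */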
+ for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
+ uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
+ size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
- return (num_added);
-}
+ if (n == 0) {
+ pa += VIOIF_HEADER_SKIP;
+ VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
+ sz -= VIOIF_HEADER_SKIP;
+ }
-static uint_t
-vioif_populate_rx(struct vioif_softc *sc, int kmflag)
-{
- uint_t num_added = vioif_add_rx(sc, kmflag);
+ if (virtio_chain_append(rb->rb_chain, pa, sz,
+ VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
+ goto fail;
+ }
+ }
+
+ virtio_chain_submit(rb->rb_chain, B_FALSE);
+ num_added++;
+ continue;
- if (num_added > 0)
- virtio_sync_vq(sc->sc_rx_vq);
+fail:
+ vioif_rxbuf_free(vif, rb);
+ vif->vif_norecvbuf++;
+ break;
+ }
+
+ if (num_added > 0) {
+ virtio_queue_flush(vif->vif_rx_vq);
+ }
return (num_added);
}
static uint_t
-vioif_process_rx(struct vioif_softc *sc)
+vioif_process_rx(vioif_t *vif)
{
- struct vq_entry *ve;
- struct vioif_rx_buf *buf;
+ virtio_chain_t *vic;
mblk_t *mphead = NULL, *lastmp = NULL, *mp;
- uint32_t len;
uint_t num_processed = 0;
- while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len))) {
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- buf = sc->sc_rxbufs[ve->qe_index];
- ASSERT(buf);
+ while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
+ /*
+ * We have to use the chain received length here, as the device
+ * does not tell us the received frame length any other way.
+ * In a limited survey of hypervisors, virtio network devices
+ * appear to provide the right value here.
+ */
+ size_t len = virtio_chain_received_length(vic);
+ vioif_rxbuf_t *rb = virtio_chain_data(vic);
- if (len < sizeof (struct virtio_net_hdr)) {
- sc->sc_rxfail_chain_undersize++;
- sc->sc_ierrors++;
- virtio_free_chain(ve);
+ virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
+
+ /*
+ * If the NIC is not running, discard any received frames.
+ */
+ if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
+ vioif_rxbuf_free(vif, rb);
continue;
}
+ if (len < sizeof (struct virtio_net_hdr)) {
+ vif->vif_rxfail_chain_undersize++;
+ vif->vif_ierrors++;
+ vioif_rxbuf_free(vif, rb);
+ continue;
+ }
len -= sizeof (struct virtio_net_hdr);
+
/*
* We copy small packets that happen to fit into a single
* cookie and reuse the buffers. For bigger ones, we loan
* the buffers upstream.
*/
- if (len < sc->sc_rxcopy_thresh) {
- mp = allocb(len, 0);
- if (mp == NULL) {
- sc->sc_norecvbuf++;
- sc->sc_ierrors++;
-
- virtio_free_chain(ve);
- break;
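+ /*
+ * In addition to the copy threshold, fall back to copying when the
+ * maximum number of loaned receive buffers is already outstanding.
+ */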
+ if (len < vif->vif_rxcopy_thresh ||
+ vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
+ mutex_exit(&vif->vif_mutex);
+ if ((mp = allocb(len, 0)) == NULL) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_norecvbuf++;
+ vif->vif_ierrors++;
+
+ vioif_rxbuf_free(vif, rb);
+ continue;
}
- bcopy((char *)buf->rb_mapping.vbm_buf +
- sizeof (struct virtio_net_hdr), mp->b_rptr, len);
+ bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
+ mp->b_rptr, len);
mp->b_wptr = mp->b_rptr + len;
+ /*
+ * As the packet contents were copied rather than
+ * loaned, we can return the receive buffer resources
+ * to the free list.
+ */
+ mutex_enter(&vif->vif_mutex);
+ vioif_rxbuf_free(vif, rb);
+
} else {
- mp = desballoc((unsigned char *)
- buf->rb_mapping.vbm_buf +
- sizeof (struct virtio_net_hdr) +
- VIOIF_IP_ALIGN, len, 0, &buf->rb_frtn);
- if (mp == NULL) {
- sc->sc_norecvbuf++;
- sc->sc_ierrors++;
-
- virtio_free_chain(ve);
- break;
+ mutex_exit(&vif->vif_mutex);
+ if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
+ VIOIF_HEADER_SKIP), len, 0,
+ &rb->rb_frtn)) == NULL) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_norecvbuf++;
+ vif->vif_ierrors++;
+
+ vioif_rxbuf_free(vif, rb);
+ continue;
}
mp->b_wptr = mp->b_rptr + len;
- atomic_inc_ulong(&sc->sc_rxloan);
- /*
- * Buffer loaned, we will have to allocate a new one
- * for this slot.
- */
- sc->sc_rxbufs[ve->qe_index] = NULL;
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_nrxbufs_onloan++;
}
/*
@@ -879,15 +704,13 @@ vioif_process_rx(struct vioif_softc *sc)
*/
if (mp->b_rptr[0] & 0x1) {
if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
- sc->sc_multircv++;
+ vif->vif_multircv++;
else
- sc->sc_brdcstrcv++;
+ vif->vif_brdcstrcv++;
}
- sc->sc_rbytes += len;
- sc->sc_ipackets++;
-
- virtio_free_chain(ve);
+ vif->vif_rbytes += len;
+ vif->vif_ipackets++;
if (lastmp == NULL) {
mphead = mp;
@@ -899,42 +722,56 @@ vioif_process_rx(struct vioif_softc *sc)
}
if (mphead != NULL) {
- mac_rx(sc->sc_mac_handle, NULL, mphead);
+ if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
+ mutex_exit(&vif->vif_mutex);
+ mac_rx(vif->vif_mac_handle, NULL, mphead);
+ mutex_enter(&vif->vif_mutex);
+ } else {
+ /*
+ * The NIC was disabled part way through our execution,
+ * so free the messages we allocated.
+ */
+ freemsgchain(mphead);
+ }
}
return (num_processed);
}
static uint_t
-vioif_reclaim_used_tx(struct vioif_softc *sc)
+vioif_reclaim_used_tx(vioif_t *vif)
{
- struct vq_entry *ve;
- uint32_t len;
+ virtio_chain_t *vic;
uint_t num_reclaimed = 0;
- while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) {
- struct vioif_tx_buf *buf;
- mblk_t *mp;
+ VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
- /* We don't chain descriptors for tx, so don't expect any. */
- ASSERT(ve->qe_next == NULL);
+ while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
+ vioif_txbuf_t *tb = virtio_chain_data(vic);
- buf = &sc->sc_txbufs[ve->qe_index];
- mp = buf->tb_mp;
- buf->tb_mp = NULL;
+ if (tb->tb_mp != NULL) {
+ /*
+ * Unbind the external mapping.
+ */
+ for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
+ if (tb->tb_dmaext[i] == NULL) {
+ continue;
+ }
- if (mp != NULL) {
- for (uint_t i = 0; i < buf->tb_external_num; i++) {
- (void) ddi_dma_unbind_handle(
- buf->tb_external_mapping[i].vbm_dmah);
+ virtio_dma_unbind(tb->tb_dmaext[i]);
}
+
+ freemsg(tb->tb_mp);
+ tb->tb_mp = NULL;
}
- virtio_free_chain(ve);
+ /*
+ * Return this transmit buffer to the free list for reuse.
+ */
+ mutex_enter(&vif->vif_mutex);
+ vioif_txbuf_free(vif, tb);
+ mutex_exit(&vif->vif_mutex);
- /* External mapping used, mp was not freed in vioif_send() */
- if (mp != NULL)
- freemsg(mp);
num_reclaimed++;
}
@@ -942,24 +779,24 @@ vioif_reclaim_used_tx(struct vioif_softc *sc)
if (num_reclaimed > 0) {
boolean_t do_update = B_FALSE;
- mutex_enter(&sc->sc_tx_lock);
- if (sc->sc_tx_corked) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_stat_tx_reclaim += num_reclaimed;
+ if (vif->vif_tx_corked) {
/*
* TX was corked on a lack of available descriptors.
* That dire state has passed so the TX interrupt can
* be disabled and MAC can be notified that
* transmission is possible again.
*/
- sc->sc_tx_corked = B_FALSE;
- virtio_stop_vq_intr(sc->sc_tx_vq);
+ vif->vif_tx_corked = B_FALSE;
+ virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
do_update = B_TRUE;
}
- mutex_exit(&sc->sc_tx_lock);
- /* Notify MAC outside the above lock */
if (do_update) {
- mac_tx_update(sc->sc_mac_handle);
+ mac_tx_update(vif->vif_mac_handle);
}
+ mutex_exit(&vif->vif_mutex);
}
return (num_reclaimed);
@@ -968,208 +805,196 @@ vioif_reclaim_used_tx(struct vioif_softc *sc)
static void
vioif_reclaim_periodic(void *arg)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
uint_t num_reclaimed;
- num_reclaimed = vioif_reclaim_used_tx(sc);
+ num_reclaimed = vioif_reclaim_used_tx(vif);
- mutex_enter(&sc->sc_tx_lock);
- sc->sc_tx_reclaim_tid = 0;
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_tx_reclaim_tid = 0;
/*
* If used descriptors were reclaimed or TX descriptors appear to be
* outstanding, the ring is considered active and periodic reclamation
* is necessary for now.
*/
- if (num_reclaimed != 0 || vq_num_used(sc->sc_tx_vq) != 0) {
+ if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
/* Do not reschedule if the ring is being drained. */
- if (!sc->sc_tx_drain) {
- vioif_reclaim_restart(sc);
+ if (!vif->vif_tx_drain) {
+ vioif_reclaim_restart(vif);
}
}
- mutex_exit(&sc->sc_tx_lock);
+ mutex_exit(&vif->vif_mutex);
}
static void
-vioif_reclaim_restart(struct vioif_softc *sc)
+vioif_reclaim_restart(vioif_t *vif)
{
- ASSERT(MUTEX_HELD(&sc->sc_tx_lock));
- ASSERT(!sc->sc_tx_drain);
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
+ VERIFY(!vif->vif_tx_drain);
- if (sc->sc_tx_reclaim_tid == 0) {
- sc->sc_tx_reclaim_tid = timeout(vioif_reclaim_periodic, sc,
+ if (vif->vif_tx_reclaim_tid == 0) {
+ vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
}
}
static void
-vioif_tx_drain(struct vioif_softc *sc)
+vioif_tx_drain(vioif_t *vif)
{
- mutex_enter(&sc->sc_tx_lock);
- sc->sc_tx_drain = B_TRUE;
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
+ VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
+
+ vif->vif_tx_drain = B_TRUE;
/* Put a stop to the periodic reclaim if it is running */
- if (sc->sc_tx_reclaim_tid != 0) {
- timeout_id_t tid = sc->sc_tx_reclaim_tid;
+ if (vif->vif_tx_reclaim_tid != 0) {
+ timeout_id_t tid = vif->vif_tx_reclaim_tid;
/*
- * With sc_tx_drain set, there is no risk that a racing
+ * With vif_tx_drain set, there is no risk that a racing
* vioif_reclaim_periodic() call will reschedule itself.
*
* Being part of the mc_stop hook also guarantees that
- * vioif_tx() will not be called to restart it.
+ * vioif_m_tx() will not be called to restart it.
*/
- sc->sc_tx_reclaim_tid = 0;
- mutex_exit(&sc->sc_tx_lock);
+ vif->vif_tx_reclaim_tid = 0;
+ mutex_exit(&vif->vif_mutex);
(void) untimeout(tid);
- mutex_enter(&sc->sc_tx_lock);
+ mutex_enter(&vif->vif_mutex);
}
- virtio_stop_vq_intr(sc->sc_tx_vq);
- mutex_exit(&sc->sc_tx_lock);
+ virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
/*
* Wait for all of the TX descriptors to be processed by the host so
* they can be reclaimed.
*/
- while (vq_num_used(sc->sc_tx_vq) != 0) {
- (void) vioif_reclaim_used_tx(sc);
+ while (vif->vif_ntxbufs_alloc > 0) {
+ mutex_exit(&vif->vif_mutex);
+ (void) vioif_reclaim_used_tx(vif);
delay(5);
+ mutex_enter(&vif->vif_mutex);
}
-
- VERIFY(!sc->sc_tx_corked);
- VERIFY3U(sc->sc_tx_reclaim_tid, ==, 0);
- VERIFY3U(vq_num_used(sc->sc_tx_vq), ==, 0);
+ VERIFY(!vif->vif_tx_corked);
+ VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
+ VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
}
-/* sc will be used to update stat counters. */
-/* ARGSUSED */
-static inline void
-vioif_tx_inline(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp,
- size_t msg_size)
+static int
+vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
{
- struct vioif_tx_buf *buf;
- buf = &sc->sc_txbufs[ve->qe_index];
-
- ASSERT(buf);
+ VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
- /* Frees mp */
- mcopymsg(mp, buf->tb_inline_mapping.vbm_buf +
- sizeof (struct virtio_net_hdr));
+ VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
- virtio_ve_add_indirect_buf(ve,
- buf->tb_inline_mapping.vbm_dmac.dmac_laddress +
- sizeof (struct virtio_net_hdr), msg_size, B_TRUE);
-}
+ /*
+ * Copy the message into the inline buffer and then free the message.
+ */
+ mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
-static inline int
-vioif_tx_lazy_handle_alloc(struct vioif_softc *sc, struct vioif_tx_buf *buf,
- int i)
-{
- int ret = DDI_SUCCESS;
-
- if (!buf->tb_external_mapping[i].vbm_dmah) {
- ret = ddi_dma_alloc_handle(sc->sc_dev,
- &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL,
- &buf->tb_external_mapping[i].vbm_dmah);
- if (ret != DDI_SUCCESS) {
- sc->sc_txfail_dma_handle++;
- }
+ if (virtio_chain_append(tb->tb_chain,
+ virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
+ msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
}
- return (ret);
+ return (DDI_SUCCESS);
}
-static inline int
-vioif_tx_external(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp,
- size_t msg_size)
+static int
+vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
{
- _NOTE(ARGUNUSED(msg_size));
+ VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
- struct vioif_tx_buf *buf;
- mblk_t *nmp;
- int i, j;
- int ret = DDI_SUCCESS;
+ mblk_t *nmp = mp;
+ tb->tb_ndmaext = 0;
- buf = &sc->sc_txbufs[ve->qe_index];
-
- ASSERT(buf);
-
- buf->tb_external_num = 0;
- i = 0;
- nmp = mp;
-
- while (nmp) {
+ while (nmp != NULL) {
size_t len;
- ddi_dma_cookie_t dmac;
- unsigned int ncookies;
- len = MBLKL(nmp);
- /*
- * For some reason, the network stack can
- * actually send us zero-length fragments.
- */
- if (len == 0) {
+ if ((len = MBLKL(nmp)) == 0) {
+ /*
+ * Skip any zero-length entries in the chain.
+ */
nmp = nmp->b_cont;
continue;
}
- ret = vioif_tx_lazy_handle_alloc(sc, buf, i);
- if (ret != DDI_SUCCESS) {
- sc->sc_notxbuf++;
- sc->sc_oerrors++;
- goto exit_lazy_alloc;
- }
- ret = ddi_dma_addr_bind_handle(
- buf->tb_external_mapping[i].vbm_dmah, NULL,
- (caddr_t)nmp->b_rptr, len,
- DDI_DMA_WRITE | DDI_DMA_STREAMING,
- DDI_DMA_SLEEP, NULL, &dmac, &ncookies);
-
- if (ret != DDI_SUCCESS) {
- sc->sc_txfail_dma_bind++;
- sc->sc_oerrors++;
- goto exit_bind;
+ if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_txfail_indirect_limit++;
+ vif->vif_notxbuf++;
+ mutex_exit(&vif->vif_mutex);
+ goto fail;
}
- /* Check if we still fit into the indirect table. */
- if (virtio_ve_indirect_available(ve) < ncookies) {
- sc->sc_txfail_indirect_limit++;
- sc->sc_notxbuf++;
- sc->sc_oerrors++;
-
- ret = DDI_FAILURE;
- goto exit_limit;
+ if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
+ /*
+ * Allocate a DMA handle for this slot.
+ */
+ if ((tb->tb_dmaext[tb->tb_ndmaext] =
+ virtio_dma_alloc_nomem(vif->vif_virtio,
+ &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_notxbuf++;
+ mutex_exit(&vif->vif_mutex);
+ goto fail;
+ }
+ }
+ virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
+
+ if (virtio_dma_bind(extdma, nmp->b_rptr, len,
+ DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
+ DDI_SUCCESS) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_txfail_dma_bind++;
+ mutex_exit(&vif->vif_mutex);
+ goto fail;
}
- virtio_ve_add_cookie(ve, buf->tb_external_mapping[i].vbm_dmah,
- dmac, ncookies, B_TRUE);
+ for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
+ uint64_t pa = virtio_dma_cookie_pa(extdma, n);
+ size_t sz = virtio_dma_cookie_size(extdma, n);
+
+ if (virtio_chain_append(tb->tb_chain, pa, sz,
+ VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_txfail_indirect_limit++;
+ vif->vif_notxbuf++;
+ mutex_exit(&vif->vif_mutex);
+ goto fail;
+ }
+ }
nmp = nmp->b_cont;
- i++;
}
- buf->tb_external_num = i;
- /* Save the mp to free it when the packet is sent. */
- buf->tb_mp = mp;
+ /*
+	 * We need to keep the message around until the buffer has been
+	 * reclaimed from the device; only then is it freed.
+ */
+ tb->tb_mp = mp;
return (DDI_SUCCESS);
-exit_limit:
-exit_bind:
-exit_lazy_alloc:
-
- for (j = 0; j < i; j++) {
- (void) ddi_dma_unbind_handle(
- buf->tb_external_mapping[j].vbm_dmah);
+fail:
+ for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
+ if (tb->tb_dmaext[n] != NULL) {
+ virtio_dma_unbind(tb->tb_dmaext[n]);
+ }
}
+ tb->tb_ndmaext = 0;
- return (ret);
+ freemsg(mp);
+
+ return (DDI_FAILURE);
}
static boolean_t
-vioif_send(struct vioif_softc *sc, mblk_t *mp)
+vioif_send(vioif_t *vif, mblk_t *mp)
{
- struct vq_entry *ve;
- struct vioif_tx_buf *buf;
- struct virtio_net_hdr *net_header = NULL;
+ VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
+
+ vioif_txbuf_t *tb = NULL;
+ struct virtio_net_hdr *vnh = NULL;
size_t msg_size = 0;
uint32_t csum_start;
uint32_t csum_stuff;
@@ -1179,133 +1004,159 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp)
mblk_t *nmp;
int ret;
boolean_t lso_required = B_FALSE;
+ struct ether_header *ether = (void *)mp->b_rptr;
for (nmp = mp; nmp; nmp = nmp->b_cont)
msg_size += MBLKL(nmp);
- if (sc->sc_tx_tso4) {
+ if (vif->vif_tx_tso4) {
mac_lso_get(mp, &lso_mss, &lso_flags);
- lso_required = (lso_flags & HW_LSO);
+ lso_required = (lso_flags & HW_LSO) != 0;
}
- ve = vq_alloc_entry(sc->sc_tx_vq);
-
- if (ve == NULL) {
- sc->sc_notxbuf++;
- /* Out of free descriptors - try later. */
- return (B_FALSE);
+ mutex_enter(&vif->vif_mutex);
+ if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
+ vif->vif_notxbuf++;
+ goto fail;
}
- buf = &sc->sc_txbufs[ve->qe_index];
+ mutex_exit(&vif->vif_mutex);
- /* Use the inline buffer of the first entry for the virtio_net_hdr. */
- (void) memset(buf->tb_inline_mapping.vbm_buf, 0,
- sizeof (struct virtio_net_hdr));
+ /*
+ * Use the inline buffer for the virtio net header. Zero the portion
+ * of our DMA allocation prior to the packet data.
+ */
+ vnh = virtio_dma_va(tb->tb_dma, 0);
+ bzero(vnh, VIOIF_HEADER_SKIP);
- net_header = (struct virtio_net_hdr *)buf->tb_inline_mapping.vbm_buf;
+ /*
+ * For legacy devices, and those that have not negotiated
+ * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
+ * descriptor entry to the rest of the buffer.
+ */
+ if (virtio_chain_append(tb->tb_chain,
+ virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
+ VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_notxbuf++;
+ goto fail;
+ }
- mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL,
- NULL, &csum_flags);
+ mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
- /* They want us to do the TCP/UDP csum calculation. */
+ /*
+ * They want us to do the TCP/UDP csum calculation.
+ */
if (csum_flags & HCK_PARTIALCKSUM) {
- struct ether_header *eth_header;
int eth_hsize;
- /* Did we ask for it? */
- ASSERT(sc->sc_tx_csum);
+ /*
+ * Did we ask for it?
+ */
+ ASSERT(vif->vif_tx_csum);
- /* We only asked for partial csum packets. */
+ /*
+ * We only asked for partial csum packets.
+ */
ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
ASSERT(!(csum_flags & HCK_FULLCKSUM));
- eth_header = (void *) mp->b_rptr;
- if (eth_header->ether_type == htons(ETHERTYPE_VLAN)) {
+ if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
eth_hsize = sizeof (struct ether_vlan_header);
} else {
eth_hsize = sizeof (struct ether_header);
}
- net_header->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- net_header->csum_start = eth_hsize + csum_start;
- net_header->csum_offset = csum_stuff - csum_start;
+
+ vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ vnh->vnh_csum_start = eth_hsize + csum_start;
+ vnh->vnh_csum_offset = csum_stuff - csum_start;
}
- /* setup LSO fields if required */
+ /*
+ * Setup LSO fields if required.
+ */
if (lso_required) {
- net_header->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
- net_header->gso_size = (uint16_t)lso_mss;
+ vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ vnh->vnh_gso_size = (uint16_t)lso_mss;
}
- virtio_ve_add_indirect_buf(ve,
- buf->tb_inline_mapping.vbm_dmac.dmac_laddress,
- sizeof (struct virtio_net_hdr), B_TRUE);
-
- /* meanwhile update the statistic */
- if (mp->b_rptr[0] & 0x1) {
- if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
- sc->sc_multixmt++;
- else
- sc->sc_brdcstxmt++;
+ /*
+ * The device does not maintain its own statistics about broadcast or
+ * multicast packets, so we have to check the destination address
+ * ourselves.
+ */
+ if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
+ mutex_enter(&vif->vif_mutex);
+ if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
+ vif->vif_brdcstxmt++;
+ } else {
+ vif->vif_multixmt++;
+ }
+ mutex_exit(&vif->vif_mutex);
}
/*
- * We copy small packets into the inline buffer. The bigger ones
- * get mapped using the mapped buffer.
+ * For small packets, copy into the preallocated inline buffer rather
+ * than incur the overhead of mapping. Note that both of these
+ * functions ensure that "mp" is freed before returning.
*/
- if (msg_size < sc->sc_txcopy_thresh) {
- vioif_tx_inline(sc, ve, mp, msg_size);
+ if (msg_size < vif->vif_txcopy_thresh) {
+ ret = vioif_tx_inline(vif, tb, mp, msg_size);
} else {
- /* statistic gets updated by vioif_tx_external when fail */
- ret = vioif_tx_external(sc, ve, mp, msg_size);
- if (ret != DDI_SUCCESS)
- goto exit_tx_external;
+ ret = vioif_tx_external(vif, tb, mp, msg_size);
}
+ mp = NULL;
- virtio_push_chain(ve, B_TRUE);
-
- sc->sc_opackets++;
- sc->sc_obytes += msg_size;
+ mutex_enter(&vif->vif_mutex);
- return (B_TRUE);
+ if (ret != DDI_SUCCESS) {
+ goto fail;
+ }
-exit_tx_external:
+ vif->vif_opackets++;
+ vif->vif_obytes += msg_size;
+ mutex_exit(&vif->vif_mutex);
- vq_free_entry(sc->sc_tx_vq, ve);
- /*
- * vioif_tx_external can fail when the buffer does not fit into the
- * indirect descriptor table. Free the mp. I don't expect this ever
- * to happen.
- */
- freemsg(mp);
+ virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
+ virtio_chain_submit(tb->tb_chain, B_TRUE);
return (B_TRUE);
+
+fail:
+ vif->vif_oerrors++;
+ if (tb != NULL) {
+ vioif_txbuf_free(vif, tb);
+ }
+ mutex_exit(&vif->vif_mutex);
+
+ return (mp == NULL);
}
static mblk_t *
-vioif_tx(void *arg, mblk_t *mp)
+vioif_m_tx(void *arg, mblk_t *mp)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
mblk_t *nmp;
/*
* Prior to attempting to send any more frames, do a reclaim to pick up
* any descriptors which have been processed by the host.
*/
- if (vq_num_used(sc->sc_tx_vq) != 0) {
- (void) vioif_reclaim_used_tx(sc);
+ if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
+ (void) vioif_reclaim_used_tx(vif);
}
while (mp != NULL) {
nmp = mp->b_next;
mp->b_next = NULL;
- if (!vioif_send(sc, mp)) {
+ if (!vioif_send(vif, mp)) {
/*
* If there are no descriptors available, try to
* reclaim some, allowing a retry of the send if some
* are found.
*/
mp->b_next = nmp;
- if (vioif_reclaim_used_tx(sc) != 0) {
+ if (vioif_reclaim_used_tx(vif) != 0) {
continue;
}
@@ -1315,106 +1166,116 @@ vioif_tx(void *arg, mblk_t *mp)
* can begin again. For safety, make sure the periodic
* reclaim is running as well.
*/
- mutex_enter(&sc->sc_tx_lock);
- sc->sc_tx_corked = B_TRUE;
- virtio_start_vq_intr(sc->sc_tx_vq);
- vioif_reclaim_restart(sc);
- mutex_exit(&sc->sc_tx_lock);
+ mutex_enter(&vif->vif_mutex);
+ vif->vif_tx_corked = B_TRUE;
+ virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
+ vioif_reclaim_restart(vif);
+ mutex_exit(&vif->vif_mutex);
return (mp);
}
mp = nmp;
}
/* Ensure the periodic reclaim has been started. */
- mutex_enter(&sc->sc_tx_lock);
- vioif_reclaim_restart(sc);
- mutex_exit(&sc->sc_tx_lock);
+ mutex_enter(&vif->vif_mutex);
+ vioif_reclaim_restart(vif);
+ mutex_exit(&vif->vif_mutex);
return (NULL);
}
static int
-vioif_start(void *arg)
+vioif_m_start(void *arg)
{
- struct vioif_softc *sc = arg;
- struct vq_entry *ve;
- uint32_t len;
+ vioif_t *vif = arg;
+
+ mutex_enter(&vif->vif_mutex);
+
+ VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
+ vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
- mac_link_update(sc->sc_mac_handle, vioif_link_state(sc));
+ mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
- virtio_start_vq_intr(sc->sc_rx_vq);
+ virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
/*
* Starting interrupts on the TX virtqueue is unnecessary at this time.
	 * Descriptor reclamation is handled during transmit, via a periodic
* timer, and when resources are tight, via the then-enabled interrupt.
*/
- sc->sc_tx_drain = B_FALSE;
+ vif->vif_tx_drain = B_FALSE;
/*
- * Clear any data that arrived early on the receive queue and populate
- * it with free buffers that the device can use moving forward.
+ * Add as many receive buffers as we can to the receive queue. If we
+ * cannot add any, it may be because we have stopped and started again
+ * and the descriptors are all in the queue already.
*/
- while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len)) != NULL) {
- virtio_free_chain(ve);
- }
- (void) vioif_populate_rx(sc, KM_SLEEP);
+ (void) vioif_add_rx(vif);
+ mutex_exit(&vif->vif_mutex);
return (DDI_SUCCESS);
}
static void
-vioif_stop(void *arg)
+vioif_m_stop(void *arg)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
+
+ mutex_enter(&vif->vif_mutex);
+
+ VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
+ vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
/* Ensure all TX descriptors have been processed and reclaimed */
- vioif_tx_drain(sc);
+ vioif_tx_drain(vif);
- virtio_stop_vq_intr(sc->sc_rx_vq);
+ virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
+
+ vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
+ mutex_exit(&vif->vif_mutex);
}
static int
-vioif_stat(void *arg, uint_t stat, uint64_t *val)
+vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
switch (stat) {
case MAC_STAT_IERRORS:
- *val = sc->sc_ierrors;
+ *val = vif->vif_ierrors;
break;
case MAC_STAT_OERRORS:
- *val = sc->sc_oerrors;
+ *val = vif->vif_oerrors;
break;
case MAC_STAT_MULTIRCV:
- *val = sc->sc_multircv;
+ *val = vif->vif_multircv;
break;
case MAC_STAT_BRDCSTRCV:
- *val = sc->sc_brdcstrcv;
+ *val = vif->vif_brdcstrcv;
break;
case MAC_STAT_MULTIXMT:
- *val = sc->sc_multixmt;
+ *val = vif->vif_multixmt;
break;
case MAC_STAT_BRDCSTXMT:
- *val = sc->sc_brdcstxmt;
+ *val = vif->vif_brdcstxmt;
break;
case MAC_STAT_IPACKETS:
- *val = sc->sc_ipackets;
+ *val = vif->vif_ipackets;
break;
case MAC_STAT_RBYTES:
- *val = sc->sc_rbytes;
+ *val = vif->vif_rbytes;
break;
case MAC_STAT_OPACKETS:
- *val = sc->sc_opackets;
+ *val = vif->vif_opackets;
break;
case MAC_STAT_OBYTES:
- *val = sc->sc_obytes;
+ *val = vif->vif_obytes;
break;
case MAC_STAT_NORCVBUF:
- *val = sc->sc_norecvbuf;
+ *val = vif->vif_norecvbuf;
break;
case MAC_STAT_NOXMTBUF:
- *val = sc->sc_notxbuf;
+ *val = vif->vif_notxbuf;
break;
case MAC_STAT_IFSPEED:
/* always 1 Gbit */
@@ -1433,651 +1294,490 @@ vioif_stat(void *arg, uint_t stat, uint64_t *val)
}
static int
-vioif_set_prop_private(struct vioif_softc *sc, const char *pr_name,
+vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
uint_t pr_valsize, const void *pr_val)
{
- _NOTE(ARGUNUSED(pr_valsize));
-
- long result;
+ vioif_t *vif = arg;
- if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
+ switch (pr_num) {
+ case MAC_PROP_MTU: {
+ int r;
+ uint32_t mtu;
+ if (pr_valsize < sizeof (mtu)) {
+ return (EOVERFLOW);
+ }
+ bcopy(pr_val, &mtu, sizeof (mtu));
- if (pr_val == NULL)
+ if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
return (EINVAL);
+ }
- (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
+ mutex_enter(&vif->vif_mutex);
+ if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
+ vif->vif_mtu = mtu;
+ }
+ mutex_exit(&vif->vif_mutex);
- if (result < 0 || result > VIOIF_TX_THRESH_MAX)
- return (EINVAL);
- sc->sc_txcopy_thresh = result;
+ return (r);
}
- if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
- if (pr_val == NULL)
- return (EINVAL);
-
- (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
+ case MAC_PROP_PRIVATE: {
+ long max, result;
+ uint_t *resp;
+ char *endptr;
+
+ if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
+ max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
+ resp = &vif->vif_txcopy_thresh;
+ } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
+ max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
+ resp = &vif->vif_rxcopy_thresh;
+ } else {
+ return (ENOTSUP);
+ }
- if (result < 0 || result > VIOIF_RX_THRESH_MAX)
+ if (pr_val == NULL) {
return (EINVAL);
- sc->sc_rxcopy_thresh = result;
- }
- return (0);
-}
-
-static int
-vioif_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
- uint_t pr_valsize, const void *pr_val)
-{
- struct vioif_softc *sc = arg;
- const uint32_t *new_mtu;
- int err;
-
- switch (pr_num) {
- case MAC_PROP_MTU:
- new_mtu = pr_val;
+ }
- if (*new_mtu > MAX_MTU) {
+ if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
+ *endptr != '\0' || result < 0 || result > max) {
return (EINVAL);
}
- err = mac_maxsdu_update(sc->sc_mac_handle, *new_mtu);
- if (err) {
- return (err);
- }
- break;
- case MAC_PROP_PRIVATE:
- err = vioif_set_prop_private(sc, pr_name,
- pr_valsize, pr_val);
- if (err)
- return (err);
- break;
+ mutex_enter(&vif->vif_mutex);
+ *resp = result;
+ mutex_exit(&vif->vif_mutex);
+
+ return (0);
+ }
+
default:
return (ENOTSUP);
}
-
- return (0);
}
static int
-vioif_get_prop_private(struct vioif_softc *sc, const char *pr_name,
+vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
uint_t pr_valsize, void *pr_val)
{
- int err = ENOTSUP;
- int value;
+ vioif_t *vif = arg;
- if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
+ switch (pr_num) {
+ case MAC_PROP_PRIVATE: {
+ uint_t value;
- value = sc->sc_txcopy_thresh;
- err = 0;
- goto done;
- }
- if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
+ if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
+ value = vif->vif_txcopy_thresh;
+ } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
+ value = vif->vif_rxcopy_thresh;
+ } else {
+ return (ENOTSUP);
+ }
- value = sc->sc_rxcopy_thresh;
- err = 0;
- goto done;
- }
-done:
- if (err == 0) {
- (void) snprintf(pr_val, pr_valsize, "%d", value);
- }
- return (err);
-}
+ if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
+ return (EOVERFLOW);
+ }
-static int
-vioif_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
- uint_t pr_valsize, void *pr_val)
-{
- struct vioif_softc *sc = arg;
- int err = ENOTSUP;
+ return (0);
+ }
- switch (pr_num) {
- case MAC_PROP_PRIVATE:
- err = vioif_get_prop_private(sc, pr_name,
- pr_valsize, pr_val);
- break;
default:
- break;
+ return (ENOTSUP);
}
- return (err);
}
static void
-vioif_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
mac_prop_info_handle_t prh)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
char valstr[64];
int value;
switch (pr_num) {
case MAC_PROP_MTU:
- mac_prop_info_set_range_uint32(prh, ETHERMIN, MAX_MTU);
- break;
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
+ return;
case MAC_PROP_PRIVATE:
- bzero(valstr, sizeof (valstr));
- if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
- value = sc->sc_txcopy_thresh;
- } else if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
- value = sc->sc_rxcopy_thresh;
+ if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
+ value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
+ } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
+ value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
} else {
+ /*
+ * We do not recognise this private property name.
+ */
return;
}
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
(void) snprintf(valstr, sizeof (valstr), "%d", value);
- break;
+ mac_prop_info_set_default_str(prh, valstr);
+ return;
default:
- break;
+ return;
}
}
static boolean_t
-vioif_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
- struct vioif_softc *sc = arg;
+ vioif_t *vif = arg;
switch (cap) {
- case MAC_CAPAB_HCKSUM:
- if (sc->sc_tx_csum) {
- uint32_t *txflags = cap_data;
-
- *txflags = HCKSUM_INET_PARTIAL;
- return (B_TRUE);
+ case MAC_CAPAB_HCKSUM: {
+ if (!vif->vif_tx_csum) {
+ return (B_FALSE);
}
- return (B_FALSE);
- case MAC_CAPAB_LSO:
- if (sc->sc_tx_tso4) {
- mac_capab_lso_t *cap_lso = cap_data;
- cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
- cap_lso->lso_basic_tcp_ipv4.lso_max = MAX_MTU;
- return (B_TRUE);
- }
- return (B_FALSE);
- default:
- break;
+ *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
+
+ return (B_TRUE);
}
- return (B_FALSE);
-}
-static mac_callbacks_t vioif_m_callbacks = {
- .mc_callbacks = (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO),
- .mc_getstat = vioif_stat,
- .mc_start = vioif_start,
- .mc_stop = vioif_stop,
- .mc_setpromisc = vioif_promisc,
- .mc_multicst = vioif_multicst,
- .mc_unicst = vioif_unicst,
- .mc_tx = vioif_tx,
- /* Optional callbacks */
- .mc_reserved = NULL, /* reserved */
- .mc_ioctl = NULL, /* mc_ioctl */
- .mc_getcapab = vioif_getcapab, /* mc_getcapab */
- .mc_open = NULL, /* mc_open */
- .mc_close = NULL, /* mc_close */
- .mc_setprop = vioif_setprop,
- .mc_getprop = vioif_getprop,
- .mc_propinfo = vioif_propinfo,
-};
+ case MAC_CAPAB_LSO: {
+ if (!vif->vif_tx_tso4) {
+ return (B_FALSE);
+ }
-static void
-vioif_show_features(struct vioif_softc *sc, const char *prefix,
- uint32_t features)
-{
- char buf[512];
- char *bufp = buf;
- char *bufend = buf + sizeof (buf);
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += snprintf(bufp, bufend - bufp, prefix);
- /* LINTED E_PTRDIFF_OVERFLOW */
- bufp += virtio_show_features(features, bufp, bufend - bufp);
- *bufp = '\0';
-
- /* Using '!' to only CE_NOTE this to the system log. */
- dev_err(sc->sc_dev, CE_NOTE, "!%s Vioif (%b)", buf, features,
- VIRTIO_NET_FEATURE_BITS);
-}
+ mac_capab_lso_t *lso = cap_data;
+ lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
-/*
- * Find out which features are supported by the device and
- * choose which ones we wish to use.
- */
-static int
-vioif_dev_features(struct vioif_softc *sc)
-{
- uint32_t host_features;
-
- host_features = virtio_negotiate_features(&sc->sc_virtio,
- VIRTIO_NET_F_CSUM |
- VIRTIO_NET_F_HOST_TSO4 |
- VIRTIO_NET_F_HOST_ECN |
- VIRTIO_NET_F_MAC |
- VIRTIO_NET_F_STATUS |
- VIRTIO_F_RING_INDIRECT_DESC);
-
- vioif_show_features(sc, "Host features: ", host_features);
- vioif_show_features(sc, "Negotiated features: ",
- sc->sc_virtio.sc_features);
-
- if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) {
- dev_err(sc->sc_dev, CE_WARN,
- "Host does not support RING_INDIRECT_DESC. Cannot attach.");
- return (DDI_FAILURE);
+ return (B_TRUE);
}
- return (DDI_SUCCESS);
+ default:
+ return (B_FALSE);
+ }
}
static boolean_t
-vioif_has_feature(struct vioif_softc *sc, uint32_t feature)
+vioif_has_feature(vioif_t *vif, uint32_t feature)
{
- return (virtio_has_feature(&sc->sc_virtio, feature));
+ return (virtio_feature_present(vif->vif_virtio, feature));
}
+/*
+ * Read the primary MAC address from the device if one is provided. If not,
+ * generate a random locally administered MAC address and write it back to the
+ * device.
+ */
static void
-vioif_set_mac(struct vioif_softc *sc)
+vioif_get_mac(vioif_t *vif)
{
- int i;
-
- for (i = 0; i < ETHERADDRL; i++) {
- virtio_write_device_config_1(&sc->sc_virtio,
- VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]);
- }
- sc->sc_mac_from_host = 0;
-}
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
-/* Get the mac address out of the hardware, or make up one. */
-static void
-vioif_get_mac(struct vioif_softc *sc)
-{
- int i;
- if (sc->sc_virtio.sc_features & VIRTIO_NET_F_MAC) {
- for (i = 0; i < ETHERADDRL; i++) {
- sc->sc_mac[i] = virtio_read_device_config_1(
- &sc->sc_virtio,
+ if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
+ for (uint_t i = 0; i < ETHERADDRL; i++) {
+ vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
VIRTIO_NET_CONFIG_MAC + i);
}
- sc->sc_mac_from_host = 1;
- } else {
- /* Get a few random bytes */
- (void) random_get_pseudo_bytes(sc->sc_mac, ETHERADDRL);
- /* Make sure it's a unicast MAC */
- sc->sc_mac[0] &= ~1;
- /* Set the "locally administered" bit */
- sc->sc_mac[1] |= 2;
+ vif->vif_mac_from_host = 1;
- vioif_set_mac(sc);
+ return;
+ }
- dev_err(sc->sc_dev, CE_NOTE,
- "!Generated a random MAC address: %s",
- ether_sprintf((struct ether_addr *)sc->sc_mac));
+ /* Get a few random bytes */
+ (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
+ /* Make sure it's a unicast MAC */
+ vif->vif_mac[0] &= ~1;
+ /* Set the "locally administered" bit */
+ vif->vif_mac[1] |= 2;
+
+ /*
+ * Write the random MAC address back to the device.
+ */
+ for (uint_t i = 0; i < ETHERADDRL; i++) {
+ virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
+ vif->vif_mac[i]);
}
+ vif->vif_mac_from_host = 0;
+
+ dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
+ "%02x:%02x:%02x:%02x:%02x:%02x",
+ (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
+ (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
+ (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
}
/*
* Virtqueue interrupt handlers
*/
-/* ARGSUSED */
static uint_t
-vioif_rx_handler(caddr_t arg1, caddr_t arg2)
+vioif_rx_handler(caddr_t arg0, caddr_t arg1)
{
- struct virtio_softc *vsc = (void *) arg1;
- struct vioif_softc *sc = __containerof(vsc,
- struct vioif_softc, sc_virtio);
+ vioif_t *vif = (vioif_t *)arg0;
+
+ mutex_enter(&vif->vif_mutex);
+ (void) vioif_process_rx(vif);
/*
- * The return values of these functions are not needed but they make
- * debugging interrupts simpler because you can use them to detect when
- * stuff was processed and repopulated in this handler.
+ * Attempt to replenish the receive queue. If we cannot add any
+ * descriptors here, it may be because all of the recently received
+ * packets were loaned up to the networking stack.
*/
- (void) vioif_process_rx(sc);
- (void) vioif_populate_rx(sc, KM_NOSLEEP);
+ (void) vioif_add_rx(vif);
+ mutex_exit(&vif->vif_mutex);
return (DDI_INTR_CLAIMED);
}
-/* ARGSUSED */
static uint_t
-vioif_tx_handler(caddr_t arg1, caddr_t arg2)
+vioif_tx_handler(caddr_t arg0, caddr_t arg1)
{
- struct virtio_softc *vsc = (void *)arg1;
- struct vioif_softc *sc = __containerof(vsc,
- struct vioif_softc, sc_virtio);
+ vioif_t *vif = (vioif_t *)arg0;
/*
* The TX interrupt could race with other reclamation activity, so
* interpreting the return value is unimportant.
*/
- (void) vioif_reclaim_used_tx(sc);
+ (void) vioif_reclaim_used_tx(vif);
return (DDI_INTR_CLAIMED);
}
-static int
-vioif_register_ints(struct vioif_softc *sc)
-{
- int ret;
-
- struct virtio_int_handler vioif_vq_h[] = {
- { vioif_rx_handler },
- { vioif_tx_handler },
- { NULL }
- };
-
- ret = virtio_register_ints(&sc->sc_virtio, NULL, vioif_vq_h);
-
- return (ret);
-}
-
-
static void
-vioif_check_features(struct vioif_softc *sc)
+vioif_check_features(vioif_t *vif)
{
- if (vioif_has_feature(sc, VIRTIO_NET_F_CSUM)) {
- /* The GSO/GRO featured depend on CSUM, check them here. */
- sc->sc_tx_csum = 1;
- sc->sc_rx_csum = 1;
+ VERIFY(MUTEX_HELD(&vif->vif_mutex));
- if (!vioif_has_feature(sc, VIRTIO_NET_F_GUEST_CSUM)) {
- sc->sc_rx_csum = 0;
- }
- dev_err(sc->sc_dev, CE_NOTE, "!Csum enabled.");
+ vif->vif_tx_csum = 0;
+ vif->vif_tx_tso4 = 0;
- if (vioif_has_feature(sc, VIRTIO_NET_F_HOST_TSO4)) {
+ if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
+ /*
+ * The host will accept packets with partial checksums from us.
+ */
+ vif->vif_tx_csum = 1;
- sc->sc_tx_tso4 = 1;
- /*
- * We don't seem to have a way to ask the system
- * not to send us LSO packets with Explicit
- * Congestion Notification bit set, so we require
- * the device to support it in order to do
- * LSO.
- */
- if (!vioif_has_feature(sc, VIRTIO_NET_F_HOST_ECN)) {
- dev_err(sc->sc_dev, CE_NOTE,
- "!TSO4 supported, but not ECN. "
- "Not using LSO.");
- sc->sc_tx_tso4 = 0;
- } else {
- dev_err(sc->sc_dev, CE_NOTE, "!LSO enabled");
- }
+ /*
+ * The legacy GSO feature represents the combination of
+ * HOST_TSO4, HOST_TSO6, and HOST_ECN.
+ */
+ boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
+ boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
+ boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
+
+ /*
+ * Explicit congestion notification (ECN) is configured
+ * globally; see "tcp_ecn_permitted". As we cannot currently
+ * request that the stack disable ECN on a per interface basis,
+ * we require the device to support the combination of
+ * segmentation offload and ECN support.
+ */
+ if (gso || (tso4 && ecn)) {
+ vif->vif_tx_tso4 = 1;
}
}
}
static int
-vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
+vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
- int ret, instance;
- struct vioif_softc *sc;
- struct virtio_softc *vsc;
- mac_register_t *macp;
- char cache_name[CACHE_NAME_SIZE];
-
- instance = ddi_get_instance(devinfo);
-
- switch (cmd) {
- case DDI_ATTACH:
- break;
-
- case DDI_RESUME:
- case DDI_PM_RESUME:
- /* We do not support suspend/resume for vioif. */
- goto exit;
+ int ret;
+ vioif_t *vif;
+ virtio_t *vio;
+ mac_register_t *macp = NULL;
- default:
- goto exit;
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
}
- sc = kmem_zalloc(sizeof (struct vioif_softc), KM_SLEEP);
- ddi_set_driver_private(devinfo, sc);
-
- vsc = &sc->sc_virtio;
+ if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
+ NULL) {
+ return (DDI_FAILURE);
+ }
- /* Duplicate for less typing */
- sc->sc_dev = devinfo;
- vsc->sc_dev = devinfo;
+ vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
+ vif->vif_dip = dip;
+ vif->vif_virtio = vio;
+ vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
+ ddi_set_driver_private(dip, vif);
+
+ if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
+ "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
+ (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
+ "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
+ goto fail;
+ }
- /*
- * Initialize interrupt kstat.
- */
- sc->sc_intrstat = kstat_create("vioif", instance, "intr", "controller",
- KSTAT_TYPE_INTR, 1, 0);
- if (sc->sc_intrstat == NULL) {
- dev_err(devinfo, CE_WARN, "kstat_create failed");
- goto exit_intrstat;
- }
- kstat_install(sc->sc_intrstat);
-
- /* map BAR 0 */
- ret = ddi_regs_map_setup(devinfo, 1,
- (caddr_t *)&sc->sc_virtio.sc_io_addr,
- 0, 0, &vioif_attr, &sc->sc_virtio.sc_ioh);
- if (ret != DDI_SUCCESS) {
- dev_err(devinfo, CE_WARN, "unable to map bar 0: %d", ret);
- goto exit_map;
+ if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to complete Virtio init");
+ goto fail;
}
- virtio_device_reset(&sc->sc_virtio);
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK);
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER);
+ virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
+ virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
- ret = vioif_dev_features(sc);
- if (ret)
- goto exit_features;
+ mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
+ mutex_enter(&vif->vif_mutex);
- vsc->sc_nvqs = vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
+ vioif_get_mac(vif);
- (void) snprintf(cache_name, CACHE_NAME_SIZE, "vioif%d_rx", instance);
- sc->sc_rxbuf_cache = kmem_cache_create(cache_name,
- sizeof (struct vioif_rx_buf), 0, vioif_rx_construct,
- vioif_rx_destruct, NULL, sc, NULL, KM_SLEEP);
- if (sc->sc_rxbuf_cache == NULL) {
- dev_err(sc->sc_dev, CE_WARN, "Can't allocate the buffer cache");
- goto exit_cache;
- }
+ vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
+ vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
- ret = vioif_register_ints(sc);
- if (ret) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate interrupt(s)!");
- goto exit_ints;
+ if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
+ vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
+ } else {
+ vif->vif_mtu_max = ETHERMTU;
}
- /*
- * Register layout determined, can now access the
- * device-specific bits
- */
- vioif_get_mac(sc);
-
- sc->sc_rx_vq = virtio_alloc_vq(&sc->sc_virtio, 0,
- VIOIF_RX_QLEN, VIOIF_INDIRECT_MAX, "rx");
- if (!sc->sc_rx_vq)
- goto exit_alloc1;
- virtio_stop_vq_intr(sc->sc_rx_vq);
-
- sc->sc_tx_vq = virtio_alloc_vq(&sc->sc_virtio, 1,
- VIOIF_TX_QLEN, VIOIF_INDIRECT_MAX, "tx");
- if (!sc->sc_tx_vq)
- goto exit_alloc2;
- virtio_stop_vq_intr(sc->sc_tx_vq);
-
- mutex_init(&sc->sc_tx_lock, NULL, MUTEX_DRIVER,
- DDI_INTR_PRI(sc->sc_virtio.sc_intr_prio));
-
- if (vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ)) {
- sc->sc_ctrl_vq = virtio_alloc_vq(&sc->sc_virtio, 2,
- VIOIF_CTRL_QLEN, 0, "ctrl");
- if (!sc->sc_ctrl_vq) {
- goto exit_alloc3;
- }
- virtio_stop_vq_intr(sc->sc_ctrl_vq);
+ vif->vif_mtu = ETHERMTU;
+ if (vif->vif_mtu > vif->vif_mtu_max) {
+ vif->vif_mtu = vif->vif_mtu_max;
}
- virtio_set_status(&sc->sc_virtio,
- VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK);
-
- sc->sc_rxloan = 0;
+ vioif_check_features(vif);
- /* set some reasonable-small default values */
- sc->sc_rxcopy_thresh = 300;
- sc->sc_txcopy_thresh = 300;
- sc->sc_mtu = ETHERMTU;
+ if (vioif_alloc_bufs(vif) != 0) {
+ mutex_exit(&vif->vif_mutex);
+ dev_err(dip, CE_WARN, "failed to allocate memory");
+ goto fail;
+ }
- vioif_check_features(sc);
+ mutex_exit(&vif->vif_mutex);
- if (vioif_alloc_mems(sc) != 0)
- goto exit_alloc_mems;
+ if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to enable interrupts");
+ goto fail;
+ }
if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
- dev_err(devinfo, CE_WARN, "Failed to allocate a mac_register");
- goto exit_macalloc;
+ dev_err(dip, CE_WARN, "failed to allocate a mac_register");
+ goto fail;
}
macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
- macp->m_driver = sc;
- macp->m_dip = devinfo;
- macp->m_src_addr = sc->sc_mac;
- macp->m_callbacks = &vioif_m_callbacks;
+ macp->m_driver = vif;
+ macp->m_dip = dip;
+ macp->m_src_addr = vif->vif_mac;
+ macp->m_callbacks = &vioif_mac_callbacks;
macp->m_min_sdu = 0;
- macp->m_max_sdu = sc->sc_mtu;
+ macp->m_max_sdu = vif->vif_mtu;
macp->m_margin = VLAN_TAGSZ;
macp->m_priv_props = vioif_priv_props;
- sc->sc_macp = macp;
-
- /* Pre-fill the rx ring. */
- (void) vioif_populate_rx(sc, KM_SLEEP);
-
- ret = mac_register(macp, &sc->sc_mac_handle);
- if (ret != 0) {
- dev_err(devinfo, CE_WARN, "vioif_attach: "
- "mac_register() failed, ret=%d", ret);
- goto exit_register;
+ if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
+ dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
+ goto fail;
}
+ mac_free(macp);
- ret = virtio_enable_ints(&sc->sc_virtio);
- if (ret) {
- dev_err(devinfo, CE_WARN, "Failed to enable interrupts");
- goto exit_enable_ints;
- }
+ mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
- mac_link_update(sc->sc_mac_handle, LINK_STATE_UP);
return (DDI_SUCCESS);
-exit_enable_ints:
- (void) mac_unregister(sc->sc_mac_handle);
-exit_register:
- mac_free(macp);
-exit_macalloc:
- vioif_free_mems(sc);
-exit_alloc_mems:
- virtio_release_ints(&sc->sc_virtio);
- if (sc->sc_ctrl_vq)
- virtio_free_vq(sc->sc_ctrl_vq);
-exit_alloc3:
- virtio_free_vq(sc->sc_tx_vq);
-exit_alloc2:
- virtio_free_vq(sc->sc_rx_vq);
-exit_alloc1:
-exit_ints:
- kmem_cache_destroy(sc->sc_rxbuf_cache);
-exit_cache:
-exit_features:
- virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED);
- ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
-exit_intrstat:
-exit_map:
- kstat_delete(sc->sc_intrstat);
- kmem_free(sc, sizeof (struct vioif_softc));
-exit:
+fail:
+ vioif_free_bufs(vif);
+ if (macp != NULL) {
+ mac_free(macp);
+ }
+ (void) virtio_fini(vio, B_TRUE);
+ kmem_free(vif, sizeof (*vif));
return (DDI_FAILURE);
}
static int
-vioif_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
+vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
- struct vioif_softc *sc;
+ int r;
+ vioif_t *vif;
- if ((sc = ddi_get_driver_private(devinfo)) == NULL)
+ if (cmd != DDI_DETACH) {
return (DDI_FAILURE);
+ }
- switch (cmd) {
- case DDI_DETACH:
- break;
+ if ((vif = ddi_get_driver_private(dip)) == NULL) {
+ return (DDI_FAILURE);
+ }
- case DDI_PM_SUSPEND:
- /* We do not support suspend/resume for vioif. */
+ mutex_enter(&vif->vif_mutex);
+ if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
+ dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
+ mutex_exit(&vif->vif_mutex);
return (DDI_FAILURE);
+ }
- default:
+ /*
+ * There should be no outstanding transmit buffers once the NIC is
+ * completely stopped.
+ */
+ VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
+
+ /*
+ * Though we cannot claw back all of the receive buffers until we reset
+ * the device, we must ensure all those loaned to MAC have been
+ * returned before calling mac_unregister().
+ */
+ if (vif->vif_nrxbufs_onloan > 0) {
+ dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
+ "cannot detach", vif->vif_nrxbufs_onloan);
+ mutex_exit(&vif->vif_mutex);
return (DDI_FAILURE);
}
- if (sc->sc_rxloan > 0) {
- dev_err(devinfo, CE_WARN, "!Some rx buffers are still upstream,"
- " not detaching.");
+ if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
+ dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
return (DDI_FAILURE);
}
+ mac_free(vif->vif_macp);
- virtio_stop_vq_intr(sc->sc_rx_vq);
- virtio_stop_vq_intr(sc->sc_tx_vq);
+ /*
+ * Shut down the device so that we can recover any previously
+ * submitted receive buffers.
+ */
+ virtio_shutdown(vif->vif_virtio);
+ for (;;) {
+ virtio_chain_t *vic;
- virtio_release_ints(&sc->sc_virtio);
+ if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
+ break;
+ }
- if (mac_unregister(sc->sc_mac_handle)) {
- return (DDI_FAILURE);
+ vioif_rxbuf_t *rb = virtio_chain_data(vic);
+ vioif_rxbuf_free(vif, rb);
}
- mac_free(sc->sc_macp);
-
- vioif_free_mems(sc);
- virtio_free_vq(sc->sc_rx_vq);
- virtio_free_vq(sc->sc_tx_vq);
+ (void) virtio_fini(vif->vif_virtio, B_FALSE);
- virtio_device_reset(&sc->sc_virtio);
+ vioif_free_bufs(vif);
- ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
+ mutex_exit(&vif->vif_mutex);
+ mutex_destroy(&vif->vif_mutex);
- kmem_cache_destroy(sc->sc_rxbuf_cache);
- kstat_delete(sc->sc_intrstat);
- kmem_free(sc, sizeof (struct vioif_softc));
+ kmem_free(vif, sizeof (*vif));
return (DDI_SUCCESS);
}
static int
-vioif_quiesce(dev_info_t *devinfo)
+vioif_quiesce(dev_info_t *dip)
{
- struct vioif_softc *sc;
+ vioif_t *vif;
- if ((sc = ddi_get_driver_private(devinfo)) == NULL)
+ if ((vif = ddi_get_driver_private(dip)) == NULL)
return (DDI_FAILURE);
- virtio_stop_vq_intr(sc->sc_rx_vq);
- virtio_stop_vq_intr(sc->sc_tx_vq);
- virtio_device_reset(&sc->sc_virtio);
-
- return (DDI_SUCCESS);
+ return (virtio_quiesce(vif->vif_virtio));
}
int
_init(void)
{
- int ret = 0;
+ int ret;
- mac_init_ops(&vioif_ops, "vioif");
+ mac_init_ops(&vioif_dev_ops, "vioif");
- ret = mod_install(&modlinkage);
- if (ret != DDI_SUCCESS) {
- mac_fini_ops(&vioif_ops);
- return (ret);
+ if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
+ mac_fini_ops(&vioif_dev_ops);
}
- return (0);
+ return (ret);
}
int
@@ -2085,16 +1785,15 @@ _fini(void)
{
int ret;
- ret = mod_remove(&modlinkage);
- if (ret == DDI_SUCCESS) {
- mac_fini_ops(&vioif_ops);
+ if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
+ mac_fini_ops(&vioif_dev_ops);
}
return (ret);
}
int
-_info(struct modinfo *pModinfo)
+_info(struct modinfo *modinfop)
{
- return (mod_info(&modlinkage, pModinfo));
+ return (mod_info(&vioif_modlinkage, modinfop));
}
diff --git a/usr/src/uts/common/io/vioif/vioif.h b/usr/src/uts/common/io/vioif/vioif.h
new file mode 100644
index 0000000000..51dbc1acd4
--- /dev/null
+++ b/usr/src/uts/common/io/vioif/vioif.h
@@ -0,0 +1,432 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * VIRTIO NETWORK DRIVER
+ */
+
+#ifndef _VIOIF_H
+#define _VIOIF_H
+
+#include "virtio.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * VIRTIO NETWORK CONFIGURATION REGISTERS
+ *
+ * These are offsets into the device-specific configuration space available
+ * through the virtio_dev_*() family of functions.
+ */
+#define VIRTIO_NET_CONFIG_MAC 0x00 /* 48 R/W */
+#define VIRTIO_NET_CONFIG_STATUS 0x06 /* 16 R */
+#define VIRTIO_NET_CONFIG_MAX_VQ_PAIRS 0x08 /* 16 R */
+#define VIRTIO_NET_CONFIG_MTU 0x0A /* 16 R */
+
+/*
+ * VIRTIO NETWORK VIRTQUEUES
+ *
+ * Note that the control queue is only present if VIRTIO_NET_F_CTRL_VQ is
+ * negotiated with the device.
+ */
+#define VIRTIO_NET_VIRTQ_RX 0
+#define VIRTIO_NET_VIRTQ_TX 1
+#define VIRTIO_NET_VIRTQ_CONTROL 2
+
+/*
+ * VIRTIO NETWORK FEATURE BITS
+ */
+
+/*
+ * CSUM, GUEST_CSUM:
+ * Partial checksum support. These features signal that the device will
+ * accept packets with partial checksums (CSUM), and that the driver will
+ * accept packets with partial checksums (GUEST_CSUM). These features
+ * combine the use of the VIRTIO_NET_HDR_F_NEEDS_CSUM flag, and the
+ * "csum_start" and "csum_offset" fields, in the virtio net header.
+ */
+#define VIRTIO_NET_F_CSUM (1ULL << 0)
+#define VIRTIO_NET_F_GUEST_CSUM (1ULL << 1)
+
+/*
+ * MTU:
+ * The device offers a maximum MTU value at VIRTIO_NET_CONFIG_MTU. If
+ * this is not negotiated, we allow the largest possible MTU that our
+ * buffer allocations support in case jumbo frames are tacitly supported
+ * by the device. The default MTU is always 1500.
+ */
+#define VIRTIO_NET_F_MTU (1ULL << 3)
+
+/*
+ * MAC:
+ * The device has an assigned primary MAC address. If this feature bit is
+ * not set, the driver must provide a locally assigned MAC address. See
+ * IEEE 802, "48-bit universal LAN MAC addresses" for more details on
+ * assignment.
+ */
+#define VIRTIO_NET_F_MAC (1ULL << 5)
+
+/*
+ * GUEST_TSO4, GUEST_TSO6, GUEST_UFO:
+ * Inbound segmentation offload support. These features depend on having
+ * VIRTIO_NET_F_GUEST_CSUM and signal that the driver can accept large
+ * combined TCP (v4 or v6) packets, or reassembled UDP fragments.
+ */
+#define VIRTIO_NET_F_GUEST_TSO4 (1ULL << 7)
+#define VIRTIO_NET_F_GUEST_TSO6 (1ULL << 8)
+#define VIRTIO_NET_F_GUEST_UFO (1ULL << 10)
+
+/*
+ * GUEST_ECN:
+ * Depends on either VIRTIO_NET_F_GUEST_TSO4 or VIRTIO_NET_F_GUEST_TSO6.
+ * This feature means the driver will look for the VIRTIO_NET_HDR_GSO_ECN
+ * bit in the "gso_type" of the virtio net header. This bit tells the
+ * driver that the Explicit Congestion Notification (ECN) bit was set in
+ * the original TCP packets.
+ */
+#define VIRTIO_NET_F_GUEST_ECN (1ULL << 9)
+
+/*
+ * HOST_TSO4, HOST_TSO6, HOST_UFO:
+ * Outbound segmentation offload support. These features depend on having
+ * VIRTIO_NET_F_CSUM and signal that the device will accept large combined
+ * TCP (v4 or v6) packets that require segmentation offload, or large
+ * combined UDP packets that require fragmentation offload.
+ */
+#define VIRTIO_NET_F_HOST_TSO4 (1ULL << 11)
+#define VIRTIO_NET_F_HOST_TSO6 (1ULL << 12)
+#define VIRTIO_NET_F_HOST_UFO (1ULL << 14)
+
+/*
+ * HOST_ECN:
+ * Depends on either VIRTIO_NET_F_HOST_TSO4 or VIRTIO_NET_F_HOST_TSO6.
+ *	This feature means the device will accept packets that both require
+ * segmentation offload and have the Explicit Congestion Notification
+ * (ECN) bit set. If this feature is not present, the device must not
+ * send large segments that require ECN to be set.
+ */
+#define VIRTIO_NET_F_HOST_ECN (1ULL << 13)
+
+/*
+ * GSO:
+ * The GSO feature is, in theory, the combination of HOST_TSO4, HOST_TSO6,
+ * and HOST_ECN. This is only useful for legacy devices; newer devices
+ * should be using the more specific bits above.
+ */
+#define VIRTIO_NET_F_GSO (1ULL << 6)
+
+/*
+ * MRG_RXBUF:
+ * This feature allows the receipt of large packets without needing to
+ * allocate large buffers. The "virtio_net_hdr" will include an extra
+ * value: the number of buffers to gang together.
+ */
+#define VIRTIO_NET_F_MRG_RXBUF (1ULL << 15)
+
+/*
+ * STATUS:
+ * The VIRTIO_NET_CONFIG_STATUS configuration register is available, which
+ * allows the driver to read the link state from the device.
+ */
+#define VIRTIO_NET_F_STATUS (1ULL << 16)
+
+/*
+ * CTRL_VQ, CTRL_RX, CTRL_VLAN:
+ * These features signal that the device exposes the control queue
+ * (VIRTIO_NET_VIRTQ_CONTROL), in the case of CTRL_VQ; and that the
+ * control queue supports extra commands (CTRL_RX, CTRL_VLAN).
+ */
+#define VIRTIO_NET_F_CTRL_VQ (1ULL << 17)
+#define VIRTIO_NET_F_CTRL_RX (1ULL << 18)
+#define VIRTIO_NET_F_CTRL_VLAN (1ULL << 19)
+#define VIRTIO_NET_F_CTRL_RX_EXTRA (1ULL << 20)
+
+/*
+ * These features are supported by the driver and we will request them from the
+ * device. Note that we do not currently request GUEST_CSUM, as the driver
+ * does not presently support receiving frames with any offload features from
+ * the device.
+ */
+#define VIRTIO_NET_WANTED_FEATURES (VIRTIO_NET_F_CSUM | \
+ VIRTIO_NET_F_GSO | \
+ VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_ECN | \
+ VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_MTU)
+
+/*
+ * VIRTIO NETWORK HEADER
+ *
+ * This structure appears at the start of each transmit or receive packet
+ * buffer.
+ */
+struct virtio_net_hdr {
+ uint8_t vnh_flags;
+ uint8_t vnh_gso_type;
+ uint16_t vnh_hdr_len;
+ uint16_t vnh_gso_size;
+ uint16_t vnh_csum_start;
+ uint16_t vnh_csum_offset;
+} __packed;
+
+/*
+ * VIRTIO NETWORK HEADER: FLAGS (vnh_flags)
+ */
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 0x01
+
+/*
+ * VIRTIO NETWORK HEADER: OFFLOAD OPTIONS (vnh_gso_type)
+ *
+ * Each of these is an offload type, except for the ECN value which is
+ * logically OR-ed with one of the other types.
+ */
+#define VIRTIO_NET_HDR_GSO_NONE 0
+#define VIRTIO_NET_HDR_GSO_TCPV4 1
+#define VIRTIO_NET_HDR_GSO_UDP 3
+#define VIRTIO_NET_HDR_GSO_TCPV6 4
+#define VIRTIO_NET_HDR_GSO_ECN 0x80
+
+
+/*
+ * DRIVER PARAMETERS
+ */
+
+/*
+ * At attach, we allocate a fixed pool of buffers for receipt and transmission
+ * of frames. The maximum number of buffers of each type that we will allocate
+ * is specified here. If the ring size is smaller than this number, we will
+ * use the ring size instead.
+ */
+#define VIRTIO_NET_TX_BUFS 256
+#define VIRTIO_NET_RX_BUFS 256
+
+/*
+ * The virtio net header and the first buffer segment share the same DMA
+ * allocation. We round up the virtio header size to a multiple of 4 and add 2
+ * bytes so that the IP header, which starts immediately after the 14 or 18
+ * byte Ethernet header, is then correctly aligned:
+ *
+ * 0 10 16 18 32/36
+ * | virtio_net_hdr | %4==0 | +2 | Ethernet header (14/18 bytes) | IPv4 ...
+ *
+ * Note that for this to work correctly, the DMA allocation must also be 4 byte
+ * aligned.
+ */
+#define VIOIF_HEADER_ALIGN 4
+#define VIOIF_HEADER_SKIP (P2ROUNDUP( \
+ sizeof (struct virtio_net_hdr), \
+ VIOIF_HEADER_ALIGN) + 2)
+
+/*
+ * Given we are not negotiating VIRTIO_NET_F_MRG_RXBUF, the specification says
+ * we must be able to accept a 1514 byte packet, or if any segmentation offload
+ * features have been negotiated a 65550 byte packet. To keep things simple,
+ * we'll assume segmentation offload is possible in most cases. In addition to
+ * the packet payload, we need to account for the Ethernet header and the
+ * virtio_net_hdr.
+ */
+#define VIOIF_RX_DATA_SIZE 65550
+#define VIOIF_RX_BUF_SIZE (VIOIF_RX_DATA_SIZE + \
+ sizeof (struct ether_header) + \
+ VIOIF_HEADER_SKIP)
+
+/*
+ * If we assume that a large allocation will probably have mostly 4K page sized
+ * cookies, 64 segments allows us 256KB for a single frame. We're in control
+ * of the allocation we use for receive buffers, so this value only has an
+ * impact on the length of chain we're able to create for external transmit
+ * buffer mappings.
+ */
+#define VIOIF_MAX_SEGS 64
+
+/*
+ * We pre-allocate a reasonably large buffer into which small packets
+ * are copied.  Bigger packets are mapped directly, and packets with
+ * multiple cookies are mapped as indirect buffers.
+ */
+#define VIOIF_TX_INLINE_SIZE (2 * 1024)
+
+
+/*
+ * TYPE DEFINITIONS
+ */
+
+typedef struct vioif vioif_t;
+
+/*
+ * Receive buffers are allocated in advance as a combination of DMA memory and
+ * a descriptor chain. Receive buffers can be loaned to the networking stack
+ * to avoid copying, and this object contains the free routine to pass to
+ * desballoc().
+ *
+ * When receive buffers are not in use, they are linked into the per-instance
+ * free list, "vif_rxbufs" via "rb_link". Under normal conditions, we expect
+ * the free list to be empty much of the time; most buffers will be in the ring
+ * or on loan.
+ */
+typedef struct vioif_rxbuf {
+ vioif_t *rb_vioif;
+ frtn_t rb_frtn;
+
+ virtio_dma_t *rb_dma;
+ virtio_chain_t *rb_chain;
+
+ list_node_t rb_link;
+} vioif_rxbuf_t;
+
+/*
+ * Transmit buffers are also allocated in advance. DMA memory is allocated for
+ * the virtio net header, and to hold small packets. Larger packets are mapped
+ * from storage loaned to the driver by the network stack.
+ *
+ * When transmit buffers are not in use, they are linked into the per-instance
+ * free list, "vif_txbufs" via "tb_link".
+ */
+typedef struct vioif_txbuf {
+ mblk_t *tb_mp;
+
+ /*
+ * Inline buffer space (VIOIF_TX_INLINE_SIZE) for storage of the virtio
+ * net header, and to hold copied (rather than mapped) packet data.
+ */
+ virtio_dma_t *tb_dma;
+ virtio_chain_t *tb_chain;
+
+ /*
+ * External buffer mapping. The capacity is fixed at allocation time,
+ * and "tb_ndmaext" tracks the current number of mappings.
+ */
+ virtio_dma_t **tb_dmaext;
+ uint_t tb_dmaext_capacity;
+ uint_t tb_ndmaext;
+
+ list_node_t tb_link;
+} vioif_txbuf_t;
+
+typedef enum vioif_runstate {
+ VIOIF_RUNSTATE_STOPPED = 1,
+ VIOIF_RUNSTATE_STOPPING,
+ VIOIF_RUNSTATE_RUNNING
+} vioif_runstate_t;
+
+/*
+ * Per-instance driver object.
+ */
+struct vioif {
+ dev_info_t *vif_dip;
+ virtio_t *vif_virtio;
+
+ kmutex_t vif_mutex;
+
+ /*
+ * The NIC is considered RUNNING between the mc_start(9E) and
+ * mc_stop(9E) calls. Otherwise it is STOPPING (while draining
+ * resources) then STOPPED. When not RUNNING, we will drop incoming
+ * frames and refuse to insert more receive buffers into the receive
+ * queue.
+ */
+ vioif_runstate_t vif_runstate;
+
+ mac_handle_t vif_mac_handle;
+ mac_register_t *vif_macp;
+
+ virtio_queue_t *vif_rx_vq;
+ virtio_queue_t *vif_tx_vq;
+
+ /* TX virtqueue management resources */
+ boolean_t vif_tx_corked;
+ boolean_t vif_tx_drain;
+ timeout_id_t vif_tx_reclaim_tid;
+
+ /*
+ * Configured offload features:
+ */
+ unsigned int vif_tx_csum:1;
+ unsigned int vif_tx_tso4:1;
+
+ /*
+ * For debugging, it is useful to know whether the MAC address we
+ * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
+ * was otherwise generated or set from within the guest.
+ */
+ unsigned int vif_mac_from_host:1;
+
+ uint_t vif_mtu;
+ uint_t vif_mtu_max;
+ uint8_t vif_mac[ETHERADDRL];
+
+ /*
+ * Receive buffer free list and accounting:
+ */
+ list_t vif_rxbufs;
+ uint_t vif_nrxbufs_alloc;
+ uint_t vif_nrxbufs_onloan;
+ uint_t vif_nrxbufs_onloan_max;
+ uint_t vif_rxbufs_capacity;
+ vioif_rxbuf_t *vif_rxbufs_mem;
+
+ /*
+ * Transmit buffer free list and accounting:
+ */
+ list_t vif_txbufs;
+ uint_t vif_ntxbufs_alloc;
+ uint_t vif_txbufs_capacity;
+ vioif_txbuf_t *vif_txbufs_mem;
+
+ /*
+ * These copy size thresholds are exposed as private MAC properties so
+ * that they can be tuned without rebooting.
+ */
+ uint_t vif_rxcopy_thresh;
+ uint_t vif_txcopy_thresh;
+
+ /*
+ * Statistics visible through mac:
+ */
+ uint64_t vif_ipackets;
+ uint64_t vif_opackets;
+ uint64_t vif_rbytes;
+ uint64_t vif_obytes;
+ uint64_t vif_brdcstxmt;
+ uint64_t vif_brdcstrcv;
+ uint64_t vif_multixmt;
+ uint64_t vif_multircv;
+ uint64_t vif_norecvbuf;
+ uint64_t vif_notxbuf;
+ uint64_t vif_ierrors;
+ uint64_t vif_oerrors;
+
+ /*
+ * Internal debugging statistics:
+ */
+ uint64_t vif_rxfail_dma_handle;
+ uint64_t vif_rxfail_dma_buffer;
+ uint64_t vif_rxfail_dma_bind;
+ uint64_t vif_rxfail_chain_undersize;
+ uint64_t vif_rxfail_no_descriptors;
+ uint64_t vif_txfail_dma_handle;
+ uint64_t vif_txfail_dma_bind;
+ uint64_t vif_txfail_indirect_limit;
+
+ uint64_t vif_stat_tx_reclaim;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIOIF_H */
diff --git a/usr/src/uts/common/io/virtio/virtio.c b/usr/src/uts/common/io/virtio/virtio.c
deleted file mode 100644
index 19a66b8f38..0000000000
--- a/usr/src/uts/common/io/virtio/virtio.c
+++ /dev/null
@@ -1,1364 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com>
- * Copyright (c) 2016 by Delphix. All rights reserved.
- * Copyright 2017 Joyent, Inc.
- */
-
-/* Based on the NetBSD virtio driver by Minoura Makoto. */
-/*
- * Copyright (c) 2010 Minoura Makoto.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <sys/conf.h>
-#include <sys/kmem.h>
-#include <sys/debug.h>
-#include <sys/modctl.h>
-#include <sys/autoconf.h>
-#include <sys/ddi_impldefs.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/sunndi.h>
-#include <sys/avintr.h>
-#include <sys/spl.h>
-#include <sys/promif.h>
-#include <sys/list.h>
-#include <sys/bootconf.h>
-#include <sys/bootsvcs.h>
-#include <sys/sysmacros.h>
-#include <sys/pci.h>
-
-#include "virtiovar.h"
-#include "virtioreg.h"
-
-#define NDEVNAMES (sizeof (virtio_device_name) / sizeof (char *))
-#define MINSEG_INDIRECT 2 /* use indirect if nsegs >= this value */
-#define VIRTQUEUE_ALIGN(n) (((n)+(VIRTIO_PAGE_SIZE-1)) & \
- ~(VIRTIO_PAGE_SIZE-1))
-
-void
-virtio_set_status(struct virtio_softc *sc, unsigned int status)
-{
- int old = 0;
-
- if (status != 0) {
- old = ddi_get8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_DEVICE_STATUS));
- }
-
- ddi_put8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_DEVICE_STATUS), status | old);
-}
-
-/*
- * Negotiate features, save the result in sc->sc_features
- */
-uint32_t
-virtio_negotiate_features(struct virtio_softc *sc, uint32_t guest_features)
-{
- uint32_t host_features;
- uint32_t features;
-
- host_features = ddi_get32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_DEVICE_FEATURES));
-
- dev_debug(sc->sc_dev, CE_NOTE, "host features: %x, guest features: %x",
- host_features, guest_features);
-
- features = host_features & guest_features;
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_GUEST_FEATURES),
- features);
-
- sc->sc_features = features;
-
- return (host_features);
-}
-
-size_t
-virtio_show_features(uint32_t features, char *buf, size_t len)
-{
- char *orig_buf = buf;
- char *bufend = buf + len;
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- buf += snprintf(buf, bufend - buf, "Generic ( ");
- if (features & VIRTIO_F_RING_INDIRECT_DESC)
- /* LINTED E_PTRDIFF_OVERFLOW */
- buf += snprintf(buf, bufend - buf, "INDIRECT_DESC ");
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- buf += snprintf(buf, bufend - buf, ") ");
-
- /* LINTED E_PTRDIFF_OVERFLOW */
- return (buf - orig_buf);
-}
-
-boolean_t
-virtio_has_feature(struct virtio_softc *sc, uint32_t feature)
-{
- return (sc->sc_features & feature);
-}
-
-/*
- * Device configuration registers.
- */
-uint8_t
-virtio_read_device_config_1(struct virtio_softc *sc, unsigned int index)
-{
- ASSERT(sc->sc_config_offset);
- return ddi_get8(sc->sc_ioh,
- (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index));
-}
-
-uint16_t
-virtio_read_device_config_2(struct virtio_softc *sc, unsigned int index)
-{
- ASSERT(sc->sc_config_offset);
- return ddi_get16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index));
-}
-
-uint32_t
-virtio_read_device_config_4(struct virtio_softc *sc, unsigned int index)
-{
- ASSERT(sc->sc_config_offset);
- return ddi_get32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index));
-}
-
-uint64_t
-virtio_read_device_config_8(struct virtio_softc *sc, unsigned int index)
-{
- uint64_t r;
-
- ASSERT(sc->sc_config_offset);
- r = ddi_get32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset +
- index + sizeof (uint32_t)));
-
- r <<= 32;
-
- r += ddi_get32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index));
- return (r);
-}
-
-void
-virtio_write_device_config_1(struct virtio_softc *sc, unsigned int index,
- uint8_t value)
-{
- ASSERT(sc->sc_config_offset);
- ddi_put8(sc->sc_ioh,
- (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value);
-}
-
-void
-virtio_write_device_config_2(struct virtio_softc *sc, unsigned int index,
- uint16_t value)
-{
- ASSERT(sc->sc_config_offset);
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value);
-}
-
-void
-virtio_write_device_config_4(struct virtio_softc *sc, unsigned int index,
- uint32_t value)
-{
- ASSERT(sc->sc_config_offset);
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value);
-}
-
-void
-virtio_write_device_config_8(struct virtio_softc *sc, unsigned int index,
- uint64_t value)
-{
- ASSERT(sc->sc_config_offset);
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index),
- value & 0xFFFFFFFF);
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset +
- index + sizeof (uint32_t)), value >> 32);
-}
-
-/*
- * Start/stop vq interrupt. No guarantee.
- */
-void
-virtio_stop_vq_intr(struct virtqueue *vq)
-{
- vq->vq_avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
-}
-
-void
-virtio_start_vq_intr(struct virtqueue *vq)
-{
- vq->vq_avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
-}
-
-static ddi_dma_attr_t virtio_vq_dma_attr = {
- DMA_ATTR_V0, /* Version number */
- 0, /* low address */
- 0x00000FFFFFFFFFFF, /* high address. Has to fit into 32 bits */
- /* after page-shifting */
- 0xFFFFFFFF, /* counter register max */
- VIRTIO_PAGE_SIZE, /* page alignment required */
- 0x3F, /* burst sizes: 1 - 32 */
- 0x1, /* minimum transfer size */
- 0xFFFFFFFF, /* max transfer size */
- 0xFFFFFFFF, /* address register max */
- 1, /* no scatter-gather */
- 1, /* device operates on bytes */
- 0, /* attr flag: set to 0 */
-};
-
-static ddi_dma_attr_t virtio_vq_indirect_dma_attr = {
- DMA_ATTR_V0, /* Version number */
- 0, /* low address */
- 0xFFFFFFFFFFFFFFFF, /* high address */
- 0xFFFFFFFF, /* counter register max */
- 1, /* No specific alignment */
- 0x3F, /* burst sizes: 1 - 32 */
- 0x1, /* minimum transfer size */
- 0xFFFFFFFF, /* max transfer size */
- 0xFFFFFFFF, /* address register max */
- 1, /* no scatter-gather */
- 1, /* device operates on bytes */
- 0, /* attr flag: set to 0 */
-};
-
-/* Same for direct and indirect descriptors. */
-static ddi_device_acc_attr_t virtio_vq_devattr = {
- DDI_DEVICE_ATTR_V0,
- DDI_NEVERSWAP_ACC,
- DDI_STORECACHING_OK_ACC,
- DDI_DEFAULT_ACC
-};
-
-static void
-virtio_free_indirect(struct vq_entry *entry)
-{
-
- (void) ddi_dma_unbind_handle(entry->qe_indirect_dma_handle);
- ddi_dma_mem_free(&entry->qe_indirect_dma_acch);
- ddi_dma_free_handle(&entry->qe_indirect_dma_handle);
-
- entry->qe_indirect_descs = NULL;
-}
-
-
-static int
-virtio_alloc_indirect(struct virtio_softc *sc, struct vq_entry *entry)
-{
- int allocsize, num;
- size_t len;
- unsigned int ncookies;
- int ret;
-
- num = entry->qe_queue->vq_indirect_num;
- ASSERT(num > 1);
-
- allocsize = sizeof (struct vring_desc) * num;
-
- ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_indirect_dma_attr,
- DDI_DMA_SLEEP, NULL, &entry->qe_indirect_dma_handle);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate dma handle for indirect descriptors, "
- "entry %d, vq %d", entry->qe_index,
- entry->qe_queue->vq_index);
- goto out_alloc_handle;
- }
-
- ret = ddi_dma_mem_alloc(entry->qe_indirect_dma_handle, allocsize,
- &virtio_vq_devattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
- (caddr_t *)&entry->qe_indirect_descs, &len,
- &entry->qe_indirect_dma_acch);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate dma memory for indirect descriptors, "
- "entry %d, vq %d,", entry->qe_index,
- entry->qe_queue->vq_index);
- goto out_alloc;
- }
-
- (void) memset(entry->qe_indirect_descs, 0xff, allocsize);
-
- ret = ddi_dma_addr_bind_handle(entry->qe_indirect_dma_handle, NULL,
- (caddr_t)entry->qe_indirect_descs, len,
- DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
- &entry->qe_indirect_dma_cookie, &ncookies);
- if (ret != DDI_DMA_MAPPED) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to bind dma memory for indirect descriptors, "
- "entry %d, vq %d", entry->qe_index,
- entry->qe_queue->vq_index);
- goto out_bind;
- }
-
- /* We asked for a single segment */
- ASSERT(ncookies == 1);
-
- return (0);
-
-out_bind:
- ddi_dma_mem_free(&entry->qe_indirect_dma_acch);
-out_alloc:
- ddi_dma_free_handle(&entry->qe_indirect_dma_handle);
-out_alloc_handle:
-
- return (ret);
-}
-
-/*
- * Initialize the vq structure.
- */
-static int
-virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq)
-{
- int ret;
- uint16_t i;
- int vq_size = vq->vq_num;
- int indirect_num = vq->vq_indirect_num;
-
- /* free slot management */
- list_create(&vq->vq_freelist, sizeof (struct vq_entry),
- offsetof(struct vq_entry, qe_list));
-
- for (i = 0; i < vq_size; i++) {
- struct vq_entry *entry = &vq->vq_entries[i];
- list_insert_tail(&vq->vq_freelist, entry);
- entry->qe_index = i;
- entry->qe_desc = &vq->vq_descs[i];
- entry->qe_queue = vq;
-
- if (indirect_num) {
- ret = virtio_alloc_indirect(sc, entry);
- if (ret)
- goto out_indirect;
- }
- }
-
- mutex_init(&vq->vq_freelist_lock, "virtio-freelist", MUTEX_DRIVER,
- DDI_INTR_PRI(sc->sc_intr_prio));
- mutex_init(&vq->vq_avail_lock, "virtio-avail", MUTEX_DRIVER,
- DDI_INTR_PRI(sc->sc_intr_prio));
- mutex_init(&vq->vq_used_lock, "virtio-used", MUTEX_DRIVER,
- DDI_INTR_PRI(sc->sc_intr_prio));
-
- return (0);
-
-out_indirect:
- for (i = 0; i < vq_size; i++) {
- struct vq_entry *entry = &vq->vq_entries[i];
- if (entry->qe_indirect_descs)
- virtio_free_indirect(entry);
- }
-
- return (ret);
-}
-
-/*
- * Allocate/free a vq.
- */
-struct virtqueue *
-virtio_alloc_vq(struct virtio_softc *sc, unsigned int index, unsigned int size,
- unsigned int indirect_num, const char *name)
-{
- int vq_size, allocsize1, allocsize2, allocsize = 0;
- int ret;
- unsigned int ncookies;
- size_t len;
- struct virtqueue *vq;
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), index);
- vq_size = ddi_get16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SIZE));
- if (vq_size == 0) {
- dev_err(sc->sc_dev, CE_WARN,
- "virtqueue dest not exist, index %d for %s\n", index, name);
- goto out;
- }
-
- vq = kmem_zalloc(sizeof (struct virtqueue), KM_SLEEP);
-
- /* size 0 => use native vq size, good for receive queues. */
- if (size)
- vq_size = MIN(vq_size, size);
-
- /* allocsize1: descriptor table + avail ring + pad */
- allocsize1 = VIRTQUEUE_ALIGN(sizeof (struct vring_desc) * vq_size +
- sizeof (struct vring_avail) + sizeof (uint16_t) * vq_size);
- /* allocsize2: used ring + pad */
- allocsize2 = VIRTQUEUE_ALIGN(sizeof (struct vring_used) +
- sizeof (struct vring_used_elem) * vq_size);
-
- allocsize = allocsize1 + allocsize2;
-
- ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_dma_attr,
- DDI_DMA_SLEEP, NULL, &vq->vq_dma_handle);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate dma handle for vq %d", index);
- goto out_alloc_handle;
- }
-
- ret = ddi_dma_mem_alloc(vq->vq_dma_handle, allocsize,
- &virtio_vq_devattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
- (caddr_t *)&vq->vq_vaddr, &len, &vq->vq_dma_acch);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate dma memory for vq %d", index);
- goto out_alloc;
- }
-
- ret = ddi_dma_addr_bind_handle(vq->vq_dma_handle, NULL,
- (caddr_t)vq->vq_vaddr, len, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
- DDI_DMA_SLEEP, NULL, &vq->vq_dma_cookie, &ncookies);
- if (ret != DDI_DMA_MAPPED) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to bind dma memory for vq %d", index);
- goto out_bind;
- }
-
- /* We asked for a single segment */
- ASSERT(ncookies == 1);
- /* and page-aligned buffers. */
- ASSERT(vq->vq_dma_cookie.dmac_laddress % VIRTIO_PAGE_SIZE == 0);
-
- (void) memset(vq->vq_vaddr, 0, allocsize);
-
- /* Make sure all zeros hit the buffer before we point the host to it */
- membar_producer();
-
- /* set the vq address */
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS),
- (vq->vq_dma_cookie.dmac_laddress / VIRTIO_PAGE_SIZE));
-
- /* remember addresses and offsets for later use */
- vq->vq_owner = sc;
- vq->vq_num = vq_size;
- vq->vq_index = index;
- vq->vq_descs = vq->vq_vaddr;
- vq->vq_availoffset = sizeof (struct vring_desc)*vq_size;
- vq->vq_avail = (void *)(((char *)vq->vq_descs) + vq->vq_availoffset);
- vq->vq_usedoffset = allocsize1;
- vq->vq_used = (void *)(((char *)vq->vq_descs) + vq->vq_usedoffset);
-
- ASSERT(indirect_num == 0 ||
- virtio_has_feature(sc, VIRTIO_F_RING_INDIRECT_DESC));
- vq->vq_indirect_num = indirect_num;
-
- /* free slot management */
- vq->vq_entries = kmem_zalloc(sizeof (struct vq_entry) * vq_size,
- KM_SLEEP);
-
- ret = virtio_init_vq(sc, vq);
- if (ret)
- goto out_init;
-
- dev_debug(sc->sc_dev, CE_NOTE,
- "Allocated %d entries for vq %d:%s (%d indirect descs)",
- vq_size, index, name, indirect_num * vq_size);
-
- return (vq);
-
-out_init:
- kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq_size);
- (void) ddi_dma_unbind_handle(vq->vq_dma_handle);
-out_bind:
- ddi_dma_mem_free(&vq->vq_dma_acch);
-out_alloc:
- ddi_dma_free_handle(&vq->vq_dma_handle);
-out_alloc_handle:
- kmem_free(vq, sizeof (struct virtqueue));
-out:
- return (NULL);
-}
-
-void
-virtio_free_vq(struct virtqueue *vq)
-{
- struct virtio_softc *sc = vq->vq_owner;
- int i;
-
- /* tell device that there's no virtqueue any longer */
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT),
- vq->vq_index);
- ddi_put32(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS), 0);
-
- /* Free the indirect descriptors, if any. */
- for (i = 0; i < vq->vq_num; i++) {
- struct vq_entry *entry = &vq->vq_entries[i];
- if (entry->qe_indirect_descs)
- virtio_free_indirect(entry);
- }
-
- kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq->vq_num);
-
- (void) ddi_dma_unbind_handle(vq->vq_dma_handle);
- ddi_dma_mem_free(&vq->vq_dma_acch);
- ddi_dma_free_handle(&vq->vq_dma_handle);
-
- mutex_destroy(&vq->vq_used_lock);
- mutex_destroy(&vq->vq_avail_lock);
- mutex_destroy(&vq->vq_freelist_lock);
-
- kmem_free(vq, sizeof (struct virtqueue));
-}
-
-/*
- * Free descriptor management.
- */
-struct vq_entry *
-vq_alloc_entry(struct virtqueue *vq)
-{
- struct vq_entry *qe;
-
- mutex_enter(&vq->vq_freelist_lock);
- if (list_is_empty(&vq->vq_freelist)) {
- mutex_exit(&vq->vq_freelist_lock);
- return (NULL);
- }
- qe = list_remove_head(&vq->vq_freelist);
-
- ASSERT(vq->vq_used_entries >= 0);
- vq->vq_used_entries++;
-
- mutex_exit(&vq->vq_freelist_lock);
-
- qe->qe_next = NULL;
- qe->qe_indirect_next = 0;
- (void) memset(qe->qe_desc, 0, sizeof (struct vring_desc));
-
- return (qe);
-}
-
-void
-vq_free_entry(struct virtqueue *vq, struct vq_entry *qe)
-{
- mutex_enter(&vq->vq_freelist_lock);
-
- list_insert_head(&vq->vq_freelist, qe);
- vq->vq_used_entries--;
- ASSERT(vq->vq_used_entries >= 0);
- mutex_exit(&vq->vq_freelist_lock);
-}
-
-/*
- * We (intentionally) don't have a global vq mutex, so you are
- * responsible for external locking to avoid allocating/freeing any
- * entries before using the returned value. Have fun.
- */
-uint_t
-vq_num_used(struct virtqueue *vq)
-{
- /* vq->vq_freelist_lock would not help here. */
- return (vq->vq_used_entries);
-}
-
-static inline void
-virtio_ve_set_desc(struct vring_desc *desc, uint64_t paddr, uint32_t len,
- boolean_t write)
-{
- desc->addr = paddr;
- desc->len = len;
- desc->next = 0;
- desc->flags = 0;
-
- /* 'write' - from the driver's point of view */
- if (!write)
- desc->flags = VRING_DESC_F_WRITE;
-}
-
-void
-virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len,
- boolean_t write)
-{
- virtio_ve_set_desc(qe->qe_desc, paddr, len, write);
-}
-
-unsigned int
-virtio_ve_indirect_available(struct vq_entry *qe)
-{
- return (qe->qe_queue->vq_indirect_num - qe->qe_indirect_next);
-}
-
-void
-virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr, uint32_t len,
- boolean_t write)
-{
- struct vring_desc *indirect_desc;
-
- ASSERT(qe->qe_queue->vq_indirect_num);
- ASSERT(qe->qe_indirect_next < qe->qe_queue->vq_indirect_num);
-
- indirect_desc = &qe->qe_indirect_descs[qe->qe_indirect_next];
- virtio_ve_set_desc(indirect_desc, paddr, len, write);
- qe->qe_indirect_next++;
-}
-
-void
-virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle,
- ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write)
-{
- int i;
-
- for (i = 0; i < ncookies; i++) {
- virtio_ve_add_indirect_buf(qe, dma_cookie.dmac_laddress,
- dma_cookie.dmac_size, write);
- ddi_dma_nextcookie(dma_handle, &dma_cookie);
- }
-}
-
-void
-virtio_sync_vq(struct virtqueue *vq)
-{
- struct virtio_softc *vsc = vq->vq_owner;
-
- /* Make sure the avail ring update hit the buffer */
- membar_producer();
-
- vq->vq_avail->idx = vq->vq_avail_idx;
-
- /* Make sure the avail idx update hits the buffer */
- membar_producer();
-
- /* Make sure we see the flags update */
- membar_consumer();
-
- if (!(vq->vq_used->flags & VRING_USED_F_NO_NOTIFY)) {
- ddi_put16(vsc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(vsc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_NOTIFY),
- vq->vq_index);
- }
-}
-
-void
-virtio_push_chain(struct vq_entry *qe, boolean_t sync)
-{
- struct virtqueue *vq = qe->qe_queue;
- struct vq_entry *head = qe;
- struct vring_desc *desc;
- int idx;
-
- ASSERT(qe);
-
- /*
- * Bind the descs together, paddr and len should be already
- * set with virtio_ve_set
- */
- do {
- /* Bind the indirect descriptors */
- if (qe->qe_indirect_next > 1) {
- uint16_t i = 0;
-
- /*
- * Set the pointer/flags to the
- * first indirect descriptor
- */
- virtio_ve_set_desc(qe->qe_desc,
- qe->qe_indirect_dma_cookie.dmac_laddress,
- sizeof (struct vring_desc) * qe->qe_indirect_next,
- B_FALSE);
- qe->qe_desc->flags |= VRING_DESC_F_INDIRECT;
-
- /* For all but the last one, add the next index/flag */
- do {
- desc = &qe->qe_indirect_descs[i];
- i++;
-
- desc->flags |= VRING_DESC_F_NEXT;
- desc->next = i;
- } while (i < qe->qe_indirect_next - 1);
-
- }
-
- if (qe->qe_next) {
- qe->qe_desc->flags |= VRING_DESC_F_NEXT;
- qe->qe_desc->next = qe->qe_next->qe_index;
- }
-
- qe = qe->qe_next;
- } while (qe);
-
- mutex_enter(&vq->vq_avail_lock);
- idx = vq->vq_avail_idx;
- vq->vq_avail_idx++;
-
- /* Make sure the bits hit the descriptor(s) */
- membar_producer();
- vq->vq_avail->ring[idx % vq->vq_num] = head->qe_index;
-
- /* Notify the device, if needed. */
- if (sync)
- virtio_sync_vq(vq);
-
- mutex_exit(&vq->vq_avail_lock);
-}
-
-/*
- * Get a chain of descriptors from the used ring, if one is available.
- */
-struct vq_entry *
-virtio_pull_chain(struct virtqueue *vq, uint32_t *len)
-{
- struct vq_entry *head;
- int slot;
- int usedidx;
-
- mutex_enter(&vq->vq_used_lock);
-
- /* No used entries? Bye. */
- if (vq->vq_used_idx == vq->vq_used->idx) {
- mutex_exit(&vq->vq_used_lock);
- return (NULL);
- }
-
- usedidx = vq->vq_used_idx;
- vq->vq_used_idx++;
- mutex_exit(&vq->vq_used_lock);
-
- usedidx %= vq->vq_num;
-
- /* Make sure we do the next step _after_ checking the idx. */
- membar_consumer();
-
- slot = vq->vq_used->ring[usedidx].id;
- *len = vq->vq_used->ring[usedidx].len;
-
- head = &vq->vq_entries[slot];
-
- return (head);
-}
-
-void
-virtio_free_chain(struct vq_entry *qe)
-{
- struct vq_entry *tmp;
- struct virtqueue *vq = qe->qe_queue;
-
- ASSERT(qe);
-
- do {
- ASSERT(qe->qe_queue == vq);
- tmp = qe->qe_next;
- vq_free_entry(vq, qe);
- qe = tmp;
- } while (tmp != NULL);
-}
-
-void
-virtio_ventry_stick(struct vq_entry *first, struct vq_entry *second)
-{
- first->qe_next = second;
-}
-
-static int
-virtio_register_msi(struct virtio_softc *sc,
- struct virtio_int_handler *config_handler,
- struct virtio_int_handler vq_handlers[], int intr_types)
-{
- int count, actual;
- int int_type;
- int i;
- int handler_count;
- int ret;
-
- /* If both MSI and MSI-x are reported, prefer MSI-x. */
- int_type = DDI_INTR_TYPE_MSI;
- if (intr_types & DDI_INTR_TYPE_MSIX)
- int_type = DDI_INTR_TYPE_MSIX;
-
- /* Walk the handler table to get the number of handlers. */
- for (handler_count = 0;
- vq_handlers && vq_handlers[handler_count].vh_func;
- handler_count++)
- ;
-
- /* +1 if there is a config change handler. */
- if (config_handler != NULL)
- handler_count++;
-
- /* Number of MSIs supported by the device. */
- ret = ddi_intr_get_nintrs(sc->sc_dev, int_type, &count);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_nintrs failed");
- return (ret);
- }
-
- /*
- * Those who try to register more handlers than the device
- * supports shall suffer.
- */
- ASSERT(handler_count <= count);
-
- sc->sc_intr_htable = kmem_zalloc(sizeof (ddi_intr_handle_t) *
- handler_count, KM_SLEEP);
-
- ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable, int_type, 0,
- handler_count, &actual, DDI_INTR_ALLOC_NORMAL);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "Failed to allocate MSI: %d", ret);
- goto out_msi_alloc;
- }
-
- if (actual != handler_count) {
- dev_err(sc->sc_dev, CE_WARN,
- "Not enough MSI available: need %d, available %d",
- handler_count, actual);
- goto out_msi_available;
- }
-
- sc->sc_intr_num = handler_count;
- sc->sc_intr_config = B_FALSE;
- if (config_handler != NULL) {
- sc->sc_intr_config = B_TRUE;
- }
-
- /* Assume they are all the same priority */
- ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed");
- goto out_msi_prio;
- }
-
- /* Add the vq handlers */
- for (i = 0; vq_handlers[i].vh_func; i++) {
- ret = ddi_intr_add_handler(sc->sc_intr_htable[i],
- vq_handlers[i].vh_func, sc, vq_handlers[i].vh_priv);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "ddi_intr_add_handler failed");
- /* Remove the handlers that succeeded. */
- while (--i >= 0) {
- (void) ddi_intr_remove_handler(
- sc->sc_intr_htable[i]);
- }
- goto out_add_handlers;
- }
- }
-
- /* Don't forget the config handler */
- if (config_handler != NULL) {
- ret = ddi_intr_add_handler(sc->sc_intr_htable[i],
- config_handler->vh_func, sc, config_handler->vh_priv);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "ddi_intr_add_handler failed");
- /* Remove the handlers that succeeded. */
- while (--i >= 0) {
- (void) ddi_intr_remove_handler(
- sc->sc_intr_htable[i]);
- }
- goto out_add_handlers;
- }
- }
-
- ret = ddi_intr_get_cap(sc->sc_intr_htable[0], &sc->sc_intr_cap);
- if (ret == DDI_SUCCESS) {
- sc->sc_int_type = int_type;
- return (DDI_SUCCESS);
- }
-
-out_add_handlers:
-out_msi_prio:
-out_msi_available:
- for (i = 0; i < actual; i++)
- (void) ddi_intr_free(sc->sc_intr_htable[i]);
-out_msi_alloc:
- kmem_free(sc->sc_intr_htable,
- sizeof (ddi_intr_handle_t) * handler_count);
-
- return (ret);
-}
-
-struct virtio_handler_container {
- int nhandlers;
- struct virtio_int_handler config_handler;
- struct virtio_int_handler vq_handlers[];
-};
-
-uint_t
-virtio_intx_dispatch(caddr_t arg1, caddr_t arg2)
-{
- struct virtio_softc *sc = (void *)arg1;
- struct virtio_handler_container *vhc = (void *)arg2;
- uint8_t isr_status;
- int i;
-
- isr_status = ddi_get8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_ISR_STATUS));
-
- if (!isr_status)
- return (DDI_INTR_UNCLAIMED);
-
- if ((isr_status & VIRTIO_CONFIG_ISR_CONFIG_CHANGE) &&
- vhc->config_handler.vh_func) {
- vhc->config_handler.vh_func((void *)sc,
- vhc->config_handler.vh_priv);
- }
-
- /* Notify all handlers */
- for (i = 0; i < vhc->nhandlers; i++) {
- vhc->vq_handlers[i].vh_func((void *)sc,
- vhc->vq_handlers[i].vh_priv);
- }
-
- return (DDI_INTR_CLAIMED);
-}
-
-/*
- * config_handler and vq_handlers may be allocated on stack.
- * Take precautions not to lose them.
- */
-static int
-virtio_register_intx(struct virtio_softc *sc,
- struct virtio_int_handler *config_handler,
- struct virtio_int_handler vq_handlers[])
-{
- int vq_handler_count;
- int actual;
- struct virtio_handler_container *vhc;
- size_t vhc_sz;
- int ret = DDI_FAILURE;
-
- /* Walk the handler table to get the number of handlers. */
- for (vq_handler_count = 0;
- vq_handlers && vq_handlers[vq_handler_count].vh_func;
- vq_handler_count++)
- ;
-
- vhc_sz = sizeof (struct virtio_handler_container) +
- sizeof (struct virtio_int_handler) * vq_handler_count;
- vhc = kmem_zalloc(vhc_sz, KM_SLEEP);
-
- vhc->nhandlers = vq_handler_count;
- (void) memcpy(vhc->vq_handlers, vq_handlers,
- sizeof (struct virtio_int_handler) * vq_handler_count);
-
- if (config_handler != NULL) {
- (void) memcpy(&vhc->config_handler, config_handler,
- sizeof (struct virtio_int_handler));
- }
-
- /* Just a single entry for a single interrupt. */
- sc->sc_intr_htable = kmem_zalloc(sizeof (ddi_intr_handle_t), KM_SLEEP);
-
- ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable,
- DDI_INTR_TYPE_FIXED, 0, 1, &actual, DDI_INTR_ALLOC_NORMAL);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to allocate a fixed interrupt: %d", ret);
- goto out_int_alloc;
- }
-
- ASSERT(actual == 1);
- sc->sc_intr_num = 1;
-
- ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed");
- goto out_prio;
- }
-
- ret = ddi_intr_add_handler(sc->sc_intr_htable[0],
- virtio_intx_dispatch, sc, vhc);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "ddi_intr_add_handler failed");
- goto out_add_handlers;
- }
-
- sc->sc_int_type = DDI_INTR_TYPE_FIXED;
-
- return (DDI_SUCCESS);
-
-out_add_handlers:
-out_prio:
- (void) ddi_intr_free(sc->sc_intr_htable[0]);
-out_int_alloc:
- kmem_free(sc->sc_intr_htable, sizeof (ddi_intr_handle_t));
- kmem_free(vhc, vhc_sz);
- return (ret);
-}
-
-/*
- * We find out if we support MSI during this, and the register layout
- * depends on the MSI (doh). Don't access the device-specific bits in
- * BAR 0 before calling it!
- */
-int
-virtio_register_ints(struct virtio_softc *sc,
- struct virtio_int_handler *config_handler,
- struct virtio_int_handler vq_handlers[])
-{
- int ret;
- int intr_types;
-
- /* Default offset until MSI-X is enabled, if ever. */
- sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX;
-
- /* Determine which types of interrupts are supported */
- ret = ddi_intr_get_supported_types(sc->sc_dev, &intr_types);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN, "Can't get supported int types");
- goto out_inttype;
- }
-
- /* If we have msi, let's use them. */
- if (intr_types & (DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_MSI)) {
- ret = virtio_register_msi(sc, config_handler,
- vq_handlers, intr_types);
- if (!ret)
- return (0);
- }
-
- /* Fall back to old-fashioned interrupts. */
- if (intr_types & DDI_INTR_TYPE_FIXED) {
- dev_debug(sc->sc_dev, CE_WARN,
- "Using legacy interrupts");
-
- return (virtio_register_intx(sc, config_handler, vq_handlers));
- }
-
- dev_err(sc->sc_dev, CE_WARN,
- "MSI failed and fixed interrupts not supported. Giving up.");
- ret = DDI_FAILURE;
-
-out_inttype:
- return (ret);
-}
-
-static int
-virtio_enable_msi(struct virtio_softc *sc)
-{
- int ret, i;
- int vq_handler_count = sc->sc_intr_num;
-
- /* Number of handlers, not counting the config. */
- if (sc->sc_intr_config)
- vq_handler_count--;
-
- /* Enable the interrupts. Either the whole block, or one by one. */
- if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) {
- ret = ddi_intr_block_enable(sc->sc_intr_htable,
- sc->sc_intr_num);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to enable MSI, falling back to INTx");
- goto out_enable;
- }
- } else {
- for (i = 0; i < sc->sc_intr_num; i++) {
- ret = ddi_intr_enable(sc->sc_intr_htable[i]);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to enable MSI %d, "
- "falling back to INTx", i);
-
- while (--i >= 0) {
- (void) ddi_intr_disable(
- sc->sc_intr_htable[i]);
- }
- goto out_enable;
- }
- }
- }
-
- /* Bind the allocated MSI to the queues and config */
- for (i = 0; i < vq_handler_count; i++) {
- int check;
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_SELECT), i);
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_VECTOR), i);
-
- check = ddi_get16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_VECTOR));
- if (check != i) {
- dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler "
- "for VQ %d, MSI %d. Check = %x", i, i, check);
- ret = ENODEV;
- goto out_bind;
- }
- }
-
- if (sc->sc_intr_config) {
- int check;
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_CONFIG_VECTOR), i);
-
- check = ddi_get16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_CONFIG_VECTOR));
- if (check != i) {
- dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler "
- "for Config updates, MSI %d", i);
- ret = ENODEV;
- goto out_bind;
- }
- }
-
- /* Configuration offset depends on whether MSI-X is used. */
- if (sc->sc_int_type == DDI_INTR_TYPE_MSIX)
- sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSIX;
- else
- ASSERT(sc->sc_int_type == DDI_INTR_TYPE_MSI);
-
- return (DDI_SUCCESS);
-
-out_bind:
- /* Unbind the vqs */
- for (i = 0; i < vq_handler_count - 1; i++) {
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_SELECT), i);
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_VECTOR),
- VIRTIO_MSI_NO_VECTOR);
- }
- /* And the config */
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_CONFIG_VECTOR), VIRTIO_MSI_NO_VECTOR);
-
- /* Disable the interrupts. Either the whole block, or one by one. */
- if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) {
- ret = ddi_intr_block_disable(sc->sc_intr_htable,
- sc->sc_intr_num);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to disable MSIs, won't be able to "
- "reuse next time");
- }
- } else {
- for (i = 0; i < sc->sc_intr_num; i++) {
- ret = ddi_intr_disable(sc->sc_intr_htable[i]);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to disable interrupt %d, "
- "won't be able to reuse", i);
- }
- }
- }
-
- ret = DDI_FAILURE;
-
-out_enable:
- return (ret);
-}
-
-static int
-virtio_enable_intx(struct virtio_softc *sc)
-{
- int ret;
-
- ret = ddi_intr_enable(sc->sc_intr_htable[0]);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to enable interrupt: %d", ret);
- }
-
- return (ret);
-}
-
-/*
- * We can't enable/disable individual handlers in the INTx case so do
- * the whole bunch even in the msi case.
- */
-int
-virtio_enable_ints(struct virtio_softc *sc)
-{
-
- ASSERT(sc->sc_config_offset == VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX);
-
- /* See if we are using MSI. */
- if (sc->sc_int_type == DDI_INTR_TYPE_MSIX ||
- sc->sc_int_type == DDI_INTR_TYPE_MSI)
- return (virtio_enable_msi(sc));
-
- ASSERT(sc->sc_int_type == DDI_INTR_TYPE_FIXED);
- return (virtio_enable_intx(sc));
-}
-
-void
-virtio_release_ints(struct virtio_softc *sc)
-{
- int i;
- int ret;
-
- /* We were running with MSI, unbind them. */
- if (sc->sc_int_type == DDI_INTR_TYPE_MSIX ||
- sc->sc_int_type == DDI_INTR_TYPE_MSI) {
- /* Unbind all vqs */
- for (i = 0; i < sc->sc_nvqs; i++) {
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_SELECT), i);
-
- ddi_put16(sc->sc_ioh,
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_QUEUE_VECTOR),
- VIRTIO_MSI_NO_VECTOR);
- }
- /* And the config */
- /* LINTED E_BAD_PTR_CAST_ALIGN */
- ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr +
- VIRTIO_CONFIG_CONFIG_VECTOR),
- VIRTIO_MSI_NO_VECTOR);
-
- }
-
- /* Disable the interrupts. Either the whole block, or one by one. */
- if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) {
- ret = ddi_intr_block_disable(sc->sc_intr_htable,
- sc->sc_intr_num);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to disable MSIs, won't be able to "
- "reuse next time");
- }
- } else {
- for (i = 0; i < sc->sc_intr_num; i++) {
- ret = ddi_intr_disable(sc->sc_intr_htable[i]);
- if (ret != DDI_SUCCESS) {
- dev_err(sc->sc_dev, CE_WARN,
- "Failed to disable interrupt %d, "
- "won't be able to reuse", i);
- }
- }
- }
-
-
- for (i = 0; i < sc->sc_intr_num; i++) {
- (void) ddi_intr_remove_handler(sc->sc_intr_htable[i]);
- }
-
- for (i = 0; i < sc->sc_intr_num; i++)
- (void) ddi_intr_free(sc->sc_intr_htable[i]);
-
- kmem_free(sc->sc_intr_htable, sizeof (ddi_intr_handle_t) *
- sc->sc_intr_num);
-
- /* After disabling interrupts, the config offset is non-MSI-X. */
- sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX;
-}
-
-/*
- * Module linkage information for the kernel.
- */
-static struct modlmisc modlmisc = {
- &mod_miscops, /* Type of module */
- "VirtIO common library module",
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1,
- {
- (void *)&modlmisc,
- NULL
- }
-};
-
-int
-_init(void)
-{
- return (mod_install(&modlinkage));
-}
-
-int
-_fini(void)
-{
- return (mod_remove(&modlinkage));
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
diff --git a/usr/src/uts/common/io/virtio/virtio.h b/usr/src/uts/common/io/virtio/virtio.h
new file mode 100644
index 0000000000..420f9ccfed
--- /dev/null
+++ b/usr/src/uts/common/io/virtio/virtio.h
@@ -0,0 +1,342 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VIRTIO_H
+#define _VIRTIO_H
+
+/*
+ * VIRTIO FRAMEWORK
+ *
+ * This framework handles the initialisation and operation common to all Virtio
+ * device types; e.g., Virtio Block (vioblk), Virtio Network (vioif), etc. The
+ * framework presently provides for what is now described as a "legacy" driver
+ * in the current issue of the "Virtual I/O Device (VIRTIO) Version 1.1"
+ * specification. Though several new specifications have been released, legacy
+ * devices are still the most widely available on current hypervisor platforms.
+ * Legacy devices make use of the native byte order of the host system.
+ *
+ * FRAMEWORK INITIALISATION: STARTING
+ *
+ * Client drivers will, in their attach(9E) routine, make an early call to
+ * virtio_init(). This causes the framework to allocate some base resources
+ * and begin initialising the device. This routine confirms that the device
+ * will operate in the supported legacy mode as per the specification. A
+ * failure here means that we cannot presently support this device.
+ *
+ * Once virtio_init() returns, the initialisation phase has begun and the
+ * driver can examine negotiated features and set up virtqueues. The
+ * initialisation phase ends when the driver calls either
+ * virtio_init_complete() or virtio_fini().
+ *
+ * FRAMEWORK INITIALISATION: FEATURE NEGOTIATION
+ *
+ * The virtio_init() call accepts a bitmask of desired features that the driver
+ * supports. The framework will negotiate the common set of features supported
+ * by both the driver and the device. The presence of any individual feature
+ * can be tested after the initialisation phase has begun using
+ * virtio_feature_present().
+ *
+ * The framework will additionally negotiate some set of features that are not
+ * specific to a device type on behalf of the client driver; e.g., support for
+ * indirect descriptors.
+ *
+ * Some features allow the driver to read additional configuration values from
+ * the device-specific regions of the device register space. These can be
+ * accessed via the virtio_dev_get*() and virtio_dev_put*() family of
+ * functions.
+ *
+ * FRAMEWORK INITIALISATION: VIRTQUEUE CONFIGURATION
+ *
+ * During the initialisation phase, the client driver may configure some number
+ * of virtqueues with virtio_queue_alloc(). Once initialisation has been
+ * completed, no further queues can be configured without destroying the
+ * framework object and beginning again from scratch.
+ *
+ * When configuring a queue, the driver must know the queue index number. This
+ * generally comes from the section of the specification describing the
+ * specific device type; e.g., Virtio Network devices have a receive queue at
+ * index 0, and a transmit queue at index 1. The name given to the queue is
+ * informational and has no impact on device operation.
+ *
+ * Most queues will require an interrupt handler function. When a queue
+ * notification interrupt is received, the provided handler will be called with
+ * two arguments: first, the provided user data argument; and second, a pointer
+ * to the "virtio_t" object for this instance.
+ *
+ * A maximum segment count must be selected for each queue. This count is the
+ * upper bound on the number of scatter-gather cookies that will be accepted,
+ * and applies to both direct and indirect descriptor based queues. This cap
+ * is usually either negotiated with the device, or determined structurally
+ * based on the shape of the buffers required for device operation.
+ *
+ * FRAMEWORK INITIALISATION: FINISHING
+ *
+ * Once queue configuration has been completed, the client driver calls
+ * virtio_init_complete() to finalise resource allocation and set the device to
+ * the running state (DRIVER_OK). The framework will allocate any interrupts
+ * needed for queue notifications at this time.
+ *
+ * If the client driver cannot complete initialisation, the instance may
+ * instead be torn down with virtio_fini(). Signalling failure to this routine
+ * will report failure to the device instead of resetting it, which may be
+ * reported by the hypervisor as a fault.
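+ *
+ * As a rough sketch of the flow described above (the "mydrv" names, feature
+ * masks, and segment counts are illustrative assumptions, not part of the
+ * framework; the boolean arguments to virtio_init() and virtio_queue_alloc()
+ * and the final argument to virtio_init_complete() are likewise shown with
+ * assumed values), an attach(9E) routine might look like:
+ *
+ *     virtio_t *vio;
+ *     virtio_queue_t *viq;
+ *
+ *     /* Begin initialisation, offering the features this driver knows. */
+ *     if ((vio = virtio_init(dip, MYDRV_WANTED_FEATURES, B_TRUE)) == NULL)
+ *             return (DDI_FAILURE);
+ *
+ *     /* Examine the negotiated feature set. */
+ *     if (virtio_feature_present(vio, MYDRV_F_FANCY))
+ *             mydrv->md_fancy = B_TRUE;
+ *
+ *     /* Configure the queues required for this device type. */
+ *     viq = virtio_queue_alloc(vio, 0, "requests", mydrv_q_handler, mydrv,
+ *         B_FALSE, MYDRV_MAX_SEGS);
+ *
+ *     /* Finish initialisation and move the device to the running state. */
+ *     if (viq == NULL || virtio_init_complete(vio, 0) != DDI_SUCCESS) {
+ *             virtio_fini(vio, B_TRUE);
+ *             return (DDI_FAILURE);
+ *     }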
+ *
+ * DESCRIPTOR CHAINS
+ *
+ * Most devices accept I/O requests from the driver through at least one queue.
+ * Some devices are operated by submission of synchronous requests. The device
+ * is expected to process the request and return some kind of status; e.g., a
+ * block device accepts write requests from the file system and signals when
+ * they have completed or failed.
+ *
+ * Other devices operate by asynchronous delivery of I/O requests to the
+ * driver; e.g., a network device may receive incoming frames at any time.
+ * Inbound asynchronous delivery is usually achieved by populating a queue with
+ * a series of memory buffers where the incoming data will be written by the
+ * device at some later time.
+ *
+ * Whether for inbound or outbound transfers, buffers are inserted into the
+ * ring through chains of one or more descriptors. Each descriptor has a
+ * transfer direction (to or from the device), and a physical address and
+ * length (i.e., a DMA cookie). The framework automatically manages the slight
+ * differences in operation between direct and indirect descriptor usage on
+ * behalf of the client driver.
+ *
+ * A chain of descriptors is allocated by calling virtio_chain_alloc() against
+ * a particular queue. This function accepts a kmem flag as per
+ * kmem_alloc(9F). A client driver specific void pointer may be attached to
+ * the chain with virtio_chain_data_set() and read back later with
+ * virtio_chain_data(); e.g., after it is returned by a call to
+ * virtio_queue_poll().
+ *
+ * Cookies are added to a chain by calling virtio_chain_append() with the
+ * appropriate physical address and transfer direction. This function may fail
+ * if the chain is already using the maximum number of cookies for this queue.
+ * Client drivers are responsible for appropriate use of virtio_dma_sync()
+ * or ddi_dma_sync(9F) on any memory appended to a descriptor chain prior to
+ * chain submission.
+ *
+ * Once fully constructed and synced, a chain can be submitted to the device by
+ * calling virtio_chain_submit(). The caller may choose to flush the queue
+ * contents to the device on each submission, or to batch notifications until
+ * later to amortise the notification cost over more requests. If batching
+ * notifications, outstanding submissions can be flushed with a call to
+ * virtio_queue_flush(). Note that the framework will insert an appropriate
+ * memory barrier to ensure writes by the driver complete before making the
+ * submitted descriptor visible to the device.
+ *
+ * A chain may be reset for reuse with new cookies by calling
+ * virtio_chain_clear(). The chain may be freed completely by calling
+ * virtio_chain_free().
+ *
+ * When a descriptor chain is returned to the driver by the device, it may
+ * include a received data length value. This value can be accessed via
+ * virtio_chain_received_length(). There is some suggestion in more recent
+ * Virtio specifications that, depending on the device type and the
+ * hypervisor, this value may not always be accurate or useful.
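+ *
+ * To sketch the submission path, assuming "viq" is a queue allocated earlier,
+ * "mr" is a hypothetical driver request structure, and virtio_chain_append()
+ * follows the usual DDI_SUCCESS return convention:
+ *
+ *     virtio_chain_t *vic;
+ *
+ *     if ((vic = virtio_chain_alloc(viq, KM_NOSLEEP)) == NULL)
+ *             return (ENOMEM);
+ *     virtio_chain_data_set(vic, mr);
+ *
+ *     /* The device reads the request header and writes the data buffer. */
+ *     if (virtio_chain_append(vic, mr->mr_hdr_pa, mr->mr_hdr_len,
+ *         VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS ||
+ *         virtio_chain_append(vic, mr->mr_buf_pa, mr->mr_buf_len,
+ *         VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
+ *             virtio_chain_free(vic);
+ *             return (ENOMEM);
+ *     }
+ *
+ *     /* Sync driver-written memory, then submit and notify the device. */
+ *     virtio_dma_sync(mr->mr_dma, DDI_DMA_SYNC_FORDEV);
+ *     virtio_chain_submit(vic, B_TRUE);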
+ *
+ * VIRTQUEUE OPERATION
+ *
+ * The queue size (i.e., the number of direct descriptor entries) can be
+ * found with virtio_queue_size(). This value is static over the lifetime
+ * of the queue.
+ *
+ * The number of descriptor chains presently submitted to the device and not
+ * yet returned can be obtained via virtio_queue_nactive().
+ *
+ * Over time the device will return descriptor chains to the driver in response
+ * to device activity. Any newly returned chains may be retrieved by the
+ * driver by calling virtio_queue_poll(). See the DESCRIPTOR CHAINS section
+ * for more detail about managing descriptor chain objects. Note that the
+ * framework will insert an appropriate memory barrier to ensure that writes by
+ * the host are complete before returning the chain to the client driver.
+ *
+ * The NO_INTERRUPT flag on a queue may be set or cleared with
+ * virtio_queue_no_interrupt(). Note that this flag is purely advisory, and
+ * may not actually stop interrupts from the device in a timely fashion.
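+ *
+ * A completion path for the polling described above might, as a sketch using
+ * the same assumed "mydrv" names, look like:
+ *
+ *     virtio_chain_t *vic;
+ *
+ *     while ((vic = virtio_queue_poll(viq)) != NULL) {
+ *             mydrv_req_t *mr = virtio_chain_data(vic);
+ *             size_t rlen = virtio_chain_received_length(vic);
+ *
+ *             virtio_dma_sync(mr->mr_dma, DDI_DMA_SYNC_FORCPU);
+ *             mydrv_req_complete(mr, rlen);
+ *             virtio_chain_free(vic);
+ *     }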
+ *
+ * INTERRUPT MANAGEMENT
+ *
+ * A mutex used within an interrupt handler must be initialised with the
+ * correct interrupt priority. After the initialisation phase is complete, the
+ * client should use virtio_intr_pri() to get a value suitable to pass to
+ * mutex_init(9F).
+ *
+ * When the driver is ready to receive notifications from the device, the
+ * virtio_interrupts_enable() routine may be called. Interrupts may be
+ * disabled again by calling virtio_interrupts_disable(). Interrupt resources
+ * will be deallocated as part of a subsequent call to virtio_fini().
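+ *
+ * For example (a sketch only; the mutex and soft state names are assumed), a
+ * driver lock shared with a queue handler could be initialised and
+ * notifications enabled as follows:
+ *
+ *     mutex_init(&mydrv->md_mutex, NULL, MUTEX_DRIVER,
+ *         virtio_intr_pri(vio));
+ *
+ *     if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
+ *             virtio_fini(vio, B_TRUE);
+ *             return (DDI_FAILURE);
+ *     }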
+ *
+ * DMA MEMORY MANAGEMENT: ALLOCATION AND FREE
+ *
+ * Client drivers may allocate memory suitable for communication with the
+ * device by using virtio_dma_alloc(). This function accepts an allocation
+ * size, a DMA attribute template, a set of DMA flags, and a kmem flag.
+ * A "virtio_dma_t" object is returned to track and manage the allocation.
+ *
+ * The DMA flags value will be a combination of direction flags (e.g.,
+ * DDI_DMA_READ or DDI_DMA_WRITE) and mapping flags (e.g., DDI_DMA_CONSISTENT
+ * or DDI_DMA_STREAMING). The kmem flag is either KM_SLEEP or KM_NOSLEEP,
+ * as described in kmem_alloc(9F).
+ *
+ * Memory that is no longer required can be freed using virtio_dma_free().
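+ *
+ * A minimal sketch, assuming a driver-supplied DMA attribute template and
+ * buffer size (both names are illustrative):
+ *
+ *     virtio_dma_t *vd;
+ *
+ *     if ((vd = virtio_dma_alloc(vio, MYDRV_BUFSZ, &mydrv_dma_attr,
+ *         DDI_DMA_READ | DDI_DMA_STREAMING, KM_SLEEP)) == NULL)
+ *             return (ENOMEM);
+ *
+ *     ... use the buffer ...
+ *
+ *     virtio_dma_free(vd);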
+ *
+ * DMA MEMORY MANAGEMENT: BINDING WITHOUT ALLOCATION
+ *
+ * If another subsystem has loaned memory to your client driver, you may need
+ * to allocate and bind a handle without additional backing memory. The
+ * virtio_dma_alloc_nomem() function can be used for this purpose, returning a
+ * "virtio_dma_t" object.
+ *
+ * Once allocated, an arbitrary kernel memory location can be bound for DMA
+ * with virtio_dma_bind(). The binding can be subsequently undone with
+ * virtio_dma_unbind(), allowing the "virtio_dma_t" object to be reused for
+ * another binding.
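+ *
+ * A sketch of binding loaned memory (the loaned buffer, its length, and the
+ * attribute template are assumptions):
+ *
+ *     virtio_dma_t *vd;
+ *
+ *     if ((vd = virtio_dma_alloc_nomem(vio, &mydrv_dma_attr,
+ *         KM_SLEEP)) == NULL)
+ *             return (ENOMEM);
+ *
+ *     if (virtio_dma_bind(vd, loaned_va, loaned_len,
+ *         DDI_DMA_WRITE | DDI_DMA_CONSISTENT, KM_SLEEP) != DDI_SUCCESS) {
+ *             virtio_dma_free(vd);
+ *             return (EIO);
+ *     }
+ *
+ *     ... use the binding ...
+ *
+ *     virtio_dma_unbind(vd);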
+ *
+ * DMA MEMORY MANAGEMENT: VIRTUAL AND PHYSICAL ADDRESSES
+ *
+ * The total size of a mapping (with or without its own backing memory) can be
+ * found with virtio_dma_size(). A void pointer to a kernel virtual address
+ * within the buffer can be obtained via virtio_dma_va(); this function accepts
+ * a linear offset into the VA range and performs bounds checking.
+ *
+ * The number of physical memory addresses (DMA cookies) can be found with
+ * virtio_dma_ncookies(). The physical address and length of each cookie can
+ * be found with virtio_dma_cookie_pa() and virtio_dma_cookie_size(); these
+ * functions are keyed on the zero-indexed cookie number.
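+ *
+ * For instance, every cookie of a binding might be appended to a descriptor
+ * chain as follows (a sketch; error handling elided):
+ *
+ *     for (uint_t n = 0; n < virtio_dma_ncookies(vd); n++) {
+ *             (void) virtio_chain_append(vic, virtio_dma_cookie_pa(vd, n),
+ *                 virtio_dma_cookie_size(vd, n), VIRTIO_DIR_DEVICE_WRITES);
+ *     }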
+ *
+ * DMA MEMORY MANAGEMENT: SYNCHRONISATION
+ *
+ * When passing memory to the device, or reading memory returned from the
+ * device, DMA synchronisation must be performed in case it is required by the
+ * underlying platform. A convenience wrapper exists: virtio_dma_sync(). This
+ * routine synchronises the entire binding and accepts the same synchronisation
+ * type values as ddi_dma_sync(9F).
+ *
+ * QUIESCE
+ *
+ * As quiesce(9E) merely requires that the device come to a complete stop, most
+ * client drivers will be able to call virtio_quiesce() without additional
+ * actions. This will reset the device, immediately halting all queue
+ * activity, and return a value suitable for returning from the client driver
+ * quiesce(9E) entrypoint. This routine must only be called from quiesce
+ * context as it performs no synchronisation with other threads.
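+ *
+ * For most client drivers the quiesce(9E) entrypoint could thus be little
+ * more than the following sketch (soft state retrieval is an assumption):
+ *
+ *     static int
+ *     mydrv_quiesce(dev_info_t *dip)
+ *     {
+ *             mydrv_t *mydrv = ddi_get_driver_private(dip);
+ *
+ *             return (virtio_quiesce(mydrv->md_virtio));
+ *     }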
+ *
+ * DETACH
+ *
+ * Some devices are effectively long-polled; that is, the driver submits some
+ * number of descriptor chains that the device does not return until some
+ * asynchronous event occurs, such as the receipt of an incoming packet or a
+ * device hot plug event. When detaching the device, the return of these
+ * outstanding buffers must be arranged. Some device types may have task
+ * management commands that can force the orderly return of these chains, but
+ * the only way to do so uniformly is to reset the device and claw back the
+ * memory.
+ *
+ * If the client driver has outstanding descriptors and needs a hard stop on
+ * device activity it can call virtio_shutdown(). This routine will bring
+ * queue processing to an orderly stop and then reset the device, causing it to
+ * cease use of any DMA resources. Once this function returns, the driver may
+ * call virtio_queue_evacuate() on each queue to retrieve any previously
+ * submitted chains.
+ *
+ * To tear down resources (e.g., interrupts and allocated memory) the client
+ * driver must finally call virtio_fini(). If virtio_shutdown() was not
+ * needed, this routine will also reset the device.
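+ *
+ * A detach(9E) sequence for such a device might then be sketched as:
+ *
+ *     virtio_chain_t *vic;
+ *
+ *     virtio_shutdown(vio);
+ *     while ((vic = virtio_queue_evacuate(viq)) != NULL) {
+ *             mydrv_req_free(virtio_chain_data(vic));
+ *             virtio_chain_free(vic);
+ *     }
+ *
+ *     virtio_fini(vio, B_FALSE);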
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct virtio virtio_t;
+typedef struct virtio_queue virtio_queue_t;
+typedef struct virtio_chain virtio_chain_t;
+typedef struct virtio_dma virtio_dma_t;
+
+typedef enum virtio_direction {
+ /*
+ * In the base specification, a descriptor is either set up to be
+ * written by the device or to be read by the device, but not both.
+ */
+ VIRTIO_DIR_DEVICE_WRITES = 1,
+ VIRTIO_DIR_DEVICE_READS
+} virtio_direction_t;
+
+void virtio_fini(virtio_t *, boolean_t);
+virtio_t *virtio_init(dev_info_t *, uint64_t, boolean_t);
+int virtio_init_complete(virtio_t *, int);
+int virtio_quiesce(virtio_t *);
+void virtio_shutdown(virtio_t *);
+
+void *virtio_intr_pri(virtio_t *);
+
+void virtio_device_reset(virtio_t *);
+
+uint8_t virtio_dev_get8(virtio_t *, uintptr_t);
+uint16_t virtio_dev_get16(virtio_t *, uintptr_t);
+uint32_t virtio_dev_get32(virtio_t *, uintptr_t);
+uint64_t virtio_dev_get64(virtio_t *, uintptr_t);
+
+void virtio_dev_put8(virtio_t *, uintptr_t, uint8_t);
+void virtio_dev_put16(virtio_t *, uintptr_t, uint16_t);
+void virtio_dev_put32(virtio_t *, uintptr_t, uint32_t);
+
+boolean_t virtio_feature_present(virtio_t *, uint64_t);
+
+virtio_queue_t *virtio_queue_alloc(virtio_t *, uint16_t, const char *,
+ ddi_intr_handler_t *, void *, boolean_t, uint_t);
+
+virtio_chain_t *virtio_queue_poll(virtio_queue_t *);
+virtio_chain_t *virtio_queue_evacuate(virtio_queue_t *);
+void virtio_queue_flush(virtio_queue_t *);
+void virtio_queue_no_interrupt(virtio_queue_t *, boolean_t);
+uint_t virtio_queue_nactive(virtio_queue_t *);
+uint_t virtio_queue_size(virtio_queue_t *);
+
+virtio_chain_t *virtio_chain_alloc(virtio_queue_t *, int);
+void virtio_chain_clear(virtio_chain_t *);
+void virtio_chain_free(virtio_chain_t *);
+int virtio_chain_append(virtio_chain_t *, uint64_t, size_t, virtio_direction_t);
+
+void *virtio_chain_data(virtio_chain_t *);
+void virtio_chain_data_set(virtio_chain_t *, void *);
+
+void virtio_chain_submit(virtio_chain_t *, boolean_t);
+size_t virtio_chain_received_length(virtio_chain_t *);
+
+int virtio_interrupts_enable(virtio_t *);
+void virtio_interrupts_disable(virtio_t *);
+
+virtio_dma_t *virtio_dma_alloc(virtio_t *, size_t, const ddi_dma_attr_t *, int,
+ int);
+virtio_dma_t *virtio_dma_alloc_nomem(virtio_t *, const ddi_dma_attr_t *, int);
+void virtio_dma_free(virtio_dma_t *);
+int virtio_dma_bind(virtio_dma_t *, void *, size_t, int, int);
+void virtio_dma_unbind(virtio_dma_t *);
+void virtio_dma_sync(virtio_dma_t *, int);
+
+void *virtio_dma_va(virtio_dma_t *, size_t);
+size_t virtio_dma_size(virtio_dma_t *);
+uint_t virtio_dma_ncookies(virtio_dma_t *);
+uint64_t virtio_dma_cookie_pa(virtio_dma_t *, uint_t);
+size_t virtio_dma_cookie_size(virtio_dma_t *, uint_t);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIRTIO_H */
diff --git a/usr/src/uts/common/io/virtio/virtio_dma.c b/usr/src/uts/common/io/virtio/virtio_dma.c
new file mode 100644
index 0000000000..81972b5402
--- /dev/null
+++ b/usr/src/uts/common/io/virtio/virtio_dma.c
@@ -0,0 +1,295 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * VIRTIO FRAMEWORK: DMA ROUTINES
+ *
+ * For design and usage documentation, see the comments in "virtio.h".
+ */
+
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/autoconf.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/avintr.h>
+#include <sys/spl.h>
+#include <sys/promif.h>
+#include <sys/list.h>
+#include <sys/bootconf.h>
+#include <sys/bootsvcs.h>
+#include <sys/sysmacros.h>
+#include <sys/pci.h>
+
+#include "virtio.h"
+#include "virtio_impl.h"
+
+
+
+void
+virtio_dma_sync(virtio_dma_t *vidma, int flag)
+{
+ VERIFY0(ddi_dma_sync(vidma->vidma_dma_handle, 0, 0, flag));
+}
+
+uint_t
+virtio_dma_ncookies(virtio_dma_t *vidma)
+{
+ return (vidma->vidma_dma_ncookies);
+}
+
+size_t
+virtio_dma_size(virtio_dma_t *vidma)
+{
+ return (vidma->vidma_size);
+}
+
+void *
+virtio_dma_va(virtio_dma_t *vidma, size_t offset)
+{
+ VERIFY3U(offset, <, vidma->vidma_size);
+
+ return (vidma->vidma_va + offset);
+}
+
+uint64_t
+virtio_dma_cookie_pa(virtio_dma_t *vidma, uint_t cookie)
+{
+ VERIFY3U(cookie, <, vidma->vidma_dma_ncookies);
+
+ return (vidma->vidma_dma_cookies[cookie].dmac_laddress);
+}
+
+size_t
+virtio_dma_cookie_size(virtio_dma_t *vidma, uint_t cookie)
+{
+ VERIFY3U(cookie, <, vidma->vidma_dma_ncookies);
+
+ return (vidma->vidma_dma_cookies[cookie].dmac_size);
+}
+
+int
+virtio_dma_init_handle(virtio_t *vio, virtio_dma_t *vidma,
+ const ddi_dma_attr_t *attr, int kmflags)
+{
+ int r;
+ dev_info_t *dip = vio->vio_dip;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+ int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP :
+ DDI_DMA_DONTWAIT;
+
+ vidma->vidma_virtio = vio;
+
+ /*
+ * Ensure we don't try to allocate a second time using the same
+ * tracking object.
+ */
+ VERIFY0(vidma->vidma_level);
+
+ if ((r = ddi_dma_alloc_handle(dip, (ddi_dma_attr_t *)attr, dma_wait,
+ NULL, &vidma->vidma_dma_handle)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)", r);
+ goto fail;
+ }
+ vidma->vidma_level |= VIRTIO_DMALEVEL_HANDLE_ALLOC;
+
+ return (DDI_SUCCESS);
+
+fail:
+ virtio_dma_fini(vidma);
+ return (DDI_FAILURE);
+}
+
+int
+virtio_dma_init(virtio_t *vio, virtio_dma_t *vidma, size_t sz,
+ const ddi_dma_attr_t *attr, int dmaflags, int kmflags)
+{
+ int r;
+ dev_info_t *dip = vio->vio_dip;
+ caddr_t va = NULL;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+ int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP :
+ DDI_DMA_DONTWAIT;
+
+ if (virtio_dma_init_handle(vio, vidma, attr, kmflags) !=
+ DDI_SUCCESS) {
+ goto fail;
+ }
+
+ if ((r = ddi_dma_mem_alloc(vidma->vidma_dma_handle, sz,
+ &virtio_acc_attr,
+ dmaflags & (DDI_DMA_STREAMING | DDI_DMA_CONSISTENT),
+ dma_wait, NULL, &va, &vidma->vidma_real_size,
+ &vidma->vidma_acc_handle)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", r);
+ goto fail;
+ }
+ vidma->vidma_level |= VIRTIO_DMALEVEL_MEMORY_ALLOC;
+
+ /*
+ * Zero the memory to avoid accidental exposure of arbitrary kernel
+ * memory.
+ */
+ bzero(va, vidma->vidma_real_size);
+
+ if (virtio_dma_bind(vidma, va, sz, dmaflags, kmflags) != DDI_SUCCESS) {
+ goto fail;
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ virtio_dma_fini(vidma);
+ return (DDI_FAILURE);
+}
+
+int
+virtio_dma_bind(virtio_dma_t *vidma, void *va, size_t sz, int dmaflags,
+ int kmflags)
+{
+ int r;
+ dev_info_t *dip = vidma->vidma_virtio->vio_dip;
+ ddi_dma_cookie_t dmac;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+ int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP :
+ DDI_DMA_DONTWAIT;
+
+ VERIFY(vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_ALLOC);
+ VERIFY(!(vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_BOUND));
+
+ vidma->vidma_va = va;
+ vidma->vidma_size = sz;
+
+ if ((r = ddi_dma_addr_bind_handle(vidma->vidma_dma_handle, NULL,
+ vidma->vidma_va, vidma->vidma_size, dmaflags, dma_wait, NULL,
+ &dmac, &vidma->vidma_dma_ncookies)) != DDI_DMA_MAPPED) {
+ VERIFY3S(r, !=, DDI_DMA_PARTIAL_MAP);
+ dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", r);
+ goto fail;
+ }
+ vidma->vidma_level |= VIRTIO_DMALEVEL_HANDLE_BOUND;
+
+ if ((vidma->vidma_dma_cookies = kmem_alloc(
+ vidma->vidma_dma_ncookies * sizeof (ddi_dma_cookie_t),
+ kmflags)) == NULL) {
+ dev_err(dip, CE_WARN, "DMA cookie array allocation failure");
+ goto fail;
+ }
+ vidma->vidma_level |= VIRTIO_DMALEVEL_COOKIE_ARRAY;
+
+ vidma->vidma_dma_cookies[0] = dmac;
+ for (uint_t n = 1; n < vidma->vidma_dma_ncookies; n++) {
+ ddi_dma_nextcookie(vidma->vidma_dma_handle,
+ &vidma->vidma_dma_cookies[n]);
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ virtio_dma_unbind(vidma);
+ return (DDI_FAILURE);
+}
+
+virtio_dma_t *
+virtio_dma_alloc(virtio_t *vio, size_t sz, const ddi_dma_attr_t *attr,
+ int dmaflags, int kmflags)
+{
+ virtio_dma_t *vidma;
+
+ if ((vidma = kmem_zalloc(sizeof (*vidma), kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ if (virtio_dma_init(vio, vidma, sz, attr, dmaflags, kmflags) !=
+ DDI_SUCCESS) {
+ kmem_free(vidma, sizeof (*vidma));
+ return (NULL);
+ }
+
+ return (vidma);
+}
+
+virtio_dma_t *
+virtio_dma_alloc_nomem(virtio_t *vio, const ddi_dma_attr_t *attr, int kmflags)
+{
+ virtio_dma_t *vidma;
+
+ if ((vidma = kmem_zalloc(sizeof (*vidma), kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ if (virtio_dma_init_handle(vio, vidma, attr, kmflags) != DDI_SUCCESS) {
+ kmem_free(vidma, sizeof (*vidma));
+ return (NULL);
+ }
+
+ return (vidma);
+}
+
+void
+virtio_dma_fini(virtio_dma_t *vidma)
+{
+ virtio_dma_unbind(vidma);
+
+ if (vidma->vidma_level & VIRTIO_DMALEVEL_MEMORY_ALLOC) {
+ ddi_dma_mem_free(&vidma->vidma_acc_handle);
+
+ vidma->vidma_level &= ~VIRTIO_DMALEVEL_MEMORY_ALLOC;
+ }
+
+ if (vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_ALLOC) {
+ ddi_dma_free_handle(&vidma->vidma_dma_handle);
+
+ vidma->vidma_level &= ~VIRTIO_DMALEVEL_HANDLE_ALLOC;
+ }
+
+ VERIFY0(vidma->vidma_level);
+ bzero(vidma, sizeof (*vidma));
+}
+
+void
+virtio_dma_unbind(virtio_dma_t *vidma)
+{
+ if (vidma->vidma_level & VIRTIO_DMALEVEL_COOKIE_ARRAY) {
+ kmem_free(vidma->vidma_dma_cookies,
+ vidma->vidma_dma_ncookies * sizeof (ddi_dma_cookie_t));
+
+ vidma->vidma_level &= ~VIRTIO_DMALEVEL_COOKIE_ARRAY;
+ }
+
+ if (vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_BOUND) {
+ VERIFY3U(ddi_dma_unbind_handle(vidma->vidma_dma_handle), ==,
+ DDI_SUCCESS);
+
+ vidma->vidma_level &= ~VIRTIO_DMALEVEL_HANDLE_BOUND;
+ }
+
+ vidma->vidma_va = 0;
+ vidma->vidma_size = 0;
+}
+
+void
+virtio_dma_free(virtio_dma_t *vidma)
+{
+ virtio_dma_fini(vidma);
+ kmem_free(vidma, sizeof (*vidma));
+}
diff --git a/usr/src/uts/common/io/virtio/virtio_impl.h b/usr/src/uts/common/io/virtio/virtio_impl.h
new file mode 100644
index 0000000000..518667c7f4
--- /dev/null
+++ b/usr/src/uts/common/io/virtio/virtio_impl.h
@@ -0,0 +1,368 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VIRTIO_IMPL_H
+#define _VIRTIO_IMPL_H
+
+/*
+ * VIRTIO FRAMEWORK: FRAMEWORK-PRIVATE DEFINITIONS
+ *
+ * For design and usage documentation, see the comments in "virtio.h".
+ *
+ * NOTE: Client drivers should not use definitions from this file.
+ */
+
+#include <sys/types.h>
+#include <sys/dditypes.h>
+#include <sys/list.h>
+#include <sys/ccompile.h>
+
+#include "virtio.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern ddi_device_acc_attr_t virtio_acc_attr;
+extern ddi_dma_attr_t virtio_dma_attr;
+
+typedef struct virtio_vq_desc virtio_vq_desc_t;
+typedef struct virtio_vq_driver virtio_vq_driver_t;
+typedef struct virtio_vq_device virtio_vq_device_t;
+typedef struct virtio_vq_elem virtio_vq_elem_t;
+
+int virtio_dma_init(virtio_t *, virtio_dma_t *, size_t, const ddi_dma_attr_t *,
+ int, int);
+void virtio_dma_fini(virtio_dma_t *);
+
+
+
+typedef enum virtio_dma_level {
+ VIRTIO_DMALEVEL_HANDLE_ALLOC = (1ULL << 0),
+ VIRTIO_DMALEVEL_MEMORY_ALLOC = (1ULL << 1),
+ VIRTIO_DMALEVEL_HANDLE_BOUND = (1ULL << 2),
+ VIRTIO_DMALEVEL_COOKIE_ARRAY = (1ULL << 3),
+} virtio_dma_level_t;
+
+struct virtio_dma {
+ virtio_dma_level_t vidma_level;
+ virtio_t *vidma_virtio;
+ caddr_t vidma_va;
+ size_t vidma_size;
+ size_t vidma_real_size;
+ ddi_dma_handle_t vidma_dma_handle;
+ ddi_acc_handle_t vidma_acc_handle;
+ uint_t vidma_dma_ncookies;
+ ddi_dma_cookie_t *vidma_dma_cookies;
+};
+
+typedef enum virtio_initlevel {
+ VIRTIO_INITLEVEL_REGS = (1ULL << 0),
+ VIRTIO_INITLEVEL_PROVIDER = (1ULL << 1),
+ VIRTIO_INITLEVEL_INT_ALLOC = (1ULL << 2),
+ VIRTIO_INITLEVEL_INT_ADDED = (1ULL << 3),
+ VIRTIO_INITLEVEL_INT_ENABLED = (1ULL << 4),
+ VIRTIO_INITLEVEL_SHUTDOWN = (1ULL << 5),
+} virtio_initlevel_t;
+
+struct virtio {
+ dev_info_t *vio_dip;
+
+ kmutex_t vio_mutex;
+
+ virtio_initlevel_t vio_initlevel;
+
+ list_t vio_queues;
+
+ ddi_acc_handle_t vio_barh;
+ caddr_t vio_bar;
+ uint_t vio_config_offset;
+
+ uint32_t vio_features;
+ uint32_t vio_features_device;
+
+ ddi_intr_handle_t *vio_interrupts;
+ int vio_ninterrupts;
+ int vio_interrupt_type;
+ int vio_interrupt_cap;
+ uint_t vio_interrupt_priority;
+};
+
+struct virtio_queue {
+ virtio_t *viq_virtio;
+ kmutex_t viq_mutex;
+ const char *viq_name;
+ list_node_t viq_link;
+
+ boolean_t viq_shutdown;
+ boolean_t viq_indirect;
+ uint_t viq_max_segs;
+
+ /*
+ * Each Virtio device type has some set of queues for data transfer to
+ * and from the host. This index is described in the specification for
+ * the particular device and queue type, and written to QUEUE_SELECT to
+ * allow interaction with the queue. For example, a network device has
+ * at least a receive queue with index 0, and a transmit queue with
+ * index 1.
+ */
+ uint16_t viq_index;
+
+ /*
+ * For legacy Virtio devices, the size and shape of the queue is
+ * determined entirely by the number of queue entries.
+ */
+ uint16_t viq_size;
+ id_space_t *viq_descmap;
+
+ /*
+ * The memory shared between the device and the driver is allocated as
+ * a large physically contiguous chunk. Access to this area is
+ * through three pointers to packed structures.
+ */
+ virtio_dma_t viq_dma;
+ virtio_vq_desc_t *viq_dma_descs;
+ virtio_vq_driver_t *viq_dma_driver;
+ virtio_vq_device_t *viq_dma_device;
+
+ uint16_t viq_device_index;
+ uint16_t viq_driver_index;
+
+ /*
+ * Interrupt handler function, or NULL if not provided.
+ */
+ ddi_intr_handler_t *viq_func;
+ void *viq_funcarg;
+ boolean_t viq_handler_added;
+ uint_t viq_handler_index;
+
+ /*
+ * When a chain is submitted to the queue, it is also stored in this
+ * AVL tree keyed by the index of the first descriptor in the chain.
+ */
+ avl_tree_t viq_inflight;
+};
+
+struct virtio_chain {
+ virtio_queue_t *vic_vq;
+ avl_node_t vic_node;
+
+ void *vic_data;
+
+ uint16_t vic_head;
+ uint32_t vic_received_length;
+
+ virtio_dma_t vic_indirect_dma;
+ uint_t vic_indirect_capacity;
+ uint_t vic_indirect_used;
+
+ uint_t vic_direct_capacity;
+ uint_t vic_direct_used;
+ uint16_t vic_direct[];
+};
+
+/*
+ * PACKED STRUCTS FOR DEVICE ACCESS
+ */
+
+struct virtio_vq_desc {
+ /*
+ * Buffer physical address and length.
+ */
+ uint64_t vqd_addr;
+ uint32_t vqd_len;
+
+ /*
+ * Flags. Use with the VIRTQ_DESC_F_* family of constants. See below.
+ */
+ uint16_t vqd_flags;
+
+ /*
+ * If VIRTQ_DESC_F_NEXT is set in flags, this refers to the next
+ * descriptor in the chain by table index.
+ */
+ uint16_t vqd_next;
+} __packed;
+
+/*
+ * VIRTIO DESCRIPTOR FLAGS (vqd_flags)
+ */
+
+/*
+ * NEXT:
+ * Signals that this descriptor (direct or indirect) is part of a chain.
+ * If populated, "vqd_next" names the next descriptor in the chain by its
+ * table index.
+ */
+#define VIRTQ_DESC_F_NEXT (1 << 0)
+
+/*
+ * WRITE:
+ * Determines whether this buffer is to be written by the device (WRITE is
+ * set) or by the driver (WRITE is not set).
+ */
+#define VIRTQ_DESC_F_WRITE (1 << 1)
+
+/*
+ * INDIRECT:
+ * This bit signals that a direct descriptor refers to an indirect
+ * descriptor list, rather than directly to a buffer. This bit may only
+ * be used in a direct descriptor; indirect descriptors are not allowed to
+ * refer to additional layers of indirect tables. If this bit is set,
+ * NEXT must be clear; indirect descriptors may not be chained.
+ */
+#define VIRTQ_DESC_F_INDIRECT (1 << 2)
+
+/*
+ * This structure is variously known as the "available" or "avail" ring, or the
+ * driver-owned portion of the queue structure. It is used by the driver to
+ * submit descriptor chains to the device.
+ */
+struct virtio_vq_driver {
+ uint16_t vqdr_flags;
+ uint16_t vqdr_index;
+ uint16_t vqdr_ring[];
+} __packed;
+
+#define VIRTQ_AVAIL_F_NO_INTERRUPT (1 << 0)
+
+/*
+ * We use the sizeof operator on this packed struct to calculate the offset of
+ * subsequent structs. Ensure the compiler is not adding any padding to the
+ * end of the struct.
+ */
+CTASSERT(sizeof (virtio_vq_driver_t) ==
+ offsetof(virtio_vq_driver_t, vqdr_ring));
+
+struct virtio_vq_elem {
+ /*
+ * The device returns chains of descriptors by specifying the table
+ * index of the first descriptor in the chain.
+ */
+ uint32_t vqe_start;
+ uint32_t vqe_len;
+} __packed;
+
+/*
+ * This structure is variously known as the "used" ring, or the device-owned
+ * portion of the queue structure. It is used by the device to return
+ * completed descriptor chains to the driver.
+ */
+struct virtio_vq_device {
+ uint16_t vqde_flags;
+ uint16_t vqde_index;
+ virtio_vq_elem_t vqde_ring[];
+} __packed;
+
+#define VIRTQ_USED_F_NO_NOTIFY (1 << 0)
+
+/*
+ * BASIC CONFIGURATION
+ *
+ * Legacy devices expose both their generic and their device-specific
+ * configuration through PCI BAR0. This is the second entry in the register
+ * address space set for these devices.
+ */
+#define VIRTIO_LEGACY_PCI_BAR0 1
+
+/*
+ * These are offsets into the base configuration space available through the
+ * virtio_get*() and virtio_put*() family of functions. These offsets are for
+ * what the specification describes as the "legacy" mode of device operation.
+ */
+#define VIRTIO_LEGACY_FEATURES_DEVICE 0x00 /* 32 R */
+#define VIRTIO_LEGACY_FEATURES_DRIVER 0x04 /* 32 R/W */
+#define VIRTIO_LEGACY_QUEUE_ADDRESS 0x08 /* 32 R/W */
+#define VIRTIO_LEGACY_QUEUE_SIZE 0x0C /* 16 R */
+#define VIRTIO_LEGACY_QUEUE_SELECT 0x0E /* 16 R/W */
+#define VIRTIO_LEGACY_QUEUE_NOTIFY 0x10 /* 16 R/W */
+#define VIRTIO_LEGACY_DEVICE_STATUS 0x12 /* 8 R/W */
+#define VIRTIO_LEGACY_ISR_STATUS 0x13 /* 8 R */
+
+#define VIRTIO_LEGACY_MSIX_CONFIG 0x14 /* 16 R/W */
+#define VIRTIO_LEGACY_MSIX_QUEUE 0x16 /* 16 R/W */
+
+#define VIRTIO_LEGACY_CFG_OFFSET (VIRTIO_LEGACY_ISR_STATUS + 1)
+#define VIRTIO_LEGACY_CFG_OFFSET_MSIX (VIRTIO_LEGACY_MSIX_QUEUE + 2)
+
+#define VIRTIO_LEGACY_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * Bits in the Device Status byte (VIRTIO_LEGACY_DEVICE_STATUS):
+ */
+#define VIRTIO_STATUS_RESET 0
+#define VIRTIO_STATUS_ACKNOWLEDGE (1 << 0)
+#define VIRTIO_STATUS_DRIVER (1 << 1)
+#define VIRTIO_STATUS_DRIVER_OK (1 << 2)
+#define VIRTIO_STATUS_FAILED (1 << 7)
+
+/*
+ * Bits in the Interrupt Service Routine Status byte
+ * (VIRTIO_LEGACY_ISR_STATUS):
+ */
+#define VIRTIO_ISR_CHECK_QUEUES (1 << 0)
+#define VIRTIO_ISR_CHECK_CONFIG (1 << 1)
+
+/*
+ * Bits in the Features fields (VIRTIO_LEGACY_FEATURES_DEVICE,
+ * VIRTIO_LEGACY_FEATURES_DRIVER):
+ */
+#define VIRTIO_F_RING_INDIRECT_DESC (1ULL << 28)
+
+/*
+ * For devices operating in the legacy mode, virtqueues must be aligned on a
+ * "page size" of 4096 bytes; this is also called the "Queue Align" value in
+ * newer versions of the specification.
+ */
+#define VIRTIO_PAGE_SHIFT 12
+#define VIRTIO_PAGE_SIZE (1 << VIRTIO_PAGE_SHIFT)
+CTASSERT(VIRTIO_PAGE_SIZE == 4096);
+CTASSERT(ISP2(VIRTIO_PAGE_SIZE));
+
+/*
+ * DMA SYNCHRONISATION WRAPPERS
+ */
+
+/*
+ * Synchronise the driver-owned portion of the queue so that the device can see
+ * our writes. This covers the memory accessed via the "viq_dma_descs" and
+ * "viq_dma_device" members.
+ */
+#define VIRTQ_DMA_SYNC_FORDEV(viq) VERIFY0(ddi_dma_sync( \
+ (viq)->viq_dma.vidma_dma_handle, \
+ 0, \
+ (uintptr_t)(viq)->viq_dma_device - \
+ (uintptr_t)(viq)->viq_dma_descs, \
+ DDI_DMA_SYNC_FORDEV))
+
+/*
+ * Synchronise the device-owned portion of the queue so that we can see any
+ * writes from the device. This covers the memory accessed via the
+ * "viq_dma_device" member.
+ */
+#define VIRTQ_DMA_SYNC_FORKERNEL(viq) VERIFY0(ddi_dma_sync( \
+ (viq)->viq_dma.vidma_dma_handle, \
+ (uintptr_t)(viq)->viq_dma_device - \
+ (uintptr_t)(viq)->viq_dma_descs, \
+    (viq)->viq_dma.vidma_size - \
+    ((uintptr_t)(viq)->viq_dma_device - \
+    (uintptr_t)(viq)->viq_dma_descs), \
+ DDI_DMA_SYNC_FORKERNEL))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIRTIO_IMPL_H */
diff --git a/usr/src/uts/common/io/virtio/virtio_main.c b/usr/src/uts/common/io/virtio/virtio_main.c
new file mode 100644
index 0000000000..be92dacfba
--- /dev/null
+++ b/usr/src/uts/common/io/virtio/virtio_main.c
@@ -0,0 +1,1730 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * VIRTIO FRAMEWORK
+ *
+ * For design and usage documentation, see the comments in "virtio.h".
+ */
+
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/autoconf.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/avintr.h>
+#include <sys/spl.h>
+#include <sys/promif.h>
+#include <sys/list.h>
+#include <sys/bootconf.h>
+#include <sys/bootsvcs.h>
+#include <sys/sysmacros.h>
+#include <sys/pci.h>
+
+#include "virtio.h"
+#include "virtio_impl.h"
+
+
+/*
+ * Linkage structures
+ */
+static struct modlmisc virtio_modlmisc = {
+ .misc_modops = &mod_miscops,
+ .misc_linkinfo = "VIRTIO common routines",
+};
+
+static struct modlinkage virtio_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &virtio_modlmisc, NULL }
+};
+
+int
+_init(void)
+{
+ return (mod_install(&virtio_modlinkage));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&virtio_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&virtio_modlinkage, modinfop));
+}
+
+
+
+static void virtio_set_status(virtio_t *, uint8_t);
+static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
+ uint16_t);
+static int virtio_interrupts_setup(virtio_t *, int);
+static void virtio_interrupts_teardown(virtio_t *);
+static void virtio_interrupts_disable_locked(virtio_t *);
+static void virtio_queue_free(virtio_queue_t *);
+
+/*
+ * We use the same device access attributes for BAR mapping and access to the
+ * virtqueue memory.
+ */
+ddi_device_acc_attr_t virtio_acc_attr = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V1,
+ .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC,
+ .devacc_attr_dataorder = DDI_STORECACHING_OK_ACC,
+ .devacc_attr_access = DDI_DEFAULT_ACC
+};
+
+
+/*
+ * DMA attributes for the memory given to the device for queue management.
+ */
+ddi_dma_attr_t virtio_dma_attr_queue = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ /*
+ * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
+ * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
+ * 32-bit register.
+ */
+ .dma_attr_addr_hi = 0x00000FFFFFFFF000,
+ .dma_attr_count_max = 0x00000000FFFFFFFF,
+ .dma_attr_align = VIRTIO_PAGE_SIZE,
+ .dma_attr_burstsizes = 1,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0x00000000FFFFFFFF,
+ .dma_attr_seg = 0x00000000FFFFFFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
+};
+
+/*
+ * DMA attributes for the allocation of indirect descriptor lists. The
+ * indirect list is referenced by a regular descriptor entry: the physical
+ * address field is 64 bits wide, but the length field is only 32 bits. Each
+ * descriptor is 16 bytes long.
+ */
+ddi_dma_attr_t virtio_dma_attr_indirect = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00000000FFFFFFFF,
+ .dma_attr_align = sizeof (struct virtio_vq_desc),
+ .dma_attr_burstsizes = 1,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0x00000000FFFFFFFF,
+ .dma_attr_seg = 0x00000000FFFFFFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
+};
+
+
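+/*
+ * LEGACY BAR REGISTER ACCESS
+ *
+ * These routines read and write the legacy BAR at a raw byte offset. They
+ * are used directly by the framework, and by the virtio_dev_*() wrappers
+ * further below for access to the device-specific configuration region.
+ */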
+uint8_t
+virtio_get8(virtio_t *vio, uintptr_t offset)
+{
+ return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
+}
+
+uint16_t
+virtio_get16(virtio_t *vio, uintptr_t offset)
+{
+ return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
+}
+
+uint32_t
+virtio_get32(virtio_t *vio, uintptr_t offset)
+{
+ return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
+}
+
+void
+virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
+{
+ ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
+}
+
+void
+virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
+{
+ ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
+}
+
+void
+virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
+{
+ ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
+}
+
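+/*
+ * Tear down all framework state for a device: interrupt handlers, queues,
+ * and the BAR mapping. If "failed" is set, the FAILED status bit is left
+ * set for the host instead of resetting the device.
+ */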
+void
+virtio_fini(virtio_t *vio, boolean_t failed)
+{
+ mutex_enter(&vio->vio_mutex);
+
+ virtio_interrupts_teardown(vio);
+
+ virtio_queue_t *viq;
+ while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
+ virtio_queue_free(viq);
+ }
+ list_destroy(&vio->vio_queues);
+
+ if (failed) {
+ /*
+ * Signal to the host that device setup failed.
+ */
+ virtio_set_status(vio, VIRTIO_STATUS_FAILED);
+ } else {
+ virtio_device_reset(vio);
+ }
+
+ /*
+ * We don't need to do anything for the provider initlevel, as it
+ * merely records the fact that virtio_init_complete() was called.
+ */
+ vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;
+
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
+ /*
+ * Unmap PCI BAR0.
+ */
+ ddi_regs_map_free(&vio->vio_barh);
+
+ vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
+ }
+
+ /*
+ * Ensure we have torn down everything we set up.
+ */
+ VERIFY0(vio->vio_initlevel);
+
+ mutex_exit(&vio->vio_mutex);
+ mutex_destroy(&vio->vio_mutex);
+
+ kmem_free(vio, sizeof (*vio));
+}
+
+/*
+ * Early device initialisation for legacy (pre-1.0 specification) virtio
+ * devices.
+ */
+virtio_t *
+virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
+{
+ int r;
+
+ /*
+ * First, confirm that this is a legacy device.
+ */
+ ddi_acc_handle_t pci;
+ if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "pci_config_setup failed");
+ return (NULL);
+ }
+
+ uint8_t revid;
+ if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
+ dev_err(dip, CE_WARN, "could not read config space");
+ pci_config_teardown(&pci);
+ return (NULL);
+ }
+
+ pci_config_teardown(&pci);
+
+ /*
+ * The legacy specification requires that the device advertise as PCI
+ * Revision 0.
+ */
+ if (revid != 0) {
+ dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
+ "legacy virtio device", (uint_t)revid);
+ return (NULL);
+ }
+
+ virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
+ vio->vio_dip = dip;
+
+ /*
+ * Map PCI BAR0 for legacy device access.
+ */
+ if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
+ (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
+ &vio->vio_barh)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
+ kmem_free(vio, sizeof (*vio));
+ return (NULL);
+ }
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;
+
+ /*
+ * We initialise the mutex without an interrupt priority to ease the
+ * implementation of some of the configuration space access routines.
+ * Drivers using the virtio framework MUST make a call to
+ * "virtio_init_complete()" prior to spawning other threads or enabling
+ * interrupt handlers, at which time we will destroy and reinitialise
+ * the mutex for use in our interrupt handlers.
+ */
+ mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);
+
+ list_create(&vio->vio_queues, sizeof (virtio_queue_t),
+ offsetof(virtio_queue_t, viq_link));
+
+ /*
+ * Legacy virtio devices require a few common steps before we can
+ * negotiate device features.
+ */
+ virtio_device_reset(vio);
+ virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
+ virtio_set_status(vio, VIRTIO_STATUS_DRIVER);
+
+ /*
+ * Negotiate features with the device. Record the original supported
+ * feature set for debugging purposes.
+ */
+ vio->vio_features_device = virtio_get32(vio,
+ VIRTIO_LEGACY_FEATURES_DEVICE);
+ if (allow_indirect) {
+ driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
+ }
+ vio->vio_features = vio->vio_features_device & driver_features;
+ virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);
+
+ /*
+ * The device-specific configuration begins at an offset into the BAR
+ * that depends on whether we have enabled MSI-X interrupts or not.
+ * Start out with the offset for pre-MSI-X operation so that we can
+ * read device configuration space prior to configuring interrupts.
+ */
+ vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
+
+ return (vio);
+}
+
+/*
+ * This function must be called by the driver once it has completed early setup
+ * calls.
+ */
+int
+virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
+{
+ VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;
+
+ if (!list_is_empty(&vio->vio_queues)) {
+ /*
+ * Set up interrupts for the queues that have been registered.
+ */
+ if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
+ DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+ }
+
+ /*
+	 * Now that we know the interrupt priority, we can reinitialise the mutexes.
+ */
+ mutex_destroy(&vio->vio_mutex);
+ mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
+ viq = list_next(&vio->vio_queues, viq)) {
+ mutex_destroy(&viq->viq_mutex);
+ mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
+ virtio_intr_pri(vio));
+ }
+
+ virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);
+
+ return (DDI_SUCCESS);
+}
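+
+/*
+ * An illustrative sketch only (see "virtio.h" for the authoritative usage
+ * documentation): a client driver's attach path is expected to look roughly
+ * like the following, using the routines in this file. The MYDEV_* names,
+ * handler, and arguments below are hypothetical and error handling is
+ * elided.
+ *
+ *	vio = virtio_init(dip, MYDEV_WANTED_FEATURES, B_TRUE);
+ *	vq = virtio_queue_alloc(vio, 0, "rx", mydev_rx_handler, mydev,
+ *	    B_FALSE, MYDEV_MAX_SEGS);
+ *	(void) virtio_init_complete(vio, 0);
+ *	(void) virtio_interrupts_enable(vio);
+ */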
+
+boolean_t
+virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
+{
+ return ((vio->vio_features & feature_mask) != 0);
+}
+
+void *
+virtio_intr_pri(virtio_t *vio)
+{
+ VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);
+
+ return (DDI_INTR_PRI(vio->vio_interrupt_priority));
+}
+
+/*
+ * Enable a bit in the device status register. Each bit signals a level of
+ * guest readiness to the host. Use the VIRTIO_STATUS_* constants for
+ * "status". To zero the status field use virtio_device_reset().
+ */
+static void
+virtio_set_status(virtio_t *vio, uint8_t status)
+{
+ VERIFY3U(status, !=, 0);
+
+ mutex_enter(&vio->vio_mutex);
+
+ uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
+ virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);
+
+ mutex_exit(&vio->vio_mutex);
+}
+
+static void
+virtio_device_reset_locked(virtio_t *vio)
+{
+ virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
+}
+
+void
+virtio_device_reset(virtio_t *vio)
+{
+ mutex_enter(&vio->vio_mutex);
+ virtio_device_reset_locked(vio);
+ mutex_exit(&vio->vio_mutex);
+}
+
+/*
+ * Some queues are effectively long-polled; the driver submits a series of
+ * buffers and the device only returns them when there is data available.
+ * During detach, we need to coordinate the return of these buffers. Calling
+ * "virtio_shutdown()" will reset the device, then allow the removal of all
+ * buffers that were in flight at the time of shutdown via
+ * "virtio_queue_evacuate()".
+ */
+void
+virtio_shutdown(virtio_t *vio)
+{
+ mutex_enter(&vio->vio_mutex);
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
+ /*
+ * Shutdown has been performed already.
+ */
+ mutex_exit(&vio->vio_mutex);
+ return;
+ }
+
+ /*
+ * First, mark all of the queues as shutdown. This will prevent any
+ * further activity.
+ */
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
+ viq = list_next(&vio->vio_queues, viq)) {
+ mutex_enter(&viq->viq_mutex);
+ viq->viq_shutdown = B_TRUE;
+ mutex_exit(&viq->viq_mutex);
+ }
+
+ /*
+ * Now, reset the device. This removes any queue configuration on the
+ * device side.
+ */
+ virtio_device_reset_locked(vio);
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
+ mutex_exit(&vio->vio_mutex);
+}
+
+/*
+ * Common implementation of quiesce(9E) for simple Virtio-based devices.
+ */
+int
+virtio_quiesce(virtio_t *vio)
+{
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
+ /*
+ * Device has already been reset.
+ */
+ return (DDI_SUCCESS);
+ }
+
+ /*
+ * When we reset the device, it should immediately stop using any DMA
+ * memory we've previously passed to it. All queue configuration is
+ * discarded. This is good enough for quiesce(9E).
+ */
+ virtio_device_reset_locked(vio);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * DEVICE-SPECIFIC REGISTER ACCESS
+ *
+ * Note that these functions take the mutex to avoid racing with interrupt
+ * enable/disable, when the device-specific offset can potentially change.
+ */
+
+uint8_t
+virtio_dev_get8(virtio_t *vio, uintptr_t offset)
+{
+ mutex_enter(&vio->vio_mutex);
+ uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
+ mutex_exit(&vio->vio_mutex);
+
+ return (r);
+}
+
+uint16_t
+virtio_dev_get16(virtio_t *vio, uintptr_t offset)
+{
+ mutex_enter(&vio->vio_mutex);
+ uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
+ mutex_exit(&vio->vio_mutex);
+
+ return (r);
+}
+
+uint32_t
+virtio_dev_get32(virtio_t *vio, uintptr_t offset)
+{
+ mutex_enter(&vio->vio_mutex);
+ uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
+ mutex_exit(&vio->vio_mutex);
+
+ return (r);
+}
+
+uint64_t
+virtio_dev_get64(virtio_t *vio, uintptr_t offset)
+{
+ mutex_enter(&vio->vio_mutex);
+ /*
+ * On at least some systems, a 64-bit read or write to this BAR is not
+ * possible. For legacy devices, there is no generation number to use
+ * to determine if configuration may have changed half-way through a
+ * read. We need to continue to read both halves of the value until we
+ * read the same value at least twice.
+ */
+ uintptr_t o_lo = vio->vio_config_offset + offset;
+ uintptr_t o_hi = o_lo + 4;
+
+ uint64_t val = virtio_get32(vio, o_lo) |
+ ((uint64_t)virtio_get32(vio, o_hi) << 32);
+
+ for (;;) {
+ uint64_t tval = virtio_get32(vio, o_lo) |
+ ((uint64_t)virtio_get32(vio, o_hi) << 32);
+
+ if (tval == val) {
+ break;
+ }
+
+ val = tval;
+ }
+
+ mutex_exit(&vio->vio_mutex);
+ return (val);
+}
+
+void
+virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
+{
+ mutex_enter(&vio->vio_mutex);
+ virtio_put8(vio, vio->vio_config_offset + offset, value);
+ mutex_exit(&vio->vio_mutex);
+}
+
+void
+virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
+{
+ mutex_enter(&vio->vio_mutex);
+ virtio_put16(vio, vio->vio_config_offset + offset, value);
+ mutex_exit(&vio->vio_mutex);
+}
+
+void
+virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
+{
+ mutex_enter(&vio->vio_mutex);
+ virtio_put32(vio, vio->vio_config_offset + offset, value);
+ mutex_exit(&vio->vio_mutex);
+}
+
+/*
+ * VIRTQUEUE MANAGEMENT
+ */
+
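+/*
+ * AVL comparator ordering in-flight chains by the index of their first
+ * (head) descriptor.
+ */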
+static int
+virtio_inflight_compar(const void *lp, const void *rp)
+{
+ const virtio_chain_t *l = lp;
+ const virtio_chain_t *r = rp;
+
+ if (l->vic_head < r->vic_head) {
+ return (-1);
+ } else if (l->vic_head > r->vic_head) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+virtio_queue_t *
+virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
+ ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
+ uint_t max_segs)
+{
+ uint16_t qsz;
+ char space_name[256];
+
+ if (max_segs < 1) {
+ /*
+ * Every descriptor, direct or indirect, needs to refer to at
+ * least one buffer.
+ */
+ dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
+ "segment count must be at least 1", name, (uint_t)qidx);
+ return (NULL);
+ }
+
+ mutex_enter(&vio->vio_mutex);
+
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
+ /*
+ * Cannot configure any more queues once initial setup is
+ * complete and interrupts have been allocated.
+ */
+ dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
+ "alloc after init complete", name, (uint_t)qidx);
+ mutex_exit(&vio->vio_mutex);
+ return (NULL);
+ }
+
+ /*
+ * There is no way to negotiate a different queue size for legacy
+ * devices. We must read and use the native queue size of the device.
+ */
+ virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
+ if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
+ /*
+ * A size of zero means the device does not have a queue with
+ * this index.
+ */
+ dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
+ "does not exist on device", name, (uint_t)qidx);
+ mutex_exit(&vio->vio_mutex);
+ return (NULL);
+ }
+
+ mutex_exit(&vio->vio_mutex);
+
+ virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
+ viq->viq_virtio = vio;
+ viq->viq_name = name;
+ viq->viq_index = qidx;
+ viq->viq_size = qsz;
+ viq->viq_func = func;
+ viq->viq_funcarg = funcarg;
+ viq->viq_max_segs = max_segs;
+ avl_create(&viq->viq_inflight, virtio_inflight_compar,
+ sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));
+
+ /*
+ * Allocate the mutex without an interrupt priority for now, as we do
+ * with "vio_mutex". We'll reinitialise it in
+ * "virtio_init_complete()".
+ */
+ mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);
+
+ if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
+ !force_direct) {
+ /*
+ * If we were able to negotiate the indirect descriptor
+ * feature, and the caller has not explicitly forced the use of
+ * direct descriptors, we'll allocate indirect descriptor lists
+ * for each chain.
+ */
+ viq->viq_indirect = B_TRUE;
+ }
+
+ /*
+ * Track descriptor usage in an identifier space.
+ */
+ (void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
+ ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
+ if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
+ dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
+ "ID space");
+ virtio_queue_free(viq);
+ return (NULL);
+ }
+
+ /*
+ * For legacy devices, memory for the queue has a strict layout
+ * determined by the queue size.
+ */
+ size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
+ size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
+ sizeof (virtio_vq_driver_t) +
+ sizeof (uint16_t) * qsz,
+ VIRTIO_PAGE_SIZE, size_t);
+ size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
+ sizeof (virtio_vq_elem_t) * qsz,
+ VIRTIO_PAGE_SIZE, size_t);
+
+ if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
+ &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
+ KM_SLEEP) != DDI_SUCCESS) {
+ dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
+ "DMA memory");
+ virtio_queue_free(viq);
+ return (NULL);
+ }
+
+ /*
+ * NOTE: The viq_dma_* members below are used by
+ * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
+ * offsets into the DMA allocation for partial synchronisation. If the
+ * ordering of, or relationship between, these pointers changes, the
+ * macros must be kept in sync.
+ */
+ viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
+ viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
+ viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);
+
+ /*
+ * Install in the per-device list of queues.
+ */
+ mutex_enter(&vio->vio_mutex);
+ for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
+ chkvq = list_next(&vio->vio_queues, chkvq)) {
+ if (chkvq->viq_index == qidx) {
+ dev_err(vio->vio_dip, CE_WARN, "attempt to register "
+ "queue \"%s\" with same index (%d) as queue \"%s\"",
+ name, qidx, chkvq->viq_name);
+ mutex_exit(&vio->vio_mutex);
+ virtio_queue_free(viq);
+ return (NULL);
+ }
+ }
+ list_insert_tail(&vio->vio_queues, viq);
+
+ /*
+ * Ensure the zeroing of the queue memory is visible to the host before
+ * we inform the device of the queue address.
+ */
+ membar_producer();
+ VIRTQ_DMA_SYNC_FORDEV(viq);
+
+ virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
+ virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
+ virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);
+
+ mutex_exit(&vio->vio_mutex);
+ return (viq);
+}
+
+static void
+virtio_queue_free(virtio_queue_t *viq)
+{
+ virtio_t *vio = viq->viq_virtio;
+
+ /*
+ * We are going to destroy the queue mutex. Make sure we've already
+ * removed the interrupt handlers.
+ */
+ VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
+
+ mutex_enter(&viq->viq_mutex);
+
+ /*
+ * If the device has not already been reset as part of a shutdown,
+ * detach the queue from the device now.
+ */
+ if (!viq->viq_shutdown) {
+ virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
+ virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
+ }
+
+ virtio_dma_fini(&viq->viq_dma);
+
+ VERIFY(avl_is_empty(&viq->viq_inflight));
+ avl_destroy(&viq->viq_inflight);
+ if (viq->viq_descmap != NULL) {
+ id_space_destroy(viq->viq_descmap);
+ }
+
+ mutex_exit(&viq->viq_mutex);
+ mutex_destroy(&viq->viq_mutex);
+
+ kmem_free(viq, sizeof (*viq));
+}
+
+void
+virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
+{
+ mutex_enter(&viq->viq_mutex);
+
+ if (stop_interrupts) {
+ viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
+ } else {
+ viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
+ }
+ VIRTQ_DMA_SYNC_FORDEV(viq);
+
+ mutex_exit(&viq->viq_mutex);
+}
+
+static virtio_chain_t *
+virtio_queue_complete(virtio_queue_t *viq, uint_t index)
+{
+ VERIFY(MUTEX_HELD(&viq->viq_mutex));
+
+ virtio_chain_t *vic;
+
+ virtio_chain_t search;
+ bzero(&search, sizeof (search));
+ search.vic_head = index;
+
+ if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
+ return (NULL);
+ }
+ avl_remove(&viq->viq_inflight, vic);
+
+ return (vic);
+}
+
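+/*
+ * Accessors reporting the fixed size of the queue (in descriptor entries)
+ * and the number of chains currently in flight on the device.
+ */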
+uint_t
+virtio_queue_size(virtio_queue_t *viq)
+{
+ return (viq->viq_size);
+}
+
+uint_t
+virtio_queue_nactive(virtio_queue_t *viq)
+{
+ mutex_enter(&viq->viq_mutex);
+ uint_t r = avl_numnodes(&viq->viq_inflight);
+ mutex_exit(&viq->viq_mutex);
+
+ return (r);
+}
+
+virtio_chain_t *
+virtio_queue_poll(virtio_queue_t *viq)
+{
+ mutex_enter(&viq->viq_mutex);
+ if (viq->viq_shutdown) {
+ /*
+ * The device has been reset by virtio_shutdown(), and queue
+ * processing has been halted. Any previously submitted chains
+ * will be evacuated using virtio_queue_evacuate().
+ */
+ mutex_exit(&viq->viq_mutex);
+ return (NULL);
+ }
+
+ VIRTQ_DMA_SYNC_FORKERNEL(viq);
+ if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
+ /*
+ * If the device index has not changed since the last poll,
+ * there are no new chains to process.
+ */
+ mutex_exit(&viq->viq_mutex);
+ return (NULL);
+ }
+
+ /*
+ * We need to ensure that all reads from the descriptor (vqde_ring[])
+	 * and any memory referenced by the descriptor occur after we have read
+ * the descriptor index value above (vqde_index).
+ */
+ membar_consumer();
+
+ uint16_t index = (viq->viq_device_index++) % viq->viq_size;
+ uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
+ uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;
+
+ virtio_chain_t *vic;
+ if ((vic = virtio_queue_complete(viq, start)) == NULL) {
+ /*
+ * We could not locate a chain for this descriptor index, which
+ * suggests that something has gone horribly wrong.
+ */
+ dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
+ "queue \"%s\" ring entry %u (descriptor %u) has no chain",
+ viq->viq_name, (uint16_t)index, (uint16_t)start);
+ }
+
+ vic->vic_received_length = len;
+
+ mutex_exit(&viq->viq_mutex);
+
+ return (vic);
+}
+
+/*
+ * After a call to "virtio_shutdown()", the driver must retrieve any previously
+ * submitted chains and free any associated resources.
+ */
+virtio_chain_t *
+virtio_queue_evacuate(virtio_queue_t *viq)
+{
+ virtio_t *vio = viq->viq_virtio;
+
+ mutex_enter(&vio->vio_mutex);
+ if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
+ dev_err(vio->vio_dip, CE_PANIC,
+ "virtio_queue_evacuate() without virtio_shutdown()");
+ }
+ mutex_exit(&vio->vio_mutex);
+
+ mutex_enter(&viq->viq_mutex);
+ VERIFY(viq->viq_shutdown);
+
+ virtio_chain_t *vic = avl_first(&viq->viq_inflight);
+ if (vic != NULL) {
+ avl_remove(&viq->viq_inflight, vic);
+ }
+
+ mutex_exit(&viq->viq_mutex);
+
+ return (vic);
+}
+
+/*
+ * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
+ */
+
+/*
+ * When the device returns a descriptor chain to the driver, it may provide the
+ * length in bytes of data written into the chain. Client drivers should use
+ * this value with care; the specification suggests some device implementations
+ * have not always provided a useful or correct value.
+ */
+size_t
+virtio_chain_received_length(virtio_chain_t *vic)
+{
+ return (vic->vic_received_length);
+}
+
+/*
+ * Allocate a descriptor chain for use with this queue. The "kmflags" value
+ * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
+ */
+virtio_chain_t *
+virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
+{
+ virtio_t *vio = viq->viq_virtio;
+ virtio_chain_t *vic;
+ uint_t cap;
+
+ /*
+ * Direct descriptors are known by their index in the descriptor table
+ * for the queue. We use the variable-length array member at the end
+ * of the chain tracking object to hold the list of direct descriptors
+ * assigned to this chain.
+ */
+ if (viq->viq_indirect) {
+ /*
+ * When using indirect descriptors we still need one direct
+ * descriptor entry to hold the physical address and length of
+ * the indirect descriptor table.
+ */
+ cap = 1;
+ } else {
+ /*
+ * For direct descriptors we need to be able to track a
+ * descriptor for each possible segment in a single chain.
+ */
+ cap = viq->viq_max_segs;
+ }
+
+ size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
+ if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
+ return (NULL);
+ }
+ vic->vic_vq = viq;
+ vic->vic_direct_capacity = cap;
+
+ if (viq->viq_indirect) {
+ /*
+ * Allocate an indirect descriptor list with the appropriate
+ * number of entries.
+ */
+ if (virtio_dma_init(vio, &vic->vic_indirect_dma,
+ sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
+ &virtio_dma_attr_indirect,
+ DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
+ kmflags) != DDI_SUCCESS) {
+ goto fail;
+ }
+
+ /*
+ * Allocate a single descriptor to hold the indirect list.
+ * Leave the length as zero for now; it will be set to include
+ * any occupied entries at push time.
+ */
+ mutex_enter(&viq->viq_mutex);
+ if (virtio_chain_append_impl(vic,
+ virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
+ VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
+ mutex_exit(&viq->viq_mutex);
+ goto fail;
+ }
+ mutex_exit(&viq->viq_mutex);
+ VERIFY3U(vic->vic_direct_used, ==, 1);
+
+ /*
+ * Don't set the indirect capacity until after we've installed
+ * the direct descriptor which points at the indirect list, or
+ * virtio_chain_append_impl() will be confused.
+ */
+ vic->vic_indirect_capacity = viq->viq_max_segs;
+ }
+
+ return (vic);
+
+fail:
+ virtio_dma_fini(&vic->vic_indirect_dma);
+ kmem_free(vic, vicsz);
+ return (NULL);
+}
+
+void *
+virtio_chain_data(virtio_chain_t *vic)
+{
+ return (vic->vic_data);
+}
+
+void
+virtio_chain_data_set(virtio_chain_t *vic, void *data)
+{
+ vic->vic_data = data;
+}
+
+void
+virtio_chain_clear(virtio_chain_t *vic)
+{
+ if (vic->vic_indirect_capacity != 0) {
+ /*
+ * There should only be one direct descriptor, which points at
+ * our indirect descriptor list. We don't want to clear it
+ * here.
+ */
+ VERIFY3U(vic->vic_direct_capacity, ==, 1);
+
+ if (vic->vic_indirect_used > 0) {
+ /*
+ * Clear out the indirect descriptor table.
+ */
+ vic->vic_indirect_used = 0;
+ bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
+ virtio_dma_size(&vic->vic_indirect_dma));
+ }
+
+ } else if (vic->vic_direct_capacity > 0) {
+ /*
+ * Release any descriptors that were assigned to us previously.
+ */
+ for (uint_t i = 0; i < vic->vic_direct_used; i++) {
+ id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
+ vic->vic_direct[i] = 0;
+ }
+ vic->vic_direct_used = 0;
+ }
+}
+
+void
+virtio_chain_free(virtio_chain_t *vic)
+{
+ /*
+ * First ensure that we have released any descriptors used by this
+ * chain.
+ */
+ virtio_chain_clear(vic);
+
+ if (vic->vic_indirect_capacity > 0) {
+ /*
+ * Release the direct descriptor that points to our indirect
+ * descriptor list.
+ */
+ VERIFY3U(vic->vic_direct_capacity, ==, 1);
+ id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);
+
+ virtio_dma_fini(&vic->vic_indirect_dma);
+ }
+
+ size_t vicsz = sizeof (*vic) +
+ vic->vic_direct_capacity * sizeof (uint16_t);
+
+ kmem_free(vic, vicsz);
+}
+
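+/*
+ * Allocate a free descriptor table index from the queue identifier space.
+ * Returns ENOMEM if every descriptor in the table is currently in use.
+ */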
+static inline int
+virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
+{
+ id_t index;
+
+ if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
+ return (ENOMEM);
+ }
+
+ VERIFY3S(index, >=, 0);
+ VERIFY3S(index, <=, viq->viq_size);
+
+ *indexp = (uint_t)index;
+ return (0);
+}
+
+static int
+virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
+ uint16_t flags)
+{
+ virtio_queue_t *viq = vic->vic_vq;
+ virtio_vq_desc_t *vqd;
+ uint_t index;
+
+ /*
+ * We're modifying the queue-wide descriptor list so make sure we have
+ * the appropriate lock.
+ */
+ VERIFY(MUTEX_HELD(&viq->viq_mutex));
+
+ if (vic->vic_indirect_capacity != 0) {
+ /*
+ * Use indirect descriptors.
+ */
+ if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
+ return (DDI_FAILURE);
+ }
+
+ vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);
+
+ if ((index = vic->vic_indirect_used++) > 0) {
+ /*
+ * Chain the current last indirect descriptor to the
+ * new one.
+ */
+ vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
+ vqd[index - 1].vqd_next = index;
+ }
+
+ } else {
+ /*
+ * Use direct descriptors.
+ */
+ if (vic->vic_direct_used >= vic->vic_direct_capacity) {
+ return (DDI_FAILURE);
+ }
+
+ if (virtio_queue_descmap_alloc(viq, &index) != 0) {
+ return (DDI_FAILURE);
+ }
+
+ vqd = virtio_dma_va(&viq->viq_dma, 0);
+
+ if (vic->vic_direct_used > 0) {
+ /*
+			 * This is not the first entry. Chain the previously
+			 * appended descriptor to this new one.
+ */
+ uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];
+
+ vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
+ vqd[p].vqd_next = index;
+ }
+ vic->vic_direct[vic->vic_direct_used++] = index;
+ }
+
+ vqd[index].vqd_addr = pa;
+ vqd[index].vqd_len = len;
+ vqd[index].vqd_flags = flags;
+ vqd[index].vqd_next = 0;
+
+ return (DDI_SUCCESS);
+}
+
+int
+virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
+ virtio_direction_t dir)
+{
+ virtio_queue_t *viq = vic->vic_vq;
+ uint16_t flags = 0;
+
+ switch (dir) {
+ case VIRTIO_DIR_DEVICE_WRITES:
+ flags |= VIRTQ_DESC_F_WRITE;
+ break;
+
+ case VIRTIO_DIR_DEVICE_READS:
+ break;
+
+ default:
+ panic("unknown direction value %u", dir);
+ }
+
+ mutex_enter(&viq->viq_mutex);
+ int r = virtio_chain_append_impl(vic, pa, len, flags);
+ mutex_exit(&viq->viq_mutex);
+
+ return (r);
+}
+
+static void
+virtio_queue_flush_locked(virtio_queue_t *viq)
+{
+ VERIFY(MUTEX_HELD(&viq->viq_mutex));
+
+ /*
+	 * Make sure any writes we have just made to the descriptors and to the
+	 * driver ring entries (vqdr_ring[]) are visible to the device before we
+	 * update the ring pointer (vqdr_index).
+ */
+ membar_producer();
+ viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
+ VIRTQ_DMA_SYNC_FORDEV(viq);
+
+ /*
+ * Determine whether the device expects us to notify it of new
+ * descriptors.
+ */
+ VIRTQ_DMA_SYNC_FORKERNEL(viq);
+ if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
+ virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
+ viq->viq_index);
+ }
+}
+
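+/*
+ * Make previously submitted chains visible to the device, notifying it if it
+ * has not suppressed notifications for this queue.
+ */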
+void
+virtio_queue_flush(virtio_queue_t *viq)
+{
+ mutex_enter(&viq->viq_mutex);
+ virtio_queue_flush_locked(viq);
+ mutex_exit(&viq->viq_mutex);
+}
+
+void
+virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
+{
+ virtio_queue_t *viq = vic->vic_vq;
+
+ mutex_enter(&viq->viq_mutex);
+
+ if (vic->vic_indirect_capacity != 0) {
+ virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);
+
+ VERIFY3U(vic->vic_direct_used, ==, 1);
+
+ /*
+ * This is an indirect descriptor queue. The length in bytes
+ * of the descriptor must extend to cover the populated
+ * indirect descriptor entries.
+ */
+ vqd[vic->vic_direct[0]].vqd_len =
+ sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;
+
+ virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
+ }
+
+ /*
+ * Populate the next available slot in the driver-owned ring for this
+	 * chain. The updated value of viq_driver_index will not be visible to
+	 * the device until a subsequent queue flush.
+ */
+ uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
+ viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];
+
+ vic->vic_head = vic->vic_direct[0];
+ avl_add(&viq->viq_inflight, vic);
+
+ if (flush) {
+ virtio_queue_flush_locked(vic->vic_vq);
+ }
+
+ mutex_exit(&viq->viq_mutex);
+}
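+
+/*
+ * An illustrative sketch only: a caller might populate and submit a chain
+ * along these lines, using the routines above. The buffer names are
+ * hypothetical and error handling is elided.
+ *
+ *	vic = virtio_chain_alloc(vq, KM_SLEEP);
+ *	(void) virtio_chain_append(vic, buf_pa, buf_len,
+ *	    VIRTIO_DIR_DEVICE_WRITES);
+ *	virtio_chain_data_set(vic, mybuf);
+ *	virtio_chain_submit(vic, B_TRUE);
+ *
+ * A completed chain is later retrieved with virtio_queue_poll(), and the
+ * number of bytes written by the device is available from
+ * virtio_chain_received_length().
+ */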
+
+/*
+ * INTERRUPTS MANAGEMENT
+ */
+
+static const char *
+virtio_interrupt_type_name(int type)
+{
+ switch (type) {
+ case DDI_INTR_TYPE_MSIX:
+ return ("MSI-X");
+ case DDI_INTR_TYPE_MSI:
+ return ("MSI");
+ case DDI_INTR_TYPE_FIXED:
+ return ("fixed");
+ default:
+ return ("?");
+ }
+}
+
+static int
+virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
+{
+ dev_info_t *dip = vio->vio_dip;
+ int nintrs = 0;
+ int navail = 0;
+
+ VERIFY(MUTEX_HELD(&vio->vio_mutex));
+ VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));
+
+ if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count %s interrupts",
+ virtio_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (nintrs < 1) {
+ dev_err(dip, CE_WARN, "no %s interrupts supported",
+ virtio_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count available %s interrupts",
+ virtio_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (navail < nrequired) {
+ dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
+ "available", nrequired, virtio_interrupt_type_name(type),
+ navail);
+ return (DDI_FAILURE);
+ }
+
+ VERIFY3P(vio->vio_interrupts, ==, NULL);
+ vio->vio_interrupts = kmem_zalloc(
+ sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);
+
+ int r;
+ if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
+ &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
+ virtio_interrupt_type_name(type), r);
+ kmem_free(vio->vio_interrupts,
+ sizeof (ddi_intr_handle_t) * nrequired);
+ vio->vio_interrupts = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
+ vio->vio_interrupt_type = type;
+ return (DDI_SUCCESS);
+}
+
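+/*
+ * Shared handler used with fixed interrupts to multiplex the per-queue
+ * handlers provided by the client driver.
+ */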
+static uint_t
+virtio_shared_isr(caddr_t arg0, caddr_t arg1)
+{
+ virtio_t *vio = (virtio_t *)arg0;
+ uint_t r = DDI_INTR_UNCLAIMED;
+ uint8_t isr;
+
+ mutex_enter(&vio->vio_mutex);
+
+ /*
+ * Check the ISR status to see if the interrupt applies to us. Reading
+ * this field resets it to zero.
+ */
+ isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
+ if ((isr & VIRTIO_ISR_CHECK_QUEUES) == 0) {
+ goto done;
+ }
+
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
+ viq = list_next(&vio->vio_queues, viq)) {
+ if (viq->viq_func != NULL) {
+ mutex_exit(&vio->vio_mutex);
+ if (viq->viq_func(viq->viq_funcarg, arg0) ==
+ DDI_INTR_CLAIMED) {
+ r = DDI_INTR_CLAIMED;
+ }
+ mutex_enter(&vio->vio_mutex);
+
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
+ /*
+ * The device was shut down while in a queue
+ * handler routine.
+ */
+ goto done;
+ }
+ }
+ }
+
+done:
+ mutex_exit(&vio->vio_mutex);
+ return (r);
+}
+
+static int
+virtio_interrupts_setup(virtio_t *vio, int allow_types)
+{
+ dev_info_t *dip = vio->vio_dip;
+ int types;
+ int count = 0;
+
+ mutex_enter(&vio->vio_mutex);
+
+ /*
+ * Determine the number of interrupts we'd like based on the number of
+ * virtqueues.
+ */
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
+ viq = list_next(&vio->vio_queues, viq)) {
+ if (viq->viq_func != NULL) {
+ count++;
+ }
+ }
+
+ if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not get supported interrupts");
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_FAILURE);
+ }
+
+ if (allow_types != 0) {
+ /*
+ * Restrict the possible interrupt types at the request of the
+ * driver.
+ */
+ types &= allow_types;
+ }
+
+ /*
+ * Try each potential interrupt type in descending order of preference.
+ * Note that the specification does not appear to allow for the use of
+ * classical MSI, so we are limited to either MSI-X or fixed
+ * interrupts.
+ */
+ if (types & DDI_INTR_TYPE_MSIX) {
+ if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
+ count) == DDI_SUCCESS) {
+ goto add_handlers;
+ }
+ }
+ if (types & DDI_INTR_TYPE_FIXED) {
+ /*
+ * If fixed interrupts are all that are available, we'll just
+ * ask for one.
+ */
+ if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
+ DDI_SUCCESS) {
+ goto add_handlers;
+ }
+ }
+
+ dev_err(dip, CE_WARN, "interrupt allocation failed");
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_FAILURE);
+
+add_handlers:
+ /*
+ * Ensure that we have not been given any high-level interrupts as our
+ * interrupt handlers do not support them.
+ */
+ for (int i = 0; i < vio->vio_ninterrupts; i++) {
+ uint_t ipri;
+
+ if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not determine interrupt "
+ "priority");
+ goto fail;
+ }
+
+ if (ipri >= ddi_intr_get_hilevel_pri()) {
+ dev_err(dip, CE_WARN, "high level interrupts not "
+ "supported");
+ goto fail;
+ }
+
+ /*
+ * Record the highest priority we've been allocated to use for
+ * mutex initialisation.
+ */
+ if (i == 0 || ipri > vio->vio_interrupt_priority) {
+ vio->vio_interrupt_priority = ipri;
+ }
+ }
+
+ /*
+ * Get the interrupt capabilities from the first handle to determine
+ * whether we need to use ddi_intr_block_enable(9F).
+ */
+ if (ddi_intr_get_cap(vio->vio_interrupts[0],
+ &vio->vio_interrupt_cap) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
+ goto fail;
+ }
+
+ if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
+ VERIFY3S(vio->vio_ninterrupts, ==, 1);
+ /*
+ * For fixed interrupts, we need to use our shared handler to
+ * multiplex the per-queue handlers provided by the driver.
+ */
+ if (ddi_intr_add_handler(vio->vio_interrupts[0],
+ virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "adding shared %s interrupt "
+ "handler failed", virtio_interrupt_type_name(
+ vio->vio_interrupt_type));
+ goto fail;
+ }
+
+ goto done;
+ }
+
+ VERIFY3S(vio->vio_ninterrupts, ==, count);
+
+ uint_t n = 0;
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
+ viq = list_next(&vio->vio_queues, viq)) {
+ if (viq->viq_func == NULL) {
+ continue;
+ }
+
+ if (ddi_intr_add_handler(vio->vio_interrupts[n],
+ viq->viq_func, (caddr_t)viq->viq_funcarg,
+ (caddr_t)vio) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
+ n, viq->viq_name);
+ goto fail;
+ }
+
+ viq->viq_handler_index = n;
+ viq->viq_handler_added = B_TRUE;
+ n++;
+ }
+
+done:
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_SUCCESS);
+
+fail:
+ virtio_interrupts_teardown(vio);
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_FAILURE);
+}
+
+static void
+virtio_interrupts_teardown(virtio_t *vio)
+{
+ VERIFY(MUTEX_HELD(&vio->vio_mutex));
+
+ virtio_interrupts_disable_locked(vio);
+
+ if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
+ /*
+ * Remove the multiplexing interrupt handler.
+ */
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
+ int r;
+
+ VERIFY3S(vio->vio_ninterrupts, ==, 1);
+
+ if ((r = ddi_intr_remove_handler(
+ vio->vio_interrupts[0])) != DDI_SUCCESS) {
+ dev_err(vio->vio_dip, CE_WARN, "removing "
+ "shared interrupt handler failed (%d)", r);
+ }
+ }
+ } else {
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues);
+ viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
+ int r;
+
+ if (!viq->viq_handler_added) {
+ continue;
+ }
+
+ if ((r = ddi_intr_remove_handler(
+ vio->vio_interrupts[viq->viq_handler_index])) !=
+ DDI_SUCCESS) {
+ dev_err(vio->vio_dip, CE_WARN, "removing "
+ "interrupt handler (%s) failed (%d)",
+ viq->viq_name, r);
+ }
+
+ viq->viq_handler_added = B_FALSE;
+ }
+ }
+ vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;
+
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
+ for (int i = 0; i < vio->vio_ninterrupts; i++) {
+ int r;
+
+ if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
+ DDI_SUCCESS) {
+ dev_err(vio->vio_dip, CE_WARN, "freeing "
+ "interrupt %u failed (%d)", i, r);
+ }
+ }
+ kmem_free(vio->vio_interrupts,
+ sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
+ vio->vio_interrupts = NULL;
+ vio->vio_ninterrupts = 0;
+ vio->vio_interrupt_type = 0;
+ vio->vio_interrupt_cap = 0;
+ vio->vio_interrupt_priority = 0;
+
+ vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
+ }
+}
+
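+/*
+ * Undo interrupt enablement: detach queues from their MSI-X vectors, disable
+ * the allocated interrupts, and restore the pre-MSI-X configuration offset.
+ */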
+static void
+virtio_interrupts_unwind(virtio_t *vio)
+{
+ VERIFY(MUTEX_HELD(&vio->vio_mutex));
+
+ if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues);
+ viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
+ if (!viq->viq_handler_added) {
+ continue;
+ }
+
+ virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
+ viq->viq_index);
+ virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
+ VIRTIO_LEGACY_MSI_NO_VECTOR);
+ }
+ }
+
+ if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ (void) ddi_intr_block_disable(vio->vio_interrupts,
+ vio->vio_ninterrupts);
+ } else {
+ for (int i = 0; i < vio->vio_ninterrupts; i++) {
+ (void) ddi_intr_disable(vio->vio_interrupts[i]);
+ }
+ }
+
+ /*
+ * Disabling the interrupts makes the MSI-X fields disappear from the
+ * BAR once more.
+ */
+ vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
+}
+
+int
+virtio_interrupts_enable(virtio_t *vio)
+{
+ mutex_enter(&vio->vio_mutex);
+ if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_SUCCESS);
+ }
+
+ int r = DDI_SUCCESS;
+ if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ r = ddi_intr_block_enable(vio->vio_interrupts,
+ vio->vio_ninterrupts);
+ } else {
+ for (int i = 0; i < vio->vio_ninterrupts; i++) {
+ if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
+ DDI_SUCCESS) {
+ /*
+ * Disable the interrupts we have enabled so
+ * far.
+ */
+ for (i--; i >= 0; i--) {
+ (void) ddi_intr_disable(
+ vio->vio_interrupts[i]);
+ }
+ break;
+ }
+ }
+ }
+
+ if (r != DDI_SUCCESS) {
+ mutex_exit(&vio->vio_mutex);
+ return (r);
+ }
+
+ if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
+ /*
+ * When asked to enable the interrupts, the system enables
+ * MSI-X in the PCI configuration for the device. While
+ * enabled, the extra MSI-X configuration table fields appear
+ * between the general and the device-specific regions of the
+ * BAR.
+ */
+ vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;
+
+ for (virtio_queue_t *viq = list_head(&vio->vio_queues);
+ viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
+ if (!viq->viq_handler_added) {
+ continue;
+ }
+
+ uint16_t qi = viq->viq_index;
+ uint16_t msi = viq->viq_handler_index;
+
+ /*
+ * Route interrupts for this queue to the assigned
+ * MSI-X vector number.
+ */
+ virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
+ virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);
+
+ /*
+ * The device may not actually accept the vector number
+ * we're attempting to program. We need to confirm
+ * that configuration was successful by re-reading the
+ * configuration we just wrote.
+ */
+ if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
+ msi) {
+ dev_err(vio->vio_dip, CE_WARN,
+ "failed to configure MSI-X vector %u for "
+ "queue \"%s\" (#%u)", (uint_t)msi,
+ viq->viq_name, (uint_t)qi);
+
+ virtio_interrupts_unwind(vio);
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;
+
+ mutex_exit(&vio->vio_mutex);
+ return (DDI_SUCCESS);
+}
+
+static void
+virtio_interrupts_disable_locked(virtio_t *vio)
+{
+ VERIFY(MUTEX_HELD(&vio->vio_mutex));
+
+ if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
+ return;
+ }
+
+ virtio_interrupts_unwind(vio);
+
+ vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
+}
+
+void
+virtio_interrupts_disable(virtio_t *vio)
+{
+ mutex_enter(&vio->vio_mutex);
+ virtio_interrupts_disable_locked(vio);
+ mutex_exit(&vio->vio_mutex);
+}
diff --git a/usr/src/uts/common/io/virtio/virtioreg.h b/usr/src/uts/common/io/virtio/virtioreg.h
deleted file mode 100644
index 19579e96bc..0000000000
--- a/usr/src/uts/common/io/virtio/virtioreg.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2010 Minoura Makoto.
- * Copyright (c) 2012 Nexenta Systems, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT'
- * Appendix A.
- */
-
-/*
- * An interface for efficient virtio implementation.
- *
- * This header is BSD licensed so anyone can use the definitions
- * to implement compatible drivers/servers.
- *
- * Copyright 2007, 2009, IBM Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-#ifndef __VIRTIOREG_H__
-#define __VIRTIOREG_H__
-
-#include <sys/types.h>
-
-#define PCI_VENDOR_QUMRANET 0x1af4
-#define PCI_DEV_VIRTIO_MIN 0x1000
-#define PCI_DEV_VIRTIO_MAX 0x103f
-#define VIRTIO_PCI_ABI_VERSION 0
-
-/* Virtio product id (subsystem) */
-#define PCI_PRODUCT_VIRTIO_NETWORK 1
-#define PCI_PRODUCT_VIRTIO_BLOCK 2
-#define PCI_PRODUCT_VIRTIO_CONSOLE 3
-#define PCI_PRODUCT_VIRTIO_ENTROPY 4
-#define PCI_PRODUCT_VIRTIO_BALLOON 5
-#define PCI_PRODUCT_VIRTIO_9P 9
-
-/* Virtio header */
-#define VIRTIO_CONFIG_DEVICE_FEATURES 0 /* 32bit */
-#define VIRTIO_CONFIG_GUEST_FEATURES 4 /* 32bit */
-
-#define VIRTIO_F_NOTIFY_ON_EMPTY (1<<24)
-#define VIRTIO_F_RING_INDIRECT_DESC (1<<28)
-#define VIRTIO_F_BAD_FEATURE (1<<30)
-
-#define VIRTIO_CONFIG_QUEUE_ADDRESS 8 /* 32bit */
-#define VIRTIO_CONFIG_QUEUE_SIZE 12 /* 16bit */
-#define VIRTIO_CONFIG_QUEUE_SELECT 14 /* 16bit */
-#define VIRTIO_CONFIG_QUEUE_NOTIFY 16 /* 16bit */
-#define VIRTIO_CONFIG_DEVICE_STATUS 18 /* 8bit */
-
-#define VIRTIO_CONFIG_DEVICE_STATUS_RESET 0
-#define VIRTIO_CONFIG_DEVICE_STATUS_ACK 1
-#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER 2
-#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK 4
-#define VIRTIO_CONFIG_DEVICE_STATUS_FAILED 128
-
-#define VIRTIO_CONFIG_ISR_STATUS 19 /* 8bit */
-#define VIRTIO_CONFIG_ISR_CONFIG_CHANGE 2
-
-#define VIRTIO_CONFIG_CONFIG_VECTOR 20 /* 16bit, optional */
-#define VIRTIO_CONFIG_QUEUE_VECTOR 22
-
-#define VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX 20
-#define VIRTIO_CONFIG_DEVICE_CONFIG_MSIX 24
-
-#define VIRTIO_MSI_NO_VECTOR 0xffff
-
-/* Virtqueue */
-/* This marks a buffer as continuing via the next field. */
-#define VRING_DESC_F_NEXT 1
-/*
- * This marks a buffer as write-only, from the devices's perspective.
- * (otherwise read-only).
- */
-#define VRING_DESC_F_WRITE 2
-/* This means the buffer contains a list of buffer descriptors. */
-#define VRING_DESC_F_INDIRECT 4
-
-/*
- * The Host uses this in used->flags to advise the Guest: don't kick me
- * when you add a buffer. It's unreliable, so it's simply an
- * optimization. Guest will still kick if it's out of buffers.
- */
-#define VRING_USED_F_NO_NOTIFY 1
-/*
- * The Guest uses this in avail->flags to advise the Host: don't
- * interrupt me when you consume a buffer. It's unreliable, so it's
- * simply an optimization.
- */
-#define VRING_AVAIL_F_NO_INTERRUPT 1
-
-/*
- * Virtio ring descriptors: 16 bytes.
- * These can chain together via "next".
- */
-struct vring_desc {
- /* Address (guest-physical). */
- uint64_t addr;
- /* Length. */
- uint32_t len;
- /* The flags as indicated above. */
- uint16_t flags;
- /* We chain unused descriptors via this, too */
- uint16_t next;
-} __attribute__((packed));
-
-struct vring_avail {
- uint16_t flags;
- uint16_t idx;
- uint16_t ring[];
-} __attribute__((packed));
-
-/* u32 is used here for ids for padding reasons. */
-struct vring_used_elem {
- /* Index of start of used descriptor chain. */
- uint32_t id;
- /* Total length of the descriptor chain which was written to. */
- uint32_t len;
-} __attribute__((packed));
-
-struct vring_used {
- uint16_t flags;
- uint16_t idx;
- struct vring_used_elem ring[];
-} __attribute__((packed));
-
-
-/* This has nothing to do with the system page size; the name is just confusing. */
-#define VIRTIO_PAGE_SIZE (4096)
-
-#endif /* __VIRTIOREG_H__ */
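For context, the offsets and bits removed above describe the legacy ("ABI version 0") virtio PCI register layout. Below is a minimal sketch, not part of this commit, of the init handshake a legacy driver performed against those offsets; it assumes `ioh' and `ioaddr' come from ddi_regs_map_setup() on the device's I/O BAR, and the function name is illustrative only.

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

/* VIRTIO_CONFIG_* below are the definitions from the removed virtioreg.h. */
static void
legacy_virtio_handshake(ddi_acc_handle_t ioh, caddr_t ioaddr)
{
	uint32_t host_features;

	/* Reset the device, then acknowledge it and announce a driver. */
	ddi_put8(ioh, (uint8_t *)(ioaddr + VIRTIO_CONFIG_DEVICE_STATUS),
	    VIRTIO_CONFIG_DEVICE_STATUS_RESET);
	ddi_put8(ioh, (uint8_t *)(ioaddr + VIRTIO_CONFIG_DEVICE_STATUS),
	    VIRTIO_CONFIG_DEVICE_STATUS_ACK);
	ddi_put8(ioh, (uint8_t *)(ioaddr + VIRTIO_CONFIG_DEVICE_STATUS),
	    VIRTIO_CONFIG_DEVICE_STATUS_ACK |
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER);

	/* Read the host feature bits; write back the subset we accept. */
	host_features = ddi_get32(ioh,
	    (uint32_t *)(ioaddr + VIRTIO_CONFIG_DEVICE_FEATURES));
	ddi_put32(ioh,
	    (uint32_t *)(ioaddr + VIRTIO_CONFIG_GUEST_FEATURES),
	    host_features & VIRTIO_F_RING_INDIRECT_DESC);

	/* Queue setup (QUEUE_SELECT/QUEUE_SIZE/QUEUE_ADDRESS) would go here. */

	/* Finally, tell the device the driver is ready. */
	ddi_put8(ioh, (uint8_t *)(ioaddr + VIRTIO_CONFIG_DEVICE_STATUS),
	    VIRTIO_CONFIG_DEVICE_STATUS_ACK |
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER |
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK);
}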
diff --git a/usr/src/uts/common/io/virtio/virtiovar.h b/usr/src/uts/common/io/virtio/virtiovar.h
deleted file mode 100644
index 17aebe3864..0000000000
--- a/usr/src/uts/common/io/virtio/virtiovar.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2010 Minoura Makoto.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT'
- * Appendix A.
- */
-
-/*
- * An interface for efficient virtio implementation.
- *
- * This header is BSD licensed so anyone can use the definitions
- * to implement compatible drivers/servers.
- *
- * Copyright 2007, 2009, IBM Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
- */
-
-#ifndef __VIRTIOVAR_H__
-#define __VIRTIOVAR_H__
-
-#include <sys/types.h>
-#include <sys/dditypes.h>
-#include <sys/cmn_err.h>
-#include <sys/list.h>
-
-#ifdef DEBUG
-#define dev_debug(dip, fmt, arg...) \
- dev_err(dip, fmt, ##arg)
-#else
-#define dev_debug(dip, fmt, arg...)
-#endif
-
-struct vq_entry {
- list_node_t qe_list;
- struct virtqueue *qe_queue;
- uint16_t qe_index; /* index in vq_desc array */
-	/* the following are used only when this is the `head' entry */
- struct vq_entry *qe_next;
- struct vring_desc *qe_desc;
- ddi_dma_cookie_t qe_indirect_dma_cookie;
- ddi_dma_handle_t qe_indirect_dma_handle;
- ddi_acc_handle_t qe_indirect_dma_acch;
- struct vring_desc *qe_indirect_descs;
- unsigned int qe_indirect_next;
-};
-
-struct virtqueue {
- struct virtio_softc *vq_owner;
- unsigned int vq_num; /* queue size (# of entries) */
- unsigned int vq_indirect_num;
- int vq_index; /* queue number (0, 1, ...) */
-
- /* vring pointers (KVA) */
- struct vring_desc *vq_descs;
- struct vring_avail *vq_avail;
- struct vring_used *vq_used;
-
- /* virtqueue allocation info */
- void *vq_vaddr;
- int vq_availoffset;
- int vq_usedoffset;
- ddi_dma_cookie_t vq_dma_cookie;
- ddi_dma_handle_t vq_dma_handle;
- ddi_acc_handle_t vq_dma_acch;
-
- int vq_maxsegsize;
-
- /* free entry management */
- struct vq_entry *vq_entries;
- list_t vq_freelist;
- kmutex_t vq_freelist_lock;
- int vq_used_entries;
-
- /* enqueue/dequeue status */
- uint16_t vq_avail_idx;
- kmutex_t vq_avail_lock;
- uint16_t vq_used_idx;
- kmutex_t vq_used_lock;
-};
-
-struct virtio_softc {
- dev_info_t *sc_dev;
-
- uint_t sc_intr_prio;
-
- ddi_acc_handle_t sc_ioh;
- caddr_t sc_io_addr;
- int sc_config_offset;
-
- uint32_t sc_features;
-
- int sc_nvqs; /* set by the user */
-
- ddi_intr_handle_t *sc_intr_htable;
- int sc_intr_num;
- boolean_t sc_intr_config;
- int sc_intr_cap;
- int sc_int_type;
-};
-
-struct virtio_int_handler {
- ddi_intr_handler_t *vh_func;
- void *vh_priv;
-};
-
-/* public interface */
-uint32_t virtio_negotiate_features(struct virtio_softc *, uint32_t);
-size_t virtio_show_features(uint32_t features, char *buffer, size_t len);
-boolean_t virtio_has_feature(struct virtio_softc *sc, uint32_t feature);
-void virtio_set_status(struct virtio_softc *sc, unsigned int);
-#define virtio_device_reset(sc) virtio_set_status((sc), 0)
-
-uint8_t virtio_read_device_config_1(struct virtio_softc *sc,
- unsigned int index);
-uint16_t virtio_read_device_config_2(struct virtio_softc *sc,
- unsigned int index);
-uint32_t virtio_read_device_config_4(struct virtio_softc *sc,
- unsigned int index);
-uint64_t virtio_read_device_config_8(struct virtio_softc *sc,
- unsigned int index);
-void virtio_write_device_config_1(struct virtio_softc *sc,
- unsigned int index, uint8_t value);
-void virtio_write_device_config_2(struct virtio_softc *sc,
- unsigned int index, uint16_t value);
-void virtio_write_device_config_4(struct virtio_softc *sc,
- unsigned int index, uint32_t value);
-void virtio_write_device_config_8(struct virtio_softc *sc,
- unsigned int index, uint64_t value);
-
-struct virtqueue *virtio_alloc_vq(struct virtio_softc *sc,
- unsigned int index, unsigned int size,
- unsigned int indirect_num, const char *name);
-void virtio_free_vq(struct virtqueue *);
-void virtio_reset(struct virtio_softc *);
-struct vq_entry *vq_alloc_entry(struct virtqueue *vq);
-void vq_free_entry(struct virtqueue *vq, struct vq_entry *qe);
-uint_t vq_num_used(struct virtqueue *vq);
-unsigned int virtio_ve_indirect_available(struct vq_entry *qe);
-
-void virtio_stop_vq_intr(struct virtqueue *);
-void virtio_start_vq_intr(struct virtqueue *);
-
-void virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle,
- ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write);
-void virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr,
- uint32_t len, boolean_t write);
-void virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len,
- boolean_t write);
-
-void virtio_push_chain(struct vq_entry *qe, boolean_t sync);
-struct vq_entry *virtio_pull_chain(struct virtqueue *vq, uint32_t *len);
-void virtio_free_chain(struct vq_entry *ve);
-void virtio_sync_vq(struct virtqueue *vq);
-
-int virtio_register_ints(struct virtio_softc *sc,
- struct virtio_int_handler *config_handler,
- struct virtio_int_handler vq_handlers[]);
-void virtio_release_ints(struct virtio_softc *sc);
-int virtio_enable_ints(struct virtio_softc *sc);
-
-#endif /* __VIRTIOVAR_H__ */
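For context, the prototypes removed above were the public interface of the legacy virtio framework that this merge replaces with the new virtio.c/virtio.h added elsewhere in the commit. A rough sketch of how a legacy consumer (e.g. the old vioblk) drove that interface follows; it assumes `sc' is an attached virtio_softc, `qsize' matches the device's queue size, and `paddr'/`len' describe an already DMA-mapped buffer. The function and variable names in the sketch are illustrative and do not appear in the source.

#include <sys/types.h>
#include <sys/errno.h>
/* plus the declarations from the virtioreg.h/virtiovar.h removed above */

static int
legacy_submit_one(struct virtio_softc *sc, unsigned int qsize,
    uint64_t paddr, uint32_t len)
{
	struct virtqueue *vq;
	struct vq_entry *ve;
	uint32_t done_len;

	/* Set up queue index 0, without indirect descriptors. */
	vq = virtio_alloc_vq(sc, 0, qsize, 0, "request queue");
	if (vq == NULL)
		return (ENOMEM);

	/* Take a free descriptor and point it at the buffer. */
	ve = vq_alloc_entry(vq);
	if (ve == NULL) {
		virtio_free_vq(vq);
		return (EBUSY);
	}
	virtio_ve_set(ve, paddr, len, B_TRUE);	/* device writes the buffer */

	/* Post the chain on the avail ring; sync == B_TRUE kicks the host. */
	virtio_push_chain(ve, B_TRUE);

	/* Reap the completion; a real driver did this from its interrupt. */
	while ((ve = virtio_pull_chain(vq, &done_len)) == NULL)
		continue;
	virtio_free_chain(ve);

	virtio_free_vq(vq);
	return (0);
}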
diff --git a/usr/src/uts/common/smbsrv/smb2_kproto.h b/usr/src/uts/common/smbsrv/smb2_kproto.h
index 97b13af868..ed553bedcd 100644
--- a/usr/src/uts/common/smbsrv/smb2_kproto.h
+++ b/usr/src/uts/common/smbsrv/smb2_kproto.h
@@ -32,6 +32,7 @@ extern uint32_t smb2_dh_def_timeout;
extern uint32_t smb2_dh_max_timeout;
extern uint32_t smb2_res_def_timeout;
extern uint32_t smb2_res_max_timeout;
+extern uint32_t smb2_persist_timeout;
extern int smb2_enable_dh;
#define SMB3_CLIENT_ENCRYPTS(sr) \
@@ -131,7 +132,7 @@ uint32_t smb2_setinfo_quota(smb_request_t *, smb_setinfo_t *);
void smb2_oplock_acquire(smb_request_t *sr);
void smb2_oplock_reconnect(smb_request_t *sr);
void smb2_lease_acquire(smb_request_t *sr);
-uint32_t smb2_lease_create(smb_request_t *sr);
+uint32_t smb2_lease_create(smb_request_t *sr, uint8_t *);
void smb2_lease_rele(smb_lease_t *);
void smb2_lease_init(void);
void smb2_lease_fini(void);
@@ -142,6 +143,15 @@ void smb2_durable_timers(smb_server_t *);
uint32_t smb2_dh_reconnect(smb_request_t *);
boolean_t smb_dh_should_save(smb_ofile_t *);
extern void smb2_dh_shutdown(smb_server_t *);
+int smb2_dh_new_ca_share(smb_server_t *, smb_kshare_t *);
+void smb2_dh_close_persistent(smb_ofile_t *);
+void smb2_dh_close_my_orphans(smb_request_t *, smb_ofile_t *);
+int smb2_dh_make_persistent(smb_request_t *, smb_ofile_t *);
+void smb2_dh_setdoc_persistent(smb_ofile_t *);
+void smb2_dh_update_nvfile(smb_request_t *);
+void smb2_dh_update_oplock(smb_request_t *, smb_ofile_t *);
+void smb2_dh_update_locks(smb_request_t *, smb_ofile_t *);
+void smb2_dh_update_times(smb_request_t *, smb_ofile_t *, smb_attr_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/smbsrv/smb_kproto.h b/usr/src/uts/common/smbsrv/smb_kproto.h
index d18ff80d5e..751f047e0c 100644
--- a/usr/src/uts/common/smbsrv/smb_kproto.h
+++ b/usr/src/uts/common/smbsrv/smb_kproto.h
@@ -338,6 +338,8 @@ boolean_t smb_validate_dirname(smb_request_t *, smb_pathname_t *);
boolean_t smb_validate_object_name(smb_request_t *, smb_pathname_t *);
boolean_t smb_validate_stream_name(smb_request_t *, smb_pathname_t *);
boolean_t smb_is_stream_name(char *);
+boolean_t smb_strname_restricted(char *);
+
void smb_stream_parse_name(char *, char *, char *);
@@ -438,7 +440,7 @@ int smb_server_get_count(void);
int smb_server_g_init(void);
void smb_server_g_fini(void);
int smb_server_create(void);
-int smb_server_delete(void);
+int smb_server_delete(smb_server_t *);
int smb_server_configure(smb_ioc_cfg_t *);
int smb_server_start(smb_ioc_start_t *);
int smb_server_stop(void);
@@ -451,7 +453,7 @@ int smb_server_numopen(smb_ioc_opennum_t *);
int smb_server_enum(smb_ioc_svcenum_t *);
int smb_server_session_close(smb_ioc_session_t *);
int smb_server_file_close(smb_ioc_fileid_t *);
-int smb_server_sharevp(smb_server_t *, const char *, vnode_t **);
+int smb_server_share_lookup(smb_server_t *, const char *, smb_node_t **);
int smb_server_unshare(const char *);
void smb_server_logoff_ssnid(smb_request_t *, uint64_t);
@@ -553,14 +555,6 @@ int smb_pathname(smb_request_t *, char *, int, smb_node_t *,
smb_node_t *, smb_node_t **, smb_node_t **, cred_t *);
/*
- * smb_vfs functions
- */
-
-int smb_vfs_hold(smb_export_t *, vfs_t *);
-void smb_vfs_rele(smb_export_t *, vfs_t *);
-void smb_vfs_rele_all(smb_export_t *);
-
-/*
* smb_notify.c
*/
uint32_t smb_notify_act1(smb_request_t *, uint32_t, uint32_t);
@@ -633,6 +627,7 @@ smb_tree_t *smb_session_lookup_volume(smb_session_t *, const char *,
void smb_session_close_pid(smb_session_t *, uint32_t);
void smb_session_disconnect_owned_trees(smb_session_t *, smb_user_t *);
void smb_session_disconnect_share(smb_session_t *, const char *);
+void smb_session_logoff(smb_session_t *);
void smb_session_getclient(smb_session_t *, char *, size_t);
boolean_t smb_session_isclient(smb_session_t *, const char *);
void smb_session_correct_keep_alive_values(smb_llist_t *, uint32_t);
@@ -654,7 +649,7 @@ smb_ofile_t *smb_ofile_lookup_by_uniqid(smb_tree_t *, uint32_t);
smb_ofile_t *smb_ofile_lookup_by_persistid(smb_request_t *, uint64_t);
boolean_t smb_ofile_disallow_fclose(smb_ofile_t *);
smb_ofile_t *smb_ofile_alloc(smb_request_t *, smb_arg_open_t *, smb_node_t *,
- uint16_t, uint16_t, uint32_t);
+ uint16_t, uint16_t);
void smb_ofile_open(smb_request_t *, smb_arg_open_t *, smb_ofile_t *);
void smb_ofile_close(smb_ofile_t *, int32_t);
void smb_ofile_free(smb_ofile_t *);
@@ -678,7 +673,9 @@ void smb_delayed_write_timer(smb_llist_t *);
void smb_ofile_set_quota_resume(smb_ofile_t *, char *);
void smb_ofile_get_quota_resume(smb_ofile_t *, char *, int);
void smb_ofile_del_persistid(smb_ofile_t *);
-void smb_ofile_set_persistid(smb_ofile_t *);
+void smb_ofile_set_persistid_dh(smb_ofile_t *);
+void smb_ofile_set_persistid_ph(smb_ofile_t *);
+int smb_ofile_insert_persistid(smb_ofile_t *, uint64_t);
#define SMB_OFILE_GET_SESSION(of) ((of)->f_session)
#define SMB_OFILE_GET_TREE(of) ((of)->f_tree)
@@ -734,6 +731,7 @@ void smb_user_netinfo_fini(smb_netuserinfo_t *);
int smb_user_netinfo_encode(smb_user_t *, uint8_t *, size_t, uint32_t *);
smb_token_t *smb_get_token(smb_session_t *, smb_logon_t *);
cred_t *smb_cred_create(smb_token_t *);
+cred_t *smb_kcred_create(void);
void smb_user_setcred(smb_user_t *, cred_t *, uint32_t);
boolean_t smb_is_same_user(cred_t *, cred_t *);
@@ -741,6 +739,7 @@ boolean_t smb_is_same_user(cred_t *, cred_t *);
* SMB tree functions (file smb_tree.c)
*/
uint32_t smb_tree_connect(smb_request_t *);
+uint32_t smb_tree_connect_disk(smb_request_t *, smb_arg_tcon_t *);
void smb_tree_disconnect(smb_tree_t *, boolean_t);
void smb_tree_close_pid(smb_tree_t *, uint32_t);
boolean_t smb_tree_has_feature(smb_tree_t *, uint_t);
@@ -751,6 +750,8 @@ void smb_tree_hold_internal(smb_tree_t *);
void smb_tree_release(smb_tree_t *);
smb_odir_t *smb_tree_lookup_odir(smb_request_t *, uint16_t);
boolean_t smb_tree_is_connected(smb_tree_t *);
+smb_tree_t *smb_tree_alloc(smb_request_t *, const smb_kshare_t *,
+ smb_node_t *, uint32_t, uint32_t);
smb_xa_t *smb_xa_create(smb_session_t *session, smb_request_t *sr,
uint32_t total_parameter_count, uint32_t total_data_count,
@@ -937,7 +938,7 @@ void smb_threshold_exit(smb_cmd_threshold_t *);
void smb_threshold_wake_all(smb_cmd_threshold_t *);
/* SMB hash function prototypes */
-smb_hash_t *smb_hash_create(size_t, size_t, uint32_t num_buckets);
+smb_hash_t *smb_hash_create(size_t, size_t, uint32_t);
void smb_hash_destroy(smb_hash_t *);
uint_t smb_hash_uint64(smb_hash_t *, uint64_t);
diff --git a/usr/src/uts/common/smbsrv/smb_ktypes.h b/usr/src/uts/common/smbsrv/smb_ktypes.h
index 09e52b70f7..1f8ce704fb 100644
--- a/usr/src/uts/common/smbsrv/smb_ktypes.h
+++ b/usr/src/uts/common/smbsrv/smb_ktypes.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -61,6 +61,7 @@ extern "C" {
struct __door_handle; /* <sys/door.h> */
struct edirent; /* <sys/extdirent.h> */
+struct nvlist;
struct smb_disp_entry;
struct smb_request;
@@ -476,7 +477,6 @@ typedef struct {
typedef struct smb_export {
kmutex_t e_mutex;
boolean_t e_ready;
- smb_llist_t e_vfs_list;
smb_avl_t e_share_avl;
smb_slist_t e_unexport_list;
smb_thread_t e_unexport_thread;
@@ -629,16 +629,6 @@ typedef struct smb_lease {
uint8_t ls_clnt[SMB_LEASE_KEY_SZ];
} smb_lease_t;
-#define SMB_VFS_MAGIC 0x534D4256 /* 'SMBV' */
-
-typedef struct smb_vfs {
- list_node_t sv_lnd;
- uint32_t sv_magic;
- uint32_t sv_refcnt;
- vfs_t *sv_vfsp;
- vnode_t *sv_rootvp;
-} smb_vfs_t;
-
#define SMB_NODE_MAGIC 0x4E4F4445 /* 'NODE' */
#define SMB_NODE_VALID(p) ASSERT((p)->n_magic == SMB_NODE_MAGIC)
@@ -703,6 +693,9 @@ typedef struct smb_node {
typedef struct smb_kshare {
uint32_t shr_magic;
+ avl_node_t shr_link;
+ kmutex_t shr_mutex;
+ kcondvar_t shr_cv;
char *shr_name;
char *shr_path;
char *shr_cmnt;
@@ -717,8 +710,9 @@ typedef struct smb_kshare {
char *shr_access_none;
char *shr_access_ro;
char *shr_access_rw;
- avl_node_t shr_link;
- kmutex_t shr_mutex;
+ smb_node_t *shr_root_node;
+ smb_node_t *shr_ca_dir;
+ void *shr_import_busy;
smb_cfg_val_t shr_encrypt; /* Share.EncryptData */
} smb_kshare_t;
@@ -984,7 +978,7 @@ typedef struct smb_session {
unsigned char MAC_key[44];
char ip_addr_str[INET6_ADDRSTRLEN];
uint8_t clnt_uuid[16];
- char workstation[SMB_PI_MAX_HOST];
+ char workstation[SMB_PI_MAX_HOST];
} smb_session_t;
/*
@@ -1100,6 +1094,7 @@ typedef struct smb_user {
#define SMB_TREE_SPARSE 0x00040000
#define SMB_TREE_TRAVERSE_MOUNTS 0x00080000
#define SMB_TREE_FORCE_L2_OPLOCK 0x00100000
+#define SMB_TREE_CA 0x00200000
/* Note: SMB_TREE_... in the mdb module too. */
/*
@@ -1166,15 +1161,15 @@ typedef struct smb_tree {
(((sr) && (sr)->tid_tree) ? \
(((sr)->tid_tree->t_access) & (acemask)) : 0)))
-#define SMB_TREE_SUPPORTS_CATIA(sr) \
+#define SMB_TREE_SUPPORTS_CATIA(sr) \
(((sr) && (sr)->tid_tree) ? \
smb_tree_has_feature((sr)->tid_tree, SMB_TREE_CATIA) : 0)
-#define SMB_TREE_SUPPORTS_ABE(sr) \
+#define SMB_TREE_SUPPORTS_ABE(sr) \
(((sr) && (sr)->tid_tree) ? \
smb_tree_has_feature((sr)->tid_tree, SMB_TREE_ABE) : 0)
-#define SMB_TREE_IS_DFSROOT(sr) \
+#define SMB_TREE_IS_DFSROOT(sr) \
(((sr) && (sr)->tid_tree) ? \
smb_tree_has_feature((sr)->tid_tree, SMB_TREE_DFSROOT) : 0)
@@ -1202,7 +1197,7 @@ typedef struct smb_tree {
(SMB_TREE_IS_READONLY((sr)) || \
smb_node_file_is_readonly((node)))
-#define SMB_ODIR_MAGIC 0x4F444952 /* 'ODIR' */
+#define SMB_ODIR_MAGIC 0x4F444952 /* 'ODIR' */
#define SMB_ODIR_VALID(p) \
ASSERT((p != NULL) && ((p)->d_magic == SMB_ODIR_MAGIC))
@@ -1332,7 +1327,7 @@ typedef struct smb_opipe {
#define SMB_OFLAGS_SET_DELETE_ON_CLOSE 0x0004
#define SMB_OFLAGS_LLF_POS_VALID 0x0008
-#define SMB_OFILE_MAGIC 0x4F464C45 /* 'OFLE' */
+#define SMB_OFILE_MAGIC 0x4F464C45 /* 'OFLE' */
#define SMB_OFILE_VALID(p) \
ASSERT((p != NULL) && ((p)->f_magic == SMB_OFILE_MAGIC))
@@ -1416,6 +1411,10 @@ typedef struct smb_ofile {
hrtime_t dh_timeout_offset; /* time offset for timeout */
hrtime_t dh_expire_time; /* time the handle expires */
boolean_t dh_persist;
+ kmutex_t dh_nvlock;
+ struct nvlist *dh_nvlist;
+ smb_node_t *dh_nvfile;
+
uint8_t dh_create_guid[16];
char f_quota_resume[SMB_SID_STRSZ];
uint8_t f_lock_seq[SMB_OFILE_LSEQ_MAX];
@@ -1441,7 +1440,7 @@ typedef struct smb_streaminfo {
char si_name[MAXPATHLEN];
} smb_streaminfo_t;
-#define SMB_LOCK_MAGIC 0x4C4F434B /* 'LOCK' */
+#define SMB_LOCK_MAGIC 0x4C4F434B /* 'LOCK' */
typedef struct smb_lock {
list_node_t l_lnd;
@@ -1472,7 +1471,7 @@ typedef struct smb_lock {
typedef struct vardata_block {
uint8_t vdb_tag;
uint32_t vdb_len;
- struct uio vdb_uio;
+ struct uio vdb_uio;
struct iovec vdb_iovec[MAX_IOVEC];
} smb_vdb_t;
@@ -1760,7 +1759,7 @@ typedef struct smb_arg_olbrk {
*
*/
-#define SMB_REQ_MAGIC 0x534D4252 /* 'SMBR' */
+#define SMB_REQ_MAGIC 0x534D4252 /* 'SMBR' */
#define SMB_REQ_VALID(p) ASSERT((p)->sr_magic == SMB_REQ_MAGIC)
typedef enum smb_req_state {
@@ -1810,7 +1809,7 @@ typedef struct smb_request {
list_t sr_storage;
struct smb_xa *r_xa;
int andx_prev_wct;
- int cur_reply_offset;
+ int cur_reply_offset;
int orig_request_hdr;
unsigned int reply_seqnum; /* reply sequence number */
unsigned char first_smb_com; /* command code */
@@ -1868,6 +1867,7 @@ typedef struct smb_request {
uint8_t nonce[16];
boolean_t encrypted;
+ boolean_t dh_nvl_dirty;
boolean_t smb2_async;
uint64_t smb2_async_id;
@@ -2068,7 +2068,7 @@ typedef enum smb_server_state {
typedef struct {
/* protected by sv_mutex */
kcondvar_t sp_cv;
- uint32_t sp_cnt;
+ uint32_t sp_cnt;
smb_llist_t sp_list;
smb_llist_t sp_fidlist;
} smb_spool_t;
@@ -2094,11 +2094,12 @@ typedef struct smb_server {
krwlock_t sv_cfg_lock;
smb_kmod_cfg_t sv_cfg;
smb_session_t *sv_session;
+ smb_user_t *sv_rootuser;
smb_llist_t sv_session_list;
smb_hash_t *sv_persistid_ht;
smb_hash_t *sv_lease_ht;
- struct smb_export sv_export;
+ smb_export_t sv_export;
struct __door_handle *sv_lmshrd;
/* Internal door for up-calls to smbd */
diff --git a/usr/src/uts/common/smbsrv/smb_share.h b/usr/src/uts/common/smbsrv/smb_share.h
index 7c2219caad..090de59105 100644
--- a/usr/src/uts/common/smbsrv/smb_share.h
+++ b/usr/src/uts/common/smbsrv/smb_share.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -92,6 +92,7 @@ extern "C" {
#define SHOPT_AD_CONTAINER "ad-container"
#define SHOPT_ABE "abe"
#define SHOPT_NAME "name"
+#define SHOPT_CA "ca"
#define SHOPT_CSC "csc"
#define SHOPT_CATIA "catia"
#define SHOPT_GUEST "guestok"
@@ -185,6 +186,7 @@ extern "C" {
#define SMB_SHRF_QUOTAS 0x1000 /* Enable SMB Quotas */
#define SMB_SHRF_FSO 0x2000 /* Force Shared Oplocks */
+#define SMB_SHRF_CA 0x4000 /* Continuous Availability */
/*
* Runtime flags
@@ -193,6 +195,7 @@ extern "C" {
#define SMB_SHRF_TRANS 0x10000000
#define SMB_SHRF_PERM 0x20000000
#define SMB_SHRF_AUTOHOME 0x40000000
+#define SMB_SHRF_REMOVED 0x80000000 /* unshared */
#define SMB_SHARE_PRINT "print$"
#define SMB_SHARE_PRINT_LEN 6
diff --git a/usr/src/uts/intel/vioblk/Makefile b/usr/src/uts/intel/vioblk/Makefile
index 5e5783fca6..ace9b626d0 100644
--- a/usr/src/uts/intel/vioblk/Makefile
+++ b/usr/src/uts/intel/vioblk/Makefile
@@ -1,90 +1,68 @@
#
-# CDDL HEADER START
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
#
+
#
# Copyright 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = vioblk
-OBJECTS = $(VIOBLK_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIOBLK_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+MODULE = vioblk
+OBJECTS = $(VIOBLK_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
-
-INC_PATH += -I$(UTSBASE)/common/io/virtio
-
-#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
+INC_PATH += -I$(UTSBASE)/common/io/virtio
#
# Driver depends on virtio and blkdev
#
-LDFLAGS += -dy -N misc/virtio -N drv/blkdev
+LDFLAGS += -dy -N misc/virtio -N drv/blkdev
#
-# Default build targets.
+# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/vioif/Makefile b/usr/src/uts/intel/vioif/Makefile
index ba87d97c61..a2dc4a337b 100644
--- a/usr/src/uts/intel/vioif/Makefile
+++ b/usr/src/uts/intel/vioif/Makefile
@@ -11,70 +11,58 @@
#
# Copyright 2013 Nexenta Inc. All rights reserved.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = vioif
-OBJECTS = $(VIOIF_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIOIF_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+MODULE = vioif
+OBJECTS = $(VIOIF_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
+INC_PATH += -I$(UTSBASE)/common/io/virtio
-INC_PATH += -I$(UTSBASE)/common/io/virtio
-
-#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
#
-# Driver depends on virtio and blkdev
+# Driver depends on virtio and mac
#
-LDFLAGS += -dy -N misc/virtio -N misc/mac
+LDFLAGS += -dy -N misc/virtio -N misc/mac
#
-# Default build targets.
+# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/virtio/Makefile b/usr/src/uts/intel/virtio/Makefile
index 1f6548a135..c5a0d05b6a 100644
--- a/usr/src/uts/intel/virtio/Makefile
+++ b/usr/src/uts/intel/virtio/Makefile
@@ -1,90 +1,63 @@
#
-# CDDL HEADER START
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2018, Joyent, Inc.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = virtio
-OBJECTS = $(VIRTIO_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIRTIO_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+MODULE = virtio
+OBJECTS = $(VIRTIO_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
-
-INC_PATH += -I$(UTSBASE)/common/io/virtio
+INC_PATH += -I$(UTSBASE)/common/io/virtio
#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
-
-# needs work
-SMOFF += all_func_returns
-
-#
-# Default build targets.
+# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ