482 files changed, 144151 insertions, 3253 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index e46c461c54..16d89ee737 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -21,10 +21,10 @@
 
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012 Joyent, Inc.  All rights reserved.
 # Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 # Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+# Copyright 2016 Joyent, Inc.
 # Copyright 2016 Garrett D'Amore <garrett@damore.org>
 #
 
@@ -282,6 +282,7 @@ GENUNIX_OBJS +=	\
 		rctl.o		\
 		rctlsys.o	\
 		readlink.o	\
+		refhash.o	\
 		refstr.o	\
 		rename.o	\
 		resolvepath.o	\
@@ -303,6 +304,7 @@ GENUNIX_OBJS +=	\
 		seg_map.o	\
 		seg_vn.o	\
 		seg_spt.o	\
+		seg_umap.o	\
 		semaphore.o	\
 		sendfile.o	\
 		session.o	\
@@ -428,6 +430,8 @@ PROFILE_OBJS += profile.o
 
 SYSTRACE_OBJS += systrace.o
 
+LX_SYSTRACE_OBJS += lx_systrace.o
+
 LOCKSTAT_OBJS += lockstat.o
 
 FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o
@@ -492,6 +496,10 @@ PTSL_OBJS +=	tty_pts.o
 
 PTM_OBJS +=	ptm.o
 
+LX_PTM_OBJS +=	lx_ptm.o
+
+LX_NETLINK_OBJS +=	lx_netlink.o
+
 MII_OBJS +=	mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \
 		mii_marvell.o mii_realtek.o mii_other.o
 
@@ -549,6 +557,7 @@ IP_SCTP_OBJS =	sctp.o sctp_opt_data.o sctp_output.o \
 		sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \
 		sctp_misc.o
 IP_ILB_OBJS =	ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o
+IP_COMM_OBJS =	inet_hash.o
 
 IP_OBJS +=	igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
 		ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
@@ -564,7 +573,8 @@ IP_OBJS +=	igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
 		$(IP_TCP_OBJS) \
 		$(IP_UDP_OBJS) \
 		$(IP_SCTP_OBJS) \
-		$(IP_ILB_OBJS)
+		$(IP_ILB_OBJS) \
+		$(IP_COMM_OBJS)
 
 IP6_OBJS +=	ip6ddi.o
 
@@ -582,6 +592,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o
 
 IPSECAH_OBJS +=	ipsecahddi.o ipsecah.o sadb.o
 
+DATAFILT_OBJS += datafilt.o
+
 SPPP_OBJS +=	sppp.o sppp_dlpi.o sppp_mod.o s_common.o
 
 SPPPTUN_OBJS +=	sppptun.o sppptun_mod.o
@@ -679,6 +691,15 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
 
 VNIC_OBJS +=	vnic_ctl.o vnic_dev.o
 
+OVERLAY_OBJS +=	overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \
+		overlay_prop.o overlay_target.o
+
+OVERLAY_VXLAN_OBJS +=	overlay_vxlan.o
+
+VND_OBJS +=	vnd.o frameio.o
+
+GSQUEUE_OBJS +=	gsqueue.o
+
 SIMNET_OBJS +=	simnet.o
 
 IB_OBJS +=	ibnex.o ibnex_ioctl.o ibnex_hca.o
@@ -997,6 +1018,8 @@ SIGNALFD_OBJS += signalfd.o
 
 I8042_OBJS +=	i8042.o
 
+INOTIFY_OBJS +=	inotify.o
+
 KB8042_OBJS +=	\
 		at_keyprocess.o	\
 		kb8042.o	\
@@ -1071,6 +1094,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o
 
 ZCONS_OBJS += zcons.o
 
+ZFD_OBJS += zfd.o
+
 NV_SATA_OBJS += nv_sata.o
 
 SI3124_OBJS += si3124.o
@@ -1124,8 +1149,7 @@ DEVFS_OBJS +=	devfs_subr.o	devfs_vfsops.o	devfs_vnops.o
 DEV_OBJS  +=	sdev_subr.o	sdev_vfsops.o	sdev_vnops.o	\
 		sdev_ptsops.o	sdev_zvolops.o	sdev_comm.o	\
 		sdev_profile.o	sdev_ncache.o	sdev_netops.o	\
-		sdev_ipnetops.o	\
-		sdev_vtops.o
+		sdev_ipnetops.o	sdev_vtops.o	sdev_plugin.o
 
 CTFS_OBJS +=	ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
 		ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
@@ -1142,8 +1166,13 @@ PIPE_OBJS +=	pipe.o
 HSFS_OBJS +=	hsfs_node.o	hsfs_subr.o	hsfs_vfsops.o	hsfs_vnops.o \
 		hsfs_susp.o	hsfs_rrip.o	hsfs_susp_subr.o
 
+HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \
+		hyprlofs_vnops.o hyprlofs_vfsops.o
+
 LOFS_OBJS +=	lofs_subr.o	lofs_vfsops.o	lofs_vnops.o
 
+LXPROC_OBJS +=	lxpr_subr.o	lxpr_vfsops.o	lxpr_vnops.o
+
 NAMEFS_OBJS +=	namevfs.o	namevno.o
 
 NFS_OBJS +=	nfs_client.o	nfs_common.o	nfs_dump.o \
@@ -1295,8 +1324,8 @@ SMBSRV_OBJS +=	$(SMBSRV_SHARED_OBJS)			\
 PCFS_OBJS +=	pc_alloc.o	pc_dir.o	pc_node.o	pc_subr.o \
 		pc_vfsops.o	pc_vnops.o
 
-PROC_OBJS +=	prcontrol.o	prioctl.o	prsubr.o	prusrio.o \
-		prvfsops.o	prvnops.o
+PROC_OBJS +=	prargv.o	prcontrol.o	prioctl.o	prsubr.o \
+		prusrio.o	prvfsops.o	prvnops.o
 
 MNTFS_OBJS +=	mntvfsops.o	mntvnops.o
 
@@ -1438,6 +1467,7 @@ ZFS_COMMON_OBJS +=		\
 	zfs_fuid.o		\
 	zfs_sa.o		\
 	zfs_znode.o		\
+	zfs_zone.o		\
 	zil.o			\
 	zio.o			\
 	zio_checksum.o		\
@@ -1884,7 +1914,7 @@ ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o
 
 MXFE_OBJS += mxfe.o
 
-MPTSAS_OBJS += mptsas.o mptsas_hash.o mptsas_impl.o mptsas_init.o \
+MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o \
 		mptsas_raid.o mptsas_smhba.o
 
 SFE_OBJS += sfe.o sfe_util.o
@@ -2005,6 +2035,15 @@ IXGBE_OBJS =    ixgbe_82598.o ixgbe_82599.o ixgbe_api.o		\
                 ixgbe_tx.o  ixgbe_x540.o ixgbe_mbx.o
 
 #
+#	Intel 40GbE PCIe NIC driver module
+#
+
+# illumos-written ones.
+I40E_OBJS =	i40e_main.o i40e_osdep.o i40e_intr.o i40e_transceiver.o \
+		i40e_stats.o i40e_gld.o 
+# Intel-written ones.
+I40E_INTC_OBJS = i40e_adminq.o i40e_common.o i40e_hmc.o i40e_lan_hmc.o \
+		 i40e_nvm.o
 #	Solarflare 1/10/40GbE NIC driver module
 #
 #	NB: The illumos specific sources are listed first, with the
@@ -2103,6 +2142,11 @@ MEGA_SAS_OBJS = megaraid_sas.o
 MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o
 
 #
+#	DR_SAS module
+#
+DR_SAS_OBJS = dr_sas.o
+
+#
 #	CPQARY3 module
 #
 CPQARY3_OBJS =	cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o	\
@@ -2141,6 +2185,20 @@ NULLDRIVER_OBJS =	nulldriver.o
 TPM_OBJS =	tpm.o tpm_hcall.o
 
 #
+# USB Fast ethernet drivers
+#
+USBGEM_OBJS = usbgem.o
+AXF_OBJS = axf_usbgem.o
+UDMF_OBJS = udmf_usbgem.o
+URF_OBJS = urf_usbgem.o
+UPF_OBJS = upf_usbgem.o
+
+#
+#	NFP objects
+#
+NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o
+
+#
 #	BNXE objects
 #
 BNXE_OBJS +=	bnxe_cfg.o		\
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index fdbe9717f3..a80ec6293f 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -23,6 +23,7 @@
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright 2016 Garrett D'Amore <garrett@damore.org>
 # Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+# Copyright 2016 Joyent, Inc.
 # Copyright 2013 Saso Kiselkov. All rights reserved.
 #
 
@@ -96,6 +97,10 @@ $(OBJS_DIR)/%.o:		$(COMMONBASE)/avl/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(COMMONBASE)/inet/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(COMMONBASE)/ucode/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -244,10 +249,18 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/hsfs/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/hyprlofs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/lofs/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/lxproc/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/mntfs/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -759,6 +772,10 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/drm/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/dr_sas/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/efe/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -987,6 +1004,10 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/net80211/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/nfp/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/nge/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1006,6 +1027,14 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/nxge/npi/%.c
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/nxge/%.s
 	$(COMPILE.s) -o $@ $<
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/overlay/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/overlay/plugins/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/pci-ide/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1142,6 +1171,10 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/sdcard/targets/sdcard/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/gsqueue/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/sfe/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1154,6 +1187,10 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/softmac/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/vnd/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/uath/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1278,6 +1315,30 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/usb/usba10/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/usbgem/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/axf/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/udf/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/udmf/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/upf/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/urf/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/vuidmice/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1358,6 +1419,14 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/ixgbe/core/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/i40e/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/i40e/core/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/ntxn/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1462,9 +1531,14 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/vioblk/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(COMMONBASE)/idspace/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/io/vioif/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
+
 #
 # krtld must refer to its own bzero/bcopy until the kernel is fully linked
 #
@@ -1533,6 +1607,10 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/pcmcia/pcs/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/refhash/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/rpc/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1656,6 +1734,9 @@ $(LINTS_DIR)/%.ln:		$(COMMONBASE)/acl/%.c
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/avl/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(COMMONBASE)/inet/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/ucode/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -1791,9 +1872,15 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/fifofs/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/hsfs/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/hyprlofs/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/lofs/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/lxproc/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/mntfs/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2139,6 +2226,9 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/dmfe/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/drm/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/dr_sas/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/efe/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2310,6 +2400,9 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/mwl/mwl_fw/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/net80211/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/nfp/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:              $(UTSBASE)/common/io/nge/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2325,6 +2418,12 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/nxge/%.s
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/nxge/npi/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/overlay/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/overlay/plugins/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/pci-ide/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2532,6 +2631,21 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/usb/usba/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/usb/usba10/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/usbgem/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/axf/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/udmf/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/upf/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/urf/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/vuidmice/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2592,6 +2706,9 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/ixgbe/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/ixgbe/core/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/i40e/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/ntxn/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2682,6 +2799,9 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/pcmcia/nexus/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/pcmcia/pcs/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/refhash/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/rpc/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -2775,3 +2895,6 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/io/skd/%.c
 
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/fsreparse/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(COMMONBASE)/idspace/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c
new file mode 100644
index 0000000000..c55fc6d95f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c
@@ -0,0 +1,3152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * See the big theory statement in ../sys/lx_autofs.h
+ */
+
+#include <fs/fs_subr.h>
+#include <sys/stat.h>
+#include <sys/atomic.h>
+#include <sys/cmn_err.h>
+#include <sys/dirent.h>
+#include <sys/fs/fifonode.h>
+#include <sys/modctl.h>
+#include <sys/mount.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/conf.h>
+#include <sys/sdt.h>
+
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+
+#include <sys/dnlc.h>
+#include <nfs/rnode.h>
+#include <nfs/rnode4.h>
+#include <sys/lx_autofs_impl.h>
+#include <sys/lx_types.h>
+
+/*
+ * External functions
+ */
+extern uintptr_t		space_fetch(char *key);
+extern int			space_store(char *key, uintptr_t ptr);
+extern int			umount2_engine(vfs_t *, int, cred_t *, int);
+
+/*
+ * Globals
+ */
+static vfsops_t			*lx_autofs_vfsops;
+static vnodeops_t		*lx_autofs_vn_ops = NULL;
+static int			lx_autofs_fstype;
+static major_t			lx_autofs_major;
+static minor_t			lx_autofs_minor = 0;
+static dev_info_t		*lx_autofs_dip = NULL;
+
+#define	LX_AUTOFS_DEV_VERSION_MAJOR	1
+#define	LX_AUTOFS_DEV_VERSION_MINOR	0
+
+/* The Linux autofs superblock magic number */
+#define	LX_AUTOFS_SB_MAGIC	0x0187
+
+/* Linux autofs mount types */
+#define	LX_AUTOFS_TYPE_INDIRECT		1
+#define	LX_AUTOFS_TYPE_DIRECT		2
+#define	LX_AUTOFS_TYPE_OFFSET		4
+
+/* Structure passed for autofs dev ioctls */
+typedef struct lx_autofs_dv_ioctl {
+	uint32_t lad_ver_major;
+	uint32_t lad_ver_minor;
+	uint32_t lad_size;
+	uint32_t lad_ioctlfd;
+	uint32_t lad_arg1;
+	uint32_t lad_arg2;
+	char	lad_path[0];
+} lx_autofs_dv_ioctl_t;
+
+/*
+ * Support functions
+ */
+static void
+lx_autofs_strfree(char *str)
+{
+	kmem_free(str, strlen(str) + 1);	/* + 1 for the NUL */
+}
+
+static char *
+lx_autofs_strdup(char *str)
+{
+	int	n = strlen(str);		/* excludes the NUL */
+	char	*ptr = kmem_alloc(n + 1, KM_SLEEP);
+	bcopy(str, ptr, n + 1);			/* copies the NUL too */
+	return (ptr);
+}
+
+static int
+lx_autofs_str_to_int(char *str, int *val)
+{
+	long	res;
+
+	if (str == NULL)
+		return (-1);
+
+	if ((ddi_strtol(str, NULL, 10, &res) != 0) ||
+	    (res < INT_MIN) || (res > INT_MAX))
+		return (-1);
+
+	*val = res;
+	return (0);
+}
+
+static void
+ls_autofs_stack_init(list_t *lp)
+{
+	list_create(lp,
+	    sizeof (stack_elem_t), offsetof(stack_elem_t, se_list));
+}
+
+static void
+lx_autofs_stack_fini(list_t *lp)
+{
+	ASSERT(list_head(lp) == NULL);
+	list_destroy(lp);
+}
+
+static void
+lx_autofs_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3)
+{
+	stack_elem_t	*se;
+
+	se = kmem_alloc(sizeof (*se), KM_SLEEP);
+	se->se_ptr1 = ptr1;
+	se->se_ptr2 = ptr2;
+	se->se_ptr3 = ptr3;
+	list_insert_head(lp, se);
+}
+
+static int
+lx_autofs_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3)
+{
+	stack_elem_t	*se;
+
+	if ((se = list_head(lp)) == NULL)
+		return (-1);
+	list_remove(lp, se);
+	if (ptr1 != NULL)
+		*ptr1 = se->se_ptr1;
+	if (ptr2 != NULL)
+		*ptr2 = se->se_ptr2;
+	if (ptr3 != NULL)
+		*ptr3 = se->se_ptr3;
+	kmem_free(se, sizeof (*se));
+	return (0);
+}
+
+static vnode_t *
+lx_autofs_fifo_peer_vp(vnode_t *vp)
+{
+	fifonode_t *fnp = VTOF(vp);
+	fifonode_t *fn_dest = fnp->fn_dest;
+	return (FTOV(fn_dest));
+}
+
+static vnode_t *
+lx_autofs_vn_alloc(vfs_t *vfsp, vnode_t *uvp)
+{
+	lx_autofs_vfs_t	*data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	vnode_t		*vp, *vp_old;
+
+	/* Allocate a new vnode structure in case we need it. */
+	vp = vn_alloc(KM_SLEEP);
+	vn_setops(vp, lx_autofs_vn_ops);
+	VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev);
+	vp->v_data = uvp;
+	ASSERT(vp->v_count == 1);
+
+	/*
+	 * Take a hold on the vfs structure.  This is how unmount will
+	 * determine if there are any active vnodes in the file system.
+	 */
+	VFS_HOLD(vfsp);
+
+	/*
+	 * Check if we already have a vnode allocated for this underlying
+	 * vnode_t.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (mod_hash_find(data->lav_vn_hash,
+	    (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) {
+
+		/*
+		 * Didn't find an existing node.
+		 * Add this node to the hash and return.
+		 */
+		VERIFY(mod_hash_insert(data->lav_vn_hash,
+		    (mod_hash_key_t)uvp,
+		    (mod_hash_val_t)vp) == 0);
+		mutex_exit(&data->lav_lock);
+		return (vp);
+	}
+
+	/* Get a hold on the existing vnode and free up the one we allocated. */
+	VN_HOLD(vp_old);
+	mutex_exit(&data->lav_lock);
+
+	/* Free up the new vnode we allocated. */
+	VN_RELE(uvp);
+	VFS_RELE(vfsp);
+	vn_invalid(vp);
+	vn_free(vp);
+
+	return (vp_old);
+}
+
+static void
+lx_autofs_vn_free(vnode_t *vp)
+{
+	vfs_t		*vfsp = vp->v_vfsp;
+	lx_autofs_vfs_t	*data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	vnode_t		*uvp = vp->v_data;
+	vnode_t	*vp_tmp;
+
+	ASSERT(MUTEX_HELD((&data->lav_lock)));
+	ASSERT(MUTEX_HELD((&vp->v_lock)));
+
+	ASSERT(vp->v_count == 0);
+
+	/* We're about to free this vnode so take it out of the hash. */
+	(void) mod_hash_remove(data->lav_vn_hash,
+	    (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp);
+
+	/*
+	 * No one else can lookup this vnode any more so there's no need
+	 * to hold locks.
+	 */
+	mutex_exit(&data->lav_lock);
+	mutex_exit(&vp->v_lock);
+
+	/* Release the underlying vnode. */
+	VN_RELE(uvp);
+	VFS_RELE(vfsp);
+	vn_invalid(vp);
+	vn_free(vp);
+}
+
+static lx_autofs_automnt_req_t *
+lx_autofs_la_alloc(lx_autofs_vfs_t *data, boolean_t *is_dup, boolean_t expire,
+    char *nm)
+{
+	lx_autofs_automnt_req_t	*laar, *laar_dup;
+
+	/* Pre-allocate a new automounter request before grabbing locks. */
+	laar = kmem_zalloc(sizeof (*laar), KM_SLEEP);
+	mutex_init(&laar->laar_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&laar->laar_cv, NULL, CV_DEFAULT, NULL);
+	laar->laar_ref = 1;
+
+	if (data->lav_min_proto == 5) {
+		laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS5;
+
+		if (data->lav_mnttype == LXAMT_INDIR) {
+			if (expire) {
+				laar->laar_pkt.lap_type =
+				    LX_AUTOFS_PTYPE_EXPIRE_INDIR;
+			} else {
+				laar->laar_pkt.lap_type =
+				    LX_AUTOFS_PTYPE_MISSING_INDIR;
+			}
+		} else {
+			if (expire) {
+				laar->laar_pkt.lap_type =
+				    LX_AUTOFS_PTYPE_EXPIRE_DIRECT;
+			} else {
+				laar->laar_pkt.lap_type =
+				    LX_AUTOFS_PTYPE_MISSING_DIRECT;
+			}
+		}
+		laar->laar_pkt_size = sizeof (lx_autofs_v5_pkt_t);
+
+		laar->laar_pkt.lap_v5.lap_dev = data->lav_dev;
+		laar->laar_pkt.lap_v5.lap_ino = data->lav_ino;
+		/*
+		 * Note that we're currently not filling in the other v5 pkt
+		 * fields (pid, uid, etc.) since they don't appear to be used
+		 * by the automounter. We can fill those in later if it proves
+		 * necessary.
+		 */
+
+		/*
+		 * For indirect mounts the token expected by the automounter is
+		 * the name of the directory entry to look up (not the entire
+		 * path that is being accessed.) For direct mounts the Linux
+		 * kernel passes a dummy name, so this is just as good.
+		 */
+		laar->laar_pkt.lap_v5.lap_name_len = strlen(nm);
+		if (laar->laar_pkt.lap_v5.lap_name_len >
+		    (sizeof (laar->laar_pkt.lap_v5.lap_name) - 1)) {
+			zcmn_err(getzoneid(), CE_NOTE,
+			    "invalid autofs automnt req: \"%s\"", nm);
+			kmem_free(laar, sizeof (*laar));
+			return (NULL);
+		}
+		(void) strlcpy(laar->laar_pkt.lap_v5.lap_name, nm,
+		    sizeof (laar->laar_pkt.lap_v5.lap_name));
+
+	} else if (expire) {
+		zcmn_err(getzoneid(), CE_WARN,
+		    "unsupported expire protocol request: \"%s\"", nm);
+		kmem_free(laar, sizeof (*laar));
+		return (NULL);
+
+	} else {
+		ASSERT(expire == B_FALSE);
+
+		/* Older protocol pkt (really v2) */
+		laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS2;
+		laar->laar_pkt.lap_type = LX_AUTOFS_PTYPE_MISSING;
+		laar->laar_pkt_size = sizeof (lx_autofs_v2_pkt_t);
+
+		/*
+		 * The token expected by the linux automount is the name of
+		 * the directory entry to look up.  (And not the entire
+		 * path that is being accessed.)
+		 */
+		laar->laar_pkt.lap_v2.lap_name_len = strlen(nm);
+		if (laar->laar_pkt.lap_v2.lap_name_len >
+		    (sizeof (laar->laar_pkt.lap_v2.lap_name) - 1)) {
+			zcmn_err(getzoneid(), CE_NOTE,
+			    "invalid autofs lookup: \"%s\"", nm);
+			kmem_free(laar, sizeof (*laar));
+			return (NULL);
+		}
+		(void) strlcpy(laar->laar_pkt.lap_v2.lap_name, nm,
+		    sizeof (laar->laar_pkt.lap_v2.lap_name));
+	}
+
+	/* Assign a unique id for this request. */
+	laar->laar_pkt.lap_id = id_alloc(data->lav_ids);
+
+	/* Check for an outstanding request for this path. */
+	mutex_enter(&data->lav_lock);
+	if (mod_hash_find(data->lav_path_hash,
+	    (mod_hash_key_t)nm, (mod_hash_val_t *)&laar_dup) == 0) {
+		/*
+		 * There's already an outstanding request for this
+		 * path so we don't need a new one.
+		 */
+		id_free(data->lav_ids, laar->laar_pkt.lap_id);
+		kmem_free(laar, sizeof (*laar));
+		laar = laar_dup;
+
+		/* Bump the ref count on the old request. */
+		atomic_add_int(&laar->laar_ref, 1);
+
+		*is_dup = 1;
+	} else {
+		/* Add it to the hashes. */
+		VERIFY(mod_hash_insert(data->lav_id_hash,
+		    (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+		    (mod_hash_val_t)laar) == 0);
+		VERIFY(mod_hash_insert(data->lav_path_hash,
+		    (mod_hash_key_t)lx_autofs_strdup(nm),
+		    (mod_hash_val_t)laar) == 0);
+
+		*is_dup = 0;
+	}
+	mutex_exit(&data->lav_lock);
+
+	return (laar);
+}
+
+static lx_autofs_automnt_req_t *
+lx_autofs_la_find(lx_autofs_vfs_t *data, int id)
+{
+	lx_autofs_automnt_req_t	*laar;
+
+	/* Check for an outstanding request for this id. */
+	mutex_enter(&data->lav_lock);
+	if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id,
+	    (mod_hash_val_t *)&laar) != 0) {
+		mutex_exit(&data->lav_lock);
+		return (NULL);
+	}
+	atomic_add_int(&laar->laar_ref, 1);
+	mutex_exit(&data->lav_lock);
+	return (laar);
+}
+
+static void
+lx_autofs_la_complete(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+	lx_autofs_automnt_req_t	*laar_tmp;
+
+	/* Remove this request from the hashes so no one can look it up. */
+	mutex_enter(&data->lav_lock);
+	(void) mod_hash_remove(data->lav_id_hash,
+	    (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+	    (mod_hash_val_t)&laar_tmp);
+	if (data->lav_min_proto == 5) {
+		(void) mod_hash_remove(data->lav_path_hash,
+		    (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name,
+		    (mod_hash_val_t)&laar_tmp);
+	} else {
+		(void) mod_hash_remove(data->lav_path_hash,
+		    (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name,
+		    (mod_hash_val_t)&laar_tmp);
+	}
+	mutex_exit(&data->lav_lock);
+
+	/* Mark this request as complete and wake up anyone waiting on it. */
+	mutex_enter(&laar->laar_lock);
+	laar->laar_complete = 1;
+	cv_broadcast(&laar->laar_cv);
+	mutex_exit(&laar->laar_lock);
+}
+
+static void
+lx_autofs_la_release(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+	ASSERT(!MUTEX_HELD(&laar->laar_lock));
+	if (atomic_add_int_nv(&laar->laar_ref, -1) > 0)
+		return;
+	ASSERT(laar->laar_ref == 0);
+	id_free(data->lav_ids, laar->laar_pkt.lap_id);
+	kmem_free(laar, sizeof (*laar));
+}
+
+static void
+lx_autofs_la_abort(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar)
+{
+	lx_autofs_automnt_req_t	*laar_tmp;
+
+	/*
+	 * This is a little tricky.  We're aborting the wait for this
+	 * request.  So if anyone else is waiting for this request we
+	 * can't free it, but if no one else is waiting for the request
+	 * we should free it.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) {
+		mutex_exit(&data->lav_lock);
+		return;
+	}
+	ASSERT(laar->laar_ref == 0);
+
+	/* Remove this request from the hashes so no one can look it up. */
+	(void) mod_hash_remove(data->lav_id_hash,
+	    (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id,
+	    (mod_hash_val_t)&laar_tmp);
+	if (data->lav_min_proto == 5) {
+		(void) mod_hash_remove(data->lav_path_hash,
+		    (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name,
+		    (mod_hash_val_t)&laar_tmp);
+	} else {
+		(void) mod_hash_remove(data->lav_path_hash,
+		    (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name,
+		    (mod_hash_val_t)&laar_tmp);
+	}
+	mutex_exit(&data->lav_lock);
+
+	/* It's ok to free this now because the ref count was zero. */
+	id_free(data->lav_ids, laar->laar_pkt.lap_id);
+	kmem_free(laar, sizeof (*laar));
+}
+
+static int
+lx_autofs_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd)
+{
+	proc_t		*prp;
+	uf_info_t	*fip;
+	uf_entry_t	*ufp_wr, *ufp_rd = NULL;
+	file_t		*fp_wr, *fp_rd = NULL;
+	vnode_t		*vp_wr, *vp_rd;
+	int		i;
+
+	/*
+	 * sprlock() is zone aware, so assuming this mount call was
+	 * initiated by a process in a zone, if it tries to specify
+	 * a pgrp outside of its zone this call will fail.
+	 *
+	 * Also, we want to grab hold of the main automounter process
+	 * and it's going to be the group leader for pgrp, so its
+	 * pid will be equal to pgrp.
+	 */
+	prp = sprlock(pgrp);
+	if (prp == NULL)
+		return (-1);
+	mutex_exit(&prp->p_lock);
+
+	/* Now we want to access the process's open file descriptors. */
+	fip = P_FINFO(prp);
+	mutex_enter(&fip->fi_lock);
+
+	/* Reject a negative or out-of-range fifo write fd. */
+	if (fd < 0 || fd >= fip->fi_nfiles) {
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/* Get a pointer to the write fifo. */
+	UF_ENTER(ufp_wr, fip, fd);
+	if (((fp_wr = ufp_wr->uf_file) == NULL) ||
+	    ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) {
+		/* Invalid fifo fd. */
+		UF_EXIT(ufp_wr);
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/*
+	 * Now we need to find the read end of the fifo (for reasons
+	 * explained below.)  We assume that the read end of the fifo
+	 * is in the same process as the write end.
+	 */
+	vp_rd = lx_autofs_fifo_peer_vp(fp_wr->f_vnode);
+	for (i = 0; i < fip->fi_nfiles; i++) {
+		if (i == fd)
+			continue;
+		UF_ENTER(ufp_rd, fip, i);
+		if (((fp_rd = ufp_rd->uf_file) != NULL) &&
+		    (fp_rd->f_vnode == vp_rd))
+			break;
+		UF_EXIT(ufp_rd);
+	}
+	if (i == fip->fi_nfiles) {
+		/* Didn't find it. */
+		UF_EXIT(ufp_wr);
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/*
+	 * We need to drop fi_lock before we can try to acquire f_tlock;
+	 * the good news is that the file pointers are protected because
+	 * we're still holding uf_lock.
+	 */
+	mutex_exit(&fip->fi_lock);
+
+	/*
+	 * Here we bump the open counts on the fifos.  The reason
+	 * that we do this is because when we go to write to the
+	 * fifo we want to ensure that they are actually open (and
+	 * not in the process of being closed) without having to
+	 * stop the automounter.  (If the write end of the fifo
+	 * were closed and we tried to write to it we would panic.
+	 * If the read end of the fifo was closed and we tried to
+	 * write to the other end, the process that invoked the
+	 * lookup operation would get an unexpected SIGPIPE.)
+	 */
+	mutex_enter(&fp_wr->f_tlock);
+	fp_wr->f_count++;
+	ASSERT(fp_wr->f_count >= 2);
+	mutex_exit(&fp_wr->f_tlock);
+
+	mutex_enter(&fp_rd->f_tlock);
+	fp_rd->f_count++;
+	ASSERT(fp_rd->f_count >= 2);
+	mutex_exit(&fp_rd->f_tlock);
+
+	/* Release all our locks. */
+	UF_EXIT(ufp_wr);
+	UF_EXIT(ufp_rd);
+	mutex_enter(&prp->p_lock);
+	sprunlock(prp);
+
+	/* Return the file pointers. */
+	*fpp_rd = fp_rd;
+	*fpp_wr = fp_wr;
+	return (0);
+}
+
+static uint_t
+/*ARGSUSED*/
+lx_autofs_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+	int	*id = (int *)arg;
+	/* Return the key and terminate the walk. */
+	*id = (uintptr_t)key;
+	return (MH_WALK_TERMINATE);
+}
+
+static void
+lx_autofs_fifo_close(lx_autofs_vfs_t *data)
+{
+	/*
+	 * Close the fifo to prevent any future requests from
+	 * getting sent to the automounter.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr != NULL) {
+		(void) closef(data->lav_fifo_wr);
+		data->lav_fifo_wr = NULL;
+	}
+	if (data->lav_fifo_rd != NULL) {
+		(void) closef(data->lav_fifo_rd);
+		data->lav_fifo_rd = NULL;
+	}
+	mutex_exit(&data->lav_lock);
+
+	/*
+	 * Wake up any threads currently waiting for the automounter.
+	 * Note that it's possible for multiple threads to have entered
+	 * this function and to be doing the work below simultaneously.
+	 */
+	for (;;) {
+		lx_autofs_automnt_req_t	*laar;
+		int			id;
+
+		/* Lookup the first entry in the hash. */
+		id = -1;
+		mod_hash_walk(data->lav_id_hash,
+		    lx_autofs_fifo_close_cb, &id);
+		if (id == -1) {
+			/* No more id's in the hash. */
+			break;
+		}
+		if ((laar = lx_autofs_la_find(data, id)) == NULL) {
+			/* Someone else beat us to it. */
+			continue;
+		}
+
+		/* Mark the request as complete and release it. */
+		lx_autofs_la_complete(data, laar);
+		lx_autofs_la_release(data, laar);
+	}
+}
+
+/*
+ * Check whether the automounter process still has the read end of the fifo
+ * open.  The caller must hold lav_lock.  Returns 0 if one of the process's
+ * file descriptors refers to the read-side vnode, and -1 if the fifo has
+ * already been shut down, the process could not be locked, or no such
+ * descriptor exists.
+ */
+static int
+lx_autofs_fifo_verify_rd(lx_autofs_vfs_t *data)
+{
+	proc_t		*p;
+	uf_info_t	*finfo;
+	uf_entry_t	*ufe = NULL;
+	file_t		*fp = NULL;
+	vnode_t		*rd_vp;
+	boolean_t	found = B_FALSE;
+	int		slot;
+
+	ASSERT(MUTEX_HELD((&data->lav_lock)));
+
+	/* Nothing to verify once the fifo has been closed. */
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		return (-1);
+	}
+	rd_vp = lx_autofs_fifo_peer_vp(data->lav_fifo_wr->f_vnode);
+
+	/*
+	 * sprlock() is zone aware, so if the mount was requested from
+	 * inside a zone with a pgrp that lives outside of that zone, this
+	 * call fails.
+	 *
+	 * The process we are after is the main automounter process; as the
+	 * group leader its pid is the same as the pgrp.
+	 */
+	if ((p = sprlock(data->lav_pgrp)) == NULL)
+		return (-1);
+	mutex_exit(&p->p_lock);
+
+	/* Walk the open file descriptors of that process. */
+	finfo = P_FINFO(p);
+	mutex_enter(&finfo->fi_lock);
+
+	/*
+	 * Search for the read end of the fifo.  We assume it lives in the
+	 * same process as the write end.  On a match we leave the loop with
+	 * the matching entry still entered.
+	 */
+	for (slot = 0; slot < finfo->fi_nfiles; slot++) {
+		UF_ENTER(ufe, finfo, slot);
+		fp = ufe->uf_file;
+		if (fp != NULL && fp->f_vnode == rd_vp) {
+			found = B_TRUE;
+			break;
+		}
+		UF_EXIT(ufe);
+	}
+
+	/* Release all our locks, in the same order for both outcomes. */
+	mutex_exit(&finfo->fi_lock);
+	if (found)
+		UF_EXIT(ufe);
+	mutex_enter(&p->p_lock);
+	sprunlock(p);
+
+	return (found ? 0 : -1);
+}
+
+/*
+ * Write the request packet held in 'laarp' to the automounter's fifo.
+ *
+ * Returns ENOENT if the fifo has already been closed.  Otherwise returns
+ * the VOP_WRITE() result, except that when the automounter turns out to no
+ * longer have the read end open the fifo is shut down and a write that
+ * itself succeeded is reported as ENOENT.
+ */
+static int
+lx_autofs_fifo_write(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laarp)
+{
+	struct uio	uio;
+	struct iovec	iov;
+	file_t		*fp_wr, *fp_rd;
+	int		error;
+
+	/*
+	 * The catch here is that the fifo must not be closed out from
+	 * under us while we are writing to it.  (Another thread could come
+	 * along, realize the automounter process is gone and close the
+	 * fifo.)  To prevent that we bump the reference count on both ends
+	 * before we write.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		mutex_exit(&data->lav_lock);
+		return (ENOENT);
+	}
+	fp_wr = data->lav_fifo_wr;
+	fp_rd = data->lav_fifo_rd;
+
+	/* Bump the open count on the write fifo. */
+	mutex_enter(&fp_wr->f_tlock);
+	fp_wr->f_count++;
+	mutex_exit(&fp_wr->f_tlock);
+
+	/* Bump the open count on the read fifo. */
+	mutex_enter(&fp_rd->f_tlock);
+	fp_rd->f_count++;
+	mutex_exit(&fp_rd->f_tlock);
+
+	mutex_exit(&data->lav_lock);
+
+	iov.iov_base = (caddr_t)&laarp->laar_pkt;
+	iov.iov_len = laarp->laar_pkt_size;
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_loffset = 0;
+	uio.uio_segflg = (short)UIO_SYSSPACE;
+	/*
+	 * uio lives on the stack, so every field the write path may look
+	 * at has to be set explicitly; use the same copy policy as the uio
+	 * built in lx_autofs_bs_readdir().
+	 */
+	uio.uio_extflg = UIO_COPY_CACHED;
+	uio.uio_resid = laarp->laar_pkt_size;
+	uio.uio_llimit = 0;
+	uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK;
+
+	error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL);
+
+	/* Drop the extra references taken above. */
+	(void) closef(fp_wr);
+	(void) closef(fp_rd);
+
+	/*
+	 * After every write we verify that the automounter still has
+	 * these files open.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (lx_autofs_fifo_verify_rd(data) != 0) {
+		/*
+		 * Something happened to the automounter.
+		 * Close down the communication pipe we set up.
+		 */
+		mutex_exit(&data->lav_lock);
+		lx_autofs_fifo_close(data);
+		if (error != 0)
+			return (error);
+		return (ENOENT);
+	}
+	mutex_exit(&data->lav_lock);
+
+	return (error);
+}
+
+/*
+ * Read every entry of directory 'dvp' other than "." and ".." and look each
+ * one up.  Directories are pushed onto 'dir_stack' and everything else onto
+ * 'file_stack', together with 'dvp', the vnode returned by the lookup and a
+ * copy of the name.  When the stack for an entry's kind is NULL the vnode
+ * is released instead.  Returns 0 on success and -1 if VOP_READDIR() or
+ * VOP_LOOKUP() fails.
+ */
+static int
+lx_autofs_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack)
+{
+	struct uio	uio;
+	struct iovec	iov;
+	dirent64_t	*buf, *ent;
+	list_t		*stack;
+	vnode_t		*vp;
+	uintptr_t	end;
+	size_t		buflen, used;
+	int		eof = 0, nents = 64;
+	int		rv = -1;
+	char		*name;
+
+	buflen = nents * (sizeof (*buf));
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_loffset = 0;
+	uio.uio_llimit = MAXOFFSET_T;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_extflg = UIO_COPY_CACHED;
+	uio.uio_fmode = 0;
+
+	while (!eof) {
+		int err;
+
+		/* Reuse the whole buffer for each batch of entries. */
+		iov.iov_base = (char *)buf;
+		iov.iov_len = buflen;
+		uio.uio_resid = buflen;
+
+		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+		err = VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0);
+		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+		if (err != 0)
+			goto out;
+
+		/* An empty batch means there is nothing left to read. */
+		used = buflen - uio.uio_resid;
+		if (used == 0)
+			break;
+
+		end = (uintptr_t)buf + used;
+		for (ent = buf; (uintptr_t)ent < end;
+		    ent = (dirent64_t *)((uintptr_t)ent + ent->d_reclen)) {
+
+			name = ent->d_name;
+			if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+				continue;
+
+			if (VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, kcred,
+			    NULL, NULL, NULL) != 0)
+				goto out;
+
+			/* Pick the stack that collects this kind of entry. */
+			stack = (vp->v_type == VDIR) ? dir_stack : file_stack;
+			if (stack == NULL) {
+				VN_RELE(vp);
+				continue;
+			}
+			lx_autofs_stack_push(stack, (caddr_t)dvp,
+			    (caddr_t)vp, lx_autofs_strdup(name));
+		}
+	}
+	rv = 0;
+
+out:
+	kmem_free(buf, buflen);
+	return (rv);
+}
+
+/*
+ * Remove the directory entry 'path' from directory 'dvp'.  If the entry is
+ * a directory, everything beneath it is removed first.  This is best
+ * effort: nothing is returned, and on the first failure the function stops
+ * after releasing every vnode and name it still holds.
+ */
+static void
+lx_autofs_bs_destroy(vnode_t *dvp, char *path)
+{
+	list_t	search_stack;
+	list_t	dir_stack;
+	list_t	file_stack;
+	vnode_t	*pdvp, *vp;
+	char	*dpath, *fpath;
+	int	ret;
+
+	if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred,
+	    NULL, NULL, NULL) != 0) {
+		/* A directory entry with this name doesn't actually exist. */
+		return;
+	}
+
+	/*
+	 * v_type is an enumeration, not a bit mask, so it has to be compared
+	 * against VDIR rather than masked with it.
+	 */
+	if (vp->v_type != VDIR) {
+		/* Easy, the directory entry is a file so delete it. */
+		VN_RELE(vp);
+		(void) VOP_REMOVE(dvp, path, kcred, NULL, 0);
+		return;
+	}
+
+	/*
+	 * The directory entry is a subdirectory, now we have a bit more
+	 * work to do.  (We'll have to recurse into the sub directory.)
+	 * It would have been much easier to do this recursively but kernel
+	 * stacks are notoriously small.
+	 */
+	ls_autofs_stack_init(&search_stack);
+	ls_autofs_stack_init(&dir_stack);
+	ls_autofs_stack_init(&file_stack);
+
+	/* Save our newfound subdirectory into a list. */
+	lx_autofs_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp,
+	    lx_autofs_strdup(path));
+
+	/* Do a recursive depth first search into the subdirectories. */
+	while (lx_autofs_stack_pop(&search_stack,
+	    (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+		/* Get a list of the subdirectories in this directory. */
+		if (lx_autofs_bs_readdir(dvp, &search_stack, NULL) != 0) {
+			/*
+			 * This entry has been popped and not yet saved on
+			 * dir_stack, so the cleanup at exit won't see it.
+			 */
+			VN_RELE(dvp);
+			lx_autofs_strfree(dpath);
+			goto exit;
+		}
+
+		/* Save the current directory on a separate stack. */
+		lx_autofs_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp,
+		    dpath);
+	}
+
+	/*
+	 * Now dir_stack contains a list of directories, the deepest paths
+	 * are at the top of the list.  So let's go through and process them.
+	 */
+	while (lx_autofs_stack_pop(&dir_stack,
+	    (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+		/* Get a list of the files in this directory. */
+		if (lx_autofs_bs_readdir(dvp, NULL, &file_stack) != 0) {
+			VN_RELE(dvp);
+			lx_autofs_strfree(dpath);
+			goto exit;
+		}
+
+		/* Delete all the files in this directory. */
+		while (lx_autofs_stack_pop(&file_stack,
+		    NULL, (caddr_t *)&vp, &fpath) == 0) {
+			VN_RELE(vp);
+			ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0);
+			lx_autofs_strfree(fpath);
+			if (ret != 0) {
+				/* Don't leak the directory we were emptying. */
+				VN_RELE(dvp);
+				lx_autofs_strfree(dpath);
+				goto exit;
+			}
+		}
+
+		/* Delete this directory. */
+		VN_RELE(dvp);
+		ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0);
+		lx_autofs_strfree(dpath);
+		if (ret != 0)
+			goto exit;
+	}
+
+exit:
+	/* Release whatever is still queued on any of the three stacks. */
+	while (
+	    (lx_autofs_stack_pop(&search_stack, NULL, (caddr_t *)&vp,
+	    &path) == 0) ||
+	    (lx_autofs_stack_pop(&dir_stack, NULL, (caddr_t *)&vp,
+	    &path) == 0) ||
+	    (lx_autofs_stack_pop(&file_stack, NULL, (caddr_t *)&vp,
+	    &path) == 0)) {
+		VN_RELE(vp);
+		lx_autofs_strfree(path);
+	}
+	lx_autofs_stack_fini(&search_stack);
+	lx_autofs_stack_fini(&dir_stack);
+	lx_autofs_stack_fini(&file_stack);
+}
+
+/*
+ * Create the directory 'bs_name' (mode 0755) inside 'dvp'.  Returns the
+ * vnode produced by VOP_MKDIR(), or NULL if the mkdir fails.
+ */
+static vnode_t *
+lx_autofs_bs_create(vnode_t *dvp, char *bs_name)
+{
+	vattr_t	va;
+	vnode_t	*newvp = NULL;
+	int	err;
+
+	/*
+	 * Judging by the mkdir syscall path, only the type and mode need
+	 * to be filled in; everything else is left zeroed.
+	 */
+	bzero(&va, sizeof (va));
+	va.va_mask = AT_TYPE|AT_MODE;
+	va.va_type = VDIR;
+	va.va_mode = 0755; /* u+rwx,og=rx */
+
+	err = VOP_MKDIR(dvp, bs_name, &va, &newvp, kcred, NULL, 0, NULL);
+	return (err == 0 ? newvp : NULL);
+}
+
+/*
+ * Ask the automounter to handle the name 'nm' in directory 'dvp' and wait
+ * for its answer.
+ *
+ * Returns ENOENT when the request is not forwarded (not the root directory,
+ * caller is in the automounter's process group, fifo closed, or no request
+ * could be allocated), the lx_autofs_fifo_write() error if the request
+ * could not be sent, EINTR if the wait was interrupted by a signal, and 0
+ * once the request has completed.
+ */
+static int
+lx_autofs_automounter_call(vnode_t *dvp, char *nm)
+{
+	lx_autofs_vfs_t		*data;
+	lx_autofs_automnt_req_t	*req;
+	lx_autofs_mntent_t	*ent;
+	boolean_t		is_dup, from_automounter, closed;
+	int			err;
+
+	/* Get a pointer to the vfs mount data. */
+	data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data;
+
+	/* The automounter only supports queries in the root directory. */
+	if (dvp != data->lav_root)
+		return (ENOENT);
+
+	/*
+	 * A caller in the automounter's process group is either the
+	 * automounter itself or one of its forked children.  Redirecting
+	 * such a call back into the automounter would hang, so refuse.
+	 */
+	mutex_enter(&pidlock);
+	from_automounter = (data->lav_pgrp == curproc->p_pgrp);
+	mutex_exit(&pidlock);
+	if (from_automounter)
+		return (ENOENT);
+
+	/* Verify that the automount process pipe still exists. */
+	mutex_enter(&data->lav_lock);
+	closed = (data->lav_fifo_wr == NULL);
+	if (closed)
+		ASSERT(data->lav_fifo_rd == NULL);
+	mutex_exit(&data->lav_lock);
+	if (closed)
+		return (ENOENT);
+
+	/* Allocate an automounter request structure. */
+	req = lx_autofs_la_alloc(data, &is_dup, B_FALSE, nm);
+	if (req == NULL)
+		return (ENOENT);
+
+	/*
+	 * Only the thread that was first to allocate this request sends it
+	 * to the automounter.
+	 */
+	if (!is_dup) {
+		err = lx_autofs_fifo_write(data, req);
+		if (err != 0) {
+			/*
+			 * The request could not be sent.  Unblock any other
+			 * threads waiting on it and release it.
+			 */
+			lx_autofs_la_complete(data, req);
+			lx_autofs_la_release(data, req);
+			return (err);
+		}
+	}
+
+	/* Wait for someone to signal us that this request has completed. */
+	mutex_enter(&req->laar_lock);
+	while (!req->laar_complete) {
+		if (cv_wait_sig(&req->laar_cv, &req->laar_lock) != 0)
+			continue;
+
+		/* We got a signal, abort this call. */
+		mutex_exit(&req->laar_lock);
+		lx_autofs_la_abort(data, req);
+		return (EINTR);
+	}
+	mutex_exit(&req->laar_lock);
+
+	if (req->laar_result == LXACR_READY) {
+		/*
+		 * Mount succeeded, keep track for future expire calls.
+		 *
+		 * See vfs lav_vn_hash. Is this something we could use for
+		 * iterating mounts under this autofs? Used by
+		 * lx_autofs_vn_alloc
+		 */
+		ent = kmem_zalloc(sizeof (lx_autofs_mntent_t), KM_SLEEP);
+		ent->lxafme_len = strlen(nm) + 1;
+		ent->lxafme_path = kmem_zalloc(ent->lxafme_len, KM_SLEEP);
+		ent->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+		(void) strlcpy(ent->lxafme_path, nm, ent->lxafme_len);
+
+		mutex_enter(&data->lav_lock);
+		list_insert_tail(&data->lav_mnt_list, ent);
+		mutex_exit(&data->lav_lock);
+	}
+
+	lx_autofs_la_release(data, req);
+
+	return (0);
+}
+
+/*
+ * Report whether this autofs mount could be unmounted right now, using the
+ * same preliminary checks as lx_autofs_unmount(): the caller must pass the
+ * unmount policy check, and neither the vfs nor its root vnode may have
+ * references beyond the expected ones.
+ */
+static boolean_t
+lx_autofs_may_unmount(vfs_t *vfsp, struct cred *cr)
+{
+	lx_autofs_vfs_t *data;
+
+	if (secpolicy_fs_unmount(cr, vfsp) != 0)
+		return (B_FALSE);
+
+	/*
+	 * Two references are always expected: one for the caller and one
+	 * for the root vnode.  Anything above that is an outstanding vnode.
+	 */
+	ASSERT(vfsp->vfs_count >= 2);
+	if (vfsp->vfs_count > 2)
+		return (B_FALSE);
+
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	ASSERT(data->lav_root->v_vfsp == vfsp);
+
+	/* Any extra hold on the root vnode also keeps us busy. */
+	return ((data->lav_root->v_count > 1) ? B_FALSE : B_TRUE);
+}
+
+/*
+ * Scan the current zone's mount list for mounts located at or below
+ * 'fs_mntpt'.  The first matching mount that is not an lx autofs mount is
+ * returned with a VFS_HOLD() on it (NULL if there is none); the number of
+ * matching lx autofs mounts is stored in '*cnt'.
+ */
+static vfs_t *
+lx_autofs_get_mountvfs(char *fs_mntpt, int *cnt)
+{
+	struct vfs *head, *cur;
+	vfs_t *found = NULL;
+	int nautofs = 0;
+	int prefixlen = strlen(fs_mntpt);
+
+	vfs_list_read_lock();
+
+	head = curzone->zone_vfslist;
+	if (head == NULL) {
+		vfs_list_unlock();
+		*cnt = 0;
+		return (NULL);
+	}
+
+	cur = head;
+	do {
+		/* Skip mounts we shouldn't show. */
+		if (!(cur->vfs_flag & VFS_NOMNTTAB)) {
+			char *mntpt = (char *)refstr_value(cur->vfs_mntpt);
+			boolean_t under;
+
+			/* Match the path itself or anything beneath it. */
+			under = (strncmp(fs_mntpt, mntpt, prefixlen) == 0 &&
+			    (mntpt[prefixlen] == '\0' ||
+			    mntpt[prefixlen] == '/')) ? B_TRUE : B_FALSE;
+
+			/*
+			 * Autofs mounts (i.e. autofs direct or offset
+			 * mounts) are only counted; the first mount of any
+			 * other type is the one we hand back.
+			 */
+			if (under && cur->vfs_op == lx_autofs_vfsops) {
+				nautofs++;
+			} else if (under && found == NULL) {
+				found = cur;
+				VFS_HOLD(found);
+			}
+		}
+		cur = cur->vfs_zone_next;
+	} while (cur != head);
+
+	vfs_list_unlock();
+
+	*cnt = nautofs;
+	return (found);
+}
+
+/*
+ * Unmount all autofs offset mounts at or below the given path.
+ *
+ * Walks the current zone's mount list and calls umount2_engine() on every
+ * matching lx autofs mount of type LXAMT_OFFSET.  Because the list lock is
+ * dropped for each unmount, the walk starts over from the head after every
+ * successful unmount.
+ *
+ * Returns B_TRUE ("busy") if a matching mount is not an lx autofs offset
+ * mount or if an unmount fails, and B_FALSE otherwise, including when the
+ * zone has no mounts at all.
+ */
+static boolean_t
+lx_autofs_umount_offset(char *fs_mntpt, struct cred *cr)
+{
+	struct vfs *vfsp;
+	struct vfs *vfslist;
+	boolean_t busy = B_FALSE;
+	int fsmplen = strlen(fs_mntpt);
+
+restart:
+	vfs_list_read_lock();
+
+	vfsp = vfslist = curzone->zone_vfslist;
+	if (vfslist == NULL) {
+		vfs_list_unlock();
+		return (B_FALSE);
+	}
+
+	do {
+		char *mntpt;
+		lx_autofs_vfs_t *data;
+
+		/*
+		 * Skip mounts we should ignore.  Note that 'continue' in a
+		 * do/while jumps to the loop condition, so vfsp has to be
+		 * advanced before each 'continue'.
+		 */
+		if ((vfsp->vfs_flag & VFS_NOMNTTAB)) {
+			vfsp = vfsp->vfs_zone_next;
+			continue;
+		}
+
+		/* Skip mounts that are neither at nor below fs_mntpt. */
+		mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+		if (strncmp(fs_mntpt, mntpt, fsmplen) != 0 ||
+		    (mntpt[fsmplen] != '\0' && mntpt[fsmplen] != '/')) {
+			vfsp = vfsp->vfs_zone_next;
+			continue;
+		}
+
+		if (vfsp->vfs_op != lx_autofs_vfsops) {
+			/*
+			 * Something got mounted over the autofs mountpoint
+			 * after we checked that this indirect hierarchy was
+			 * not busy.
+			 */
+			busy = B_TRUE;
+			break;
+		}
+
+		data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+		if (data->lav_mnttype != LXAMT_OFFSET) {
+			/*
+			 * Something mounted a non-offset autofs fs under this
+			 * indirect mnt!
+			 */
+			busy = B_TRUE;
+			break;
+		}
+
+		/*
+		 * Attempt to umount - set busy if fails.
+		 *
+		 * umount2_engine will call VFS_RELE, so we need to take an
+		 * extra hold to match the behavior during the normal umount
+		 * path.
+		 *
+		 * We also need to drop the list lock to prevent deadlock
+		 * during umount.
+		 */
+		VFS_HOLD(vfsp);
+		vfs_list_unlock();
+		if (umount2_engine(vfsp, 0, cr, 0) != 0) {
+			busy = B_TRUE;
+			/* The list lock is already dropped on this path. */
+			goto errexit;
+		}
+
+		/* Retake list lock and look for more. */
+		goto restart;
+	} while (vfsp != vfslist);
+
+	vfs_list_unlock();
+
+errexit:
+	return (busy);
+}
+
+
+/*
+ * Try to expire one mount tracked by this autofs file system.
+ *
+ * The first entry on lav_mnt_list is taken; if the list is empty and this
+ * is an offset mount that the automounter has open but that has nothing
+ * mounted on it, an entry for the offset mount itself is synthesized.  If
+ * the candidate has been idle long enough and does not look busy, an expire
+ * request is sent to the automounter and we wait for the answer.
+ *
+ * Returns 0 once the automounter has answered the expire request (whether
+ * or not the entry was put back on the list for a later retry), and EAGAIN
+ * in every other case.
+ *
+ * Note that lx_autofs_automounter_call() only supports queries in the root
+ * directory, so all mntent names are relative to that.
+ */
+static int
+lx_autofs_expire(vfs_t *vfsp, struct cred *cr)
+{
+	lx_autofs_vfs_t *data;
+	lx_autofs_mntent_t *mp;
+	lx_autofs_automnt_req_t	*laar;
+	boolean_t is_dup;
+	vfs_t *fnd_vfs;
+	int autofs_cnt;
+	boolean_t busy = B_FALSE;
+	char exp_path[MAXPATHLEN];
+
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+	/*
+	 * We process only the first element (i.e. do not do multi). This
+	 * works fine for the automounter.
+	 */
+	mutex_enter(&data->lav_lock);
+	mp = (lx_autofs_mntent_t *)list_remove_head(&data->lav_mnt_list);
+	mutex_exit(&data->lav_lock);
+	if (mp == NULL) {
+		if (data->lav_mnttype == LXAMT_OFFSET) {
+			/*
+			 * During restart the automounter will openmount each
+			 * offset mount for management. It won't closemount the
+			 * offset mount until we expire it, even though nothing
+			 * is mounted over that offset. We handle this as a
+			 * special expiration case.
+			 */
+			int cnt;
+
+			mutex_enter(&data->lav_lock);
+			cnt = data->lav_openmnt_cnt;
+			mutex_exit(&data->lav_lock);
+
+			if (cnt == 1 && vn_ismntpt(data->lav_root) == 0) {
+				char *mntpt = (char *)
+				    refstr_value(vfsp->vfs_mntpt);
+				char *nm = ZONE_PATH_TRANSLATE(mntpt, curzone);
+
+				mp = kmem_zalloc(sizeof (lx_autofs_mntent_t),
+				    KM_SLEEP);
+				mp->lxafme_len = strlen(nm) + 1;
+				mp->lxafme_path = kmem_zalloc(mp->lxafme_len,
+				    KM_SLEEP);
+				mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+				(void) strlcpy(mp->lxafme_path, nm,
+				    mp->lxafme_len);
+
+				/*
+				 * The check made after the automounter
+				 * answers looks up exp_path, so it has to be
+				 * filled in before we jump past the code
+				 * that normally builds it.  Use the mount
+				 * point, as is done for every non-indirect
+				 * mount below.
+				 */
+				(void) strlcpy(exp_path, mntpt,
+				    sizeof (exp_path));
+
+				goto exp_offset;
+			}
+		}
+
+		return (EAGAIN);
+	}
+
+	/*
+	 * We only return an expired mount if it is inactive for the full
+	 * timeout. This reduces overly aggressive umount/mount activity.
+	 */
+	if (data->lav_timeout > 0) {
+		uint64_t now = TICK_TO_SEC(ddi_get_lbolt64());
+
+		if ((now - mp->lxafme_ts) < data->lav_timeout) {
+			/* put it back at the end of the line */
+			mutex_enter(&data->lav_lock);
+			list_insert_tail(&data->lav_mnt_list, mp);
+			mutex_exit(&data->lav_lock);
+			return (EAGAIN);
+		}
+	}
+
+	/*
+	 * For an indirect mount the entry names a subdirectory of the mount
+	 * point; otherwise the mount point itself is what expires.
+	 */
+	if (data->lav_mnttype == LXAMT_INDIR) {
+		(void) snprintf(exp_path, sizeof (exp_path), "%s/%s",
+		    (char *)refstr_value(vfsp->vfs_mntpt), mp->lxafme_path);
+	} else {
+		(void) strlcpy(exp_path, (char *)refstr_value(vfsp->vfs_mntpt),
+		    sizeof (exp_path));
+	}
+
+	fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt);
+	if (fnd_vfs != NULL) {
+		boolean_t skip = B_FALSE;
+		vfssw_t *vfssw;
+
+		/*
+		 * If it's an NFS file system (typical) then we check in
+		 * advance to see if it can be unmounted, otherwise, proceed.
+		 * The fs-specific umount attempted by the automounter will
+		 * either succeed or fail. Both are valid outcomes but checking
+		 * now for nfs will save a bunch of work by the automounter
+		 * if the fs is busy.
+		 *
+		 * Unfortunately, for NFS the vfs_fstype is the same for all
+		 * versions of NFS, so we need to check the vfs_op member to
+		 * determine which version of NFS we're dealing with.
+		 */
+		if (!skip && (vfssw = vfs_getvfssw("nfs4")) != NULL) {
+			if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) {
+				(void) dnlc_purge_vfsp(fnd_vfs, 0);
+				if (check_rtable4(fnd_vfs))
+					busy = B_TRUE;
+				skip = B_TRUE;
+			}
+			vfs_unrefvfssw(vfssw);
+		}
+
+		if (!skip && (vfssw = vfs_getvfssw("nfs3")) != NULL) {
+			if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) {
+				(void) dnlc_purge_vfsp(fnd_vfs, 0);
+				if (check_rtable(fnd_vfs))
+					busy = B_TRUE;
+			}
+			vfs_unrefvfssw(vfssw);
+		}
+
+		/* Drop the hold lx_autofs_get_mountvfs() took for us. */
+		VFS_RELE(fnd_vfs);
+
+	} else if (autofs_cnt > 0) {
+		/*
+		 * The automounter is asking us to expire and we pulled this
+		 * name from our vfs mountpoint list, but if
+		 * lx_autofs_get_mountvfs returns null then that means we
+		 * didn't find a non-autofs mount under this name. Thus, the
+		 * name could be a subdirectory under an autofs toplevel
+		 * indirect mount with one or more offset mounts below.
+		 * autofs_cnt will indicate how many autofs mounts exist below
+		 * this subdirectory name.
+		 *
+		 * The automounter will take care of unmounting any fs mounted
+		 * over one of these offset mounts (i.e. offset is like a
+		 * direct mount which the automounter will manage) but the
+		 * automounter will not unmount the actual autofs offset mount
+		 * itself, so we have to do that before we can expire the
+		 * top-level subdirectory name.
+		 */
+		busy = lx_autofs_umount_offset(exp_path, cr);
+	}
+
+	if (busy) {
+		/*
+		 * Can't unmount this one right now, put it at the end of the
+		 * list and return. The caller will return EAGAIN for the
+		 * expire ioctl and the automounter will check again later.
+		 */
+		mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+		mutex_enter(&data->lav_lock);
+		list_insert_tail(&data->lav_mnt_list, mp);
+		mutex_exit(&data->lav_lock);
+		return (EAGAIN);
+	}
+
+	/*
+	 * See lx_autofs_automounter_call. We want to send a msg up the pipe
+	 * to the automounter in a similar way.
+	 */
+
+exp_offset:
+	/* Verify that the automount process pipe still exists. */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		mutex_exit(&data->lav_lock);
+		goto err_free;
+	}
+	mutex_exit(&data->lav_lock);
+
+	/* Allocate an automounter expire structure. */
+	if ((laar = lx_autofs_la_alloc(data, &is_dup, B_TRUE,
+	    mp->lxafme_path)) == NULL)
+		goto err_free;
+
+	/*
+	 * If we were the first one to allocate this request then we
+	 * need to send it to the automounter.
+	 */
+	if (!is_dup && lx_autofs_fifo_write(data, laar) != 0) {
+		/*
+		 * Unable to send the request to the automounter.
+		 * Unblock any other threads waiting on the request
+		 * and release the request.
+		 */
+		lx_autofs_la_complete(data, laar);
+		lx_autofs_la_release(data, laar);
+		goto err_free;
+	}
+
+	/* Wait for someone to signal us that this request has completed. */
+	mutex_enter(&laar->laar_lock);
+	while (!laar->laar_complete) {
+		if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) {
+			/* We got a signal, abort this request. */
+			mutex_exit(&laar->laar_lock);
+			lx_autofs_la_abort(data, laar);
+			goto err_free;
+		}
+	}
+	mutex_exit(&laar->laar_lock);
+
+	/*
+	 * If it failed or if the file system is still mounted after we get the
+	 * response from our expire msg, then that means the automounter tried
+	 * to unmount it but failed because the file system is busy, so we put
+	 * this entry back on our list to try to expire it again later.
+	 *
+	 * autofs_cnt is only examined after lx_autofs_get_mountvfs() has run
+	 * (and so has set it), thanks to the short-circuit evaluation.
+	 */
+	fnd_vfs = NULL;
+	if (laar->laar_result == LXACR_FAIL ||
+	    (fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt)) != NULL ||
+	    autofs_cnt > 0) {
+		if (fnd_vfs != NULL)
+			VFS_RELE(fnd_vfs);
+		mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64());
+		mutex_enter(&data->lav_lock);
+		list_insert_tail(&data->lav_mnt_list, mp);
+		mutex_exit(&data->lav_lock);
+	} else {
+		kmem_free(mp->lxafme_path, mp->lxafme_len);
+		kmem_free(mp, sizeof (lx_autofs_mntent_t));
+	}
+
+	lx_autofs_la_release(data, laar);
+	return (0);
+
+err_free:
+	/* The request never completed; the entry is dropped, not requeued. */
+	kmem_free(mp->lxafme_path, mp->lxafme_len);
+	kmem_free(mp, sizeof (lx_autofs_mntent_t));
+	return (EAGAIN);
+}
+
+/*
+ * Record the automounter's answer for request 'reqid' on this mount and
+ * wake up whoever is waiting on it.  Returns ENXIO if no such request is
+ * found, otherwise 0.
+ */
+static int
+lx_autofs_ack(int reqid, vfs_t *vfsp, enum lx_autofs_callres result)
+{
+	lx_autofs_vfs_t		*data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	lx_autofs_automnt_req_t	*req = lx_autofs_la_find(data, reqid);
+
+	if (req == NULL)
+		return (ENXIO);
+
+	/* Store the result, then mark the request complete and release it. */
+	req->laar_result = result;
+	lx_autofs_la_complete(data, req);
+	lx_autofs_la_release(data, req);
+	return (0);
+}
+
+/*
+ * Handle the ioctls the automounter issues against an autofs vnode.  Only
+ * callers in the automounter's process group are served; everyone else
+ * gets ENOENT.  For READY and FAIL, 'arg' is the id of the request being
+ * answered; for the commands that return or take a value it is a user
+ * address.
+ */
+static int
+lx_autofs_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg, cred_t *cr)
+{
+	lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+	boolean_t		allowed;
+	int			id = arg;
+	int			v;
+
+	/*
+	 * Be strict.
+	 * We only accept ioctls from the automounter process group.
+	 */
+	mutex_enter(&pidlock);
+	allowed = (data->lav_pgrp == curproc->p_pgrp);
+	mutex_exit(&pidlock);
+	if (!allowed)
+		return (ENOENT);
+
+	/*
+	 * Commands that hand an int back to the caller set 'v' and break
+	 * out to the shared copyout below; all others return directly.
+	 */
+	switch (cmd) {
+	case LX_AUTOFS_IOC_READY:
+		return (lx_autofs_ack(id, vp->v_vfsp, LXACR_READY));
+
+	case LX_AUTOFS_IOC_FAIL:
+		return (lx_autofs_ack(id, vp->v_vfsp, LXACR_FAIL));
+
+	case LX_AUTOFS_IOC_CATATONIC:
+		/* The automounter is shutting down. */
+		lx_autofs_fifo_close(data);
+		return (0);
+
+	case LX_AUTOFS_IOC_PROTOVER:
+		v = LX_AUTOFS_PROTO_VERS5;
+		break;
+
+	case LX_AUTOFS_IOC_PROTOSUBVER:
+		v = LX_AUTOFS_PROTO_SUBVERSION;
+		break;
+
+	case LX_AUTOFS_IOC_ASKUMOUNT:
+		/*
+		 * This is asking if autofs can be unmounted, not asking to
+		 * actually unmount it. We return 1 if it is busy or 0 if it
+		 * can be unmounted.
+		 */
+		v = lx_autofs_may_unmount(vp->v_vfsp, cr) ? 0 : 1;
+		break;
+
+	case LX_AUTOFS_IOC_SETTIMEOUT:
+		if (copyin((caddr_t)arg, &data->lav_timeout, sizeof (ulong_t))
+		    != 0)
+			return (EFAULT);
+		return (0);
+
+	case LX_AUTOFS_IOC_EXPIRE:
+		return (ENOTSUP);
+
+	case LX_AUTOFS_IOC_EXPIRE_MULTI:
+		/*
+		 * The caller is always told EAGAIN here, whatever
+		 * lx_autofs_expire() itself returned.
+		 */
+		(void) lx_autofs_expire(vp->v_vfsp, cr);
+		return (EAGAIN);
+
+	default:
+		ASSERT(0);
+		return (ENOTSUP);
+	}
+
+	if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+/*
+ * Parse the autofs mount options attached to 'vfsp' and record them in
+ * 'data': the fifo descriptor, process group, minimum protocol version,
+ * mount type and the two fifo file pointers.
+ *
+ * Returns 0 on success and EINVAL if a required option is missing or
+ * malformed, the protocol range is unsupported, more than one mount type
+ * is given, or the fifo cannot be found.  On failure 'data' may have been
+ * partially updated, but no file references are left held.
+ */
+static int
+lx_autofs_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data)
+{
+	char		*fd_str, *pgrp_str, *minproto_str, *maxproto_str;
+	int		fd, pgrp, minproto, maxproto;
+	file_t		*fp_wr, *fp_rd;
+
+	/* Require these options to be present. */
+	if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1))
+		return (EINVAL);
+
+	/* Get the values for each parameter. */
+	if ((lx_autofs_str_to_int(fd_str, &fd) != 0) ||
+	    (lx_autofs_str_to_int(pgrp_str, &pgrp) != 0) ||
+	    (lx_autofs_str_to_int(minproto_str, &minproto) != 0) ||
+	    (lx_autofs_str_to_int(maxproto_str, &maxproto) != 0))
+		return (EINVAL);
+
+	/*
+	 * We primarily support v2 & v5 of the linux kernel automounter
+	 * protocol. The userland daemon typically needs v5. We'll reject
+	 * unsupported ioctls later if we get one.
+	 */
+	if ((minproto > 5) || (maxproto < 2))
+		return (EINVAL);
+
+	/*
+	 * Work out the mount type; at most one of the type options may be
+	 * given.  This is done before the fifo lookup so that a bad
+	 * combination is rejected while we hold no file references.
+	 */
+	if (vfs_optionisset(vfsp, LX_MNTOPT_INDIRECT, NULL)) {
+		data->lav_mnttype = LXAMT_INDIR;
+	}
+	if (vfs_optionisset(vfsp, LX_MNTOPT_DIRECT, NULL)) {
+		if (data->lav_mnttype != LXAMT_NONE)
+			return (EINVAL);
+		data->lav_mnttype = LXAMT_DIRECT;
+	}
+	if (vfs_optionisset(vfsp, LX_MNTOPT_OFFSET, NULL)) {
+		if (data->lav_mnttype != LXAMT_NONE)
+			return (EINVAL);
+		data->lav_mnttype = LXAMT_OFFSET;
+	}
+	/* The automounter does test mounts with none of the options */
+	if (data->lav_mnttype == LXAMT_NONE)
+		data->lav_mnttype = LXAMT_DIRECT;
+
+	/*
+	 * Now we need to lookup the fifos we'll be using
+	 * to talk to the userland automounter process.
+	 */
+	if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) {
+		/*
+		 * The automounter doesn't always have the same id as the pgrp.
+		 * This happens when it is started via one of the various
+		 * service managers. In this case the fifo lookup will fail
+		 * so we retry with our own pid.
+		 */
+		int pid = (int)curproc->p_pid;
+
+		if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0)
+			return (EINVAL);
+	}
+
+	/*
+	 * Save the mount options and fifo pointers.  From here on the file
+	 * references belong to 'data'; lx_autofs_unmount() and
+	 * lx_autofs_fifo_close() drop them with closef().
+	 */
+	data->lav_fd = fd;
+	data->lav_min_proto = minproto;
+	data->lav_pgrp = pgrp;
+	data->lav_fifo_rd = fp_rd;
+	data->lav_fifo_wr = fp_wr;
+	return (0);
+}
+
+/*
+ * Build an LX_MAKEDEVICE() value from the major and minor numbers of 'dev'.
+ */
+static uint64_t
+s2l_dev(dev_t dev)
+{
+	major_t	devmajor;
+	minor_t	devminor;
+
+	devminor = getminor(dev);
+	devmajor = getmajor(dev);
+	return (LX_MAKEDEVICE(devmajor, devminor));
+}
+
+/*
+ * VFS entry points
+ */
+/*
+ * Mount an lx autofs file system on 'mvp'.
+ *
+ * Checks that the caller may mount, parses the mount options, (re)creates
+ * the backing store directory underneath the mount point and then sets up
+ * the per-mount state: device number, request id space, hashes, mount list
+ * and root vnode.
+ *
+ * Returns 0 on success.  On failure returns EPERM, ENOTDIR or EBUSY from
+ * the initial checks, the error from option parsing, EBUSY if the backing
+ * store cannot be created, or the VOP_GETATTR() error; in all of those
+ * cases everything acquired so far is released again.
+ */
+static int
+lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	lx_autofs_vfs_t	*data;
+	dev_t		dev;
+	char		name[40];
+	int		error;
+	vattr_t		va;
+
+	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+		return (EPERM);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT)))
+		return (EBUSY);
+
+	/* We don't support mounts in the global zone. */
+	if (getzoneid() == GLOBAL_ZONEID)
+		return (EPERM);
+
+	/*
+	 * Offset mounts will occur below the top-level mountpoint so we
+	 * need to allow for autofs mounts even though mvp is an autofs.
+	 */
+
+	/* Allocate a vfs struct. */
+	data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP);
+
+	/* Parse mount options. */
+	if ((error = lx_autofs_parse_mntopt(vfsp, data)) != 0) {
+		kmem_free(data, sizeof (lx_autofs_vfs_t));
+		return (error);
+	}
+
+	/*
+	 * From here on 'data' owns a reference on each end of the fifo, so
+	 * every failure has to go through the 'fail' label.
+	 */
+
+	/* Initialize the backing store. */
+	lx_autofs_bs_destroy(mvp, LX_AUTOFS_BS_DIR);
+	data->lav_bs_vp = lx_autofs_bs_create(mvp, LX_AUTOFS_BS_DIR);
+	if (data->lav_bs_vp == NULL) {
+		error = EBUSY;
+		goto fail;
+	}
+	data->lav_bs_name = LX_AUTOFS_BS_DIR;
+
+	/* Get the backing store inode for use in v5 protocol msgs */
+	va.va_mask = AT_STAT;
+	if ((error = VOP_GETATTR(data->lav_bs_vp, &va, 0, cr, NULL)) != 0) {
+		/* Give back the vnode that the mkdir handed us. */
+		VN_RELE(data->lav_bs_vp);
+		goto fail;
+	}
+	data->lav_ino = va.va_nodeid;
+
+	/* We have to hold the underlying vnode we're mounted on. */
+	data->lav_mvp = mvp;
+	VN_HOLD(mvp);
+
+	/* Initialize vfs fields */
+	vfsp->vfs_bsize = DEV_BSIZE;
+	vfsp->vfs_fstype = lx_autofs_fstype;
+	vfsp->vfs_data = data;
+
+	/* Invent a dev_t (sigh) */
+	do {
+		dev = makedevice(lx_autofs_major,
+		    atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32);
+	} while (vfs_devismounted(dev));
+	vfsp->vfs_dev = dev;
+	vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype);
+
+	data->lav_dev = s2l_dev(vfsp->vfs_dev);
+
+	/* Create an id space arena for automounter requests. */
+	(void) snprintf(name, sizeof (name), "lx_autofs_id_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_ids = id_space_create(name, 1, INT_MAX);
+
+	/* Create hashes to keep track of automounter requests. */
+	mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL);
+	(void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_path_hash = mod_hash_create_strhash(name,
+	    LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor);
+	(void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_id_hash = mod_hash_create_idhash(name,
+	    LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor);
+
+	/* Create a hash to keep track of vnodes. */
+	(void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_vn_hash = mod_hash_create_ptrhash(name,
+	    LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor,
+	    sizeof (vnode_t));
+
+	list_create(&data->lav_mnt_list, sizeof (lx_autofs_mntent_t),
+	    offsetof(lx_autofs_mntent_t, lxafme_lst));
+
+	/* Create root vnode */
+	data->lav_root = lx_autofs_vn_alloc(vfsp, data->lav_bs_vp);
+
+	data->lav_root->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP;
+
+	/*
+	 * For a direct mountpoint we need to allow a filesystem to be
+	 * mounted overtop of this autofs mount. Otherwise, disallow that.
+	 */
+	if (data->lav_mnttype == LXAMT_INDIR)
+		data->lav_root->v_flag |= VNOMOUNT;
+
+	return (0);
+
+fail:
+	/* Drop the fifo references taken while parsing the options. */
+	if (data->lav_fifo_wr != NULL)
+		(void) closef(data->lav_fifo_wr);
+	if (data->lav_fifo_rd != NULL)
+		(void) closef(data->lav_fifo_rd);
+	kmem_free(data, sizeof (lx_autofs_vfs_t));
+	return (error);
+}
+
+/*
+ * Unmount an lx autofs file system and free all of its per-mount state.
+ *
+ * Returns EPERM if the caller fails the unmount policy check, ENOTSUP for
+ * a forced unmount, EBUSY if the vfs or its root vnode still has
+ * references beyond the expected ones, and 0 on success.
+ */
+static int
+lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr)
+{
+	lx_autofs_vfs_t *data;
+
+	if (secpolicy_fs_unmount(cr, vfsp) != 0)
+		return (EPERM);
+
+	/* We do not currently support forced unmounts. */
+	if (flag & MS_FORCE)
+		return (ENOTSUP);
+
+	/*
+	 * We should never have a reference count of less than 2: one for the
+	 * caller, one for the root vnode.
+	 */
+	ASSERT(vfsp->vfs_count >= 2);
+
+	/* If there are any outstanding vnodes, we can't unmount. */
+	if (vfsp->vfs_count > 2)
+		return (EBUSY);
+
+	/* Check for any remaining holds on the root vnode. */
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	ASSERT(data->lav_root->v_vfsp == vfsp);
+	if (data->lav_root->v_count > 1)
+		return (EBUSY);
+
+	/* Close the fifo to the automount process. */
+	if (data->lav_fifo_wr != NULL)
+		(void) closef(data->lav_fifo_wr);
+	if (data->lav_fifo_rd != NULL)
+		(void) closef(data->lav_fifo_rd);
+
+	/*
+	 * We have to release our hold on our root vnode before we can
+	 * delete the backing store.  (Since the root vnode is linked
+	 * to the backing store.)
+	 */
+	VN_RELE(data->lav_root);
+
+	/* Cleanup the backing store. */
+	lx_autofs_bs_destroy(data->lav_mvp, data->lav_bs_name);
+	VN_RELE(data->lav_mvp);
+
+	/*
+	 * Delete all listed mounts.
+	 */
+	for (;;) {
+		lx_autofs_mntent_t *mp;
+
+		mp = list_remove_head(&data->lav_mnt_list);
+		if (mp == NULL)
+			break;
+		kmem_free(mp->lxafme_path, mp->lxafme_len);
+		kmem_free(mp, sizeof (lx_autofs_mntent_t));
+	}
+
+	/* Cleanup our remaining data structures. */
+	mod_hash_destroy_strhash(data->lav_path_hash);
+	mod_hash_destroy_idhash(data->lav_id_hash);
+	mod_hash_destroy_ptrhash(data->lav_vn_hash);
+	id_space_destroy(data->lav_ids);
+	list_destroy(&data->lav_mnt_list);
+	/* Pairs with the mutex_init() done in lx_autofs_mount(). */
+	mutex_destroy(&data->lav_lock);
+	kmem_free(data, sizeof (lx_autofs_vfs_t));
+
+	return (0);
+}
+
+/*
+ * VFS root entry point: return the mount's root vnode with a fresh hold
+ * that the caller is responsible for releasing.
+ */
+static int
+lx_autofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	vnode_t	*rootvp = ((lx_autofs_vfs_t *)vfsp->vfs_data)->lav_root;
+
+	VN_HOLD(rootvp);
+	*vpp = rootvp;
+
+	return (0);
+}
+
+/*
+ * VFS statvfs entry point.  The numbers come from the filesystem underneath
+ * the root vnode; the identifying fields are then rewritten so that they
+ * describe this mount rather than the underlying one.
+ */
+static int
+lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+	lx_autofs_vfs_t	*data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	vnode_t		*under_root = data->lav_root->v_data;
+	dev32_t		fsid;
+	int		rv;
+
+	rv = VFS_STATVFS(under_root->v_vfsp, sp);
+	if (rv != 0)
+		return (rv);
+
+	/* Overlay our own fsid, fs type name and mount flags. */
+	(void) cmpldev(&fsid, vfsp->vfs_dev);
+	sp->f_fsid = fsid;
+	(void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name,
+	    sizeof (sp->f_basetype));
+	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	bzero(sp->f_fstr, sizeof (sp->f_fstr));
+	return (0);
+}
+
+/*
+ * VFS operations template; lx_autofs_init() hands this to vfs_setfsops().
+ * The table is terminated by an all-NULL entry.
+ */
+static const fs_operation_def_t lx_autofs_vfstops[] = {
+	{ VFSNAME_MOUNT,	{ .vfs_mount = lx_autofs_mount } },
+	{ VFSNAME_UNMOUNT,	{ .vfs_unmount = lx_autofs_unmount } },
+	{ VFSNAME_ROOT,		{ .vfs_root = lx_autofs_root } },
+	{ VFSNAME_STATVFS,	{ .vfs_statvfs = lx_autofs_statvfs } },
+	{ NULL, NULL }
+};
+
+/*
+ * VOP entry points - simple passthrough
+ *
+ * For most VOP entry points we can simply pass the request on to
+ * the underlying filesystem we're mounted on.
+ */
+static int
+lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ctp)
+{
+	/* v_data of an autofs vnode is the underlying vnode it shadows. */
+	vnode_t *under = vp->v_data;
+
+	return (VOP_CLOSE(under, flag, count, offset, cr, ctp));
+}
+
+static int
+lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ctp, int flags)
+{
+	/* Directory contents are read straight from the underlying vnode. */
+	vnode_t *under = vp->v_data;
+
+	return (VOP_READDIR(under, uiop, cr, eofp, ctp, flags));
+}
+
+static int
+lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+    caller_context_t *ctp)
+{
+	/* Access checks are delegated to the underlying vnode. */
+	vnode_t *under = vp->v_data;
+
+	return (VOP_ACCESS(under, mode, flags, cr, ctp));
+}
+
+static int
+lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+	/* Take the lock on the underlying vnode, not on our shadow. */
+	vnode_t *under = vp->v_data;
+
+	return (VOP_RWLOCK(under, write_lock, ctp));
+}
+
+static void
+lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+	/* Drop the lock on the underlying vnode, mirroring the rwlock op. */
+	vnode_t *under = vp->v_data;
+
+	VOP_RWUNLOCK(under, write_lock, ctp);
+}
+
+/*
+ * Check if attempting to access a 'direct' mount and if so, call the
+ * automounter to perform the mount. Once the mount occurs, the new filesystem
+ * will be mounted overtop of this autofs mountpoint and we will no longer
+ * come through this path.
+ *
+ * Returns NULL for an indirect mount, when traverse() fails, or when the
+ * vnode reached by traverse() still uses the autofs vnode ops.  Otherwise
+ * returns the traversed-to vnode with a hold that the caller must release.
+ */
+static vnode_t *
+lx_autofs_do_direct(vnode_t *vp)
+{
+	vfs_t	*vfsp = vp->v_vfsp;
+	lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	vnode_t *nvp;
+	boolean_t skip_am_call = B_FALSE;
+
+	if (data->lav_mnttype == LXAMT_INDIR)
+		return (NULL);
+
+	/*
+	 * Check if the current process is in the automounter's process group.
+	 * If it is, the current process is either the automounter itself or
+	 * one of its children. If so, don't call back into the automounter.
+	 */
+	mutex_enter(&pidlock);
+	if (data->lav_pgrp == curproc->p_pgrp) {
+		skip_am_call = B_TRUE;
+	}
+	mutex_exit(&pidlock);
+
+	/*
+	 * It is possible there is already a new fs mounted on top of our vnode.
+	 * This can happen if the caller first did a lookup of a file name
+	 * using our vnode as the directory vp. The lookup would trigger the
+	 * autofs mount on top of ourself, but if the caller then uses our
+	 * vnode to do a getattr on the directory, it will use the autofs
+	 * vnode and not the newly mounted vnode. We need to skip re-calling
+	 * the automounter for this case.
+	 */
+	if (!skip_am_call && vn_mountedvfs(vp) == NULL) {
+		char tbuf[MAXPATHLEN];
+		char *nm;
+
+		/*
+		 * The name handed to the automounter is the last component of
+		 * our mount point: scan backwards from the end of the copied
+		 * path to the final '/' and step past it.
+		 */
+		(void) strlcpy(tbuf, (char *)refstr_value(vfsp->vfs_mntpt),
+		    sizeof (tbuf));
+		nm = tbuf + strlen(tbuf);
+		while (*nm != '/' && nm != tbuf)
+			nm--;
+		if (*nm == '/')
+			nm++;
+		/*
+		 * The result is deliberately ignored; whether a mount actually
+		 * appeared is determined by the traverse() below.
+		 */
+		(void) lx_autofs_automounter_call(vp, nm);
+	}
+
+	/*
+	 * We need to take an extra hold on our vp (which is the autofs
+	 * root vp) to account for the rele done in traverse. traverse will
+	 * take a hold on the new vp so the caller is responsible for calling
+	 * VN_RELE on the returned vp.
+	 */
+	VN_HOLD(vp);
+	nvp = vp;
+	if (traverse(&nvp) != 0) {
+		VN_RELE(nvp);
+		return (NULL);
+	}
+
+	/* Confirm that we have a non-autofs fs mounted now */
+	if (nvp->v_op == lx_autofs_vn_ops) {
+		VN_RELE(nvp);
+		return (NULL);
+	}
+
+	return (nvp);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+    caller_context_t *ctp, int flags)
+{
+	vnode_t	*under_dvp = dvp->v_data;
+	vnode_t	*direct_vp;
+	int	rv;
+
+	/* A direct mount covering us handles the request itself. */
+	direct_vp = lx_autofs_do_direct(dvp);
+	if (direct_vp != NULL) {
+		rv = VOP_RMDIR(direct_vp, nm, cdir, cr, ctp, flags);
+		VN_RELE(direct_vp);
+		return (rv);
+	}
+
+	/*
+	 * cdir is the calling process's current directory.  When it is one
+	 * of our vnodes, substitute the underlying vnode so the filesystem
+	 * below sees a vnode of its own.  (It seems like the only thing cdir
+	 * is ever used for is to make sure the user doesn't delete their
+	 * current directory.)
+	 */
+	if (vn_matchops(cdir, lx_autofs_vn_ops))
+		cdir = cdir->v_data;
+
+	return (VOP_RMDIR(under_dvp, nm, cdir, cr, ctp, flags));
+}
+
+/*
+ * VOP entry points - special passthrough
+ *
+ * For some VOP entry points we will first pass the request on to
+ * the underlying filesystem we're mounted on.  If there's an error
+ * then we immediately return the error, but if the request succeeds
+ * we have to do some extra work before returning.
+ */
+static int
+lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp)
+{
+	vnode_t	*self = *vpp;
+	vnode_t	*under = self->v_data;
+	int	rv;
+
+	/* direct mounts were handled by the lookup to get *vpp */
+
+	rv = VOP_OPEN(&under, flag, cr, ctp);
+
+	/*
+	 * A clone open makes the underlying filesystem hand back a different
+	 * vnode.  In that case wrap the new vnode in an autofs vnode of its
+	 * own and drop the one we were called with.
+	 */
+	if (rv == 0 && under != self->v_data) {
+		*vpp = lx_autofs_vn_alloc(self->v_vfsp, under);
+		VN_RELE(self);
+	}
+
+	return (rv);
+}
+
+static int
+lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ctp)
+{
+	lx_autofs_vfs_t	*data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+	vnode_t		*target = vp->v_data;
+	vnode_t		*direct_vp;
+	int		rv;
+
+	/* Prefer the filesystem mounted on top of a direct mount point. */
+	direct_vp = lx_autofs_do_direct(vp);
+	if (direct_vp != NULL)
+		target = direct_vp;
+
+	rv = VOP_GETATTR(target, vap, flags, cr, ctp);
+
+	if (direct_vp != NULL)
+		VN_RELE(direct_vp);
+
+	if (rv != 0)
+		return (rv);
+
+	/* Update the attributes with our filesystem id. */
+	vap->va_fsid = data->lav_dev;
+
+	if (direct_vp != NULL) {
+		/*
+		 * We operated on the direct mounted fs.  During automounter
+		 * restart recovery the automounter will fstat the fd provided
+		 * in the setpipe ioctl. It uses the resulting inode & dev to
+		 * correlate future autofs fifo requests to the correct entry.
+		 * Thus, we also have to report our own node id.
+		 */
+		vap->va_nodeid = data->lav_ino;
+	}
+
+	return (0);
+}
+
+static int
+lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp,
+    cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp)
+{
+	vnode_t	*target = dvp->v_data;
+	vnode_t	*direct_vp;
+	int	rv;
+
+	direct_vp = lx_autofs_do_direct(dvp);
+	if (direct_vp != NULL)
+		target = direct_vp;
+
+	rv = VOP_MKDIR(target, nm, vap, vpp, cr, ctp, flags, vsecp);
+
+	if (direct_vp != NULL) {
+		/* we operated on the direct mounted fs */
+		VN_RELE(direct_vp);
+		return (rv);
+	}
+
+	if (rv == 0) {
+		/* Update the attributes with our filesystem id. */
+		vap->va_fsid = dvp->v_vfsp->vfs_dev;
+
+		/* Wrap the new underlying directory in a vnode of our own. */
+		*vpp = lx_autofs_vn_alloc(dvp->v_vfsp, *vpp);
+	}
+
+	return (rv);
+}
+
+/*
+ * VOP entry points - custom
+ */
+/*
+ * VOP inactive entry point: drop one reference on vp and, if that was the
+ * last one, hand the vnode to lx_autofs_vn_free().
+ */
+/*ARGSUSED*/
+static void
+lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp)
+{
+	lx_autofs_vfs_t	*data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+
+	/*
+	 * We need to hold the vfs lock because if we're going to free
+	 * this vnode we have to prevent anyone from looking it up
+	 * in the vnode hash.
+	 */
+	mutex_enter(&data->lav_lock);
+	mutex_enter(&vp->v_lock);
+
+	if (vp->v_count < 1) {
+		panic("lx_autofs_inactive: bad v_count");
+		/*NOTREACHED*/
+	}
+
+	/* Drop the temporary hold by vn_rele now. */
+	if (--vp->v_count > 0) {
+		/* Someone else still holds the vnode; nothing to free. */
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&data->lav_lock);
+		return;
+	}
+
+	/*
+	 * No one should have been blocked on this lock because we're
+	 * about to free this vnode.
+	 *
+	 * NOTE(review): both lav_lock and v_lock are still held here;
+	 * presumably lx_autofs_vn_free() releases them -- confirm against
+	 * its definition.
+	 */
+	lx_autofs_vn_free(vp);
+}
+
+/*
+ * VOP lookup entry point.
+ *
+ * For an indirect mount the name is first looked up in the underlying
+ * directory; only an ENOENT result is referred to the automounter, after
+ * which the underlying lookup is retried.  For a direct or offset mount the
+ * automounter is always called and the lookup is then performed in whatever
+ * filesystem traverse() finds mounted on dvp.  Returns ENOENT without
+ * calling the automounter when the mount is catatonic.
+ */
+static int
+lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp,
+    int *direntflags, pathname_t *realpnp)
+{
+	vnode_t			*udvp = dvp->v_data;
+	vnode_t			*uvp = NULL;
+	lx_autofs_vfs_t		*data;
+	int			error = ENOENT;
+
+	data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data;
+
+	/*
+	 * For an indirect mount first try to lookup if this path component
+	 * already exists.
+	 */
+	if (data->lav_mnttype == LXAMT_INDIR) {
+		if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr,
+		    ctp, direntflags, realpnp)) == 0) {
+			*vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp);
+			return (0);
+		}
+	}
+
+	/*
+	 * Only query the automounter if the path does not exist.  (For a
+	 * direct or offset mount error still holds its initial ENOENT.)
+	 */
+	if (error != ENOENT)
+		return (error);
+
+	if (data->lav_catatonic)
+		return (ENOENT);
+
+	/* Save the uid/gid for the requestor ioctl. */
+	data->lav_uid = crgetuid(cr);
+	data->lav_gid = crgetgid(cr);
+
+	/* Refer the lookup to the automounter. */
+	if ((error = lx_autofs_automounter_call(dvp, nm)) != 0)
+		return (error);
+
+	if (data->lav_mnttype == LXAMT_INDIR) {
+		/*
+		 * Indirect mount. The automounter call should have mounted
+		 * something on nm. Retry the lookup operation.
+		 */
+		if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr,
+		    ctp, direntflags, realpnp)) == 0) {
+			*vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp);
+			return (0);
+		}
+	} else {
+		/*
+		 * Direct or offset mount. The automounter call should have
+		 * covered our 'dvp' with a new filesystem. Traverse into the
+		 * new mount and retry the lookup.
+		 *
+		 * We need to take an extra hold on our vp (which is the autofs
+		 * root vp) to account for the rele done in traverse. Our
+		 * caller will also do a rele on the original dvp and that
+		 * would leave us one ref short on our autofs root vnode.
+		 */
+		VN_HOLD(dvp);
+		if ((error = traverse(&dvp)) != 0) {
+			VN_RELE(dvp);
+			return (error);
+		}
+
+		/* Here *vpp is returned as-is, not wrapped in an autofs vnode. */
+		error = VOP_LOOKUP(dvp, nm, vpp, pnp, flags, rdir, cr, ctp,
+		    direntflags, realpnp);
+
+		/* release the traverse hold */
+		VN_RELE(dvp);
+	}
+	return (error);
+}
+
+static int
+lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr,
+    int *rvalp, caller_context_t *ctp)
+{
+	vnode_t	*under = vp->v_data;
+
+	switch ((uint_t)cmd) {
+	case LX_AUTOFS_IOC_READY:
+	case LX_AUTOFS_IOC_FAIL:
+	case LX_AUTOFS_IOC_CATATONIC:
+	case LX_AUTOFS_IOC_PROTOVER:
+	case LX_AUTOFS_IOC_SETTIMEOUT:
+	case LX_AUTOFS_IOC_EXPIRE:
+	case LX_AUTOFS_IOC_EXPIRE_MULTI:
+	case LX_AUTOFS_IOC_PROTOSUBVER:
+	case LX_AUTOFS_IOC_ASKUMOUNT:
+		/* Automounter protocol requests are serviced by us. */
+		return (lx_autofs_automounter_ioctl(vp, cmd, arg, cr));
+	default:
+		break;
+	}
+
+	/* Anything else belongs to the underlying filesystem. */
+	return (VOP_IOCTL(under, cmd, arg, mode, cr, rvalp, ctp));
+}
+
+/*
+ * VOP entry points definitions
+ */
+/*
+ * Vnode operations template; lx_autofs_init() hands this to vn_make_ops()
+ * to build lx_autofs_vn_ops.  The table is terminated by a NULL entry.
+ */
+static const fs_operation_def_t lx_autofs_tops_root[] = {
+	{ VOPNAME_OPEN,		{ .vop_open = lx_autofs_open } },
+	{ VOPNAME_CLOSE,	{ .vop_close = lx_autofs_close } },
+	{ VOPNAME_IOCTL,	{ .vop_ioctl = lx_autofs_ioctl } },
+	{ VOPNAME_RWLOCK,	{ .vop_rwlock = lx_autofs_rwlock } },
+	{ VOPNAME_RWUNLOCK,	{ .vop_rwunlock = lx_autofs_rwunlock } },
+	{ VOPNAME_GETATTR,	{ .vop_getattr = lx_autofs_getattr } },
+	{ VOPNAME_ACCESS,	{ .vop_access = lx_autofs_access } },
+	{ VOPNAME_READDIR,	{ .vop_readdir = lx_autofs_readdir } },
+	{ VOPNAME_LOOKUP,	{ .vop_lookup = lx_autofs_lookup } },
+	{ VOPNAME_INACTIVE,	{ .vop_inactive = lx_autofs_inactive } },
+	{ VOPNAME_MKDIR,	{ .vop_mkdir = lx_autofs_mkdir } },
+	{ VOPNAME_RMDIR,	{ .vop_rmdir = lx_autofs_rmdir } },
+	{ NULL }
+};
+
+/*
+ * DEV-specific entry points
+ */
+
+/*
+ * Character device open entry point.  Always succeeds; no per-open state
+ * is kept.
+ */
+/*ARGSUSED*/
+static int
+lx_autofs_dev_open(dev_t *devp, int flags, int otyp, cred_t *credp)
+{
+	return (0);
+}
+
+/*
+ * Character device close entry point.  Always succeeds; there is nothing
+ * to tear down.
+ */
+/*ARGSUSED*/
+static int
+lx_autofs_dev_close(dev_t dev, int flags, int otyp, cred_t *credp)
+{
+	return (0);
+}
+
+/*
+ * Copy the fixed-size ioctl header in from user space and check its version.
+ * Returns EFAULT if the copy fails and EINVAL if the major version differs
+ * from ours or the minor version is newer than ours.  On success the header
+ * is stamped with our own version numbers, ready to be copied back out.
+ */
+static int
+lx_autofs_dev_validate_cmd(intptr_t arg, lx_autofs_dv_ioctl_t *dcmd)
+{
+	if (copyin((caddr_t)arg, dcmd, sizeof (*dcmd)) != 0)
+		return (EFAULT);
+
+	if (dcmd->lad_ver_major != LX_AUTOFS_DEV_VERSION_MAJOR)
+		return (EINVAL);
+	if (dcmd->lad_ver_minor > LX_AUTOFS_DEV_VERSION_MINOR)
+		return (EINVAL);
+
+	DTRACE_PROBE1(lx__dev__cmd, void *, dcmd);
+
+	dcmd->lad_ver_major = LX_AUTOFS_DEV_VERSION_MAJOR;
+	dcmd->lad_ver_minor = LX_AUTOFS_DEV_VERSION_MINOR;
+	return (0);
+}
+
+/*
+ * Walk the current zone's vfs list looking for an lx autofs mount whose
+ * zone-relative mount point equals fs_mntpt.  Returns the vfs with a hold
+ * the caller must release, or NULL when there is no such mount.
+ */
+static vfs_t *
+lx_autofs_dev_getvfs_bypath(char *fs_mntpt)
+{
+	zone_t	*zone = curzone;
+	vfs_t	*head;
+	vfs_t	*cur;
+	vfs_t	*match = NULL;
+
+	vfs_list_read_lock();
+
+	head = zone->zone_vfslist;
+	if (head != NULL) {
+		/* The zone's vfs list is circular; stop once back at head. */
+		cur = head;
+		do {
+			if (cur->vfs_op == lx_autofs_vfsops) {
+				char *mntpt;
+
+				mntpt = (char *)refstr_value(cur->vfs_mntpt);
+				if (strcmp(fs_mntpt,
+				    ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) {
+					VFS_HOLD(cur);
+					match = cur;
+					break;
+				}
+			}
+			cur = cur->vfs_zone_next;
+		} while (cur != head);
+	}
+
+	vfs_list_unlock();
+
+	return (match);
+}
+
+/*
+ * Common front end for the ioctls that name a mount by file descriptor.
+ * Validates the command header, then resolves lad_ioctlfd to the vfs it
+ * lives on.  Returns EBADF if the descriptor is not open, does not belong
+ * to an lx autofs mount, or the mount's root vnode has no hold beyond its
+ * own.  On success *vfspp is returned held; the caller must VFS_RELE it.
+ */
+static int
+lx_autofs_dev_fd_preamble(intptr_t arg, lx_autofs_dv_ioctl_t *dc, vfs_t **vfspp)
+{
+	file_t	*fp;
+	vfs_t	*vfsp;
+	int	rv;
+
+	rv = lx_autofs_dev_validate_cmd(arg, dc);
+	if (rv != 0)
+		return (rv);
+
+	fp = getf(dc->lad_ioctlfd);
+	if (fp == NULL)
+		return (EBADF);
+
+	vfsp = fp->f_vnode->v_vfsp;
+	if (vfsp->vfs_op == lx_autofs_vfsops &&
+	    ((lx_autofs_vfs_t *)vfsp->vfs_data)->lav_root->v_count > 1) {
+		VFS_HOLD(vfsp);
+		*vfspp = vfsp;
+		rv = 0;
+	} else {
+		rv = EBADF;
+	}
+
+	releasef(dc->lad_ioctlfd);
+	return (rv);
+}
+
+static int
+lx_autofs_dev_vers(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	int			rv;
+
+	/* Validation leaves our own version numbers in cmdbuf. */
+	rv = lx_autofs_dev_validate_cmd(arg, &cmdbuf);
+	if (rv != 0)
+		return (rv);
+
+	return (copyout(&cmdbuf, (caddr_t)arg, sizeof (cmdbuf)) != 0 ?
+	    EFAULT : 0);
+}
+
+static int
+lx_autofs_dev_protver(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	int			rv;
+
+	rv = lx_autofs_dev_validate_cmd(arg, &cmdbuf);
+	if (rv != 0)
+		return (rv);
+
+	/* Report the protocol version through arg1. */
+	cmdbuf.lad_arg1 = LX_AUTOFS_PROTO_VERS5;
+
+	return (copyout(&cmdbuf, (caddr_t)arg, sizeof (cmdbuf)) != 0 ?
+	    EFAULT : 0);
+}
+
+static int
+lx_autofs_dev_protosubver(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	int			rv;
+
+	rv = lx_autofs_dev_validate_cmd(arg, &cmdbuf);
+	if (rv != 0)
+		return (rv);
+
+	/* Report the protocol sub-version through arg1. */
+	cmdbuf.lad_arg1 = LX_AUTOFS_PROTO_SUBVERSION;
+
+	return (copyout(&cmdbuf, (caddr_t)arg, sizeof (cmdbuf)) != 0 ?
+	    EFAULT : 0);
+}
+
+/*
+ * Copy in an ioctl command that carries a trailing path.  The fixed header
+ * is read and validated first; its lad_size must exceed the header size and
+ * may exceed it by at most MAXPATHLEN.  The whole command is then copied
+ * into a freshly allocated buffer.  The path must begin with '/' and its
+ * final byte must be NUL.
+ *
+ * On success *dcp points at the buffer, whose lad_size is the validated
+ * size; the caller frees it with kmem_free(dc, dc->lad_size).
+ */
+static int
+lx_autofs_dev_get_path_cmd(intptr_t arg, lx_autofs_dv_ioctl_t **dcp)
+{
+	lx_autofs_dv_ioctl_t	hdr;
+	lx_autofs_dv_ioctl_t	*full;
+	int			rv;
+
+	rv = lx_autofs_dev_validate_cmd(arg, &hdr);
+	if (rv != 0)
+		return (rv);
+
+	if (hdr.lad_size <= sizeof (hdr) ||
+	    hdr.lad_size > (sizeof (hdr) + MAXPATHLEN))
+		return (EINVAL);
+
+	full = kmem_alloc(hdr.lad_size, KM_SLEEP);
+
+	/* re-copyin the full struct with the path */
+	if (copyin((caddr_t)arg, full, hdr.lad_size) != 0) {
+		rv = EFAULT;
+	} else if (full->lad_path[0] != '/' ||
+	    full->lad_path[hdr.lad_size - sizeof (hdr) - 1] != '\0') {
+		rv = EINVAL;
+	}
+
+	if (rv != 0) {
+		kmem_free(full, hdr.lad_size);
+		return (rv);
+	}
+
+	/*
+	 * The size was just re-read from user space; put back the value we
+	 * validated and allocated with.
+	 */
+	full->lad_size = hdr.lad_size;
+	*dcp = full;
+	return (0);
+}
+
+/*
+ * OPENMOUNT ioctl: open the root of the lx autofs mount named by lad_path
+ * and return the new file descriptor to the caller in lad_ioctlfd.
+ *
+ * Returns EINVAL when no lx autofs mount has that mount point, the error
+ * from fassign() if a descriptor cannot be set up, and EFAULT if the result
+ * cannot be copied out (in which case the new descriptor is closed again).
+ */
+static int
+lx_autofs_dev_openmount(intptr_t arg)
+{
+	int err;
+	int fd;
+	lx_autofs_dv_ioctl_t *dc;
+	vfs_t *vfsp;
+	vnode_t *rvp;
+	lx_autofs_vfs_t	*data;
+
+	if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+		return (err);
+
+	if ((vfsp = lx_autofs_dev_getvfs_bypath(dc->lad_path)) == NULL) {
+		kmem_free(dc, dc->lad_size);
+		return (EINVAL);
+	}
+
+	/* lad_arg1 is the dev number of the mnt but we don't check that */
+
+	/*
+	 * Do an "open" on the root vnode. To fully simulate "open" we also add
+	 * a hold on the root vnode itself since lx_autofs_open will only open
+	 * (and hold) the underlying vnode.
+	 *
+	 * fassign() opens through the vnode pointer it is given, and
+	 * lx_autofs_open() may store a different vnode through that pointer.
+	 * Hand it a local copy so that the mount's lav_root can never be
+	 * overwritten as a side effect of this ioctl.
+	 */
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+	rvp = data->lav_root;
+	VN_HOLD(rvp);
+	if ((err = fassign(&rvp, FWRITE|FREAD, &fd)) != 0) {
+		/* lx_autofs_open() only replaces the vnode on success. */
+		VN_RELE(rvp);
+		VFS_RELE(vfsp);
+		kmem_free(dc, dc->lad_size);
+		return (err);
+	}
+
+	mutex_enter(&data->lav_lock);
+	data->lav_openmnt_cnt++;
+	mutex_exit(&data->lav_lock);
+
+	dc->lad_ioctlfd = fd;
+
+	if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) {
+		/* The caller never learns the fd, so undo the open. */
+		mutex_enter(&data->lav_lock);
+		data->lav_openmnt_cnt--;
+		mutex_exit(&data->lav_lock);
+		(void) closeandsetf(fd, NULL);
+		err = EFAULT;
+	}
+
+	VFS_RELE(vfsp);
+	kmem_free(dc, dc->lad_size);
+	return (err);
+}
+
+static int
+lx_autofs_dev_closemount(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	lx_autofs_vfs_t		*data;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+	/* "close" the vnode; only a successful close is accounted for. */
+	rv = closeandsetf(cmdbuf.lad_ioctlfd, NULL);
+	if (rv == 0) {
+		mutex_enter(&data->lav_lock);
+		ASSERT(data->lav_openmnt_cnt > 0);
+		data->lav_openmnt_cnt--;
+		mutex_exit(&data->lav_lock);
+	}
+
+	VFS_RELE(vfsp);
+	return (rv);
+}
+
+static int
+lx_autofs_dev_ready(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	/* Acknowledge the request identified by arg1 as ready. */
+	rv = lx_autofs_ack(cmdbuf.lad_arg1, vfsp, LXACR_READY);
+
+	VFS_RELE(vfsp);
+	return (rv);
+}
+
+static int
+lx_autofs_dev_fail(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	/* Acknowledge the request identified by arg1 as failed. */
+	rv = lx_autofs_ack(cmdbuf.lad_arg1, vfsp, LXACR_FAIL);
+
+	VFS_RELE(vfsp);
+	return (rv);
+}
+
+/*
+ * Update the fifo pipe information we use to talk to the automounter. The
+ * ioctl is used when the automounter restarts. This logic is similar to the
+ * handling done in lx_autofs_parse_mntopt() when the filesystem is first
+ * mounted.
+ *
+ * lad_ioctlfd names the mount and lad_arg1 is the new pipe fd.  Returns
+ * EINVAL if the fifos cannot be found for either the caller's process group
+ * or the caller's pid.
+ */
+static int
+lx_autofs_dev_setpipefd(intptr_t arg)
+{
+	int err;
+	lx_autofs_dv_ioctl_t dcmd;
+	vfs_t *vfsp;
+	lx_autofs_vfs_t	*data;
+	int fd, pgrp;
+	file_t *fp_wr, *fp_rd;
+
+	if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0)
+		return (err);
+
+	mutex_enter(&pidlock);
+	pgrp = curproc->p_pgrp;
+	mutex_exit(&pidlock);
+	fd = dcmd.lad_arg1;
+
+	/* Lookup the new fifos. See comment in lx_autofs_parse_mntopt. */
+	if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) {
+		int pid = (int)curproc->p_pid;
+
+		/* Fall back to looking the fd up in the caller itself. */
+		if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) {
+			VFS_RELE(vfsp);
+			return (EINVAL);
+		}
+	}
+
+	data = (lx_autofs_vfs_t *)vfsp->vfs_data;
+
+	/* Close the old fifos. */
+	if (data->lav_fifo_wr != NULL)
+		(void) closef(data->lav_fifo_wr);
+	if (data->lav_fifo_rd != NULL)
+		(void) closef(data->lav_fifo_rd);
+
+	/*
+	 * NOTE(review): these fields are replaced without taking lav_lock;
+	 * confirm that nothing can be using the old fifos concurrently.
+	 * Note also that lav_pgrp records the process group even when the
+	 * fifos were found through the pid fallback above.
+	 */
+	data->lav_fd = fd;
+	data->lav_pgrp = pgrp;
+	data->lav_fifo_rd = fp_rd;
+	data->lav_fifo_wr = fp_wr;
+	/*
+	 * Not explicitly in the ioctl spec. but necessary for correct recovery
+	 */
+	data->lav_catatonic = B_FALSE;
+
+	VFS_RELE(vfsp);
+
+	return (0);
+}
+
+static int
+lx_autofs_dev_catatonic(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	/* Mark the mount catatonic. */
+	((lx_autofs_vfs_t *)vfsp->vfs_data)->lav_catatonic = B_TRUE;
+
+	VFS_RELE(vfsp);
+	return (0);
+}
+
+static int
+lx_autofs_dev_expire(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	rv = lx_autofs_expire(vfsp, kcred);
+	VFS_RELE(vfsp);
+
+	/* Any failure to expire is reported to the caller as EAGAIN. */
+	return (rv == 0 ? 0 : EAGAIN);
+}
+
+static int
+lx_autofs_dev_timeout(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	/* arg1 carries the new timeout; it is stored as given. */
+	((lx_autofs_vfs_t *)vfsp->vfs_data)->lav_timeout = cmdbuf.lad_arg1;
+
+	VFS_RELE(vfsp);
+	return (0);
+}
+
+/*
+ * REQUESTER ioctl: report the uid (arg1) and gid (arg2) saved by the most
+ * recent automounter-referred lookup on the autofs mount that manages
+ * lad_path.
+ *
+ * lad_path is matched against the zone-relative mount point of every vfs
+ * in the current zone that is visible in mnttab.  A matching lx autofs vfs
+ * is used directly; for any other matching vfs the vfs of the vnode it
+ * covers is used, provided that one is lx autofs.  Returns EINVAL when no
+ * autofs mount manages the path and EFAULT if the result cannot be copied
+ * out.
+ */
+static int
+lx_autofs_dev_requestor(intptr_t arg)
+{
+	int err;
+	lx_autofs_dv_ioctl_t *dc;
+	vfs_t *vfsp;
+	vfs_t *fnd_vfs = NULL;
+	struct vfs *vfslist;
+	zone_t *zone = curzone;
+	lx_autofs_vfs_t	*data;
+	uid_t uid;
+	gid_t gid;
+
+	if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+		return (err);
+
+	vfs_list_read_lock();
+	vfsp = vfslist = zone->zone_vfslist;
+	if (vfslist == NULL) {
+		vfs_list_unlock();
+		kmem_free(dc, dc->lad_size);
+		return (EINVAL);
+	}
+
+	do {
+		/* Skip mounts we shouldn't show. */
+		if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) {
+			char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+
+			if (strcmp(dc->lad_path,
+			    ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) {
+				vfs_t *cand = vfsp;
+
+				if (cand->vfs_op != lx_autofs_vfsops) {
+					/*
+					 * Found an indirect mount (probably
+					 * NFS) so we need to get the vfs it's
+					 * mounted onto.  A vfs that covers no
+					 * vnode cannot be managed by autofs.
+					 */
+					vnode_t *vn = cand->vfs_vnodecovered;
+
+					cand = (vn != NULL) ? vn->v_vfsp : NULL;
+				}
+
+				/*
+				 * If the candidate is not ours then autofs
+				 * doesn't manage this path.  Either way the
+				 * first path match ends the search.
+				 */
+				if (cand != NULL &&
+				    cand->vfs_op == lx_autofs_vfsops) {
+					fnd_vfs = cand;
+					VFS_HOLD(fnd_vfs);
+				}
+				break;
+			}
+		}
+		vfsp = vfsp->vfs_zone_next;
+	} while (vfsp != vfslist);
+	vfs_list_unlock();
+
+	if (fnd_vfs == NULL) {
+		kmem_free(dc, dc->lad_size);
+		return (EINVAL);
+	}
+
+	data = (lx_autofs_vfs_t *)fnd_vfs->vfs_data;
+	uid = data->lav_uid;
+	gid = data->lav_gid;
+	VFS_RELE(fnd_vfs);
+
+	dc->lad_arg1 = uid;
+	dc->lad_arg2 = gid;
+
+	/* Only the fixed-size header is returned; the path is not. */
+	if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0)
+		err = EFAULT;
+
+	kmem_free(dc, dc->lad_size);
+	return (err);
+}
+
+/*
+ * ISMOUNTPOINT ioctl: determine whether lad_path is a mount point in the
+ * current zone.
+ *
+ * Returns 0 when the path is not a mount point, -1 when it is (see the
+ * comment at the final return), and EFAULT if the result cannot be copied
+ * out.  For a mount point, arg1 is set to the device number of the matching
+ * vfs and arg2 to LX_AUTOFS_SB_MAGIC when that vfs is lx autofs, or to its
+ * bitwise complement otherwise.
+ */
+static int
+lx_autofs_dev_ismntpt(intptr_t arg)
+{
+	int err = 0;
+	lx_autofs_dv_ioctl_t *dc;
+	struct vfs *vfslist;
+	vfs_t *vfsp;
+	vfs_t *fnd_vfs = NULL;
+	zone_t *zone = curzone;
+
+	if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0)
+		return (err);
+
+	/*
+	 * The automounter will always pass a path. It can also either pass an
+	 * ioctlfd or, if it's -1, arg1 can be an LX_AUTOFS_TYPE_* value. We
+	 * currently don't need those for our algorithm.
+	 */
+
+	vfs_list_read_lock();
+	vfsp = vfslist = curzone->zone_vfslist;
+	if (vfslist == NULL) {
+		vfs_list_unlock();
+		kmem_free(dc, dc->lad_size);
+		return (0);	/* return 0 if not a mount point */
+	}
+
+	do {
+		if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) {
+			char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt);
+
+			if (strcmp(dc->lad_path,
+			    ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) {
+
+				/*
+				 * To handle direct mounts (on top of an autofs
+				 * mount), we must prefer non-autofs vfs for
+				 * this request.  Drop the hold on any earlier
+				 * match, remember this one, and only stop the
+				 * walk early once a non-autofs vfs is found.
+				 */
+				if (fnd_vfs != NULL)
+					VFS_RELE(fnd_vfs);
+
+				fnd_vfs = vfsp;
+				VFS_HOLD(fnd_vfs)
+
+				if (fnd_vfs->vfs_op != lx_autofs_vfsops)
+					break;
+			}
+		}
+		vfsp = vfsp->vfs_zone_next;
+	} while (vfsp != vfslist);
+	vfs_list_unlock();
+
+	if (fnd_vfs == NULL) {
+		kmem_free(dc, dc->lad_size);
+		return (0);	/* return 0 if not a mount point */
+	}
+
+	/*
+	 * arg1 is device number, arg2 is superblock magic number
+	 * The superblock value only matters if autofs or not.
+	 */
+	dc->lad_arg1 = fnd_vfs->vfs_dev;
+	if (fnd_vfs->vfs_op == lx_autofs_vfsops) {
+		dc->lad_arg2 = LX_AUTOFS_SB_MAGIC;
+	} else {
+		dc->lad_arg2 = ~LX_AUTOFS_SB_MAGIC;
+	}
+
+	VFS_RELE(fnd_vfs);
+
+	/* Only the fixed-size header is returned; the path is not. */
+	if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) {
+		kmem_free(dc, dc->lad_size);
+		return (EFAULT);
+	}
+
+	kmem_free(dc, dc->lad_size);
+
+	/*
+	 * We have to return 1 if it is a mount point. The lx ioctl autofs
+	 * translator will convert a negative value back to a positive,
+	 * non-error return value.
+	 */
+	return (-1);
+}
+
+static int
+lx_autofs_dev_askumount(intptr_t arg)
+{
+	lx_autofs_dv_ioctl_t	cmdbuf;
+	vfs_t			*vfsp;
+	int			rv;
+
+	rv = lx_autofs_dev_fd_preamble(arg, &cmdbuf, &vfsp);
+	if (rv != 0)
+		return (rv);
+
+	/* arg1 is 0 when the mount may be unmounted and 1 when it may not. */
+	cmdbuf.lad_arg1 = lx_autofs_may_unmount(vfsp, kcred) ? 0 : 1;
+	VFS_RELE(vfsp);
+
+	return (copyout(&cmdbuf, (caddr_t)arg, sizeof (cmdbuf)) != 0 ?
+	    EFAULT : 0);
+}
+
+/*
+ * Character device ioctl entry point: dispatch each supported command to
+ * its handler.  Unknown commands fail with EINVAL.
+ */
+/*ARGSUSED*/
+static int
+lx_autofs_dev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	int	rv = EINVAL;
+
+	switch (cmd) {
+	case LX_AUTOFS_DEV_IOC_VERSION_CMD:
+		rv = lx_autofs_dev_vers(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_PROTOVER_CMD:
+		rv = lx_autofs_dev_protver(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD:
+		rv = lx_autofs_dev_protosubver(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD:
+		rv = lx_autofs_dev_openmount(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD:
+		rv = lx_autofs_dev_closemount(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_READY_CMD:
+		rv = lx_autofs_dev_ready(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_FAIL_CMD:
+		rv = lx_autofs_dev_fail(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD:
+		rv = lx_autofs_dev_setpipefd(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_CATATONIC_CMD:
+		rv = lx_autofs_dev_catatonic(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_TIMEOUT_CMD:
+		rv = lx_autofs_dev_timeout(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_REQUESTER_CMD:
+		rv = lx_autofs_dev_requestor(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_EXPIRE_CMD:
+		rv = lx_autofs_dev_expire(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD:
+		rv = lx_autofs_dev_askumount(arg);
+		break;
+	case LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD:
+		rv = lx_autofs_dev_ismntpt(arg);
+		break;
+	default:
+		break;
+	}
+
+	return (rv);
+}
+
+/*
+ * lx_autofs_init() gets invoked via the mod_install() call in
+ * this module's _init() routine.  Therefore, the code that cleans
+ * up the structures we allocate below is actually found in
+ * our _fini() routine.
+ */
+/* ARGSUSED */
+static int
+lx_autofs_init(int fstype, char *name)
+{
+	int	rv;
+
+	lx_autofs_major = ddi_name_to_major(LX_AUTOFS_NAME);
+	lx_autofs_fstype = fstype;
+
+	/* Register the VFS operations first ... */
+	rv = vfs_setfsops(fstype, lx_autofs_vfstops, &lx_autofs_vfsops);
+	if (rv != 0) {
+		cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template");
+		return (rv);
+	}
+
+	/* ... then build the vnode operations vector. */
+	rv = vn_make_ops(name, lx_autofs_tops_root, &lx_autofs_vn_ops);
+	if (rv == 0)
+		return (0);
+
+	/* The vnode ops could not be built; back out the VFS registration. */
+	VERIFY(vfs_freevfsops_by_type(fstype) == 0);
+	lx_autofs_vn_ops = NULL;
+	return (rv);
+}
+
+/*
+ * attach(9E) entry point.  Only DDI_ATTACH of instance 0 is supported; it
+ * creates the single character minor node and caches the dev_info pointer.
+ */
+/*ARGSUSED*/
+static int
+lx_autofs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int	inst = ddi_get_instance(dip);
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	ASSERT(inst == 0);
+
+	/* Refuse any other instance, then create our minor node. */
+	if (inst != 0 ||
+	    ddi_create_minor_node(dip, LX_AUTOFS_MINORNAME, S_IFCHR, 0,
+	    DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	lx_autofs_dip = dip;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * detach(9E) entry point.  Only DDI_DETACH is supported.  Removes the minor
+ * node created by lx_autofs_attach() and forgets the cached dev_info
+ * pointer.
+ */
+static int
+lx_autofs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	/* Undo the ddi_create_minor_node() done at attach time. */
+	ddi_remove_minor_node(dip, NULL);
+	lx_autofs_dip = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * getinfo(9E) entry point.  There is only ever instance 0, so both queries
+ * have fixed answers.
+ */
+/*ARGSUSED*/
+static int
+lx_autofs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
+    void **resultp)
+{
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*resultp = lx_autofs_dip;
+		break;
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)0;
+		break;
+	default:
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Driver flags
+ */
+/*
+ * Character device entry points.  Only open, close and ioctl are
+ * implemented; the device is driven entirely through ioctls.
+ */
+static struct cb_ops lx_autofs_cb_ops = {
+	lx_autofs_dev_open,	/* open */
+	lx_autofs_dev_close,	/* close */
+	nodev,			/* strategy */
+	nodev,			/* print */
+	nodev,			/* dump */
+	nodev,			/* read */
+	nodev,			/* write */
+	lx_autofs_dev_ioctl,	/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	nochpoll,		/* poll */
+	ddi_prop_op,		/* cb_prop_op */
+	NULL,			/* streamtab */
+	D_NEW | D_MP		/* Driver compatibility flag */
+};
+
+/*
+ * Module linkage
+ */
+/*
+ * Mount options understood by the filesystem.  The first four take a value
+ * (MO_HASVALUE); the last three are plain flags.
+ */
+static mntopt_t lx_autofs_mntopt[] = {
+	{ LX_MNTOPT_FD,		NULL,	0,	MO_HASVALUE },
+	{ LX_MNTOPT_PGRP,	NULL,	0,	MO_HASVALUE },
+	{ LX_MNTOPT_MINPROTO,	NULL,	0,	MO_HASVALUE },
+	{ LX_MNTOPT_MAXPROTO,	NULL,	0,	MO_HASVALUE },
+	{ LX_MNTOPT_INDIRECT,	NULL,	0,	0 },
+	{ LX_MNTOPT_DIRECT,	NULL,	0,	0 },
+	{ LX_MNTOPT_OFFSET,	NULL,	0,	0 }
+};
+
+/* Option table descriptor: entry count followed by the table itself. */
+static mntopts_t lx_autofs_mntopts = {
+	sizeof (lx_autofs_mntopt) / sizeof (mntopt_t),
+	lx_autofs_mntopt
+};
+
+/*
+ * Filesystem definition: ties the filesystem name to lx_autofs_init() and
+ * the mount option table above.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	LX_AUTOFS_NAME,
+	lx_autofs_init,
+	VSW_HASPROTO | VSW_VOLATILEDEV | VSW_ZMOUNT,
+	&lx_autofs_mntopts
+};
+
+/*
+ * Device operations for the control device: autoconfiguration entry points
+ * plus the character device entry points in lx_autofs_cb_ops.
+ */
+static struct dev_ops lx_autofs_dev_ops = {
+	DEVO_REV,		/* version */
+	0,			/* refcnt */
+	lx_autofs_info,		/* info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	lx_autofs_attach,	/* attach */
+	lx_autofs_detach,	/* detach */
+	nodev,			/* reset */
+	&lx_autofs_cb_ops,	/* driver operations */
+	NULL,			/* no bus operations */
+	NULL,			/* power */
+	ddi_quiesce_not_needed	/* quiesce */
+};
+
+extern struct mod_ops mod_fsops;
+
+/* Linkage for the filesystem half of the module. */
+static struct modlfs modlfs = {
+	&mod_fsops, "lx autofs filesystem", &vfw
+};
+
+/* Linkage for the device driver half of the module. */
+static struct modldrv modldrv = {
+	&mod_driverops, "lx autofs driver", &lx_autofs_dev_ops
+};
+
+/* This one module provides both a filesystem and a driver. */
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modlfs,
+	(void *)&modldrv,
+	NULL
+};
+
+/*
+ * Module load entry point.  All of the real setup happens in
+ * lx_autofs_init(), which runs as part of mod_install().
+ */
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/* Module information entry point; defers entirely to mod_info(). */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Module unload entry point.  If the module can be removed, free the vnode
+ * operations vector that lx_autofs_init() built.
+ */
+int
+_fini(void)
+{
+	int	rv = mod_remove(&modlinkage);
+
+	if (rv != 0)
+		return (rv);
+
+	if (lx_autofs_vn_ops != NULL) {
+		vn_freevnodeops(lx_autofs_vn_ops);
+		lx_autofs_vn_ops = NULL;
+	}
+
+	/*
+	 * There is no matching vfs_freevfsops_by_type() call here.  Our init
+	 * routine only needs one on its own error path; on unload the fs
+	 * framework already did it for us as part of the mod_remove() call
+	 * above.
+	 */
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/autofs/lxautofs.conf b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf
new file mode 100644
index 0000000000..36e0119e33
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf
@@ -0,0 +1,14 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+name="lxautofs" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
new file mode 100644
index 0000000000..df938adcea
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
@@ -0,0 +1,223 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LXCGRPS_H
+#define	_LXCGRPS_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * cgrps.h: declarations, data structures and macros for lx_cgroup
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/atomic.h>
+#include <vm/anon.h>
+
+/*
+ * cgrpmgr ioctl interface (visible outside the _KERNEL guard below).
+ */
+#define	CGRPFS_IOC	('C' << 16 | 'G' << 8)
+#define	CGRPFS_GETEVNT	(CGRPFS_IOC | 1)
+
+typedef struct cgrpmgr_info {
+	pid_t	cgmi_pid;
+	char	*cgmi_rel_agent_path;	/* caddr32_t in 32-bit form */
+	char	*cgmi_cgroup_path;	/* caddr32_t in 32-bit form */
+} cgrpmgr_info_t;
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+typedef struct cgrpmgr_info32 {	/* cgrpmgr_info_t, 32-bit layout */
+	pid_t		cgmi_pid;
+	caddr32_t	cgmi_rel_agent_path;
+	caddr32_t	cgmi_cgroup_path;
+} cgrpmgr_info32_t;
+
+#define	CG_PSNSIZE	256	/* max size of pseudo file name entries */
+#define	CG_PSDSIZE	16	/* pretend that a dir entry takes 16 bytes */
+
+/*
+ * These values index the cg_ssde_dir array, so the order must stay in sync.
+ */
+typedef enum cgrp_ssid {
+	CG_SSID_GENERIC = 1,	/* first valid ID; slot 0 unused */
+	CG_SSID_NUM		/* one past last ssid, for range checks */
+} cgrp_ssid_t;
+
+typedef enum cgrp_nodetype {
+	CG_CGROUP_DIR = 1,	/* cgroup directory (a cgroup) */
+	CG_NOTIFY,		/* notify_on_release file */
+	CG_PROCS,		/* cgroup.procs file */
+	CG_REL_AGENT,		/* release_agent file (top dir only) */
+	CG_TASKS,		/* tasks file */
+} cgrp_nodetype_t;
+
+typedef struct cgrp_subsys_dirent {
+	cgrp_nodetype_t cgrp_ssd_type;	/* node type of the pseudo file */
+	char		*cgrp_ssd_name;	/* name of the pseudo file */
+} cgrp_subsys_dirent_t;
+
+#define	N_DIRENTS(m)	(cgrp_num_pseudo_ents((m)->cg_ssid) + 2)
+
+/*
+ * A modern systemd-based Linux system typically has 50-60 cgroups so
+ * we size the hash for 2x that number.
+ */
+#define	CGRP_HASH_SZ	128
+#define	CGRP_AGENT_LEN	(MAXPATHLEN + 1)
+
+/*
+ * cgroups per-mount data structure.
+ *
+ * The fields are protected by cg_contents.  NOTE(review): no event list,
+ * event counter or cg_events lock is declared here; confirm none is meant.
+ */
+typedef struct cgrp_mnt {
+	struct vfs	*cg_vfsp;	/* filesystem's vfs struct */
+	struct cgrp_node *cg_rootnode;	/* root cgrp_node */
+	char 		*cg_mntpath;	/* name of cgroup mount point */
+	cgrp_ssid_t	cg_ssid;	/* subsystem type */
+	dev_t		cg_dev;		/* unique dev # of mounted `device' */
+	uint_t		cg_gen;		/* node ID source for files */
+	uint_t		cg_grp_gen;	/* ID source for cgroups */
+	kmutex_t	cg_contents;	/* global lock for most fs activity */
+	char		cg_agent[CGRP_AGENT_LEN]; /* release_agent path */
+	/* ptr to zone data for containing zone */
+	lx_zone_data_t	*cg_lxzdata;
+	struct cgrp_node **cg_grp_hash;	/* cgroups hashed by cgroup ID */
+} cgrp_mnt_t;
+
+/*
+ * cgrp_node is the file system dependent node for cgroups.
+ *
+ * The node is used to represent both directories (a cgroup) and pseudo files
+ * within the directory.
+ *
+ * Members are tagged in the comment to note which type of node they apply to:
+ * A - all
+ * D - dir (i.e. a cgroup)
+ * F - pseudo file
+ */
+
+typedef struct cgrp_node {
+	struct cgrp_node	*cgn_back;	/* A prev node in mount list */
+	struct cgrp_node	*cgn_forw;	/* A next node in mount list */
+	struct cgrp_dirent	*cgn_dir;	/* D dirent list */
+	struct cgrp_node	*cgn_parent;	/* A dir containing this node */
+	struct cgrp_node	*cgn_next;	/* D link in per-mount cgroup */
+						/*   hash table */
+	uint_t			cgn_dirents;	/* D number of dirents */
+	cgrp_nodetype_t		cgn_type;	/* A type for this node */
+	uint_t			cgn_notify;	/* D notify_on_release value */
+	uint_t			cgn_task_cnt;	/* D number of threads in grp */
+	struct vnode 		*cgn_vnode;	/* A vnode for this cgrp_node */
+	uint_t 			cgn_id;		/* D ID number for the cgroup */
+	struct vattr		cgn_attr;	/* A attributes */
+} cgrp_node_t;
+
+/*
+ * Conversions between generic vfs/vnode objects and cgroup fs objects
+ */
+#define	VFSTOCGM(vfsp)		((cgrp_mnt_t *)(vfsp)->vfs_data)
+#define	VTOCGM(vp)		((cgrp_mnt_t *)(vp)->v_vfsp->vfs_data)
+#define	VTOCGN(vp)		((struct cgrp_node *)(vp)->v_data)
+#define	CGNTOV(cn)		((cn)->cgn_vnode)
+#define	cgnode_hold(cn)		VN_HOLD(CGNTOV(cn))
+#define	cgnode_rele(cn)		VN_RELE(CGNTOV(cn))
+
+/*
+ * Shorthand for the vattr members embedded in a cgrp_node (cgn_attr)
+ */
+#define	cgn_mask	cgn_attr.va_mask
+#define	cgn_mode	cgn_attr.va_mode
+#define	cgn_uid		cgn_attr.va_uid
+#define	cgn_gid		cgn_attr.va_gid
+#define	cgn_fsid	cgn_attr.va_fsid
+#define	cgn_nodeid	cgn_attr.va_nodeid
+#define	cgn_nlink	cgn_attr.va_nlink
+#define	cgn_size	cgn_attr.va_size
+#define	cgn_atime	cgn_attr.va_atime
+#define	cgn_mtime	cgn_attr.va_mtime
+#define	cgn_ctime	cgn_attr.va_ctime
+#define	cgn_rdev	cgn_attr.va_rdev
+#define	cgn_blksize	cgn_attr.va_blksize
+#define	cgn_nblocks	cgn_attr.va_nblocks
+#define	cgn_seq		cgn_attr.va_seq
+
+/*
+ * cgroup directories are made up of a linked list of cgrp_dirent structures
+ * hanging off directory cgrp_nodes.  Names are variable length and null
+ * terminated; the first entry's cgd_prev doubles as a roving slot pointer.
+ */
+typedef struct cgrp_dirent {
+	struct cgrp_node	*cgd_cgrp_node;	/* cg node for this file */
+	struct cgrp_dirent	*cgd_next;	/* next directory entry */
+	struct cgrp_dirent	*cgd_prev;	/* prev directory entry */
+	uint_t			cgd_offset;	/* "offset" of dir entry */
+	uint_t			cgd_hash;	/* a hash of cgd_name */
+	struct cgrp_dirent	*cgd_link;	/* linked via hash table */
+	struct cgrp_node	*cgd_parent;	/* parent, dir we are in */
+	char			*cgd_name;	/* null terminated */
+} cgrp_dirent_t;
+
+enum de_op	{ DE_CREATE, DE_MKDIR, DE_RENAME };	/* cgrp_direnter ops */
+enum dr_op	{ DR_REMOVE, DR_RMDIR, DR_RENAME };	/* cgrp_dirdelete ops */
+
+extern struct vnodeops *cgrp_vnodeops;
+
+int cgrp_dirdelete(cgrp_node_t *, cgrp_node_t *, char *, enum dr_op, cred_t *);
+int cgrp_direnter(cgrp_mnt_t *, cgrp_node_t *, char *, enum de_op,
+    cgrp_node_t *, struct vattr *, cgrp_node_t **, cred_t *,
+    caller_context_t *);
+void cgrp_dirinit(cgrp_node_t *, cgrp_node_t *, cred_t *);
+int cgrp_dirlookup(cgrp_node_t *, char *, cgrp_node_t **, cred_t *);
+void cgrp_dirtrunc(cgrp_node_t *);
+void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *);
+int cgrp_taccess(void *, int, cred_t *);
+ino_t cgrp_inode(cgrp_nodetype_t, unsigned int);
+int cgrp_num_pseudo_ents(cgrp_ssid_t);
+cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t);
+void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *);
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _LXCGRPS_H */
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
new file mode 100644
index 0000000000..8950be1966
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
@@ -0,0 +1,1019 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+
+#include "cgrps.h"
+
+static int cgrp_dirmakecgnode(cgrp_node_t *, cgrp_mnt_t *, struct vattr *,
+	enum de_op, cgrp_node_t **, struct cred *);
+static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *, enum de_op);
+
+static cgrp_subsys_dirent_t cgrp_generic_dir[] = {
+	{ CG_PROCS,		"cgroup.procs" },
+	{ CG_NOTIFY,		"notify_on_release" },
+	{ CG_TASKS,		"tasks" }
+};
+
+typedef struct cgrp_ssde {
+	cgrp_subsys_dirent_t	*cg_ssde_files;	/* pseudo file table */
+	int			cg_ssde_nfiles;	/* entries in that table */
+} cgrp_ssde_t;
+
+#define	CGDIRLISTSZ(l)		(sizeof (l) / sizeof ((l)[0]))
+
+/*
+ * Indexed by cgrp_ssid_t, so the entries must be kept in the same order.
+ */
+static cgrp_ssde_t cg_ssde_dir[] = {
+	/* slot 0 is unused: subsystems start at 1 */
+	{NULL, 0},
+
+	/* CG_SSID_GENERIC */
+	{cgrp_generic_dir, CGDIRLISTSZ(cgrp_generic_dir)},
+};
+
+
+#define	CG_HASH_SIZE	8192		/* must be power of 2 */
+#define	CG_MUTEX_SIZE	64		/* must be power of 2 */
+
+static cgrp_dirent_t	*cg_hashtable[CG_HASH_SIZE];	/* dirent chains */
+static kmutex_t		 cg_hashmutex[CG_MUTEX_SIZE];	/* chain locks */
+
+#define	CG_HASH_INDEX(a)	((a) & (CG_HASH_SIZE-1))
+#define	CG_MUTEX_INDEX(a)	((a) & (CG_MUTEX_SIZE-1))
+
+#define	CG_HASH(cp, name, hash)				\
+	{							\
+		char Xc, *Xcp;					\
+		hash = (uint_t)(uintptr_t)(cp) >> 8;		\
+		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
+			hash = (hash << 4) + hash + (uint_t)Xc;	\
+	}
+
+#define	MODESHIFT	3	/* bits per permission class in a mode */
+
+typedef enum cgrp_nodehold {
+	NOHOLD,		/* return the node without a hold */
+	HOLD		/* VN_HOLD the node before returning it */
+} cgrp_nodehold_t;
+
+void
+cgrp_hash_init(void)
+{
+	kmutex_t *mp;	/* walks every lock in cg_hashmutex */
+
+	for (mp = cg_hashmutex; mp < &cg_hashmutex[CG_MUTEX_SIZE]; mp++)
+		mutex_init(mp, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+cgrp_hash_in(cgrp_dirent_t *c)
+{
+	uint_t		hval;
+	kmutex_t	*lockp;
+	cgrp_dirent_t	**headpp;
+
+	CG_HASH(c->cgd_parent, c->cgd_name, hval);
+	c->cgd_hash = hval;		/* saved for cgrp_hash_out() */
+	lockp = &cg_hashmutex[CG_MUTEX_INDEX(hval)];
+	headpp = &cg_hashtable[CG_HASH_INDEX(hval)];
+	mutex_enter(lockp);
+	c->cgd_link = *headpp;		/* push on the front of the chain */
+	*headpp = c;
+	mutex_exit(lockp);
+}
+
+static void
+cgrp_hash_out(cgrp_dirent_t *c)
+{
+	uint_t		hval = c->cgd_hash;
+	kmutex_t	*lockp;
+	cgrp_dirent_t	**linkpp;
+
+	lockp = &cg_hashmutex[CG_MUTEX_INDEX(hval)];
+	mutex_enter(lockp);
+	/* Find the link that points at 'c'; 'c' must be on this chain. */
+	for (linkpp = &cg_hashtable[CG_HASH_INDEX(hval)]; *linkpp != c;
+	    linkpp = &(*linkpp)->cgd_link)
+		;
+	*linkpp = c->cgd_link;
+	mutex_exit(lockp);
+}
+
+/* Find the dirent for 'name' in directory 'parent'; NULL if there is none. */
+static cgrp_dirent_t *
+cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold,
+	cgrp_node_t **found)
+{
+	cgrp_dirent_t	*dep;
+	cgrp_node_t	*node;
+	kmutex_t	*lockp;
+	uint_t		hval;
+
+	CG_HASH(parent, name, hval);
+	lockp = &cg_hashmutex[CG_MUTEX_INDEX(hval)];
+	mutex_enter(lockp);
+	for (dep = cg_hashtable[CG_HASH_INDEX(hval)]; dep != NULL;
+	    dep = dep->cgd_link) {
+		/* Cheap comparisons first; the name is compared last. */
+		if (dep->cgd_hash != hval || dep->cgd_parent != parent ||
+		    strcmp(dep->cgd_name, name) != 0) {
+			continue;
+		}
+
+		/*
+		 * Read the node pointer once so that the node we hold is
+		 * guaranteed to be the node we hand back.
+		 */
+		node = dep->cgd_cgrp_node;
+		if (hold == HOLD) {
+			ASSERT(node);
+			cgnode_hold(node);
+		}
+		if (found != NULL)
+			*found = node;
+		mutex_exit(lockp);
+		return (dep);
+	}
+	mutex_exit(lockp);
+	return (NULL);
+}
+
+/*
+ * The following functions maintain the per-mount cgroup hash table, which
+ * is keyed by cgroup ID.
+ *
+ * Insert cgroup 'cn' at the head of its bucket; cg_contents must be held.
+ */
+static void
+cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+	cgrp_node_t **bucket;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	bucket = &cgm->cg_grp_hash[cn->cgn_id % CGRP_HASH_SZ];
+	cn->cgn_next = *bucket;
+	*bucket = cn;
+}
+
+/*
+ * Unlink cgroup 'cn' from the per-mount cgroup hash.  The entry is located
+ * by cgroup ID rather than by address; the first match in the bucket is the
+ * one removed.  It is asserted to be present and to have no tasks left.
+ */
+static void
+cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+	uint_t id;
+	cgrp_node_t **linkp;
+	cgrp_node_t *victim = NULL;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	id = cn->cgn_id;
+	linkp = &cgm->cg_grp_hash[id % CGRP_HASH_SZ];
+
+	/* Walk the chain by link address so the head needs no special case. */
+	while (*linkp != NULL) {
+		if ((*linkp)->cgn_id == id) {
+			victim = *linkp;
+			*linkp = victim->cgn_next;
+			victim->cgn_next = NULL;
+			break;
+		}
+		linkp = &(*linkp)->cgn_next;
+	}
+
+	ASSERT(victim != NULL);
+	ASSERT(victim->cgn_task_cnt == 0);
+}
+
+/*
+ * Count up the threads that are already running in the zone and use that
+ * to initialize the task counter of the first (top-level) cgroup.
+ *
+ * Every process slot is examined; the ones that do not apply are skipped.
+ * pidlock is held for the whole scan and cg_contents is held by the caller.
+ */
+static void
+cgrp_cg_hash_init(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+	zoneid_t myzone = curproc->p_zone->zone_id;
+	pid_t zschedpid = curproc->p_zone->zone_zsched->p_pid;
+	int nthreads = 0;
+	int slot;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	/* Scan all of the process entries. */
+	mutex_enter(&pidlock);
+	for (slot = 1; slot < v.v_proc; slot++) {
+		proc_t *p = pid_entry(slot);
+
+		/*
+		 * Pass over slots with no pid_entry, processes in the SIDL
+		 * state, system processes, PID 0, this zone's zsched,
+		 * anything the security policy does not let us look at,
+		 * and processes that belong to some other zone.
+		 */
+		if (p == NULL ||
+		    p->p_stat == SIDL ||
+		    (p->p_flag & SSYS) != 0 ||
+		    p->p_pid == 0 ||
+		    p->p_pid == zschedpid ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+		    p->p_zone->zone_id != myzone) {
+			continue;
+		}
+
+		/* Only lx-branded processes contribute their LWP count. */
+		mutex_enter(&p->p_lock);
+		if (p->p_brand == &lx_brand)
+			nthreads += p->p_lwpcnt;
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * The zone's init process, with its one thread, should always have
+	 * been counted, so the total is expected to be non-zero.
+	 */
+	ASSERT(nthreads > 0);
+	cn->cgn_task_cnt = nthreads;
+
+	/* The probe fires while pidlock is still held. */
+	DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, nthreads);
+
+	mutex_exit(&pidlock);
+}
+
+/*
+ * Find the cgroup whose ID is 'cgid' in the per-mount cgroup hash.  Returns
+ * the node (no hold is taken) or NULL when there is no such cgroup.
+ */
+cgrp_node_t *
+cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid)
+{
+	cgrp_node_t *np;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	np = cgm->cg_grp_hash[cgid % CGRP_HASH_SZ];
+	while (np != NULL && np->cgn_id != cgid)
+		np = np->cgn_next;
+
+	return (np);
+}
+
+/*
+ * Calculate an inode number.
+ *
+ * The node type and the ID passed in are combined into the inode number of
+ * a cgroup pseudo file node, laid out as:
+ *
+ *	00000000AABBBBBB
+ *
+ *	AA	- node type (a cgrp_nodetype_t value)
+ *	BBBBBB	- low 24 bits of the ID passed in for the cgroup
+ */
+ino_t
+cgrp_inode(cgrp_nodetype_t type, unsigned int cgrpid)
+{
+	ino_t typebits = (ino_t)(type << 24);
+	unsigned int idbits = cgrpid & 0xffffff;
+
+	return (typebits | idbits);
+}
+
+/*
+ * Return the number of pseudo file entries in a cgroup directory for the
+ * given subsystem, or 0 when 'ssid' does not name a known subsystem.
+ */
+int
+cgrp_num_pseudo_ents(cgrp_ssid_t ssid)
+{
+	if ((uint_t)ssid >= CGDIRLISTSZ(cg_ssde_dir))
+		return (0);	/* never index past the end of the table */
+	return (cg_ssde_dir[ssid].cg_ssde_nfiles);
+}
+
+/* Check 'mode' access using the owner/group/other bits of node 'vcp'. */
+int
+cgrp_taccess(void *vcp, int mode, cred_t *cred)
+{
+	cgrp_node_t *cn = vcp;
+	int shift;
+
+	if (crgetuid(cred) == cn->cgn_uid)
+		shift = 0;			/* owner bits */
+	else if (groupmember(cn->cgn_gid, cred) != 0)
+		shift = MODESHIFT;		/* group bits */
+	else
+		shift = 2 * MODESHIFT;		/* other bits */
+
+	return (secpolicy_vnode_access2(cred, CGNTOV(cn), cn->cgn_uid,
+	    cn->cgn_mode << shift, mode));
+}
+
+/*
+ * Look up 'name' in directory 'parent'.
+ *
+ * On success 0 is returned and *foundcp is the matching cgrp_node with a
+ * hold on its vnode; otherwise *foundcp is NULL and an errno is returned.
+ */
+int
+cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp,
+    cred_t *cred)
+{
+	cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode);
+	int err;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	*foundcp = NULL;
+	if (parent->cgn_type != CG_CGROUP_DIR)
+		return (ENOTDIR);
+
+	/* Searching a directory requires execute permission on it. */
+	err = cgrp_taccess(parent, VEXEC, cred);
+	if (err != 0)
+		return (err);
+
+	/* An empty name refers to the directory itself. */
+	if (name[0] == '\0') {
+		*foundcp = parent;
+		cgnode_hold(parent);
+		return (0);
+	}
+
+	/*
+	 * Otherwise consult the dirent hash, which hands the node back
+	 * already held when it finds a match.  The cg_contents lock we
+	 * hold keeps the directory from changing underneath us.
+	 */
+	if (cgrp_hash_lookup(name, parent, HOLD, foundcp) == NULL)
+		return (ENOENT);
+
+	ASSERT(*foundcp);
+	return (0);
+}
+
+/*
+ * Enter an entry for 'name'/'cn' into 'dir'; create/mkdir make the node and
+ * may return it via 'cnp'.  Returns 0 or an errno.  NOTE(review): a failed
+ * mkdir does not visibly unhash the new cgroup ID here; confirm elsewhere.
+ */
+int
+cgrp_direnter(
+	cgrp_mnt_t	*cgm,
+	cgrp_node_t	*dir,		/* target directory to make entry in */
+	char		*name,		/* name of entry */
+	enum de_op	op,		/* entry operation */
+	cgrp_node_t	*cn,		/* existing cgrp_node, if rename */
+	struct vattr	*va,
+	cgrp_node_t	**cnp,		/* return cgrp_node, if create/mkdir */
+	cred_t		*cred,
+	caller_context_t *ctp)
+{
+	cgrp_dirent_t *cdp;
+	cgrp_node_t *found = NULL;
+	int error = 0;
+	char *s;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+	/*
+	 * Don't allow '/' characters in pathname component.
+	 */
+	for (s = name; *s; s++)
+		if (*s == '/')
+			return (EACCES);
+
+	if (name[0] == '\0')
+		panic("cgrp_direnter: NULL name");
+
+	/*
+	 * For rename, check the link count of the source node to see
+	 * whether it has already been removed, then take an extra link
+	 * on it.  Remember that we can only rename within the same directory.
+	 */
+	if (op == DE_RENAME) {
+		if (cn->cgn_nlink == 0) {
+			return (ENOENT);
+		}
+
+		if (cn->cgn_nlink == MAXLINK) {
+			return (EMLINK);
+		}
+		cn->cgn_nlink++;
+		gethrestime(&cn->cgn_ctime);
+	}
+
+	/*
+	 * This might be a "dangling detached directory".
+	 * It could have been removed, but a reference
+	 * to it kept in u_cwd.  Don't bother searching
+	 * it, and with any luck the user will get tired
+	 * of dealing with us and cd to some absolute
+	 * pathway.  *sigh*, thus in ufs, too.
+	 */
+	if (dir->cgn_nlink == 0) {
+		error = ENOENT;
+		goto out;
+	}
+
+	/*
+	 * Search for the entry. In all cases it is an error if it exists.
+	 */
+	cdp = cgrp_hash_lookup(name, dir, HOLD, &found);
+
+	if (cdp) {
+		ASSERT(found != NULL);
+		error = EEXIST;
+		mutex_exit(&cgm->cg_contents);
+		cgnode_rele(found);
+		mutex_enter(&cgm->cg_contents);
+	} else {
+
+		/*
+		 * The entry does not exist. Check write permission in
+		 * directory to see if entry can be created.
+		 */
+		if ((error = cgrp_taccess(dir, VWRITE, cred)) != 0)
+			goto out;
+		if (op == DE_CREATE || op == DE_MKDIR) {
+			/*
+			 * Make new cgrp_node and directory entry as required.
+			 */
+			error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred);
+			if (error)
+				goto out;
+
+			if (op == DE_MKDIR) {
+				/*
+				 * inherit notify_on_release value from parent
+				 */
+				cn->cgn_notify = dir->cgn_notify;
+			}
+		}
+
+		error = cgrp_diraddentry(dir, cn, name, op);
+		if (error != 0) {
+			if (op == DE_CREATE || op == DE_MKDIR) {
+				/*
+				 * Unmake the node we just made.
+				 */
+				if ((cn->cgn_type) == CG_CGROUP_DIR) {
+					ASSERT(cdp == NULL);
+					/*
+					 * cleanup allocs made by cgrp_dirinit
+					 */
+					cgrp_dirtrunc(cn);
+				}
+				cn->cgn_nlink = 0;
+				gethrestime(&cn->cgn_ctime);
+				mutex_exit(&cgm->cg_contents);
+				cgnode_rele(cn);
+				mutex_enter(&cgm->cg_contents);
+				cn = NULL;
+			}
+		} else if (cnp) {
+			*cnp = cn;
+		} else if (op == DE_CREATE || op == DE_MKDIR) {
+			mutex_exit(&cgm->cg_contents);
+			cgnode_rele(cn);
+			mutex_enter(&cgm->cg_contents);
+		}
+	}
+
+out:
+	if (error && op == DE_RENAME) {
+		/* Undo bumped link count. */
+		cn->cgn_nlink--;
+		gethrestime(&cn->cgn_ctime);
+	}
+	return (error);
+}
+
+/*
+ * Delete entry cn of name "nm" from parent dir. This is used to both remove
+ * a cgroup directory and to remove the pseudo file nodes within the cgroup
+ * directory (by recursively calling itself). It frees the dir entry space
+ * and decrements link count on cgrp_node(s). Return 0 on success.
+ *
+ * For DR_RMDIR, cg_contents is dropped and retaken around each node release.
+ */
+int
+cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
+    cred_t *cred)
+{
+	cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode);
+	cgrp_dirent_t *cndp;
+	int error;
+	size_t namelen;
+	cgrp_node_t *cnnp;
+	timestruc_t now;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	if (nm[0] == '\0')
+		panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn);
+
+	/*
+	 * return error when removing . and ..
+	 */
+	if (nm[0] == '.') {
+		if (nm[1] == '\0')
+			return (EINVAL);
+		if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* thus in ufs */
+	}
+
+	if ((error = cgrp_taccess(dir, VEXEC|VWRITE, cred)) != 0)
+		return (error);
+
+	if (dir->cgn_dir == NULL)
+		return (ENOENT);
+
+	if (op == DR_RMDIR) {
+		/*
+		 * This is the top-level removal of a cgroup dir. Start by
+		 * removing the fixed pseudo file entries from the dir by
+		 * calling back into this function with DR_REMOVE (errors
+		 * from that call are not checked). The caller has already
+		 * verified that it is safe to remove this directory.
+		 */
+		cgrp_dirent_t *cdp;
+
+		ASSERT(cn->cgn_type == CG_CGROUP_DIR);
+
+		cdp = cn->cgn_dir;
+		while (cdp) {
+			cgrp_node_t *pseudo_node;
+			cgrp_dirent_t *nextp;
+
+			if (strcmp(cdp->cgd_name, ".") == 0 ||
+			    strcmp(cdp->cgd_name, "..") == 0) {
+				cdp = cdp->cgd_next;
+				continue;
+			}
+
+			pseudo_node = cdp->cgd_cgrp_node;
+			nextp = cdp->cgd_next;
+
+			cgnode_hold(pseudo_node);
+			error = cgrp_dirdelete(cn, pseudo_node,
+			    cdp->cgd_name, DR_REMOVE, cred);
+			mutex_exit(&cgm->cg_contents);
+			cgnode_rele(pseudo_node);
+			mutex_enter(&cgm->cg_contents);
+
+			cdp = nextp;
+		}
+
+		cgrp_cg_hash_remove(cgm, cn);
+	}
+
+	cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
+	VERIFY(cndp != NULL);
+	VERIFY(cn == cnnp);
+
+	cgrp_hash_out(cndp);
+
+	/* Take cndp out of the directory list. */
+	ASSERT(cndp->cgd_next != cndp);
+	ASSERT(cndp->cgd_prev != cndp);
+	if (cndp->cgd_prev) {
+		cndp->cgd_prev->cgd_next = cndp->cgd_next;
+	}
+	if (cndp->cgd_next) {
+		cndp->cgd_next->cgd_prev = cndp->cgd_prev;
+	}
+
+	/*
+	 * If the roving slot pointer (the first dirent's cgd_prev) happens
+	 * to match cndp, point it at the previous dirent.
+	 */
+	if (dir->cgn_dir->cgd_prev == cndp) {
+		dir->cgn_dir->cgd_prev = cndp->cgd_prev;
+	}
+	ASSERT(cndp->cgd_next != cndp);
+	ASSERT(cndp->cgd_prev != cndp);
+
+	/* The name lives in the same allocation as the dirent itself. */
+	namelen = strlen(cndp->cgd_name) + 1;
+
+	kmem_free(cndp, sizeof (cgrp_dirent_t) + namelen);
+	dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen);
+	dir->cgn_dirents--;
+
+	gethrestime(&now);
+	dir->cgn_mtime = now;
+	dir->cgn_ctime = now;
+	cn->cgn_ctime = now;
+
+	ASSERT(cn->cgn_nlink > 0);
+	cn->cgn_nlink--;
+	if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) {
+		cgrp_dirtrunc(cn);
+		ASSERT(cn->cgn_nlink == 0);
+	}
+	return (0);
+}
+
+/*
+ * Initialize a new cgrp_node from 'vap', attach a vnode, and link it in.
+ */
+void
+cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
+{
+	timestruc_t now;
+	struct vnode *vp;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(vap != NULL);
+
+	cn->cgn_attr.va_type = vap->va_type;
+	cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	cn->cgn_rdev = vap->va_rdev;
+	cn->cgn_fsid = cgm->cg_dev;
+	cn->cgn_blksize = PAGESIZE;
+	cn->cgn_nlink = 1;
+	cn->cgn_mask = 0;
+	cn->cgn_size = 0;
+	cn->cgn_nblocks = 0;
+	cn->cgn_seq = 0;
+	cn->cgn_dir = NULL;
+
+	if (cred != NULL) {
+		cn->cgn_uid = crgetuid(cred);
+		cn->cgn_gid = crgetgid(cred);
+	} else {
+		cn->cgn_uid = vap->va_uid;
+		cn->cgn_gid = vap->va_gid;
+	}
+
+	gethrestime(&now);
+	cn->cgn_atime = now;
+	cn->cgn_mtime = now;
+	cn->cgn_ctime = now;
+
+	vp = vn_alloc(KM_SLEEP);
+	vn_setops(vp, cgrp_vnodeops);
+	vp->v_vfsp = cgm->cg_vfsp;
+	vp->v_type = vap->va_type;
+	vp->v_rdev = vap->va_rdev;
+	vp->v_data = (caddr_t)cn;
+	cn->cgn_vnode = vp;
+	cn->cgn_nodeid = cgm->cg_gen++;
+
+	/*
+	 * Link the node in at the tail of the mount's list of cgrp_nodes.
+	 * Root directory is handled specially in cgrp_mount.
+	 */
+	if (cgm->cg_rootnode != NULL) {
+		cn->cgn_forw = NULL;
+		cn->cgn_back = cgm->cg_rootnode->cgn_back;
+		cn->cgn_back->cgn_forw = cn;
+		cgm->cg_rootnode->cgn_back = cn;
+	}
+	vn_exists(vp);
+}
+
+/*
+ * Add pseudo file 'name' of type 'type' to 'dir'; a failed enter is a no-op.
+ */
+void
+cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name,
+    cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr)
+{
+	cgrp_node_t *ncn = NULL;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	if (cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL,
+	    nattr, &ncn, cr, NULL) != 0 || ncn == NULL)
+		return;		/* no node was handed back: nothing to fix */
+
+	/* Fix the inode and assign the pseudo file type to be correct. */
+	ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid);
+	ncn->cgn_type = type;
+
+	/*
+	 * These entries bypass the normal VOP_CREATE path, so drop our hold
+	 * here.  v_count is then 0 coming out of cgrp_inactive, but the
+	 * vnode is not reclaimed there because cgn_nlink is still 1.
+	 */
+	mutex_exit(&cgm->cg_contents);
+	cgnode_rele(ncn);
+	mutex_enter(&cgm->cg_contents);
+}
+
+/*
+ * cgrp_dirinit is used internally to initialize a directory (dir)
+ * with '.' and '..' entries without checking permissions and locking.
+ * It also creates the entries for the pseudo file nodes that reside in the
+ * directory.  A top-level directory is passed as its own parent.
+ */
+void
+cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
+{
+	cgrp_dirent_t *dot, *dotdot;
+	timestruc_t now;
+	cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
+	cgrp_ssde_t *ssdp;
+	cgrp_subsys_dirent_t *pseudo_files;
+	struct vattr nattr;
+	int i;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+	ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM);
+	ssdp = &cg_ssde_dir[cgm->cg_ssid];
+
+	/*
+	 * If this is the top-level cgroup created by the mount, the threads
+	 * already running in the zone are counted once its ID is assigned.
+	 */
+
+	/*
+	 * Set the cgroup ID for this cgrp_node by using a counter on each
+	 * mount.
+	 */
+	dir->cgn_id = cgm->cg_grp_gen++;
+	cgrp_cg_hash_insert(cgm, dir);
+	/* Initialize the first cgroup if this is the top-level group */
+	if (parent == dir)
+		cgrp_cg_hash_init(cgm, dir);
+
+	/*
+	 * Initialize '.' and '..'; kmem_zalloc() supplies each name's NUL.
+	 */
+	dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP);
+	dot->cgd_cgrp_node = dir;
+	dot->cgd_offset = 0;
+	dot->cgd_name = (char *)dot + sizeof (cgrp_dirent_t);
+	dot->cgd_name[0] = '.';
+	dot->cgd_parent = dir;
+	cgrp_hash_in(dot);
+
+	dotdot = kmem_zalloc(sizeof (cgrp_dirent_t) + 3, KM_SLEEP);
+	dotdot->cgd_cgrp_node = parent;
+	dotdot->cgd_offset = 1;
+	dotdot->cgd_name = (char *)dotdot + sizeof (cgrp_dirent_t);
+	dotdot->cgd_name[0] = '.';
+	dotdot->cgd_name[1] = '.';
+	dotdot->cgd_parent = dir;
+	cgrp_hash_in(dotdot);
+
+	/*
+	 * Initialize directory entry list.
+	 */
+	dot->cgd_next = dotdot;
+	dot->cgd_prev = dotdot;	/* dot's cgd_prev holds roving slot pointer */
+	dotdot->cgd_next = NULL;
+	dotdot->cgd_prev = dot;
+
+	gethrestime(&now);
+	dir->cgn_mtime = now;
+	dir->cgn_ctime = now;
+
+	parent->cgn_nlink++;
+	parent->cgn_ctime = now;
+
+	dir->cgn_dir = dot;
+	dir->cgn_size = 2 * sizeof (cgrp_dirent_t) + 5;	/* dot and dotdot */
+	dir->cgn_dirents = 2;
+	dir->cgn_nlink = 2;
+
+	bzero(&nattr, sizeof (struct vattr));
+	nattr.va_mode = (mode_t)(0644);
+	nattr.va_type = VREG;
+	nattr.va_rdev = 0;
+
+	/*
+	 * If this is the top-level dir in the file system then it always
+	 * has a release_agent pseudo file. Only the top-level dir has this
+	 * file.
+	 */
+	if (parent == dir) {
+		cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr,
+		    cr);
+	}
+
+	pseudo_files = ssdp->cg_ssde_files;
+	for (i = 0; i < ssdp->cg_ssde_nfiles; i++) {
+		cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name,
+		    pseudo_files[i].cgrp_ssd_type, &nattr, cr);
+	}
+}
+
+/*
+ * cgrp_dirtrunc discards every directory entry hanging off 'dir'.
+ */
+void
+cgrp_dirtrunc(cgrp_node_t *dir)
+{
+	cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
+	cgrp_dirent_t *dep;
+	timestruc_t now;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(dir->cgn_type == CG_CGROUP_DIR);
+
+	/* Always consume the head of the list until nothing is left. */
+	while ((dep = dir->cgn_dir) != NULL) {
+		cgrp_node_t *target = dep->cgd_cgrp_node;
+		size_t entsz;
+
+		ASSERT(dep->cgd_next != dep);
+		ASSERT(dep->cgd_prev != dep);
+		ASSERT(target);
+
+		entsz = sizeof (cgrp_dirent_t) + strlen(dep->cgd_name) + 1;
+		dir->cgn_dir = dep->cgd_next;
+
+		/*
+		 * The entry's reference to its node goes away with it, so
+		 * the node's link count is dropped here.
+		 */
+		ASSERT(target->cgn_nlink > 0);
+		target->cgn_nlink--;
+
+		cgrp_hash_out(dep);
+		kmem_free(dep, entsz);
+		dir->cgn_size -= entsz;
+		dir->cgn_dirents--;
+	}
+
+	gethrestime(&now);
+	dir->cgn_mtime = now;
+	dir->cgn_ctime = now;
+
+	ASSERT(dir->cgn_dir == NULL);
+	ASSERT(dir->cgn_size == 0);
+	ASSERT(dir->cgn_dirents == 0);
+}
+
+static int
+cgrp_diraddentry(cgrp_node_t *dir, cgrp_node_t *cn, char *name, enum de_op op)
+{
+	cgrp_dirent_t *cdp, *cpdp;	/* new entry, entry it goes after */
+	size_t		namelen, alloc_size;
+	timestruc_t	now;
+
+	/*
+	 * Make sure the parent directory wasn't removed from
+	 * underneath the caller.
+	 */
+	if (dir->cgn_dir == NULL)
+		return (ENOENT);
+
+	/* Check that everything is on the same filesystem. */
+	if (cn->cgn_vnode->v_vfsp != dir->cgn_vnode->v_vfsp)
+		return (EXDEV);
+
+	/* Allocate (KM_NOSLEEP, so it can fail) and initialize the entry */
+	namelen = strlen(name) + 1;
+	alloc_size = namelen + sizeof (cgrp_dirent_t);
+	cdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
+	if (cdp == NULL)
+		return (ENOSPC);
+
+	cn->cgn_parent = dir;
+
+	dir->cgn_size += alloc_size;
+	dir->cgn_dirents++;
+	cdp->cgd_cgrp_node = cn;
+	cdp->cgd_parent = dir;
+
+	/* The directory entry and its name were allocated sequentially. */
+	cdp->cgd_name = (char *)cdp + sizeof (cgrp_dirent_t);
+	(void) strcpy(cdp->cgd_name, name);
+
+	cgrp_hash_in(cdp);
+
+	/*
+	 * Some utilities expect the size of a directory to remain
+	 * somewhat static.  For example, a routine which removes
+	 * subdirectories between calls to readdir(); the size of the
+	 * directory changes from underneath it and so the real
+	 * directory offset in bytes is invalid.  To circumvent
+	 * this problem, we initialize a directory entry with a
+	 * phony offset, and use this offset to determine end of
+	 * file in cgrp_readdir.
+	 */
+	cpdp = dir->cgn_dir->cgd_prev;
+	/*
+	 * Install at first empty "slot" in directory list.
+	 */
+	while (cpdp->cgd_next != NULL && (cpdp->cgd_next->cgd_offset -
+	    cpdp->cgd_offset) <= 1) {
+		ASSERT(cpdp->cgd_next != cpdp);
+		ASSERT(cpdp->cgd_prev != cpdp);
+		ASSERT(cpdp->cgd_next->cgd_offset > cpdp->cgd_offset);
+		cpdp = cpdp->cgd_next;
+	}
+	cdp->cgd_offset = cpdp->cgd_offset + 1;
+
+	/*
+	 * If we're at the end of the dirent list and the offset (which
+	 * is necessarily the largest offset in this directory) is more
+	 * than twice the number of dirents, that means the directory is
+	 * 50% holes.  At this point we reset the slot pointer back to
+	 * the beginning of the directory so we start using the holes.
+	 * The idea is that if there are N dirents, there must also be
+	 * N holes, so we can satisfy the next N creates by walking at
+	 * most 2N entries; thus the average cost of a create is constant.
+	 * Note that we use the first dirent's cgd_prev as the roving
+	 * slot pointer; it's ugly, but it saves a word in every dirent.
+	 */
+	if (cpdp->cgd_next == NULL && cpdp->cgd_offset > 2 * dir->cgn_dirents)
+		dir->cgn_dir->cgd_prev = dir->cgn_dir->cgd_next;
+	else
+		dir->cgn_dir->cgd_prev = cdp;
+
+	ASSERT(cpdp->cgd_next != cpdp);
+	ASSERT(cpdp->cgd_prev != cpdp);
+
+	cdp->cgd_next = cpdp->cgd_next;
+	if (cdp->cgd_next) {
+		cdp->cgd_next->cgd_prev = cdp;
+	}
+	cdp->cgd_prev = cpdp;
+	cpdp->cgd_next = cdp;
+
+	ASSERT(cdp->cgd_next != cdp);
+	ASSERT(cdp->cgd_prev != cdp);
+	ASSERT(cpdp->cgd_next != cpdp);
+	ASSERT(cpdp->cgd_prev != cpdp);
+
+	gethrestime(&now);
+	dir->cgn_mtime = now;
+	dir->cgn_ctime = now;
+
+	return (0);
+}
+
+/* Make the cgrp_node for a create/mkdir; mkdir also populates the new dir. */
+static int
+cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
+    enum de_op op, cgrp_node_t **newnode, struct cred *cred)
+{
+	cgrp_node_t *node;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(va != NULL);
+
+	if ((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime))
+		return (EOVERFLOW);
+	if ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))
+		return (EOVERFLOW);
+
+	node = kmem_zalloc(sizeof (cgrp_node_t), KM_SLEEP);
+	cgrp_node_init(cgm, node, va, cred);
+	node->cgn_vnode->v_type = va->va_type;
+	node->cgn_vnode->v_rdev = node->cgn_rdev = NODEV;
+	node->cgn_uid = crgetuid(cred);
+	node->cgn_gid = crgetgid(cred);
+	if (va->va_mask & AT_ATIME)
+		node->cgn_atime = va->va_atime;
+	if (va->va_mask & AT_MTIME)
+		node->cgn_mtime = va->va_mtime;
+
+	if (op == DE_MKDIR) {
+		node->cgn_type = CG_CGROUP_DIR;
+		cgrp_dirinit(dir, node, cred);
+	}
+
+	*newnode = node;
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
new file mode 100644
index 0000000000..a9bd783569
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
@@ -0,0 +1,1052 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * The cgroup file system implements a subset of the Linux cgroup functionality
+ * for use by lx-branded zones. On Linux, cgroups are a generic process grouping
+ * mechanism which is used to apply various behaviors to the processes within
+ * the group, although its primary purpose is for resource management.
+ *
+ * In Linux, the cgroup file system provides two pieces of functionality:
+ * 1) A per-mount set of cgroups arranged in a tree, such that every task in
+ *    the system is in one, and only one, of the cgroups in the tree.
+ * 2) A set of subsystems; each subsystem has subsystem-specific state and
+ *    behavior and is associated with a cgroup mount. This provides a way to
+ *    apply arbitrary functionality (but generally resource management related)
+ *    to the processes associated with the nodes in the tree at that mount
+ *    point.
+ *
+ * For example, it is common to see cgroup trees (each is its own mount with a
+ * different subsystem controller) for blkio, cpuset, memory, systemd (has no
+ * controller), etc. Within each tree there is a top-level directory with at
+ * least a cgroup.procs, notify_on_release, release_agent, and tasks file.
+ * The cgroup.procs file lists the processes within that group and the tasks
+ * file lists the threads in the group. There could be subdirectories, which
+ * define new cgroups, that then contain a subset of the processes. Each
+ * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and
+ * tasks file.
+ *
+ * Since we're using lx to run user-level code within zones, the majority (all?)
+ * of the cgroup resource management functionality simply doesn't apply to us.
+ * The primary need for cgroups is to support the init program 'systemd' as the
+ * consumer. systemd only requires the process grouping hierarchy of cgroups,
+ * although it can also use the resource management features if they are
+ * available. Given this, our cgroup file system only implements the process
+ * hierarchy and does not report that any resource management controllers are
+ * available for separate mounts.
+ *
+ * In addition to the hierarchy, the other important component of cgroups that
+ * is used by systemd is the 'release_agent'. This provides a mechanism to
+ * run a command when a cgroup becomes empty (the last task in the group
+ * leaves, either by exit or move, and there are no more sub-cgroups). The
+ * 'release_agent' file only exists in the top-level cgroup of the mounted
+ * file system and holds the path to a command to run. The 'notify_on_release'
+ * file exists in each cgroup dir. If that file contains a '1' then the agent
+ * is run when that group becomes empty. The agent is passed a path string of
+ * the cgroup, relative to the file system mount point (e.g. a mount on
+ * /sys/fs/cgroups/systemd with a sub-cgroup of /sys/fs/cgroups/systemd/foo/bar
+ * gets the arg /foo/bar).
+ *
+ * Cgroup membership is implemented via hooks into the lx brand code. When
+ * the cgroup file system loads it installs callbacks for:
+ *    lx_cgrp_initlwp
+ *    lx_cgrp_freelwp
+ * and when it unloads it clears those hooks. The lx brand code calls those
+ * hooks when a lwp starts and when it exits. Internally we use a
+ * simple reference counter (cgn_task_cnt) on the cgroup node to track how many
+ * threads are in the group, so we can tell when a group becomes empty.
+ * To make this quick, a hash table (cg_grp_hash) is maintained on the
+ * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is
+ * sized so that there should typically only be 0 or 1 cgroups per bucket.
+ * We also keep a reference to the file system in the zone-specific brand data
+ * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t
+ * when it runs the hook.
+ *
+ * Once a cgroup is about to become empty, the final process exiting the cgroup
+ * will launch a new user-level process which execs the release agent. The new
+ * process is created as a child of zsched (indicated by the -1 pid argument
+ * to newproc) and is not associated with the exiting process in any way.
+ *
+ * This file system is similar to tmpfs in that directories only exist in
+ * memory. Each subdirectory represents a different cgroup. Within the cgroup
+ * there are pseudo files (see cg_ssde_dir) with well-defined names which
+ * control the configuration and behavior of the cgroup (see cgrp_nodetype_t).
+ * The primary files within every cgroup are named 'cgroup.procs',
+ * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the
+ * top-level cgroup). The cgroup.procs and tasks files are used to control and
+ * list which processes/threads belong to the cgroup. In the general case there
+ * could be additional files in the cgroup, which define additional behavior
+ * (i.e. subsystem specific pseudo files), although none exist at this time.
+ *
+ * Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is
+ * used to correlate with the threads to determine cgroup membership. When
+ * assigning a PID to a cgroup (via write) the code updates the br_cgroupid
+ * member in the brand-specific lx_lwp_data structure to control which cgroup
+ * the thread belongs to. Note that because the br_cgroupid lives in
+ * lx_lwp_data, native processes will not appear in the cgroup hierarchy.
+ *
+ * An overview of the behavior for the various vnode operations is:
+ * - no hardlinks or symlinks
+ * - no file create (the subsystem-specific files are a fixed list of
+ *   pseudo-files accessible within the directory)
+ * - no file remove
+ * - no file rename, but a directory (i.e. a cgroup) can be renamed within the
+ *   containing directory, but not into a different directory
+ * - can mkdir and rmdir to create/destroy cgroups
+ * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup)
+ * - open, read/write, close on the subsystem-specific pseudo files is
+ *   allowed, as this is the interface to configure and report on the cgroup.
+ *   The pseudo file's mode controls write access and cannot be changed.
+ *
+ * The locking in this file system is simple since the file system is not
+ * subjected to heavy I/O activity and all data is in-memory. There is a single
+ * global mutex for each mount (cg_contents). This mutex is held for the life
+ * of most vnode operations. The most active path is probably the LWP start and
+ * exit hooks which increment/decrement the reference counter on the cgroup
+ * node. The lock is important for this case since we don't want concurrent
+ * activity (such as moving the process into another cgroup) while we're trying
+ * to lookup the cgroup from the mount's hash table. We must be careful to
+ * avoid a deadlock while reading or writing since that code can take pidlock
+ * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of
+ * those is held. To prevent deadlock we always take cg_contents after pidlock
+ * and p_lock.
+ *
+ * EXTENDING THE FILE SYSTEM
+ *
+ * When adding support for a new subsystem, be sure to also update the
+ * lxpr_read_cgroups function in lx_procfs so that the subsystem is reported
+ * by proc.
+ *
+ * Although we don't currently support any subsystem controllers, the design
+ * allows for the file system to be extended to add controller emulation
+ * if needed. New controller IDs (i.e. different subsystems) for a mount can
+ * be defined in the cgrp_ssid_t enum (e.g. CG_SSID_CPUSET or CG_SSID_MEMORY)
+ * and new node types for additional pseudo files in the tree can be defined in
+ * the cgrp_nodetype_t enum (e.g. CG_CPUSET_CPUS or CG_MEMORY_USAGE_IN_BYTES).
+ * The cg_ssde_dir array would need a new entry for the new subsystem to
+ * control which nodes are visible in a directory for the new subsystem.
+ *
+ * New emulation would then need to be written to manage the behavior on the
+ * new pseudo file(s) associated with new cgrp_nodetype_t types.
+ *
+ * Within lx procfs the lxpr_read_pid_cgroup() function would need to be
+ * updated so that it reported the various subsystems used by the different
+ * mounts.
+ *
+ * In addition, in order to support more than one cgroup mount we would need a
+ * list of cgroup IDs associated with every thread, instead of just one ID
+ * (br_cgroupid). The thread data would need to become a struct which held
+ * both an ID and an indication as to which mounted cgroup file system instance
+ * the ID was associated with. We would also need a list of cgroup mounts per
+ * zone, instead of the current single zone reference.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/vmparam.h>
+#include <sys/corectl.h>
+#include <sys/contract_impl.h>
+#include <sys/pool.h>
+#include <sys/stack.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+/* Module level parameters */
+static int	cgrp_fstype;
+static dev_t	cgrp_dev;
+
+#define	MAX_AGENT_EVENTS	32		/* max num queued events */
+
+#define	UMNT_DELAY_TIME	drv_usectohz(50000)	/* 20th of a second */
+#define	UMNT_RETRY_MAX	100			/* 100 times - 5 secs */
+
+/*
+ * cgrp_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t cgrp_mountcount;
+
+/*
+ * cgrp_minfree is the minimum amount of swap space that cgroups leaves for
+ * the rest of the zone. In other words, if the amount of free swap space
+ * in the zone drops below cgrp_minfree, cgroup anon allocations will fail.
+ * This number is only likely to become a factor when DRAM and swap have both
+ * been capped low to allow for maximum tenancy.
+ */
+size_t cgrp_minfree = 0;
+
+/*
+ * CGMINFREE -- the value from which cgrp_minfree is derived -- should be
+ * configured to a value that is roughly the smallest practical value for
+ * memory + swap minus the largest reasonable size for cgroups in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow cgroups to consume
+ * no more than half of this, yielding a CGMINFREE of 64MB.
+ */
+#define	CGMINFREE	(64 * 1024 * 1024)	/* 64 Megabytes */
+
+extern pgcnt_t swapfs_minfree;
+
+/*
+ * cgroup vfs operations.
+ */
+static int cgrp_init(int, char *);
+static int cgrp_mount(struct vfs *, struct vnode *,
+	struct mounta *, struct cred *);
+static int cgrp_unmount(struct vfs *, int, struct cred *);
+static int cgrp_root(struct vfs *, struct vnode **);
+static int cgrp_statvfs(struct vfs *, struct statvfs64 *);
+static void cgrp_freevfs(vfs_t *vfsp);
+
+/* Forward declarations for hooks */
+static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t);
+static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_cgroup",
+	cgrp_init,		/* run when the module is loaded */
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information, used by _init(), _fini() and _info().
+ */
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand cgroups", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, &modlfs, NULL
+};
+
+int
+_init()
+{
+	return (mod_install(&modlinkage));	/* 0 or an errno value */
+}
+
+int
+_fini()
+{
+	int rv;
+
+	/* No unload while a mount (see cgrp_freevfs) still holds state. */
+	if (cgrp_mountcount != 0)
+		return (EBUSY);
+
+	rv = mod_remove(&modlinkage);
+	if (rv != 0)
+		return (rv);
+
+	/* Disable hooks used by the lx brand module. */
+	lx_cgrp_initlwp = NULL;
+	lx_cgrp_freelwp = NULL;
+
+	/* Tear down the operations vectors. */
+	(void) vfs_freevfsops_by_type(cgrp_fstype);
+	vn_freevnodeops(cgrp_vnodeops);
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));	/* report on modlinkage */
+}
+
+/*
+ * Called at module load: registers vfs/vnode ops and installs the lx hooks.
+ */
+static int
+cgrp_init(int fstype, char *name)
+{
+	static const fs_operation_def_t cgrp_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = cgrp_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = cgrp_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = cgrp_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = cgrp_statvfs },
+		VFSNAME_FREEVFS,	{ .vfs_freevfs = cgrp_freevfs },
+		NULL,			NULL
+	};
+	extern const struct fs_operation_def cgrp_vnodeops_template[];
+	int error;
+	extern  void    cgrp_hash_init();
+	major_t dev;
+
+	cgrp_hash_init();
+	cgrp_fstype = fstype;	/* remembered for mount, statvfs and _fini */
+	ASSERT(cgrp_fstype != 0);
+
+	error = vfs_setfsops(fstype, cgrp_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "cgrp_init: bad vfs ops template");
+		return (error);
+	}
+
+	error = vn_make_ops(name, cgrp_vnodeops_template, &cgrp_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);	/* undo vfs_setfsops */
+		cmn_err(CE_WARN, "cgrp_init: bad vnode ops template");
+		return (error);
+	}
+
+	/*
+	 * cgrp_minfree doesn't need to be some function of configured
+	 * swap space since it really is an absolute limit of swap space
+	 * which still allows other processes to execute.
+	 */
+	if (cgrp_minfree == 0) {
+		/* Set if not patched */
+		cgrp_minfree = btopr(CGMINFREE);
+	}
+
+	if ((dev = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN, "cgrp_init: Can't get unique device number.");
+		dev = 0;	/* warn, but carry on with major 0 */
+	}
+
+	/*
+	 * Make the pseudo device
+	 */
+	cgrp_dev = makedevice(dev, 0);
+
+	/* Install the hooks used by the lx brand module. */
+	lx_cgrp_initlwp = cgrp_lwp_fork_helper;
+	lx_cgrp_freelwp = cgrp_lwp_exit_helper;
+
+	return (0);
+}
+
+/*
+ * Mount a cgroup file system on 'mvp'. Only allowed inside an lx zone, at
+ * most once per zone, and never read-only. Returns 0 or an errno value.
+ */
+static int
+cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	cgrp_mnt_t *cgm = NULL;
+	struct cgrp_node *cp;
+	struct pathname dpn;
+	int error;
+	struct vattr rattr;
+	cgrp_ssid_t ssid = CG_SSID_GENERIC;
+	lx_zone_data_t *lxzdata;
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		return (error);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/*
+	 * Since we depend on per-thread lx brand data, only allow mounting
+	 * within lx zones.
+	 */
+	if (curproc->p_zone->zone_brand != &lx_brand)
+		return (EINVAL);
+
+	/* Ensure we don't allow overlaying mounts */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/* Having the resource be anything but "swap" doesn't make sense. */
+	vfs_setresource(vfsp, "swap", 0);
+
+	/* cgroups don't support read-only mounts */
+	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Here is where we could support subsystem-specific controller
+	 * mounting. For example, if mounting a cgroup fs with the 'cpuset'
+	 * option to specify that particular controller.
+	 *
+	 * char *argstr;
+	 * if (vfs_optionisset(vfsp, "cpuset", &argstr)) {
+	 *	if (ssid != CG_SSID_GENERIC) {
+	 *		error = EINVAL;
+	 *		goto out;
+	 *	}
+	 *	ssid = CG_SSID_CPUSET;
+	 * }
+	 */
+
+	error = pn_get(uap->dir,
+	    (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn);
+	if (error != 0)
+		goto out;
+
+	/* We currently only support one mount per zone. */
+	lxzdata = ztolxzd(curproc->p_zone);
+	mutex_enter(&lxzdata->lxzd_lock);
+	if (lxzdata->lxzd_cgroup != NULL) {
+		mutex_exit(&lxzdata->lxzd_lock);
+		/* pn_get succeeded above, so dpn must be released here. */
+		pn_free(&dpn);
+		return (EINVAL);
+	}
+
+	cgm = kmem_zalloc(sizeof (*cgm), KM_SLEEP);
+
+	/* Set but don't bother entering the mutex (not on mount list yet) */
+	mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL);
+
+	cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp;
+	mutex_exit(&lxzdata->lxzd_lock);
+
+	cgm->cg_lxzdata = lxzdata;
+	cgm->cg_ssid = ssid;
+
+	vfsp->vfs_data = (caddr_t)cgm;
+	vfsp->vfs_fstype = cgrp_fstype;
+	vfsp->vfs_dev = cgrp_dev;
+	vfsp->vfs_bsize = PAGESIZE;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfs_make_fsid(&vfsp->vfs_fsid, cgrp_dev, cgrp_fstype);
+	cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+	(void) strcpy(cgm->cg_mntpath, dpn.pn_path);
+
+	cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ,
+	    KM_SLEEP);
+
+	/* allocate and initialize root cgrp_node structure */
+	bzero(&rattr, sizeof (struct vattr));
+	rattr.va_mode = (mode_t)(S_IFDIR | 0755);
+	rattr.va_type = VDIR;
+	rattr.va_rdev = 0;
+	cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP);
+
+	mutex_enter(&cgm->cg_contents);
+	cgrp_node_init(cgm, cp, &rattr, cr);
+
+	CGNTOV(cp)->v_flag |= VROOT;
+
+	/*
+	 * initialize linked list of cgrp_nodes so that the back pointer of
+	 * the root cgrp_node always points to the last one on the list
+	 * and the forward pointer of the last node is null
+	 */
+	cp->cgn_back = cp;
+	cp->cgn_forw = NULL;
+	cp->cgn_nlink = 0;
+	cgm->cg_rootnode = cp;
+
+	cp->cgn_type = CG_CGROUP_DIR;
+	cp->cgn_nodeid = cgrp_inode(ssid, cgm->cg_gen);
+	cgrp_dirinit(cp, cp, cr);
+
+	mutex_exit(&cgm->cg_contents);
+
+	pn_free(&dpn);
+	error = 0;
+	atomic_inc_32(&cgrp_mountcount);
+
+out:
+	if (error == 0)
+		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
+
+	return (error);
+}
+
+/* Unmount entry point; returns EBUSY while the root or any node is held. */
+static int
+cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
+{
+	cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	cgrp_node_t *cgnp, *cancel;
+	struct vnode	*vp;
+	int error;
+	uint_t cnt;
+	int retry_cnt = 0;
+
+	if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+		return (error);
+
+retry:
+	mutex_enter(&cgm->cg_contents);
+
+	/*
+	 * In the normal unmount case, if there were no open files, only the
+	 * root node would have a reference count. However, the user-level
+	 * agent manager should have the root vnode open and be waiting in
+	 * ioctl. We need to wake the manager and it may take some retries
+	 * before it closes its file descriptor.
+	 *
+	 * With cg_contents held, nothing can be added or removed.
+	 * There may be some dirty pages.  To prevent fsflush from
+	 * disrupting the unmount, put a hold on each node while scanning.
+	 * If we find a previously referenced node, undo the holds we have
+	 * placed and fail EBUSY.
+	 */
+	cgnp = cgm->cg_rootnode;
+
+	ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL);
+
+	vp = CGNTOV(cgnp);
+	mutex_enter(&vp->v_lock);
+
+	if (flag & MS_FORCE) {	/* forced unmounts are refused */
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&cgm->cg_contents);
+		return (EINVAL);
+	}
+
+	cnt = vp->v_count;
+	if (cnt > 1) {
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&cgm->cg_contents);
+		/* Likely because the user-level manager hasn't exited yet */
+		if (retry_cnt++ < UMNT_RETRY_MAX) {
+			delay(UMNT_DELAY_TIME);
+			goto retry;	/* re-takes cg_contents and rechecks */
+		}
+		return (EBUSY);
+	}
+
+	mutex_exit(&vp->v_lock);
+
+	/*
+	 * Check for open files. An open file causes everything to unwind.
+	 */
+	for (cgnp = cgnp->cgn_forw; cgnp; cgnp = cgnp->cgn_forw) {
+		vp = CGNTOV(cgnp);
+		mutex_enter(&vp->v_lock);
+		cnt = vp->v_count;
+		if (cnt > 0) {
+			/* An open file; unwind the holds we've been adding. */
+			mutex_exit(&vp->v_lock);
+			cancel = cgm->cg_rootnode->cgn_forw;
+			while (cancel != cgnp) {
+				vp = CGNTOV(cancel);
+				ASSERT(vp->v_count > 0);
+				VN_RELE(vp);
+				cancel = cancel->cgn_forw;
+			}
+			mutex_exit(&cgm->cg_contents);
+			return (EBUSY);
+		} else {
+			/* directly add a VN_HOLD since we have the lock */
+			vp->v_count++;
+			mutex_exit(&vp->v_lock);
+		}
+	}
+
+	mutex_enter(&cgm->cg_lxzdata->lxzd_lock);
+	cgm->cg_lxzdata->lxzd_cgroup = NULL;	/* zone may mount again */
+	mutex_exit(&cgm->cg_lxzdata->lxzd_lock);
+	kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ);
+
+	/*
+	 * We can drop the mutex now because
+	 * no one can find this mount anymore
+	 */
+	vfsp->vfs_flag |= VFS_UNMOUNTED;
+	mutex_exit(&cgm->cg_contents);
+
+	return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS(). This is called by the vfs framework after
+ * umount and the last VFS_RELE, to trigger the release of any resources still
+ * associated with the given vfs_t. This is normally called immediately after
+ * cgrp_unmount.
+ */
+void
+cgrp_freevfs(vfs_t *vfsp)
+{
+	cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	cgrp_node_t *cn;
+	struct vnode	*vp;
+
+	/*
+	 * Free all kmemalloc'd and anonalloc'd memory associated with
+	 * this filesystem.  To do this, we go through the file list twice,
+	 * once to remove all the directory entries, and then to remove
+	 * all the pseudo files.
+	 */
+
+	/*
+	 * Now that we are tearing ourselves down we need to remove the
+	 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
+	 * files from the system causing us to have a negative value. Doing this
+	 * seems a bit better than trying to set a flag on the mount that says
+	 * we're tearing down.
+	 */
+	vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
+	/*
+	 * Remove all directory entries
+	 */
+	for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) {
+		mutex_enter(&cgm->cg_contents);
+		if (cn->cgn_type == CG_CGROUP_DIR)
+			cgrp_dirtrunc(cn);
+		mutex_exit(&cgm->cg_contents);
+	}
+
+	ASSERT(cgm->cg_rootnode);
+
+	/*
+	 * All links are gone, v_count is keeping nodes in place.
+	 * VN_RELE should make the node disappear, unless somebody
+	 * is holding pages against it.  Nap and retry until it disappears.
+	 *
+	 * We re-acquire the lock to prevent others who have a HOLD on
+	 * a cgrp_node via its pages or anon slots from blowing it away
+	 * (in cgrp_inactive) while we're trying to get to it here. Once
+	 * we have a HOLD on it we know it'll stick around.
+	 *
+	 */
+	mutex_enter(&cgm->cg_contents);
+
+	/* Remove all the files (except the rootnode) backwards. */
+	while ((cn = cgm->cg_rootnode->cgn_back) != cgm->cg_rootnode) {
+		mutex_exit(&cgm->cg_contents);
+		/*
+		 * All nodes will be released here. Note we handled the link
+		 * count above.
+		 */
+		vp = CGNTOV(cn);
+		VN_RELE(vp);
+		mutex_enter(&cgm->cg_contents);
+		/*
+		 * It's still there after the RELE. Someone else like pageout
+		 * has a hold on it so wait a bit and then try again - we know
+		 * they'll give it up soon.
+		 */
+		if (cn == cgm->cg_rootnode->cgn_back) {
+			VN_HOLD(vp);	/* balances the VN_RELE on retry */
+			mutex_exit(&cgm->cg_contents);
+			delay(hz / 4);
+			mutex_enter(&cgm->cg_contents);
+		}
+	}
+	mutex_exit(&cgm->cg_contents);
+
+	VN_RELE(CGNTOV(cgm->cg_rootnode));	/* finally, the root itself */
+
+	ASSERT(cgm->cg_mntpath);
+
+	kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1);
+
+	mutex_destroy(&cgm->cg_contents);
+	kmem_free(cgm, sizeof (cgrp_mnt_t));
+
+	/* Allow _fini() to succeed now */
+	atomic_dec_32(&cgrp_mountcount);
+}
+
+/*
+ * Root entry point (VFSNAME_ROOT): return a held vnode for the mount's root.
+ */
+static int
+cgrp_root(struct vfs *vfsp, struct vnode **vpp)
+{
+	cgrp_mnt_t *mnt = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	cgrp_node_t *cp = mnt->cg_rootnode;
+	struct vnode *rootvp;
+
+	ASSERT(cp);
+
+	rootvp = CGNTOV(cp);
+	VN_HOLD(rootvp);	/* the caller receives its own hold */
+	*vpp = rootvp;
+	return (0);
+}
+
+/*
+ * Report file system statistics. Free space is derived from the available
+ * swap (less cgrp_minfree) and, when the zone has a swap cap, limited to
+ * what remains under that cap.
+ */
+static int
+cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
+{
+	cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	ulong_t	blocks;
+	dev32_t d32;
+	zoneid_t eff_zid;
+	struct zone *zp;
+
+	zp = cgm->cg_vfsp->vfs_zone;
+
+	if (zp == NULL)
+		eff_zid = GLOBAL_ZONEUNIQID;
+	else
+		eff_zid = zp->zone_id;
+
+	sbp->f_bsize = PAGESIZE;
+	sbp->f_frsize = PAGESIZE;
+
+	/* Find the amount of available physical memory and swap. */
+	mutex_enter(&anoninfo_lock);
+	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+	blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+	mutex_exit(&anoninfo_lock);
+
+	if (blocks > cgrp_minfree)
+		sbp->f_bfree = blocks - cgrp_minfree;
+	else
+		sbp->f_bfree = 0;
+
+	sbp->f_bavail = sbp->f_bfree;
+
+	/* Total number of blocks is just what's available */
+	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+	if (eff_zid != GLOBAL_ZONEUNIQID &&
+	    zp->zone_max_swap_ctl != UINT64_MAX) {
+		/* The zone has a swap cap, so report the capped size. */
+		rctl_qty_t cap, used;
+		pgcnt_t pgcap, pgused, pgleft;
+
+		mutex_enter(&zp->zone_mem_lock);
+		cap = zp->zone_max_swap_ctl;
+		used = zp->zone_max_swap;
+		mutex_exit(&zp->zone_mem_lock);
+
+		pgcap = btop(cap);
+		pgused = btop(used);
+
+		/* Clamp at 0 so usage above the cap cannot wrap around. */
+		pgleft = (pgused < pgcap) ? pgcap - pgused : 0;
+		sbp->f_bfree = MIN(pgleft, sbp->f_bfree);
+		sbp->f_bavail = sbp->f_bfree;
+		sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+	}
+
+	/*
+	 * The maximum number of files available is approximately the number
+	 * of cgrp_nodes we can allocate from the remaining kernel memory
+	 * available to cgroups.  This is fairly inaccurate since it doesn't
+	 * take into account the names stored in the directory entries.
+	 */
+	sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+	    (sizeof (cgrp_node_t) + sizeof (cgrp_dirent_t));
+	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	sbp->f_fsid = d32;
+	(void) strcpy(sbp->f_basetype, vfssw[cgrp_fstype].vsw_name);
+	(void) strncpy(sbp->f_fstr, cgm->cg_mntpath, sizeof (sbp->f_fstr));
+	/* ensure null termination */
+	sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	sbp->f_namemax = MAXNAMELEN - 1;
+	return (0);
+}
+
+/* Copy the name 'cn' has in its parent into buf ("." at the top); 0 or -1. */
+static int
+cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen)
+{
+	cgrp_node_t *pdir;
+	cgrp_dirent_t *ent;
+
+	buf[0] = '\0';
+	pdir = cn->cgn_parent;
+	/* A node with no parent, or that is its own parent, is named ".". */
+	if (pdir == NULL || pdir == cn) {
+		(void) strlcpy(buf, ".", blen);
+		return (0);
+	}
+
+	/* Walk the parent's entries looking for the one that refers to cn. */
+	ent = pdir->cgn_dir;
+	while (ent != NULL) {
+		if (ent->cgd_cgrp_node->cgn_id == cn->cgn_id) {
+			(void) strlcpy(buf, ent->cgd_name, blen);
+			return (0);
+		}
+		ent = ent->cgd_next;
+	}
+	return (-1);
+}
+
+typedef struct cgrp_rra_arg {
+	char *crraa_agent_path;	/* 1st exec_init arg; CGRP_AGENT_LEN bytes */
+	char *crraa_event_path;	/* 2nd exec_init arg; MAXPATHLEN bytes */
+} cgrp_rra_arg_t;
+
+/* Body of the process created by newproc(); 'a' is a cgrp_rra_arg_t. */
+static void
+cgrp_run_rel_agent(void *a)
+{
+	cgrp_rra_arg_t *rarg = a;
+	proc_t *p = ttoproc(curthread);
+	zone_t *z = p->p_zone;
+	struct core_globals *cg;
+	int res;
+
+	ASSERT(!INGLOBALZONE(curproc));
+
+	/* The following block is derived from start_init_common */
+	ASSERT_STACK_ALIGNED();
+
+	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
+	p->p_usrstack = (caddr_t)USRSTACK32;
+	p->p_model = DATAMODEL_ILP32;	/* set up as a 32-bit process */
+	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
+	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
+	p->p_stk_ctl = INT32_MAX;
+
+	p->p_as = as_alloc();
+	p->p_as->a_proc = p;
+	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
+	(void) hat_setup(p->p_as->a_hat, HAT_INIT);
+
+	VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);
+
+	corectl_path_hold(cg->core_default_path);
+	corectl_content_hold(cg->core_default_content);
+
+	curproc->p_corefile = cg->core_default_path;
+	curproc->p_content = cg->core_default_content;
+
+	init_mstate(curthread, LMS_SYSTEM);
+	res = exec_init(rarg->crraa_agent_path, rarg->crraa_event_path);
+	/* End of code derived from start_init_common */
+
+	kmem_free(rarg->crraa_event_path, MAXPATHLEN);
+	kmem_free(rarg->crraa_agent_path, CGRP_AGENT_LEN);
+	kmem_free(rarg, sizeof (cgrp_rra_arg_t));	/* we own the args */
+
+	/* The following is derived from zone_start_init - see comments there */
+	if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
+		if (proc_exit(CLD_EXITED, res) != 0) {
+			mutex_enter(&p->p_lock);
+			ASSERT(p->p_flag & SEXITLWPS);
+			lwp_exit();
+		}
+	} else {
+		id_t cid = curthread->t_cid;
+
+		mutex_enter(&class_lock);
+		ASSERT(cid < loaded_classes);
+		if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+		    z->zone_fixed_hipri) {
+			pcparms_t pcparms;
+
+			pcparms.pc_cid = cid;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+			    FXMAXUPRI;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+			    FX_DOUPRILIM | FX_DOUPRI;
+
+			mutex_enter(&pidlock);
+			mutex_enter(&curproc->p_lock);
+			(void) parmsset(&pcparms, curthread);
+			mutex_exit(&curproc->p_lock);
+			mutex_exit(&pidlock);
+		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+			curthread->t_pri = RTGPPRIO0;
+		}
+		mutex_exit(&class_lock);
+
+		/* cause the process to return to userland. */
+		lwp_rtt();
+	}
+}
+
+/*
+ * Launch the release agent (cg_agent) as a new process via newproc(). Its
+ * argument is the pathname (relative to the mount point of the file system)
+ * of the newly empty cgroup 'cn'. Nothing is done if no agent is set or if
+ * 'cn' is the top-level cgroup.
+ * The cg_contents mutex is held on entry and dropped before returning.
+ */
+void
+cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+	cgrp_node_t *parent;
+	char nm[MAXNAMELEN];
+	char *argstr, *oldstr, *tmp;
+	id_t cid;
+	int agent_err;
+	proc_t *p = ttoproc(curthread);
+	zone_t *z = p->p_zone;
+	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+	cgrp_rra_arg_t *rarg;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+	/* Nothing to do if the agent is not set */
+	if (cgm->cg_agent[0] == '\0') {
+		mutex_exit(&cgm->cg_contents);
+		return;
+	}
+
+	parent = cn->cgn_parent;
+	/* Cannot remove the top-level cgroup (only via unmount) */
+	if (parent == cn) {
+		mutex_exit(&cgm->cg_contents);
+		return;
+	}
+
+	argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	*argstr = '\0';
+
+	/*
+	 * Iterate up the directory tree to construct the agent argument string.
+	 */
+	do {
+		cgrp_get_dirname(cn, nm, sizeof (nm));	/* "" on failure */
+		DTRACE_PROBE1(cgrp__dir__name, char *, nm);
+		if (*argstr == '\0') {
+			(void) snprintf(argstr, MAXPATHLEN, "/%s", nm);
+		} else {
+			tmp = oldstr;	/* swap buffers, then prepend nm */
+			oldstr = argstr;
+			argstr = tmp;
+			(void) snprintf(argstr, MAXPATHLEN, "/%s%s", nm,
+			    oldstr);
+		}
+
+		if (cn->cgn_parent == NULL)
+			break;
+		cn = cn->cgn_parent;
+		parent = cn->cgn_parent;
+
+		/*
+		 * The arg path is relative to the mountpoint so we stop when
+		 * we get to the top level.
+		 */
+		if (parent == NULL || parent == cn)
+			break;
+	} while (parent != cn);	/* always true here; exits are above */
+
+	kmem_free(oldstr, MAXPATHLEN);
+
+	rarg = kmem_alloc(sizeof (cgrp_rra_arg_t), KM_SLEEP);
+	rarg->crraa_agent_path = kmem_alloc(sizeof (cgm->cg_agent), KM_SLEEP);
+	(void) strlcpy(rarg->crraa_agent_path, cgm->cg_agent,
+	    sizeof (cgm->cg_agent));
+	rarg->crraa_event_path = argstr;	/* MAXPATHLEN allocation */
+
+	DTRACE_PROBE2(cgrp__agent__event, cgrp_rra_arg_t *, rarg,
+	    int, plwpd->br_cgroupid);
+
+	/* The release agent process cannot belong to our cgroup */
+	plwpd->br_cgroupid = 0;
+
+	/*
+	 * The cg_contents mutex cannot be held while taking the pool lock
+	 * or calling newproc.
+	 */
+	mutex_exit(&cgm->cg_contents);
+
+	if (z->zone_defaultcid > 0) {
+		cid = z->zone_defaultcid;
+	} else {
+		pool_lock();
+		cid = pool_get_class(z->zone_pool);
+		pool_unlock();
+	}
+	if (cid == -1)
+		cid = defaultcid;
+
+	if ((agent_err = newproc(cgrp_run_rel_agent, (void *)rarg, cid,
+	    minclsyspri - 1, NULL, -1)) != 0) {
+		/* There's nothing we can do if creating the proc fails. */
+		kmem_free(rarg->crraa_event_path, MAXPATHLEN);
+		kmem_free(rarg->crraa_agent_path, sizeof (cgm->cg_agent));
+		kmem_free(rarg, sizeof (cgrp_rra_arg_t));
+	}
+}
+
+/* lx brand hook for a starting lwp: count it in cgroup 'cg_id'. */
+/*ARGSUSED*/
+static void
+cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+	cgrp_mnt_t *mnt = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	cgrp_node_t *cn;
+
+	mutex_enter(&mnt->cg_contents);
+	cn = cgrp_cg_hash_lookup(mnt, cg_id);
+	ASSERT(cn != NULL);
+	cn->cgn_task_cnt += 1;
+	mutex_exit(&mnt->cg_contents);
+	DTRACE_PROBE1(cgrp__lwp__fork, void *, cn);
+}
+
+/* lx brand hook for an exiting lwp: uncount it and maybe run the agent. */
+/*ARGSUSED*/
+static void
+cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+	cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+	cgrp_node_t *cn;
+
+	mutex_enter(&cgm->cg_contents);
+	cn = cgrp_cg_hash_lookup(cgm, cg_id);
+	ASSERT(cn != NULL);
+	if (cn->cgn_task_cnt == 0) {	/* top-level: can be 0 at reboot */
+		mutex_exit(&cgm->cg_contents);
+		return;
+	}
+	cn->cgn_task_cnt--;
+	DTRACE_PROBE1(cgrp__lwp__exit, void *, cn);
+	/* Only a now-empty cgroup with notify set goes on to the agent. */
+	if (cn->cgn_task_cnt != 0 || cn->cgn_dirents != N_DIRENTS(cgm) ||
+	    cn->cgn_notify != 1) {
+		mutex_exit(&cgm->cg_contents);
+		return;
+	}
+	cgrp_rel_agent_event(cgm, cn);	/* drops cg_contents for us */
+	ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
new file mode 100644
index 0000000000..bd571c8c18
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
@@ -0,0 +1,1608 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/vm.h>
+#include <sys/prsystm.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+/*
+ * Selects how an ID written to a cgroup file is interpreted. cgrp_wr() passes
+ * CG_WR_PROCS for CG_PROCS files (the ID names a process and all of its
+ * threads are moved) and CG_WR_TASKS for CG_TASKS files (the ID names a
+ * single thread).
+ */
+typedef enum cgrp_wr_type {
+	CG_WR_PROCS = 1,
+	CG_WR_TASKS
+} cgrp_wr_type_t;
+
+/*
+ * Open a cgroup vnode. The only check made is for VISSWAP: swapon to a cgrp
+ * file is not supported, so such a vnode is refused with EINVAL. Every other
+ * open succeeds.
+ */
+/* ARGSUSED1 */
+static int
+cgrp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
+{
+	struct vnode *vp = *vpp;
+
+	return ((vp->v_flag & VISSWAP) ? EINVAL : 0);
+}
+
+/*
+ * Close a cgroup vnode: discard any record locks and file shares the calling
+ * process holds on it. Always returns 0.
+ */
+/* ARGSUSED1 */
+static int
+cgrp_close(struct vnode *vp, int flag, int count, offset_t offset,
+    struct cred *cred, caller_context_t *ct)
+{
+	pid_t mypid = ttoproc(curthread)->p_pid;
+
+	cleanlocks(vp, mypid, 0);
+	cleanshares(vp, mypid);
+	return (0);
+}
+
+/*
+ * Look up the process that a write of 'pid' to a cgroup file refers to.
+ *
+ * For CG_WR_PROCS the value is a process ID and is resolved with prfind().
+ * For CG_WR_TASKS the value is compared against the br_pid of each lx lwp, so
+ * every process slot is scanned for an eligible process owning such a thread.
+ *
+ * pidlock must be held on entry and is held on return (whether or not a
+ * process is found); on the CG_WR_TASKS path it is dropped and re-taken for
+ * each slot examined. Returns the matching proc_t or NULL.
+ */
+static proc_t *
+cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ)
+{
+	int i;
+	zoneid_t zoneid = curproc->p_zone->zone_id;
+	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/* getting a proc from a pid is easy */
+	if (typ == CG_WR_PROCS)
+		return (prfind(pid));
+
+	ASSERT(typ == CG_WR_TASKS);
+
+	/*
+	 * We have to scan all of the process entries to find the proc
+	 * containing this task. pidlock is released here and taken again
+	 * per slot so that it is not held across the whole scan.
+	 */
+	mutex_exit(&pidlock);
+	for (i = 1; i < v.v_proc; i++) {
+		proc_t *p;
+		kthread_t *t;
+
+		mutex_enter(&pidlock);
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, system processes,
+		 * a PID of 0, the pid for our zsched process, anything the
+		 * security policy doesn't allow us to look at, processes that
+		 * are not lx-branded and processes that are not in the zone.
+		 */
+		if ((p = pid_entry(i)) == NULL ||
+		    p->p_stat == SIDL ||
+		    (p->p_flag & SSYS) != 0 ||
+		    p->p_pid == 0 ||
+		    p->p_pid == schedpid ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+		    p->p_brand != &lx_brand ||
+		    p->p_zone->zone_id != zoneid) {
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		mutex_enter(&p->p_lock);
+		if ((t = p->p_tlist) == NULL) {
+			/* no threads, skip it */
+			mutex_exit(&p->p_lock);
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/*
+		 * Check all threads in this proc.
+		 */
+		do {
+			lx_lwp_data_t *plwpd = ttolxlwp(t);
+			if (plwpd != NULL && plwpd->br_pid == pid) {
+				/*
+				 * Found it. Only p_lock is dropped; pidlock
+				 * stays held for the caller.
+				 */
+				mutex_exit(&p->p_lock);
+				return (p);
+			}
+
+			t = t->t_forw;
+		} while (t != p->p_tlist);
+
+		mutex_exit(&p->p_lock);
+		mutex_exit(&pidlock);
+	}
+
+	/* Not found; restore the caller's lock state before returning. */
+	mutex_enter(&pidlock);
+	return (NULL);
+}
+
+/*
+ * Move a thread from one cgroup to another. If the old cgroup is empty
+ * we queue up an agent event. We return true in that case since we've
+ * dropped the locks and the caller needs to reacquire them.
+ *
+ * cgm	 - cgroup mount; cg_contents must be held on entry
+ * plwpd - lx lwp data of the thread being moved
+ * ncn	 - node of the destination cgroup
+ * cg_id - ID of the destination cgroup, stored into plwpd->br_cgroupid
+ * p	 - process owning the thread; p_lock must be held on entry
+ *
+ * Returns B_FALSE with both locks still held, or B_TRUE with both p_lock and
+ * cg_contents released.
+ */
+static boolean_t
+cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
+    uint_t cg_id, proc_t *p)
+{
+	cgrp_node_t *ocn;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(MUTEX_HELD(&p->p_lock));
+
+	/* The thread's current cgroup must exist. */
+	ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
+	VERIFY(ocn != NULL);
+
+	ASSERT(ocn->cgn_task_cnt > 0);
+	atomic_dec_32(&ocn->cgn_task_cnt);
+	atomic_inc_32(&ncn->cgn_task_cnt);
+	plwpd->br_cgroupid = cg_id;
+
+	/*
+	 * The old cgroup is "empty" when it has no tasks and holds only the
+	 * default N_DIRENTS entries (i.e. no sub-cgroups).
+	 */
+	if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
+	    ocn->cgn_notify == 1) {
+		/*
+		 * We want to drop p_lock before queuing the event since
+		 * that might sleep. Dropping p_lock might cause the caller to
+		 * have to restart the move process from the beginning.
+		 */
+		mutex_exit(&p->p_lock);
+		cgrp_rel_agent_event(cgm, ocn);
+		ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Assign either all of the threads, or a single thread, for the specified pid
+ * to the new cgroup. Controlled by the typ argument.
+ *
+ * cgm	 - cgroup mount
+ * cg_id - ID of the destination cgroup (must exist)
+ * pid	 - process ID (CG_WR_PROCS) or lx thread pid (CG_WR_TASKS); a value
+ *	   of 1 is translated to the zone's real init pid
+ * typ	 - CG_WR_PROCS to move every thread, CG_WR_TASKS to move one thread
+ *
+ * Returns 0 on success (including the case where the target is not an lx
+ * process, has no threads or is exiting, which is silently ignored) and ESRCH
+ * if no eligible process or task matches. A task that is already a member of
+ * the destination cgroup is a successful no-op, not ESRCH.
+ */
+static int
+cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
+{
+	proc_t *p;
+	kthread_t *t;
+	int error;
+	cgrp_node_t *ncn;
+
+	if (pid == 1)
+		pid = curproc->p_zone->zone_proc_initpid;
+
+	/*
+	 * Move one or all threads to this cgroup. For a single task we start
+	 * out pessimistic and clear the error once the task has been seen.
+	 */
+	if (typ == CG_WR_TASKS) {
+		error = ESRCH;
+	} else {
+		error = 0;
+	}
+
+restart:
+	mutex_enter(&pidlock);
+
+	p = cgrp_p_for_wr(pid, typ);
+	if (p == NULL) {
+		mutex_exit(&pidlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * Fail writes for pids for which there is no corresponding process,
+	 * system processes, a pid of 0, the pid for our zsched process,
+	 * anything the security policy doesn't allow us to look at, and
+	 * processes that are not in the zone.
+	 */
+	if (p->p_stat == SIDL ||
+	    (p->p_flag & SSYS) != 0 ||
+	    p->p_pid == 0 ||
+	    p->p_pid == curproc->p_zone->zone_zsched->p_pid ||
+	    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+	    p->p_zone->zone_id != curproc->p_zone->zone_id) {
+		mutex_exit(&pidlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * Ignore writes for a PID which is not an lx-branded process, has
+	 * no threads, or is exiting.
+	 */
+
+	mutex_enter(&p->p_lock);
+	mutex_exit(&pidlock);
+	if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
+	    p->p_flag & SEXITING) {
+		mutex_exit(&p->p_lock);
+		return (0);
+	}
+
+	mutex_enter(&cgm->cg_contents);
+
+	ncn = cgrp_cg_hash_lookup(cgm, cg_id);
+	VERIFY(ncn != NULL);
+
+	do {
+		lx_lwp_data_t *plwpd = ttolxlwp(t);
+
+		if (plwpd != NULL) {
+			if (typ == CG_WR_PROCS) {
+				if (plwpd->br_cgroupid != cg_id &&
+				    cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+					/*
+					 * We dropped all of the locks so we
+					 * need to start over.
+					 */
+					goto restart;
+				}
+			} else if (plwpd->br_pid == pid) {
+				/*
+				 * type is CG_WR_TASKS and we found the task.
+				 * This is a success even when the task is
+				 * already in the destination cgroup, in which
+				 * case there is nothing to move.
+				 */
+				error = 0;
+				if (plwpd->br_cgroupid != cg_id &&
+				    cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+					/* locks were dropped for us */
+					goto done;
+				}
+				break;
+			}
+		}
+		t = t->t_forw;
+	} while (t != p->p_tlist);
+
+	mutex_exit(&cgm->cg_contents);
+	mutex_exit(&p->p_lock);
+done:
+
+	return (error);
+}
+
+/*
+ * User-level is writing a pid string. We need to get that string and convert
+ * it to a pid. The user-level code has to completely write an entire pid
+ * string at once. The user-level code could write multiple strings (delimited
+ * by newline) although that is frowned upon. However, we must handle this
+ * case too. Thus we consume the input one byte at a time until we get a whole
+ * pid string. We can't consume more than a byte at a time since otherwise we
+ * might be left with a partial pid string.
+ *
+ * On success 0 is returned and *pid holds the parsed value. A uiomove()
+ * failure is returned as is. EINVAL is returned for empty input, for input
+ * that fills the 16-byte buffer without a newline, for non-decimal text, and
+ * for values that are negative or greater than maxpid.
+ */
+static int
+cgrp_get_pid_str(struct uio *uio, pid_t *pid)
+{
+	char buf[16];	/* big enough for a pid string */
+	int i;
+	int error;
+	char *p = &buf[0];
+	char *ep;
+	long pidnum;
+
+	/* Pre-zero so the string is terminated if input ends with no newline */
+	bzero(buf, sizeof (buf));
+	for (i = 0; uio->uio_resid > 0 && i < sizeof (buf); i++, p++) {
+		error = uiomove(p, 1, UIO_WRITE, uio);
+		if (error != 0)
+			return (error);
+		if (buf[i] == '\n') {
+			/* end of this pid string; leave the rest in the uio */
+			buf[i] = '\0';
+			break;
+		}
+	}
+
+	/* i reaches sizeof (buf) only if no newline was seen in 16 bytes */
+	if (buf[0] == '\0' || i >= sizeof (buf)) /* no input or too long */
+		return (EINVAL);
+
+	error = ddi_strtol(buf, &ep, 10, &pidnum);
+	if (error != 0 || *ep != '\0' || pidnum > maxpid || pidnum < 0)
+		return (EINVAL);
+
+	*pid = (pid_t)pidnum;
+	return (0);
+}
+
+/*
+ * Handle a write to the notify_on_release pseudo file. The written text must
+ * parse to 0 or 1; anything else is EINVAL. The value is stored on the
+ * parent node, which is the cgroup directory itself.
+ */
+static int
+cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio)
+{
+	uint_t value;
+	int error;
+
+	/*
+	 * The pid string parser is reused here purely as a convenient way to
+	 * turn the uio contents into a small non-negative number.
+	 */
+	error = cgrp_get_pid_str(uio, (pid_t *)&value);
+	if (error != 0)
+		return (error);
+
+	if (value > 1)
+		return (EINVAL);
+
+	/*
+	 * A plain store, so cg_contents is deliberately not taken.
+	 */
+	cn->cgn_parent->cgn_notify = value;
+	return (0);
+}
+
+/*
+ * Handle a write to the release_agent pseudo file: copy the user data into
+ * cgm->cg_agent at the uio offset, NUL terminate it and strip one trailing
+ * newline.
+ *
+ * Returns EINVAL for a negative offset or length, EFBIG if the write would
+ * extend past MAXPATHLEN bytes (or past the cg_agent buffer), otherwise the
+ * result of uiomove().
+ *
+ * The bounds are checked in 64-bit arithmetic before anything is narrowed to
+ * an int. The uio offset is user controlled (e.g. via pwrite), so a sum that
+ * is truncated to int could pass the length check while the untruncated
+ * offset is still used to index cg_agent.
+ */
+static int
+cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+	int error;
+	int len;
+	char *wrp;
+	offset_t off = uio->uio_offset;
+	offset_t resid = uio->uio_resid;
+	/* largest string length that still leaves room for the terminator */
+	offset_t maxlen = MIN((offset_t)MAXPATHLEN,
+	    (offset_t)sizeof (cgm->cg_agent) - 1);
+
+	if (off < 0 || resid < 0)
+		return (EINVAL);
+	if (off > maxlen || resid > maxlen - off)
+		return (EFBIG);
+
+	/* Safe to narrow: 0 <= off + resid <= maxlen <= MAXPATHLEN. */
+	len = (int)(off + resid);
+
+	mutex_enter(&cgm->cg_contents);
+
+	wrp = &cgm->cg_agent[off];
+	error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio);
+	cgm->cg_agent[len] = '\0';
+	if (len > 1 && cgm->cg_agent[len - 1] == '\n')
+		cgm->cg_agent[len - 1] = '\0';
+
+	mutex_exit(&cgm->cg_contents);
+	return (error);
+}
+
+/*
+ * Handle a write to the cgroup.procs or tasks pseudo file. Each ID string in
+ * the uio is parsed and the process (CG_WR_PROCS) or thread (CG_WR_TASKS) is
+ * assigned to the cgroup that owns the file. Processing stops at the first
+ * error, which is returned; 0 is returned once all input is consumed.
+ */
+static int
+cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio,
+    cgrp_wr_type_t typ)
+{
+	/* the pseudo file's parent directory is the cgroup and owns the ID */
+	uint_t cg_id = cn->cgn_parent->cgn_id;
+	int error = 0;
+
+	while (error == 0 && uio->uio_resid > 0) {
+		pid_t pidnum;
+
+		error = cgrp_get_pid_str(uio, &pidnum);
+		if (error == 0)
+			error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ);
+	}
+
+	return (error);
+}
+
+/*
+ * Common write path for the cgroup pseudo files. After validating the uio
+ * offset, the write is dispatched on the node type to the handler for that
+ * file.
+ *
+ * Returns EINVAL for a negative offset, EFBIG for an offset at or beyond
+ * MAXOFF_T, 0 for a zero-length write, and otherwise the handler's result.
+ * The cr and ct arguments are currently unused.
+ */
+static int
+cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr,
+    caller_context_t *ct)
+{
+	int error = 0;
+
+	ASSERT(CGNTOV(cn)->v_type == VREG);
+
+	if (uio->uio_loffset < 0)
+		return (EINVAL);
+
+	if (uio->uio_loffset >= MAXOFF_T)
+		return (EFBIG);
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	switch (cn->cgn_type) {
+	case CG_NOTIFY:
+		error = cgrp_wr_notify(cn, uio);
+		break;
+	case CG_PROCS:
+		error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS);
+		break;
+	case CG_REL_AGENT:
+		error = cgrp_wr_rel_agent(cgm, uio);
+		break;
+	case CG_TASKS:
+		error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS);
+		break;
+	default:
+		/* only the pseudo file types above are writable */
+		VERIFY(0);
+	}
+
+	return (error);
+}
+
+/*
+ * pidlock is held on entry but dropped on exit. Because we might have to drop
+ * locks and loop if the process is already P_PR_LOCKed, it is possible that
+ * the process might be gone when we return from this function.
+ *
+ * On success the process is returned with P_PR_LOCK set in p_proc_flag and
+ * with neither pidlock nor p_lock held; THREAD_KPRI_REQUEST() has been
+ * called, and cgrp_p_unlock() issues the matching THREAD_KPRI_RELEASE().
+ * NULL is returned if the process is exiting, can no longer be found by pid,
+ * or is in SIDL state; in that case no flag is set and no lock is held.
+ */
+static proc_t *
+cgrp_p_lock(proc_t *p)
+{
+	kmutex_t *mp;
+	pid_t pid;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/* first try the fast path */
+	mutex_enter(&p->p_lock);
+	if (p->p_flag & SEXITING) {
+		mutex_exit(&p->p_lock);
+		mutex_exit(&pidlock);
+		return (NULL);
+	}
+
+	if (!(p->p_proc_flag & P_PR_LOCK)) {
+		p->p_proc_flag |= P_PR_LOCK;
+		mutex_exit(&p->p_lock);
+		mutex_exit(&pidlock);
+		THREAD_KPRI_REQUEST();
+		return (p);
+	}
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Slow path: someone else holds P_PR_LOCK. Remember the pid so the
+	 * process can be looked up again after each wait.
+	 */
+	pid = p->p_pid;
+	for (;;) {
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait().  Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+		mutex_exit(&pidlock);
+
+		if (p->p_flag & SEXITING) {
+			mutex_exit(mp);
+			return (NULL);
+		}
+
+		/* Leave the loop with mp (p_lock) still held. */
+		if (!(p->p_proc_flag & P_PR_LOCK))
+			break;
+
+		cv_wait(&pr_pid_cv[p->p_slot], mp);
+		mutex_exit(mp);
+
+		/* p may be stale now; find the process again by pid. */
+		mutex_enter(&pidlock);
+		p = prfind(pid);
+		if (p == NULL || p->p_stat == SIDL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+	}
+
+	p->p_proc_flag |= P_PR_LOCK;
+	mutex_exit(mp);
+	ASSERT(!MUTEX_HELD(&pidlock));
+	THREAD_KPRI_REQUEST();
+	return (p);
+}
+
+/*
+ * Undo cgrp_p_lock(): clear P_PR_LOCK, wake a waiter on the process's
+ * pr_pid_cv slot and release kernel priority. The caller must hold p_lock
+ * (and must not hold pidlock) on entry; p_lock is dropped before returning.
+ */
+static void
+cgrp_p_unlock(proc_t *p)
+{
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	p->p_proc_flag &= ~P_PR_LOCK;
+	cv_signal(&pr_pid_cv[p->p_slot]);
+	mutex_exit(&p->p_lock);
+	THREAD_KPRI_RELEASE();
+}
+
+/*
+ * Read the notify_on_release pseudo file. The flag lives on the parent node
+ * (the cgroup directory) and is rendered as a decimal number followed by a
+ * newline. The cg_contents lock is not taken because the flag is fetched
+ * with a single load, so a racing update yields either the old or the new
+ * value.
+ */
+/* ARGSUSED */
+static int
+cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+	char buf[16];
+	uint_t value = cn->cgn_parent->cgn_notify;
+	int len;
+
+	len = snprintf(buf, sizeof (buf), "%u\n", value);
+
+	/* A read starting past the end of the text returns no data. */
+	if (uio->uio_offset > len)
+		return (0);
+
+	/* Copy out what is left after the offset, capped by the request. */
+	len -= uio->uio_offset;
+	if (uio->uio_resid < len)
+		len = uio->uio_resid;
+
+	return (uiomove(&buf[uio->uio_offset], len, UIO_READ, uio));
+}
+
+/*
+ * Read the release_agent pseudo file: copy out the portion of cgm->cg_agent
+ * that starts at the uio offset. An unset agent path, or an offset past the
+ * end of the string, reads as no data. cg_contents is held for the duration.
+ */
+static int
+cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+	int error = 0;
+
+	mutex_enter(&cgm->cg_contents);
+
+	if (cgm->cg_agent[0] != '\0') {
+		int len = strlen(cgm->cg_agent);
+
+		if (uio->uio_offset <= len) {
+			char *rdp = &cgm->cg_agent[uio->uio_offset];
+
+			len -= uio->uio_offset;
+			if (uio->uio_resid < len)
+				len = uio->uio_resid;
+
+			error = uiomove(rdp, len, UIO_READ, uio);
+		}
+	}
+
+	mutex_exit(&cgm->cg_contents);
+
+	return (error);
+}
+
+/*
+ * Read pids from the cgroup.procs pseudo file. We have to look at all of the
+ * processes to find applicable ones, then report pids for any process which
+ * has all of its threads in the same cgroup.
+ *
+ * The file content is generated on the fly as one "pid\n" line per process.
+ * 'offset' tracks the position within that virtual file so that output
+ * before uio_offset is skipped and a read can resume in the middle of a
+ * line. Returns 0, or the error from uiomove().
+ */
+static int
+cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+	int i;
+	ssize_t offset = 0;
+	ssize_t uresid;
+	zoneid_t zoneid = curproc->p_zone->zone_id;
+	int error = 0;
+	pid_t initpid = curproc->p_zone->zone_proc_initpid;
+	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+	/* the cgroup ID is on the containing dir */
+	uint_t cg_id = cn->cgn_parent->cgn_id;
+
+	/* Scan all of the process entries */
+	for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) {
+		proc_t *p;
+		int len;
+		pid_t pid;
+		char buf[16];
+		char *rdp;
+		kthread_t *t;
+		boolean_t in_cg;
+
+		mutex_enter(&pidlock);
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, system processes,
+		 * a PID of 0, the pid for our zsched process, anything the
+		 * security policy doesn't allow us to look at, processes that
+		 * are not lx-branded and processes that are not in the zone.
+		 */
+		if ((p = pid_entry(i)) == NULL ||
+		    p->p_stat == SIDL ||
+		    (p->p_flag & SSYS) != 0 ||
+		    p->p_pid == 0 ||
+		    p->p_pid == schedpid ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+		    p->p_brand != &lx_brand ||
+		    p->p_zone->zone_id != zoneid) {
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		mutex_enter(&p->p_lock);
+		if ((t = p->p_tlist) == NULL) {
+			/* no threads, skip it */
+			mutex_exit(&p->p_lock);
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/*
+		 * Check if all threads are in this cgroup.
+		 */
+		in_cg = B_TRUE;
+		mutex_enter(&cgm->cg_contents);
+		do {
+			lx_lwp_data_t *plwpd = ttolxlwp(t);
+			if (plwpd == NULL || plwpd->br_cgroupid != cg_id) {
+				in_cg = B_FALSE;
+				break;
+			}
+
+			t = t->t_forw;
+		} while (t != p->p_tlist);
+		mutex_exit(&cgm->cg_contents);
+
+		mutex_exit(&p->p_lock);
+		if (!in_cg) {
+			/*
+			 * This proc, or at least one of its threads, is not
+			 * in this cgroup.
+			 */
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/*
+		 * Convert pid to the Linux default of 1 if we're the zone's
+		 * init process, otherwise use the value from the proc struct
+		 */
+		if (p->p_pid == initpid) {
+			pid = 1;
+		} else {
+			pid = p->p_pid;
+		}
+
+		/* p is not referenced again once pidlock is dropped. */
+		mutex_exit(&pidlock);
+
+		/*
+		 * Generate pid line and write all or part of it if we're
+		 * in the right spot within the pseudo file.
+		 */
+		len = snprintf(buf, sizeof (buf), "%u\n", pid);
+		if ((offset + len) > uio->uio_offset) {
+			/* bytes of this line that precede the read offset */
+			int diff = (int)(uio->uio_offset - offset);
+
+			ASSERT(diff < len);
+			offset += diff;
+			rdp = &buf[diff];
+			len -= diff;
+			/* uresid is uio_resid as sampled by the loop test */
+			if (len > uresid)
+				len = uresid;
+
+			error = uiomove(rdp, len, UIO_READ, uio);
+			if (error != 0)
+				return (error);
+		}
+		offset += len;
+	}
+
+	return (0);
+}
+
+/*
+ * We are given a locked process we know is valid, report on any of its threads
+ * that are in the cgroup.
+ *
+ * cg_id   - ID of the cgroup being read
+ * p	   - process with P_PR_LOCK set by the caller
+ * initpid - the zone's init pid; a thread pid equal to it is reported as 1
+ * offset  - in/out position within the virtual tasks file generated so far
+ * uio	   - destination of the read
+ *
+ * Returns 0, or the error from uiomove().
+ */
+static int
+cgrp_rd_proc_tasks(uint_t cg_id, proc_t *p, pid_t initpid, ssize_t *offset,
+    struct uio *uio)
+{
+	int error = 0;
+	uint_t tid;
+	char buf[16];
+	char *rdp;
+	kthread_t *t;
+
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+
+	/*
+	 * Report all threads in this cgroup.
+	 */
+	t = p->p_tlist;
+	do {
+		lx_lwp_data_t *plwpd = ttolxlwp(t);
+		if (plwpd == NULL) {
+			/*
+			 * No lx lwp data; advance first, since 'continue' in
+			 * a do-while jumps straight to the loop condition.
+			 */
+			t = t->t_forw;
+			continue;
+		}
+
+		if (plwpd->br_cgroupid == cg_id) {
+			int len;
+
+			/*
+			 * Convert taskid to the Linux default of 1 if
+			 * we're the zone's init process.
+			 */
+			tid = plwpd->br_pid;
+			if (tid == initpid)
+				tid = 1;
+
+			len = snprintf(buf, sizeof (buf), "%u\n", tid);
+			if ((*offset + len) > uio->uio_offset) {
+				/* bytes of this line before the read offset */
+				int diff;
+
+				diff = (int)(uio->uio_offset - *offset);
+				ASSERT(diff < len);
+				*offset = *offset + diff;
+				rdp = &buf[diff];
+				len -= diff;
+				if (len > uio->uio_resid)
+					len = uio->uio_resid;
+
+				error = uiomove(rdp, len, UIO_READ, uio);
+				if (error != 0)
+					return (error);
+			}
+			*offset = *offset + len;
+		}
+
+		t = t->t_forw;
+	} while (t != p->p_tlist && uio->uio_resid > 0);
+
+	return (0);
+}
+
+/*
+ * Read pids from the tasks pseudo file. We have to look at all of the
+ * processes to find applicable ones, then report pids for any thread in the
+ * cgroup. We return the emulated lx thread pid here, not the internal thread
+ * ID. Because we're possibly doing IO for each taskid we lock the process
+ * so that the threads don't change while we're working on it (although threads
+ * can change if we fill up the read buffer and come back later for a
+ * subsequent read).
+ *
+ * Returns 0, or the error from uiomove() as passed back by
+ * cgrp_rd_proc_tasks().
+ */
+int
+cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+	int i;
+	ssize_t offset = 0;
+	zoneid_t zoneid = curproc->p_zone->zone_id;
+	int error = 0;
+	pid_t initpid = curproc->p_zone->zone_proc_initpid;
+	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+	/* the cgroup ID is on the containing dir */
+	uint_t cg_id = cn->cgn_parent->cgn_id;
+
+	/* Scan all of the process entries until the read buffer is full */
+	for (i = 1; i < v.v_proc && uio->uio_resid > 0; i++) {
+		proc_t *p;
+
+		mutex_enter(&pidlock);
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, system processes,
+		 * a PID of 0, the pid for our zsched process, anything the
+		 * security policy doesn't allow us to look at, processes that
+		 * are not lx-branded and processes that are not in the zone.
+		 */
+		if ((p = pid_entry(i)) == NULL ||
+		    p->p_stat == SIDL ||
+		    (p->p_flag & SSYS) != 0 ||
+		    p->p_pid == 0 ||
+		    p->p_pid == schedpid ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+		    p->p_brand != &lx_brand ||
+		    p->p_zone->zone_id != zoneid) {
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		if (p->p_tlist == NULL) {
+			/* no threads, skip it */
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/*
+		 * cgrp_p_lock() drops pidlock for us. It returns NULL if the
+		 * process went away while we were waiting for it.
+		 */
+		p = cgrp_p_lock(p);
+		ASSERT(!MUTEX_HELD(&pidlock));
+		if (p == NULL)
+			continue;
+
+		mutex_enter(&cgm->cg_contents);
+		error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio);
+		mutex_exit(&cgm->cg_contents);
+
+		/* cgrp_p_unlock() expects p_lock held and releases it. */
+		mutex_enter(&p->p_lock);
+		cgrp_p_unlock(p);
+
+		if (error != 0)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Common read path for the cgroup pseudo files. An offset at or beyond
+ * MAXOFF_T, or a zero-length request, reads as no data; a negative offset is
+ * EINVAL. Otherwise the read is dispatched on the node type. The ct argument
+ * is currently unused.
+ */
+static int
+cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
+{
+	if (uio->uio_loffset >= MAXOFF_T)
+		return (0);
+	if (uio->uio_loffset < 0)
+		return (EINVAL);
+	if (uio->uio_resid == 0)
+		return (0);
+
+	switch (cn->cgn_type) {
+	case CG_NOTIFY:
+		return (cgrp_rd_notify(cgm, cn, uio));
+	case CG_PROCS:
+		return (cgrp_rd_procs(cgm, cn, uio));
+	case CG_REL_AGENT:
+		return (cgrp_rd_rel_agent(cgm, uio));
+	case CG_TASKS:
+		return (cgrp_rd_tasks(cgm, cn, uio));
+	default:
+		/* only the pseudo file types above are readable */
+		VERIFY(0);
+	}
+
+	return (0);
+}
+
+/*
+ * VOP_READ entry point. Only regular files can be read: a directory yields
+ * EISDIR and any other non-regular vnode yields EINVAL.
+ */
+/* ARGSUSED2 */
+static int
+cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
+    struct caller_context *ct)
+{
+	switch (vp->v_type) {
+	case VREG:
+		break;
+	case VDIR:
+		return (EISDIR);
+	default:
+		return (EINVAL);
+	}
+
+	return (cgrp_rd(VTOCGM(vp), VTOCGN(vp), uiop, ct));
+}
+
+/*
+ * VOP_WRITE entry point. Only regular files can be written; anything else is
+ * EINVAL. With FAPPEND the write is positioned at the node's recorded size
+ * before being handed to cgrp_wr().
+ */
+static int
+cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
+    struct caller_context *ct)
+{
+	cgrp_node_t *cn;
+
+	if (vp->v_type != VREG)
+		return (EINVAL);
+
+	cn = VTOCGN(vp);
+
+	/* In append mode start at end of file. */
+	if ((ioflag & FAPPEND) != 0)
+		uiop->uio_loffset = cn->cgn_size;
+
+	return (cgrp_wr(VTOCGM(vp), cn, uiop, cred, ct));
+}
+
+/*
+ * VOP_GETATTR entry point: fill in *vap from the attributes stored in the
+ * cgroup node behind vp. The copy is made under cg_contents. Always returns
+ * 0.
+ */
+/* ARGSUSED2 */
+static int
+cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
+    caller_context_t *ct)
+{
+	cgrp_node_t *cn = VTOCGN(vp);
+	cgrp_mnt_t *cgm;
+
+	cgm = VTOCGM(cn->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+	vap->va_type = vp->v_type;
+	vap->va_mode = cn->cgn_mode & MODEMASK;
+	vap->va_uid = cn->cgn_uid;
+	vap->va_gid = cn->cgn_gid;
+	vap->va_fsid = cn->cgn_fsid;
+	vap->va_nodeid = (ino64_t)cn->cgn_nodeid;
+	vap->va_nlink = cn->cgn_nlink;
+	vap->va_size = (u_offset_t)cn->cgn_size;
+	vap->va_atime = cn->cgn_atime;
+	vap->va_mtime = cn->cgn_mtime;
+	vap->va_ctime = cn->cgn_ctime;
+	vap->va_blksize = PAGESIZE;
+	vap->va_rdev = cn->cgn_rdev;
+	vap->va_seq = cn->cgn_seq;
+
+	/* Block count is the size rounded up to whole pages, in disk blocks. */
+	vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+	mutex_exit(&cgm->cg_contents);
+	return (0);
+}
+
+/*
+ * VOP_SETATTR entry point: change mode, owner, group and access/modify times
+ * of a cgroup node.
+ *
+ * Returns EINVAL if the request names an attribute that cannot be set
+ * (AT_NOSET, AT_XVATTR, AT_SIZE) or tries to set the setuid/setgid mode
+ * bits, the error from secpolicy_vnode_setattr() if the caller lacks
+ * permission, and 0 on success.
+ */
+/*ARGSUSED4*/
+static int
+cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
+    caller_context_t *ct)
+{
+	cgrp_node_t *cn = VTOCGN(vp);
+	cgrp_mnt_t *cgm;
+	int error = 0;
+	struct vattr *get;
+	long mask;
+
+	/*
+	 * Cannot set these attributes. va_mode is only meaningful when
+	 * AT_MODE is set in va_mask, so the setuid/setgid check must be
+	 * limited to that case; otherwise a request that doesn't touch the
+	 * mode could be rejected based on whatever va_mode happens to hold.
+	 */
+	if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR) ||
+	    ((vap->va_mask & AT_MODE) &&
+	    (vap->va_mode & (S_ISUID | S_ISGID))) ||
+	    (vap->va_mask & AT_SIZE))
+		return (EINVAL);
+
+	cgm = VTOCGM(cn->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+
+	get = &cn->cgn_attr;
+	/*
+	 * Change file access modes. Must be owner or have sufficient
+	 * privileges.
+	 */
+	error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, cgrp_taccess,
+	    cn);
+
+	if (error)
+		goto out;
+
+	/* Read the mask only now, after the policy check has seen vap. */
+	mask = vap->va_mask;
+
+	if (mask & AT_MODE) {
+		/* keep the file type bits, replace the permission bits */
+		get->va_mode &= S_IFMT;
+		get->va_mode |= vap->va_mode & ~S_IFMT;
+	}
+
+	if (mask & AT_UID)
+		get->va_uid = vap->va_uid;
+	if (mask & AT_GID)
+		get->va_gid = vap->va_gid;
+	if (mask & AT_ATIME)
+		get->va_atime = vap->va_atime;
+	if (mask & AT_MTIME)
+		get->va_mtime = vap->va_mtime;
+
+	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+		gethrestime(&cn->cgn_ctime);
+
+out:
+	mutex_exit(&cgm->cg_contents);
+	return (error);
+}
+
+/*
+ * VOP_ACCESS entry point: run the cgroup access check for the requested mode
+ * against the node behind vp, with cg_contents held across the check.
+ */
+/* ARGSUSED2 */
+static int
+cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred,
+    caller_context_t *ct)
+{
+	cgrp_node_t *cn = VTOCGN(vp);
+	cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode);
+	int error;
+
+	mutex_enter(&cgm->cg_contents);
+	error = cgrp_taccess(cn, mode, cred);
+	mutex_exit(&cgm->cg_contents);
+
+	return (error);
+}
+
+/*
+ * VOP_LOOKUP entry point: resolve the name nm within directory dvp and hand
+ * back the vnode in *vpp. Extended attribute lookups are rejected with
+ * EINVAL. An empty name refers to dvp itself, which is returned with an
+ * additional hold. Any other failure is whatever cgrp_dirlookup() returns.
+ */
+/* ARGSUSED3 */
+static int
+cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+	cgrp_node_t *dcn = VTOCGN(dvp);
+	cgrp_node_t *found = NULL;
+	cgrp_mnt_t *cgm;
+	int error;
+
+	/* disallow extended attrs */
+	if ((flags & LOOKUP_XATTR) != 0)
+		return (EINVAL);
+
+	/* A null component name is a synonym for the directory itself. */
+	if (nm[0] == '\0') {
+		VN_HOLD(dvp);
+		*vpp = dvp;
+		return (0);
+	}
+	ASSERT(dcn != NULL);
+
+	cgm = VTOCGM(dcn->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+	error = cgrp_dirlookup(dcn, nm, &found, cred);
+	mutex_exit(&cgm->cg_contents);
+
+	if (error != 0)
+		return (error);
+
+	ASSERT(found != NULL);
+	*vpp = CGNTOV(found);
+	return (0);
+}
+
+/*
+ * VOP_CREATE entry point. New files can never be created in a cgroup
+ * directory, so the only successful outcome is "creating" a name that already
+ * exists, in which case its vnode is returned in *vpp.
+ *
+ * Returns EPERM for an empty name or a name that does not exist, EEXIST for
+ * an exclusive create of an existing name, EISDIR when an existing directory
+ * is opened for writing, or the result of the access check otherwise.
+ */
+/*ARGSUSED7*/
+static int
+cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
+    enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred,
+    int flag, caller_context_t *ct, vsecattr_t *vsecp)
+{
+	cgrp_node_t *parent = VTOCGN(dvp);
+	cgrp_node_t *cn = NULL;
+	cgrp_mnt_t *cgm;
+	int error;
+
+	if (*nm == '\0')
+		return (EPERM);
+
+	cgm = VTOCGM(parent->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+	error = cgrp_dirlookup(parent, nm, &cn, cred);
+	if (error == 0) {		/* name found */
+		ASSERT(cn);
+
+		mutex_exit(&cgm->cg_contents);
+		/*
+		 * Creating an existing file, allow it except for the following
+		 * errors.
+		 */
+		if (exclusive == EXCL) {
+			error = EEXIST;
+		} else if ((CGNTOV(cn)->v_type == VDIR) && (mode & VWRITE)) {
+			error = EISDIR;
+		} else {
+			error = cgrp_taccess(cn, mode, cred);
+		}
+		if (error != 0) {
+			/* give back the node obtained by the lookup */
+			cgnode_rele(cn);
+			return (error);
+		}
+		*vpp = CGNTOV(cn);
+		return (0);
+	}
+	mutex_exit(&cgm->cg_contents);
+
+	/*
+	 * cgroups doesn't allow creation of additional, non-subsystem specific
+	 * files in a dir
+	 */
+	return (EPERM);
+}
+
+/*
+ * VOP_REMOVE entry point. Files in a cgroup directory can never be removed,
+ * but the name is still looked up so that a missing file gets the lookup
+ * error rather than EPERM.
+ */
+/* ARGSUSED3 */
+static int
+cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
+    caller_context_t *ct, int flags)
+{
+	cgrp_node_t *parent = VTOCGN(dvp);
+	cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode);
+	cgrp_node_t *victim = NULL;
+	int error;
+
+	mutex_enter(&cgm->cg_contents);
+	error = cgrp_dirlookup(parent, nm, &victim, cred);
+	mutex_exit(&cgm->cg_contents);
+
+	if (error == 0) {
+		/* The file exists: drop the node again and refuse. */
+		ASSERT(victim);
+		cgnode_rele(victim);
+		error = EPERM;
+	}
+
+	return (error);
+}
+
+/*
+ * VOP_LINK entry point. Always fails with EPERM; none of the arguments are
+ * examined.
+ */
+/* ARGSUSED4 */
+static int
+cgrp_link(struct vnode *dvp, struct vnode *srcvp, char *cnm, struct cred *cred,
+    caller_context_t *ct, int flags)
+{
+	/* cgroups doesn't support hard links */
+	return (EPERM);
+}
+
+/*
+ * Rename of subsystem-specific files is not allowed but we can rename
+ * directories (i.e. sub-groups). We cannot mv subdirs from one group to
+ * another so the src and dest vnode must be the same.
+ *
+ * Returns EIO if the source and destination directories differ, EPERM if the
+ * source is not a cgroup directory, EINVAL for a rename involving "." or
+ * "..", or an error from the lookup, access check, or directory update.
+ */
+/* ARGSUSED5 */
+static int
+cgrp_rename(
+	struct vnode *odvp,	/* source parent vnode */
+	char *onm,		/* source name */
+	struct vnode *ndvp,	/* destination parent vnode */
+	char *nnm,		/* destination name */
+	struct cred *cred,
+	caller_context_t *ct,
+	int flags)
+{
+	cgrp_node_t *fromparent;
+	cgrp_node_t *toparent;
+	cgrp_node_t *fromcn = NULL;	/* source cgrp_node */
+	cgrp_mnt_t *cgm = VTOCGM(odvp);
+	int error, err;
+
+	fromparent = VTOCGN(odvp);
+	toparent = VTOCGN(ndvp);
+
+	if (fromparent != toparent)
+		return (EIO);
+
+	/* discourage additional use of toparent */
+	toparent = NULL;
+
+	mutex_enter(&cgm->cg_contents);
+
+	/*
+	 * Look up cgrp_node of file we're supposed to rename.
+	 */
+	error = cgrp_dirlookup(fromparent, onm, &fromcn, cred);
+	if (error) {
+		mutex_exit(&cgm->cg_contents);
+		return (error);
+	}
+
+	/* From here on every exit goes through 'done' to release fromcn. */
+	if (fromcn->cgn_type != CG_CGROUP_DIR) {
+		error = EPERM;
+		goto done;
+	}
+
+	/*
+	 * Make sure we can delete the old (source) entry.  This
+	 * requires write permission on the containing directory.
+	 */
+	if (((error = cgrp_taccess(fromparent, VWRITE, cred)) != 0))
+		goto done;
+
+	/*
+	 * Check for renaming to or from '.' or '..' or that
+	 * fromcn == fromparent
+	 */
+	if ((onm[0] == '.' &&
+	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
+	    (nnm[0] == '.' &&
+	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
+	    (fromparent == fromcn)) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Link source to new target
+	 */
+	error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME,
+	    fromcn, (struct vattr *)NULL,
+	    (cgrp_node_t **)NULL, cred, ct);
+
+	if (error)
+		goto done;
+
+	/*
+	 * Unlink from source. 'err' keeps the raw result so that the rename
+	 * events below fire only when the delete itself succeeded.
+	 */
+	error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred);
+
+	/*
+	 * The following handles the case where our source cgrp_node was
+	 * removed before we got to it.
+	 */
+	if (error == ENOENT)
+		error = 0;
+
+	if (err == 0) {
+		vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct);
+		vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct);
+	}
+
+done:
+	mutex_exit(&cgm->cg_contents);
+	cgnode_rele(fromcn);
+
+	return (error);
+}
+
+/*
+ * VOP_MKDIR entry point: create the directory nm under dvp and return its
+ * vnode in *vpp.
+ *
+ * Returns ENOENT if the parent has no links left, EEXIST if the name already
+ * exists, or an error from the lookup or from cgrp_direnter().
+ */
+/* ARGSUSED5 */
+static int
+cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
+    struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+	cgrp_node_t *parent = VTOCGN(dvp);
+	cgrp_node_t *self = NULL;
+	cgrp_mnt_t *cgm = VTOCGM(dvp);
+	int error;
+
+	/*
+	 * Might be dangling directory.  Catch it here, because a ENOENT
+	 * return from cgrp_dirlookup() is an "ok return".
+	 */
+	if (parent->cgn_nlink == 0)
+		return (ENOENT);
+
+	mutex_enter(&cgm->cg_contents);
+	error = cgrp_dirlookup(parent, nm, &self, cred);
+	if (error == 0) {
+		/* The name is already taken; give back the node we found. */
+		ASSERT(self != NULL);
+		mutex_exit(&cgm->cg_contents);
+		cgnode_rele(self);
+		return (EEXIST);
+	}
+	if (error != ENOENT) {
+		mutex_exit(&cgm->cg_contents);
+		return (error);
+	}
+
+	/* ENOENT from the lookup is the expected case: create the entry. */
+	error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL,
+	    va, &self, cred, ct);
+	if (error) {
+		mutex_exit(&cgm->cg_contents);
+		if (self != NULL)
+			cgnode_rele(self);
+		return (error);
+	}
+	mutex_exit(&cgm->cg_contents);
+	*vpp = CGNTOV(self);
+	return (0);
+}
+
+/*
+ * VOP_RMDIR entry point: remove the cgroup directory nm from dvp. A cgroup
+ * can only be removed when it has no tasks and no sub-cgroups. If the removal
+ * leaves the parent cgroup empty and the parent has notify set, a release
+ * agent event is generated for the parent.
+ *
+ * Returns EINVAL for "." or when nm resolves to dvp or cdir, EEXIST for ".."
+ * or a non-empty cgroup, ENOTDIR if nm is not a cgroup directory, EBUSY if
+ * the vnode cannot be vfs-locked or is a mount point, or an error from the
+ * lookup or from cgrp_dirdelete().
+ */
+/* ARGSUSED4 */
+static int
+cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
+    caller_context_t *ct, int flags)
+{
+	cgrp_node_t *parent = VTOCGN(dvp);
+	cgrp_mnt_t *cgm;
+	cgrp_node_t *self = NULL;
+	struct vnode *vp;
+	int error = 0;
+
+	/*
+	 * Return error when removing . and ..
+	 */
+	if (strcmp(nm, ".") == 0)
+		return (EINVAL);
+	if (strcmp(nm, "..") == 0)
+		return (EEXIST); /* Should be ENOTEMPTY */
+
+	cgm = VTOCGM(parent->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+
+	error = cgrp_dirlookup(parent, nm, &self, cred);
+	if (error) {
+		mutex_exit(&cgm->cg_contents);
+		return (error);
+	}
+
+	vp = CGNTOV(self);
+	if (vp == dvp || vp == cdir) {
+		error = EINVAL;
+		goto done;
+	}
+	if (self->cgn_type != CG_CGROUP_DIR) {
+		error = ENOTDIR;
+		goto done;
+	}
+
+	/*
+	 * NOTE(review): cgm was already derived from the parent's vnode
+	 * above; presumably this yields the same mount -- confirm.
+	 */
+	cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp);
+
+	/*
+	 * Check for the existence of any sub-cgroup directories or tasks in
+	 * the cgroup.
+	 */
+	if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) {
+		error = EEXIST;
+		/*
+		 * Update atime because checking cn_dirents is logically
+		 * equivalent to reading the directory
+		 */
+		gethrestime(&self->cgn_atime);
+		goto done;
+	}
+
+	if (vn_vfswlock(vp)) {
+		error = EBUSY;
+		goto done;
+	}
+	if (vn_mountedvfs(vp) != NULL) {
+		error = EBUSY;
+	} else {
+		error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
+	}
+
+	vn_vfsunlock(vp);
+
+	/*
+	 * If the parent is now empty and wants notification, raise the
+	 * release agent event. That call drops cg_contents for us, so skip
+	 * the unlock at 'done'.
+	 */
+	if (parent->cgn_task_cnt == 0 &&
+	    parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) {
+		cgrp_rel_agent_event(cgm, parent);
+		ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+		goto dropped;
+	}
+
+done:
+	mutex_exit(&cgm->cg_contents);
+dropped:
+	/*
+	 * NOTE(review): the error paths above also reach this point, so the
+	 * rmdir vnevent is posted even when the removal failed -- confirm
+	 * that this is intended.
+	 */
+	vnevent_rmdir(CGNTOV(self), dvp, nm, ct);
+	cgnode_rele(self);
+
+	return (error);
+}
+
+/*
+ * VOP_READDIR: copy out as many dirent64 records as fit in the caller's
+ * single iovec, starting with the first entry whose cgd_offset is at or
+ * beyond uio_offset.
+ *
+ * Returns EINVAL when more than one iovec is supplied or when the buffer is
+ * too small for even one entry, ENOTDIR when 'vp' is not a directory, and
+ * otherwise the result of uiomove().  On success uio_offset is advanced and
+ * *eofp (if supplied) reports whether the end of the list was reached.
+ */
+/* ARGSUSED2 */
+static int
+cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	cgrp_node_t *cn = VTOCGN(vp);
+	cgrp_mnt_t *cgm;
+	cgrp_dirent_t *cdp;
+	struct dirent64 *dp;
+	caddr_t outbuf;
+	ulong_t offset = 0;
+	ulong_t total_bytes_wanted;
+	size_t namelen;
+	long outcount = 0;
+	long bufsize;
+	int reclen;
+	int error = 0;
+
+	if (uiop->uio_loffset >= MAXOFF_T) {
+		if (eofp != NULL)
+			*eofp = 1;
+		return (0);
+	}
+
+	if (uiop->uio_iovcnt != 1)
+		return (EINVAL);
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	cgm = VTOCGM(cn->cgn_vnode);
+	mutex_enter(&cgm->cg_contents);
+
+	/* A directory without an entry list must already be unlinked. */
+	if (cn->cgn_dir == NULL) {
+		VERIFY(cn->cgn_nlink == 0);
+		mutex_exit(&cgm->cg_contents);
+		return (0);
+	}
+
+	/*
+	 * Stage the entries in a kernel buffer sized for the request plus
+	 * one spare dirent64, then copy them out in a single uiomove().
+	 */
+	total_bytes_wanted = uiop->uio_iov->iov_len;
+	bufsize = total_bytes_wanted + sizeof (struct dirent64);
+	outbuf = kmem_alloc(bufsize, KM_SLEEP);
+	dp = (struct dirent64 *)outbuf;
+
+	for (cdp = cn->cgn_dir; cdp != NULL; cdp = cdp->cgd_next) {
+		offset = cdp->cgd_offset;
+		if (offset < uiop->uio_offset)
+			continue;	/* already returned by earlier call */
+
+		namelen = strlen(cdp->cgd_name);	/* no +1 needed */
+		reclen = (int)DIRENT64_RECLEN(namelen);
+		if (outcount + reclen > total_bytes_wanted) {
+			/* Buffer too small for any entries. */
+			if (outcount == 0)
+				error = EINVAL;
+			break;
+		}
+		ASSERT(cdp->cgd_cgrp_node != NULL);
+
+		/* use strncpy(9f) to zero out uninitialized bytes */
+		(void) strncpy(dp->d_name, cdp->cgd_name,
+		    DIRENT64_NAMELEN(reclen));
+		dp->d_ino = (ino64_t)cdp->cgd_cgrp_node->cgn_nodeid;
+		dp->d_off = (offset_t)cdp->cgd_offset + 1;
+		dp->d_reclen = (ushort_t)reclen;
+
+		dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
+		outcount += reclen;
+		ASSERT(outcount <= bufsize);
+	}
+
+	if (error == 0)
+		error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+	if (error == 0) {
+		/*
+		 * If we walked off the end of the list, the new offset is
+		 * one past the last entry seen; otherwise it is the offset
+		 * of the entry that did not fit.
+		 */
+		if (cdp == NULL)
+			offset += 1;
+		if (eofp != NULL)
+			*eofp = (cdp == NULL) ? 1 : 0;
+		uiop->uio_offset = offset;
+	}
+	gethrestime(&cn->cgn_atime);
+
+	mutex_exit(&cgm->cg_contents);
+
+	kmem_free(outbuf, bufsize);
+	return (error);
+}
+
+/*
+ * VOP_SYMLINK entry point.  No argument is examined; the call always fails
+ * with EPERM.
+ */
+/* ARGSUSED5 */
+static int
+cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm,
+    struct cred *cred, caller_context_t *ct, int flags)
+{
+	/* cgroups doesn't support symlinks */
+	return (EPERM);
+}
+
+/*
+ * VOP_INACTIVE: drop a hold on 'vp'.  If this is the last hold and the node
+ * has no remaining links, the node is removed from the mount's cgn_forw /
+ * cgn_back node list and both the vnode and the cgrp_node are freed.
+ */
+/* ARGSUSED */
+static void
+cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
+{
+	cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp);
+	cgrp_node_t *cn = VTOCGN(vp);
+
+	mutex_enter(&cgm->cg_contents);
+	mutex_enter(&vp->v_lock);
+	ASSERT(vp->v_count >= 1);
+
+	if (vp->v_count <= 1 && cn->cgn_nlink == 0) {
+		/*
+		 * Last hold on an unlinked node: splice it out of the node
+		 * list.  A node with no successor is referenced by the root
+		 * node's cgn_back instead.
+		 */
+		if (cn->cgn_forw != NULL)
+			cn->cgn_forw->cgn_back = cn->cgn_back;
+		else
+			cgm->cg_rootnode->cgn_back = cn->cgn_back;
+		cn->cgn_back->cgn_forw = cn->cgn_forw;
+
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&cgm->cg_contents);
+
+		/* Here's our chance to send invalid event */
+		vn_invalid(CGNTOV(cn));
+
+		vn_free(CGNTOV(cn));
+		kmem_free(cn, sizeof (cgrp_node_t));
+		return;
+	}
+
+	/*
+	 * Either other holds remain or the node is still linked into the
+	 * file system; there's little to do -- just drop our hold.
+	 */
+	vp->v_count--;
+	mutex_exit(&vp->v_lock);
+	mutex_exit(&cgm->cg_contents);
+}
+
+/*
+ * VOP_SEEK: validate the requested offset.  Any offset in the range
+ * [0, MAXOFFSET_T] is accepted; anything else yields EINVAL.
+ */
+/* ARGSUSED */
+static int
+cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
+    caller_context_t *ct)
+{
+	if (*noffp < 0 || *noffp > MAXOFFSET_T)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * VOP_RWLOCK entry point.  No lock is taken here; the caller's write_lock
+ * argument is simply handed back as the return value.
+ */
+/* ARGSUSED */
+static int
+cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+	return (write_lock);
+}
+
+/*
+ * VOP_RWUNLOCK entry point.  Intentionally empty: cgrp_rwlock() acquires
+ * nothing, so there is nothing to release.
+ */
+/* ARGSUSED */
+static void
+cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+}
+
+/*
+ * VOP_PATHCONF: answer the handful of pathconf queries handled locally and
+ * defer everything else to fs_pathconf().
+ */
+static int
+cgrp_pathconf(struct vnode *vp, int cmd, ulong_t *valp, cred_t *cr,
+    caller_context_t *ct)
+{
+	switch (cmd) {
+	case _PC_XATTR_EXISTS:
+		/* Only a valid question on a mount with VFS_XATTR set. */
+		if ((vp->v_vfsp->vfs_flag & VFS_XATTR) == 0)
+			return (EINVAL);
+		*valp = 0;	/* assume no attributes */
+		return (0);	/* okay to ask */
+
+	case _PC_SATTR_ENABLED:
+	case _PC_SATTR_EXISTS:
+		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+		    (vp->v_type == VREG || vp->v_type == VDIR);
+		return (0);
+
+	case _PC_TIMESTAMP_RESOLUTION:
+		/* nanosecond timestamp resolution */
+		*valp = 1L;
+		return (0);
+
+	default:
+		return (fs_pathconf(vp, cmd, valp, cr, ct));
+	}
+}
+
+
+/*
+ * Vnode operations vector for the cgroup file system.  The pointer is
+ * defined here; it is not assigned anywhere in this part of the file.
+ */
+struct vnodeops *cgrp_vnodeops;
+
+/*
+ * Template pairing each supported VOP name with its cgrp_* implementation.
+ * Vnode events are routed to the generic fs_vnevent_support().  The list is
+ * terminated by a NULL/NULL pair.
+ */
+const fs_operation_def_t cgrp_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = cgrp_open },
+	VOPNAME_CLOSE,		{ .vop_close = cgrp_close },
+	VOPNAME_READ,		{ .vop_read = cgrp_read },
+	VOPNAME_WRITE,		{ .vop_write = cgrp_write },
+	VOPNAME_GETATTR,	{ .vop_getattr = cgrp_getattr },
+	VOPNAME_SETATTR,	{ .vop_setattr = cgrp_setattr },
+	VOPNAME_ACCESS,		{ .vop_access = cgrp_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = cgrp_lookup },
+	VOPNAME_CREATE,		{ .vop_create = cgrp_create },
+	VOPNAME_REMOVE,		{ .vop_remove = cgrp_remove },
+	VOPNAME_LINK,		{ .vop_link = cgrp_link },
+	VOPNAME_RENAME,		{ .vop_rename = cgrp_rename },
+	VOPNAME_MKDIR,		{ .vop_mkdir = cgrp_mkdir },
+	VOPNAME_RMDIR,		{ .vop_rmdir = cgrp_rmdir },
+	VOPNAME_READDIR,	{ .vop_readdir = cgrp_readdir },
+	VOPNAME_SYMLINK,	{ .vop_symlink = cgrp_symlink },
+	VOPNAME_INACTIVE,	{ .vop_inactive = cgrp_inactive },
+	VOPNAME_RWLOCK,		{ .vop_rwlock = cgrp_rwlock },
+	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = cgrp_rwunlock },
+	VOPNAME_SEEK,		{ .vop_seek = cgrp_seek },
+	VOPNAME_PATHCONF,	{ .vop_pathconf = cgrp_pathconf },
+	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
+	NULL,			NULL
+};
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h
new file mode 100644
index 0000000000..cd256c27c5
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd.h
@@ -0,0 +1,232 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LXD_H
+#define	_LXD_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxd.h: declarations, data structures and macros for lxd (lxd devfs).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/atomic.h>
+#include <vm/anon.h>
+#include <sys/lx_types.h>
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+/*
+ * It's unlikely that we need to create more than 50-60 subdirs/symlinks
+ * as front files so we size the file system hash for 2x that number.
+ * The back devfs typically has ~80 nodes so this is also a comfortable size
+ * for the back hash table.
+ *
+ * The hash macros below reduce their result with "& (LXD_HASH_SZ - 1)", so
+ * this value must remain a power of two.
+ */
+#define	LXD_HASH_SZ	128
+
+/*
+ * Bucket index for a "back" node, derived from the address of its vnode
+ * pointer 'v': the low 10 bits are discarded and the result is masked to the
+ * table size.
+ */
+#define	LXD_BACK_HASH(v)	((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1))
+
+#define	LXD_NM_HASH(ldn, name, hash)				\
+	{							\
+		char Xc, *Xcp;					\
+		hash = (uint_t)(uintptr_t)(ldn) >> 8;		\
+		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
+			hash = (hash << 4) + hash + (uint_t)Xc;	\
+		hash &= (LXD_HASH_SZ - 1);			\
+	}
+
+
+/*
+ * Kind of file an lxd_node represents.  LXDNT_NONE is the zero value, so a
+ * zero-filled node has no type until one is assigned.
+ */
+enum lxd_node_type	{ LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT };
+
+/*
+ * lxd per-mount data structure.
+ *
+ * Unless noted otherwise below, fields are protected by lxdm_contents.
+ * File renames on a specific file system are protected by lxdm_renamelck.
+ * The two hash tables are protected per bucket by lxdm_hash_mutex[].
+ */
+typedef struct lxd_mnt {
+	struct vfs	*lxdm_vfsp;	/* filesystem's vfs struct */
+	struct lxd_node *lxdm_rootnode;	/* root lxd_node */
+	char 		*lxdm_mntpath;	/* name of lxd mount point */
+	dev_t		lxdm_dev;	/* unique dev # of mounted `device' */
+	kmutex_t	lxdm_contents;	/* per-mount lock */
+	kmutex_t	lxdm_renamelck;	/* rename lock for this mount */
+	uint_t		lxdm_gen;	/* node ID source for files */
+
+	/* protects buckets in both "dir ent" and "back" hash tables */
+	kmutex_t	lxdm_hash_mutex[LXD_HASH_SZ];
+
+	/* per-mount data for "back" vnodes in the fs */
+	uint_t		lxdm_back_refcnt; /* # outstanding "back" vnodes */
+	struct lxd_node *lxdm_back_htable[LXD_HASH_SZ];
+
+	/*
+	 * Per-mount directory data for "front" nodes in the fs.
+	 * Each front node has a directory entry but directory entries can live
+	 * on either front or back nodes.
+	 */
+	uint_t		lxdm_dent_refcnt; /* # outstanding dir ents */
+	struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ];
+} lxd_mnt_t;
+
+/*
+ * lxd_node is the file system dependent node for lxd.
+ *
+ * The node is used to represent both front and back files. For front files
+ * the node can represent either a directory or symlink.
+ */
+typedef struct lxd_node {
+	enum lxd_node_type	lxdn_type;
+
+	/* Data for "front" nodes */
+	struct lxd_node		*lxdn_prev;	/* linked list of lxd nodes */
+	struct lxd_node		*lxdn_next;	/* linked list of lxd nodes */
+	struct lxd_node		*lxdn_parent;	/* dir containing this node */
+	krwlock_t		lxdn_rwlock;	/* serialize mods/dir updates */
+	kmutex_t		lxdn_tlock;	/* time, flag, and nlink lock */
+
+	/* these could be in a union ala tmpfs but not really necessary */
+	uint_t			lxdn_dirents;	/* number of dirents */
+	struct lxd_dirent	*lxdn_dir;	/* dirent list */
+	char			*lxdn_symlink;	/* pointer to symlink */
+	struct vattr		lxdn_attr;	/* attributes */
+
+	/* Hash table link */
+	struct lxd_node		*lxdn_hnxt;	/* link in per-mount entry */
+						/* hash table */
+	vnode_t 		*lxdn_vnode;	/* vnode for this lxd_node */
+
+	vnode_t			*lxdn_real_vp;	/* back file - real vnode */
+} lxd_node_t;
+
+/*
+ * Attributes
+ *
+ * Shorthand names for the members of the struct vattr embedded in each
+ * lxd_node as lxdn_attr, so they can be used as if they were direct fields
+ * of the node (e.g. ldn->lxdn_uid).
+ */
+#define	lxdn_mask	lxdn_attr.va_mask
+#define	lxdn_mode	lxdn_attr.va_mode
+#define	lxdn_uid	lxdn_attr.va_uid
+#define	lxdn_gid	lxdn_attr.va_gid
+#define	lxdn_fsid	lxdn_attr.va_fsid
+#define	lxdn_nodeid	lxdn_attr.va_nodeid
+#define	lxdn_nlink	lxdn_attr.va_nlink
+#define	lxdn_size	lxdn_attr.va_size
+#define	lxdn_atime	lxdn_attr.va_atime
+#define	lxdn_mtime	lxdn_attr.va_mtime
+#define	lxdn_ctime	lxdn_attr.va_ctime
+#define	lxdn_rdev	lxdn_attr.va_rdev
+#define	lxdn_blksize	lxdn_attr.va_blksize
+#define	lxdn_nblocks	lxdn_attr.va_nblocks
+#define	lxdn_seq	lxdn_attr.va_seq
+
+/*
+ * lx devfs conversion macros
+ *
+ * VFSTOLXDM/VTOLXDM: vfs or vnode -> per-mount lxd_mnt_t.
+ * VTOLDN/LDNTOV: vnode <-> lxd_node_t.
+ * ldnode_hold/ldnode_rele: take or drop a hold on the node's vnode.
+ */
+#define	VFSTOLXDM(vfsp)		((lxd_mnt_t *)(vfsp)->vfs_data)
+#define	VTOLXDM(vp)		((lxd_mnt_t *)(vp)->v_vfsp->vfs_data)
+#define	VTOLDN(vp)		((lxd_node_t *)(vp)->v_data)
+#define	LDNTOV(ln)		((ln)->lxdn_vnode)
+#define	ldnode_hold(ln)		VN_HOLD(LDNTOV(ln))
+#define	ldnode_rele(ln)		VN_RELE(LDNTOV(ln))
+
+/* The real (underlying) vnode recorded in the lxd_node behind 'vp'. */
+#define	REALVP(vp)		(VTOLDN(vp)->lxdn_real_vp)
+
+/*
+ * Front directories are made up of a linked list of lxd_dirent structures
+ * hanging off directory lxd_nodes.  File names are not fixed length, but are
+ * null terminated.
+ */
+typedef struct lxd_dirent {
+	lxd_node_t		*lddir_node;	/* lxd node for this file */
+	struct lxd_dirent	*lddir_next;	/* next directory entry */
+	struct lxd_dirent	*lddir_prev;	/* prev directory entry */
+	uint_t			lddir_offset;	/* "offset" of dir entry */
+	uint_t			lddir_hash;	/* a hash of lddir_name */
+	struct lxd_dirent	*lddir_link;	/* linked via hash table */
+	lxd_node_t		*lddir_parent;	/* parent, dir we are in */
+	char			*lddir_name;	/* null terminated */
+} lxd_dirent_t;
+
+/*
+ * Operation codes passed to lxd_direnter() (de_op) and lxd_dirdelete()
+ * (dr_op) to say why an entry is being added or removed.
+ */
+enum de_op	{ DE_CREATE, DE_MKDIR, DE_RENAME };	/* direnter ops */
+enum dr_op	{ DR_REMOVE, DR_RMDIR, DR_RENAME };	/* dirremove ops */
+
+/*
+ * One row of a minor-number translation list: associates an illumos minor
+ * node (path and number) with the Linux major/minor pair recorded for it.
+ */
+typedef struct lxd_minor_translator {
+	char	*lxd_mt_path;		/* illumos minor node path */
+	minor_t	lxd_mt_minor;		/* illumos minor node number */
+	int	lxd_mt_lx_major;	/* linux major node number */
+	int	lxd_mt_lx_minor;	/* linux minor node number */
+} lxd_minor_translator_t;
+
+/*
+ * Selects which member of the lxd_xl_minor union in lxd_devt_translator_t is
+ * meaningful: a translation list (DTT_LIST) or a custom translation function
+ * (DTT_CUSTOM).
+ */
+enum lxd_xl_tp	{ DTT_INVALID, DTT_LIST, DTT_CUSTOM };
+
+/* Shorthand accessors for the members of the lxd_xl_minor union. */
+#define	xl_list		lxd_xl_minor.lxd_xl_list
+#define	xl_custom	lxd_xl_minor.lxd_xl_custom
+
+/*
+ * Per-driver device number translation entry.
+ */
+typedef struct lxd_devt_translator {
+	char		*lxd_xl_driver;	/* driver name */
+	major_t		lxd_xl_major;	/* driver number */
+
+	enum lxd_xl_tp	lxd_xl_type;	/* dictates how we interpret xl_minor */
+	union {
+		uintptr_t		lxd_xl_foo; /* required to compile */
+		lxd_minor_translator_t	*lxd_xl_list;
+		void			(*lxd_xl_custom)(dev_t, dev_t *);
+	} lxd_xl_minor;
+} lxd_devt_translator_t;
+
+/* Globals and functions shared between the lxd source files. */
+extern struct vnodeops *lxd_vnodeops;
+extern lxd_devt_translator_t lxd_devt_translators[];
+
+vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *);
+void lxd_free_back_node(lxd_node_t *);
+int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *);
+int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *,
+	lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *,
+	caller_context_t *);
+void lxd_dirinit(lxd_node_t *, lxd_node_t *, cred_t *);
+int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *);
+void lxd_dirtrunc(lxd_node_t *);
+void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *);
+int lxd_naccess(void *, int, cred_t *);
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _LXD_H */
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c
new file mode 100644
index 0000000000..9e67f988bc
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c
@@ -0,0 +1,1004 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+
+#include "lxd.h"
+
+/*
+ * NOTE(review): LXD_HASH_SIZE and LXD_MUTEX_SIZE are not referenced in the
+ * visible part of this file; the per-mount hash tables declared in lxd.h are
+ * sized by LXD_HASH_SZ instead.  Confirm whether these are still needed.
+ */
+#define	LXD_HASH_SIZE	8192		/* must be power of 2 */
+#define	LXD_MUTEX_SIZE	64
+
+
+/* Number of bits between the owner, group and other permission triplets. */
+#define	MODESHIFT	3
+
+/*
+ * Tells lxd_find_dirent() whether to take a hold on the node it finds.
+ */
+typedef enum lxd_nodehold {
+	NOHOLD,
+	HOLD
+} lxd_nodehold_t;
+
+/*
+ * The following functions maintain the per-mount "front" files.
+ */
+/*
+ * Insert directory entry 'de' at the head of its bucket in the per-mount
+ * dirent hash table, recording the bucket index in de->lddir_hash and bumping
+ * the mount's count of outstanding dirents.
+ */
+static void
+lxd_save_dirent(lxd_dirent_t *de)
+{
+	lxd_mnt_t	*mnt = VTOLXDM(LDNTOV(de->lddir_parent));
+	lxd_dirent_t	**headp;
+	kmutex_t	*lock;
+	uint_t		bucket;
+
+	LXD_NM_HASH(de->lddir_parent, de->lddir_name, bucket);
+	de->lddir_hash = bucket;
+
+	headp = &mnt->lxdm_dent_htable[bucket];
+	lock = &mnt->lxdm_hash_mutex[bucket];
+
+	mutex_enter(lock);
+	ASSERT(de->lddir_link == NULL);
+	de->lddir_link = *headp;
+	*headp = de;
+	mutex_exit(lock);
+
+	atomic_inc_32(&mnt->lxdm_dent_refcnt);
+}
+
+/*
+ * Unlink directory entry 'de' from its bucket in the per-mount dirent hash
+ * table and drop the mount's count of outstanding dirents.  The entry must
+ * be present in the bucket named by de->lddir_hash; the search does not
+ * check for the end of the chain.
+ */
+static void
+lxd_rm_dirent(lxd_dirent_t *de)
+{
+	lxd_mnt_t	*mnt = VTOLXDM(LDNTOV(de->lddir_parent));
+	uint_t		bucket = de->lddir_hash;
+	kmutex_t	*lock = &mnt->lxdm_hash_mutex[bucket];
+	lxd_dirent_t	**linkp;
+
+	mutex_enter(lock);
+	/* Find the link that points at 'de' and route it around the entry. */
+	for (linkp = &mnt->lxdm_dent_htable[bucket]; *linkp != de;
+	    linkp = &(*linkp)->lddir_link)
+		continue;
+	*linkp = de->lddir_link;
+	de->lddir_link = NULL;
+	mutex_exit(lock);
+
+	ASSERT(mnt->lxdm_dent_refcnt > 0);
+	atomic_dec_32(&mnt->lxdm_dent_refcnt);
+}
+
+/*
+ * Look up entry 'name' in directory 'parent' via the per-mount dirent hash.
+ *
+ * Returns the matching dirent, or NULL if there is none.  On a match the
+ * entry's node is stored in *found (when 'found' is non-NULL) and, when
+ * do_hold is HOLD, a hold is taken on that node while the bucket lock is
+ * still held.  *found is left untouched when nothing matches.
+ */
+static lxd_dirent_t *
+lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold,
+    lxd_node_t **found)
+{
+	lxd_mnt_t	*mnt = VTOLXDM(LDNTOV(parent));
+	lxd_dirent_t	*de;
+	lxd_node_t	*node;
+	kmutex_t	*lock;
+	uint_t		bucket;
+
+	LXD_NM_HASH(parent, name, bucket);
+	lock = &mnt->lxdm_hash_mutex[bucket];
+
+	mutex_enter(lock);
+	for (de = mnt->lxdm_dent_htable[bucket]; de != NULL;
+	    de = de->lddir_link) {
+		if (de->lddir_hash != bucket || de->lddir_parent != parent ||
+		    strcmp(de->lddir_name, name) != 0)
+			continue;
+
+		node = de->lddir_node;
+		if (do_hold == HOLD) {
+			ASSERT(node != NULL);
+			ldnode_hold(node);
+		}
+		if (found != NULL)
+			*found = node;
+		break;
+	}
+	mutex_exit(lock);
+
+	/* 'de' is NULL here if the chain was exhausted without a match. */
+	return (de);
+}
+
+/*
+ * Check whether credential 'cr' may access the lxd_node 'vcp' with the
+ * requested 'mode'.
+ *
+ * For front nodes the decision is made by secpolicy_vnode_access2() using the
+ * owner, group or other permission bits stored in the node, selected by how
+ * 'cr' relates to the node's uid/gid.  For back nodes the request is passed
+ * to VOP_ACCESS() on the real vnode.
+ */
+int
+lxd_naccess(void *vcp, int mode, cred_t *cr)
+{
+	lxd_node_t *ldn = vcp;
+	int shift;
+
+	/*
+	 * Pick the permission triplet: owner (no shift), group (one
+	 * MODESHIFT) or other (two MODESHIFTs).
+	 */
+	if (crgetuid(cr) == ldn->lxdn_uid)
+		shift = 0;
+	else if (groupmember(ldn->lxdn_gid, cr) != 0)
+		shift = MODESHIFT;
+	else
+		shift = MODESHIFT + MODESHIFT;
+
+	if (ldn->lxdn_type != LXDNT_FRONT) {
+		ASSERT(ldn->lxdn_type == LXDNT_BACK);
+		return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL));
+	}
+
+	return (secpolicy_vnode_access2(cr, LDNTOV(ldn), ldn->lxdn_uid,
+	    ldn->lxdn_mode << shift, mode));
+}
+
+/*
+ * Search bucket 'hash' of the per-mount back table for the node whose real
+ * vnode is 'vp'.  The caller must hold the bucket's mutex.  On a match a hold
+ * is taken on the node's own vnode and the node is returned; otherwise NULL.
+ */
+static lxd_node_t *
+lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm)
+{
+	lxd_node_t *l;
+
+	ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+	l = lxdm->lxdm_back_htable[hash];
+	while (l != NULL && l->lxdn_real_vp != vp)
+		l = l->lxdn_hnxt;
+
+	if (l != NULL) {
+		ASSERT(l->lxdn_type == LXDNT_BACK);
+		VN_HOLD(LDNTOV(l));
+	}
+
+	return (l);
+}
+
+/*
+ * Push back node 'l' onto the head of bucket 'hash' in the per-mount back
+ * table and count it in lxdm_back_refcnt.  The caller must hold the bucket's
+ * mutex.
+ */
+static void
+lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm)
+{
+	lxd_node_t **headp = &lxdm->lxdm_back_htable[hash];
+
+	ASSERT(l->lxdn_type == LXDNT_BACK);
+	ASSERT(l->lxdn_real_vp != NULL);
+	ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+	l->lxdn_hnxt = *headp;
+	*headp = l;
+
+	atomic_inc_32(&lxdm->lxdm_back_refcnt);
+}
+
+
+/*
+ * Return the lxd vnode that shadows the real vnode 'vp', creating and
+ * caching a new back node if one does not exist yet.
+ *
+ * If a node is already cached, it is returned with an additional hold and
+ * VN_RELE() is applied to 'vp'.  Otherwise a new node is built around 'vp'
+ * and inserted into the back hash table; no VN_RELE() is applied to 'vp' on
+ * that path.
+ */
+struct vnode *
+lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm)
+{
+	lxd_node_t *l;
+	vnode_t *nvp;
+	kmutex_t *hmtx;
+	uint_t hash;
+
+	hash = LXD_BACK_HASH(vp);	/* Note: hashing with realvp */
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+	mutex_enter(hmtx);
+
+	l = lxd_find_back(vp, hash, lxdm);
+	if (l != NULL) {
+		/* Cache hit: lxd_find_back() already held the lxd vnode. */
+		VN_RELE(vp);
+		mutex_exit(hmtx);
+		return (LDNTOV(l));
+	}
+
+	l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+	nvp = vn_alloc(KM_SLEEP);
+
+	rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL);
+
+	l->lxdn_type = LXDNT_BACK;
+	l->lxdn_real_vp = vp;
+	l->lxdn_vnode = nvp;
+
+	/* The new vnode takes its type, rdev and selected flags from 'vp'. */
+	VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type, vp->v_rdev);
+	nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN));
+	vn_setops(nvp, lxd_vnodeops);
+	nvp->v_data = (caddr_t)l;
+
+	lxd_save_back(l, hash, lxdm);
+	/*
+	 * NOTE(review): this is applied to the real vnode, whereas
+	 * lxd_free_back_node() calls vn_invalid() on the lxd vnode --
+	 * confirm which vnode is intended here.
+	 */
+	vn_exists(vp);
+
+	mutex_exit(hmtx);
+	return (LDNTOV(l));
+}
+
+/*
+ * Drop a hold on back node 'lp'.  If other holds remain, only the vnode's
+ * count is decremented.  Otherwise the node is unlinked from the per-mount
+ * back hash table and destroyed, and VN_RELE() is applied to the real vnode
+ * it shadowed.  Panics if the node is not found in its hash bucket.
+ */
+void
+lxd_free_back_node(lxd_node_t *lp)
+{
+	vnode_t *vp = LDNTOV(lp);
+	vnode_t *realvp = REALVP(vp);
+	lxd_mnt_t *lxdm = VTOLXDM(vp);
+	lxd_node_t **linkp;
+	lxd_node_t *l;
+	kmutex_t *hmtx;
+	uint_t hash;
+
+	/* in lxd_make_back_node we call lxd_find_back with the realvp */
+	hash = LXD_BACK_HASH(realvp);
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+	mutex_enter(hmtx);
+
+	mutex_enter(&vp->v_lock);
+	if (vp->v_count > 1) {
+		vp->v_count--;	/* release our hold from vn_rele */
+		mutex_exit(&vp->v_lock);
+		mutex_exit(hmtx);
+		return;
+	}
+	mutex_exit(&vp->v_lock);
+
+	/*
+	 * Walk the bucket keeping a pointer to the link that leads to the
+	 * current node, so the node can be spliced out in one assignment.
+	 */
+	linkp = &lxdm->lxdm_back_htable[hash];
+	while ((l = *linkp) != NULL) {
+		if (l != lp) {
+			linkp = &l->lxdn_hnxt;
+			continue;
+		}
+
+		ASSERT(l->lxdn_type == LXDNT_BACK);
+		ASSERT(lxdm->lxdm_back_refcnt > 0);
+
+		atomic_dec_32(&lxdm->lxdm_back_refcnt);
+		vn_invalid(vp);
+
+		*linkp = l->lxdn_hnxt;
+
+		mutex_exit(hmtx);
+		rw_destroy(&l->lxdn_rwlock);
+		mutex_destroy(&l->lxdn_tlock);
+		kmem_free(l, sizeof (lxd_node_t));
+		vn_free(vp);
+		VN_RELE(realvp);
+		return;
+	}
+
+	panic("lxd_free_back_node");
+	/*NOTREACHED*/
+}
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * 0 is returned on success and *foundnp points to the found lxd_node with
+ * its vnode held.  An empty name resolves to 'parent' itself.  On failure
+ * *foundnp is NULL and the return value is ENOTDIR (parent is not a
+ * directory), the error from the VEXEC access check, or ENOENT.
+ */
+int
+lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr)
+{
+	int error;
+
+	*foundnp = NULL;
+
+	if (parent->lxdn_vnode->v_type != VDIR)
+		return (ENOTDIR);
+
+	error = lxd_naccess(parent, VEXEC, cr);
+	if (error != 0)
+		return (error);
+
+	if (*name == '\0') {
+		ldnode_hold(parent);
+		*foundnp = parent;
+		return (0);
+	}
+
+	/*
+	 * Search the directory for the matching name.  lxd_find_dirent()
+	 * takes the hash bucket lock for the duration of the search and
+	 * passes back the lxd_node with a hold on it.
+	 */
+	if (lxd_find_dirent(name, parent, HOLD, foundnp) == NULL)
+		return (ENOENT);
+
+	ASSERT(*foundnp);
+	return (0);
+}
+
+/*
+ * Check if the source directory is in the path of the target directory.
+ * The target directory is locked by the caller.
+ *
+ * Walks the lxdn_parent chain upward from 'toparent'.  Returns EINVAL if
+ * 'fromnode' is met on the way (e.g. "mv c c/d"), ENOENT if a NULL parent
+ * is encountered, and 0 once the root (a node that is its own parent) is
+ * reached.  Each node visited is held only while it is being examined.
+ */
+static int
+lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent, cred_t *cr)
+{
+	lxd_node_t *cur, *up;
+	boolean_t at_root;
+
+	ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock));
+	ASSERT(toparent->lxdn_vnode->v_type == VDIR);
+
+	cur = toparent->lxdn_parent;
+	if (cur == NULL)
+		return (ENOENT);
+	ldnode_hold(cur);
+
+	if (cur == toparent) {
+		/* root of fs.  search trivially satisfied. */
+		ldnode_rele(cur);
+		return (0);
+	}
+
+	for (;;) {
+		/*
+		 * Return error for cases like "mv c c/d",
+		 * "mv c c/d/e" and so on.
+		 */
+		if (cur == fromnode) {
+			ldnode_rele(cur);
+			return (EINVAL);
+		}
+
+		up = cur->lxdn_parent;
+		if (up == NULL) {
+			ldnode_rele(cur);
+			return (ENOENT);
+		}
+		ldnode_hold(up);
+
+		/*
+		 * We're okay if we traverse the directory tree up to
+		 * the root directory and don't run into the
+		 * parent directory.
+		 */
+		at_root = (up == cur) ? B_TRUE : B_FALSE;
+		ldnode_rele(cur);
+		if (at_root) {
+			ldnode_rele(up);
+			return (0);
+		}
+
+		cur = up;
+	}
+}
+
+/*
+ * Allocate and initialize a new front node for a create or mkdir in 'dir'.
+ *
+ * The node is owned by the uid/gid of 'cred' and takes its vnode type and
+ * any explicitly requested access/modification times from 'va'.  For
+ * DE_MKDIR the new directory's entry list is set up with lxd_dirinit().
+ * Returns EOVERFLOW if a requested timestamp overflows, otherwise 0 with
+ * the node stored in *newnode.
+ */
+static int
+lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va,
+    enum de_op op, lxd_node_t **newnode, struct cred *cred)
+{
+	lxd_node_t *node;
+
+	ASSERT(va != NULL);
+
+	if ((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime))
+		return (EOVERFLOW);
+	if ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))
+		return (EOVERFLOW);
+
+	node = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+	node->lxdn_type = LXDNT_FRONT;
+	lxd_node_init(lxdm, node, NULL, va, cred);
+
+	/* Front nodes are not devices. */
+	node->lxdn_rdev = NODEV;
+	node->lxdn_vnode->v_rdev = node->lxdn_rdev;
+	node->lxdn_vnode->v_type = va->va_type;
+
+	node->lxdn_uid = crgetuid(cred);
+	node->lxdn_gid = crgetgid(cred);
+	node->lxdn_nodeid = lxdm->lxdm_gen++;
+
+	/* Honor explicitly supplied timestamps. */
+	if (va->va_mask & AT_ATIME)
+		node->lxdn_atime = va->va_atime;
+	if (va->va_mask & AT_MTIME)
+		node->lxdn_mtime = va->va_mtime;
+
+	if (op == DE_MKDIR)
+		lxd_dirinit(dir, node, cred);
+
+	*newnode = node;
+	return (0);
+}
+
+/*
+ * Add a directory entry called 'name' for node 'ldn' to directory 'dir'.
+ *
+ * The entry and its name are allocated as a single chunk, entered in the
+ * per-mount dirent hash and linked into dir's entry list at the first free
+ * offset "slot".  ldn's parent pointer is set to 'dir' and dir's size,
+ * entry count, mtime and ctime are updated.  The 'op' argument is not
+ * examined here.
+ *
+ * Returns 0 on success, ENOENT if 'dir' no longer has an entry list, EXDEV
+ * if the two nodes are on different file systems, or ENOSPC if the
+ * non-sleeping allocation fails.
+ */
+static int
+lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name, enum de_op op)
+{
+	lxd_dirent_t	*dp, *pdp;
+	size_t		namelen, alloc_size;
+	timestruc_t	now;
+
+	/*
+	 * Make sure the parent directory wasn't removed from
+	 * underneath the caller.
+	 */
+	if (dir->lxdn_dir == NULL)
+		return (ENOENT);
+
+	/* Check that everything is on the same filesystem. */
+	if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp)
+		return (EXDEV);
+
+	/* Allocate and initialize directory entry */
+	namelen = strlen(name) + 1;
+	alloc_size = namelen + sizeof (lxd_dirent_t);
+	dp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
+	if (dp == NULL)
+		return (ENOSPC);
+
+	ldn->lxdn_parent = dir;
+
+	dir->lxdn_size += alloc_size;
+	dir->lxdn_dirents++;
+	dp->lddir_node = ldn;
+	dp->lddir_parent = dir;
+
+	/* The directory entry and its name were allocated sequentially. */
+	dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t);
+	(void) strcpy(dp->lddir_name, name);
+
+	lxd_save_dirent(dp);
+
+	/*
+	 * Some utilities expect the size of a directory to remain
+	 * somewhat static.  For example, a routine which removes
+	 * subdirectories between calls to readdir(); the size of the
+	 * directory changes from underneath it and so the real
+	 * directory offset in bytes is invalid.  To circumvent
+	 * this problem, we initialize a directory entry with a
+	 * phony offset, and use this offset to determine end of
+	 * file in lxd_readdir.
+	 */
+	pdp = dir->lxdn_dir->lddir_prev;
+	/*
+	 * Install at first empty "slot" in directory list.
+	 */
+	while (pdp->lddir_next != NULL &&
+	    (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) {
+		ASSERT(pdp->lddir_next != pdp);
+		ASSERT(pdp->lddir_prev != pdp);
+		ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset);
+		pdp = pdp->lddir_next;
+	}
+	dp->lddir_offset = pdp->lddir_offset + 1;
+
+	/*
+	 * If we're at the end of the dirent list and the offset (which
+	 * is necessarily the largest offset in this directory) is more
+	 * than twice the number of dirents, that means the directory is
+	 * 50% holes.  At this point we reset the slot pointer back to
+	 * the beginning of the directory so we start using the holes.
+	 * The idea is that if there are N dirents, there must also be
+	 * N holes, so we can satisfy the next N creates by walking at
+	 * most 2N entries; thus the average cost of a create is constant.
+	 * Note that we use the first dirent's lddir_prev as the roving
+	 * slot pointer; it's ugly, but it saves a word in every dirent.
+	 */
+	if (pdp->lddir_next == NULL &&
+	    pdp->lddir_offset > 2 * dir->lxdn_dirents)
+		dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next;
+	else
+		dir->lxdn_dir->lddir_prev = dp;
+
+	ASSERT(pdp->lddir_next != pdp);
+	ASSERT(pdp->lddir_prev != pdp);
+
+	/* Splice dp into the list immediately after pdp. */
+	dp->lddir_next = pdp->lddir_next;
+	if (dp->lddir_next) {
+		dp->lddir_next->lddir_prev = dp;
+	}
+	dp->lddir_prev = pdp;
+	pdp->lddir_next = dp;
+
+	ASSERT(dp->lddir_next != dp);
+	ASSERT(dp->lddir_prev != dp);
+	ASSERT(pdp->lddir_next != pdp);
+	ASSERT(pdp->lddir_prev != pdp);
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+
+	return (0);
+}
+
+/*
+ * Enter a directory entry for 'name' into directory 'dir'
+ *
+ * For DE_CREATE and DE_MKDIR a new node is made from 'va' and entered; if
+ * 'rnp' is non-NULL the new node is passed back through it still held,
+ * otherwise the hold is dropped.  If the name already exists and 'rnp' is
+ * non-NULL, EEXIST is returned and the existing node is passed back held.
+ *
+ * For DE_RENAME the existing node 'ldn' is entered under the new name.  Its
+ * link count is bumped up front and restored if anything fails.  If the
+ * name already exists in 'dir' the colliding entry is deleted first.
+ *
+ * The caller must hold dir->lxdn_rwlock as writer.
+ *
+ * Returns 0 on success.
+ */
+int
+lxd_direnter(
+	lxd_mnt_t	*lxdm,
+	lxd_node_t	*dir,		/* target directory to make entry in */
+	char		*name,		/* name of entry */
+	enum de_op	op,		/* entry operation */
+	lxd_node_t	*fromparent,    /* original directory if rename */
+	lxd_node_t	*ldn,		/* existing lxd_node, if rename */
+	struct vattr	*va,
+	lxd_node_t	**rnp,		/* return lxd_node, if create/mkdir */
+	cred_t		*cr,
+	caller_context_t *ctp)
+{
+	lxd_dirent_t *dirp;
+	lxd_node_t *found = NULL;
+	int error = 0;
+	char *s;
+
+	/* lxdn_rwlock is held to serialize direnter and dirdeletes */
+	ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	/*
+	 * Don't allow '/' characters in pathname component,
+	 */
+	for (s = name; *s; s++)
+		if (*s == '/')
+			return (EACCES);
+
+	if (name[0] == '\0')
+		panic("lxd_direnter: NULL name");
+
+	/*
+	 * For rename lock the source entry and check the link count
+	 * to see if it has been removed while it was unlocked.
+	 */
+	if (op == DE_RENAME) {
+		mutex_enter(&ldn->lxdn_tlock);
+		if (ldn->lxdn_nlink == 0) {
+			mutex_exit(&ldn->lxdn_tlock);
+			return (ENOENT);
+		}
+
+		if (ldn->lxdn_nlink == MAXLINK) {
+			mutex_exit(&ldn->lxdn_tlock);
+			return (EMLINK);
+		}
+		ldn->lxdn_nlink++;
+		gethrestime(&ldn->lxdn_ctime);
+		mutex_exit(&ldn->lxdn_tlock);
+	}
+
+	/*
+	 * This might be a "dangling detached directory" (it could have been
+	 * removed, but a reference to it kept in u_cwd). Don't bother
+	 * searching it, and with any luck the user will get tired of dealing
+	 * with us and cd to some absolute pathway (thus in ufs, too).
+	 */
+	if (dir->lxdn_nlink == 0) {
+		error = ENOENT;
+		goto out;
+	}
+
+	/*
+	 * If this is a rename of a directory and the parent is different
+	 * (".." must be changed), then the source directory must not be in the
+	 * directory hierarchy above the target, as this would orphan
+	 * everything below the source directory.
+	 */
+	if (op == DE_RENAME) {
+		if (ldn == dir) {
+			error = EINVAL;
+			goto out;
+		}
+		if ((ldn->lxdn_vnode->v_type) == VDIR) {
+			if ((fromparent != dir) &&
+			    (error = lxd_dircheckpath(ldn, dir, cr)) != 0) {
+				goto out;
+			}
+		}
+	}
+
+	/* Search for an existing entry; a match comes back held. */
+	dirp = lxd_find_dirent(name, dir, HOLD, &found);
+	if (dirp != NULL) {
+		ASSERT(found != NULL);
+		switch (op) {
+		case DE_CREATE:
+		case DE_MKDIR:
+			if (rnp != NULL) {
+				*rnp = found;
+				error = EEXIST;
+			} else {
+				ldnode_rele(found);
+			}
+			break;
+
+		case DE_RENAME:
+			/*
+			 * Note that we only hit this path when we're renaming
+			 * a symlink from one directory to another and there is
+			 * a pre-existing symlink as the target. lxd_rename
+			 * will unlink the src from the original directory but
+			 * here we need to unlink the dest that we collided
+			 * with, then create the new directory entry as we do
+			 * below when there is no pre-existing symlink.
+			 */
+			if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) {
+				/*
+				 * Drop the hold lxd_find_dirent() took on
+				 * the colliding node; nothing else will.
+				 */
+				ldnode_rele(found);
+				goto out;
+			}
+
+			ASSERT(found->lxdn_vnode->v_type == VLNK);
+			/* dir rw lock is already held and asserted above */
+			rw_enter(&found->lxdn_rwlock, RW_WRITER);
+			error = lxd_dirdelete(dir, found, name, DR_RENAME, cr);
+			rw_exit(&found->lxdn_rwlock);
+			ldnode_rele(found);
+			if (error != 0)
+				goto out;
+
+			error = lxd_diraddentry(dir, ldn, name, op);
+			if (error == 0 && rnp != NULL)
+				*rnp = ldn;
+			break;
+		}
+	} else {
+
+		/*
+		 * The directory entry does not exist, but the node might if
+		 * this is a rename. Check write permission in directory to
+		 * see if entry can be created.
+		 */
+		if ((error = lxd_naccess(dir, VWRITE, cr)) != 0)
+			goto out;
+		if (op == DE_CREATE || op == DE_MKDIR) {
+			/*
+			 * Make new lxd_node and directory entry as required.
+			 */
+			error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr);
+			if (error)
+				goto out;
+		}
+
+		error = lxd_diraddentry(dir, ldn, name, op);
+		if (error != 0) {
+			if (op == DE_CREATE || op == DE_MKDIR) {
+				/*
+				 * Unmake the inode we just made.
+				 */
+				rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+				if ((ldn->lxdn_vnode->v_type) == VDIR) {
+					ASSERT(dirp == NULL);
+					/*
+					 * cleanup allocs made by lxd_dirinit
+					 */
+					lxd_dirtrunc(ldn);
+				}
+				mutex_enter(&ldn->lxdn_tlock);
+				ldn->lxdn_nlink = 0;
+				gethrestime(&ldn->lxdn_ctime);
+				mutex_exit(&ldn->lxdn_tlock);
+				rw_exit(&ldn->lxdn_rwlock);
+				ldnode_rele(ldn);
+				ldn = NULL;
+			}
+		} else if (rnp != NULL) {
+			*rnp = ldn;
+		} else if (op == DE_CREATE || op == DE_MKDIR) {
+			ldnode_rele(ldn);
+		}
+	}
+
+out:
+	if (error && op == DE_RENAME) {
+		/* Undo bumped link count. */
+		mutex_enter(&ldn->lxdn_tlock);
+		ldn->lxdn_nlink--;
+		gethrestime(&ldn->lxdn_ctime);
+		mutex_exit(&ldn->lxdn_tlock);
+	}
+	return (error);
+}
+
+/*
+ * Delete entry ldn of name "nm" from parent dir. This is used to both remove
+ * a directory and to remove file nodes within the directory (by recursively
+ * calling itself). It frees the dir entry space and decrements link count on
+ * lxd_node(s).
+ *
+ * The caller must hold both dir->lxdn_rwlock and ldn->lxdn_rwlock as writer
+ * (asserted below). With op == DR_RMDIR the entries of ldn other than "."
+ * and ".." are removed first and ldn is truncated at the end; for any op the
+ * single entry "nm" is then unlinked from dir.
+ *
+ * Return 0 on success, EINVAL for ".", EEXIST for "..", ENOENT if dir has no
+ * entry list, or the error from lxd_naccess() when VEXEC|VWRITE access to dir
+ * is denied. A missing or mismatched entry is fatal (VERIFY).
+ */
+int
+lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op,
+    cred_t *cred)
+{
+	lxd_dirent_t *dirp;
+	int error;
+	size_t namelen;
+	lxd_node_t *fndnp;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+	ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	if (nm[0] == '\0')
+		panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn);
+
+	/*
+	 * return error when removing . and ..
+	 */
+	if (nm[0] == '.') {
+		if (nm[1] == '\0')
+			return (EINVAL);
+		if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* thus in ufs */
+	}
+
+	if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0)
+		return (error);
+
+	if (dir->lxdn_dir == NULL)
+		return (ENOENT);
+
+	if (op == DR_RMDIR) {
+		/*
+		 * This is the top-level removal of a directory. Start by
+		 * removing any file entries from the dir. We do this by
+		 * recursively calling back into this function with a different
+		 * op code. The caller of this function has already verified
+		 * that it is safe to remove this directory.
+		 */
+		/* Note: this declaration shadows the function-level dirp. */
+		lxd_dirent_t *dirp;
+
+		ASSERT(ldn->lxdn_vnode->v_type == VDIR);
+
+		dirp = ldn->lxdn_dir;
+		while (dirp) {
+			lxd_node_t *dn;
+			lxd_dirent_t *nextp;
+
+			if (strcmp(dirp->lddir_name, ".") == 0 ||
+			    strcmp(dirp->lddir_name, "..") == 0) {
+				dirp = dirp->lddir_next;
+				continue;
+			}
+
+			/*
+			 * Remember the successor before recursing: the
+			 * recursive call frees the entry it removes.
+			 */
+			dn = dirp->lddir_node;
+			nextp = dirp->lddir_next;
+
+			/*
+			 * NOTE(review): the recursive call asserts that dn's
+			 * rwlock is write-held, but no rw_enter on dn is
+			 * visible here -- confirm on DEBUG kernels.
+			 *
+			 * NOTE(review): the result stored in error is never
+			 * examined; entries that could not be removed here are
+			 * still torn down by lxd_dirtrunc() at the end of the
+			 * DR_RMDIR path.
+			 */
+			ldnode_hold(dn);
+			error = lxd_dirdelete(ldn, dn, dirp->lddir_name,
+			    DR_REMOVE, cred);
+			ldnode_rele(dn);
+
+			dirp = nextp;
+		}
+	}
+
+	/* The entry must exist and must refer to the node we were handed. */
+	dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp);
+	VERIFY(dirp != NULL);
+	VERIFY(ldn == fndnp);
+
+	lxd_rm_dirent(dirp);
+
+	/* Take dirp out of the directory list. */
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+	if (dirp->lddir_prev) {
+		dirp->lddir_prev->lddir_next = dirp->lddir_next;
+	}
+	if (dirp->lddir_next) {
+		dirp->lddir_next->lddir_prev = dirp->lddir_prev;
+	}
+
+	/*
+	 * If the roving slot pointer happens to match dirp,
+	 * point it at the previous dirent.
+	 */
+	if (dir->lxdn_dir->lddir_prev == dirp) {
+		dir->lxdn_dir->lddir_prev = dirp->lddir_prev;
+	}
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+
+	/* dirp points to the correct directory entry */
+	/* The name lives in the same allocation, right after the dirent. */
+	namelen = strlen(dirp->lddir_name) + 1;
+
+	kmem_free(dirp, sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_dirents--;
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+	ldn->lxdn_ctime = now;
+
+	/* Drop the link that the removed entry represented. */
+	ASSERT(ldn->lxdn_nlink > 0);
+	mutex_enter(&ldn->lxdn_tlock);
+	ldn->lxdn_nlink--;
+	mutex_exit(&ldn->lxdn_tlock);
+	if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) {
+		/* Releases "." and ".." (and anything left over) in ldn. */
+		lxd_dirtrunc(ldn);
+		ASSERT(ldn->lxdn_nlink == 0);
+	}
+	return (0);
+}
+
+/*
+ * Populate a newly allocated lxd_node from the attributes in vap, give it a
+ * vnode of its own and append it to the mount's list of nodes.
+ *
+ * Ownership (uid/gid) is taken from cred when one is supplied and from vap
+ * otherwise. realvp is recorded in lxdn_real_vp as-is. The node starts out
+ * with a link count of 1 and no directory entries.
+ */
+void
+lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap,
+    cred_t *cred)
+{
+	struct vnode *vp;
+	timestruc_t now;
+
+	ASSERT(vap != NULL);
+
+	/* Locks first, so the node is usable as soon as it is visible. */
+	rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Attributes derived from the caller's vattr. */
+	ldn->lxdn_attr.va_type = vap->va_type;
+	ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	ldn->lxdn_rdev = vap->va_rdev;
+	ldn->lxdn_uid = (cred == NULL) ? vap->va_uid : crgetuid(cred);
+	ldn->lxdn_gid = (cred == NULL) ? vap->va_gid : crgetgid(cred);
+
+	/* Fixed initial state. */
+	ldn->lxdn_mask = 0;
+	ldn->lxdn_nlink = 1;
+	ldn->lxdn_size = 0;
+	ldn->lxdn_nblocks = 0;
+	ldn->lxdn_seq = 0;
+	ldn->lxdn_dir = NULL;
+	ldn->lxdn_blksize = PAGESIZE;
+	ldn->lxdn_fsid = lxdm->lxdm_dev;
+	ldn->lxdn_real_vp = realvp;
+
+	/* All three timestamps start at "now". */
+	gethrestime(&now);
+	ldn->lxdn_atime = now;
+	ldn->lxdn_mtime = now;
+	ldn->lxdn_ctime = now;
+
+	/* Allocate the vnode and tie it back to this node. */
+	ldn->lxdn_vnode = vn_alloc(KM_SLEEP);
+	vp = LDNTOV(ldn);
+	vn_setops(vp, lxd_vnodeops);
+	vp->v_data = (caddr_t)ldn;
+	vp->v_vfsp = lxdm->lxdm_vfsp;
+	vp->v_type = vap->va_type;
+	vp->v_rdev = vap->va_rdev;
+
+	mutex_enter(&lxdm->lxdm_contents);
+	ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+
+	/*
+	 * Append to the per-mount node list. The root's lxdn_prev always
+	 * points at the current tail. When there is no root yet we are
+	 * creating it, and lxd_mount sets up the list itself.
+	 */
+	if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) {
+		lxd_node_t *root = lxdm->lxdm_rootnode;
+		lxd_node_t *tail = root->lxdn_prev;
+
+		ldn->lxdn_next = NULL;
+		ldn->lxdn_prev = tail;
+		tail->lxdn_next = ldn;
+		root->lxdn_prev = ldn;
+	}
+	mutex_exit(&lxdm->lxdm_contents);
+
+	vn_exists(vp);
+}
+
+/*
+ * lxd_dirinit is used internally to initialize a directory (dir) with '.' and
+ * '..' entries without checking permissions. The caller must hold the
+ * parent's lxdn_rwlock as writer; no lock is taken on dir itself.
+ *
+ * On return dir holds exactly two entries, has a link count of 2 and records
+ * parent as its lxdn_parent. The parent's link count is bumped for the new
+ * '..' entry; when parent == dir (as lxd_mount does for the root) that bump
+ * is overwritten by the assignment of 2 below.
+ *
+ * cr is not used; it is retained so existing callers keep working.
+ */
+void
+lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir, cred_t *cr)
+{
+	lxd_dirent_t *dot, *dotdot;
+	timestruc_t now;
+	lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode);
+
+	ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	/*
+	 * NOTE(review): unlike lxd_node_init(), lxdm_gen is advanced here
+	 * without lxdm_contents held, and this replaces the node id that
+	 * lxd_node_init() already assigned -- confirm this is intended.
+	 */
+	dir->lxdn_nodeid = lxdm->lxdm_gen++;
+
+	/*
+	 * Initialize the entries. Each dirent is allocated together with its
+	 * name; kmem_zalloc leaves the trailing byte as the NUL terminator.
+	 */
+	dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP);
+	dot->lddir_node = dir;
+	dot->lddir_offset = 0;
+	dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t);
+	dot->lddir_name[0] = '.';
+	dot->lddir_parent = dir;
+	lxd_save_dirent(dot);
+
+	dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP);
+	dotdot->lddir_node = parent;
+	dotdot->lddir_offset = 1;
+	dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t);
+	dotdot->lddir_name[0] = '.';
+	dotdot->lddir_name[1] = '.';
+	dotdot->lddir_parent = dir;
+	lxd_save_dirent(dotdot);
+
+	/*
+	 * Initialize directory entry list.
+	 */
+	dot->lddir_next = dotdot;
+	dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */
+	dotdot->lddir_next = NULL;
+	dotdot->lddir_prev = dot;
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+
+	/* The new '..' entry is one more link to the parent. */
+	parent->lxdn_nlink++;
+	parent->lxdn_ctime = now;
+
+	dir->lxdn_dir = dot;
+	/* 5 == strlen(".") + 1 + strlen("..") + 1, matching the allocations */
+	dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5;	/* dot and dotdot */
+	dir->lxdn_dirents = 2;
+	dir->lxdn_nlink = 2;
+	dir->lxdn_parent = parent;
+}
+
+/*
+ * lxd_dirtrunc empties a directory: every entry (including '.' and '..') is
+ * unhooked from the list and freed, and the link count of the node each entry
+ * referred to is decremented. The caller must hold dir's lxdn_rwlock as
+ * writer. On return the directory has no entry list and a size of zero.
+ */
+void
+lxd_dirtrunc(lxd_node_t *dir)
+{
+	lxd_dirent_t *ent;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	/* Keep popping the head of the list until nothing is left. */
+	while ((ent = dir->lxdn_dir) != NULL) {
+		lxd_node_t *target;
+		size_t entsz;
+
+		ASSERT(ent->lddir_next != ent);
+		ASSERT(ent->lddir_prev != ent);
+		ASSERT(ent->lddir_node);
+
+		dir->lxdn_dir = ent->lddir_next;
+
+		/* The name shares the dirent's allocation. */
+		entsz = sizeof (lxd_dirent_t) + strlen(ent->lddir_name) + 1;
+
+		/*
+		 * This entry was a link to its node, so the node's link
+		 * count goes down by one.
+		 */
+		target = ent->lddir_node;
+
+		ASSERT(target->lxdn_nlink > 0);
+		mutex_enter(&target->lxdn_tlock);
+		target->lxdn_nlink--;
+		mutex_exit(&target->lxdn_tlock);
+
+		lxd_rm_dirent(ent);
+		kmem_free(ent, entsz);
+		dir->lxdn_size -= entsz;
+		dir->lxdn_dirents--;
+	}
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+
+	ASSERT(dir->lxdn_dir == NULL);
+	ASSERT(dir->lxdn_size == 0);
+	ASSERT(dir->lxdn_dirents == 0);
+}
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c
new file mode 100644
index 0000000000..b474c329ad
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c
@@ -0,0 +1,830 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * The lx devfs (lxd) file system is used within lx branded zones to provide
+ * the Linux view of /dev.
+ *
+ * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev.
+ * lxd now provides the Linux /dev.
+ *
+ * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file
+ * system which is the special device and corresponds to the special device in
+ * a lofs mount. As with lofs, all files in the special device are accessible
+ * through the lxd mount. Because the zone's devfs is not directly modifiable
+ * within the zone (also mknod(2) is not generally allowed within a zone) it is
+ * impossible to create files in devfs. For lx, in some cases it's useful to be
+ * able to make new symlinks or new directories under /dev. lxd implements
+ * these operations by creating "files" in memory in the same way as tmpfs
+ * does. Within lxd these are referred to as "front" files. For operations such
+ * as lookup or readdir, lxd provides a merged view of both the front and back
+ * files. lxd does not support regular front files or simple I/O (read/write)
+ * to front files, since there is no need for that. For back files, all
+ * operations are simply passed through to the real vnode, as is done with
+ * lofs. Front files are not allowed to mask back files.
+ *
+ * The Linux /dev is now a lxd mount with the special file (i.e. the back
+ * file system) as /native/dev.
+ *
+ * In addition, lx has a need for some illumos/Linux translation for the
+ * various *stat(2) system calls when used on a device. This translation can
+ * be centralized within lxd's getattr vnode entry point.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ptm.h>
+#include <sys/lx_impl.h>
+
+#include "lxd.h"
+
+/* Module level parameters */
+/* File system type index handed to lxd_init() when the module is loaded. */
+static int	lxd_fstype;
+/* Pseudo device built in lxd_init(); used as vfs_dev for every lxd mount. */
+static dev_t	lxd_dev;
+
+/*
+ * lxd_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t lxd_mountcount;
+
+/*
+ * lxd_minfree is the minimum amount of swap space that lx devfs leaves for
+ * the rest of the zone. It is kept in pages: lxd_init() derives it from
+ * LXDMINFREE with btopr() unless it has already been patched to a non-zero
+ * value.
+ */
+size_t lxd_minfree = 0;
+
+/*
+ * LXDMINFREE -- the value from which lxd_minfree is derived -- should be
+ * configured to a value that is roughly the smallest practical value for
+ * memory + swap minus the largest reasonable size for lxd in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow lxd to consume
+ * no more than ~10% of this, yielding a LXDMINFREE of 12MB.
+ *
+ * The expansion is parenthesized so that the macro evaluates as a single
+ * value wherever it is used (e.g. next to %, / or a cast).
+ */
+#define	LXDMINFREE	(12 * 1024 * 1024)	/* 12 Megabytes */
+
+/*
+ * Not referenced by name in this file; presumably required by the
+ * CURRENT_TOTAL_AVAILABLE_SWAP macro used in lxd_statvfs() -- verify before
+ * removing.
+ */
+extern pgcnt_t swapfs_minfree;
+
+/* Used by lxd_mount() to create the zvol symlinks under the new root. */
+extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *,
+    caller_context_t *, int);
+/* NOTE(review): no use of stat64() is visible in this file. */
+extern int stat64(char *, struct stat64 *);
+
+/*
+ * lxd vfs operations.
+ */
+static int lxd_init(int, char *);
+static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int lxd_unmount(vfs_t *, int, cred_t *);
+static int lxd_root(vfs_t *, vnode_t **);
+static int lxd_statvfs(vfs_t *, statvfs64_t *);
+static void lxd_freevfs(vfs_t *vfsp);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+/*
+ * File system definition: the type is registered under the name "lx_devfs"
+ * with lxd_init() as its initialization routine.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_devfs",
+	lxd_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand devfs", &vfw
+};
+
+/* Passed to mod_install(), mod_remove() and mod_info() below. */
+static struct modlinkage modlinkage = {
+	MODREV_1, &modlfs, NULL
+};
+
+/*
+ * Definitions and translators for devt's.
+ *
+ * lxd_xlate_init() walks lxd_devt_translators once (guarded by
+ * lxd_xlate_lock and lxd_xlate_initialized), resolving each driver name to a
+ * major number and, for DTT_LIST entries, each listed path to its native
+ * minor number.
+ */
+static void lxd_pts_devt_translator(dev_t, dev_t *);
+static void lxd_ptm_devt_translator(dev_t, dev_t *);
+
+static kmutex_t			lxd_xlate_lock;
+static boolean_t		lxd_xlate_initialized = B_FALSE;
+
+/*
+ * Per-driver minor lists, each terminated by a NULL path. Columns are
+ * presumably { native path, native minor (0 here, filled in at run time),
+ * Linux major, Linux minor } -- confirm against lxd_minor_translator_t.
+ */
+static lxd_minor_translator_t lxd_mtranslator_mm[] = {
+	{ "/dev/null",		0, 1, 3 },
+	{ "/dev/zero",		0, 1, 5 },
+	{ NULL,			0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_random[] = {
+	{ "/dev/random",	0, 1, 8 },
+	{ "/dev/urandom",	0, 1, 9 },
+	{ NULL,			0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_sy[] = {
+	{ "/dev/tty",		0, LX_TTY_MAJOR, 0 },
+	{ NULL,			0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_zcons[] = {
+	{ "/dev/console",	0, LX_TTY_MAJOR, 1 },
+	{ NULL,			0, 0, 0 }
+};
+/*
+ * Master table, terminated by a NULL driver name. Columns are presumably
+ * { driver name, native major (0 here, filled in at run time), translation
+ * type, list or function pointer } -- confirm against lxd_devt_translator_t.
+ */
+lxd_devt_translator_t lxd_devt_translators[] = {
+	{ "mm",		0, DTT_LIST,	(uintptr_t)&lxd_mtranslator_mm },
+	{ "random",	0, DTT_LIST,	(uintptr_t)&lxd_mtranslator_random },
+	{ "sy",		0, DTT_LIST,	(uintptr_t)&lxd_mtranslator_sy },
+	{ "zcons",	0, DTT_LIST,	(uintptr_t)&lxd_mtranslator_zcons },
+	{ LX_PTM_DRV,	0, DTT_CUSTOM,	(uintptr_t)lxd_ptm_devt_translator },
+	{ "pts",	0, DTT_CUSTOM,	(uintptr_t)lxd_pts_devt_translator },
+	{ NULL,		0, DTT_INVALID,	NULL }
+};
+
+/*
+ * Module load entry point: register the file system with the module
+ * framework. Returns whatever mod_install() returns.
+ */
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/*
+ * Module unload entry point. Refuses with EBUSY while any mount still has
+ * state outstanding (lxd_mountcount is only dropped in lxd_freevfs()), and
+ * otherwise removes the module and releases what lxd_init() set up.
+ */
+int
+_fini(void)
+{
+	int error;
+
+	if (lxd_mountcount > 0)
+		return (EBUSY);
+
+	if ((error = mod_remove(&modlinkage)) != 0)
+		return (error);
+
+	/*
+	 * Tear down the operations vectors
+	 */
+	(void) vfs_freevfsops_by_type(lxd_fstype);
+	vn_freevnodeops(lxd_vnodeops);
+	mutex_destroy(&lxd_xlate_lock);
+	return (0);
+}
+
+/*
+ * Module information entry point; simply defers to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	int rv = mod_info(&modlinkage, modinfop);
+
+	return (rv);
+}
+
+/*
+ * One-time file system type setup, run when the lxd module is loaded:
+ * install the vfs and vnode operation vectors, settle lxd_minfree, pick the
+ * pseudo device used by all mounts and create the translator lock.
+ *
+ * Returns 0 on success or the error from vfs_setfsops()/vn_make_ops().
+ */
+static int
+lxd_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxd_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = lxd_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = lxd_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = lxd_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = lxd_statvfs },
+		VFSNAME_FREEVFS,	{ .vfs_freevfs = lxd_freevfs },
+		NULL,			NULL
+	};
+	extern const struct fs_operation_def lxd_vnodeops_template[];
+	major_t udev;
+	int rv;
+
+	ASSERT(fstype != 0);
+	lxd_fstype = fstype;
+
+	if ((rv = vfs_setfsops(fstype, lxd_vfsops_template, NULL)) != 0) {
+		cmn_err(CE_WARN, "lxd_init: bad vfs ops template");
+		return (rv);
+	}
+
+	if ((rv = vn_make_ops(name, lxd_vnodeops_template,
+	    &lxd_vnodeops)) != 0) {
+		/* Undo the vfs ops registration before bailing out. */
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxd_init: bad vnode ops template");
+		return (rv);
+	}
+
+	/*
+	 * lxd_minfree is an absolute floor on the swap left for everything
+	 * else, not a fraction of configured swap. Only fill in the default
+	 * when nobody has patched a value in.
+	 */
+	if (lxd_minfree == 0)
+		lxd_minfree = btopr(LXDMINFREE);
+
+	/*
+	 * Build the pseudo device. Failing to obtain a unique major is only
+	 * worth a warning; fall back to major 0 in that case.
+	 */
+	udev = getudev();
+	if (udev == (major_t)-1) {
+		cmn_err(CE_WARN, "lxd_init: Can't get unique device number.");
+		udev = 0;
+	}
+	lxd_dev = makedevice(udev, 0);
+
+	mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	return (0);
+}
+
+/*
+ * Fill in the run-time parts of the device translator tables: the native
+ * major for every driver and, for list-style translators, the native minor
+ * of each listed device path.
+ *
+ * This cannot happen in lxd_init because the path lookups can recurse into
+ * module loading (sdev_lookup/prof_make_maps/devi_attach_node/modload) and
+ * lead to a recursive rw_enter. It is therefore driven from the mount path
+ * and guarded so the work is only ever done once.
+ */
+static void
+lxd_xlate_init()
+{
+	lxd_devt_translator_t *xlt;
+
+	mutex_enter(&lxd_xlate_lock);
+	if (lxd_xlate_initialized) {
+		mutex_exit(&lxd_xlate_lock);
+		return;
+	}
+
+	for (xlt = lxd_devt_translators; xlt->lxd_xl_driver != NULL; xlt++) {
+		lxd_minor_translator_t *mt;
+
+		xlt->lxd_xl_major = mod_name_to_major(xlt->lxd_xl_driver);
+
+		/* Only list-style translators have minors to resolve. */
+		if (xlt->lxd_xl_type != DTT_LIST)
+			continue;
+
+		/* Resolve the native minor of every device in the list. */
+		for (mt = xlt->xl_list; mt->lxd_mt_path != NULL; mt++) {
+			char tnm[MAXPATHLEN];
+			struct vattr va;
+			vnode_t *vp;
+			char *tpath;
+
+			/*
+			 * We may be running in the global zone or in a
+			 * non-global zone; in the latter the native device
+			 * lives under /native.
+			 */
+			if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) {
+				(void) snprintf(tnm, sizeof (tnm), "/native%s",
+				    mt->lxd_mt_path);
+				tpath = tnm;
+			} else {
+				tpath = mt->lxd_mt_path;
+			}
+
+			if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL,
+			    &vp, NULL) != 0) {
+				/* Device not present: mark it unresolved. */
+				mt->lxd_mt_minor = -1;
+				continue;
+			}
+
+			va.va_mask = AT_RDEV;
+			if (VOP_GETATTR(vp, &va, 0, kcred, NULL) == 0) {
+				ASSERT(getmajor(va.va_rdev) ==
+				    xlt->lxd_xl_major);
+				ASSERT(mt->lxd_mt_lx_minor < LX_MAXMIN);
+			} else {
+				va.va_rdev = NODEV;
+			}
+
+			mt->lxd_mt_minor = getminor(va.va_rdev);
+
+			VN_RELE(vp);
+		}
+	}
+
+	lxd_xlate_initialized = B_TRUE;
+	mutex_exit(&lxd_xlate_lock);
+}
+
+/*
+ * Mount an lxd file system on mvp, using uap->spec as the "back" file system.
+ *
+ * The mount is refused with ENOTDIR if mvp is not a directory, EINVAL if the
+ * caller is not in an lx zone, the mount is read-only or the back root is not
+ * a "dev" file system, and EBUSY for a non-overlay mount on a busy or root
+ * vnode. Errors from secpolicy_fs_mount(), pn_get(), lookupname() and
+ * VOP_ACCESS() are passed through.
+ *
+ * On success the root back node is created and initialized as a directory,
+ * lxd_mountcount is bumped, and a symlink is created under the root for
+ * every zvol-backed virtual disk of the zone.
+ */
+static int
+lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	lxd_mnt_t *lxdm = NULL;
+	struct lxd_node *ldn;
+	struct pathname dpn;
+	int error;
+	int i;
+	int nodev;
+	vnode_t *realrootvp;
+	vnode_t *tvp;
+	lx_zone_data_t *lxzdata;
+	lx_virt_disk_t *vd;
+	vattr_t vattr;
+
+	/* Remember whether "nodevices" was set before the policy check. */
+	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		return (error);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	lxd_xlate_init();
+
+	/*
+	 * This is the same behavior as with lofs.
+	 * Loopback devices which get "nodevices" added can be done without
+	 * "nodevices" set because we cannot import devices into a zone
+	 * with loopback.  Note that we have all zone privileges when
+	 * this happens; if not, we'd have gotten "nosuid".
+	 */
+	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
+
+	/*
+	 * Only allow mounting within lx zones.
+	 */
+	if (curproc->p_zone->zone_brand != &lx_brand)
+		return (EINVAL);
+
+	/*
+	 * Ensure we don't allow overlaying mounts
+	 */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/* lxd doesn't support read-only mounts */
+	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = pn_get(uap->dir,
+	    (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Find real root. From here on dpn must be freed on every exit path.
+	 */
+	if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
+	    UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) {
+		pn_free(&dpn);
+		return (error);
+	}
+
+	if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) {
+		pn_free(&dpn);
+		VN_RELE(realrootvp);
+		return (error);
+	}
+
+	/* If realroot is not a devfs, error out */
+	if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) {
+		pn_free(&dpn);
+		VN_RELE(realrootvp);
+		return (EINVAL);
+	}
+
+	lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP);
+
+	/* init but don't bother entering the mutex (not on mount list yet) */
+	mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Initialize the hash table mutexes */
+	for (i = 0; i < LXD_HASH_SZ; i++) {
+		mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT,
+		    NULL);
+	}
+
+	lxdm->lxdm_vfsp = vfsp;
+	lxdm->lxdm_gen = 1;	/* start inode counter at 1 */
+
+	vfsp->vfs_data = (caddr_t)lxdm;
+	vfsp->vfs_fstype = lxd_fstype;
+	vfsp->vfs_dev = lxd_dev;
+	vfsp->vfs_bsize = PAGESIZE;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype);
+	/* Keep a private copy of the mount point path for lxd_statvfs(). */
+	lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+	(void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path);
+
+	/* The root of the mount is a back node wrapping the real root. */
+	tvp = lxd_make_back_node(realrootvp, lxdm);
+	ldn = VTOLDN(tvp);
+
+	rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+	LDNTOV(ldn)->v_flag |= VROOT;
+
+	/*
+	 * initialize linked list of lxd_nodes so that the back pointer of
+	 * the root lxd_node always points to the last one on the list
+	 * and the forward pointer of the last node is null
+	 */
+	ldn->lxdn_prev = ldn;
+	ldn->lxdn_next = NULL;
+	ldn->lxdn_nlink = 0;
+	lxdm->lxdm_rootnode = ldn;
+
+	ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+	lxd_dirinit(ldn, ldn, cr);
+
+	rw_exit(&ldn->lxdn_rwlock);
+
+	pn_free(&dpn);
+	error = 0;
+	atomic_inc_32(&lxd_mountcount);
+
+	lxzdata = ztolxzd(curproc->p_zone);
+	ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+	/*
+	 * Zero the attributes first: node creation reads fields such as
+	 * va_rdev regardless of va_mask, so they must not be stack garbage.
+	 */
+	bzero(&vattr, sizeof (vattr));
+	vattr.va_mask = AT_TYPE | AT_MODE;
+	vattr.va_type = VLNK;
+	vattr.va_mode = 0777;
+
+	/* Symlink creation is best effort; failures do not fail the mount. */
+	vd = list_head(lxzdata->lxzd_vdisks);
+	while (vd != NULL) {
+		/* only create links for actual zvols */
+		if (vd->lxvd_type == LXVD_ZVOL) {
+			char lnknm[MAXPATHLEN];
+
+			(void) snprintf(lnknm, sizeof (lnknm),
+			    "./zvol/dsk/%s", vd->lxvd_real_name);
+			(void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr,
+			    lnknm, cr, NULL, 0);
+		}
+
+		vd = list_next(lxzdata->lxzd_vdisks, vd);
+	}
+
+out:
+	if (error == 0)
+		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
+
+	return (error);
+}
+
+/*
+ * Unmount an lxd file system. Forced unmounts are rejected with EINVAL, and
+ * the unmount fails with EBUSY if the root has more than one hold or any
+ * other node has a hold at all. On success every non-root node has been
+ * given one extra hold (consumed later by lxd_freevfs()) and the vfs is
+ * flagged VFS_UNMOUNTED.
+ */
+static int
+lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	lxd_node_t *root, *node, *undo;
+	struct vnode *vp;
+	int error;
+
+	error = secpolicy_fs_unmount(cr, vfsp);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * While lxdm_contents is held nothing can be added to or removed
+	 * from the node list. Ordinarily only the root carries a reference.
+	 */
+	mutex_enter(&lxdm->lxdm_contents);
+
+	root = lxdm->lxdm_rootnode;
+	vp = LDNTOV(root);
+
+	mutex_enter(&vp->v_lock);
+	if ((flag & MS_FORCE) != 0)
+		error = EINVAL;
+	else if (vp->v_count > 1)
+		error = EBUSY;
+	mutex_exit(&vp->v_lock);
+
+	if (error != 0) {
+		mutex_exit(&lxdm->lxdm_contents);
+		return (error);
+	}
+
+	/*
+	 * Walk the remaining nodes looking for open files. Nodes are allowed
+	 * to sit at a v_count of 0 and be kept alive by their link count (the
+	 * tmpfs model; see lxd_front_inactive). Each such node gets a manual
+	 * hold here, taken directly since we own v_lock, so that the VN_RELEs
+	 * done by lxd_freevfs() go through the lxd_inactive path.
+	 */
+	for (node = root->lxdn_next; node != NULL; node = node->lxdn_next) {
+		vp = LDNTOV(node);
+
+		mutex_enter(&vp->v_lock);
+		if (vp->v_count == 0) {
+			vp->v_count++;
+			mutex_exit(&vp->v_lock);
+			continue;
+		}
+		mutex_exit(&vp->v_lock);
+
+		/*
+		 * This one is in use. Give back the holds placed on the
+		 * nodes ahead of it and report the mount as busy.
+		 */
+		undo = lxdm->lxdm_rootnode->lxdn_next;
+		while (undo != node) {
+			vp = LDNTOV(undo);
+			ASSERT(vp->v_count > 0);
+			VN_RELE(vp);
+			undo = undo->lxdn_next;
+		}
+		mutex_exit(&lxdm->lxdm_contents);
+		return (EBUSY);
+	}
+
+	/*
+	 * Once flagged as unmounted the mount can no longer be found, so
+	 * the mutex can be released.
+	 */
+	vfsp->vfs_flag |= VFS_UNMOUNTED;
+	mutex_exit(&lxdm->lxdm_contents);
+
+	return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS(). This is called by the vfs framework after
+ * umount and the last VFS_RELE, to trigger the release of any resources still
+ * associated with the given vfs_t. This is normally called immediately after
+ * lxd_unmount.
+ *
+ * Everything set up by lxd_mount() is undone here: all directory entries and
+ * nodes are released, the mount path copy is freed, every mutex in the
+ * lxd_mnt_t (including the per-bucket hash mutexes) is destroyed, and
+ * lxd_mountcount is dropped so the module may unload.
+ */
+static void
+lxd_freevfs(vfs_t *vfsp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	lxd_node_t *ldn;
+	struct vnode *vp;
+	int i;
+
+	/*
+	 * Free all kmemalloc'd and anonalloc'd memory associated with
+	 * this filesystem.  To do this, we go through the file list twice,
+	 * once to remove all the directory entries, and then to remove
+	 * all the pseudo files.
+	 */
+
+	/*
+	 * Now that we are tearing ourselves down we need to remove the
+	 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
+	 * files from the system causing us to have a negative value. Doing this
+	 * seems a bit better than trying to set a flag on the lxd_mnt_t that
+	 * says we're tearing down.
+	 */
+	vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
+	/*
+	 * Remove all directory entries (this doesn't remove top-level dirs).
+	 */
+	for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) {
+		rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+		if (ldn->lxdn_vnode->v_type == VDIR)
+			lxd_dirtrunc(ldn);
+		rw_exit(&ldn->lxdn_rwlock);
+	}
+
+	ASSERT(lxdm->lxdm_rootnode != NULL);
+
+	/*
+	 * All links are gone, v_count is keeping nodes in place.
+	 * VN_RELE should make the node disappear, unless somebody
+	 * is holding pages against it.  Nap and retry until it disappears.
+	 *
+	 * We re-acquire the lock to prevent others who have a HOLD on a
+	 * lxd_node from blowing it away (in lxd_inactive) while we're trying
+	 * to get to it here. Once we have a HOLD on it we know it'll stick
+	 * around.
+	 */
+	mutex_enter(&lxdm->lxdm_contents);
+
+	/*
+	 * Remove all the files (except the rootnode) backwards.
+	 */
+	while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) {
+		mutex_exit(&lxdm->lxdm_contents);
+		/*
+		 * All nodes will be released here. Note we handled the link
+		 * count above.
+		 */
+		vp = LDNTOV(ldn);
+		ASSERT(vp->v_type == VLNK || vp->v_type == VDIR ||
+		    vp->v_type == VSOCK);
+		VN_RELE(vp);
+		mutex_enter(&lxdm->lxdm_contents);
+		/*
+		 * It's still there after the RELE. Someone else like pageout
+		 * has a hold on it so wait a bit and then try again - we know
+		 * they'll give it up soon.
+		 */
+		if (ldn == lxdm->lxdm_rootnode->lxdn_prev) {
+			VN_HOLD(vp);
+			mutex_exit(&lxdm->lxdm_contents);
+			delay(hz / 4);
+			mutex_enter(&lxdm->lxdm_contents);
+		}
+	}
+	mutex_exit(&lxdm->lxdm_contents);
+
+	ASSERT(lxdm->lxdm_back_refcnt == 1);
+	ASSERT(lxdm->lxdm_dent_refcnt == 0);
+
+	VN_RELE(LDNTOV(lxdm->lxdm_rootnode));
+
+	ASSERT(lxdm->lxdm_mntpath != NULL);
+	kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1);
+
+	/*
+	 * Destroy every mutex lxd_mount() initialized. The hash mutexes go
+	 * only now, after the last node (the root) has been released.
+	 */
+	for (i = 0; i < LXD_HASH_SZ; i++)
+		mutex_destroy(&lxdm->lxdm_hash_mutex[i]);
+	mutex_destroy(&lxdm->lxdm_contents);
+	mutex_destroy(&lxdm->lxdm_renamelck);
+	kmem_free(lxdm, sizeof (lxd_mnt_t));
+
+	/* Allow _fini() to succeed now */
+	atomic_dec_32(&lxd_mountcount);
+}
+
+/*
+ * Hand back the root vnode of the given lxd vfs, with a new hold on it.
+ * Always returns 0.
+ */
+static int
+lxd_root(struct vfs *vfsp, struct vnode **vpp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	lxd_node_t *root = lxdm->lxdm_rootnode;
+
+	ASSERT(root != NULL);
+
+	*vpp = LDNTOV(root);
+	VN_HOLD(*vpp);
+	return (0);
+}
+
+/*
+ * Report file system statistics for an lxd mount. Block counts are in pages
+ * and derive from the swap currently available, less lxd_minfree, further
+ * limited by the zone's swap cap when one is set. Always returns 0.
+ */
+static int
+lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	ulong_t	blocks;
+	dev32_t d32;
+	zoneid_t eff_zid;
+	struct zone *zp;
+
+	zp = lxdm->lxdm_vfsp->vfs_zone;
+
+	if (zp == NULL)
+		eff_zid = GLOBAL_ZONEUNIQID;
+	else
+		eff_zid = zp->zone_id;
+
+	sbp->f_bsize = PAGESIZE;
+	sbp->f_frsize = PAGESIZE;
+
+	/*
+	 * Find the amount of available physical memory and swap
+	 */
+	mutex_enter(&anoninfo_lock);
+	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+	blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+	mutex_exit(&anoninfo_lock);
+
+	if (blocks > lxd_minfree)
+		sbp->f_bfree = blocks - lxd_minfree;
+	else
+		sbp->f_bfree = 0;
+
+	sbp->f_bavail = sbp->f_bfree;
+
+	/*
+	 * Total number of blocks is just what's available
+	 */
+	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+	/* zp is known to be non-NULL whenever eff_zid is not the global id. */
+	if (eff_zid != GLOBAL_ZONEUNIQID &&
+	    zp->zone_max_swap_ctl != UINT64_MAX) {
+		/*
+		 * If the fs is used by a zone with a swap cap,
+		 * then report the capped size.
+		 */
+		rctl_qty_t cap, used;
+		pgcnt_t pgcap, pgused, pgleft;
+
+		mutex_enter(&zp->zone_mem_lock);
+		cap = zp->zone_max_swap_ctl;
+		used = zp->zone_max_swap;
+		mutex_exit(&zp->zone_mem_lock);
+
+		pgcap = btop(cap);
+		pgused = btop(used);
+
+		/*
+		 * Usage can exceed the cap (e.g. after the cap is lowered).
+		 * Clamp at zero instead of letting the unsigned subtraction
+		 * wrap, which would make the cap appear not to apply.
+		 */
+		pgleft = (pgcap > pgused) ? pgcap - pgused : 0;
+
+		sbp->f_bfree = MIN(pgleft, sbp->f_bfree);
+		sbp->f_bavail = sbp->f_bfree;
+		sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+	}
+
+	/*
+	 * The maximum number of files available is approximately the number
+	 * of lxd_nodes we can allocate from the remaining kernel memory
+	 * available to lxdevfs in this zone.  This is fairly inaccurate since
+	 * it doesn't take into account the names stored in the directory
+	 * entries.
+	 */
+	sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+	    (sizeof (lxd_node_t) + sizeof (lxd_dirent_t));
+	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	sbp->f_fsid = d32;
+	(void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name);
+	(void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr));
+	/* ensure null termination */
+	sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	sbp->f_namemax = MAXNAMELEN - 1;
+	return (0);
+}
+
+/*
+ * Translate a native pts dev_t into the Linux equivalent, stored in *jdev.
+ *
+ * Linux spreads pts devices over a range of major numbers because its minor
+ * number space is relatively small (20 bits), so the native minor is split
+ * into a major offset and a Linux minor.
+ */
+static void
+lxd_pts_devt_translator(dev_t dev, dev_t *jdev)
+{
+	minor_t	nmin = getminor(dev);
+	int	lxmaj = LX_PTS_MAJOR_MIN + (nmin / LX_MAXMIN);
+	int	lxmin = nmin % LX_MAXMIN;
+
+	/*
+	 * A major beyond the pts range cannot be represented; short of
+	 * reworking the translation scheme all we can do is report it.
+	 * The translated device is still produced below.
+	 */
+	if (lxmaj > LX_PTS_MAJOR_MAX)
+		lx_unsupported("pts major out of translation range");
+
+	*jdev = LX_MAKEDEVICE(lxmaj, lxmin);
+}
+
+/*
+ * Translate a native ptm dev_t: every input maps to the single Linux ptm
+ * device, so dev itself is not consulted.
+ */
+static void
+lxd_ptm_devt_translator(dev_t dev, dev_t *jdev)
+{
+	const dev_t lx_ptm_dev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR);
+
+	*jdev = lx_ptm_dev;
+}
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c
new file mode 100644
index 0000000000..bee93f6aad
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c
@@ -0,0 +1,1506 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/cred.h>
+#include <sys/pathname.h>
+#include <sys/debug.h>
+#include <sys/sdt.h>
+#include <fs/fs_subr.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <sys/lx_brand.h>
+#include <sys/brand.h>
+
+#include "lxd.h"
+
+/*
+ * Open a node.  Front nodes need no work.  For back nodes the open is passed
+ * to the underlying vnode; if the underlying file system hands back a
+ * different vnode, *vpp is replaced by a back node (or a specfs vnode for
+ * devices) wrapping it.
+ */
+static int
+lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct)
+{
+	vnode_t *lxvp = *vpp;
+	lxd_node_t *node = VTOLDN(lxvp);
+	vnode_t *realvp;
+	vnode_t *openedvp;
+	int error;
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (0);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	realvp = REALVP(lxvp);
+	openedvp = realvp;
+
+	/*
+	 * VOP_OPEN() may decide to release the vnode it is given, so take an
+	 * additional hold on it first.
+	 */
+	VN_HOLD(realvp);
+	error = VOP_OPEN(&openedvp, flag, cr, ct);
+
+	if (error != 0 || openedvp == realvp) {
+		/* Same vnode (or failure): drop the extra hold taken above. */
+		ASSERT(openedvp->v_count > 1);
+		VN_RELE(openedvp);
+		return (error);
+	}
+
+	/*
+	 * A different vnode came back; the file system we called should
+	 * have released the extra hold on the original one.
+	 */
+	*vpp = lxd_make_back_node(openedvp, VFSTOLXDM(lxvp->v_vfsp));
+
+	if (IS_DEVVP(*vpp)) {
+		vnode_t *specvnode;
+
+		specvnode = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+		VN_RELE(*vpp);
+		if (specvnode != NULL)
+			*vpp = specvnode;
+		else
+			error = ENOSYS;
+	}
+	VN_RELE(lxvp);
+
+	return (error);
+}
+
+/*
+ * Close a node.  Nothing is done for front nodes; back nodes forward the
+ * close to the underlying vnode.
+ */
+static int
+lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (0);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct));
+}
+
+/*
+ * Read from a node.  Front nodes fail with ENOTSUP; back nodes forward the
+ * read to the underlying vnode.
+ */
+static int
+lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOTSUP);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/*
+ * Write to a node.  Front nodes fail with ENOTSUP; back nodes forward the
+ * write to the underlying vnode.
+ */
+static int
+lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOTSUP);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/*
+ * Issue an ioctl against a node.  Front nodes fail with ENOTSUP; back nodes
+ * forward the request to the underlying vnode.
+ */
+static int
+lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr,
+    int *rvalp, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOTSUP);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_IOCTL(REALVP(vp), cmd, arg, flag, cr, rvalp, ct));
+}
+
+/*
+ * Change file status flags.  Front nodes fail with ENOTSUP; back nodes
+ * forward the request to the underlying vnode.
+ */
+static int
+lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOTSUP);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_SETFL(REALVP(vp), oflags, nflags, cr, ct));
+}
+
+/*
+ * Translate a SunOS devt into the devt a Linux program expects, storing the
+ * result in *rdev.  The translator table is searched by native major number;
+ * when no translator (or no matching list entry) exists, the native major
+ * and minor are simply repackaged in Linux devt format.
+ */
+static void
+lxd_s2l_devt(dev_t dev, dev_t *rdev)
+{
+	major_t			native_maj = getmajor(dev);
+	minor_t			native_min = getminor(dev);
+	lxd_minor_translator_t	*ent;
+	int			idx = 0;
+
+	/* find the translator registered for this major, if there is one */
+	while (lxd_devt_translators[idx].lxd_xl_driver != NULL &&
+	    lxd_devt_translators[idx].lxd_xl_major != native_maj) {
+		idx++;
+	}
+
+	if (lxd_devt_translators[idx].lxd_xl_driver == NULL) {
+		/* we don't have a translator for this device */
+		*rdev = LX_MAKEDEVICE(native_maj, native_min);
+		return;
+	}
+
+	/* try to translate the illumos devt to a linux devt */
+	switch (lxd_devt_translators[idx].lxd_xl_type) {
+	case DTT_INVALID:
+		ASSERT(0);
+		break;
+
+	case DTT_LIST:
+		for (ent = lxd_devt_translators[idx].xl_list;
+		    ent->lxd_mt_path != NULL; ent++) {
+			if (ent->lxd_mt_minor != native_min)
+				continue;
+
+			ASSERT(ent->lxd_mt_minor < LX_MAXMIN);
+
+			/* found a translation */
+			*rdev = LX_MAKEDEVICE(ent->lxd_mt_lx_major,
+			    ent->lxd_mt_lx_minor);
+			return;
+		}
+		break;
+
+	case DTT_CUSTOM:
+		lxd_devt_translators[idx].xl_custom(dev, rdev);
+		return;
+	}
+
+	/* the translator had no entry for this minor; pass it through */
+	*rdev = LX_MAKEDEVICE(native_maj, native_min);
+}
+
+/*
+ * Get the attributes of a node.  Front nodes report the attributes stored in
+ * the lxd node itself.  Back nodes report the attributes of the underlying
+ * vnode; for character devices queried by an LX-branded thread running in
+ * brand stack mode, va_rdev is additionally translated to a Linux devt.
+ */
+static int
+lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	lx_lwp_data_t *lwpd;
+	vnode_t *realvp;
+	dev_t lxdev;
+	int error;
+
+	if (node->lxdn_type == LXDNT_FRONT) {
+		mutex_enter(&node->lxdn_tlock);
+
+		vap->va_type = vp->v_type;
+		vap->va_mode = node->lxdn_mode & MODEMASK;
+		vap->va_uid = node->lxdn_uid;
+		vap->va_gid = node->lxdn_gid;
+		vap->va_fsid = node->lxdn_fsid;
+		vap->va_nodeid = (ino64_t)node->lxdn_nodeid;
+		vap->va_nlink = node->lxdn_nlink;
+		vap->va_atime = node->lxdn_atime;
+		vap->va_mtime = node->lxdn_mtime;
+		vap->va_ctime = node->lxdn_ctime;
+		vap->va_seq = node->lxdn_seq;
+		vap->va_blksize = PAGESIZE;
+		vap->va_rdev = 0;	/* no devs in front */
+
+		/* block count is the size rounded up to whole pages */
+		vap->va_size = (u_offset_t)node->lxdn_size;
+		vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(
+		    vap->va_size)));
+
+		mutex_exit(&node->lxdn_tlock);
+		return (0);
+	}
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	realvp = REALVP(vp);
+	error = VOP_GETATTR(realvp, vap, flags, cr, ct);
+	if (error != 0)
+		return (error);
+
+	/* Skip devt translation for native programs */
+	if (curproc->p_brand != &lx_brand)
+		return (0);
+
+	/*
+	 * We also skip translation when called from the user-land
+	 * emulation code.
+	 */
+	lwpd = ttolxlwp(curthread);
+	if (lwpd == NULL || lwpd->br_stack_mode != LX_STACK_MODE_BRAND)
+		return (0);
+
+	/* only character devices carry a devt that needs translating */
+	if (realvp->v_type != VCHR)
+		return (0);
+
+	lxd_s2l_devt(vap->va_rdev, &lxdev);
+	DTRACE_PROBE3(lxd__devxl, void *, realvp, void *, vap, int, lxdev);
+	vap->va_rdev = lxdev;
+
+	return (0);
+}
+
+/*
+ * Set the attributes of a node.  Back nodes forward the request to the
+ * underlying vnode.  Front nodes update the attributes stored in the lxd
+ * node, after rejecting attributes that can never be set and performing the
+ * usual policy check.
+ */
+static int
+lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	struct vattr *cur;
+	long mask;
+	int error;
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		return (VOP_SETATTR(REALVP(vp), vap, flags, cr, ct));
+	}
+
+	mask = vap->va_mask;
+
+	/* Cannot set these attributes */
+	if ((mask & (AT_NOSET | AT_XVATTR | AT_SIZE)) != 0 ||
+	    ((mask & AT_MODE) && (vap->va_mode & (S_ISUID | S_ISGID))))
+		return (EINVAL);
+
+	mutex_enter(&node->lxdn_tlock);
+	cur = &node->lxdn_attr;
+
+	/*
+	 * Change file access modes. Must be owner or have sufficient
+	 * privileges.
+	 */
+	error = secpolicy_vnode_setattr(cr, vp, vap, cur, flags, lxd_naccess,
+	    node);
+	if (error != 0) {
+		mutex_exit(&node->lxdn_tlock);
+		return (error);
+	}
+
+	/* keep the file type bits, replace the permission bits */
+	if (mask & AT_MODE) {
+		cur->va_mode = (cur->va_mode & S_IFMT) |
+		    (vap->va_mode & ~S_IFMT);
+	}
+	if (mask & AT_UID)
+		cur->va_uid = vap->va_uid;
+	if (mask & AT_GID)
+		cur->va_gid = vap->va_gid;
+	if (mask & AT_ATIME)
+		cur->va_atime = vap->va_atime;
+	if (mask & AT_MTIME)
+		cur->va_mtime = vap->va_mtime;
+
+	/* any of these changes also bumps the change time */
+	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+		gethrestime(&node->lxdn_ctime);
+
+	mutex_exit(&node->lxdn_tlock);
+	return (0);
+}
+
+/*
+ * Check access permission.  Front nodes are checked with lxd_naccess() under
+ * the node's lxdn_tlock.  For back nodes, write access to a regular file on
+ * a read-only vnode fails with EROFS; otherwise the check is forwarded to the
+ * underlying vnode.
+ */
+static int
+lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	int error;
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		if ((mode & VWRITE) && vp->v_type == VREG &&
+		    vn_is_readonly(vp))
+			return (EROFS);
+		return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct));
+	}
+
+	mutex_enter(&node->lxdn_tlock);
+	error = lxd_naccess(node, mode, cr);
+	mutex_exit(&node->lxdn_tlock);
+
+	return (error);
+}
+
+/*
+ * Flush a node.  Nothing is done for front nodes; back nodes forward the
+ * request to the underlying vnode.
+ */
+static int
+lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (0);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct));
+}
+
+/*
+ * Inactive handler for front nodes.
+ *
+ * If other holds remain on the vnode, or the node still has links, only the
+ * caller's hold is dropped.  Otherwise the node is torn down: a symlink
+ * target (if any) is freed, the vnode is invalidated, the node is unlinked
+ * from the per-mount node list, its locks are destroyed, and both the vnode
+ * and the lxd node are freed.
+ *
+ * Locks are taken in the order lxdn_rwlock, lxdn_tlock, v_lock; the mount's
+ * lxdm_contents lock protects the node list manipulation.
+ */
+static void
+lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
+{
+	lxd_node_t *ldn = VTOLDN(vp);
+	lxd_mnt_t *lxdm = VTOLXDM(vp);
+
+	ASSERT(ldn->lxdn_type == LXDNT_FRONT);
+	rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+
+	mutex_enter(&ldn->lxdn_tlock);
+	mutex_enter(&vp->v_lock);
+	ASSERT(vp->v_count >= 1);
+
+	/*
+	 * If we don't have the last hold or the link count is non-zero,
+	 * there's little to do -- just drop our hold.
+	 */
+	if (vp->v_count > 1 || ldn->lxdn_nlink != 0) {
+		vp->v_count--;
+
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&ldn->lxdn_tlock);
+		rw_exit(&ldn->lxdn_rwlock);
+		return;
+	}
+
+	/*
+	 * We have the last hold *and* the link count is zero, so this node is
+	 * dead from the filesystem's viewpoint.
+	 */
+	if (ldn->lxdn_size != 0) {
+		/* the target buffer is lxdn_size bytes plus a terminator */
+		if (ldn->lxdn_vnode->v_type == VLNK)
+			kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1);
+	}
+
+	mutex_exit(&vp->v_lock);
+	mutex_exit(&ldn->lxdn_tlock);
+
+	vn_invalid(LDNTOV(ldn));
+
+	/*
+	 * Unlink the node from the mount's node list.  When this node has no
+	 * successor the root node's lxdn_prev is updated instead (presumably
+	 * the root's lxdn_prev tracks the list tail -- confirm).
+	 */
+	mutex_enter(&lxdm->lxdm_contents);
+	if (ldn->lxdn_next == NULL)
+		lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev;
+	else
+		ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev;
+	ldn->lxdn_prev->lxdn_next = ldn->lxdn_next;
+
+	mutex_exit(&lxdm->lxdm_contents);
+	rw_exit(&ldn->lxdn_rwlock);
+	rw_destroy(&ldn->lxdn_rwlock);
+	mutex_destroy(&ldn->lxdn_tlock);
+
+	vn_free(LDNTOV(ldn));
+	kmem_free(ldn, sizeof (lxd_node_t));
+}
+
+/*
+ * Inactive entry point: front nodes are handled by lxd_front_inactive(),
+ * back nodes by lxd_free_back_node().
+ */
+/*ARGSUSED*/
+static void
+lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		lxd_free_back_node(node);
+		return;
+	}
+
+	lxd_front_inactive(vp, cr, ct);
+}
+
+/*
+ * Return a file identifier.  Front nodes fail with ENOTSUP; back nodes
+ * forward the request to the underlying vnode.
+ */
+/* ARGSUSED */
+static int
+lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOTSUP);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_FID(REALVP(vp), fidp, ct));
+}
+
+/*
+ * Look up the name 'nm' in directory 'dvp' and return the result in *vpp.
+ *
+ * The directory's front entries are searched first; a match returns the
+ * corresponding front lxd node.  If nothing is found and dvp is a front
+ * node, the lookup fails with ENOENT.  Otherwise the lookup is passed to the
+ * real directory behind dvp and the resulting vnode is wrapped in a back lxd
+ * node (and additionally in a specfs vnode when it is a device).
+ *
+ * Returns 0 on success or an errno value; extended attribute lookups are
+ * rejected with EINVAL.  On failure after the back lookup *vpp is NULL.
+ */
+static int
+lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+	vnode_t *vp = NULL;
+	int error;
+	vnode_t *realdvp;
+	lxd_mnt_t *lxdm = VTOLXDM(dvp);
+	int doingdotdot = 0;
+	lxd_node_t *ldn = VTOLDN(dvp);
+	lxd_node_t *nldn = NULL;
+
+	/*
+	 * First check for front file which could be instantiated on either a
+	 * front or back node (e.g. the top-level mount point directory node is
+	 * a back node which can have front files created in it).
+	 */
+
+	/* disallow extended attrs */
+	if (flags & LOOKUP_XATTR)
+		return (EINVAL);
+
+	/* Null component name is a synonym for dir being searched. */
+	if (*nm == '\0') {
+		VN_HOLD(dvp);
+		*vpp = dvp;
+		return (0);
+	}
+
+	rw_enter(&ldn->lxdn_rwlock, RW_READER);
+	error = lxd_dirlookup(ldn, nm, &nldn, cr);
+	rw_exit(&ldn->lxdn_rwlock);
+
+	if (error == 0) {
+		/* found */
+		ASSERT(nldn != NULL);
+		*vpp = LDNTOV(nldn);
+		return (0);
+	}
+
+	/* At this point, if dir node is a front node, error */
+	if (ldn->lxdn_type == LXDNT_FRONT) {
+		return (ENOENT);
+	}
+
+	realdvp = REALVP(dvp);
+
+	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
+		doingdotdot++;
+		/*
+		 * Handle ".." out of mounted filesystem
+		 */
+		while ((realdvp->v_flag & VROOT) && realdvp != rootdir) {
+			realdvp = realdvp->v_vfsp->vfs_vnodecovered;
+			ASSERT(realdvp != NULL);
+		}
+	}
+
+	*vpp = NULL;	/* default(error) case */
+
+	/*
+	 * Do the normal lookup
+	 */
+	if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr,
+	    ct, direntflags, realpnp)) != 0) {
+		vp = NULL;
+		goto out;
+	}
+
+	/*
+	 * We do this check here to avoid returning a stale file handle to the
+	 * caller.
+	 */
+	if (nm[0] == '.' && nm[1] == '\0') {
+		ASSERT(vp == realdvp);
+		VN_HOLD(dvp);
+		VN_RELE(vp);
+		*vpp = dvp;
+		return (0);
+	}
+
+	if (doingdotdot) {
+		*vpp = lxd_make_back_node(vp, lxdm);
+		return (0);
+	}
+
+	/*
+	 * If this vnode is mounted on, then we
+	 * traverse to the vnode which is the root of
+	 * the mounted file system.
+	 */
+	if ((error = traverse(&vp)) != 0)
+		goto out;
+
+	/*
+	 * Make a lxd node for the real vnode.  From here on the hold on vp
+	 * obtained from the lookup belongs to the back node.
+	 */
+	*vpp = lxd_make_back_node(vp, lxdm);
+	if (vp->v_type != VDIR) {
+		if (IS_DEVVP(*vpp)) {
+			vnode_t *svp;
+
+			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+			VN_RELE(*vpp);
+			if (svp == NULL) {
+				/*
+				 * Releasing the back node above already gave
+				 * up the hold on vp, exactly as in the same
+				 * sequence in lxd_open() and lxd_create(), so
+				 * vp must not be released a second time here.
+				 * Don't hand the released node back either.
+				 */
+				*vpp = NULL;
+				error = ENOSYS;
+			} else {
+				*vpp = svp;
+			}
+		}
+		return (error);
+	}
+
+out:
+	if (error != 0 && vp != NULL)
+		VN_RELE(vp);
+
+	return (error);
+}
+
+/*
+ * Create entry point.
+ *
+ * If 'nm' already exists among the parent's front entries, the request is
+ * either passed to the real vnode (existing back node) or failed with an
+ * appropriate error (existing front node).  Otherwise, for a back parent,
+ * the create is passed to the real directory and the result is wrapped in a
+ * back node.  If that fails with ENOTSUP or EROFS, or the parent is a front
+ * node, a new front node is created only when the requested type is VSOCK;
+ * every other type fails with ENOTSUP.
+ */
+/*ARGSUSED*/
+static int
+lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
+    int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct,
+    vsecattr_t *vsecp)
+{
+	int error;
+	lxd_node_t *parent = VTOLDN(dvp);
+	lxd_node_t *lnp = NULL;
+
+	rw_enter(&parent->lxdn_rwlock, RW_READER);
+	error = lxd_dirlookup(parent, nm, &lnp, cr);
+	rw_exit(&parent->lxdn_rwlock);
+	/*
+	 * If this vnode already exists in lx devfs, we should pass the create
+	 * operation through to the underlying resource it represents.  For
+	 * existing back nodes, the VOP_CREATE is done directly against the
+	 * returned lxd node with an empty name (to avoid a redundant lookup).
+	 * For existing front nodes, an appropriate error must be chosen since
+	 * they cannot represent regular files
+	 */
+	if (error == 0) {
+		if (lnp->lxdn_type == LXDNT_BACK) {
+			error = VOP_CREATE(lnp->lxdn_real_vp, "\0", va,
+			    exclusive, mode, vpp, cr, flag, ct, vsecp);
+		} else {
+			if (exclusive == EXCL) {
+				error = EEXIST;
+			} else if (LDNTOV(lnp)->v_type == VDIR &&
+			    (mode & S_IWRITE)) {
+				error = EISDIR;
+			} else {
+				error = ENOTSUP;
+			}
+		}
+		/*
+		 * NOTE(review): lnp is only released on failure; on success
+		 * the reference obtained through lxd_dirlookup() is kept --
+		 * confirm that this is intended.
+		 */
+		if (error != 0) {
+			ldnode_rele(lnp);
+		}
+		return (error);
+	}
+
+	/*
+	 * We cannot create files in the back devfs but we want to allow for
+	 * O_CREAT on existing files.  Pass this through and let the back file
+	 * system allow or deny it.
+	 */
+	if (parent->lxdn_type == LXDNT_BACK) {
+		vnode_t *vp = NULL;
+
+		if (*nm == '\0') {
+			ASSERT(vpp && dvp == *vpp);
+			vp = REALVP(*vpp);
+		}
+		if ((error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode,
+		    &vp, cr, flag, ct, vsecp)) == 0) {
+			*vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp));
+			if (IS_DEVVP(*vpp)) {
+				vnode_t *svp;
+
+				/* devices are handed out as specfs vnodes */
+				svp = specvp(*vpp, (*vpp)->v_rdev,
+				    (*vpp)->v_type, cr);
+				VN_RELE(*vpp);
+				if (svp == NULL) {
+					return (ENOSYS);
+				}
+				*vpp = svp;
+			}
+			return (0);
+		}
+		/*
+		 * If we were unable to perform the VOP_CREATE for any reason
+		 * other than sdev being read-only, we should bail.
+		 */
+		if (error != ENOTSUP && error != EROFS) {
+			return (error);
+		}
+	}
+
+	/*
+	 * While we don't allow create data-containing files under LX devfs, we
+	 * must allow VSOCK front nodes to be created so that paths such as
+	 * /dev/log can be used as AF_UNIX sockets.
+	 */
+	if (va->va_type == VSOCK) {
+		lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode);
+
+		lnp = NULL;
+		rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+		error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL,
+		    va, &lnp, cr, ct);
+		rw_exit(&parent->lxdn_rwlock);
+
+		if (error == 0) {
+			*vpp = LDNTOV(lnp);
+		} else if (lnp != NULL) {
+			/*
+			 * It's possible that a racing process created an entry
+			 * at this name since we last performed the lookup.
+			 */
+			ldnode_rele(lnp);
+		}
+	} else {
+		error = ENOTSUP;
+	}
+
+	return (error);
+}
+
+/*
+ * Remove the entry 'nm' from directory 'dvp'.  Only entries present among
+ * the directory's front entries can be removed; anything else fails with the
+ * error returned by lxd_dirlookup().
+ */
+static int
+lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct,
+    int flags)
+{
+	lxd_node_t *dir = VTOLDN(dvp);
+	lxd_node_t *victim = NULL;
+	int error;
+
+	/* can only remove existing front nodes */
+	if ((error = lxd_dirlookup(dir, nm, &victim, cr)) != 0)
+		return (error);
+
+	ASSERT(victim != NULL);
+	ASSERT(victim->lxdn_type == LXDNT_FRONT);
+
+	/* lock the directory first, then the node being removed */
+	rw_enter(&dir->lxdn_rwlock, RW_WRITER);
+	rw_enter(&victim->lxdn_rwlock, RW_WRITER);
+	error = lxd_dirdelete(dir, victim, nm, DR_REMOVE, cr);
+	rw_exit(&victim->lxdn_rwlock);
+	rw_exit(&dir->lxdn_rwlock);
+
+	/* drop the reference obtained by the lookup */
+	ldnode_rele(victim);
+
+	return (error);
+}
+
+/*
+ * Hard links are not supported; always fails with ENOTSUP.
+ */
+/* ARGSUSED */
+static int
+lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr,
+    caller_context_t *ct, int flags)
+{
+	int error = ENOTSUP;
+
+	return (error);
+}
+
+/*
+ * Rename the front entry 'onm' in directory 'odvp' to 'nnm' in directory
+ * 'ndvp'.
+ *
+ * The target directory must belong to this file system (EACCES otherwise)
+ * and the source must be found by lxd_dirlookup() in the old parent.
+ * Renames to or from "." and ".." fail with EINVAL.  The whole operation is
+ * serialized by the mount's lxdm_renamelck: the source is first entered
+ * under the new name and then deleted from the old parent.
+ */
+static int
+lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr,
+    caller_context_t *ct, int flags)
+{
+	lxd_node_t *oldparent = VTOLDN(odvp);
+	lxd_node_t *newparent;
+	lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode);
+	lxd_node_t *fromnode = NULL;
+	int error;
+	int samedir = 0;
+
+	if (!vn_matchops(ndvp, lxd_vnodeops)) {
+		/* cannot rename out of this file system */
+		return (EACCES);
+	}
+
+	mutex_enter(&lxdm->lxdm_renamelck);
+
+	newparent = VTOLDN(ndvp);
+
+	/*
+	 * We can only rename front nodes.
+	 */
+	error = lxd_dirlookup(oldparent, onm, &fromnode, cr);
+	if (error != 0) {
+		/* not found in front */
+		mutex_exit(&lxdm->lxdm_renamelck);
+		return (error);
+	}
+
+	/*
+	 * Make sure we can delete the old (source) entry.  This
+	 * requires write permission on the containing directory.  If
+	 * that directory is "sticky" it requires further checks.
+	 *
+	 * NOTE(review): lxd_naccess() is called here without lxdn_tlock,
+	 * whereas lxd_access() takes lxdn_tlock around it -- confirm.
+	 */
+	if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0)
+		goto done;
+
+	/*
+	 * Check for renaming to or from '.' or '..' or that
+	 * fromnode == oldparent
+	 */
+	if ((onm[0] == '.' &&
+	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
+	    (nnm[0] == '.' &&
+	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
+	    (oldparent == fromnode)) {
+		error = EINVAL;
+		goto done;
+	}
+
+	samedir = (oldparent == newparent);
+
+	/*
+	 * Make sure we can search and rename into the destination directory.
+	 */
+	if (!samedir) {
+		if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0)
+			goto done;
+	}
+
+	/*
+	 * Link source to new target
+	 */
+	rw_enter(&newparent->lxdn_rwlock, RW_WRITER);
+	error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME,
+	    oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL,
+	    cr, ct);
+	rw_exit(&newparent->lxdn_rwlock);
+
+	if (error)
+		goto done;
+
+	/*
+	 * Unlink from source.
+	 */
+	rw_enter(&oldparent->lxdn_rwlock, RW_WRITER);
+	rw_enter(&fromnode->lxdn_rwlock, RW_WRITER);
+
+	error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr);
+
+	/*
+	 * The following handles the case where our source node was
+	 * removed before we got to it.
+	 */
+	if (error == ENOENT)
+		error = 0;
+
+	rw_exit(&fromnode->lxdn_rwlock);
+	rw_exit(&oldparent->lxdn_rwlock);
+
+done:
+	/* drop the reference from the lookup and end the rename */
+	ldnode_rele(fromnode);
+	mutex_exit(&lxdm->lxdm_renamelck);
+	return (error);
+}
+
+/*
+ * Create a front directory named 'nm' in 'dvp'.  Fails with EEXIST when the
+ * name already resolves in either the front or the back; on success the new
+ * directory's vnode is returned in *vpp.
+ */
+static int
+lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
+    struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+	lxd_node_t *dir = VTOLDN(dvp);
+	lxd_mnt_t *mnt = VTOLXDM(dir->lxdn_vnode);
+	lxd_node_t *newdir = NULL;
+	vnode_t *existing;
+	int error;
+
+	/* check for existence in both front and back */
+	error = lxd_lookup(dvp, nm, &existing, NULL, 0, NULL, cr, ct, NULL,
+	    NULL);
+	if (error == 0) {
+		/* The entry already exists */
+		VN_RELE(existing);
+		return (EEXIST);
+	}
+
+	/* make front directory */
+	rw_enter(&dir->lxdn_rwlock, RW_WRITER);
+	error = lxd_direnter(mnt, dir, nm, DE_MKDIR, NULL, NULL, va, &newdir,
+	    cr, ct);
+	rw_exit(&dir->lxdn_rwlock);
+
+	if (error == 0) {
+		*vpp = LDNTOV(newdir);
+		return (0);
+	}
+
+	/* a node may have been handed back even though the enter failed */
+	if (newdir != NULL)
+		ldnode_rele(newdir);
+
+	return (error);
+}
+
+/*
+ * Return the real vnode behind 'vp' in *vpp.  A front node is its own real
+ * vnode.  For a back node, all lxd layers are peeled off and the underlying
+ * file system is asked for its real vnode; if it cannot supply one, the
+ * underlying vnode itself is returned.  Always returns 0.
+ */
+static int
+lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	vnode_t *under;
+
+	if (node->lxdn_type == LXDNT_FRONT) {
+		*vpp = vp;
+		return (0);
+	}
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	for (under = vp; vn_matchops(under, lxd_vnodeops); )
+		under = REALVP(under);
+
+	if (VOP_REALVP(under, vpp, ct) != 0)
+		*vpp = under;
+
+	return (0);
+}
+
+/*
+ * Remove the front directory 'nm' from 'dvp'.  The target must be found by
+ * lxd_dirlookup(), be a directory other than dvp or cdir, be empty, and not
+ * be mounted on.
+ */
+static int
+lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr,
+    caller_context_t *ct, int flags)
+{
+	lxd_node_t *dir = VTOLDN(dvp);
+	lxd_node_t *victim;
+	struct vnode *victimvp;
+	int error;
+
+	/*
+	 * Return error if trying to remove . or ..
+	 */
+	if (strcmp(nm, ".") == 0)
+		return (EINVAL);
+	if (strcmp(nm, "..") == 0)
+		return (EEXIST);
+
+	if ((error = lxd_dirlookup(dir, nm, &victim, cr)) != 0) {
+		/* not found in front */
+		return (error);
+	}
+
+	rw_enter(&dir->lxdn_rwlock, RW_WRITER);
+	rw_enter(&victim->lxdn_rwlock, RW_WRITER);
+
+	victimvp = LDNTOV(victim);
+	if (victimvp == dvp || victimvp == cdir) {
+		error = EINVAL;
+		goto unlock;
+	}
+
+	if (victim->lxdn_vnode->v_type != VDIR) {
+		error = ENOTDIR;
+		goto unlock;
+	}
+
+	/* more than two links means there are subdirectories */
+	mutex_enter(&victim->lxdn_tlock);
+	if (victim->lxdn_nlink > 2) {
+		mutex_exit(&victim->lxdn_tlock);
+		error = EEXIST;
+		goto unlock;
+	}
+	mutex_exit(&victim->lxdn_tlock);
+
+	/* Check for an empty directory */
+	if (victim->lxdn_dirents > 2) {
+		error = EEXIST;
+		gethrestime(&victim->lxdn_atime);
+		goto unlock;
+	}
+
+	/* refuse to remove a directory that is busy or mounted on */
+	if (vn_vfswlock(victimvp)) {
+		error = EBUSY;
+		goto unlock;
+	}
+	if (vn_mountedvfs(victimvp) != NULL) {
+		error = EBUSY;
+		vn_vfsunlock(victimvp);
+		goto unlock;
+	}
+
+	error = lxd_dirdelete(dir, victim, nm, DR_RMDIR, cr);
+	vn_vfsunlock(victimvp);
+
+unlock:
+	rw_exit(&victim->lxdn_rwlock);
+	rw_exit(&dir->lxdn_rwlock);
+	ldnode_rele(victim);
+
+	return (error);
+}
+
+/*
+ * Create a front symbolic link named 'nm' in directory 'dvp' whose target is
+ * the string 'tnm'.
+ *
+ * Returns EEXIST if the name already resolves in the front or the back,
+ * ENOSPC if memory for the target string cannot be allocated, or the error
+ * from lxd_direnter().
+ *
+ * The copy of the target is allocated *before* the directory entry is
+ * created, and is attached to the new node while the parent's write lock is
+ * still held.  An allocation failure therefore can no longer leave behind a
+ * directory entry for a symlink that has no target.
+ *
+ * Not static so it can be used during mount.
+ */
+int
+lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm,
+    struct cred *cr, caller_context_t *ct, int flags)
+{
+	lxd_node_t *parent = VTOLDN(dvp);
+	lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode);
+	lxd_node_t *self = NULL;
+	vnode_t *tvp;
+	char *cp;
+	int error;
+	size_t len;
+
+	/* this will check for existence in both front and back */
+	if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) {
+		/* The entry already exists */
+		VN_RELE(tvp);
+		return (EEXIST);
+	}
+
+	/* copy the target up front so that failure has nothing to undo */
+	len = strlen(tnm) + 1;
+	cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI);
+	if (cp == NULL)
+		return (ENOSPC);
+	(void) strcpy(cp, tnm);
+
+	/* make symlink in the front */
+	rw_enter(&parent->lxdn_rwlock, RW_WRITER);
+	error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL,
+	    tva, &self, cr, ct);
+	if (error == 0) {
+		/* lxdn_size excludes the terminating NUL */
+		self->lxdn_symlink = cp;
+		self->lxdn_size = len - 1;
+	}
+	rw_exit(&parent->lxdn_rwlock);
+
+	if (error != 0) {
+		kmem_free(cp, len);
+		if (self != NULL)
+			ldnode_rele(self);
+		return (error);
+	}
+
+	ldnode_rele(self);
+	return (0);
+}
+
+/*
+ * Read the target of a symbolic link.  Back nodes forward the request to the
+ * underlying vnode.  Front nodes must be of type VLNK (EINVAL otherwise);
+ * the stored target is copied out and the access time is updated.
+ */
+static int
+lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	int error;
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		return (VOP_READLINK(REALVP(vp), uiop, cr, ct));
+	}
+
+	if (vp->v_type != VLNK)
+		return (EINVAL);
+
+	rw_enter(&node->lxdn_rwlock, RW_READER);
+	error = uiomove(node->lxdn_symlink, node->lxdn_size, UIO_READ, uiop);
+	gethrestime(&node->lxdn_atime);
+	rw_exit(&node->lxdn_rwlock);
+
+	return (error);
+}
+
+/*
+ * Append the front directory entries of 'vp' to the readdir output in
+ * 'uiop'.
+ *
+ * 'req_off' is the directory offset the caller originally asked to resume
+ * from and '*eofp' is the EOF state after any back read.  Nothing is done
+ * when *eofp is 0.  Front entries are numbered consecutively starting at
+ * uio_offset + 1; entries whose number does not exceed req_off are counted
+ * but not copied out.  *eofp is cleared when an entry does not fit or the
+ * copy-out fails.  Returns 0 or an errno value.
+ *
+ * No lock is taken here on the node's entry list.
+ */
+static int
+lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp)
+{
+	lxd_node_t *ldn = VTOLDN(vp);
+	struct dirent *sd;
+	lxd_dirent_t *ldp;
+	enum lxd_node_type type = ldn->lxdn_type;
+	ssize_t uresid;
+	off_t front_off;
+	int error = 0;
+	int sdlen;
+
+	/* skip the front entries if the back read was incomplete */
+	if (*eofp == 0)
+		return (0);
+
+	/*
+	 * If this was a back node then reading that node has completed and we
+	 * may have a partially full uio struct. eof should be set to true.
+	 * Leave it set since we're likely to hit eof for the front nodes (if
+	 * any).
+	 */
+
+	front_off = uiop->uio_offset + 1;
+	/*
+	 * NOTE(review): the staging record is declared as struct dirent but
+	 * laid out with the DIRENT64_* macros below -- presumably the two
+	 * layouts are identical in the kernel; confirm.
+	 */
+	sdlen = sizeof (struct dirent) + MAXPATHLEN;
+	/* zalloc to ensure we don't have anything in the d_name buffer */
+	sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP);
+	ldp = ldn->lxdn_dir;
+	while (ldp != NULL && (uresid = uiop->uio_resid) > 0) {
+		int namelen;
+		int reclen;
+
+		/*
+		 * Skip dot and dotdot for back nodes since we have them
+		 * already.
+		 */
+		if (type == LXDNT_BACK &&
+		    (strcmp(ldp->lddir_name, ".") == 0 ||
+		    strcmp(ldp->lddir_name, "..") == 0)) {
+			ldp = ldp->lddir_next;
+			continue;
+		}
+
+		/*
+		 * Might have previously had a partial readdir of the front
+		 * nodes, and now we're back for more, or we may just be
+		 * doing a follow-up readdir after we've previously
+		 * returned all front and back nodes.
+		 */
+		if (front_off > req_off) {
+			namelen = strlen(ldp->lddir_name); /* no +1 needed */
+			reclen = (int)DIRENT64_RECLEN(namelen);
+
+			/*
+			 * If the size of the data to transfer is greater
+			 * than that requested, then we can't do it this
+			 * transfer.
+			 */
+			if (reclen > uresid) {
+				*eofp = 0;
+				/*
+				 * Buffer too small for any entries.
+				 *
+				 * NOTE(review): front_off starts out as
+				 * uio_offset + 1 and only grows, so this
+				 * test looks unreachable for non-negative
+				 * offsets -- confirm.
+				 */
+				if (front_off == 0)
+					error = EINVAL;
+				break;
+			}
+
+			/* strncpy zero-fills the rest of the name area */
+			(void) strncpy(sd->d_name, ldp->lddir_name,
+			    DIRENT64_NAMELEN(reclen));
+			sd->d_reclen = (ushort_t)reclen;
+			sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid;
+			sd->d_off = front_off;
+
+			/* uiomove will adjust iov_base properly */
+			if ((error = uiomove((caddr_t)sd, reclen, UIO_READ,
+			    uiop)) != 0) {
+				*eofp = 0;
+				break;
+			}
+		}
+
+		/*
+		 * uiomove() above updates both uio_resid and uio_offset by the
+		 * same amount but we want uio_offset to change in increments
+		 * of 1, which is different from the number of bytes being
+		 * returned to the caller, so we set uio_offset explicitly,
+		 * ignoring what uiomove() did.
+		 */
+		uiop->uio_offset = front_off;
+		front_off++;
+
+		ldp = ldp->lddir_next;
+	}
+
+	kmem_free(sd, sdlen);
+	return (error);
+}
+
+/*
+ * Read directory entries from 'vp' into 'uiop'.
+ *
+ * For a back node the underlying directory is read first and the node's
+ * front entries are merged in afterwards; a front node only has front
+ * entries.  '*eofp' is set to reflect whether the end of the directory was
+ * reached.
+ *
+ * Returns EINVAL if the uio has more than one iovec or if the buffer is too
+ * small to hold even a single entry, ENOTDIR if vp is not a directory, or
+ * the error from the back read or the merge.
+ */
+static int
+lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	lxd_node_t *ldn = VTOLDN(vp);
+	vnode_t *rvp;
+	ssize_t start_resid;
+	off_t req_off;
+	int pre_merge_eof;
+	int res;
+
+	if (uiop->uio_iovcnt != 1)
+		return (EINVAL);
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	req_off = uiop->uio_offset;
+	start_resid = uiop->uio_resid;
+
+	/* First read the back node (if it is one) */
+	if (ldn->lxdn_type == LXDNT_BACK) {
+		rvp = REALVP(vp);
+		res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags);
+		if (res != 0)
+			return (res);
+	} else {
+		/* setup for merge_front */
+		ASSERT(ldn->lxdn_type == LXDNT_FRONT);
+		/* caller should have already called lxd_rwlock */
+		ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock));
+
+		*eofp = 1;
+		/*
+		 * The merge code starts the offset calculation from uio_offset,
+		 * which is normally already set to the high value by the back
+		 * code, but in this case we need to count up from 0.
+		 */
+		uiop->uio_offset = 0;
+	}
+
+	/*
+	 * Our back nodes can also have front entries hanging on them so we
+	 * need to merge those in. Or, we may simply have a front node (i.e. a
+	 * front subdir).
+	 */
+	pre_merge_eof = *eofp;
+	res = lx_merge_front(vp, uiop, req_off, eofp);
+
+	/*
+	 * The merge clears *eofp without reporting an error only when the
+	 * next front entry does not fit in the buffer.  If nothing at all
+	 * was transferred by this call, returning success with zero bytes
+	 * would look like end-of-directory to the caller and the remaining
+	 * entries would be lost; report the undersized buffer instead.
+	 */
+	if (res == 0 && pre_merge_eof != 0 && *eofp == 0 &&
+	    uiop->uio_resid == start_resid)
+		res = EINVAL;
+
+	return (res);
+}
+
+/*
+ * Take the read/write lock for a node.  Front nodes use the node's own
+ * lxdn_rwlock and return write_lock unchanged; back nodes forward the
+ * request to the underlying vnode.
+ */
+static int
+lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		return (VOP_RWLOCK(REALVP(vp), write_lock, ct));
+	}
+
+	rw_enter(&node->lxdn_rwlock, write_lock ? RW_WRITER : RW_READER);
+	return (write_lock);
+}
+
+/*
+ * Release the lock taken by lxd_rwlock(): the node's own lxdn_rwlock for
+ * front nodes, the underlying vnode's lock for back nodes.
+ */
+static void
+lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type != LXDNT_FRONT) {
+		ASSERT(node->lxdn_type == LXDNT_BACK);
+		VOP_RWUNLOCK(REALVP(vp), write_lock, ct);
+		return;
+	}
+
+	rw_exit(&node->lxdn_rwlock);
+}
+
+/*
+ * Validate a seek.  For front nodes the new offset must lie within
+ * [0, MAXOFFSET_T]; back nodes forward the request to the underlying vnode.
+ */
+static int
+lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT) {
+		if (*noffp < 0 || *noffp > MAXOFFSET_T)
+			return (EINVAL);
+		return (0);
+	}
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_SEEK(REALVP(vp), ooff, noffp, ct));
+}
+
+/*
+ * Compare two vnodes.  Any lxd back-node layers are first peeled off both
+ * sides.  If either side is then still an lxd vnode (i.e. a front node) the
+ * result is plain pointer equality; otherwise the comparison is left to the
+ * underlying file system.
+ */
+static int
+lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+	for (;;) {
+		if (!vn_matchops(vp1, lxd_vnodeops))
+			break;
+		if (VTOLDN(vp1)->lxdn_type != LXDNT_BACK)
+			break;
+		vp1 = REALVP(vp1);
+	}
+
+	for (;;) {
+		if (!vn_matchops(vp2, lxd_vnodeops))
+			break;
+		if (VTOLDN(vp2)->lxdn_type != LXDNT_BACK)
+			break;
+		vp2 = REALVP(vp2);
+	}
+
+	if (!vn_matchops(vp1, lxd_vnodeops) &&
+	    !vn_matchops(vp2, lxd_vnodeops))
+		return (VOP_CMP(vp1, vp2, ct));
+
+	return (vp1 == vp2);
+}
+
+/*
+ * File and record locking.  Front nodes fail with EINVAL; back nodes forward
+ * the request to the underlying vnode.
+ */
+static int
+lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
+    struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_FRLOCK(REALVP(vp), cmd, bfp, flag, offset, flk_cbp, cr,
+	    ct));
+}
+
+/*
+ * Allocate or free file space.  Front nodes fail with EINVAL; back nodes
+ * forward the request to the underlying vnode.
+ */
+static int
+lxd_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
+    struct cred *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct));
+}
+
+/*
+ * Get pages.  Front nodes fail with EINVAL; back nodes forward the request
+ * to the underlying vnode.
+ */
+static int
+lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot,
+    struct page *parr[], size_t psz, struct seg *seg, caddr_t addr,
+    enum seg_rw rw, struct cred *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_GETPAGE(REALVP(vp), off, len, prot, parr, psz, seg, addr,
+	    rw, cr, ct));
+}
+
+/*
+ * Put pages.  Front nodes fail with EINVAL; back nodes forward the request
+ * to the underlying vnode.
+ */
+static int
+lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct));
+}
+
+/*
+ * Map a node into an address space.  Front nodes fail with EINVAL; back
+ * nodes forward the request to the underlying vnode.
+ */
+static int
+lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len,
+    uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags,
+	    cr, ct));
+}
+
+/*
+ * Note a new mapping.  Front nodes fail with EINVAL; back nodes forward the
+ * request to the underlying vnode.
+ */
+static int
+lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
+    uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+	    flags, cr, ct));
+}
+
+/*
+ * Note the removal of a mapping.  Front nodes fail with EINVAL; back nodes
+ * forward the request to the underlying vnode.
+ */
+static int
+lxd_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len,
+    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+	    flags, cr, ct));
+}
+
+/*
+ * Poll a node.  Front nodes fail with EINVAL; back nodes forward the request
+ * to the underlying vnode.
+ */
+static int
+lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_POLL(REALVP(vp), events, anyyet, reventsp, phpp, ct));
+}
+
+/*
+ * VOP_DUMP entry point.  LXDNT_FRONT nodes are rejected with EINVAL;
+ * LXDNT_BACK nodes forward the request to the vnode given by REALVP().
+ */
+static int
+lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_DUMP(REALVP(vp), addr, bn, count, ct));
+}
+
+/*
+ * VOP_PATHCONF entry point.  LXDNT_FRONT nodes are rejected with EINVAL;
+ * LXDNT_BACK nodes forward the request to the vnode given by REALVP().
+ */
+static int
+lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct));
+}
+
+/*
+ * VOP_PAGEIO entry point.  LXDNT_FRONT nodes are rejected with EINVAL;
+ * LXDNT_BACK nodes forward the request to the vnode given by REALVP().
+ */
+static int
+lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len,
+    int flags, cred_t *cr, caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_PAGEIO(REALVP(vp), pp, io_off, io_len, flags, cr, ct));
+}
+
+/*
+ * VOP_DISPOSE entry point.  Nothing is done for LXDNT_FRONT nodes.  For
+ * LXDNT_BACK nodes the page is handed to the vnode given by REALVP(), unless
+ * that vnode is NULL or satisfies VN_ISKAS().
+ */
+static void
+lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+	vnode_t *realvp;
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return;
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	realvp = REALVP(vp);
+	if (realvp == NULL || VN_ISKAS(realvp))
+		return;
+
+	VOP_DISPOSE(realvp, pp, fl, dn, cr, ct);
+}
+
+/*
+ * VOP_SETSECATTR entry point.  LXDNT_FRONT nodes return ENOSYS.  For
+ * LXDNT_BACK nodes the lxd vnode itself is checked with vn_is_readonly()
+ * (EROFS if set) before the request is forwarded to the REALVP() vnode.
+ */
+static int
+lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOSYS);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	if (vn_is_readonly(vp)) {
+		return (EROFS);
+	}
+
+	return (VOP_SETSECATTR(REALVP(vp), secattr, flags, cr, ct));
+}
+
+/*
+ * VOP_GETSECATTR entry point.  LXDNT_FRONT nodes return ENOSYS;
+ * LXDNT_BACK nodes forward the request to the vnode given by REALVP().
+ */
+static int
+lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (ENOSYS);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_GETSECATTR(REALVP(vp), secattr, flags, cr, ct));
+}
+
+/*
+ * VOP_SHRLOCK entry point.  LXDNT_FRONT nodes are rejected with EINVAL;
+ * LXDNT_BACK nodes forward the request to the vnode given by REALVP().
+ */
+static int
+lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxd_node_t *node = VTOLDN(vp);
+
+	if (node->lxdn_type == LXDNT_FRONT)
+		return (EINVAL);
+
+	ASSERT(node->lxdn_type == LXDNT_BACK);
+	return (VOP_SHRLOCK(REALVP(vp), cmd, shr, flag, cr, ct));
+}
+
+/*
+ * lxd vnode operations vector.
+ */
+
+struct vnodeops *lxd_vnodeops;
+
+/*
+ * Template pairing each vnode operation name with its lxd_* handler.  The
+ * one operation without an lxd handler, VOPNAME_DUMPCTL, is wired to
+ * fs_error.  The list ends with a NULL/NULL pair.
+ */
+const fs_operation_def_t lxd_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = lxd_open },
+	VOPNAME_CLOSE,		{ .vop_close = lxd_close },
+	VOPNAME_READ,		{ .vop_read = lxd_read },
+	VOPNAME_WRITE,		{ .vop_write = lxd_write },
+	VOPNAME_IOCTL,		{ .vop_ioctl = lxd_ioctl },
+	VOPNAME_SETFL,		{ .vop_setfl = lxd_setfl },
+	VOPNAME_GETATTR,	{ .vop_getattr = lxd_getattr },
+	VOPNAME_SETATTR,	{ .vop_setattr = lxd_setattr },
+	VOPNAME_ACCESS,		{ .vop_access = lxd_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = lxd_lookup },
+	VOPNAME_CREATE,		{ .vop_create = lxd_create },
+	VOPNAME_REMOVE,		{ .vop_remove = lxd_remove },
+	VOPNAME_LINK,		{ .vop_link = lxd_link },
+	VOPNAME_RENAME,		{ .vop_rename = lxd_rename },
+	VOPNAME_MKDIR,		{ .vop_mkdir = lxd_mkdir },
+	VOPNAME_RMDIR,		{ .vop_rmdir = lxd_rmdir },
+	VOPNAME_READDIR,	{ .vop_readdir = lxd_readdir },
+	VOPNAME_SYMLINK,	{ .vop_symlink = lxd_symlink },
+	VOPNAME_READLINK,	{ .vop_readlink = lxd_readlink },
+	VOPNAME_FSYNC,		{ .vop_fsync = lxd_fsync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = lxd_inactive },
+	VOPNAME_FID,		{ .vop_fid = lxd_fid },
+	VOPNAME_RWLOCK,		{ .vop_rwlock = lxd_rwlock },
+	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = lxd_rwunlock },
+	VOPNAME_SEEK,		{ .vop_seek = lxd_seek },
+	VOPNAME_CMP,		{ .vop_cmp = lxd_cmp },
+	VOPNAME_FRLOCK,		{ .vop_frlock = lxd_frlock },
+	VOPNAME_SPACE,		{ .vop_space = lxd_space },
+	VOPNAME_REALVP,		{ .vop_realvp = lxd_realvp },
+	VOPNAME_GETPAGE,	{ .vop_getpage = lxd_getpage },
+	VOPNAME_PUTPAGE,	{ .vop_putpage = lxd_putpage },
+	VOPNAME_MAP,		{ .vop_map = lxd_map },
+	VOPNAME_ADDMAP,		{ .vop_addmap = lxd_addmap },
+	VOPNAME_DELMAP,		{ .vop_delmap = lxd_delmap },
+	VOPNAME_POLL,		{ .vop_poll = lxd_poll },
+	VOPNAME_DUMP,		{ .vop_dump = lxd_dump },
+	VOPNAME_DUMPCTL,	{ .error = fs_error },
+	VOPNAME_PATHCONF,	{ .vop_pathconf = lxd_pathconf },
+	VOPNAME_PAGEIO,		{ .vop_pageio = lxd_pageio },
+	VOPNAME_DISPOSE,	{ .vop_dispose = lxd_dispose },
+	VOPNAME_SETSECATTR,	{ .vop_setsecattr = lxd_setsecattr },
+	VOPNAME_GETSECATTR,	{ .vop_getsecattr = lxd_getsecattr },
+	VOPNAME_SHRLOCK,	{ .vop_shrlock = lxd_shrlock },
+	NULL,			NULL
+};
diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
new file mode 100644
index 0000000000..510626d220
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c
@@ -0,0 +1,497 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stat.h>
+#include <sys/conf.h>
+#include <sys/frame.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+
+/*
+ * We store the syscall number in the low 16 bits (which limits us to 64k
+ * syscalls). Bit 16 (ENTRY_FLAG) distinguishes entry from return probes and
+ * bit 20 (SYSC_64_BIT) distinguishes 64-bit from 32-bit syscalls.
+ */
+#define	SCALL_MASK	0xffff
+#define	ENTRY_FLAG	0x10000
+#define	SYSC_64_BIT	0x100000
+
+/* Decode a probe cookie (the parg handed back by DTrace). */
+#define	LX_SYSTRACE_IS64BIT(x)	((int)(x) & SYSC_64_BIT)
+#define	LX_SYSTRACE_ISENTRY(x)	((int)(x) & ENTRY_FLAG)
+#define	LX_SYSTRACE_SYSNUM(x)	((int)(x) & SCALL_MASK)
+
+/* Encode a probe cookie from a syscall table index. */
+#define	LX_SYSTRACE32_ENTRY(id)	(ENTRY_FLAG | (id))
+#define	LX_SYSTRACE32_RETURN(id) (id)
+
+#define	LX_SYSTRACE64_ENTRY(id)	(SYSC_64_BIT | ENTRY_FLAG | (id))
+#define	LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | (id))
+
+/* Artificial frame counts passed to dtrace_probe_create(). */
+#define	LX_SYSTRACE_ENTRY_AFRAMES	2
+#define	LX_SYSTRACE_RETURN_AFRAMES	4
+
+/*
+ * Per-syscall probe bookkeeping: the syscall name (used as the probe's
+ * function name) and the DTrace ids of its entry and return probes.  An id
+ * is DTRACE_IDNONE whenever the corresponding probe is not enabled.
+ */
+typedef struct lx_systrace_sysent {
+	const char *lss_name;
+	dtrace_id_t lss_entry;
+	dtrace_id_t lss_return;
+} lx_systrace_sysent_t;
+
+/*
+ * Driver-wide state.  lx_systrace_lock guards lx_systrace_nenabled, the
+ * number of currently enabled probes; the brand's syscall tracing hook is
+ * switched on when that count leaves zero and off when it returns to zero.
+ */
+static dev_info_t *lx_systrace_devi;
+static dtrace_provider_id_t lx_systrace_id;
+static kmutex_t lx_systrace_lock;
+static uint_t lx_systrace_nenabled;
+
+/* 32-bit syscall probe table and its length; allocated in attach. */
+static int lx_systrace_nsysent32;
+static lx_systrace_sysent_t *lx_systrace_sysent32;
+
+#if defined(_LP64)
+/* 64-bit syscall probe table and its length; allocated in attach. */
+static int lx_systrace_nsysent64;
+static lx_systrace_sysent_t *lx_systrace_sysent64;
+#endif
+
+/*
+ * Entry-probe trigger, installed as lx_systrace_entry_ptr by attach.  Picks
+ * the table matching the current process' data model, ignores out-of-range
+ * syscall numbers and disabled probes, and otherwise fires the entry probe
+ * with the first five syscall arguments.
+ */
+/*ARGSUSED*/
+static void
+lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2,
+    ulong_t arg3, ulong_t arg4, ulong_t arg5)
+{
+	dtrace_id_t probe;
+
+#if defined(_LP64)
+	if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) {
+		if (sysnum >= lx_systrace_nsysent64) {
+			return;
+		}
+		probe = lx_systrace_sysent64[sysnum].lss_entry;
+	} else
+#endif
+	{
+		if (sysnum >= lx_systrace_nsysent32) {
+			return;
+		}
+		probe = lx_systrace_sysent32[sysnum].lss_entry;
+	}
+
+	if (probe != DTRACE_IDNONE) {
+		dtrace_probe(probe, arg0, arg1, arg2, arg3, arg4);
+	}
+}
+
+/*
+ * Return-probe trigger, installed as lx_systrace_return_ptr by attach.
+ * Picks the table matching the current process' data model, ignores
+ * out-of-range syscall numbers and disabled probes, and otherwise fires the
+ * return probe with the first five arguments it was given.
+ */
+/*ARGSUSED*/
+static void
+lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2,
+    ulong_t arg3, ulong_t arg4, ulong_t arg5)
+{
+	dtrace_id_t probe;
+
+#if defined(_LP64)
+	if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) {
+		if (sysnum >= lx_systrace_nsysent64) {
+			return;
+		}
+		probe = lx_systrace_sysent64[sysnum].lss_return;
+	} else
+#endif
+	{
+		if (sysnum >= lx_systrace_nsysent32) {
+			return;
+		}
+		probe = lx_systrace_sysent32[sysnum].lss_return;
+	}
+
+	if (probe != DTRACE_IDNONE) {
+		dtrace_probe(probe, arg0, arg1, arg2, arg3, arg4);
+	}
+}
+
+/*
+ * Provide callback.  Only acts on a NULL probe description: for every table
+ * slot whose "entry" probe does not exist yet, create the entry and return
+ * probes (module "sys32" or "sys64", function = syscall name) and mark both
+ * as not enabled.  The probe cookie encodes the table index, the
+ * entry/return flag and the 64-bit flag.
+ */
+/*ARGSUSED*/
+static void
+lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc)
+{
+	lx_systrace_sysent_t *ent;
+	int sysnum;
+
+	if (desc != NULL)
+		return;
+
+	for (sysnum = 0; sysnum < lx_systrace_nsysent32; sysnum++) {
+		ent = &lx_systrace_sysent32[sysnum];
+
+		if (dtrace_probe_lookup(lx_systrace_id, "sys32",
+		    ent->lss_name, "entry") != 0)
+			continue;
+
+		(void) dtrace_probe_create(lx_systrace_id, "sys32",
+		    ent->lss_name, "entry", LX_SYSTRACE_ENTRY_AFRAMES,
+		    (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(sysnum)));
+
+		(void) dtrace_probe_create(lx_systrace_id, "sys32",
+		    ent->lss_name, "return", LX_SYSTRACE_RETURN_AFRAMES,
+		    (void *)((uintptr_t)LX_SYSTRACE32_RETURN(sysnum)));
+
+		ent->lss_entry = DTRACE_IDNONE;
+		ent->lss_return = DTRACE_IDNONE;
+	}
+
+#if defined(_LP64)
+	for (sysnum = 0; sysnum < lx_systrace_nsysent64; sysnum++) {
+		ent = &lx_systrace_sysent64[sysnum];
+
+		if (dtrace_probe_lookup(lx_systrace_id, "sys64",
+		    ent->lss_name, "entry") != 0)
+			continue;
+
+		(void) dtrace_probe_create(lx_systrace_id, "sys64",
+		    ent->lss_name, "entry", LX_SYSTRACE_ENTRY_AFRAMES,
+		    (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(sysnum)));
+
+		(void) dtrace_probe_create(lx_systrace_id, "sys64",
+		    ent->lss_name, "return", LX_SYSTRACE_RETURN_AFRAMES,
+		    (void *)((uintptr_t)LX_SYSTRACE64_RETURN(sysnum)));
+
+		ent->lss_entry = DTRACE_IDNONE;
+		ent->lss_return = DTRACE_IDNONE;
+	}
+#endif
+}
+
+/*
+ * Enable callback.  Bumps the enabled-probe count (turning on the brand's
+ * syscall tracing hook on the 0 -> 1 transition) and then records the probe
+ * id in the table slot selected by the cookie in parg.  Always returns 0.
+ */
+/*ARGSUSED*/
+static int
+lx_systrace_enable(void *arg, dtrace_id_t id, void *parg)
+{
+	uintptr_t cookie = (uintptr_t)parg;
+	int sysnum = LX_SYSTRACE_SYSNUM(cookie);
+	int is_entry = (LX_SYSTRACE_ISENTRY(cookie) != 0);
+
+	mutex_enter(&lx_systrace_lock);
+	if (lx_systrace_nenabled++ == 0)
+		lx_brand_systrace_enable();
+	mutex_exit(&lx_systrace_lock);
+
+	if (!LX_SYSTRACE_IS64BIT(cookie)) {
+		ASSERT(sysnum < lx_systrace_nsysent32);
+
+		if (is_entry)
+			lx_systrace_sysent32[sysnum].lss_entry = id;
+		else
+			lx_systrace_sysent32[sysnum].lss_return = id;
+		return (0);
+	}
+
+#if defined(_LP64)
+	ASSERT(sysnum < lx_systrace_nsysent64);
+
+	if (is_entry)
+		lx_systrace_sysent64[sysnum].lss_entry = id;
+	else
+		lx_systrace_sysent64[sysnum].lss_return = id;
+#endif
+	return (0);
+}
+
+/*
+ * Disable callback.  Resets the table slot selected by the cookie in parg
+ * to DTRACE_IDNONE, then drops the enabled-probe count and turns off the
+ * brand's syscall tracing hook when the count reaches zero.
+ */
+/*ARGSUSED*/
+static void
+lx_systrace_disable(void *arg, dtrace_id_t id, void *parg)
+{
+	uintptr_t cookie = (uintptr_t)parg;
+	int sysnum = LX_SYSTRACE_SYSNUM(cookie);
+	int is_entry = (LX_SYSTRACE_ISENTRY(cookie) != 0);
+
+	if (!LX_SYSTRACE_IS64BIT(cookie)) {
+		ASSERT(sysnum < lx_systrace_nsysent32);
+
+		if (is_entry)
+			lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE;
+		else
+			lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE;
+	} else {
+#if defined(_LP64)
+		ASSERT(sysnum < lx_systrace_nsysent64);
+
+		if (is_entry)
+			lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE;
+		else
+			lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE;
+#endif
+	}
+
+	mutex_enter(&lx_systrace_lock);
+	if (--lx_systrace_nenabled == 0)
+		lx_brand_systrace_disable();
+	mutex_exit(&lx_systrace_lock);
+}
+
+/*
+ * Destroy callback.  Intentionally empty: the per-probe cookie created in
+ * lx_systrace_provide() is an encoded integer, not an allocation, so there
+ * is nothing to release here.
+ */
+/*ARGSUSED*/
+static void
+lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+/*
+ * Argument-value callback: fetch syscall argument `argno` from the stack of
+ * the entry/return trigger that fired the probe.  Returns 0 for argno >= 6.
+ * The stack read is done with CPU_DTRACE_NOFAULT set.
+ *
+ * NOTE(review): on amd64 the index is rebased by -6 after skipping sysnum,
+ * so it is only non-negative for argno == 5; presumably DTrace calls this
+ * only for arguments that were not passed directly to dtrace_probe() --
+ * confirm against the caller.
+ */
+/*ARGSUSED*/
+static uint64_t
+lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+    int aframes)
+{
+	struct frame *fp = (struct frame *)dtrace_getfp();
+	uintptr_t *stack;
+	uint64_t val = 0;
+	int i;
+
+	if (argno >= 6)
+		return (0);
+
+	/*
+	 * Walk the four frames down the stack to the entry or return callback.
+	 * Our callback calls dtrace_probe() which calls dtrace_dif_variable()
+	 * which invokes this function to get the extended arguments. We get
+	 * the frame pointer in via call to dtrace_getfp() above which makes for
+	 * four frames.
+	 */
+	for (i = 0; i < 4; i++) {
+		fp = (struct frame *)fp->fr_savfp;
+	}
+
+	/* Stack-passed arguments start just past the frame record. */
+	stack = (uintptr_t *)&fp[1];
+
+	/*
+	 * Skip the first argument to the callback -- the system call number.
+	 */
+	argno++;
+
+#ifdef __amd64
+	/*
+	 * On amd64, the first 6 arguments are passed in registers while
+	 * subsequent arguments are on the stack.
+	 */
+	argno -= 6;
+#endif
+
+	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+	val = stack[argno];
+	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+
+	return (val);
+}
+
+
+/*
+ * Stability attributes handed to dtrace_register().  Per the dtrace_pattr_t
+ * layout the rows describe, in order: provider, module, function, name and
+ * arguments.
+ */
+static const dtrace_pattr_t lx_systrace_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+};
+
+/*
+ * Provider operations handed to dtrace_register().  Slots left NULL are
+ * operations this provider does not implement.  The per-slot labels follow
+ * the dtrace_pops_t member order.
+ */
+static dtrace_pops_t lx_systrace_pops = {
+	lx_systrace_provide,	/* provide */
+	NULL,			/* provide module */
+	lx_systrace_enable,	/* enable */
+	lx_systrace_disable,	/* disable */
+	NULL,			/* suspend */
+	NULL,			/* resume */
+	NULL,			/* get argument description */
+	lx_systrace_getarg,	/* get argument value */
+	NULL,			/* mode */
+	lx_systrace_destroy	/* destroy */
+};
+
+/*
+ * attach(9E) entry point.  DDI_RESUME succeeds trivially and any command
+ * other than DDI_ATTACH fails.  On DDI_ATTACH this creates the
+ * "lx_systrace" minor node, registers the "lx-syscall" DTrace provider,
+ * builds the per-syscall probe tables from the brand's syscall tables, and
+ * finally installs the entry/return triggers.
+ *
+ * NOTE(review): dtrace_register() runs before the tables are populated; if
+ * it can invoke lx_systrace_provide() synchronously, that call would see
+ * zero table entries -- confirm whether that matters.
+ */
+static int
+lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+	int i;
+
+	switch (cmd) {
+	case DDI_ATTACH:
+		break;
+	case DDI_RESUME:
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	/* Either failure undoes the minor node before bailing out. */
+	if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR,
+	    0, DDI_PSEUDO, NULL) == DDI_FAILURE ||
+	    dtrace_register("lx-syscall", &lx_systrace_attr,
+	    DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL,
+	    &lx_systrace_id) != 0) {
+		ddi_remove_minor_node(devi, NULL);
+		return (DDI_FAILURE);
+	}
+
+	ddi_report_dev(devi);
+	lx_systrace_devi = devi;
+
+	/*
+	 * Initialize the 32-bit table.
+	 */
+	VERIFY(lx_nsysent32 > 0);
+	lx_systrace_nsysent32 = lx_nsysent32;
+	lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 *
+	    sizeof (lx_systrace_sysent_t), KM_SLEEP);
+
+	for (i = 0; i < lx_systrace_nsysent32; i++) {
+		lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name;
+		lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE;
+		lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE;
+	}
+
+#if defined(_LP64)
+	/*
+	 * Initialize the 64-bit table.
+	 */
+	VERIFY(lx_nsysent64 > 0);
+	lx_systrace_nsysent64 = lx_nsysent64;
+	lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 *
+	    sizeof (lx_systrace_sysent_t), KM_SLEEP);
+
+	for (i = 0; i < lx_systrace_nsysent64; i++) {
+		lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name;
+		lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE;
+		lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE;
+	}
+#endif
+
+	/*
+	 * Install probe triggers.
+	 */
+	lx_systrace_entry_ptr = lx_systrace_entry;
+	lx_systrace_return_ptr = lx_systrace_return;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * detach(9E) entry point.  DDI_SUSPEND succeeds trivially and any command
+ * other than DDI_DETACH fails.  On DDI_DETACH the provider is unregistered
+ * first (failing the detach if DTrace refuses), after which everything set
+ * up by lx_systrace_attach() is torn down: the probe triggers, the
+ * per-syscall tables and the minor node.
+ */
+static int
+lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		break;
+	case DDI_SUSPEND:
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	if (dtrace_unregister(lx_systrace_id) != 0)
+		return (DDI_FAILURE);
+
+	/*
+	 * Reset probe triggers before releasing the tables they index.
+	 */
+	lx_systrace_entry_ptr = NULL;
+	lx_systrace_return_ptr = NULL;
+
+	/*
+	 * Free tables.
+	 */
+	kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 *
+	    sizeof (lx_systrace_sysent_t));
+	lx_systrace_sysent32 = NULL;
+	lx_systrace_nsysent32 = 0;
+
+#if defined(_LP64)
+	kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 *
+	    sizeof (lx_systrace_sysent_t));
+	lx_systrace_sysent64 = NULL;
+	lx_systrace_nsysent64 = 0;
+#endif
+
+	/*
+	 * Remove the minor node created in attach and forget the devinfo.
+	 */
+	ddi_remove_minor_node(devi, NULL);
+	lx_systrace_devi = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * open(9E) entry point: always succeeds and keeps no per-open state.
+ */
+/*ARGSUSED*/
+static int
+lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+	return (0);
+}
+
+/*
+ * Character/block entry points.  Only open is implemented by this driver;
+ * the remaining slots use the nodev/nulldev/nochpoll stubs.
+ */
+static struct cb_ops lx_systrace_cb_ops = {
+	lx_systrace_open,	/* open */
+	nodev,			/* close */
+	nulldev,		/* strategy */
+	nulldev,		/* print */
+	nodev,			/* dump */
+	nodev,			/* read */
+	nodev,			/* write */
+	nodev,			/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	nochpoll,		/* poll */
+	ddi_prop_op,		/* cb_prop_op */
+	0,			/* streamtab */
+	D_NEW | D_MP		/* Driver compatibility flag */
+};
+
+/*
+ * Device operations: attach/detach are the driver's own routines and the
+ * cb_ops vector above supplies the character entry points.
+ */
+static struct dev_ops lx_systrace_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* refcnt */
+	ddi_getinfo_1to1,	/* get_dev_info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	lx_systrace_attach,	/* attach */
+	lx_systrace_detach,	/* detach */
+	nodev,			/* reset */
+	&lx_systrace_cb_ops,	/* driver operations */
+	NULL,			/* bus operations */
+	nodev,			/* dev power */
+	ddi_quiesce_not_needed,		/* quiesce */
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modldrv modldrv = {
+	&mod_driverops,		/* module type (this is a pseudo driver) */
+	"Linux Brand System Call Tracing", /* name of module */
+	&lx_systrace_ops	/* driver ops */
+};
+
+/* Linkage list passed to mod_install()/mod_info()/mod_remove() below. */
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modldrv,
+	NULL
+};
+
+/* Module load entry point: install the driver linkage. */
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/* Module information entry point: report on the driver linkage. */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/* Module unload entry point: remove the driver linkage. */
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf
new file mode 100644
index 0000000000..e4499c8a5b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+name="lx_systrace" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c
new file mode 100644
index 0000000000..6fec9ef4cb
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c
@@ -0,0 +1,1684 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Compatibility for the Linux netlink(7) kernel/user transport, as well as
+ * for in-kernel netlink(7) providers like rtnetlink(7).  See RFC 3549 for
+ * details of the protocol, and the Linux man pages for details of the Linux
+ * implementation that we're mimicking.
+ */
+
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+#include <sys/sockio.h>
+#include <sys/brand.h>
+#include <sys/debug.h>
+#include <sys/ucred.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/ip_impl.h>
+#include <inet/ip_ire.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_socket.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <sys/policy.h>
+
+/*
+ * Flags in netlink header
+ */
+#define	LX_NETLINK_NLM_F_REQUEST		1
+#define	LX_NETLINK_NLM_F_MULTI			2
+#define	LX_NETLINK_NLM_F_ACK			4
+#define	LX_NETLINK_NLM_F_ECHO			8
+#define	LX_NETLINK_NLM_F_DUMP_INTR		16
+#define	LX_NETLINK_NLM_F_ROOT			0x100
+#define	LX_NETLINK_NLM_F_MATCH			0x200
+#define	LX_NETLINK_NLM_F_ATOMIC			0x400
+
+/*
+ * Generic message type constants
+ */
+#define	LX_NETLINK_NLMSG_NONE			0
+#define	LX_NETLINK_NLMSG_NOOP			1
+#define	LX_NETLINK_NLMSG_ERROR			2
+#define	LX_NETLINK_NLMSG_DONE			3
+#define	LX_NETLINK_NLMSG_OVERRUN		4
+
+/*
+ * Protocol constants.
+ */
+#define	LX_NETLINK_ROUTE			0
+#define	LX_NETLINK_UNUSED			1
+#define	LX_NETLINK_USERSOCK			2
+#define	LX_NETLINK_FIREWALL			3
+#define	LX_NETLINK_SOCK_DIAG			4
+#define	LX_NETLINK_NFLOG			5
+#define	LX_NETLINK_XFRM				6
+#define	LX_NETLINK_SELINUX			7
+#define	LX_NETLINK_ISCSI			8
+#define	LX_NETLINK_AUDIT			9
+#define	LX_NETLINK_FIB_LOOKUP			10
+#define	LX_NETLINK_CONNECTOR			11
+#define	LX_NETLINK_NETFILTER			12
+#define	LX_NETLINK_IP6_FW			13
+#define	LX_NETLINK_DNRTMSG			14
+#define	LX_NETLINK_KOBJECT_UEVENT		15
+#define	LX_NETLINK_GENERIC			16
+#define	LX_NETLINK_SCSITRANSPORT		18
+#define	LX_NETLINK_ECRYPTFS			19
+#define	LX_NETLINK_RDMA				20
+#define	LX_NETLINK_CRYPTO			21
+
+/*
+ * rtnetlink(7) attribute-related constants
+ */
+#define	LX_NETLINK_NLA_ALIGNTO			4
+
+#define	LX_NETLINK_RTM_NEWLINK			16
+#define	LX_NETLINK_RTM_DELLINK			17
+#define	LX_NETLINK_RTM_GETLINK			18
+#define	LX_NETLINK_RTM_SETLINK			19
+#define	LX_NETLINK_RTM_NEWADDR			20
+#define	LX_NETLINK_RTM_DELADDR			21
+#define	LX_NETLINK_RTM_GETADDR			22
+#define	LX_NETLINK_RTM_NEWROUTE			24
+#define	LX_NETLINK_RTM_DELROUTE			25
+#define	LX_NETLINK_RTM_GETROUTE			26
+#define	LX_NETLINK_RTM_NEWNEIGH			28
+#define	LX_NETLINK_RTM_DELNEIGH			29
+#define	LX_NETLINK_RTM_GETNEIGH			30
+#define	LX_NETLINK_RTM_NEWRULE			32
+#define	LX_NETLINK_RTM_DELRULE			33
+#define	LX_NETLINK_RTM_GETRULE			34
+#define	LX_NETLINK_RTM_NEWQDISC			36
+#define	LX_NETLINK_RTM_DELQDISC			37
+#define	LX_NETLINK_RTM_GETQDISC			38
+#define	LX_NETLINK_RTM_NEWTCLASS		40
+#define	LX_NETLINK_RTM_DELTCLASS		41
+#define	LX_NETLINK_RTM_GETTCLASS		42
+#define	LX_NETLINK_RTM_NEWTFILTER		44
+#define	LX_NETLINK_RTM_DELTFILTER		45
+#define	LX_NETLINK_RTM_GETTFILTER		46
+#define	LX_NETLINK_RTM_NEWACTION		48
+#define	LX_NETLINK_RTM_DELACTION		49
+#define	LX_NETLINK_RTM_GETACTION		50
+#define	LX_NETLINK_RTM_NEWPREFIX		52
+#define	LX_NETLINK_RTM_GETMULTICAST		58
+#define	LX_NETLINK_RTM_GETANYCAST		62
+#define	LX_NETLINK_RTM_NEWNEIGHTBL		64
+#define	LX_NETLINK_RTM_GETNEIGHTBL		66
+#define	LX_NETLINK_RTM_SETNEIGHTBL		67
+#define	LX_NETLINK_RTM_NEWNDUSEROPT		68
+#define	LX_NETLINK_RTM_NEWADDRLABEL		72
+#define	LX_NETLINK_RTM_DELADDRLABEL		73
+#define	LX_NETLINK_RTM_GETADDRLABEL		74
+#define	LX_NETLINK_RTM_GETDCB			78
+#define	LX_NETLINK_RTM_SETDCB			79
+#define	LX_NETLINK_RTM_NEWNETCONF		80
+#define	LX_NETLINK_RTM_GETNETCONF		82
+#define	LX_NETLINK_RTM_NEWMDB			84
+#define	LX_NETLINK_RTM_DELMDB			85
+#define	LX_NETLINK_RTM_GETMDB			86
+#define	LX_NETLINK_RTM_MAX			87
+
+/*
+ * rtnetlink(7) attribute constants
+ */
+#define	LX_NETLINK_RTA_UNSPEC		0
+#define	LX_NETLINK_RTA_DST		1
+#define	LX_NETLINK_RTA_SRC		2
+#define	LX_NETLINK_RTA_IIF		3
+#define	LX_NETLINK_RTA_OIF		4
+#define	LX_NETLINK_RTA_GATEWAY		5
+#define	LX_NETLINK_RTA_PRIORITY		6
+#define	LX_NETLINK_RTA_PREFSRC		7
+#define	LX_NETLINK_RTA_METRICS		8
+#define	LX_NETLINK_RTA_MULTIPATH	9
+#define	LX_NETLINK_RTA_PROTOINFO	10
+#define	LX_NETLINK_RTA_FLOW		11
+#define	LX_NETLINK_RTA_CACHEINFO	12
+#define	LX_NETLINK_RTA_SESSION		13
+#define	LX_NETLINK_RTA_MP_ALGO		14
+#define	LX_NETLINK_RTA_TABLE		15
+#define	LX_NETLINK_RTA_MARK		16
+#define	LX_NETLINK_RTA_MFC_STATS	17
+#define	LX_NETLINK_MAX_RTA	LX_NETLINK_RTA_MFC_STATS
+
+/*
+ * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants
+ */
+#define	LX_NETLINK_IFLA_UNSPEC			0
+#define	LX_NETLINK_IFLA_ADDRESS			1
+#define	LX_NETLINK_IFLA_BROADCAST		2
+#define	LX_NETLINK_IFLA_IFNAME			3
+#define	LX_NETLINK_IFLA_MTU			4
+#define	LX_NETLINK_IFLA_LINK			5
+#define	LX_NETLINK_IFLA_QDISC			6
+#define	LX_NETLINK_IFLA_STATS			7
+#define	LX_NETLINK_IFLA_COST			8
+#define	LX_NETLINK_IFLA_PRIORITY		9
+#define	LX_NETLINK_IFLA_MASTER			10
+#define	LX_NETLINK_IFLA_WIRELESS		11
+#define	LX_NETLINK_IFLA_PROTINFO		12
+#define	LX_NETLINK_IFLA_TXQLEN			13
+#define	LX_NETLINK_IFLA_MAP			14
+#define	LX_NETLINK_IFLA_WEIGHT			15
+#define	LX_NETLINK_IFLA_OPERSTATE		16
+#define	LX_NETLINK_IFLA_LINKMODE		17
+#define	LX_NETLINK_IFLA_LINKINFO		18
+#define	LX_NETLINK_IFLA_NET_NS_PID		19
+#define	LX_NETLINK_IFLA_IFALIAS			20
+#define	LX_NETLINK_IFLA_NUM_VF			21
+#define	LX_NETLINK_IFLA_VFINFO_LIST		22
+#define	LX_NETLINK_IFLA_STATS64			23
+#define	LX_NETLINK_IFLA_VF_PORTS		24
+#define	LX_NETLINK_IFLA_PORT_SELF		25
+#define	LX_NETLINK_IFLA_AF_SPEC			26
+#define	LX_NETLINK_IFLA_GROUP			27
+#define	LX_NETLINK_IFLA_NET_NS_FD		28
+#define	LX_NETLINK_IFLA_EXT_MASK		29
+#define	LX_NETLINK_IFLA_PROMISCUITY		30
+#define	LX_NETLINK_IFLA_NUM_TX_QUEUES		31
+#define	LX_NETLINK_IFLA_NUM_RX_QUEUES		32
+#define	LX_NETLINK_IFLA_CARRIER			33
+#define	LX_NETLINK_IFLA_PHYS_PORT_ID		34
+#define	LX_NETLINK_IFLA_CARRIER_CHANGES		35
+#define	LX_NETLINK_IFLA_MAX			36
+
+/*
+ * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants
+ */
+#define	LX_NETLINK_IFA_UNSPEC			0
+#define	LX_NETLINK_IFA_ADDRESS			1
+#define	LX_NETLINK_IFA_LOCAL			2
+#define	LX_NETLINK_IFA_LABEL			3
+#define	LX_NETLINK_IFA_BROADCAST		4
+#define	LX_NETLINK_IFA_ANYCAST			5
+#define	LX_NETLINK_IFA_CACHEINFO		6
+#define	LX_NETLINK_IFA_MULTICAST		7
+#define	LX_NETLINK_IFA_FLAGS			8
+#define	LX_NETLINK_IFA_MAX			9
+
+#define	LX_NETLINK_IFA_F_SECONDARY		0x01
+#define	LX_NETLINK_IFA_F_TEMPORARY		LX_NETLINK_IFA_F_SECONDARY
+#define	LX_NETLINK_IFA_F_NODAD			0x02
+#define	LX_NETLINK_IFA_F_OPTIMISTIC		0x04
+#define	LX_NETLINK_IFA_F_DADFAILED		0x08
+#define	LX_NETLINK_IFA_F_HOMEADDRESS		0x10
+#define	LX_NETLINK_IFA_F_DEPRECATED		0x20
+#define	LX_NETLINK_IFA_F_TENTATIVE		0x40
+#define	LX_NETLINK_IFA_F_PERMANENT		0x80
+#define	LX_NETLINK_IFA_F_MANAGETEMPADDR		0x100
+#define	LX_NETLINK_IFA_F_NOPREFIXROUTE		0x200
+
+/*
+ * Linux interface flags.
+ */
+#define	LX_IFF_UP		(1<<0)
+#define	LX_IFF_BROADCAST	(1<<1)
+#define	LX_IFF_DEBUG		(1<<2)
+#define	LX_IFF_LOOPBACK		(1<<3)
+#define	LX_IFF_POINTOPOINT	(1<<4)
+#define	LX_IFF_NOTRAILERS	(1<<5)
+#define	LX_IFF_RUNNING		(1<<6)
+#define	LX_IFF_NOARP		(1<<7)
+#define	LX_IFF_PROMISC		(1<<8)
+#define	LX_IFF_ALLMULTI		(1<<9)
+#define	LX_IFF_MASTER		(1<<10)
+#define	LX_IFF_SLAVE		(1<<11)
+#define	LX_IFF_MULTICAST	(1<<12)
+#define	LX_IFF_PORTSEL		(1<<13)
+#define	LX_IFF_AUTOMEDIA	(1<<14)
+#define	LX_IFF_DYNAMIC		(1<<15)
+#define	LX_IFF_LOWER_UP		(1<<16)
+#define	LX_IFF_DORMANT		(1<<17)
+#define	LX_IFF_ECHO		(1<<18)
+
+/* rtm_table */
+#define	LX_ROUTE_TABLE_MAIN	254
+
+/* rtm_type */
+#define	LX_RTN_UNSPEC		0
+#define	LX_RTN_UNICAST		1
+#define	LX_RTN_LOCAL		2
+#define	LX_RTN_BROADCAST	3
+#define	LX_RTN_ANYCAST		4
+#define	LX_RTN_MULTICAST	5
+#define	LX_RTN_BLACKHOLE	6
+#define	LX_RTN_UNREACHABLE	7
+#define	LX_RTN_PROHIBIT		8
+#define	LX_RTN_THROW		9
+#define	LX_RTN_NAT		10
+#define	LX_RTN_XRESOLVE		11
+
+/* rtm_protocol */
+#define	LX_RTPROT_UNSPEC	0
+#define	LX_RTPROT_REDIRECT	1	/* From ICMP redir	*/
+#define	LX_RTPROT_KERNEL	2	/* From kernel		*/
+#define	LX_RTPROT_BOOT		3	/* From boot		*/
+#define	LX_RTPROT_STATIC	4	/* From administrator	*/
+#define	LX_RTPROT_NULL		0xff	/* Uninitialized	*/
+
+/* rtm_scope */
+#define	LX_RTSCOPE_UNIVERSE	0
+#define	LX_RTSCOPE_SITE		200
+#define	LX_RTSCOPE_LINK		253
+#define	LX_RTSCOPE_HOST		254
+#define	LX_RTSCOPE_NOWHERE	255
+
+
+/*
+ * Netlink sockopts
+ */
+#define	SOL_LX_NETLINK	270
+
+#define	LX_NETLINK_SO_ADD_MEMBERSHIP	1
+#define	LX_NETLINK_SO_DROP_MEMBERSHIP	2
+#define	LX_NETLINK_SO_PKTINFO		3
+#define	LX_NETLINK_SO_BROADCAST_ERROR	4
+#define	LX_NETLINK_SO_NO_ENOBUFS	5
+#define	LX_NETLINK_SO_RX_RING		6
+#define	LX_NETLINK_SO_TX_RING		7
+
+/* Internal socket flags */
+#define	LXNLF_RECVUCRED			0x1
+
+/*
+ * nlmsg structure macros.  Lengths are rounded up to LXNLMSG_ALIGNTO bytes:
+ * HDRLEN is the aligned header size, LENGTH(len) a payload plus header,
+ * SPACE(len) that total rounded up, DATA(nlh) the address just past the
+ * header, and PAYLOAD(nlh, len) what is left of the message (lxnh_len)
+ * after SPACE(len).
+ */
+#define	LXNLMSG_ALIGNTO	4
+#define	LXNLMSG_ALIGN(len)	\
+	(((len) + LXNLMSG_ALIGNTO - 1) & ~(LXNLMSG_ALIGNTO - 1))
+#define	LXNLMSG_HDRLEN	\
+	((int)LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)))
+#define	LXNLMSG_LENGTH(len)	((len) + LXNLMSG_HDRLEN)
+#define	LXNLMSG_SPACE(len)	LXNLMSG_ALIGN(LXNLMSG_LENGTH(len))
+#define	LXNLMSG_DATA(nlh)	\
+	((void*)(((char *)(nlh)) + LXNLMSG_LENGTH(0)))
+#define	LXNLMSG_PAYLOAD(nlh, len)	\
+	((nlh)->lxnh_len - LXNLMSG_SPACE((len)))
+
+/* Attribute helpers: payload address, aligned header size, total size. */
+#define	LXATTR_PAYLOAD(lxa)	\
+	((void*)((caddr_t)(lxa) + sizeof (lx_netlink_attr_t)))
+#define	LXATTR_HDRLEN	LXNLMSG_ALIGN(sizeof (lx_netlink_attr_t))
+#define	LXATTR_LEN(len)	(LXATTR_HDRLEN + LXNLMSG_ALIGN(len))
+
+typedef struct lx_netlink_hdr {
+	uint32_t lxnh_len;			/* length of message */
+	uint16_t lxnh_type;			/* type of message */
+	uint16_t lxnh_flags;			/* flags */
+	uint32_t lxnh_seq;			/* sequence number */
+	uint32_t lxnh_pid;			/* sending pid */
+} lx_netlink_hdr_t;
+
+typedef struct lx_netlink_err {
+	lx_netlink_hdr_t	lxne_hdr;	/* header */
+	int32_t			lxne_errno;	/* errno */
+	lx_netlink_hdr_t	lxne_failed;	/* header of err */
+} lx_netlink_err_t;
+
+typedef struct lx_netlink_attr {
+	uint16_t	lxna_len;		/* length of attribute */
+	uint16_t	lxna_type;		/* type of attribute */
+} lx_netlink_attr_t;
+
+typedef struct lx_netlink_ifinfomsg {
+	uint8_t		lxnl_ifi_family;	/* family: AF_UNSPEC */
+	uint8_t		lxnl_ifi__pad;
+	uint16_t	lxnl_ifi_type;		/* device type */
+	uint32_t	lxnl_ifi_index;		/* interface index */
+	uint32_t	lxnl_ifi_flags;		/* device flags */
+	uint32_t 	lxnl_ifi_change;	/* unused; must be -1 */
+} lx_netlink_ifinfomsg_t;
+
+/*
+ * Payload header of NEWADDR/DELADDR/GETADDR messages.  The interface index
+ * is 32 bits wide to match the Linux struct ifaddrmsg wire layout (8 bytes).
+ */
+typedef struct lx_netlink_ifaddrmsg {
+	uint8_t		lxnl_ifa_family;	/* address type */
+	uint8_t		lxnl_ifa_prefixlen;	/* prefix length of address */
+	uint8_t		lxnl_ifa_flags;		/* address flags */
+	uint8_t		lxnl_ifa_scope;		/* address scope */
+	uint32_t	lxnl_ifa_index;		/* interface index */
+} lx_netlink_ifaddrmsg_t;
+
+/*
+ * Route message header.  The table, protocol, scope and type fields take
+ * the LX_ROUTE_TABLE_*, LX_RTPROT_*, LX_RTSCOPE_* and LX_RTN_* values
+ * defined in this file.
+ */
+typedef struct lx_netlink_rtmsg {
+	uint8_t		rtm_family;	/* route AF			*/
+	uint8_t		rtm_dst_len;	/* destination addr length	*/
+	uint8_t		rtm_src_len;	/* source addr length		*/
+	uint8_t		rtm_tos;	/* TOS filter			*/
+	uint8_t		rtm_table;	/* routing table ID		*/
+	uint8_t		rtm_protocol;	/* routing protocol		*/
+	uint8_t		rtm_scope;	/* LX_RTSCOPE_* value		*/
+	uint8_t		rtm_type;	/* LX_RTN_* value		*/
+	uint32_t	rtm_flags;
+} lx_netlink_rtmsg_t;
+
+typedef struct lx_netlink_sockaddr {
+	sa_family_t	lxnl_family;		/* AF_LX_NETLINK */
+	uint16_t	lxnl_pad;		/* padding */
+	uint32_t	lxnl_port;		/* port id */
+	uint32_t	lxnl_groups;		/* multicast groups mask */
+} lx_netlink_sockaddr_t;
+
+typedef struct lx_netlink_sock {
+	struct lx_netlink_sock *lxns_next;	/* list of lx_netlink sockets */
+	sock_upcalls_t *lxns_upcalls;		/* pointer to socket upcalls */
+	sock_upper_handle_t lxns_uphandle;	/* socket upcall handle */
+	ldi_handle_t lxns_iphandle;		/* handle to /dev/ip */
+	ldi_handle_t lxns_ip6handle;		/* handle to /dev/ip6 */
+	ldi_handle_t lxns_current;		/* current ip handle */
+	int lxns_proto;				/* protocol */
+	uint32_t lxns_port;			/* port identifier */
+	uint32_t lxns_groups;			/* group subscriptions */
+	uint32_t lxns_bufsize;			/* buffer size */
+	uint32_t lxns_flags;			/* socket flags */
+} lx_netlink_sock_t;
+
+/*
+ * State for one in-progress reply.  Created by lx_netlink_reply() and
+ * consumed (and freed) by lx_netlink_reply_done().  Once lxnr_errno is
+ * non-zero the reply helpers become no-ops and only an error message is
+ * sent.
+ */
+typedef struct lx_netlink_reply {
+	lx_netlink_hdr_t lxnr_hdr;		/* header we're replying to */
+	lx_netlink_sock_t *lxnr_sock;		/* socket */
+	uint32_t lxnr_seq;			/* sequence number */
+	uint16_t lxnr_type;			/* type of reply */
+	mblk_t *lxnr_mp;			/* current mblk */
+	mblk_t *lxnr_err;			/* error mblk */
+	mblk_t *lxnr_mp1;			/* T_UNITDATA_IND mblk */
+	int lxnr_errno;				/* errno, if any */
+} lx_netlink_reply_t;
+
+/*
+ * Module-wide state.  lx_netlink_lock guards the lx_netlink_head list.
+ */
+static lx_netlink_sock_t *lx_netlink_head;	/* head of lx_netlink sockets */
+static kmutex_t lx_netlink_lock;		/* lock to protect state */
+static ldi_ident_t lx_netlink_ldi;		/* LDI handle */
+static int lx_netlink_bufsize = 4096;		/* default buffer size */
+static int lx_netlink_flowctrld;		/* # of failed su_recv upcalls */
+
+/*
+ * sd_activate downcall: record the upcall vector and upper handle for this
+ * socket, then hand our protocol properties to the socket layer.
+ */
+/*ARGSUSED*/
+static void
+lx_netlink_activate(sock_lower_handle_t handle,
+    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
+    int flags, cred_t *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	struct sock_proto_props props;
+
+	lxsock->lxns_uphandle = sock_handle;
+	lxsock->lxns_upcalls = sock_upcalls;
+
+	/* Only the properties named in sopp_flags are filled in below. */
+	props.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
+	    SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
+	props.sopp_wroff = 0;
+	props.sopp_minpsz = 0;
+	props.sopp_maxpsz = INFPSZ;
+	props.sopp_maxblk = INFPSZ;
+	props.sopp_rxlowat = SOCKET_RECVLOWATER;
+	props.sopp_rxhiwat = SOCKET_RECVHIWATER;
+	props.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
+
+	sock_upcalls->su_set_proto_props(sock_handle, &props);
+}
+
+/*
+ * sd_setsockopt downcall.  SO_RECVUCRED toggles LXNLF_RECVUCRED on the
+ * socket; every other SOL_SOCKET option is accepted and ignored.  The
+ * known SOL_LX_NETLINK options are accepted without effect, and any other
+ * level is rejected with EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_setsockopt(sock_lower_handle_t handle, int level,
+    int option_name, const void *optval, socklen_t optlen, struct cred *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+
+	if (level == SOL_SOCKET) {
+		/* Punt on the other SOL_SOCKET options */
+		if (option_name != SO_RECVUCRED)
+			return (0);
+
+		if (optlen != sizeof (int))
+			return (EINVAL);
+
+		if (*(const int *)optval != 0)
+			lxsock->lxns_flags |= LXNLF_RECVUCRED;
+		else
+			lxsock->lxns_flags &= ~LXNLF_RECVUCRED;
+
+		return (0);
+	}
+
+	if (level != SOL_LX_NETLINK)
+		return (EOPNOTSUPP);
+
+	switch (option_name) {
+	case LX_NETLINK_SO_ADD_MEMBERSHIP:
+	case LX_NETLINK_SO_DROP_MEMBERSHIP:
+	case LX_NETLINK_SO_PKTINFO:
+	case LX_NETLINK_SO_BROADCAST_ERROR:
+	case LX_NETLINK_SO_NO_ENOBUFS:
+	case LX_NETLINK_SO_RX_RING:
+	case LX_NETLINK_SO_TX_RING:
+		/* Blatant lie: report success without doing anything */
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * sd_bind downcall.  Validates the supplied netlink address, records any
+ * requested group subscription (privileged), and picks the port identifier
+ * for the socket.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name,
+    socklen_t namelen, struct cred *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	lx_netlink_sockaddr_t *addr = (lx_netlink_sockaddr_t *)name;
+
+	/* The length is checked first so the family is never read short. */
+	if (namelen != sizeof (lx_netlink_sockaddr_t))
+		return (EINVAL);
+
+	if (addr->lxnl_family != AF_LX_NETLINK)
+		return (EINVAL);
+
+	if (addr->lxnl_groups != 0) {
+		/*
+		 * On linux, CAP_NET_ADMIN is needed to bind to netlink groups.
+		 * This roughly maps to PRIV_SYS_IP_CONFIG.
+		 */
+		if (secpolicy_ip_config(cr, B_FALSE) != 0)
+			return (EACCES);
+
+		/* Lie about group subscription for now */
+		lxsock->lxns_groups = addr->lxnl_groups;
+	}
+
+	/*
+	 * Linux netlink uses nl_port to identify distinct netlink sockets,
+	 * and binding with nl_port=0 asks the kernel to assign a free one.
+	 * Originally consumers of lx_netlink had to bind with that automatic
+	 * address; non-zero values are now accepted as well, although no
+	 * strict conflict checking is performed.  (The id_space facility
+	 * could be a convenient solution, if a need arose.)
+	 *
+	 * Since there is no conflict detection, no effort is spent picking
+	 * a unique port for the automatic case: the pid is used.
+	 */
+	lxsock->lxns_port = (addr->lxnl_port == 0) ?
+	    curproc->p_pid : addr->lxnl_port;
+
+	return (0);
+}
+
+/*
+ * sd_getsockname downcall: report the port and group mask recorded on the
+ * socket.  Fails with EINVAL if the caller's buffer is too small.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa,
+    socklen_t *len, struct cred *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	lx_netlink_sockaddr_t *out = (lx_netlink_sockaddr_t *)sa;
+
+	if (*len < sizeof (*out)) {
+		return (EINVAL);
+	}
+
+	out->lxnl_family = AF_LX_NETLINK;
+	out->lxnl_pad = 0;
+	out->lxnl_groups = lxsock->lxns_groups;
+	out->lxnl_port = lxsock->lxns_port;
+
+	*len = sizeof (*out);
+	return (0);
+}
+
+/*
+ * Allocate and fill in the M_PROTO T_UNITDATA_IND block that precedes
+ * every message sent upstream.  The source address is always the
+ * zero-port netlink address; when the socket has LXNLF_RECVUCRED set, an
+ * SCM_UCRED control message is appended as the option data.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static mblk_t *
+lx_netlink_alloc_mp1(lx_netlink_sock_t *lxsock)
+{
+	mblk_t *mp;
+	size_t size;
+	struct T_unitdata_ind *tunit;
+	lx_netlink_sockaddr_t *lxsa;
+	boolean_t send_ucred;
+
+	/*
+	 * Certain netlink clients (such as systemd) will set SO_RECVUCRED
+	 * (via the Linux SCM_CREDENTIALS) on the expectation that all replies
+	 * will contain credentials passed via cmsg.  They require this to
+	 * authenticate those messages as having originated in the kernel by
+	 * checking uc_pid == 0.
+	 */
+	VERIFY(lxsock != NULL);
+	send_ucred = ((lxsock->lxns_flags & LXNLF_RECVUCRED) != 0);
+
+	/*
+	 * Message structure:
+	 * +----------------------------+
+	 * | struct T_unitdata_ind	|
+	 * +----------------------------+
+	 * | lx_netlink_sockaddr_t	|
+	 * +----------------------------+  -+
+	 * | struct cmsghdr (SCM_UCRED)	|   |
+	 * +----------------------------+   +-(optional)
+	 * | struct ucred_s (cmsg data)	|   |
+	 * +----------------------------+  -+
+	 */
+	size = sizeof (*tunit) + sizeof (*lxsa);
+	if (send_ucred) {
+		size += sizeof (struct cmsghdr) +
+		    ROUNDUP_cmsglen(sizeof (struct ucred_s));
+	}
+	mp = allocb(size, 0);
+	if (mp == NULL) {
+		return (NULL);
+	}
+
+	tunit = (struct T_unitdata_ind *)mp->b_rptr;
+	lxsa = (lx_netlink_sockaddr_t *)((caddr_t)tunit + sizeof (*tunit));
+	/* The whole block is considered written; each piece is filled below */
+	mp->b_wptr += size;
+
+	mp->b_datap->db_type = M_PROTO;
+	tunit->PRIM_type = T_UNITDATA_IND;
+	tunit->SRC_length = sizeof (*lxsa);
+	tunit->SRC_offset = (caddr_t)lxsa - (caddr_t)mp->b_rptr;
+
+	lxsa->lxnl_family = AF_LX_NETLINK;
+	lxsa->lxnl_port = 0;
+	lxsa->lxnl_groups = 0;
+	lxsa->lxnl_pad = 0;
+
+	if (send_ucred) {
+		struct cmsghdr *cmsg;
+		struct ucred_s *ucred;
+
+		cmsg = (struct cmsghdr *)((caddr_t)lxsa + sizeof (*lxsa));
+		ucred = (struct ucred_s *)CMSG_CONTENT(cmsg);
+		cmsg->cmsg_len = sizeof (*cmsg) + sizeof (*ucred);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_UCRED;
+		/* Zeroed credentials leave uc_pid == 0 (see comment above) */
+		bzero(ucred, sizeof (*ucred));
+		ucred->uc_size = sizeof (*ucred);
+		ucred->uc_zoneid = getzoneid();
+
+		tunit->OPT_length = sizeof (*cmsg) +
+		    ROUNDUP_cmsglen(sizeof (*ucred));
+		tunit->OPT_offset = (caddr_t)cmsg - (caddr_t)mp->b_rptr;
+	} else {
+		tunit->OPT_length = 0;
+		tunit->OPT_offset = 0;
+	}
+
+	return (mp);
+}
+
+/*
+ * Begin a reply of the given type to the request described by hdr.  The
+ * error block and the final T_UNITDATA_IND block are allocated up front so
+ * that lx_netlink_reply_done() can always deliver something.  Returns NULL
+ * if either allocation fails.
+ */
+static lx_netlink_reply_t *
+lx_netlink_reply(lx_netlink_sock_t *lxsock,
+    lx_netlink_hdr_t *hdr, uint16_t type)
+{
+	lx_netlink_reply_t *reply;
+	mblk_t *errmp, *indmp;
+
+	/*
+	 * The error block is always allocated first: even if every later
+	 * allocation fails, we are still able to return an error.
+	 */
+	errmp = allocb(sizeof (lx_netlink_err_t), 0);
+	if (errmp == NULL) {
+		return (NULL);
+	}
+
+	indmp = lx_netlink_alloc_mp1(lxsock);
+	if (indmp == NULL) {
+		freeb(errmp);
+		return (NULL);
+	}
+
+	reply = kmem_zalloc(sizeof (*reply), KM_SLEEP);
+	reply->lxnr_sock = lxsock;
+	reply->lxnr_hdr = *hdr;
+	reply->lxnr_type = type;
+	reply->lxnr_err = errmp;
+	reply->lxnr_mp1 = indmp;
+
+	return (reply);
+}
+
+/*
+ * Append size bytes of payload to the message currently being built,
+ * padding it out to the netlink alignment.  Sets lxnr_errno to E2BIG if
+ * the message would outgrow the socket's buffer size.  Does nothing once
+ * the reply has failed.
+ */
+static void
+lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size)
+{
+	lx_netlink_hdr_t *msghdr;
+	mblk_t *cur;
+	uint32_t padded;
+
+	if (reply->lxnr_errno != 0) {
+		return;
+	}
+
+	cur = reply->lxnr_mp;
+	msghdr = (lx_netlink_hdr_t *)cur->b_rptr;
+	padded = LXNLMSG_ALIGN(size);
+
+	if (msghdr->lxnh_len + padded > reply->lxnr_sock->lxns_bufsize) {
+		reply->lxnr_errno = E2BIG;
+		return;
+	}
+
+	/* Only size bytes are copied; the pad bytes keep their contents. */
+	bcopy(payload, cur->b_wptr, size);
+	cur->b_wptr += padded;
+	msghdr->lxnh_len += padded;
+}
+
+/*
+ * Start a new message within the reply: allocate a zeroed buffer of the
+ * socket's buffer size, write the netlink header, and append the payload.
+ * A NULL payload produces a "done" message instead.  Does nothing once the
+ * reply has failed.
+ */
+static void
+lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size)
+{
+	lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+	lx_netlink_hdr_t *msghdr;
+	mblk_t *buf;
+
+	if (reply->lxnr_errno != 0) {
+		return;
+	}
+
+	/* The previous message must have been sent before starting another */
+	VERIFY(reply->lxnr_mp == NULL);
+
+	buf = allocb(lxsock->lxns_bufsize, 0);
+	reply->lxnr_mp = buf;
+	if (buf == NULL) {
+		reply->lxnr_errno = ENOMEM;
+		return;
+	}
+
+	bzero(buf->b_rptr, lxsock->lxns_bufsize);
+
+	msghdr = (lx_netlink_hdr_t *)buf->b_rptr;
+	msghdr->lxnh_len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+	msghdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI;
+	msghdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+	msghdr->lxnh_pid = lxsock->lxns_port;
+
+	buf->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t));
+
+	if (payload != NULL) {
+		msghdr->lxnh_type = reply->lxnr_type;
+		lx_netlink_reply_add(reply, payload, size);
+		return;
+	}
+
+	/*
+	 * A NULL payload denotes a "done" message.
+	 */
+	msghdr->lxnh_type = LX_NETLINK_NLMSG_DONE;
+}
+
+/*
+ * Append one attribute (header followed by payload) to the message
+ * currently being built.
+ */
+static void
+lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type,
+    void *payload, uint32_t size)
+{
+	lx_netlink_attr_t ahdr;
+
+	ahdr.lxna_type = type;
+	ahdr.lxna_len = sizeof (ahdr) + size;
+
+	lx_netlink_reply_add(reply, &ahdr, sizeof (ahdr));
+	lx_netlink_reply_add(reply, payload, size);
+}
+
+/*
+ * Append a string attribute; the terminating NUL is part of the payload.
+ */
+static void
+lx_netlink_reply_attr_string(lx_netlink_reply_t *reply,
+    uint16_t type, const char *str)
+{
+	uint32_t len = strlen(str) + 1;
+
+	lx_netlink_reply_attr(reply, type, (void *)str, len);
+}
+
+/*
+ * Append a 32-bit integer attribute.
+ */
+static void
+lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply,
+    uint16_t type, int32_t val)
+{
+	lx_netlink_reply_attr(reply, type, &val, sizeof (val));
+}
+
+/*
+ * Issue an ioctl on the socket's current ip handle on behalf of a reply.
+ * If the reply has already failed, the stored errno is returned without
+ * issuing the ioctl; otherwise a failure is recorded in lxnr_errno.
+ * Returns 0 on success.
+ */
+static int
+lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg)
+{
+	int err;
+
+	if (reply->lxnr_errno != 0) {
+		return (reply->lxnr_errno);
+	}
+
+	err = ldi_ioctl(reply->lxnr_sock->lxns_current,
+	    cmd, (intptr_t)arg, FKIOCTL, kcred, NULL);
+	if (err != 0) {
+		reply->lxnr_errno = err;
+	}
+
+	return (err);
+}
+
+/*
+ * Deliver mp upstream, chained behind the T_UNITDATA_IND block mp1.  An
+ * error reported by the su_recv upcall is only counted in
+ * lx_netlink_flowctrld.
+ */
+static void
+lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1)
+{
+	lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+	int rcverr;
+
+	/*
+	 * Messages go up as T_UNITDATA_IND rather than raw M_DATA, both to
+	 * indicate their origin and to keep the stream head from coalescing
+	 * them.
+	 */
+	mp1->b_cont = mp;
+
+	lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1,
+	    msgdsize(mp1), 0, &rcverr, NULL);
+
+	if (rcverr != 0) {
+		lx_netlink_flowctrld++;
+	}
+}
+
+/*
+ * Send the message currently being built and detach it from the reply so
+ * that another can be started.  Does nothing once the reply has failed;
+ * sets lxnr_errno to ENOMEM if the T_UNITDATA_IND block cannot be
+ * allocated.
+ */
+static void
+lx_netlink_reply_send(lx_netlink_reply_t *reply)
+{
+	mblk_t *indmp;
+
+	if (reply->lxnr_errno != 0) {
+		return;
+	}
+
+	indmp = lx_netlink_alloc_mp1(reply->lxnr_sock);
+	if (indmp == NULL) {
+		reply->lxnr_errno = ENOMEM;
+		return;
+	}
+
+	lx_netlink_reply_sendup(reply, reply->lxnr_mp, indmp);
+	reply->lxnr_mp = NULL;
+}
+
+/*
+ * Finish a reply and free it.  On success a "done" message is sent; if
+ * anything failed along the way, the preallocated error block is sent
+ * instead, carrying the stored errno and a copy of the request header.
+ * Either way the message goes up behind the T_UNITDATA_IND block that was
+ * allocated when the reply was created.
+ */
+static void
+lx_netlink_reply_done(lx_netlink_reply_t *reply)
+{
+	lx_netlink_sock_t *lxsock = reply->lxnr_sock;
+	mblk_t *mp;
+
+	/*
+	 * Denote that we're done via a message with a NULL payload.
+	 */
+	lx_netlink_reply_msg(reply, NULL, 0);
+
+	if (reply->lxnr_errno) {
+		/*
+		 * If anything failed, we'll send up an error message.
+		 */
+		lx_netlink_hdr_t *hdr;
+		lx_netlink_err_t *err;
+
+		/* Discard any partially built message */
+		if (reply->lxnr_mp != NULL) {
+			freeb(reply->lxnr_mp);
+			reply->lxnr_mp = NULL;
+		}
+
+		mp = reply->lxnr_err;
+		VERIFY(mp != NULL);
+		reply->lxnr_err = NULL;
+		err = (lx_netlink_err_t *)mp->b_rptr;
+		hdr = &err->lxne_hdr;
+
+		/*
+		 * The error block comes from allocb() and has not been
+		 * cleared.  Zero it so that fields we do not set explicitly
+		 * (lxnh_flags) and any padding are not sent to userland
+		 * uninitialized.
+		 */
+		bzero(err, sizeof (lx_netlink_err_t));
+		mp->b_wptr += sizeof (lx_netlink_err_t);
+
+		err->lxne_failed = reply->lxnr_hdr;
+		err->lxne_errno = reply->lxnr_errno;
+		hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR;
+		hdr->lxnh_len = sizeof (lx_netlink_err_t);
+		hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+		hdr->lxnh_pid = lxsock->lxns_port;
+	} else {
+		mp = reply->lxnr_mp;
+		VERIFY(mp != NULL);
+		reply->lxnr_mp = NULL;
+	}
+
+	lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1);
+
+	/* Free whichever preallocated block was not sent */
+	if (reply->lxnr_mp != NULL)
+		freeb(reply->lxnr_mp);
+
+	if (reply->lxnr_err != NULL)
+		freeb(reply->lxnr_err);
+
+	kmem_free(reply, sizeof (lx_netlink_reply_t));
+}
+
+/*
+ * Send a bare error message for the request described by hdr.  Returns
+ * ENOMEM if the reply cannot be allocated, 0 otherwise.
+ */
+static int
+lx_netlink_reply_error(lx_netlink_sock_t *lxsock,
+    lx_netlink_hdr_t *hdr, int errno)
+{
+	lx_netlink_reply_t *reply;
+
+	/*
+	 * The reply type is irrelevant here: lxnr_errno is set explicitly
+	 * below, so only an error message will be sent.
+	 */
+	reply = lx_netlink_reply(lxsock, hdr, 0);
+	if (reply == NULL) {
+		return (ENOMEM);
+	}
+
+	reply->lxnr_errno = errno;
+	lx_netlink_reply_done(reply);
+
+	return (0);
+}
+
+/*
+ * Locate the fixed-size message body, and optionally the attributes that
+ * follow it, within a request.
+ *
+ *   mp		request; the caller has verified it holds a full header
+ *   msgp	set to point at the message body
+ *   msg_size	expected size of the message body (aligned internally)
+ *   attrp	array to receive attribute pointers, or NULL to skip them
+ *   attr_max	in: capacity of attrp; out: number of attributes found
+ *
+ * Returns 0 on success, or -1 if the body or an attribute does not fit in
+ * the data supplied or an attribute carries an impossible length.
+ */
+static int
+lx_netlink_parse_msg_attrs(mblk_t *mp, void **msgp, unsigned int msg_size,
+    lx_netlink_attr_t **attrp, unsigned int *attr_max)
+{
+	lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+	lx_netlink_attr_t *lxa;
+	unsigned char *buf = mp->b_rptr + LXNLMSG_HDRLEN;
+	unsigned int i;
+	uint32_t buf_left = MBLKL(mp) - LXNLMSG_HDRLEN;
+	uint32_t msg_left = hdr->lxnh_len;
+
+	msg_size = LXNLMSG_ALIGN(msg_size);
+	if (msg_size > buf_left || msg_size > msg_left) {
+		return (-1);
+	}
+
+	*msgp = (void *)buf;
+	buf += msg_size;
+	buf_left -= msg_size;
+	msg_left -= msg_size;
+
+	/* Do not bother with attr parsing if not requested */
+	if (attrp == NULL || *attr_max == 0) {
+		return (0);
+	}
+
+	for (i = 0; i < *attr_max; i++) {
+		uint32_t step;
+
+		if (buf_left < LXATTR_HDRLEN || msg_left < LXATTR_HDRLEN) {
+			break;
+		}
+
+		lxa = (lx_netlink_attr_t *)buf;
+
+		/*
+		 * The length covers the attribute header, so anything
+		 * shorter than the header is malformed.  Without this check
+		 * a zero length would make no progress and the same bytes
+		 * would be reported as an attribute over and over.
+		 */
+		if (lxa->lxna_len < LXATTR_HDRLEN ||
+		    lxa->lxna_len > buf_left || lxa->lxna_len > msg_left) {
+			return (-1);
+		}
+
+		attrp[i] = lxa;
+
+		/*
+		 * Attributes are padded out to the netlink alignment (as
+		 * lx_netlink_reply_add() does when emitting them), while
+		 * lxna_len excludes that padding.  Step over the padded
+		 * length, but never past the data we actually have.
+		 */
+		step = LXNLMSG_ALIGN(lxa->lxna_len);
+		if (step > buf_left)
+			step = buf_left;
+		if (step > msg_left)
+			step = msg_left;
+
+		buf += step;
+		buf_left -= step;
+		msg_left -= step;
+	}
+	*attr_max = i;
+
+	return (0);
+}
+
+/*
+ * Map an IPv4 address, given in network byte order, to its route scope:
+ * host for loopback, link for autoconf addresses, site for the private
+ * ranges, and universe for everything else.
+ */
+static uint8_t
+lx_ipv4_rtscope(in_addr_t nbo_addr)
+{
+	const in_addr_t host = ntohl(nbo_addr);
+
+	if ((host >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+		return (LX_RTSCOPE_HOST);
+
+	if ((host & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET)
+		return (LX_RTSCOPE_LINK);
+
+	if ((host & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET ||
+	    (host & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET ||
+	    (host & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET)
+		return (LX_RTSCOPE_SITE);
+
+	return (LX_RTSCOPE_UNIVERSE);
+}
+
+/*
+ * Map an IPv6 address to its route scope: host for the loopback address,
+ * link for link-local, site for site-local, and universe otherwise.
+ */
+static uint8_t
+lx_ipv6_rtscope(const in6_addr_t *addr)
+{
+	if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback))
+		return (LX_RTSCOPE_HOST);
+
+	if (IN6_IS_ADDR_LINKLOCAL(addr))
+		return (LX_RTSCOPE_LINK);
+
+	if (IN6_IS_ADDR_SITELOCAL(addr))
+		return (LX_RTSCOPE_SITE);
+
+	return (LX_RTSCOPE_UNIVERSE);
+}
+
+/*
+ * Per-interface callback for RTM_GETLINK: emit one link message for the
+ * interface named in lifr, carrying its index, flags, type, name, MTU,
+ * hardware address (if any) and a nominal transmit queue length.  Non-zero
+ * logical interfaces are skipped.  The same lifreq is reused for each
+ * ioctl, so the order of the queries below matters.
+ */
+static void
+lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+	lx_netlink_ifinfomsg_t ifi;
+	int i;
+	char if_name[IFNAMSIZ];
+	struct sockaddr_dl *sdl;
+	struct sockaddr hwaddr;
+	int hwaddr_size;
+	boolean_t is_loopback;
+
+	/* Native interface flags and their Linux equivalents */
+	struct {
+		int native;
+		int lx;
+	} flags[] = {
+		{ IFF_UP, LX_IFF_UP },
+		{ IFF_BROADCAST, LX_IFF_BROADCAST },
+		{ IFF_DEBUG, LX_IFF_DEBUG },
+		{ IFF_LOOPBACK, LX_IFF_LOOPBACK },
+		{ IFF_POINTOPOINT, LX_IFF_POINTOPOINT },
+		{ IFF_NOTRAILERS, LX_IFF_NOTRAILERS },
+		{ IFF_RUNNING, LX_IFF_RUNNING },
+		{ IFF_NOARP, LX_IFF_NOARP },
+		{ IFF_PROMISC, LX_IFF_PROMISC },
+		{ IFF_ALLMULTI, LX_IFF_ALLMULTI },
+		{ IFF_MULTICAST, LX_IFF_MULTICAST },
+		{ 0 }
+	};
+
+	/*
+	 * illumos interfaces that contain a ':' are non-zero logical
+	 * interfaces. We should only emit the name of the zeroth logical
+	 * interface, since RTM_GETLINK only expects to see the name of
+	 * devices. The addresses of all logical devices will be
+	 * returned via an RTM_GETADDR.
+	 */
+	if (strchr(lifr->lifr_name, ':') != NULL)
+		return;
+
+	/*
+	 * Most of the lx_netlink module is architected to emit information in
+	 * an illumos-native manner.  Socket syscalls such as getsockname will
+	 * not translate fields to values Linux programs would expect since
+	 * that conversion is performed by the generic socket emulation.
+	 *
+	 * This is _not_ true of the actual protocol output from lx_netlink.
+	 * Since translating it at the socket layer would be onerous, all
+	 * output (including constants and names) is pre-translated to values
+	 * valid for Linux.
+	 */
+
+	bzero(&ifi, sizeof (ifi));
+	ifi.lxnl_ifi_family = AF_UNSPEC;
+	ifi.lxnl_ifi_change = (uint32_t)-1;
+
+	/* Convert the name to be Linux-friendly */
+	(void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ);
+	lx_ifname_convert(if_name, LX_IF_FROMNATIVE);
+	/* Any converted name beginning with "lo" is treated as loopback */
+	is_loopback = (strncmp(if_name, "lo", 2) == 0);
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0)
+		return;
+
+	ifi.lxnl_ifi_index = lifr->lifr_index;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0)
+		return;
+
+	for (i = 0; flags[i].native; i++) {
+		if (lifr->lifr_flags & flags[i].native)
+			ifi.lxnl_ifi_flags |= flags[i].lx;
+	}
+
+	/*
+	 * Query the datalink address.
+	 * The interface type will be included in the outgoing infomsg while
+	 * the address itself will be output separately.
+	 */
+	sdl = (struct sockaddr_dl *)&lifr->lifr_addr;
+	bzero(sdl, sizeof (*sdl));
+	if (!is_loopback) {
+		/*
+		 * The return value is not checked here; a failure is
+		 * recorded in the reply's errno, which turns the remaining
+		 * reply calls into no-ops.
+		 */
+		lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr);
+	} else {
+		/* Simulate an empty hwaddr for loopback */
+		sdl->sdl_type = DL_LOOP;
+		sdl->sdl_alen = ETHERADDRL;
+	}
+	lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size);
+
+	ifi.lxnl_ifi_type = hwaddr.sa_family;
+	lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t));
+
+	lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name);
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0)
+		return;
+
+	lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu);
+
+	if (hwaddr_size != 0) {
+		lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS,
+		    hwaddr.sa_data, hwaddr_size);
+	}
+
+	/* Emulate a txqlen of 1. (0 for loopbacks) */
+	lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN,
+	    (is_loopback) ? 0 : 1);
+
+	lx_netlink_reply_send(reply);
+}
+
+/*
+ * Call func for each logical interface of each address family (IPv4, then
+ * IPv6), with the socket's current ip handle set to match the family.  If
+ * distinct is set, an interface name already seen in an earlier family is
+ * not passed to func again.  An ioctl failure stops the walk and is
+ * recorded in the reply's errno.
+ */
+static void
+lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply,
+    void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct)
+{
+	lx_netlink_sock_t *sock = reply->lxnr_sock;
+	int nlifr, i;
+
+	/* Members without an initializer below start out zeroed */
+	struct {
+		int family;
+		ldi_handle_t handle;
+		struct lifconf lifc;
+		struct lifnum lifn;
+		size_t bufsize;		/* bytes allocated for lifc_buf */
+	} families[] = {
+		{ AF_INET, sock->lxns_iphandle },
+		{ AF_INET6, sock->lxns_ip6handle },
+		{ AF_UNSPEC }
+	}, *family, *check;
+
+	for (family = families; family->family != AF_UNSPEC; family++) {
+		struct lifconf *lifc = &family->lifc;
+		struct lifnum *lifn = &family->lifn;
+
+		lifn->lifn_family = family->family;
+		sock->lxns_current = family->handle;
+
+		if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0)
+			break;
+
+		lifc->lifc_family = lifn->lifn_family;
+		lifc->lifc_flags = 0;
+		lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq);
+		if (lifn->lifn_count == 0) {
+			lifc->lifc_buf = NULL;
+			continue;
+		}
+
+		/*
+		 * Remember how much we allocate: SIOCGLIFCONF may rewrite
+		 * lifc_len to the number of bytes it actually returned, and
+		 * kmem_free() must be given the original allocation size.
+		 */
+		family->bufsize = lifc->lifc_len;
+		lifc->lifc_buf = kmem_alloc(family->bufsize, KM_SLEEP);
+
+		if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0)
+			break;
+
+		nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]);
+
+		for (i = 0; i < nlifr; i++) {
+			if (!distinct) {
+				func(reply, &lifc->lifc_req[i]);
+				continue;
+			}
+
+			/*
+			 * If we have been asked to provide each interface
+			 * exactly once, we need to (annoyingly) check this
+			 * name against others that we've already processed for
+			 * other families.  Yes, this is quadratic time -- but
+			 * the number of interfaces per family is expected to
+			 * be very small.
+			 */
+			for (check = families; check != family; check++) {
+				struct lifconf *clifc = &check->lifc;
+				int cnlifr = clifc->lifc_len /
+				    sizeof (clifc->lifc_req[0]), j;
+				char *nm = lifc->lifc_req[i].lifr_name, *cnm;
+
+				for (j = 0; j < cnlifr; j++) {
+					cnm = clifc->lifc_req[j].lifr_name;
+
+					if (strcmp(nm, cnm) == 0)
+						break;
+				}
+
+				/* Name was found in an earlier family */
+				if (j != cnlifr)
+					break;
+			}
+
+			if (check != family)
+				continue;
+
+			func(reply, &lifc->lifc_req[i]);
+		}
+	}
+
+	for (family = families; family->family != AF_UNSPEC; family++) {
+		struct lifconf *lifc = &family->lifc;
+
+		if (lifc->lifc_buf != NULL)
+			kmem_free(lifc->lifc_buf, family->bufsize);
+	}
+}
+
+/*
+ * RTM_GETLINK handler: reply with one RTM_NEWLINK message per distinct
+ * interface.  Returns ENOMEM if the reply cannot be allocated.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+	lx_netlink_reply_t *reply =
+	    lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK);
+
+	if (reply == NULL) {
+		return (ENOMEM);
+	}
+
+	/* B_TRUE: report each interface name only once across families */
+	lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE);
+	lx_netlink_reply_done(reply);
+
+	return (0);
+}
+
+/*
+ * Per-interface callback for RTM_GETADDR: emit one address message for
+ * the logical interface named in lifr, carrying its index, prefix length,
+ * scope and address.  Interfaces flagged IFF_NOLOCAL are skipped.  The
+ * same lifreq is reused for each ioctl, so the query order matters.
+ */
+static void
+lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+	lx_netlink_ifaddrmsg_t ifa;
+	struct sockaddr_in *sin4;
+	struct sockaddr_in6 *sin6;
+	boolean_t is_v4;
+
+	bzero(&ifa, sizeof (ifa));
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) {
+		return;
+	}
+	ifa.lxnl_ifa_index = lifr->lifr_index;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) {
+		return;
+	}
+
+	/*
+	 * Don't report on-link subnets
+	 */
+	if ((lifr->lifr_flags & IFF_NOLOCAL) != 0) {
+		return;
+	}
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0) {
+		return;
+	}
+	ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0) {
+		return;
+	}
+
+	/* Anything that is not AF_INET is handled as IPv6 */
+	is_v4 = (lifr->lifr_addr.ss_family == AF_INET);
+	sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
+	sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
+
+	if (is_v4) {
+		ifa.lxnl_ifa_family = LX_AF_INET;
+		ifa.lxnl_ifa_scope = lx_ipv4_rtscope(sin4->sin_addr.s_addr);
+	} else {
+		ifa.lxnl_ifa_family = LX_AF_INET6;
+		ifa.lxnl_ifa_scope = lx_ipv6_rtscope(&sin6->sin6_addr);
+	}
+
+	lx_netlink_reply_msg(reply, &ifa, sizeof (lx_netlink_ifaddrmsg_t));
+
+	if (is_v4) {
+		lx_netlink_reply_attr_int32(reply,
+		    LX_NETLINK_IFA_ADDRESS, sin4->sin_addr.s_addr);
+	} else {
+		lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS,
+		    &sin6->sin6_addr, sizeof (sin6->sin6_addr));
+	}
+
+	lx_netlink_reply_send(reply);
+}
+
+/*
+ * RTM_GETADDR handler: reply with one RTM_NEWADDR message per logical
+ * interface in each family.  Returns ENOMEM if the reply cannot be
+ * allocated.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+	lx_netlink_reply_t *reply =
+	    lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR);
+
+	if (reply == NULL) {
+		return (ENOMEM);
+	}
+
+	/* B_FALSE: every logical interface of every family is reported */
+	lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE);
+	lx_netlink_reply_done(reply);
+
+	return (0);
+}
+
+/*
+ * Context handed to lx_netlink_getroute_ipv4() for each route visited
+ * while servicing an RTM_GETROUTE request.
+ */
+struct lx_getroute_ctx {
+	lx_netlink_reply_t *lgrtctx_reply;	/* reply being built */
+	lx_netlink_rtmsg_t *lgrtctx_rtmsg;	/* copy of the request's rtmsg */
+	lx_netlink_attr_t *lgrtctx_attrs[LX_NETLINK_MAX_RTA]; /* parsed attrs */
+	unsigned int lgrtctx_max_attr;		/* # of valid lgrtctx_attrs */
+	lx_netlink_attr_t *lgrtctx_rtadst;	/* RTA_DST attr, if supplied */
+};
+
+/*
+ * Emit one RTM_NEWROUTE message describing the IPv4 route entry ire.
+ * Used both as the ire_walk_v4() callback when listing routes and directly
+ * when resolving the route to a single destination.  Entry types that do
+ * not map onto netlink routes are skipped.
+ */
+static void
+lx_netlink_getroute_ipv4(ire_t *ire, struct lx_getroute_ctx *ctx)
+{
+	lx_netlink_reply_t *reply = ctx->lgrtctx_reply;
+	lx_netlink_rtmsg_t *rtmsg = ctx->lgrtctx_rtmsg;
+	lx_netlink_attr_t *rtadst = ctx->lgrtctx_rtadst;
+	lx_netlink_rtmsg_t res;
+	ill_t *ill = NULL;
+
+	/* Certain IREs are too specific for netlink */
+	if ((ire->ire_type & (IRE_BROADCAST | IRE_MULTICAST | IRE_NOROUTE |
+	    IRE_LOOPBACK | IRE_LOCAL)) != 0 || ire->ire_testhidden != 0) {
+		return;
+	}
+	/*
+	 * When listing routes, CLONE entries are undesired.
+	 * They are required for 'ip route get' on a local address.
+	 */
+	if (rtmsg->rtm_dst_len == 0 && (ire->ire_type & IRE_IF_CLONE) != 0) {
+		return;
+	}
+
+	bzero(&res, sizeof (res));
+	res.rtm_family = LX_AF_INET;
+	res.rtm_table = LX_ROUTE_TABLE_MAIN;
+	res.rtm_type = LX_RTN_UNICAST;
+	res.rtm_dst_len = ire->ire_masklen;
+
+	if (ire->ire_type & (IRE_IF_NORESOLVER|IRE_IF_RESOLVER)) {
+		/* Interface-local networks considered kernel-created */
+		res.rtm_protocol = LX_RTPROT_KERNEL;
+		res.rtm_scope = LX_RTSCOPE_LINK;
+	} else if (ire->ire_flags & RTF_STATIC) {
+		res.rtm_protocol = LX_RTPROT_STATIC;
+	}
+
+	/* 0x20 (32) is a full-length IPv4 prefix, i.e. a single host */
+	if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) {
+		/*
+		 * Specify single-destination route.
+		 * RTA_DST details will be added later
+		 */
+		res.rtm_dst_len = rtmsg->rtm_dst_len;
+	}
+
+
+	lx_netlink_reply_msg(reply, &res, sizeof (res));
+
+	if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) {
+		/* Add RTA_DST details for single-destination route. */
+		lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST,
+		    LXATTR_PAYLOAD(rtadst), sizeof (ipaddr_t));
+	} else if (ire->ire_masklen != 0) {
+		/* No RTA_DST is emitted for a zero-length mask */
+		lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST,
+		    &ire->ire_addr, sizeof (ire->ire_addr));
+	}
+
+	/* Prefer the entry's own ill; fall back to its parent's, if any */
+	if (ire->ire_ill != NULL) {
+		ill = ire->ire_ill;
+	} else if (ire->ire_dep_parent != NULL) {
+		ill = ire->ire_dep_parent->ire_ill;
+	}
+
+	if (ill != NULL) {
+		uint32_t ifindex, addr_src;
+
+		ifindex = ill->ill_phyint->phyint_ifindex;
+		lx_netlink_reply_attr(reply, LX_NETLINK_RTA_OIF,
+		    &ifindex, sizeof (ifindex));
+
+		addr_src = ill->ill_ipif->ipif_lcl_addr;
+		lx_netlink_reply_attr(reply, LX_NETLINK_RTA_PREFSRC,
+		    &addr_src, sizeof (addr_src));
+	}
+
+	if (ire->ire_flags & RTF_GATEWAY) {
+		lx_netlink_reply_attr(reply, LX_NETLINK_RTA_GATEWAY,
+		    &ire->ire_gateway_addr, sizeof (ire->ire_gateway_addr));
+	}
+
+	lx_netlink_reply_send(reply);
+}
+
+/*
+ * RTM_GETROUTE handler.  With a full-length destination and an RTA_DST
+ * attribute, the route to that single host is resolved and reported;
+ * otherwise the IPv4 routing table is listed.  Only the main table is
+ * reported, and IPv6 requests are answered with an error.
+ *
+ * Returns EPROTO if the request cannot be parsed or asks for IPv6, ENOMEM
+ * if the reply cannot be allocated, and 0 otherwise.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_getroute(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr,
+    mblk_t *mp)
+{
+	struct lx_getroute_ctx ctx;
+	lx_netlink_reply_t *reply;
+	lx_netlink_rtmsg_t rtmsg, *rtmsgp;
+	int rtmsg_size = sizeof (rtmsg);
+	netstack_t *ns;
+	int i;
+
+	bzero(&ctx, sizeof (ctx));
+	ctx.lgrtctx_max_attr = LX_NETLINK_MAX_RTA;
+
+	if (lx_netlink_parse_msg_attrs(mp, (void **)&rtmsgp,
+	    rtmsg_size, ctx.lgrtctx_attrs, &ctx.lgrtctx_max_attr) != 0) {
+		return (EPROTO);
+	}
+
+	/*
+	 * Older versions of libnetlink send a truncated rtmsg struct for
+	 * certain RTM_GETROUTE queries.  We must detect this condition and
+	 * truncate our input to prevent later confusion.
+	 */
+	if (curproc->p_zone->zone_brand == &lx_brand &&
+	    lx_kern_release_cmp(curproc->p_zone, "2.6.32") <= 0 &&
+	    rtmsgp->rtm_dst_len == 0) {
+		rtmsg_size = sizeof (rtmsg.rtm_family);
+	}
+	/* Work on a local copy; fields beyond rtmsg_size stay zero */
+	bzero(&rtmsg, sizeof (rtmsg));
+	bcopy(rtmsgp, &rtmsg, rtmsg_size);
+	ctx.lgrtctx_rtmsg = &rtmsg;
+
+	/* If RTA_DST was passed, it affects later decisions */
+	for (i = 0; i < ctx.lgrtctx_max_attr; i++) {
+		lx_netlink_attr_t *attr = ctx.lgrtctx_attrs[i];
+
+		/* Only an RTA_DST carrying exactly one IPv4 address counts */
+		if (attr->lxna_type == LX_NETLINK_RTA_DST &&
+		    attr->lxna_len == LXATTR_LEN(sizeof (ipaddr_t))) {
+			ctx.lgrtctx_rtadst = attr;
+			break;
+		}
+	}
+
+	reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWROUTE);
+	if (reply == NULL) {
+		return (ENOMEM);
+	}
+	ctx.lgrtctx_reply = reply;
+
+	/* Do not report anything outside the main table */
+	if (rtmsg.rtm_table != LX_ROUTE_TABLE_MAIN &&
+	    rtmsg.rtm_table != 0) {
+		lx_netlink_reply_done(reply);
+		return (0);
+	}
+
+	ns = netstack_get_current();
+	if (ns == NULL) {
+		lx_netlink_reply_done(reply);
+		return (0);
+	}
+	if (rtmsg.rtm_family == LX_AF_INET || rtmsg.rtm_family == 0) {
+		if (rtmsg.rtm_dst_len == 0x20 && ctx.lgrtctx_rtadst != NULL) {
+			/* resolve route for host */
+			ipaddr_t *dst = LXATTR_PAYLOAD(ctx.lgrtctx_rtadst);
+			ire_t *ire_dst;
+
+			ire_dst = ire_route_recursive_dstonly_v4(*dst, 0, 0,
+			    ns->netstack_ip);
+			lx_netlink_getroute_ipv4(ire_dst, &ctx);
+			ire_refrele(ire_dst);
+		} else {
+			/* get route listing */
+			ire_walk_v4(&lx_netlink_getroute_ipv4, &ctx, ALL_ZONES,
+			    ns->netstack_ip);
+		}
+	}
+	if (rtmsg.rtm_family == LX_AF_INET6) {
+		/* punt on ipv6 for now */
+		netstack_rele(ns);
+		lx_netlink_reply_done(reply);
+		return (EPROTO);
+	}
+	netstack_rele(ns);
+
+	lx_netlink_reply_done(reply);
+	return (0);
+}
+
+
+/*
+ * Handler for every message sent on a LX_NETLINK_AUDIT socket.  Sends no
+ * reply; the request simply fails.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+	/*
+	 * For all auditing messages, we return ECONNREFUSED, which seems to
+	 * keep user-level auditing happy.  (Or at least, non-suicidal.)
+	 */
+	return (ECONNREFUSED);
+}
+
+/*
+ * Handler for every message sent on a LX_NETLINK_KOBJECT_UEVENT socket.
+ * The message is accepted and discarded.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock,
+    lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+	/*
+	 * For udev, we just silently accept all writes and never actually
+	 * reply with anything -- which appears to be sufficient for things
+	 * to work.
+	 */
+	return (0);
+}
+
+/*
+ * sd_send downcall: dispatch an outbound netlink message to the handler
+ * registered for the socket's protocol and the message type.  The message
+ * is always freed here, whatever the outcome.  Messages with no matching
+ * handler are answered with an EOPNOTSUPP error message.
+ *
+ * Returns EPROTO for anything that is not M_DATA holding at least a full
+ * netlink header; otherwise the handler's return value.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp,
+    struct nmsghdr *msg, cred_t *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr;
+	int i, rval;
+
+	/*
+	 * Dispatch table, terminated by a NULL func.  A type of
+	 * LX_NETLINK_NLMSG_NONE matches every message type for that protocol.
+	 */
+	static struct {
+		int proto;
+		uint16_t type;
+		int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *);
+	} handlers[] = {
+		{ LX_NETLINK_ROUTE,
+		    LX_NETLINK_RTM_GETLINK, lx_netlink_getlink },
+		{ LX_NETLINK_ROUTE,
+		    LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr },
+		{ LX_NETLINK_ROUTE,
+		    LX_NETLINK_RTM_GETROUTE, lx_netlink_getroute },
+		{ LX_NETLINK_AUDIT,
+		    LX_NETLINK_NLMSG_NONE, lx_netlink_audit },
+		{ LX_NETLINK_KOBJECT_UEVENT,
+		    LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent },
+		{ LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL }
+	};
+
+	/* hdr is only dereferenced once the length check below has passed */
+	if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) {
+		freemsg(mp);
+		return (EPROTO);
+	}
+
+	for (i = 0; handlers[i].func != NULL; i++) {
+		if (lxsock->lxns_proto != handlers[i].proto)
+			continue;
+
+		if (handlers[i].type != LX_NETLINK_NLMSG_NONE &&
+		    hdr->lxnh_type != handlers[i].type)
+			continue;
+
+		rval = handlers[i].func(lxsock, hdr, mp);
+		freemsg(mp);
+
+		return (rval);
+	}
+
+	/*
+	 * An unrecognized message.  We will bounce up an EOPNOTSUPP reply.
+	 */
+	rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP);
+	freemsg(mp);
+
+	return (rval);
+}
+
+/*
+ * sd_close downcall: unlink the socket from the global list, close its
+ * two ip handles and free it.
+ */
+/*ARGSUSED*/
+static int
+lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev;
+
+	mutex_enter(&lx_netlink_lock);
+
+	/* Walk the link pointers until we reach the one naming this socket */
+	prev = &lx_netlink_head;
+	while ((sock = *prev) != lxsock) {
+		prev = &sock->lxns_next;
+	}
+	*prev = sock->lxns_next;
+
+	mutex_exit(&lx_netlink_lock);
+
+	(void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred);
+	(void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred);
+	kmem_free(lxsock, sizeof (lx_netlink_sock_t));
+
+	return (0);
+}
+
+/*
+ * Downcall vector handed to the socket layer by lx_netlink_create().
+ * Operations this module does not implement use the generic "notsupp"
+ * routines or are left NULL.
+ */
+static sock_downcalls_t sock_lx_netlink_downcalls = {
+	lx_netlink_activate,		/* sd_activate */
+	sock_accept_notsupp,		/* sd_accept */
+	lx_netlink_bind,		/* sd_bind */
+	sock_listen_notsupp,		/* sd_listen */
+	sock_connect_notsupp,		/* sd_connect */
+	sock_getpeername_notsupp,	/* sd_getpeername */
+	lx_netlink_getsockname,		/* sd_getsockname */
+	sock_getsockopt_notsupp,	/* sd_getsockopt */
+	lx_netlink_setsockopt,		/* sd_setsockopt */
+	lx_netlink_send,		/* sd_send */
+	NULL,				/* sd_send_uio */
+	NULL,				/* sd_recv_uio */
+	NULL,				/* sd_poll */
+	sock_shutdown_notsupp,		/* sd_shutdown */
+	sock_clr_flowctrl_notsupp,	/* sd_setflowctrl */
+	sock_ioctl_notsupp,		/* sd_ioctl */
+	lx_netlink_close		/* sd_close */
+};
+
+/*
+ * Socket creation entry point registered through sinfo.  Accepts only
+ * AF_LX_NETLINK datagram or raw sockets for the route, audit and
+ * kobject-uevent protocols.  Opens the ip and ip6 devices for later
+ * ioctls, allocates the per-socket state and links it onto the global
+ * list.
+ *
+ * On failure NULL is returned and *errorp holds the reason.
+ */
+/*ARGSUSED*/
+static sock_lower_handle_t
+lx_netlink_create(int family, int type, int proto,
+    sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp,
+    int flags, cred_t *credp)
+{
+	lx_netlink_sock_t *lxsock;
+	ldi_handle_t handle, handle6;
+	/* NOTE(review): this local shadows the global kcred -- confirm intent */
+	cred_t *kcred = zone_kcred();
+	int err;
+
+	if (family != AF_LX_NETLINK ||
+	    (type != SOCK_DGRAM && type != SOCK_RAW)) {
+		*errorp = EPROTONOSUPPORT;
+		return (NULL);
+	}
+
+	switch (proto) {
+	case LX_NETLINK_ROUTE:
+	case LX_NETLINK_AUDIT:
+	case LX_NETLINK_KOBJECT_UEVENT:
+		break;
+
+	default:
+		*errorp = EPROTONOSUPPORT;
+		return (NULL);
+	}
+
+	if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred,
+	    &handle, lx_netlink_ldi)) != 0) {
+		*errorp = err;
+		return (NULL);
+	}
+
+	if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred,
+	    &handle6, lx_netlink_ldi)) != 0) {
+		/* Undo the first open before failing */
+		(void) ldi_close(handle, FREAD, kcred);
+		*errorp = err;
+		return (NULL);
+	}
+
+	*sock_downcalls = &sock_lx_netlink_downcalls;
+	*smodep = SM_ATOMIC;
+
+	lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP);
+	lxsock->lxns_iphandle = handle;
+	lxsock->lxns_ip6handle = handle6;
+	lxsock->lxns_bufsize = lx_netlink_bufsize;
+	lxsock->lxns_proto = proto;
+
+	mutex_enter(&lx_netlink_lock);
+
+	/* Push onto the head of the global socket list */
+	lxsock->lxns_next = lx_netlink_head;
+	lx_netlink_head = lxsock;
+
+	mutex_exit(&lx_netlink_lock);
+
+	return ((sock_lower_handle_t)lxsock);
+}
+
+/*
+ * Module initialization: obtain the LDI identity, derived from the "ip"
+ * driver's major number, that is used when opening the ip devices.
+ */
+static void
+lx_netlink_init(void)
+{
+	major_t major;
+	int err;
+
+	major = mod_name_to_major("ip");
+	VERIFY(major != DDI_MAJOR_T_NONE);
+
+	err = ldi_ident_from_major(major, &lx_netlink_ldi);
+	VERIFY(err == 0);
+}
+
+/*
+ * Module teardown: release the LDI identity obtained by lx_netlink_init().
+ */
+static void
+lx_netlink_fini(void)
+{
+	ldi_ident_release(lx_netlink_ldi);
+}
+
+/* socket module registration: names the module and its create routine */
+static smod_reg_t sinfo = {
+	SOCKMOD_VERSION,
+	"lx_netlink",
+	SOCK_UC_VERSION,
+	SOCK_DC_VERSION,
+	lx_netlink_create,
+	NULL
+};
+
+/* modlsockmod structure */
+static struct modlsockmod sockmod = {
+	&mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo
+};
+
+/* modlinkage structure */
+static struct modlinkage ml = {
+	MODREV_1,
+	&sockmod,
+	NULL
+};
+
+/*
+ * Module load entry point.  The initialization is undone if the module
+ * cannot be installed.
+ */
+int
+_init(void)
+{
+	int err;
+
+	lx_netlink_init();
+
+	err = mod_install(&ml);
+	if (err != 0) {
+		lx_netlink_fini();
+	}
+
+	return (err);
+}
+
+/*
+ * Module information entry point.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&ml, modinfop));
+}
+
+/*
+ * Module unload entry point.  Refuses with EBUSY while any socket is still
+ * on the global list; otherwise removes the module and, if that succeeds,
+ * tears down the module state.
+ */
+int
+_fini(void)
+{
+	boolean_t busy;
+	int err;
+
+	mutex_enter(&lx_netlink_lock);
+	busy = (lx_netlink_head != NULL);
+	mutex_exit(&lx_netlink_lock);
+
+	if (busy) {
+		return (EBUSY);
+	}
+
+	err = mod_remove(&ml);
+	if (err == 0) {
+		lx_netlink_fini();
+	}
+
+	return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c
new file mode 100644
index 0000000000..23e0c6f459
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c
@@ -0,0 +1,1188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc.  All rights reserved.
+ */
+
+
+/*
+ * This driver attempts to emulate some of the behaviors of
+ * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris.
+ *
+ * It does this by layering over the /dev/ptmx device and intercepting
+ * opens to it.
+ *
+ * This driver makes the following assumptions about the way the ptm/pts
+ * drivers on Solaris work:
+ *
+ *    - all opens of the /dev/ptmx device node return a unique dev_t.
+ *
+ *    - the dev_t minor node value for each open ptm instance corresponds
+ *      to its associated slave terminal device number.  ie. the path to
+ *      the slave terminal device associated with an open ptm instance
+ *      whose dev_t minor node value is 5, is /dev/pts/5.
+ *
+ *    - the ptm driver always allocates the lowest numbered slave terminal
+ *      device possible.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/devops.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/kstr.h>
+#include <sys/lx_ptm.h>
+#include <sys/modctl.h>
+#include <sys/pathname.h>
+#include <sys/ptms.h>
+#include <sys/ptyvar.h>
+#include <sys/stat.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/sdt.h>
+
+#define	LP_PTM_PATH		"/dev/ptmx"	/* opened by lx_ptm_open() */
+#define	LP_PTS_PATH		"/dev/pts/"
+#define	LP_PTS_DRV_NAME		"pts"		/* for ddi_name_to_major() */
+#define	LP_PTS_USEC_DELAY	(5 * 1000)	/* 5 ms */
+#define	LP_PTS_USEC_DELAY_MAX	(5 * MILLISEC)	/* 5 ms */
+
+/*
+ * this driver is layered on top of the ptm driver.  we'd like to
+ * make this driver's minor name space a mirror of the ptm driver's
+ * namespace, but we can't actually do this.  the reason is that the
+ * ptm driver is opened via the clone driver.  therefore no minor nodes
+ * of the ptm driver are actually accessible via the filesystem.
+ * since we're not a streams device we can't be opened by the clone
+ * driver.  therefore we need to have at least one minor node accessible
+ * via the filesystem so that consumers can open it.  we use the device
+ * node with a minor number of 0 for this purpose.  what this means is
+ * that minor node 0 can't be used to map ptm minor node 0.  since this
+ * minor node is now reserved we need to shift our ptm minor node
+ * mappings by one.  ie. a ptm minor node with a value of 0 will
+ * correspond to our minor node with a value of 1.  these mappings are
+ * managed with the following macros.
+ */
+#define	DEVT_TO_INDEX(x)	LX_PTM_DEV_TO_PTS(x)	/* dev_t -> index */
+#define	INDEX_TO_MINOR(x)	((x) + 1)		/* index -> minor */
+
+/*
+ * lx_ptm_lh_grow() grows our layered handle array by the same size increment
+ * that the ptm driver uses to grow the pty device space - PTY_MAXDELTA
+ */
+#define	LP_PTY_INC	128
+
+/*
+ * lx_ptm_ops contains state information about outstanding operations on the
+ * underlying master terminal device.  Currently we only track information
+ * for read operations.
+ *
+ * Note that this data has not been rolled directly into the lx_ptm_handle
+ * structure because we can't put mutexes or condition variables into the
+ * lx_ptm_handle structure.  The reason is that the array of lx_ptm_handle
+ * structures linked to from the global lx_ptm state can be resized
+ * dynamically, and when it's resized, the new array is at a different
+ * memory location and the old array memory is discarded.  Mutexes and cvs
+ * are accessed based off their address, so if this array was re-sized while
+ * there were outstanding operations on any mutexes or cvs in the array
+ * then the system would tip over.  In the future the lx_ptm_handle structure
+ * array should probably be replaced with either an array of pointers to
+ * lx_ptm_handle structures or some other kind of data structure containing
+ * pointers to lx_ptm_handle structures.  Then the lx_ptm_ops structure
+ * could be folded directly into the lx_ptm_handle structures.  (This will
+ * also require the definition of a new locking mechanism to protect the
+ * contents of lx_ptm_handle structures.)
+ */
+typedef struct lx_ptm_ops {
+	int			lpo_rops;	/* active reads, 0 or 1 */
+	kcondvar_t		lpo_rops_cv;	/* signalled by read_end */
+	kmutex_t		lpo_rops_lock;	/* protects lpo_rops */
+} lx_ptm_ops_t;
+
+/*
+ * Every open of the master terminal device in a zone results in a new
+ * lx_ptm_handle handle allocation.  These handles are stored in an array
+ * hanging off the lx_ptm_state structure.
+ */
+typedef struct lx_ptm_handle {
+	/* LDI handle to the real /dev/ptmx; NULL while the slot is free. */
+	ldi_handle_t		lph_handle;
+
+	/* Flag (0 or 1) to indicate if TIOCPKT mode has been enabled. */
+	int			lph_pktio;
+
+	/* Number of EOFs seen; each one marks a close of the slave device. */
+	int			lph_eofed;
+
+	/* Callback handler in the ptm driver to check if slave is open. */
+	ptmptsopencb_t		lph_ppocb;
+
+	/* Read serialization state; allocated by lx_ptm_lh_insert(). */
+	lx_ptm_ops_t		*lph_lpo;
+} lx_ptm_handle_t;
+
+/*
+ * Global state for the lx_ptm driver.
+ */
+typedef struct lx_ptm_state {
+	/* lx_ptm device devinfo pointer; set by attach, cleared by detach */
+	dev_info_t		*lps_dip;
+
+	/* LDI ident used to open underlying real /dev/ptmx master terminals. */
+	ldi_ident_t		lps_li;
+
+	/* pts driver's major number, used when configuring autopush */
+	major_t			lps_pts_major;
+
+	/* rw lock used to manage access and growth of lps_lh_array */
+	krwlock_t		lps_lh_rwlock;
+
+	/* number of elements in lps_lh_array */
+	uint_t			lps_lh_count;
+
+	/* Array of handles to real /dev/ptmx masters, indexed by ptm minor. */
+	lx_ptm_handle_t		*lps_lh_array;
+} lx_ptm_state_t;
+
+/* The lx_ptm global state structure (one static instance, not a pointer). */
+static lx_ptm_state_t	lps;
+
+/*
+ * NULL-terminated list of modules to be autopushed onto slave terminal
+ * devices when they are opened in an lx branded zone.
+ */
+static char *lx_pts_mods[] = {
+	"ptem",
+	"ldterm",
+	"ttcompat",
+	NULL
+};
+
+/*
+ * Grow lps_lh_array so that `index' becomes a valid slot.
+ *
+ * This is entered with lps_lh_rwlock dropped.  The KM_SLEEP allocation,
+ * which may block, is done before the lock is taken so that readers can
+ * still access devices in the meantime; the writer lock is held only
+ * while the old contents are copied and the new array is installed.
+ */
+static void
+lx_ptm_lh_grow(uint_t index)
+{
+	lx_ptm_handle_t	*fresh, *stale;
+	uint_t		fresh_cnt, stale_cnt;
+	size_t		fresh_sz;
+
+	/* size the replacement from an unlocked look at the current count */
+	fresh_cnt = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1);
+	fresh_sz = sizeof (lx_ptm_handle_t) * fresh_cnt;
+	fresh = kmem_zalloc(fresh_sz, KM_SLEEP);
+
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	/*
+	 * now that we hold the lock, double check that the array is still
+	 * too small; if someone beat us to it our allocation is not needed
+	 */
+	if (index < lps.lps_lh_count) {
+		rw_exit(&lps.lps_lh_rwlock);
+		kmem_free(fresh, fresh_sz);
+		return;
+	}
+
+	/* the count and the array pointer are either both set or both clear */
+	ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL));
+	ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL));
+
+	/* carry the existing entries over, then install the new array */
+	stale = lps.lps_lh_array;
+	stale_cnt = lps.lps_lh_count;
+	if (stale_cnt != 0) {
+		bcopy(stale, fresh, sizeof (lx_ptm_handle_t) * stale_cnt);
+	}
+	lps.lps_lh_array = fresh;
+	lps.lps_lh_count = fresh_cnt;
+
+	rw_exit(&lps.lps_lh_rwlock);
+
+	/* free the old array, if there was one, outside the lock */
+	if (stale != NULL) {
+		kmem_free(stale, sizeof (lx_ptm_handle_t) * stale_cnt);
+	}
+}
+
+/*
+ * Record layered handle `lh' and a fresh lx_ptm_ops_t in slot `index' of
+ * lps_lh_array, growing the array first if the slot does not exist yet.
+ */
+static void
+lx_ptm_lh_insert(uint_t index, ldi_handle_t lh)
+{
+	lx_ptm_handle_t	*slot;
+	lx_ptm_ops_t	*ops;
+
+	ASSERT(lh != NULL);
+
+	ops = kmem_zalloc(sizeof (*ops), KM_SLEEP);
+	mutex_init(&ops->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&ops->lpo_rops_cv, NULL, CV_DEFAULT, NULL);
+
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	if (index >= lps.lps_lh_count) {
+		/* lx_ptm_lh_grow() takes the lock itself, so let go of it */
+		rw_exit(&lps.lps_lh_rwlock);
+		lx_ptm_lh_grow(index);
+		rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	}
+	ASSERT(index < lps.lps_lh_count);
+	slot = &lps.lps_lh_array[index];
+	ASSERT(slot->lph_handle == NULL);
+	ASSERT(slot->lph_pktio == 0);
+	ASSERT(slot->lph_eofed == 0);
+	ASSERT(slot->lph_lpo == NULL);
+
+	slot->lph_handle = lh;
+	slot->lph_pktio = slot->lph_eofed = 0;
+	slot->lph_lpo = ops;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Empty slot `index' and return its layered handle for the caller to close. */
+static ldi_handle_t
+lx_ptm_lh_remove(uint_t index)
+{
+	lx_ptm_handle_t	*lph;
+	ldi_handle_t	lh;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	ASSERT(index < lps.lps_lh_count);
+	lph = &lps.lps_lh_array[index];
+	ASSERT(lph->lph_handle != NULL);
+	ASSERT(lph->lph_lpo->lpo_rops == 0);
+	ASSERT(!MUTEX_HELD(&lph->lph_lpo->lpo_rops_lock));
+	/* destroy the lock and cv from lx_ptm_lh_insert() before freeing */
+	mutex_destroy(&lph->lph_lpo->lpo_rops_lock);
+	cv_destroy(&lph->lph_lpo->lpo_rops_cv);
+	kmem_free(lph->lph_lpo, sizeof (lx_ptm_ops_t));
+	lph->lph_lpo = NULL;
+	lh = lph->lph_handle;
+	lph->lph_handle = NULL;
+	lph->lph_pktio = lph->lph_eofed = 0;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lh);
+}
+
+/* Copy the slave-open callback saved in slot `index' into *ppocb. */
+static void
+lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	/* nothing is modified, so reader access to the array is enough */
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+	*ppocb = lps.lps_lh_array[index].lph_ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Save a copy of *ppocb as the slave-open callback of slot `index'. */
+static void
+lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	lps.lps_lh_array[index].lph_ppocb = *ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Return the layered handle held in slot `index', which must be in use. */
+static ldi_handle_t
+lx_ptm_lh_lookup(uint_t index)
+{
+	const lx_ptm_handle_t	*slot;
+	ldi_handle_t		found;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+	ASSERT(index < lps.lps_lh_count);
+	slot = &lps.lps_lh_array[index];
+	ASSERT(slot->lph_handle != NULL);
+	found = slot->lph_handle;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (found);
+}
+
+/* Return the lx_ptm_ops_t attached to slot `index'; one must be present. */
+static lx_ptm_ops_t *
+lx_ptm_lpo_lookup(uint_t index)
+{
+	const lx_ptm_handle_t	*slot;
+	lx_ptm_ops_t		*ops;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+	ASSERT(index < lps.lps_lh_count);
+	slot = &lps.lps_lh_array[index];
+	ASSERT(slot->lph_lpo != NULL);
+	ops = slot->lph_lpo;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (ops);
+}
+
+/* Return the TIOCPKT mode flag of slot `index', which must be in use. */
+static int
+lx_ptm_lh_pktio_get(uint_t index)
+{
+	const lx_ptm_handle_t	*slot;
+	int			enabled;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+	ASSERT(index < lps.lps_lh_count);
+	slot = &lps.lps_lh_array[index];
+	ASSERT(slot->lph_handle != NULL);
+	enabled = slot->lph_pktio;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (enabled);
+}
+
+/* Set the TIOCPKT mode flag of slot `index', which must be in use. */
+static void
+lx_ptm_lh_pktio_set(uint_t index, int pktio)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* lx_ptm_ioctl() only ever passes 0 or 1 here */
+	lps.lps_lh_array[index].lph_pktio = pktio;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Return the EOF count of slot `index', which must be in use. */
+static int
+lx_ptm_lh_eofed_get(uint_t index)
+{
+	const lx_ptm_handle_t	*slot;
+	int			neofs;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+	ASSERT(index < lps.lps_lh_count);
+	slot = &lps.lps_lh_array[index];
+	ASSERT(slot->lph_handle != NULL);
+	neofs = slot->lph_eofed;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (neofs);
+}
+
+/* Count one more EOF against slot `index', which must be in use. */
+static void
+lx_ptm_lh_eofed_set(uint_t index)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* despite the "set" in our name this is a counter, not a flag */
+	lps.lps_lh_array[index].lph_eofed += 1;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/*
+ * Claim the single read slot for `dev', sleeping while another read is in
+ * progress.  Returns 0 once claimed, or -1 if cv_wait_sig() returned 0.
+ */
+static int
+lx_ptm_read_start(dev_t dev)
+{
+	lx_ptm_ops_t	*ops = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev));
+	int		rv = 0;
+
+	mutex_enter(&ops->lpo_rops_lock);
+	ASSERT(ops->lpo_rops >= 0);
+	while (rv == 0 && ops->lpo_rops != 0) {
+		if (cv_wait_sig(&ops->lpo_rops_cv, &ops->lpo_rops_lock) == 0)
+			rv = -1;
+	}
+	/* take the slot, unless the wait above was cut short */
+	VERIFY(rv != 0 || ++ops->lpo_rops == 1);
+	mutex_exit(&ops->lpo_rops_lock);
+	return (rv);
+}
+
+/* Give up the read slot taken by lx_ptm_read_start() and wake one waiter. */
+static void
+lx_ptm_read_end(dev_t dev)
+{
+	lx_ptm_ops_t	*ops;
+
+	ops = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev));
+
+	mutex_enter(&ops->lpo_rops_lock);
+	ASSERT(ops->lpo_rops >= 0);
+	VERIFY(--ops->lpo_rops == 0);
+	cv_signal(&ops->lpo_rops_cv);
+	mutex_exit(&ops->lpo_rops_lock);
+}
+
+static int
+lx_ptm_pts_isopen(dev_t dev)
+{
+	ptmptsopencb_t	cb;	/* filled from the slot's saved callback */
+
+	lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &cb);
+	return (cb.ppocb_func(cb.ppocb_arg));	/* ask the ptm driver */
+}
+
+/*
+ * Remove an EOF message from the head of the stream behind `lh' by doing
+ * a zero byte read from it.  The result of the read is ignored.
+ */
+static void
+lx_ptm_eof_read(ldi_handle_t lh)
+{
+	char		scratch[1];
+	iovec_t		vec;
+	struct uio	req;
+
+	vec.iov_base = scratch;
+	vec.iov_len = 0;
+	req.uio_iov = &vec;
+	req.uio_iovcnt = 1;
+	req.uio_resid = vec.iov_len;
+	req.uio_offset = 0;
+	req.uio_segflg = UIO_SYSSPACE;
+	req.uio_fmode = 0;
+	req.uio_extflg = 0;
+	req.uio_llimit = MAXOFFSET_T;
+	(void) ldi_read(lh, &req, kcred);
+}
+
+/*
+ * Drop at most one EOF message from the head of the stream for `dev'.
+ * *rvalp is set to 1 if an EOF was dropped and to 0 otherwise.  Returns 0,
+ * or the error from the I_NREAD ioctl.
+ */
+static int
+lx_ptm_eof_drop_1(dev_t dev, int *rvalp)
+{
+	ldi_handle_t	lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	int		first_len, nmsgs, rv;
+
+	*rvalp = 0;
+
+	/*
+	 * An EOF shows up as a zero length data message at the head of the
+	 * stream.  I_NREAD is a streams framework ioctl, so it succeeds even
+	 * if there have been previous write errors on this stream.
+	 */
+	rv = ldi_ioctl(lh, I_NREAD, (intptr_t)&first_len, FKIOCTL, kcred,
+	    &nmsgs);
+	if (rv != 0)
+		return (rv);
+
+	if (nmsgs != 0 && first_len == 0) {
+		/* note that the slave was closed, then consume the EOF */
+		lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev));
+		lx_ptm_eof_read(lh);
+		*rvalp = 1;
+	}
+	return (0);
+}
+
+/*
+ * Drop all leading EOFs; *rvalp (if non-NULL) says whether any was found.
+ */
+static int
+lx_ptm_eof_drop(dev_t dev, int *rvalp)
+{
+	int dropped, rv;
+
+	if (rvalp != NULL)
+		*rvalp = 0;
+	while ((rv = lx_ptm_eof_drop_1(dev, &dropped)) == 0 && dropped != 0) {
+		if (rvalp != NULL)
+			*rvalp = 1;
+	}
+	return (rv);
+}
+
+/*
+ * Check whether the stream for `dev' has something to read and report the
+ * answer (1 or 0) in *rvalp.  With ignore_eof set only a non-zero FIONREAD
+ * byte count qualifies; otherwise any message counted by I_NREAD does.
+ * Returns 0, or the error from the ioctl.
+ */
+static int
+lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp)
+{
+	ldi_handle_t	lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	int		cmd = ignore_eof ? FIONREAD : I_NREAD;
+	int		nbytes, iocrv, rv;
+
+	*rvalp = 0;
+	rv = ldi_ioctl(lh, cmd, (intptr_t)&nbytes, FKIOCTL, kcred, &iocrv);
+	if (rv != 0)
+		return (rv);
+
+	/*
+	 * For FIONREAD we look at the size stored through the argument,
+	 * for I_NREAD at the message count handed back as the ioctl's
+	 * return value.
+	 */
+	*rvalp = ((ignore_eof ? nbytes : iocrv) != 0) ? 1 : 0;
+	return (0);
+}
+
+/*
+ * attach(9E): create our single minor node, obtain the LDI ident used to
+ * open the real /dev/ptmx and initialize the global state.
+ */
+static int
+lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	if (ldi_ident_from_dip(dip, &lps.lps_li) != 0) {
+		/* undo the node by the name it was created under */
+		ddi_remove_minor_node(dip, LX_PTM_MINOR_NODE);
+		return (DDI_FAILURE);
+	}
+
+	lps.lps_dip = dip;
+	lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME);
+	rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL);
+	lps.lps_lh_count = 0;
+	lps.lps_lh_array = NULL;
+	return (DDI_SUCCESS);
+}
+
+/* detach(9E): undo everything that lx_ptm_attach() set up. */
+static int
+lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ddi_remove_minor_node(dip, NULL);
+	ldi_ident_release(lps.lps_li);
+	lps.lps_dip = NULL;
+	ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL));
+	ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL));
+	if (lps.lps_lh_array != NULL) {
+		kmem_free(lps.lps_lh_array,
+		    sizeof (lx_ptm_handle_t) * lps.lps_lh_count);
+		lps.lps_lh_array = NULL;
+		lps.lps_lh_count = 0;
+	}
+	rw_destroy(&lps.lps_lh_rwlock);	/* pairs with rw_init() in attach */
+	return (DDI_SUCCESS);
+}
+
+/*
+ * open(9E): open the real /dev/ptmx beneath us and clone a devt for it.
+ */
+/*ARGSUSED*/
+static int
+lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	struct strioctl	iocb;
+	ptmptsopencb_t	ppocb = { NULL, NULL };
+	ldi_handle_t	lh;
+	major_t		maj, our_major = getmajor(*devp);
+	minor_t		min, lastmin;
+	uint_t		index, anchor = 1;
+	dev_t		ptm_dev;
+	int		err, rval = 0;
+
+	/*
+	 * FNDELAY and FNONBLOCK are not supported until we either find a
+	 * Linux app that opens /dev/ptmx with O_NDELAY or O_NONBLOCK, or we
+	 * create test cases showing how reads of master terminals opened
+	 * with these flags behave on Linux.  Supporting them will involve
+	 * changing how our read implementation deals with EOF notifications.
+	 */
+	if (flag & (FNDELAY | FNONBLOCK))
+		return (ENOTSUP);
+
+	/*
+	 * we're layered on top of the ptm driver so open that driver
+	 * first.  (note that we're opening /dev/ptmx in the global
+	 * zone, not ourselves in the lx zone.)
+	 */
+	err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li);
+	if (err != 0)
+		return (err);
+
+	/* get the devt returned by the ptmx open */
+	err = ldi_get_dev(lh, &ptm_dev);
+	if (err != 0) {
+		(void) ldi_close(lh, flag, credp);
+		return (err);
+	}
+
+	/*
+	 * we're a cloning driver so here's where we'll change the devt that we
+	 * return.  the ptmx is also a cloning driver so we'll just map its
+	 * minor number to ours with INDEX_TO_MINOR() (it already manages its
+	 * minor name space so no reason to duplicate the effort.)
+	 */
+	index = getminor(ptm_dev);
+	*devp = makedevice(our_major, INDEX_TO_MINOR(index));
+
+	/* Get a callback function to query if the pts device is open. */
+	iocb.ic_cmd = PTMPTSOPENCB;
+	iocb.ic_timout = 0;
+	iocb.ic_len = sizeof (ppocb);
+	iocb.ic_dp = (char *)&ppocb;
+
+	err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval);
+	if ((err != 0) || (rval != 0)) {
+		(void) ldi_close(lh, flag, credp);
+		return (EIO); /* XXX return something else here? */
+	}
+	ASSERT(ppocb.ppocb_func != NULL);
+
+	/*
+	 * now setup autopush for the terminal slave device.  this is necessary
+	 * so that when a Linux program opens the device we can push required
+	 * strmod modules onto the stream.  in Solaris this is normally done
+	 * by the application that actually allocates the terminal.
+	 */
+	maj = lps.lps_pts_major;
+	min = index;
+	lastmin = 0;
+	err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin,
+	    &anchor, lx_pts_mods);
+	if (err != 0 && err != EEXIST) {
+		(void) ldi_close(lh, flag, credp);
+		return (EIO); /* XXX return something else here? */
+	}
+
+	/* save off this layered handle for future accesses */
+	lx_ptm_lh_insert(index, lh);
+	lx_ptm_lh_set_ppocb(index, &ppocb);
+	return (0);
+}
+
+/* close(9E): drop our state for this terminal, then close the real ptm. */
+/*ARGSUSED*/
+static int
+lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+	ldi_handle_t	lh;
+	major_t		maj;
+	minor_t		min, lastmin;
+	uint_t		index;
+	int		err;
+	int		i;
+
+	index = DEVT_TO_INDEX(dev);
+
+	/*
+	 * we must cleanup all the state associated with this major/minor
+	 * terminal pair before actually closing the ptm master device.
+	 * this is required because once the close of the ptm device is
+	 * complete the major/minor terminal pair is immediately available
+	 * for re-use in any zone.
+	 */
+
+	/* free up our saved reference for this layered handle */
+	lh = lx_ptm_lh_remove(index);
+
+	/* unconfigure autopush for the associated terminal slave device */
+	maj = lps.lps_pts_major;
+	min = index;
+	lastmin = 0;
+	for (i = 0; i < 5; i++) {
+		/*
+		 * we loop here because we don't want to release this ptm node
+		 * if autopush can't be disabled on the associated slave device
+		 * because then bad things could happen if another brand were
+		 * to get this terminal allocated to them.  If we keep failing
+		 * we eventually drive on so that things don't hang.
+		 */
+		err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin,
+		    0, NULL);
+		if (err == 0)
+			break;
+
+		cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush",
+		    getzoneid(), err);
+
+		/* wait one second and try again */
+		delay(drv_usectohz(1000000));
+	}
+
+	err = ldi_close(lh, flag, credp);
+
+	/*
+	 * note that we don't have to bother with changing the permissions
+	 * on the associated slave device here.  the reason is that no one
+	 * can actually open the device until its associated master
+	 * device is re-opened, which will result in the permissions on
+	 * it being reset.
+	 */
+	return (err);
+}
+
+/* Do one read of the real ptm; *loop is set when an EOF calls for a retry. */
+static int
+lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop)
+{
+	ldi_handle_t	lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	int		err, rval;
+	struct uio	uio = *uiop;
+
+	*loop = 0;
+
+	/*
+	 * Here's another way that Linux master terminals behave differently
+	 * from Solaris master terminals.  If you do a read on a Linux
+	 * master terminal (that was opened without NDELAY and NONBLOCK)
+	 * whose corresponding slave terminal is currently closed and
+	 * has been opened and closed at least once, Linux returns -1 and
+	 * sets errno to EIO whereas Solaris blocks.
+	 */
+	if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) {
+		/* Slave has been opened and closed at least once. */
+		if (lx_ptm_pts_isopen(dev) == 0) {
+			/*
+			 * Slave is closed.  Make sure that data is available
+			 * before attempting a read.
+			 */
+			if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0)
+				return (err);
+
+			/* If there is no data available then return. */
+			if (rval == 0)
+				return (EIO);
+		}
+	}
+
+	/* Actually do the read operation. */
+	if ((err = ldi_read(lh, uiop, credp)) != 0)
+		return (err);
+
+	/* If read returned actual data then return. */
+	if (uio.uio_resid != uiop->uio_resid)
+		return (0);
+
+	/*
+	 * This was a zero byte read (ie, an EOF).  This indicates that the
+	 * slave terminal device has been closed.  Record the fact that the
+	 * slave device has been closed and retry the read operation.
+	 */
+	lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev));
+	*loop = 1;
+	return (0);
+}
+
+/*
+ * read(9E): read from the real ptm.  Reads are serialized, EOFs caused by
+ * slave closes are skipped, and in TIOCPKT mode a status byte is prepended.
+ */
+static int
+lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	int		pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev));
+	int		err, loop;
+	struct uio	uio;
+	struct iovec	iovp;
+
+	ASSERT(uiop->uio_iovcnt > 0);
+
+	/*
+	 * If packet mode has been enabled (via TIOCPKT) we need to pad
+	 * all read requests with a leading byte that indicates any
+	 * relevant control status information.
+	 */
+	if (pktio != 0) {
+		/*
+		 * We'd like to write the control information into the current
+		 * buffer but we can't yet.  We don't want to modify userspace
+		 * memory here only to have the read operation fail later.  So
+		 * instead what we'll do here is read one character from the
+		 * beginning of the memory pointed to by the uio structure.
+		 * This will advance the output pointer by one.  Then when the
+		 * read completes successfully we can update the byte that we
+		 * passed over.  Before we do the read make a copy of the
+		 * current uiop and iovec structs so we can write to them later.
+		 */
+		uio = *uiop;
+		iovp = *uiop->uio_iov;
+		uio.uio_iov = &iovp;
+
+		if (uwritec(uiop) == -1)
+			return (EFAULT);
+	}
+
+	do {
+		/*
+		 * Before we actually attempt a read operation we need to make
+		 * sure there's some buffer space to actually read in some
+		 * data.  We do this because if we're in pktio mode and the
+		 * caller only requested one byte, then we've already used up
+		 * that one byte and we don't want to pass this read request.
+		 * Doing a 0 byte read (unless there is a problem with the
+		 * stream head) always returns success.  Normally when a
+		 * streams read returns 0 bytes we interpret that as an EOF on
+		 * the stream (ie, the slave side has been opened and closed)
+		 * and we ignore it and re-try the read operation.  So if we
+		 * pass on a 0 byte read here lx_ptm_read_loop() will tell us
+		 * to loop around and we'll end up in an infinite loop.
+		 */
+		if (uiop->uio_resid == 0)
+			break;
+
+		/*
+		 * Serialize all reads.  We need to do this so that we can
+		 * properly emulate the behavior of master terminals on Linux.
+		 * In reality this serialization should not pose any kind of
+		 * performance problem since it would be very strange to have
+		 * multiple threads trying to read from the same master
+		 * terminal device concurrently.
+		 */
+		if (lx_ptm_read_start(dev) != 0)
+			return (EINTR);
+
+		err = lx_ptm_read_loop(dev, uiop, credp, &loop);
+		lx_ptm_read_end(dev);
+		if (err != 0)
+			return (err);
+	} while (loop != 0);
+
+	if (pktio != 0) {
+		uint8_t		pktio_data = TIOCPKT_DATA;
+
+		/*
+		 * Note that the control status information we
+		 * pass back is faked up in the sense that we
+		 * don't actually report any events, we always
+		 * report a status of 0.
+		 */
+		if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0)
+			return (EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * write(9E): hand the write straight to the underlying real ptm device.
+ */
+static int
+lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	ldi_handle_t	ptm_lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+
+	return (ldi_write(ptm_lh, uiop, credp));
+}
+
+/*
+ * ioctl(9E): filter or emulate a few ioctls and pass the rest to the ptm.
+ */
+static int
+lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	ldi_handle_t	lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	int		err;
+
+	/*
+	 * here we need to make sure that we never allow the I_SETSIG and
+	 * I_ESETSIG ioctls to pass through.  we do this because we can't
+	 * support them.
+	 *
+	 * the native Solaris ptm device supports these ioctls because they
+	 * are streams framework ioctls and all streams devices support them
+	 * by default.  these ioctls cause the current process to be registered
+	 * with a stream and receive signals when certain stream events occur.
+	 *
+	 * a problem arises with cleanup of these registrations for layered
+	 * drivers.
+	 *
+	 * normally the streams framework is notified whenever a process closes
+	 * any reference to a stream and it goes ahead and cleans up these
+	 * registrations.  but actual device drivers are not notified when a
+	 * process performs a close operation unless the process is closing
+	 * the last opened reference to the device on the entire system.
+	 *
+	 * so while we could pass these ioctls on and allow processes to
+	 * register for signal delivery, we would never receive any
+	 * notification when those processes exit (or close a stream) and we
+	 * wouldn't be able to unregister them.
+	 *
+	 * luckily these operations are streams specific and Linux doesn't
+	 * support streams devices.  so it doesn't actually seem like we need
+	 * to support these ioctls.  if it turns out that we do need to
+	 * support them for some reason in the future, the current driver model
+	 * will have to be enhanced to better support streams device layering.
+	 */
+	if ((cmd == I_SETSIG) || (cmd == I_ESETSIG))
+		return (EINVAL);
+
+	/*
+	 * here we fake up support for TIOCPKT.  Linux applications expect
+	 * /dev/ptmx to support this ioctl, but on Solaris it doesn't.
+	 * (it is supported on older bsd style ptys.)  so we'll fake
+	 * up support for it here.
+	 *
+	 * the reason that this ioctl is emulated here instead of in
+	 * userland is that this ioctl affects the results returned
+	 * from read() operations.  if this ioctl was emulated in
+	 * userland the brand library would need to intercept all
+	 * read operations and check to see if pktio was enabled
+	 * for the fd being read from.  since this ioctl only needs
+	 * to be supported on the ptmx device it makes more sense
+	 * to support it here where we can easily update the results
+	 * returned for read() operations performed on ourselves.
+	 */
+	if (cmd == TIOCPKT) {
+		int	pktio;
+
+		if (ddi_copyin((void *)arg, &pktio, sizeof (pktio),
+		    mode) != DDI_SUCCESS)
+			return (EFAULT);
+
+		if (pktio == 0)
+			lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0);
+		else
+			lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1);
+
+		return (0);
+	}
+
+	err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp);
+
+	/*
+	 * On recent versions of Linux some apps issue the following ioctls to
+	 * the master side of the ptm before opening the slave side. Because
+	 * our streams modules (specifically ptem) aren't autopushed until the
+	 * slave side has been opened, these ioctls will fail. To alleviate the
+	 * issue we simply pretend that these ioctls have succeeded.
+	 *
+	 * We could push our own "lx_ptem" module onto the master side of the
+	 * stream in lx_ptm_open if we need better emulation, but that would
+	 * require an "lx_ptem" module which duplicates most of ptem. ptem
+	 * doesn't work properly when pushed on the master side.
+	 */
+	if (err == EINVAL && (cmd == TIOCSWINSZ || cmd == TCSETS) &&
+	    lx_ptm_pts_isopen(dev) == 0) {
+		/* slave side not open, assume we need to succeed */
+		DTRACE_PROBE1(lx_ptm_ioctl__override, int, cmd);
+		return (0);
+	}
+
+	return (err);
+}
+
+/*
+ * One pass of poll(9E) processing; *loop is set if the caller must retry.
+ */
+static int
+lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp, int *loop)
+{
+	ldi_handle_t	lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	short		reventsp2;
+	int		err, rval;
+
+	*loop = 0;
+
+	/*
+	 * If the slave device has been opened and closed at least
+	 * once and the slave device is currently closed, then poll
+	 * always needs to return immediately.
+	 */
+	if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) &&
+	    (lx_ptm_pts_isopen(dev) == 0)) {
+		/* In this case always return POLLHUP */
+		*reventsp = POLLHUP;
+
+		/*
+		 * Check if there really is data on the stream.
+		 * If so set the correct return flags.
+		 */
+		if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) {
+			return (err);
+		}
+		if (rval != 0)
+			*reventsp |= (events & (POLLIN | POLLRDNORM));
+
+		/*
+		 * Is the user checking for writability?  Note that for ptm
+		 * devices Linux seems to ignore the POLLWRBAND write flag.
+		 */
+		if ((events & POLLWRNORM) == 0)
+			return (0);
+
+		/*
+		 * To check if the stream is writable we have to actually
+		 * call poll, but make sure to set anyyet to 1 to prevent
+		 * the streams framework from setting up callbacks.
+		 */
+		if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0)
+			return (err);
+
+		*reventsp |= (reventsp2 & POLLWRNORM);
+	} else {
+		int lockstate;
+
+		/* The slave device is open, do the poll */
+		if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0)
+			return (err);
+
+		/*
+		 * Drop any leading EOFs on the stream.
+		 *
+		 * Note that we have to use pollunlock() here to avoid
+		 * recursive mutex enters in the poll framework.  The reason is
+		 * that if there is an EOF message on the stream then the act
+		 * of reading from the queue to remove the message can cause
+		 * the ptm driver's event service routine to be invoked, and if
+		 * there is no open slave device then the ptm driver may
+		 * generate error messages and put them on the stream.  This in
+		 * turn will generate a poll event and the poll framework will
+		 * try to invoke any poll callbacks associated with the stream.
+		 * In the process of doing that the poll framework will try to
+		 * acquire locks that we are already holding.  So we need to
+		 * drop those locks here before we do our read.
+		 */
+		if (pollunlock(&lockstate) != 0) {
+			*reventsp = POLLNVAL;
+			return (0);
+		}
+		err = lx_ptm_eof_drop(dev, &rval);
+		pollrelock(lockstate);
+		if (err)
+			return (err);
+
+		/* If no EOF was dropped then return */
+		if (rval == 0)
+			return (0);
+
+		/*
+		 * An EOF was removed from the stream.  Retry the entire
+		 * poll operation from the top because polls on the ptm
+		 * device should behave differently now.
+		 */
+		*loop = 1;
+	}
+	return (0);
+}
+
+/*
+ * chpoll(9E): run lx_ptm_poll_loop(), serialized against reads, until it
+ * stops asking for a retry.  EINTR is returned if that wait is interrupted.
+ */
+static int
+lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	int again = 1, rv = 0;
+
+	while (rv == 0 && again != 0) {
+		if (lx_ptm_read_start(dev) != 0)
+			return (EINTR);
+		rv = lx_ptm_poll_loop(dev, events, anyyet, reventsp, phpp,
+		    &again);
+		lx_ptm_read_end(dev);
+	}
+	return (rv);
+}
+
+static struct cb_ops lx_ptm_cb_ops = {
+	lx_ptm_open,		/* open */
+	lx_ptm_close,		/* close */
+	nodev,			/* strategy */
+	nodev,			/* print */
+	nodev,			/* dump */
+	lx_ptm_read,		/* read */
+	lx_ptm_write,		/* write */
+	lx_ptm_ioctl,		/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	lx_ptm_poll,		/* chpoll */
+	ddi_prop_op,		/* prop_op */
+	NULL,			/* cb_str */
+	D_NEW | D_MP,		/* NOTE(review): presumably cb_flag */
+	CB_REV,			/* NOTE(review): presumably cb_rev */
+	NULL,			/* NOTE(review): presumably aread */
+	NULL			/* NOTE(review): presumably awrite */
+};
+
+static struct dev_ops lx_ptm_ops = {
+	DEVO_REV,
+	0,
+	ddi_getinfo_1to1,
+	nulldev,
+	nulldev,
+	lx_ptm_attach,			/* attach */
+	lx_ptm_detach,			/* detach */
+	nodev,
+	&lx_ptm_cb_ops,			/* cb_ops table defined above */
+	NULL,
+	NULL,
+	ddi_quiesce_not_needed,		/* quiesce */
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,			/* type of module */
+	"Linux master terminal driver",	/* description of module */
+	&lx_ptm_ops			/* driver ops (dev_ops above) */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,	/* the single driver linkage defined above */
+	NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));	/* no other setup here */
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));	/* fills *modinfop */
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));	/* no other teardown here */
+}
diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf
new file mode 100644
index 0000000000..481b4e3c74
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+name="lx_ptm" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
new file mode 100644
index 0000000000..6cff045a80
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -0,0 +1,2586 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * The LX Brand: emulation of a Linux operating environment within a zone.
+ *
+ * OVERVIEW
+ *
+ * The LX brand enables a full Linux userland -- including a C library,
+ * init(1) framework, and some set of applications -- to run unmodified
+ * within an illumos zone.  Unlike illumos, where applications are expected
+ * to link against and consume functions exported from libraries, the
+ * supported Linux binary compatibility boundary is the system call
+ * interface.  By accurately emulating the behaviour of Linux system calls,
+ * Linux software can be executed in this environment as if it were running
+ * on a native Linux system.
+ *
+ * EMULATING LINUX SYSTEM CALLS
+ *
+ * Linux system calls are made in 32-bit processes via the "int 0x80"
+ * instruction; in 64-bit processes the "syscall" instruction is used, as it
+ * is with native illumos processes.  In both cases, arguments to system
+ * calls are generally passed in registers and the usermode stack is not
+ * interpreted or modified by the Linux kernel.
+ *
+ * When the emulated Linux process makes a system call, it traps into the
+ * illumos kernel.  The in-kernel brand module contains various emulation
+ * routines, and can fully service some emulated system calls; e.g. read(2)
+ * and write(2).  Other system calls require assistance from the illumos
+ * libc, bouncing back out to the brand library ("lx_brand.so.1") for
+ * emulation.
+ *
+ * The brand mechanism allows for the provision of an alternative trap
+ * handler for the various system call mechanisms.  Traditionally this was
+ * used to immediately revector execution to the usermode emulation library,
+ * which was responsible for handling all system calls.  In the interests of
+ * more accurate emulation and increased performance, much of the regular
+ * illumos system call path is now invoked.  Only the argument processing and
+ * handler dispatch are replaced by the brand, via the per-LWP
+ * "lwp_brand_syscall" interposition function pointer.
+ *
+ * THE NATIVE AND BRAND STACKS
+ *
+ * Some runtime environments (e.g. the Go language) allocate very small
+ * thread stacks, preferring to grow or split the stack as necessary.  The
+ * Linux kernel generally does not use the usermode stack when servicing
+ * system calls, so this is not a problem.  In order for our emulation to
+ * have the same zero stack impact, we must execute usermode emulation
+ * routines on an _alternate_ stack.  This is similar, in principle, to the
+ * use of sigaltstack(3C) to run signal handlers off the main thread stack.
+ *
+ * To this end, the brand library allocates and installs an alternate stack
+ * (called the "native" stack) for each LWP.  The in-kernel brand code uses
+ * this stack for usermode emulation calls and interposed signal delivery,
+ * while the emulated Linux process sees only the data on the main thread
+ * stack, known as the "brand" stack.  The stack mode is tracked in the
+ * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
+ *
+ * The stack mode doubles as a system call "mode bit".  When in the
+ * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
+ * system calls.  In other modes, system calls are assumed to be native
+ * illumos system calls as made during brand library initialisation and
+ * usermode emulation.
+ *
+ * USERMODE EMULATION
+ *
+ * When a Linux system call cannot be emulated within the kernel, we preserve
+ * the register state of the Linux process and revector the LWP to the brand
+ * library usermode emulation handler: the "lx_emulate()" function in
+ * "lx_brand.so.1".  This revectoring is modelled on the delivery of signals,
+ * and is performed in "lx_emulate_user()".
+ *
+ * First, the emulated process state is written out to the usermode stack of
+ * the process as a "ucontext_t" object.  Arguments to the emulation routine
+ * are passed on the stack or in registers, depending on the ABI.  When the
+ * usermode emulation is complete, the result is passed back to the kernel
+ * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
+ * for restoration.
+ *
+ * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
+ *
+ * When servicing emulated system calls in the usermode brand library, or
+ * during signal delivery, various state is preserved by the kernel so that
+ * the running LWP may be revectored to a handling routine.  The context
+ * allows the kernel to restart the program at the point of interruption,
+ * either at the return of the signal handler, via setcontext(3C); or after
+ * the usermode emulation request has been serviced, via B_EMULATION_DONE.
+ *
+ * In illumos native processes, the saved context (a "ucontext_t" object)
+ * includes the state of registers and the current signal mask at the point
+ * of interruption.  The context also includes a link to the most recently
+ * saved context, forming a chain to be unwound as requests complete.  The LX
+ * brand requires additional book-keeping to describe the machine state: in
+ * particular, the current stack mode and the occupied extent of the native
+ * stack.
+ *
+ * The brand code is able to interpose on the context save and restore
+ * operations in the kernel -- see "lx_savecontext()" and
+ * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
+ * function correctly in the face of a dual stack LWP.  The brand also
+ * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
+ * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
+ * library interposer on the native stack, regardless of the interrupted
+ * execution mode.  Linux sigaltstack(2) emulation is performed entirely by
+ * the usermode brand library during signal handler interposition.
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+#include <sys/syscall.h>
+#include <sys/proc.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/model.h>
+#include <sys/exec.h>
+#include <sys/lx_impl.h>
+#include <sys/machbrand.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/param.h>
+#include <sys/termios.h>
+#include <sys/sunddi.h>
+#include <sys/ddi.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+#include <sys/auxv.h>
+#include <sys/priv.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/archsystm.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/sdt.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+#include <sys/core.h>
+#include <sys/stack.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <lx_signum.h>
+#include <util/sscanf.h>
+#include <sys/lx_brand.h>
+#include <sys/zfs_ioctl.h>
+
+/*
+ * Defaults to 0 and is not consulted in the visible part of this file.
+ * NOTE(review): presumably a debug-output knob read elsewhere -- confirm.
+ */
+int	lx_debug = 0;
+
+/*
+ * Forward declarations of brand entry points.  Most of these are installed
+ * in the lx_brops table below.
+ */
+void	lx_init_brand_data(zone_t *, kmutex_t *);
+void	lx_free_brand_data(zone_t *);
+void	lx_setbrand(proc_t *);
+int	lx_getattr(zone_t *, int, void *, size_t *);
+int	lx_setattr(zone_t *, int, void *, size_t);
+int	lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
+		uintptr_t, uintptr_t);
+void	lx_set_kern_version(zone_t *, char *);
+void	lx_copy_procdata(proc_t *, proc_t *);
+
+/* Context and wait system call routines declared for use by this file. */
+extern int getsetcontext(int, void *);
+extern int waitsys(idtype_t, id_t, siginfo_t *, int);
+#if defined(_SYSCALL32_IMPL)
+extern int getsetcontext32(int, void *);
+extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
+#endif
+
+/* zvol minor-node helpers, used by lx_zone_get_zvols(). */
+extern int zvol_name2minor(const char *, minor_t *);
+extern int zvol_create_minor(const char *);
+
+extern void lx_proc_exit(proc_t *);
+extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
+
+/* Setup and teardown routines for the ioctl and socket emulation. */
+extern void lx_ioctl_init();
+extern void lx_ioctl_fini();
+extern void lx_socket_init();
+extern void lx_socket_fini();
+
+/*
+ * System call tracing hooks.
+ * NOTE(review): presumably filled in by the lx_systrace module -- confirm.
+ */
+lx_systrace_f *lx_systrace_entry_ptr;
+lx_systrace_f *lx_systrace_return_ptr;
+
+/*
+ * Set by lx_brand_systrace_enable() and cleared by
+ * lx_brand_systrace_disable().
+ */
+static int lx_systrace_enabled;
+
+/*
+ * cgroup file system maintenance functions which are set when cgroups loads.
+ * Both take a vfs_t pointer, a uint_t, an id_t and a pid_t.
+ */
+void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+/*
+ * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
+ * want an MMU dependency here (and should there be a microprocessor without
+ * a hole, we don't want to start allocating from the top of the VA range).
+ */
+#define	LX_MAXSTACK64	0x7ffffff00000
+
+/*
+ * Variable initialized from LX_MAXSTACK64.
+ * NOTE(review): presumably kept as a variable so it can be tuned -- confirm.
+ */
+uint64_t lx_maxstack64 = LX_MAXSTACK64;
+
+/*
+ * Forward declarations of the file-local hooks that are installed in the
+ * lx_brops table below.
+ */
+static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+    struct intpdata *idata, int level, long *execsz, int setid,
+    caddr_t exec_file, struct cred *cred, int *brand_action);
+
+static boolean_t lx_native_exec(uint8_t, const char **);
+static uint32_t lx_map32limit(proc_t *);
+
+static void lx_savecontext(ucontext_t *);
+static void lx_restorecontext(ucontext_t *);
+static caddr_t lx_sendsig_stack(int);
+static void lx_sendsig(int);
+#if defined(_SYSCALL32_IMPL)
+static void lx_savecontext32(ucontext32_t *);
+#endif
+static int lx_setid_clear(vattr_t *, cred_t *);
+#if defined(_LP64)
+/* The page fault hook only exists on LP64 kernels. */
+static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
+    enum seg_rw);
+#endif
+
+/*
+ * Work-list entry used by lx_zone_get_zvols() to walk a pool's datasets
+ * iteratively rather than recursively.
+ */
+typedef struct lx_zfs_ds {
+	list_node_t	ds_link;	/* linkage on the work list */
+	char		ds_name[MAXPATHLEN];	/* dataset name */
+	/* zc_cookie to resume ZFS_IOC_DATASET_LIST_NEXT under ds_name */
+	uint64_t	ds_cookie;
+} lx_zfs_ds_t;
+
+/*
+ * lx brand: machine-independent brand operations vector.  The initializer
+ * is positional; the comment on each line names the slot being filled.
+ * The b_savecontext32 slot is only present when _SYSCALL32_IMPL is defined,
+ * and b_pagefault is only populated on _LP64 builds.
+ */
+struct brand_ops lx_brops = {
+	lx_init_brand_data,		/* b_init_brand_data */
+	lx_free_brand_data,		/* b_free_brand_data */
+	lx_brandsys,			/* b_brandsys */
+	lx_setbrand,			/* b_setbrand */
+	lx_getattr,			/* b_getattr */
+	lx_setattr,			/* b_setattr */
+	lx_copy_procdata,		/* b_copy_procdata */
+	lx_proc_exit,			/* b_proc_exit */
+	lx_exec,			/* b_exec */
+	lx_setrval,			/* b_lwp_setrval */
+	lx_lwpdata_alloc,		/* b_lwpdata_alloc */
+	lx_lwpdata_free,		/* b_lwpdata_free */
+	lx_initlwp,			/* b_initlwp */
+	lx_initlwp_post,		/* b_initlwp_post */
+	lx_forklwp,			/* b_forklwp */
+	lx_freelwp,			/* b_freelwp */
+	lx_exitlwp,			/* b_lwpexit */
+	lx_elfexec,			/* b_elfexec */
+	NULL,				/* b_sigset_native_to_brand */
+	NULL,				/* b_sigset_brand_to_native */
+	lx_sigfd_translate,		/* b_sigfd_translate */
+	NSIG,				/* b_nsig */
+	lx_exit_with_sig,		/* b_exit_with_sig */
+	lx_wait_filter,			/* b_wait_filter */
+	lx_native_exec,			/* b_native_exec */
+	lx_map32limit,			/* b_map32limit */
+	lx_stop_notify,			/* b_stop_notify */
+	lx_waitid_helper,		/* b_waitid_helper */
+	lx_sigcld_repost,		/* b_sigcld_repost */
+	lx_ptrace_issig_stop,		/* b_issig_stop */
+	lx_ptrace_sig_ignorable,	/* b_sig_ignorable */
+	lx_savecontext,			/* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+	lx_savecontext32,		/* b_savecontext32 */
+#endif
+	lx_restorecontext,		/* b_restorecontext */
+	lx_sendsig_stack,		/* b_sendsig_stack */
+	lx_sendsig,			/* b_sendsig */
+	lx_setid_clear,			/* b_setid_clear */
+#if defined(_LP64)
+	lx_pagefault,			/* b_pagefault */
+#else
+	NULL,				/* b_pagefault (no handler) */
+#endif
+	B_FALSE				/* b_intp_parse_arg */
+};
+
+/*
+ * Machine-dependent brand operations.  The first five slots are left NULL;
+ * only the last two are populated, with lx_fixsegreg and lx_fsbase.
+ */
+struct brand_mach_ops lx_mops = {
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	lx_fixsegreg,
+	lx_fsbase
+};
+
+/*
+ * The brand descriptor itself: ties the name "lx" to the two operations
+ * vectors defined above.
+ */
+struct brand lx_brand = {
+	BRAND_VER_1,			/* brand interface version */
+	"lx",				/* brand name */
+	&lx_brops,			/* brand operations */
+	&lx_mops,			/* machine-dependent operations */
+	sizeof (struct lx_proc_data)	/* size of struct lx_proc_data */
+};
+
+/* Loadable-module linkage for a brand module, pointing at lx_brand. */
+static struct modlbrand modlbrand = {
+	&mod_brandops, "lx brand", &lx_brand
+};
+
+/*
+ * Module linkage for the brand module.  lx_zone_zfs_open() also passes
+ * this to ldi_ident_from_mod() to obtain an LDI identity.
+ */
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlbrand, NULL
+};
+
+/*
+ * Brand hook invoked when an LX process exits.
+ *
+ * If the exiting process has LX_PROC_CHILD_DEATHSIG set, walk its children
+ * and post the requested "parent death" signal to every branded child that
+ * asked for one.  The flag is sampled under p_lock; the child list is
+ * walked under pidlock, taking each child's p_lock around sigtoproc().
+ */
+void
+lx_proc_exit(proc_t *p)
+{
+	lx_proc_data_t *lxpd;
+	proc_t *cp;
+	int notify;
+
+	mutex_enter(&p->p_lock);
+	VERIFY(lxpd = ptolxproc(p));
+	VERIFY(lxpd->l_ptrace == 0);
+	notify = ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) != 0);
+	mutex_exit(&p->p_lock);
+
+	if (!notify)
+		return;
+
+	/* Check for children which desire notification of parental death. */
+	mutex_enter(&pidlock);
+	cp = p->p_child;
+	while (cp != NULL) {
+		mutex_enter(&cp->p_lock);
+		lxpd = ptolxproc(cp);
+		if (lxpd != NULL && lxpd->l_parent_deathsig != 0)
+			sigtoproc(cp, NULL, lxpd->l_parent_deathsig);
+		mutex_exit(&cp->p_lock);
+
+		cp = cp->p_sibling;
+	}
+	mutex_exit(&pidlock);
+}
+
+/*
+ * Brand hook invoked when a process takes on the lx brand: record the
+ * default exit signal and capture the argv bounds of the process.
+ */
+void
+lx_setbrand(proc_t *p)
+{
+	lx_proc_data_t *lxpd = ptolxproc(p);
+
+	/* By default the parent is sent SIGCHLD when this child exits. */
+	lxpd->l_signal = stol_signo[SIGCHLD];
+
+	lx_read_argv_bounds(p);
+}
+
+/*
+ * Set a brand-specific zone attribute from the user buffer ubuf, which is
+ * ubufsz bytes long.
+ *
+ * LX_ATTR_KERN_RELEASE and LX_ATTR_KERN_VERSION replace the emulated kernel
+ * release and version strings under lxzd_lock.  The input must be strictly
+ * shorter than the destination so that a terminating NUL always fits.
+ *
+ * Returns 0 on success, ERANGE if the input is too long, EFAULT if it
+ * cannot be copied in, and EINVAL for any other attribute.
+ */
+/* ARGSUSED */
+int
+lx_setattr(zone_t *zone, int attr, void *ubuf, size_t ubufsz)
+{
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+	switch (attr) {
+	case LX_ATTR_KERN_RELEASE: {
+		char rel[LX_KERN_RELEASE_MAX];
+
+		if (ubufsz >= LX_KERN_RELEASE_MAX)
+			return (ERANGE);
+
+		/* Zero first so the copied-in bytes are NUL-terminated. */
+		bzero(rel, LX_KERN_RELEASE_MAX);
+		if (copyin(ubuf, rel, ubufsz) != 0)
+			return (EFAULT);
+
+		mutex_enter(&lxzd->lxzd_lock);
+		(void) strlcpy(lxzd->lxzd_kernel_release, rel,
+		    LX_KERN_RELEASE_MAX);
+		mutex_exit(&lxzd->lxzd_lock);
+		return (0);
+	}
+	case LX_ATTR_KERN_VERSION: {
+		char ver[LX_KERN_VERSION_MAX];
+
+		if (ubufsz >= LX_KERN_VERSION_MAX)
+			return (ERANGE);
+
+		/* Zero first so the copied-in bytes are NUL-terminated. */
+		bzero(ver, LX_KERN_VERSION_MAX);
+		if (copyin(ubuf, ver, ubufsz) != 0)
+			return (EFAULT);
+
+		mutex_enter(&lxzd->lxzd_lock);
+		(void) strlcpy(lxzd->lxzd_kernel_version, ver,
+		    LX_KERN_VERSION_MAX);
+		mutex_exit(&lxzd->lxzd_lock);
+		return (0);
+	}
+	default:
+		break;
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Fetch a brand-specific zone attribute into the user buffer ubuf.
+ *
+ * On entry *ubufsz is the capacity of ubuf; on success it is updated to
+ * the number of bytes copied out, which includes the terminating NUL.  The
+ * string is snapshotted into a local buffer under lxzd_lock and copied out
+ * after the lock has been dropped.
+ *
+ * Returns 0 on success, ERANGE if ubuf is too small, EFAULT if the copyout
+ * fails, and EINVAL for any other attribute.
+ */
+/* ARGSUSED */
+int
+lx_getattr(zone_t *zone, int attr, void *ubuf, size_t *ubufsz)
+{
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+	int len;
+
+	switch (attr) {
+	case LX_ATTR_KERN_RELEASE: {
+		char rel[LX_KERN_RELEASE_MAX];
+
+		bzero(rel, sizeof (rel));
+
+		mutex_enter(&lxzd->lxzd_lock);
+		/* Length to copy out, counting the terminating NUL. */
+		len = 1 + strnlen(lxzd->lxzd_kernel_release,
+		    LX_KERN_RELEASE_MAX);
+		if (*ubufsz < len) {
+			mutex_exit(&lxzd->lxzd_lock);
+			return (ERANGE);
+		}
+		(void) strncpy(rel, lxzd->lxzd_kernel_release, sizeof (rel));
+		mutex_exit(&lxzd->lxzd_lock);
+
+		if (copyout(rel, ubuf, len) != 0)
+			return (EFAULT);
+		*ubufsz = len;
+		return (0);
+	}
+	case LX_ATTR_KERN_VERSION: {
+		char ver[LX_KERN_VERSION_MAX];
+
+		bzero(ver, sizeof (ver));
+
+		mutex_enter(&lxzd->lxzd_lock);
+		/* Length to copy out, counting the terminating NUL. */
+		len = 1 + strnlen(lxzd->lxzd_kernel_version,
+		    LX_KERN_VERSION_MAX);
+		if (*ubufsz < len) {
+			mutex_exit(&lxzd->lxzd_lock);
+			return (ERANGE);
+		}
+		(void) strncpy(ver, lxzd->lxzd_kernel_version, sizeof (ver));
+		mutex_exit(&lxzd->lxzd_lock);
+
+		if (copyout(ver, ubuf, len) != 0)
+			return (EFAULT);
+		*ubufsz = len;
+		return (0);
+	}
+	default:
+		break;
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Brand hook returning the upper bound for MAP_32BIT mappings in process p:
+ * 2^31 for 64-bit processes, USERLIMIT32 otherwise.
+ */
+uint32_t
+lx_map32limit(proc_t *p)
+{
+	/*
+	 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
+	 * allow mappings in the first 31 bits.  This was a nuance in the
+	 * original Linux implementation circa 2002, and applications have
+	 * come to depend on its behavior.
+	 *
+	 * This is only relevant for 64-bit processes.
+	 */
+	if (p->p_model == DATAMODEL_LP64) {
+		/*
+		 * Shift an unsigned operand: "1 << 31" would shift a signed
+		 * int into its sign bit, which is undefined behaviour.
+		 */
+		return ((uint32_t)1 << 31);
+	}
+
+	return ((uint32_t)USERLIMIT32);
+}
+
+/*
+ * Turn on system call tracing by setting lx_systrace_enabled.  Tracing
+ * must currently be off; the VERIFY enforces that.
+ */
+void
+lx_brand_systrace_enable(void)
+{
+	VERIFY(!lx_systrace_enabled);
+
+	lx_systrace_enabled = 1;
+}
+
+/*
+ * Turn off system call tracing by clearing lx_systrace_enabled.  Tracing
+ * must currently be on; the VERIFY enforces that.
+ */
+void
+lx_brand_systrace_disable(void)
+{
+	VERIFY(lx_systrace_enabled);
+
+	lx_systrace_enabled = 0;
+}
+
+/*
+ * Record new_sp as the current in-use extent of the LWP's native stack.
+ * A native stack must already have been installed (br_ntv_stack != 0).
+ */
+void
+lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
+{
+	uintptr_t prev_sp, stack_base;
+
+	VERIFY(lwpd->br_ntv_stack != 0);
+
+	prev_sp = lwpd->br_ntv_stack_current;
+	stack_base = lwpd->br_ntv_stack;
+
+	/*
+	 * The "brand-lx-set-ntv-stack-current" probe fires before the
+	 * update and has arguments:
+	 *   arg0: stack pointer before change
+	 *   arg1: stack pointer after change
+	 *   arg2: current stack base
+	 */
+	DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
+	    uintptr_t, prev_sp,
+	    uintptr_t, new_sp,
+	    uintptr_t, stack_base);
+
+	lwpd->br_ntv_stack_current = new_sp;
+}
+
+#if defined(_LP64)
+/*
+ * Page fault hook, used to emulate Linux vsyscall entry points.
+ *
+ * Only one situation is handled: an LWP running in the native (64-bit)
+ * data model takes an F_INVAL fault while trying to execute an address
+ * that lx_vsyscall_iscall() recognises.  In that case the system call is
+ * serviced and 0 is returned, telling the caller that the fault has been
+ * dealt with completely.  For anything else we return -1 as quickly as
+ * possible.
+ */
+static int
+lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
+    enum seg_rw rw)
+{
+	int vsys_num;
+
+	VERIFY(PROC_IS_BRANDED(p));
+
+	/* Cheap rejections first. */
+	if (type != F_INVAL)
+		return (-1);
+	if (rw != S_EXEC)
+		return (-1);
+	if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE)
+		return (-1);
+
+	/* Is the faulting address one of the vsyscall entry points? */
+	if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &vsys_num))
+		return (-1);
+
+	lx_vsyscall_enter(p, lwp, vsys_num);
+	return (0);
+}
+#endif
+
+/*
+ * Hook run before sendsig() processing to nominate the stack pointer at
+ * which the signal handling frame will be built.  This routine must _not_
+ * modify any LWP state, because savecontext() does not run until after
+ * this hook.
+ *
+ * Signals are taken on the native stack whenever one has been allocated
+ * and installed for the LWP.
+ */
+static caddr_t
+lx_sendsig_stack(int sig)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp;
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/*
+		 * Either the program is already on the native stack, or no
+		 * native stack exists yet for this LWP.  In both cases the
+		 * current stack pointer is the right place.
+		 */
+		rp = lwptoregs(lwp);
+		return ((caddr_t)rp->r_sp);
+	}
+
+	/*
+	 * Running on the brand stack: switch to the native stack pointer
+	 * held in our brand-private data.
+	 */
+	return ((caddr_t)lwpd->br_ntv_stack_current);
+}
+
+/*
+ * Hook run after sendsig() processing.  By now the pre-signal context has
+ * been saved and delivered to the user, so it is safe to update the
+ * per-LWP stack/system call mode.
+ */
+static void
+lx_sendsig(int sig)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *regs = lwptoregs(lwp);
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND &&
+	    lwpd->br_stack_mode != LX_STACK_MODE_NATIVE) {
+		/*
+		 * The brand library has not yet installed the alternate
+		 * stack for this LWP.  The signal is handled on the regular
+		 * thread stack and there is nothing to update.
+		 */
+		return;
+	}
+
+	/*
+	 * lx_sendsig_stack() nominated a stack pointer on the native stack.
+	 * Record the new mode and the now-occupied extent of that stack.
+	 */
+	lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+	lx_lwp_set_native_stack_current(lwpd, regs->r_sp);
+
+	/* Fix up segment registers, etc. */
+	lx_switch_to_native(lwp);
+}
+
+/*
+ * Hook run just before a saved context is loaded, giving the brand a
+ * chance to act on, or adjust, the context first.  The flags stored in
+ * uc_brand_data[0] by the savecontext hooks say what must be done;
+ * uc_brand_data[1] carries the saved native stack pointer.
+ */
+static void
+lx_restorecontext(ucontext_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	uintptr_t ucflags = (uintptr_t)ucp->uc_brand_data[0];
+
+	/* Put the saved native stack pointer back into the per-LWP data. */
+	if ((ucflags & LX_UC_RESTORE_NATIVE_SP) != 0) {
+		lx_lwp_set_native_stack_current(lwpd,
+		    (uintptr_t)ucp->uc_brand_data[1]);
+	}
+
+	/*
+	 * When asked to ignore uc_link, substitute the value currently held
+	 * by the LWP so that the restore leaves the link untouched.
+	 */
+	if ((ucflags & LX_UC_IGNORE_LINK) != 0)
+		ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
+
+	/* Restore the stack mode; the native flag wins if both are set. */
+	if ((ucflags & LX_UC_STACK_NATIVE) != 0)
+		lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+	else if ((ucflags & LX_UC_STACK_BRAND) != 0)
+		lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
+
+#if defined(__amd64)
+	/*
+	 * For a brand-stack context, any fs/gsbase provided through the
+	 * Linux arch_prctl(2) system call overrides the value in the
+	 * context.
+	 */
+	if ((ucflags & LX_UC_STACK_BRAND) != 0) {
+		greg_t *gregs = ucp->uc_mcontext.gregs;
+
+		if (lwpd->br_lx_fsbase != 0)
+			gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
+		if (lwpd->br_lx_gsbase != 0)
+			gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
+	}
+#endif
+}
+
+/*
+ * Context-save hook: stash the brand's per-LWP stack state in the three
+ * private pointer-sized "uc_brand_data" members of the ucontext_t.
+ *
+ *   [0] flags (LX_UC_*)
+ *   [1] native stack pointer, valid when LX_UC_RESTORE_NATIVE_SP is set
+ *   [2] system call number when saved in brand mode, otherwise NULL; this
+ *       is analogous to the "orig_[er]ax" member of a Linux
+ *       "user_regs_struct"
+ */
+static void
+lx_savecontext(ucontext_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	uintptr_t ucflags = 0;
+
+	switch (lwpd->br_stack_mode) {
+	case LX_STACK_MODE_INIT:
+	case LX_STACK_MODE_PREINIT:
+		/* No native stack pointer is recorded in these modes. */
+		break;
+	default:
+		/*
+		 * Record the native stack pointer to restore when returning
+		 * to this branded context.
+		 */
+		ucflags |= LX_UC_RESTORE_NATIVE_SP;
+		ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
+		break;
+	}
+
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		ucflags |= LX_UC_STACK_BRAND;
+
+		/*
+		 * Save what is needed should this system call have to be
+		 * restarted.
+		 */
+		ucp->uc_brand_data[2] =
+		    (void *)(uintptr_t)lwpd->br_syscall_num;
+		if (lwpd->br_syscall_restart)
+			ucflags |= LX_UC_RESTART_SYSCALL;
+	} else {
+		if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE)
+			ucflags |= LX_UC_STACK_NATIVE;
+		ucp->uc_brand_data[2] = NULL;
+	}
+
+	ucp->uc_brand_data[0] = (void *)ucflags;
+}
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * 32-bit counterpart of lx_savecontext(): stash the brand's per-LWP stack
+ * state in the three private "uc_brand_data" members of the ucontext32_t.
+ *
+ *   [0] flags (LX_UC_*)
+ *   [1] native stack pointer, valid when LX_UC_RESTORE_NATIVE_SP is set
+ *   [2] system call number when saved in brand mode, otherwise NULL; this
+ *       is analogous to the "orig_[er]ax" member of a Linux
+ *       "user_regs_struct"
+ */
+static void
+lx_savecontext32(ucontext32_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	unsigned int ucflags = 0;
+
+	switch (lwpd->br_stack_mode) {
+	case LX_STACK_MODE_INIT:
+	case LX_STACK_MODE_PREINIT:
+		/* No native stack pointer is recorded in these modes. */
+		break;
+	default:
+		/*
+		 * Record the native stack pointer to restore when returning
+		 * to this branded context.
+		 */
+		ucflags |= LX_UC_RESTORE_NATIVE_SP;
+		ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
+		break;
+	}
+
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		ucflags |= LX_UC_STACK_BRAND;
+
+		/*
+		 * Save what is needed should this system call have to be
+		 * restarted.
+		 */
+		ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
+		if (lwpd->br_syscall_restart)
+			ucflags |= LX_UC_RESTART_SYSCALL;
+	} else {
+		if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE)
+			ucflags |= LX_UC_STACK_NATIVE;
+		ucp->uc_brand_data[2] = NULL;
+	}
+
+	ucp->uc_brand_data[0] = ucflags;
+}
+#endif
+
+/*
+ * Issue the ZFS ioctl "cmd" on the LDI handle lh using the command block
+ * zc.
+ *
+ * If dst_alloc_size is NULL no destination nvlist buffer is supplied.
+ * Otherwise a buffer is allocated for zc_nvlist_dst, starting at 8192
+ * bytes and re-allocated at the size reported in zc_nvlist_dst_size each
+ * time the ioctl fails with ENOMEM; zc_cookie is reset to its entry value
+ * before each retry.
+ *
+ * Returns the final ldi_ioctl() result.  Whenever dst_alloc_size is
+ * non-NULL, the buffer is still allocated on return -- whether or not the
+ * ioctl succeeded -- and *dst_alloc_size holds its size; the caller is
+ * responsible for freeing it.
+ */
+static int
+lx_zfs_ioctl(ldi_handle_t lh, int cmd, zfs_cmd_t *zc, size_t *dst_alloc_size)
+{
+	uint64_t	saved_cookie = zc->zc_cookie;
+	size_t		dstsize = 0;
+	int		rc, rval;
+
+	if (dst_alloc_size != NULL)
+		dstsize = 8192;
+
+	for (;;) {
+		size_t newsize;
+
+		if (dst_alloc_size != NULL) {
+			zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(
+			    dstsize, KM_SLEEP);
+			zc->zc_nvlist_dst_size = dstsize;
+		}
+
+		rc = ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred, &rval);
+		if (rc != ENOMEM || dst_alloc_size == NULL)
+			break;
+
+		/*
+		 * Our nvlist_dst buffer was too small, retry with a bigger
+		 * buffer. ZFS will tell us the exact needed size.
+		 */
+		newsize = zc->zc_nvlist_dst_size;
+		ASSERT(newsize > dstsize);
+
+		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, dstsize);
+		dstsize = newsize;
+		zc->zc_cookie = saved_cookie;
+	}
+
+	if (dst_alloc_size != NULL)
+		*dst_alloc_size = dstsize;
+
+	return (rc);
+}
+
+/*
+ * Open "/dev/zfs" through LDI for reading and writing with kcred.
+ *
+ * On success returns 0 with the open handle in *lh and the device number
+ * in *zfs_dev.  Returns -1 on any failure, in which case no handle is left
+ * open.
+ */
+static int
+lx_zone_zfs_open(ldi_handle_t *lh, dev_t *zfs_dev)
+{
+	ldi_ident_t ident;
+	int err;
+
+	if (ldi_ident_from_mod(&modlinkage, &ident) != 0)
+		return (-1);
+
+	/* The identity is only needed for the duration of the open. */
+	err = ldi_open_by_name("/dev/zfs", FREAD|FWRITE, kcred, lh, ident);
+	ldi_ident_release(ident);
+	if (err != 0)
+		return (-1);
+
+	if (ldi_get_dev(*lh, zfs_dev) != 0) {
+		(void) ldi_close(*lh, FREAD|FWRITE, kcred);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * We only get the relevant properties for zvols. This is because we're
+ * essentially iterating all of the ZFS datasets/zvols on the entire system
+ * when we boot the zone and there is a significant performance penalty if we
+ * have to retrieve all of the properties for everything. Especially since we
+ * don't care about any of them except the zvols actually in our delegated
+ * datasets.
+ *
+ * Note that the two properties we care about, volsize & volblocksize, are
+ * mandatory for zvols and should always be present. Also, note that the
+ * blocksize property value cannot change after the zvol has been created.
+ *
+ * The dataset is named by zc->zc_name.  *vsz and *bsz are only written when
+ * the corresponding property is found; on any failure they are left
+ * untouched.
+ */
+static void
+lx_zvol_props(ldi_handle_t lh, zfs_cmd_t *zc, uint64_t *vsz, uint64_t *bsz)
+{
+	int		rc;
+	size_t		size;
+	nvlist_t	*nv = NULL, *nv2;
+	uint64_t	val;
+
+	rc = lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size);
+	if (rc == 0) {
+		rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+		    zc->zc_nvlist_dst_size, &nv, 0);
+		ASSERT(rc == 0);
+	}
+
+	/*
+	 * lx_zfs_ioctl() leaves the destination buffer allocated whether or
+	 * not the ioctl succeeded, so it must be released on every path.
+	 */
+	kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+	zc->zc_nvlist_dst = 0;
+	zc->zc_nvlist_dst_size = 0;
+
+	if (rc != 0)
+		return;
+
+	if (nvlist_lookup_nvlist(nv, "volsize", &nv2) == 0 &&
+	    nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val) == 0) {
+		*vsz = val;
+	}
+
+	if (nvlist_lookup_nvlist(nv, "volblocksize", &nv2) == 0 &&
+	    nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val) == 0) {
+		*bsz = val;
+	}
+
+	nvlist_free(nv);
+}
+
+/*
+ * Unlike ZFS proper, which does dynamic zvols, we currently only generate the
+ * zone's "disk" list once at zone boot time and use that consistently in all
+ * of the various subsystems (devfs, sysfs, procfs).  This allows us to avoid
+ * re-iterating the datasets every time one of those subsystems accesses a
+ * "disk" and allows us to keep the view consistent across all subsystems, but
+ * it does mean a reboot is required to see new "disks". This is somewhat
+ * mitigated by its similarity to actual disk drives on a real system.
+ *
+ * Every pool reported by ZFS_IOC_POOL_CONFIGS is walked depth-first with
+ * ZFS_IOC_DATASET_LIST_NEXT.  Each zvol that is visible in the zone gets an
+ * lx_virt_disk_t appended to the zone's lxzd_vdisks list, taking its
+ * emulated minor number from *emul_minor (which is advanced).
+ */
+static void
+lx_zone_get_zvols(zone_t *zone, ldi_handle_t lh, minor_t *emul_minor)
+{
+	lx_zone_data_t *lxzd;
+	list_t *zvol_lst, ds_lst;
+	int rc;
+	unsigned int devnum = 0;
+	size_t size;
+	zfs_cmd_t *zc;
+	nvpair_t *elem = NULL;
+	nvlist_t *pnv = NULL;
+
+	lxzd = ztolxzd(zone);
+	ASSERT(lxzd != NULL);
+	zvol_lst = lxzd->lxzd_vdisks;
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	if (lx_zfs_ioctl(lh, ZFS_IOC_POOL_CONFIGS, zc, &size) != 0) {
+		/*
+		 * lx_zfs_ioctl() leaves the destination buffer allocated
+		 * even when the ioctl fails; release it before bailing out.
+		 */
+		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+		goto out;
+	}
+	ASSERT(zc->zc_cookie > 0);
+
+	rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+	    zc->zc_nvlist_dst_size, &pnv, 0);
+	kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+	if (rc != 0)
+		goto out;
+
+	/*
+	 * We use a dataset list to process all of the datasets in the pool
+	 * without doing recursion so that we don't risk blowing the kernel
+	 * stack.
+	 */
+	list_create(&ds_lst, sizeof (lx_zfs_ds_t),
+	    offsetof(lx_zfs_ds_t, ds_link));
+
+	while ((elem = nvlist_next_nvpair(pnv, elem)) != NULL) {
+		lx_zfs_ds_t *ds;
+
+		/* Seed the work list with the pool's top-level dataset. */
+		ds = kmem_zalloc(sizeof (lx_zfs_ds_t), KM_SLEEP);
+		(void) strlcpy(ds->ds_name, nvpair_name(elem),
+		    sizeof (ds->ds_name));
+		list_insert_head(&ds_lst, ds);
+
+		while (ds != NULL) {
+			int w;		/* dummy variable */
+
+			bzero(zc, sizeof (zfs_cmd_t));
+			zc->zc_cookie = ds->ds_cookie;
+			(void) strlcpy(zc->zc_name, ds->ds_name,
+			    sizeof (zc->zc_name));
+
+			rc = lx_zfs_ioctl(lh, ZFS_IOC_DATASET_LIST_NEXT,
+			    zc, NULL);
+			/* Update the cookie before doing anything else. */
+			ds->ds_cookie = zc->zc_cookie;
+
+			if (rc != 0) {
+				/*
+				 * No more children under this dataset: drop
+				 * it and resume with the entry now at the
+				 * tail of the list.
+				 */
+				list_remove(&ds_lst, ds);
+				kmem_free(ds, sizeof (lx_zfs_ds_t));
+				ds = list_tail(&ds_lst);
+				continue;
+			}
+
+			/* Reserved internal names, skip over these. */
+			if (strchr(zc->zc_name, '$') != NULL ||
+			    strchr(zc->zc_name, '%') != NULL)
+				continue;
+
+			if (!zone_dataset_visible_inzone(zone, zc->zc_name, &w))
+				continue;
+
+			if (zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) {
+				lx_virt_disk_t *vd;
+				minor_t m = 0;
+				char *znm = zc->zc_name;
+
+				/* Create a virtual disk entry for the zvol */
+				vd = kmem_zalloc(sizeof (lx_virt_disk_t),
+				    KM_SLEEP);
+				vd->lxvd_type = LXVD_ZVOL;
+				(void) snprintf(vd->lxvd_name,
+				    sizeof (vd->lxvd_name),
+				    "zvol%u", devnum++);
+				(void) strlcpy(vd->lxvd_real_name,
+				    zc->zc_name,
+				    sizeof (vd->lxvd_real_name));
+
+				/* Record emulated and real dev_t values */
+				vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK,
+				    (*emul_minor)++);
+				if (zvol_name2minor(znm, &m) != 0) {
+					/*
+					 * No minor node yet; try to create
+					 * one.  If that fails too, m stays 0
+					 * and lxvd_real_dev is left unset.
+					 */
+					(void) zvol_create_minor(znm);
+					(void) zvol_name2minor(znm, &m);
+				}
+				if (m != 0) {
+					vd->lxvd_real_dev = makedevice(
+					    getmajor(lxzd->lxzd_zfs_dev), m);
+				}
+
+				/* Query volume size properties */
+				lx_zvol_props(lh, zc, &vd->lxvd_volsize,
+				    &vd->lxvd_blksize);
+
+				list_insert_tail(zvol_lst, vd);
+			} else {
+				lx_zfs_ds_t *nds;
+
+				/* Create a new ds_t for the child. */
+				nds = kmem_zalloc(sizeof (lx_zfs_ds_t),
+				    KM_SLEEP);
+				(void) strlcpy(nds->ds_name, zc->zc_name,
+				    sizeof (nds->ds_name));
+				list_insert_after(&ds_lst, ds, nds);
+
+				/* Depth-first, so do the one just created. */
+				ds = nds;
+			}
+		}
+
+		ASSERT(list_is_empty(&ds_lst));
+	}
+
+	list_destroy(&ds_lst);
+
+out:
+	nvlist_free(pnv);
+	kmem_free(zc, sizeof (zfs_cmd_t));
+}
+
+/*
+ * Record the zone's root filesystem as a virtual disk.  An entry is only
+ * created when the major number of the root vfs device equals the major
+ * number of lxzd_zfs_dev; it is appended to lxzd_vdisks and consumes one
+ * emulated minor number from *emul_minor.
+ */
+static void
+lx_zone_get_zfsds(zone_t *zone, minor_t *emul_minor)
+{
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	vfs_t *rootvfs = zone->zone_rootvp->v_vfsp;
+	lx_virt_disk_t *disk;
+
+	/*
+	 * At zone init time nothing but the root is mounted.  Discovering
+	 * other datasets mounted in the zone would be a worthwhile later
+	 * enhancement.
+	 */
+	if (getmajor(rootvfs->vfs_dev) != getmajor(lxzd->lxzd_zfs_dev))
+		return;
+
+	disk = kmem_zalloc(sizeof (*disk), KM_SLEEP);
+	disk->lxvd_type = LXVD_ZFS_DS;
+
+	/* Real and emulated dev_t values for this dataset. */
+	disk->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, (*emul_minor)++);
+	disk->lxvd_real_dev = rootvfs->vfs_dev;
+
+	(void) snprintf(disk->lxvd_name, sizeof (disk->lxvd_name),
+	    "zfsds%u", 0);
+	(void) strlcpy(disk->lxvd_real_name,
+	    refstr_value(rootvfs->vfs_resource),
+	    sizeof (disk->lxvd_real_name));
+
+	list_insert_tail(lxzd->lxzd_vdisks, disk);
+}
+
+/*
+ * Tear down the zone's virtual disk list: free every entry still on the
+ * list, destroy the list, free the list_t itself and clear lxzd_vdisks.
+ */
+static void
+lx_zone_cleanup_vdisks(lx_zone_data_t *lxzd)
+{
+	list_t *disks = lxzd->lxzd_vdisks;
+	lx_virt_disk_t *entry;
+
+	ASSERT(disks != NULL);
+
+	/* Drain the list from the head until nothing is left. */
+	while ((entry = list_remove_head(disks)) != NULL) {
+		kmem_free(entry, sizeof (*entry));
+	}
+
+	list_destroy(disks);
+	kmem_free(disks, sizeof (*disks));
+	lxzd->lxzd_vdisks = NULL;
+}
+
+/*
+ * Allocate and attach the per-zone LX brand data.
+ *
+ * Called with the lock 'zsl' held.  The lock is dropped while the virtual
+ * disk list is populated and is held again when this function returns.
+ */
+void
+lx_init_brand_data(zone_t *zone, kmutex_t *zsl)
+{
+	lx_zone_data_t *lxzd;
+	ldi_handle_t zfs_lh;
+
+	ASSERT(MUTEX_HELD(zsl));
+	ASSERT(zone->zone_brand == &lx_brand);
+	ASSERT(zone->zone_brand_data == NULL);
+
+	lxzd = kmem_zalloc(sizeof (*lxzd), KM_SLEEP);
+	mutex_init(&lxzd->lxzd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * lxzd_lock need not be taken yet: zone_brand_data has not been
+	 * set, so nobody else can reach this structure.
+	 *
+	 * Default to a 2.4 kernel release.  This can be changed by a call
+	 * to setattr() during zone boot.
+	 */
+	(void) strlcpy(lxzd->lxzd_kernel_release, "2.4.21",
+	    LX_KERN_RELEASE_MAX);
+	(void) strlcpy(lxzd->lxzd_kernel_version, "BrandZ virtual linux",
+	    LX_KERN_VERSION_MAX);
+
+	zone->zone_brand_data = lxzd;
+
+	/*
+	 * On Linux the system panics if init(1) terminates.  Rebooting the
+	 * zone simulates that behaviour.
+	 */
+	zone->zone_reboot_on_init_exit = B_TRUE;
+
+	/*
+	 * The zone_status_lock cannot be held across zfs operations, so it
+	 * is dropped here, the zfs devices are gathered as the final step of
+	 * this function, and the lock is then reacquired.  Do not add code
+	 * after this point that relies on the zone_status_lock having been
+	 * held continuously.
+	 */
+	mutex_exit(zsl);
+
+	lxzd->lxzd_vdisks = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(lxzd->lxzd_vdisks, sizeof (lx_virt_disk_t),
+	    offsetof(lx_virt_disk_t, lxvd_link));
+
+	if (lx_zone_zfs_open(&zfs_lh, &lxzd->lxzd_zfs_dev) != 0) {
+		/* Avoid matching any devices */
+		lxzd->lxzd_zfs_dev = makedevice(-1, 0);
+	} else {
+		/* Emulated minor numbers are handed out starting at 1. */
+		minor_t next_minor = 1;
+
+		lx_zone_get_zfsds(zone, &next_minor);
+		lx_zone_get_zvols(zone, zfs_lh, &next_minor);
+		(void) ldi_close(zfs_lh, FREAD|FWRITE, kcred);
+	}
+
+	mutex_enter(zsl);
+}
+
+/*
+ * Release the per-zone LX brand data: close the ioctl socket if one is
+ * open, free the virtual disk list, detach the data from the zone and
+ * free it.
+ */
+void
+lx_free_brand_data(zone_t *zone)
+{
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+
+	ASSERT(lxzd != NULL);
+
+	mutex_enter(&lxzd->lxzd_lock);
+
+	/*
+	 * zone_kcred has already been cleaned up by this point, so the
+	 * global kcred is used to close the socket.
+	 */
+	if (lxzd->lxzd_ioctl_sock != NULL) {
+		ksocket_close(lxzd->lxzd_ioctl_sock, kcred);
+		lxzd->lxzd_ioctl_sock = NULL;
+	}
+
+	ASSERT(lxzd->lxzd_cgroup == NULL);
+	lx_zone_cleanup_vdisks(lxzd);
+	mutex_exit(&lxzd->lxzd_lock);
+
+	zone->zone_brand_data = NULL;
+	mutex_destroy(&lxzd->lxzd_lock);
+	kmem_free(lxzd, sizeof (lx_zone_data_t));
+}
+
+/*
+ * Report an unsupported operation.  The message is always handed to the
+ * brand__lx__unsupported DTrace probe; when the process has
+ * LX_PROC_STRICT_MODE set, the current LWP is additionally flagged so that
+ * lx_check_strict_failure() can act on it later.
+ */
+void
+lx_unsupported(char *dmsg)
+{
+	lx_proc_data_t *procd = ttolxproc(curthread);
+	lx_lwp_data_t *lwpd;
+
+	DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
+
+	if (procd == NULL || (procd->l_flags & LX_PROC_STRICT_MODE) == 0)
+		return;
+
+	/*
+	 * Strict mode (LX_STRICT in the environment) is enabled for this
+	 * process: remember that this LWP triggered unsupported behaviour.
+	 * lx_check_strict_failure() tests the flag at an appropriate point.
+	 */
+	lwpd = ttolxlwp(curthread);
+	lwpd->br_strict_failure = B_TRUE;
+}
+
+/*
+ * If the given LWP has been flagged by lx_unsupported(), clear the flag
+ * and post SIGSYS to the current thread.
+ */
+void
+lx_check_strict_failure(lx_lwp_data_t *lwpd)
+{
+	if (lwpd->br_strict_failure) {
+		proc_t *self;
+
+		lwpd->br_strict_failure = B_FALSE;
+
+		/*
+		 * The process runs in strict mode (LX_STRICT in the
+		 * environment) and has called lx_unsupported(), so SIGSYS
+		 * is dropped on it as we return.
+		 */
+		self = curproc;
+		mutex_enter(&self->p_lock);
+		sigtoproc(self, curthread, SIGSYS);
+		mutex_exit(&self->p_lock);
+	}
+}
+
+/*
+ * Invoke the lx_systrace entry hook, if tracing is enabled, with the
+ * system call number and the first six arguments.
+ */
+void
+lx_trace_sysenter(int syscall_num, uintptr_t *args)
+{
+	if (!lx_systrace_enabled)
+		return;
+
+	VERIFY(lx_systrace_entry_ptr != NULL);
+	(*lx_systrace_entry_ptr)(syscall_num,
+	    args[0], args[1], args[2], args[3], args[4], args[5]);
+}
+
+/*
+ * Invoke the lx_systrace return hook, if tracing is enabled.  The return
+ * value is passed in both of the first two argument slots.
+ */
+void
+lx_trace_sysreturn(int syscall_num, long ret)
+{
+	if (!lx_systrace_enabled)
+		return;
+
+	VERIFY(lx_systrace_return_ptr != NULL);
+	(*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
+}
+
+/*
+ * Brand system call entry point: dispatch on the brand subcommand 'cmd'.
+ *
+ * Get the addresses of the user-space system call handler and attach it to
+ * the proc structure. Returning 0 indicates success; the value returned
+ * by the system call is the value stored in rval. Returning a non-zero
+ * value indicates a failure; the value returned is used to set errno, -1
+ * is returned from the syscall and the contents of rval are ignored. To
+ * set errno and have the syscall return a value other than -1 we can
+ * manually set errno and rval and return 0.
+ *
+ * The meaning of arg1..arg5 depends on the subcommand; see the individual
+ * cases below.  Unknown subcommands fail with EINVAL.
+ */
+int
+lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = ttoproc(t);
+	lx_proc_data_t *pd;
+	struct termios *termios;
+	uint_t termios_len;
+	int error;
+	int code;
+	int sig;
+	lx_brand_registration_t reg;
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	/*
+	 * There is one operation that is supported for non-branded
+	 * process.  B_EXEC_BRAND.  This is the equivalent of an
+	 * exec call, but the new process that is created will be
+	 * a branded process.
+	 */
+	if (cmd == B_EXEC_BRAND) {
+		VERIFY(p->p_zone != NULL);
+		VERIFY(p->p_zone->zone_brand == &lx_brand);
+		return (exec_common(
+		    (char *)arg1, (const char **)arg2, (const char **)arg3,
+		    EBA_BRAND));
+	}
+
+	/* For all other operations this must be a branded process. */
+	if (p->p_brand == NULL)
+		return (ENOSYS);
+
+	VERIFY(p->p_brand == &lx_brand);
+	VERIFY(p->p_brand_data != NULL);
+
+	switch (cmd) {
+	case B_REGISTER:
+		if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+			lx_print("stack mode was not PREINIT during "
+			    "REGISTER\n");
+			return (EINVAL);
+		}
+
+		if (p->p_model == DATAMODEL_NATIVE) {
+			if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
+				lx_print("Failed to copyin brand registration "
+				    "at 0x%p\n", (void *)arg1);
+				return (EFAULT);
+			}
+		}
+#ifdef _LP64
+		else {
+			/* 32-bit userland on 64-bit kernel */
+			lx_brand_registration32_t reg32;
+
+			if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
+				lx_print("Failed to copyin brand registration "
+				    "at 0x%p\n", (void *)arg1);
+				return (EFAULT);
+			}
+
+			reg.lxbr_version = (uint_t)reg32.lxbr_version;
+			reg.lxbr_handler =
+			    (void *)(uintptr_t)reg32.lxbr_handler;
+			reg.lxbr_flags = reg32.lxbr_flags;
+		}
+#endif
+
+		if (reg.lxbr_version != LX_VERSION_1) {
+			lx_print("Invalid brand library version (%u)\n",
+			    reg.lxbr_version);
+			return (EINVAL);
+		}
+
+		if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
+			lx_print("Invalid brand flags (%u)\n",
+			    reg.lxbr_flags);
+			return (EINVAL);
+		}
+
+		lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
+		    (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
+		pd = p->p_brand_data;
+		pd->l_handler = (uintptr_t)reg.lxbr_handler;
+		pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
+
+		return (0);
+
+	case B_TTYMODES:
+		/* This is necessary for emulating TCGETS ioctls. */
+		if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
+		    DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
+		    &termios_len) != DDI_SUCCESS)
+			return (EIO);
+
+		ASSERT(termios_len == sizeof (*termios));
+
+		/*
+		 * Copy out the termios settings themselves; 'termios' is
+		 * the pointer to the looked-up property data, not the data.
+		 */
+		if (copyout(termios, (void *)arg1, sizeof (*termios)) != 0) {
+			ddi_prop_free(termios);
+			return (EFAULT);
+		}
+
+		ddi_prop_free(termios);
+		return (0);
+
+	case B_ELFDATA: {
+		/*
+		 * Snapshot the ELF data under p_lock, then copy it out
+		 * after the lock has been dropped.
+		 */
+		mutex_enter(&p->p_lock);
+		pd = curproc->p_brand_data;
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			lx_elf_data_t led;
+
+			bcopy(&pd->l_elf_data, &led, sizeof (led));
+			mutex_exit(&p->p_lock);
+
+			if (copyout(&led, (void *)arg1,
+			    sizeof (lx_elf_data_t)) != 0) {
+				return (EFAULT);
+			}
+		}
+#if defined(_LP64)
+		else {
+			/* 32-bit userland on 64-bit kernel */
+			lx_elf_data32_t led32;
+
+			led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
+			led32.ed_phent = (int)pd->l_elf_data.ed_phent;
+			led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
+			led32.ed_entry = (int)pd->l_elf_data.ed_entry;
+			led32.ed_base = (int)pd->l_elf_data.ed_base;
+			led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
+			mutex_exit(&p->p_lock);
+
+			if (copyout(&led32, (void *)arg1,
+			    sizeof (led32)) != 0) {
+				return (EFAULT);
+			}
+		}
+#endif
+		return (0);
+	}
+
+	case B_EXEC_NATIVE:
+		return (exec_common((char *)arg1, (const char **)arg2,
+		    (const char **)arg3, EBA_NATIVE));
+
+	/*
+	 * The B_TRUSS_POINT subcommand is used so that we can make a no-op
+	 * syscall for debugging purposes (dtracing) from within the user-level
+	 * emulation.
+	 */
+	case B_TRUSS_POINT:
+		return (0);
+
+	case B_LPID_TO_SPAIR: {
+		/*
+		 * Given a Linux pid as arg1, return the Solaris pid in arg2 and
+		 * the Solaris LWP in arg3.  We also translate pid 1 (which is
+		 * hardcoded in many applications) to the zone's init process.
+		 */
+		pid_t s_pid;
+		id_t s_tid;
+
+		if ((pid_t)arg1 == 1) {
+			s_pid = p->p_zone->zone_proc_initpid;
+			/* handle the dead/missing init(1M) case */
+			if (s_pid == -1)
+				s_pid = 1;
+			s_tid = 1;
+		} else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
+			return (ESRCH);
+		}
+
+		if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
+		    copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
+			return (EFAULT);
+		}
+
+		return (0);
+	}
+
+	case B_SIGEV_THREAD_ID: {
+		/*
+		 * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
+		 * notification method. This mechanism is only meant
+		 * for userland threading libraries such as glibc and
+		 * is documented as such. Therefore, assume this is
+		 * only ever invoked for the purpose of alerting a
+		 * Linux threading library. Assume that the tid is a
+		 * member of the caller's process and the signal
+		 * number is valid. See lx_sigev_thread_id() for the
+		 * userland side of this emulation.
+		 *
+		 * The return code from this function is not checked
+		 * by the caller since it executes in an asynchronous
+		 * context and there is nothing much to be done. If
+		 * this function does fail then it will manifest as
+		 * Linux threads waiting for a signal they will never
+		 * receive.
+		 *
+		 * arg1 -- Linux tid
+		 * arg2 -- Linux signal number
+		 * arg3 -- sigval pointer
+		 */
+
+		int native_sig = lx_ltos_signo((int)arg2, 0);
+		pid_t native_pid;
+		int native_tid;
+		sigqueue_t *sqp;
+
+		if (native_sig == 0)
+			return (EINVAL);
+
+		/*
+		 * Without a successful translation native_tid would be
+		 * used uninitialised in the idtot() lookup below.
+		 */
+		if (lx_lpid_to_spair((pid_t)arg1, &native_pid,
+		    &native_tid) < 0)
+			return (ESRCH);
+
+		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+		mutex_enter(&curproc->p_lock);
+
+		if ((t = idtot(curproc, native_tid)) == NULL) {
+			mutex_exit(&curproc->p_lock);
+			kmem_free(sqp, sizeof (sigqueue_t));
+			return (ESRCH);
+		}
+
+		sqp->sq_info.si_signo = native_sig;
+		sqp->sq_info.si_code = SI_TIMER;
+		sqp->sq_info.si_pid = curproc->p_pid;
+		sqp->sq_info.si_zoneid = getzoneid();
+		sqp->sq_info.si_uid = crgetruid(CRED());
+		sqp->sq_info.si_value.sival_ptr = (void *)arg3;
+		sigaddqa(curproc, t, sqp);
+
+		mutex_exit(&curproc->p_lock);
+
+		return (0);
+	}
+
+	case B_SET_AFFINITY_MASK:
+	case B_GET_AFFINITY_MASK:
+		/*
+		 * Retrieve or store the CPU affinity mask for the
+		 * requested linux pid.
+		 *
+		 * arg1 is a linux PID (0 means curthread).
+		 * arg2 is the size of the given mask.
+		 * arg3 is the address of the affinity mask.
+		 */
+		return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
+
+	case B_PTRACE_STOP_FOR_OPT:
+		return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
+		    B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
+
+	case B_PTRACE_CLONE_BEGIN:
+		return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
+		    B_FALSE : B_TRUE));
+
+	case B_HELPER_WAITID: {
+		idtype_t idtype = (idtype_t)arg1;
+		id_t id = (id_t)arg2;
+		siginfo_t *infop = (siginfo_t *)arg3;
+		int options = (int)arg4;
+
+		lwpd = ttolxlwp(curthread);
+
+		/*
+		 * Our brand-specific waitid helper only understands a subset of
+		 * the possible idtypes.  Ensure we keep to that subset here:
+		 */
+		if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
+			return (EINVAL);
+		}
+
+		/*
+		 * Enable the return of emulated ptrace(2) stop conditions
+		 * through lx_waitid_helper, and stash the Linux-specific
+		 * extra waitid() flags.
+		 */
+		lwpd->br_waitid_emulate = B_TRUE;
+		lwpd->br_waitid_flags = (int)arg5;
+
+#if defined(_SYSCALL32_IMPL)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			error = waitsys32(idtype, id, infop, options);
+		} else
+#endif
+		{
+			error = waitsys(idtype, id, infop, options);
+		}
+
+		/*
+		 * The emulation state only applies to the wait above; clear
+		 * it again before handing the result back.
+		 */
+		lwpd->br_waitid_emulate = B_FALSE;
+		lwpd->br_waitid_flags = 0;
+
+		return (error);
+	}
+
+	case B_UNSUPPORTED: {
+		char dmsg[256];
+
+		if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
+			lx_print("Failed to copyin unsupported msg "
+			    "at 0x%p\n", (void *)arg1);
+			return (EFAULT);
+		}
+		/* Userland data is untrusted: force termination. */
+		dmsg[255] = '\0';
+		lx_unsupported(dmsg);
+
+		lx_check_strict_failure(lwpd);
+
+		return (0);
+	}
+
+	case B_STORE_ARGS: {
+		/*
+		 * B_STORE_ARGS subcommand
+		 * arg1 = address of struct to be copied in
+		 * arg2 = size of the struct being copied in
+		 * arg3-arg6 ignored
+		 * rval = the amount of data copied.
+		 */
+		void *buf;
+
+		/* only have upper limit because arg2 is unsigned */
+		if (arg2 > LX_BR_ARGS_SIZE_MAX) {
+			return (EINVAL);
+		}
+
+		buf = kmem_alloc(arg2, KM_SLEEP);
+		if (copyin((void *)arg1, buf, arg2) != 0) {
+			lx_print("Failed to copyin scall arg at 0x%p\n",
+			    (void *) arg1);
+			kmem_free(buf, arg2);
+			/*
+			 * Purposely not setting br_scall_args to NULL
+			 * to preserve data for debugging.
+			 */
+			return (EFAULT);
+		}
+
+		/* Replace any previously stored argument buffer. */
+		if (lwpd->br_scall_args != NULL) {
+			ASSERT(lwpd->br_args_size > 0);
+			kmem_free(lwpd->br_scall_args,
+			    lwpd->br_args_size);
+		}
+
+		lwpd->br_scall_args = buf;
+		lwpd->br_args_size = arg2;
+		*rval = arg2;
+		return (0);
+	}
+
+	case B_HELPER_CLONE:
+		return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
+		    (void *)arg4));
+
+	case B_HELPER_SETGROUPS:
+		return (lx_helper_setgroups(arg1, (gid_t *)arg2));
+
+	case B_HELPER_SIGQUEUE:
+		return (lx_helper_rt_sigqueueinfo(arg1, arg2,
+		    (siginfo_t *)arg3));
+
+	case B_HELPER_TGSIGQUEUE:
+		return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
+		    (siginfo_t *)arg4));
+
+	case B_SET_THUNK_PID:
+		lwpd->br_lx_thunk_pid = arg1;
+		return (0);
+
+	case B_GETPID:
+		/*
+		 * The usermode clone(2) code needs to be able to call
+		 * lx_getpid() from native code:
+		 */
+		*rval = lx_getpid();
+		return (0);
+
+	case B_SET_NATIVE_STACK:
+		/*
+		 * B_SET_NATIVE_STACK subcommand
+		 * arg1 = the base of the stack to use for emulation
+		 */
+		if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+			lx_print("B_SET_NATIVE_STACK when stack was already "
+			    "set to %p\n", (void *)arg1);
+			return (EEXIST);
+		}
+
+		/*
+		 * We move from the PREINIT state, where we have no brand
+		 * emulation stack, to the INIT state.  Here, we are still
+		 * running on what will become the BRAND stack, but are running
+		 * emulation (i.e. native) code.  Once the initialisation
+		 * process for this thread has finished, we will jump to
+		 * brand-specific code, while moving to the BRAND mode.
+		 *
+		 * When a new LWP is created, lx_initlwp() will clear the
+		 * stack data.  If that LWP is actually being duplicated
+		 * into a child process by fork(2), lx_forklwp() will copy
+		 * it so that the cloned thread will keep using the same
+		 * alternate stack.
+		 */
+		lwpd->br_ntv_stack = arg1;
+		lwpd->br_stack_mode = LX_STACK_MODE_INIT;
+		lx_lwp_set_native_stack_current(lwpd, arg1);
+
+		return (0);
+
+	case B_GET_CURRENT_CONTEXT:
+		/*
+		 * B_GET_CURRENT_CONTEXT subcommand:
+		 * arg1 = address for pointer to current ucontext_t
+		 */
+
+#if defined(_SYSCALL32_IMPL)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
+
+			error = copyout(&addr, (void *)arg1, sizeof (addr));
+		} else
+#endif
+		{
+			error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
+			    sizeof (lwp->lwp_oldcontext));
+		}
+
+		return (error != 0 ? EFAULT : 0);
+
+	case B_JUMP_TO_LINUX:
+		/*
+		 * B_JUMP_TO_LINUX subcommand:
+		 * arg1 = ucontext_t pointer for jump state
+		 */
+
+		if (arg1 == NULL)
+			return (EINVAL);
+
+		switch (lwpd->br_stack_mode) {
+		case LX_STACK_MODE_NATIVE: {
+			struct regs *rp = lwptoregs(lwp);
+
+			/*
+			 * We are on the NATIVE stack, so we must preserve
+			 * the extent of that stack.  The pointer will be
+			 * reset by a future setcontext().
+			 */
+			lx_lwp_set_native_stack_current(lwpd,
+			    (uintptr_t)rp->r_sp);
+			break;
+		}
+
+		case LX_STACK_MODE_INIT:
+			/*
+			 * The LWP is transitioning to Linux code for the first
+			 * time.
+			 */
+			break;
+
+		case LX_STACK_MODE_PREINIT:
+			/*
+			 * This LWP has not installed an alternate stack for
+			 * usermode emulation handling.
+			 */
+			return (ENOENT);
+
+		case LX_STACK_MODE_BRAND:
+			/*
+			 * The LWP should not be on the BRAND stack.
+			 */
+			exit(CLD_KILLED, SIGSYS);
+			return (0);
+		}
+
+		/*
+		 * Transfer control to Linux:
+		 */
+		return (lx_runexe(lwp, (void *)arg1));
+
+	case B_EMULATION_DONE:
+		/*
+		 * B_EMULATION_DONE subcommand:
+		 * arg1 = ucontext_t * to restore
+		 * arg2 = system call number
+		 * arg3 = return code
+		 * arg4 = if operation failed, the errno value
+		 */
+
+		/*
+		 * The first part of this operation is a setcontext() to
+		 * restore the register state to the copy we preserved
+		 * before vectoring to the usermode emulation routine.
+		 * If that fails, we return (hopefully) to the emulation
+		 * routine and it will handle the error.
+		 */
+#if defined(_SYSCALL32_IMPL)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			error = getsetcontext32(SETCONTEXT, (void *)arg1);
+		} else
+#endif
+		{
+			error = getsetcontext(SETCONTEXT, (void *)arg1);
+		}
+
+		if (error != 0) {
+			return (error);
+		}
+
+		/*
+		 * The saved Linux context has been restored.  We handle the
+		 * return value or errno with code common to the in-kernel
+		 * system call emulation.
+		 */
+		if ((error = (int)arg4) != 0) {
+			/*
+			 * lx_syscall_return() looks at the errno in the LWP,
+			 * so set it here:
+			 */
+			set_errno(error);
+		}
+		lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
+
+		return (0);
+
+	case B_EXIT_AS_SIG:
+		/*
+		 * B_EXIT_AS_SIG subcommand:
+		 * arg1 = signal number to report as the cause of the exit
+		 */
+		code = CLD_KILLED;
+		sig = (int)arg1;
+		proc_is_exiting(p);
+		if (exitlwps(1) != 0) {
+			mutex_enter(&p->p_lock);
+			lwp_exit();
+		}
+		ttolwp(curthread)->lwp_cursig = sig;
+		if (sig == SIGSEGV) {
+			if (core(sig, 0) == 0)
+				code = CLD_DUMPED;
+		}
+		exit(code, sig);
+		/* NOTREACHED */
+		break;
+
+	case B_OVERRIDE_KERN_VER: {
+		/*
+		 * B_OVERRIDE_KERN_VER subcommand:
+		 * arg1 = user address of release string, or NULL to skip
+		 * arg2 = user address of version string, or NULL to skip
+		 */
+		void *urel = (void *)arg1;
+		void *uver = (void *)arg2;
+		size_t len;
+
+		pd = ptolxproc(p);
+		if (urel != NULL) {
+			if (copyinstr(urel, pd->l_uname_release,
+			    LX_KERN_RELEASE_MAX, &len) != 0) {
+				return (EFAULT);
+			}
+			pd->l_uname_release[LX_KERN_RELEASE_MAX - 1] = '\0';
+		}
+		if (uver != NULL) {
+			if (copyinstr(uver, pd->l_uname_version,
+			    LX_KERN_VERSION_MAX, &len) != 0) {
+				return (EFAULT);
+			}
+			pd->l_uname_version[LX_KERN_VERSION_MAX - 1] = '\0';
+		}
+
+		return (0);
+	}
+
+	case B_GET_PERSONALITY: {
+		unsigned int result;
+
+		/*
+		 * NOTE(review): the personality is handed back as this
+		 * function's return value, which the contract described
+		 * above treats as an errno, rather than through rval.
+		 * Left unchanged pending confirmation of what the userland
+		 * caller expects.
+		 */
+		mutex_enter(&p->p_lock);
+		pd = ptolxproc(p);
+		result = pd->l_personality;
+		mutex_exit(&p->p_lock);
+		return (result);
+	}
+
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Compare the Linux kernel release configured for the zone against 'vers'.
+ * Both are parsed as up to three dot-separated integers; components that
+ * fail to parse stay at zero.  Returns 1 if the zone's release is higher,
+ * -1 if it is lower, and 0 if the two are equal.
+ */
+int
+lx_kern_release_cmp(zone_t *zone, const char *vers)
+{
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+	int zone_rel[3] = {0, 0, 0};
+	int want_rel[3] = {0, 0, 0};
+	int idx;
+
+	VERIFY(zone->zone_brand == &lx_brand);
+
+	/* The zone's release string is read under lxzd_lock. */
+	mutex_enter(&lxzd->lxzd_lock);
+	(void) sscanf(lxzd->lxzd_kernel_release, "%d.%d.%d",
+	    &zone_rel[0], &zone_rel[1], &zone_rel[2]);
+	mutex_exit(&lxzd->lxzd_lock);
+
+	(void) sscanf(vers, "%d.%d.%d",
+	    &want_rel[0], &want_rel[1], &want_rel[2]);
+
+	/* The most significant differing component decides the result. */
+	for (idx = 0; idx < 3; idx++) {
+		if (zone_rel[idx] != want_rel[idx]) {
+			return ((zone_rel[idx] > want_rel[idx]) ? 1 : -1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Brand hook overriding the native illumos behaviour (which is based on
+ * the PRIV_FILE_SETID privilege): Linux unconditionally strips the setuid
+ * and setgid bits when file ownership changes.
+ *
+ * Directories are left alone.  For anything else S_ISUID is cleared when
+ * set, and S_ISGID is cleared when it is set together with S_IXGRP.
+ * AT_MODE is added to va_mask whenever a bit is cleared.  Always returns 0;
+ * 'cr' is not consulted.
+ */
+static int
+lx_setid_clear(vattr_t *vap, cred_t *cr)
+{
+	mode_t strip = 0;
+
+	if (S_ISDIR(vap->va_mode)) {
+		return (0);
+	}
+
+	if ((vap->va_mode & S_ISUID) != 0) {
+		strip |= S_ISUID;
+	}
+	if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+		strip |= S_ISGID;
+	}
+
+	if (strip != 0) {
+		vap->va_mask |= AT_MODE;
+		vap->va_mode &= ~strip;
+	}
+
+	return (0);
+}
+
+/*
+ * Duplicate the parent's per-process brand data into the child 'cp', then
+ * reset the fields that must not be inherited as-is: the ptrace count and
+ * the emulated ("fake") resource limits.
+ */
+void
+lx_copy_procdata(proc_t *cp, proc_t *pp)
+{
+	lx_proc_data_t *child_pd, *parent_pd;
+
+	/*
+	 * b_copy_procdata runs during getproc(), while the child is still
+	 * being initialized, so taking cp->p_lock should not be required.
+	 */
+	VERIFY(cp->p_brand == &lx_brand);
+	child_pd = cp->p_brand_data;
+	VERIFY(child_pd != NULL);
+
+	/* The parent's data is copied while holding the parent's p_lock. */
+	mutex_enter(&pp->p_lock);
+	VERIFY(pp->p_brand == &lx_brand);
+	parent_pd = pp->p_brand_data;
+	VERIFY(parent_pd != NULL);
+	bcopy(parent_pd, child_pd, sizeof (lx_proc_data_t));
+	mutex_exit(&pp->p_lock);
+
+	/*
+	 * l_ptrace is normally only manipulated under p_lock.  This process
+	 * has only just been created, so zeroing it here is safe.  If it is
+	 * to be inherited, the attach will occur later.
+	 */
+	child_pd->l_ptrace = 0;
+
+	/* Emulated limits: NICE is 20, the rest are unlimited. */
+	child_pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
+	child_pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
+
+	child_pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur =
+	    LX_RLIM64_INFINITY;
+	child_pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max =
+	    LX_RLIM64_INFINITY;
+
+	child_pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur =
+	    LX_RLIM64_INFINITY;
+	child_pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max =
+	    LX_RLIM64_INFINITY;
+
+	child_pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur =
+	    LX_RLIM64_INFINITY;
+	child_pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max =
+	    LX_RLIM64_INFINITY;
+}
+
+#if defined(_LP64)
+/*
+ * Populate the native ELF header 'dst' from the 32-bit header 'src',
+ * copying the identification bytes and every header field in turn.
+ */
+static void
+Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
+{
+	/* Identification and object-wide attributes. */
+	bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
+	dst->e_type = src->e_type;
+	dst->e_machine = src->e_machine;
+	dst->e_version = src->e_version;
+	dst->e_flags = src->e_flags;
+	dst->e_ehsize = src->e_ehsize;
+	dst->e_entry = src->e_entry;
+
+	/* Program header table. */
+	dst->e_phoff = src->e_phoff;
+	dst->e_phentsize = src->e_phentsize;
+	dst->e_phnum = src->e_phnum;
+
+	/* Section header table. */
+	dst->e_shoff = src->e_shoff;
+	dst->e_shentsize = src->e_shentsize;
+	dst->e_shnum = src->e_shnum;
+	dst->e_shstrndx = src->e_shstrndx;
+}
+#endif /* _LP64 */
+
+/*
+ * Reinstate a previously saved exec environment 'ep' via setexecenv() and
+ * put the saved alternate signal stack 'sp' back on the current LWP.
+ */
+static void
+restoreexecenv(struct execenv *ep, stack_t *sp)
+{
+	klwp_t *self = ttolwp(curthread);
+
+	setexecenv(ep);
+
+	self->lwp_sigaltstack.ss_flags = sp->ss_flags;
+	self->lwp_sigaltstack.ss_size = sp->ss_size;
+	self->lwp_sigaltstack.ss_sp = sp->ss_sp;
+}
+
+extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
+    long *, int, caddr_t, cred_t *, int *);
+
+extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
+    long *, int, caddr_t, cred_t *, int *);
+
+/*
+ * Map the VDSO library into the address space of the process being
+ * exec'd, LX_VDSO_SIZE bytes below the comm page.
+ *
+ * Returns the address the VDSO was mapped at, or NULL if the comm page is
+ * absent, the library cannot be found, its size cannot be determined or
+ * exceeds LX_VDSO_SIZE, or the mapping itself fails.
+ */
+static uintptr_t
+lx_map_vdso(struct uarg *args, struct cred *cred)
+{
+	char *vdso_path = LX_VDSO_PATH;
+	vnode_t *vdso_vp;
+	vattr_t vattr;
+	caddr_t vdso_base;
+	int err;
+
+#if defined(_LP64)
+	if (args->to_model != DATAMODEL_NATIVE) {
+		vdso_path = LX_VDSO_PATH32;
+	}
+#endif
+
+	/*
+	 * By now the comm page should already be mapped in.
+	 */
+	if (args->commpage == NULL) {
+		return (NULL);
+	}
+
+	/*
+	 * Check that the VDSO library exists.  The lookup starts at the zone
+	 * root so that chrooted processes cause no complications; for the
+	 * specified lookup root to be honoured, the path must be passed
+	 * without its leading slash.
+	 */
+	ASSERT(vdso_path[0] == '/');
+	vdso_path++;
+	if (lookupnameat(vdso_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vdso_vp,
+	    curzone->zone_rootvp) != 0) {
+		return (NULL);
+	}
+
+	/*
+	 * The VDSO depends on data exposed through the comm page, and is
+	 * always mapped at a fixed, known offset from it, which makes it
+	 * easy to locate.  The library must also fit within LX_VDSO_SIZE.
+	 */
+	vdso_base = (caddr_t)(args->commpage - LX_VDSO_SIZE);
+	vattr.va_mask = AT_SIZE;
+	if (VOP_GETATTR(vdso_vp, &vattr, 0, cred, NULL) == 0 &&
+	    vattr.va_size <= LX_VDSO_SIZE) {
+		err = execmap(vdso_vp, vdso_base, vattr.va_size, 0, 0,
+		    PROT_USER|PROT_READ|PROT_EXEC, 1, 0);
+	} else {
+		err = -1;
+	}
+
+	/* The vnode hold is dropped on every path. */
+	VN_RELE(vdso_vp);
+
+	if (err != 0) {
+		return (NULL);
+	}
+	return ((uintptr_t)vdso_base);
+}
+
+/*
+ * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
+ * binaries.
+ */
+static int
+lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+    struct intpdata *idata, int level, long *execsz, int setid,
+    caddr_t exec_file, struct cred *cred, int *brand_action)
+{
+	int		error, i;
+	vnode_t		*nvp;
+	Ehdr		ehdr;
+	Addr		uphdr_vaddr;
+	intptr_t	voffset;
+	char		*interp = NULL;
+	uintptr_t	ldaddr = NULL;
+	proc_t		*p = ttoproc(curthread);
+	klwp_t		*lwp = ttolwp(curthread);
+	lx_proc_data_t	*lxpd = ptolxproc(p);
+	struct execenv	env, origenv;
+	stack_t		orig_sigaltstack;
+	struct user	*up = PTOU(ttoproc(curthread));
+	lx_elf_data_t	edp;
+	char		*lib_path = LX_LIB_PATH;
+	boolean_t	execstk = B_TRUE;
+	unsigned int	personality;
+
+	ASSERT(p->p_brand == &lx_brand);
+	ASSERT(lxpd != NULL);
+
+	/*
+	 * Start with a separate struct for ELF data instead of inheriting
+	 * values from the currently running binary.  This ensures that fields
+	 * such as ed_base are cleared if the new binary does not utilize an
+	 * interpreter.
+	 */
+	bzero(&edp, sizeof (edp));
+
+#if defined(_LP64)
+	if (args->to_model != DATAMODEL_NATIVE) {
+		lib_path = LX_LIB_PATH32;
+	}
+#endif
+
+	/*
+	 * Set the brandname and library name for the new process so that
+	 * elfexec() puts them onto the stack.
+	 */
+	args->brandname = LX_BRANDNAME;
+	args->emulator = lib_path;
+
+#if defined(_LP64)
+	/*
+	 * To conform with the way Linux lays out the address space, we clamp
+	 * the stack to be the top of the lower region of the x86-64 canonical
+	 * form address space -- which has the side-effect of laying out the
+	 * entire address space in that lower region.  Note that this only
+	 * matters on 64-bit processes (this value will always be greater than
+	 * the size of a 32-bit address space) and doesn't actually affect
+	 * USERLIMIT:  if a Linux-branded processes wishes to map something
+	 * into the top half of the address space, it can do so -- but with
+	 * the user stack starting at the top of the bottom region, those high
+	 * virtual addresses won't be used unless explicitly directed.
+	 */
+	args->maxstack = lx_maxstack64;
+#endif
+
+	/*
+	 * Search the binary for a PT_GNU_STACK header.  The PF_X bit contained
+	 * within is used to dictate protection defaults for the stack, among
+	 * other things.
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		Ehdr ehdr;
+		Phdr *phdrp;
+		caddr_t phdrbase = NULL;
+		ssize_t phdrsize = 0;
+		int nphdrs, hsize;
+
+		if ((error = elfreadhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+		    &phdrsize)) != 0) {
+			return (error);
+		}
+
+		hsize = ehdr.e_phentsize;
+		phdrp = (Phdr *)phdrbase;
+		for (i = nphdrs; i > 0; i--) {
+			switch (phdrp->p_type) {
+			case PT_GNU_STACK:
+				if ((phdrp->p_flags & PF_X) == 0) {
+					execstk = B_FALSE;
+				}
+				break;
+			}
+			phdrp = (Phdr *)((caddr_t)phdrp + hsize);
+		}
+		kmem_free(phdrbase, phdrsize);
+	}
+#if defined(_LP64)
+	else {
+		Elf32_Ehdr ehdr;
+		Elf32_Phdr *phdrp;
+		caddr_t phdrbase = NULL;
+		ssize_t phdrsize = 0;
+		int nphdrs, hsize;
+
+		if ((error = elf32readhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+		    &phdrsize)) != 0) {
+			return (error);
+		}
+
+		hsize = ehdr.e_phentsize;
+		phdrp = (Elf32_Phdr *)phdrbase;
+		for (i = nphdrs; i > 0; i--) {
+			switch (phdrp->p_type) {
+			case PT_GNU_STACK:
+				if ((phdrp->p_flags & PF_X) == 0) {
+					execstk = B_FALSE;
+				}
+				break;
+			}
+			phdrp = (Elf32_Phdr *)((caddr_t)phdrp + hsize);
+		}
+		kmem_free(phdrbase, phdrsize);
+	}
+#endif
+
+	/*
+	 * Revert the base personality while maintaining any existing flags.
+	 */
+	personality = LX_PER_LINUX | (lxpd->l_personality & ~LX_PER_MASK);
+
+	/*
+	 * Linux defaults to an executable stack unless the aforementioned
+	 * PT_GNU_STACK entry in the elf header dictates otherwise.  Enabling
+	 * the READ_IMPLIES_EXEC personality flag is also implied in this case.
+	 */
+	if (execstk) {
+		args->stk_prot |= PROT_EXEC;
+		args->stk_prot_override = B_TRUE;
+		personality |= LX_PER_READ_IMPLIES_EXEC;
+	}
+
+	/*
+	 * We will first exec the brand library, then map in the linux
+	 * executable and the linux linker.
+	 */
+	if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
+	    &nvp))) {
+		uprintf("%s: not found.", lib_path);
+		return (error);
+	}
+
+	/*
+	 * We will eventually set the p_exec member to be the vnode for the new
+	 * executable when we call setexecenv(). However, if we get an error
+	 * before that call we need to restore the execenv to its original
+	 * values so that when we return to the caller fop_close() works
+	 * properly while cleaning up from the failed exec().  Restoring the
+	 * original value will also properly decrement the 2nd VN_RELE that we
+	 * took on the brand library.
+	 */
+	origenv.ex_bssbase = p->p_bssbase;
+	origenv.ex_brkbase = p->p_brkbase;
+	origenv.ex_brksize = p->p_brksize;
+	origenv.ex_vp = p->p_exec;
+	orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
+	orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
+	orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
+
+	if (args->to_model == DATAMODEL_NATIVE) {
+		error = elfexec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+		    execsz, setid, exec_file, cred, brand_action);
+	}
+#if defined(_LP64)
+	else {
+		error = elf32exec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+		    execsz, setid, exec_file, cred, brand_action);
+	}
+#endif
+	VN_RELE(nvp);
+	if (error != 0) {
+		restoreexecenv(&origenv, &orig_sigaltstack);
+		return (error);
+	}
+
+	/*
+	 * exec-ed in the brand library above.
+	 * The u_auxv vectors are now setup by elfexec to point to the
+	 * brand emulation library and its linker.
+	 */
+
+	/*
+	 * After execing the brand library (which should have implicitly mapped
+	 * in the comm page), map the VDSO into the appropriate place in the AS.
+	 */
+	lxpd->l_vdso = lx_map_vdso(args, cred);
+
+	bzero(&env, sizeof (env));
+
+	/*
+	 * map in the Linux executable
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
+		    &voffset, exec_file, &interp, &env.ex_bssbase,
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+	}
+#if defined(_LP64)
+	else {
+		Elf32_Ehdr	ehdr32;
+		Elf32_Addr	uphdr_vaddr32;
+
+		error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
+		    &voffset, exec_file, &interp, &env.ex_bssbase,
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+
+		Ehdr32to64(&ehdr32, &ehdr);
+
+		if (uphdr_vaddr32 == (Elf32_Addr)-1)
+			uphdr_vaddr = (Addr)-1;
+		else
+			uphdr_vaddr = uphdr_vaddr32;
+	}
+#endif
+	if (error != 0) {
+		restoreexecenv(&origenv, &orig_sigaltstack);
+
+		if (interp != NULL)
+			kmem_free(interp, MAXPATHLEN);
+
+		return (error);
+	}
+
+	/*
+	 * Save off the important properties of the lx executable. The brand
+	 * library will ask us for this data later, when it is ready to set
+	 * things up for the lx executable.
+	 */
+	edp.ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
+	    voffset + uphdr_vaddr;
+	edp.ed_entry = voffset + ehdr.e_entry;
+	edp.ed_phent = ehdr.e_phentsize;
+	edp.ed_phnum = ehdr.e_phnum;
+
+	if (interp != NULL) {
+		if (ehdr.e_type == ET_DYN) {
+			/*
+			 * This is a shared object executable, so we need to
+			 * pick a reasonable place to put the heap. Just don't
+			 * use the first page.
+			 */
+			env.ex_brkbase = (caddr_t)PAGESIZE;
+			env.ex_bssbase = (caddr_t)PAGESIZE;
+		}
+
+		/*
+		 * If the program needs an interpreter (most do), map it in and
+		 * store relevant information about it in the aux vector, where
+		 * the brand library can find it.
+		 */
+		if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
+		    NULLVPP, &nvp))) {
+			uprintf("%s: not found.", interp);
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			kmem_free(interp, MAXPATHLEN);
+			return (error);
+		}
+
+		kmem_free(interp, MAXPATHLEN);
+		interp = NULL;
+
+		/*
+		 * map in the Linux linker
+		 */
+		if (args->to_model == DATAMODEL_NATIVE) {
+			error = mapexec_brand(nvp, args, &ehdr,
+			    &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+		}
+#if defined(_LP64)
+		else {
+			Elf32_Ehdr	ehdr32;
+			Elf32_Addr	uphdr_vaddr32;
+
+			error = mapexec32_brand(nvp, args, &ehdr32,
+			    &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+
+			Ehdr32to64(&ehdr32, &ehdr);
+
+			if (uphdr_vaddr32 == (Elf32_Addr)-1)
+				uphdr_vaddr = (Addr)-1;
+			else
+				uphdr_vaddr = uphdr_vaddr32;
+		}
+#endif
+
+		VN_RELE(nvp);
+		if (error != 0) {
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			return (error);
+		}
+
+		/*
+		 * Now that we know the base address of the brand's linker,
+		 * we also save this for later use by the brand library.
+		 */
+		edp.ed_base = voffset;
+		edp.ed_ldentry = voffset + ehdr.e_entry;
+	} else {
+		/*
+		 * This program has no interpreter. The lx brand library will
+		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
+		 * so in this case, put the entry point of the main executable
+		 * there.
+		 */
+		if (ehdr.e_type == ET_EXEC) {
+			/*
+			 * An executable with no interpreter, this must be a
+			 * statically linked executable, which means we loaded
+			 * it at the address specified in the elf header, in
+			 * which case the e_entry field of the elf header is an
+			 * absolute address.
+			 */
+			edp.ed_ldentry = ehdr.e_entry;
+			edp.ed_entry = ehdr.e_entry;
+		} else {
+			/*
+			 * A shared object with no interpreter, we use the
+			 * calculated address from above.
+			 */
+			edp.ed_ldentry = edp.ed_entry;
+
+			/*
+			 * In all situations except an ET_DYN elf object with no
+			 * interpreter, we want to leave the brk and base
+			 * values set by mapexec_brand alone. Normally when
+			 * running ET_DYN objects on Solaris (most likely
+			 * /lib/ld.so.1) the kernel sets brk and base to 0 since
+			 * it doesn't know where to put the heap, and later the
+			 * linker will call brk() to initialize the heap in:
+			 *	usr/src/cmd/sgs/rtld/common/setup.c:setup()
+			 * after it has determined where to put it.  (This
+			 * decision is made after the linker loads and inspects
+			 * elf properties of the target executable being run.)
+			 *
+			 * So for ET_DYN Linux executables, we also don't know
+			 * where the heap should go, so we'll set the brk and
+			 * base to 0.  But in this case the Solaris linker will
+			 * not initialize the heap, so when the Linux linker
+			 * starts running there is no heap allocated.  This
+			 * seems to be ok on Linux 2.4 based systems because the
+			 * Linux linker/libc fall back to using mmap() to
+			 * allocate memory. But on 2.6 systems, running
+			 * applications by specifying them as command line
+			 * arguments to the linker results in segfaults for an
+			 * as yet undetermined reason (which seems to indicate
+			 * that a more permanent fix for heap initialization in
+			 * these cases may be necessary).
+			 */
+			if (ehdr.e_type == ET_DYN) {
+				env.ex_bssbase = (caddr_t)0;
+				env.ex_brkbase = (caddr_t)0;
+				env.ex_brksize = 0;
+			}
+		}
+	}
+
+	env.ex_vp = vp;
+	setexecenv(&env);
+
+	/*
+	 * We try to keep /proc's view of the aux vector consistent with
+	 * what's on the process stack.
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		auxv_t phdr_auxv[4] = {
+		    { AT_SUN_BRAND_LX_PHDR, 0 },
+		    { AT_SUN_BRAND_LX_INTERP, 0 },
+		    { AT_SUN_BRAND_LX_CLKTCK, 0 },
+		    { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+		};
+		phdr_auxv[0].a_un.a_val = edp.ed_phdr;
+		phdr_auxv[1].a_un.a_val = ldaddr;
+		phdr_auxv[2].a_un.a_val = hz;
+		phdr_auxv[3].a_un.a_val = lxpd->l_vdso;
+
+		if (copyout(&phdr_auxv, args->auxp_brand,
+		    sizeof (phdr_auxv)) == -1)
+			return (EFAULT);
+	}
+#if defined(_LP64)
+	else {
+		auxv32_t phdr_auxv32[4] = {
+		    { AT_SUN_BRAND_LX_PHDR, 0 },
+		    { AT_SUN_BRAND_LX_INTERP, 0 },
+		    { AT_SUN_BRAND_LX_CLKTCK, 0 },
+		    { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+		};
+		phdr_auxv32[0].a_un.a_val = edp.ed_phdr;
+		phdr_auxv32[1].a_un.a_val = ldaddr;
+		phdr_auxv32[2].a_un.a_val = hz;
+		phdr_auxv32[3].a_un.a_val = lxpd->l_vdso;
+
+		if (copyout(&phdr_auxv32, args->auxp_brand,
+		    sizeof (phdr_auxv32)) == -1)
+			return (EFAULT);
+	}
+#endif
+
+	/*
+	 * /proc uses the AT_ENTRY aux vector entry to deduce
+	 * the location of the executable in the address space. The user
+	 * structure contains a copy of the aux vector that needs to have those
+	 * entries patched with the values of the real lx executable (they
+	 * currently contain the values from the lx brand library that was
+	 * elfexec'd, above).
+	 *
+	 * For live processes, AT_BASE is used to locate the linker segment,
+	 * which /proc and friends will later use to find Solaris symbols
+	 * (such as rtld_db_preinit). However, for core files, /proc uses
+	 * AT_ENTRY to find the right segment to label as the executable.
+	 * So we set AT_ENTRY to be the entry point of the linux executable,
+	 * but leave AT_BASE to be the address of the Solaris linker.
+	 */
+	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
+		switch (up->u_auxv[i].a_type) {
+		case AT_ENTRY:
+			up->u_auxv[i].a_un.a_val = edp.ed_entry;
+			break;
+
+		case AT_SUN_BRAND_LX_PHDR:
+			up->u_auxv[i].a_un.a_val = edp.ed_phdr;
+			break;
+
+		case AT_SUN_BRAND_LX_INTERP:
+			up->u_auxv[i].a_un.a_val = ldaddr;
+			break;
+
+		case AT_SUN_BRAND_LX_CLKTCK:
+			up->u_auxv[i].a_un.a_val = hz;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * Record the brand ELF data and new personality now that the exec has
+	 * proceeded successfully.
+	 */
+	bcopy(&edp, &lxpd->l_elf_data, sizeof (edp));
+	lxpd->l_personality = personality;
+
+	return (0);
+}
+
+boolean_t
+lx_native_exec(uint8_t osabi, const char **interp)
+{
+	proc_t *p = curproc;
+
+	/*
+	 * Only Solaris-ABI objects count as native.  For those, when the
+	 * process root is the zone root, hand back "/native" as the prefix
+	 * for the interpreter path.  VN_CMP need not be exact here, since
+	 * any change of process root is likely to make native binaries
+	 * unreachable through /native anyway, and a process that chroots
+	 * directly into /native works as expected without the prefix.
+	 */
+	if (osabi != ELFOSABI_SOLARIS)
+		return (B_FALSE);
+
+	if (VN_CMP(p->p_user.u_rdir, p->p_zone->zone_rootvp))
+		*interp = "/native";
+
+	return (B_TRUE);
+}
+
+static void
+lx_syscall_init(void)
+{
+	int n;
+
+	/*
+	 * Count the 32-bit Linux system calls.  lx_sysent32 has
+	 * (LX_NSYSCALLS + 1) slots, and the count is the index of the first
+	 * slot without a name (or all of them, if every slot is named).
+	 */
+	n = 0;
+	while (n <= LX_NSYSCALLS && lx_sysent32[n].sy_name != NULL)
+		n++;
+	lx_nsysent32 = n;
+
+#if defined(_LP64)
+	/* Likewise for the 64-bit table, lx_sysent64, of the same size. */
+	n = 0;
+	while (n <= LX_NSYSCALLS && lx_sysent64[n].sy_name != NULL)
+		n++;
+	lx_nsysent64 = n;
+#endif
+}
+
+/* Module load: set up each lx subsystem, then install the brand module. */
+int
+_init(void)
+{
+	int err = 0;
+
+	lx_syscall_init();
+	lx_pid_init();
+	lx_ioctl_init();
+	lx_futex_init();
+	lx_ptrace_init();
+	lx_socket_init();
+
+	err = mod_install(&modlinkage);
+	if (err != 0) {
+		cmn_err(CE_WARN, "Couldn't install lx brand module");
+		/*
+		 * Undo all of the setup above, as _fini() does.  The panic
+		 * is drastic, but it should never happen: the brand wasn't
+		 * loaded, so no Linux process should exist to hold state.
+		 */
+		lx_ptrace_fini();
+		lx_pid_fini();
+		lx_ioctl_fini();
+		lx_socket_fini();
+		if (lx_futex_fini())
+			panic("lx brand module cannot be loaded or unloaded.");
+	}
+	return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop)); /* defer to mod_info() */
+}
+
+/*
+ * Module unload entry point.  Refused with EBUSY while any zone uses the lx
+ * brand; otherwise the subsystems are torn down and the module removed, with
+ * the teardown rolled back if that fails.
+ */
+int
+_fini(void)
+{
+	int err;
+	boolean_t futex_torn_down = B_FALSE;
+
+	/* The brand cannot be unloaded while any zone is still using it. */
+	if (brand_zone_count(&lx_brand) != 0)
+		return (EBUSY);
+
+	lx_ptrace_fini();
+	lx_pid_fini();
+	lx_ioctl_fini();
+	lx_socket_fini();
+
+	/* Only attempt the module removal once the futex state is gone. */
+	err = lx_futex_fini();
+	if (err == 0) {
+		futex_torn_down = B_TRUE;
+		err = mod_remove(&modlinkage);
+	}
+	if (err == 0)
+		return (0);
+
+	/*
+	 * The module is staying loaded, so put back everything torn down
+	 * above to return it to a sane state.  The futex state is only
+	 * rebuilt if its teardown actually went through.
+	 */
+	lx_ptrace_init();
+	lx_pid_init();
+	lx_ioctl_init();
+	lx_socket_init();
+	if (futex_torn_down)
+		lx_futex_init();
+
+	return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
new file mode 100644
index 0000000000..7ede833ca4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -0,0 +1,1103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016, Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/archsystm.h>
+#include <sys/privregs.h>
+#include <sys/exec.h>
+#include <sys/lwp.h>
+#include <sys/sem.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_siginfo.h>
+#include <sys/lx_futex.h>
+#include <lx_errno.h>
+#include <sys/cmn_err.h>
+#include <sys/siginfo.h>
+#include <sys/contract/process_impl.h>
+#include <sys/x86_archext.h>
+#include <sys/sdt.h>
+#include <lx_signum.h>
+#include <lx_syscall.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <net/if.h>
+#include <inet/ip6.h>
+#include <sys/sunddi.h>
+#include <sys/dlpi.h>
+#include <sys/sysmacros.h>
+
+/* Linux specific functions and definitions */
+static void lx_save(klwp_t *);
+static void lx_restore(klwp_t *);
+
+/* A forked child always gets a zero return value; v1 and v2 are unused. */
+/*ARGSUSED*/
+void
+lx_setrval(klwp_t *lwp, int v1, int v2)
+{
+	struct regs *rp = lwptoregs(lwp);
+
+	rp->r_r0 = 0;
+}
+
+/*
+ * Reset process state on exec(2): drop the handler, fs/gsbase, native stack
+ * and TLS settings left over from the old image, and keep ptrace(2) in step.
+ */
+void
+lx_exec()
+{
+	klwp_t *lwp = ttolwp(curthread);
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+	proc_t *p = ttoproc(curthread);
+	lx_proc_data_t *pd = ptolxproc(p);
+	struct regs *rp = lwptoregs(lwp);
+
+	/* b_exec is called without p_lock held */
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * Any l_handler handlers set as a result of B_REGISTER are now
+	 * invalid; clear them.
+	 */
+	pd->l_handler = NULL;
+
+	/*
+	 * If this was a multi-threaded Linux process and this lwp wasn't the
+	 * main lwp, then we need to make its Illumos and Linux PIDs match.
+	 */
+	if (curthread->t_tid != 1) {
+		lx_pid_reassign(curthread);
+	}
+
+	/*
+	 * Inform ptrace(2) that we are processing an execve(2) call so that if
+	 * we are traced we can post either the PTRACE_EVENT_EXEC event or the
+	 * legacy SIGTRAP.
+	 */
+	(void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0);
+
+	/* clear the fs/gsbase values until the app. can reinitialize them */
+	lwpd->br_lx_fsbase = NULL;
+	lwpd->br_ntv_fsbase = NULL;
+	lwpd->br_lx_gsbase = NULL;
+	lwpd->br_ntv_gsbase = NULL;
+
+	/*
+	 * Clear the native stack flags.  This will be reinitialised by
+	 * lx_init() in the new process image.
+	 */
+	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+	lwpd->br_ntv_stack = 0;
+	lwpd->br_ntv_stack_current = 0;
+
+	/* Install the lx_save/lx_restore context handlers on this LWP. */
+	installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save,
+	    NULL);
+
+	/* Clear out the TLS array. */
+	bzero(lwpd->br_tls, sizeof (lwpd->br_tls));
+
+	/*
+	 * Load the now-zeroed TLS entries into the GDT with preemption off.
+	 */
+	kpreempt_disable();
+	lx_restore(lwp);
+	kpreempt_enable();
+
+	/* Grab the updated argv bounds */
+	mutex_enter(&p->p_lock);
+	lx_read_argv_bounds(p);
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * The exec syscall doesn't return (so we don't call lx_syscall_return)
+	 * but for our ptrace emulation we need to do this so that a tracer
+	 * does not get out of sync. We know that by the time this lx_exec
+	 * function is called that the exec has succeeded.
+	 */
+	rp->r_r0 = 0;
+	lx_ptrace_stop(LX_PR_SYSEXIT);
+}
+
+/*
+ * Detach ptrace(2) state and run robust-futex exit handling for an LWP.
+ */
+static void
+lx_cleanlwp(klwp_t *lwp, proc_t *p)
+{
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+	void *rb_list = NULL;
+
+	VERIFY(lwpd != NULL);
+
+	mutex_enter(&p->p_lock);
+	if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
+		lx_ptrace_exit(p, lwp);
+	}
+
+	/* Under p_lock, take the robust list and clear the lwp's pointer. */
+	sprlock_proc(p);
+	rb_list = lwpd->br_robust_list;
+	lwpd->br_robust_list = NULL;
+	sprunlock(p);	/* presumably also drops p_lock -- confirm */
+
+	if (rb_list != NULL) {
+		lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
+	}
+}
+
+/*
+ * LWP exit hook: clean up ptrace/robust-futex state, zero and wake the
+ * clear-child-tid word, queue any exit signal, and free br_scall_args.
+ */
+void
+lx_exitlwp(klwp_t *lwp)
+{
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+	kthread_t *t;
+	sigqueue_t *sqp = NULL;
+	pid_t ppid;
+	id_t ptid;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	if (lwpd == NULL)
+		return;		/* second time thru' */
+
+	lx_cleanlwp(lwp, p);
+
+	if (lwpd->br_clear_ctidp != NULL) {
+		(void) suword32(lwpd->br_clear_ctidp, 0);
+		(void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1,
+		    NULL, NULL, 0);
+		lwpd->br_clear_ctidp = NULL;
+	}
+
+	if (lwpd->br_signal != 0) {
+		/*
+		 * The first thread came from fork(), not clone(), so its
+		 * parent is signalled when the whole process exits instead.
+		 */
+		if (lwpd->br_ptid == -1)
+			goto free;
+
+		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+		/*
+		 * br_ppid == 0 (CLONE_PARENT): signal a process, not a thread.
+		 * NOTE(review): p is still our own process here -- confirm.
+		 */
+		if (lwpd->br_ppid == 0) {
+			mutex_enter(&p->p_lock);
+			ppid = p->p_ppid;
+			t = NULL;
+		} else {
+			/*
+			 * Nobody is signalled if we were reparented to init
+			 * or if our parent thread is gone.
+			 */
+			if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
+			    (ptid == -1))
+				goto free;
+
+			mutex_enter(&pidlock);
+			if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
+				mutex_exit(&pidlock);
+				goto free;
+			}
+			mutex_enter(&p->p_lock);
+			mutex_exit(&pidlock);
+
+			if ((t = idtot(p, ptid)) == NULL) {
+				mutex_exit(&p->p_lock);
+				goto free;
+			}
+		}
+
+		sqp->sq_info.si_signo = lwpd->br_signal;
+		sqp->sq_info.si_code = lwpd->br_exitwhy;
+		sqp->sq_info.si_status = lwpd->br_exitwhat;
+		sqp->sq_info.si_pid = lwpd->br_pid;
+		sqp->sq_info.si_uid = crgetruid(CRED());
+		sigaddqa(p, t, sqp);
+		mutex_exit(&p->p_lock);
+		sqp = NULL;
+	}
+
+free:
+	/* lwpd outlives this hook; do not leave a dangling pointer in it. */
+	if (lwpd->br_scall_args != NULL) {
+		ASSERT(lwpd->br_args_size > 0);
+		kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
+		lwpd->br_scall_args = NULL;
+		lwpd->br_args_size = 0;
+	}
+	if (sqp)
+		kmem_free(sqp, sizeof (sigqueue_t));
+}
+
+/*
+ * Release everything the lx brand attached to an LWP: cgroup membership,
+ * ptrace/robust-futex state, syscall and context hooks, pid, brand data.
+ */
+void
+lx_freelwp(klwp_t *lwp)
+{
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+	lx_zone_data_t *lxzdata;
+	vfs_t *cgrp;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	if (lwpd == NULL) {
+		/*
+		 * An LX-branded process can own LWPs with no brand data of
+		 * their own.  While a native binary is being exec'd, the
+		 * process is preemptively branded so that hooks such as
+		 * b_native_exec work, but its other LWPs are _not_ branded
+		 * since they exit if the exec succeeds.  lx_freelwp is then
+		 * called on those unbranded LWPs as they exit, and it is
+		 * acceptable to bypass the hook.
+		 */
+		return;
+	}
+
+	/* cgroup integration: notify the zone's cgroup vfs, if there is one */
+	lxzdata = ztolxzd(p->p_zone);
+	mutex_enter(&lxzdata->lxzd_lock);
+	cgrp = lxzdata->lxzd_cgroup;
+	if (cgrp != NULL) {
+		VFS_HOLD(cgrp);
+		mutex_exit(&lxzdata->lxzd_lock);
+		ASSERT(lx_cgrp_freelwp != NULL);
+		(*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+		    lwpd->br_pid);
+		VFS_RELE(cgrp);
+	} else {
+		mutex_exit(&lxzdata->lxzd_lock);
+	}
+
+	/*
+	 * It is possible for the lx_freelwp hook to be called without a prior
+	 * call to lx_exitlwp being made.  This happens as part of lwp
+	 * de-branding when a native binary is executed from a branded process.
+	 *
+	 * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well
+	 * as here in lx_freelwp.  When the second call is redundant, the
+	 * resources will already be freed and no work will be needed.
+	 */
+	lx_cleanlwp(lwp, p);
+
+	/* Remove our system call interposer. */
+	lwp->lwp_brand_syscall = NULL;
+	lwp->lwp_brand_syscall_fast = NULL;
+
+	(void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
+	    lx_save, NULL);
+	if (lwpd->br_pid != 0) {
+		lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
+	}
+
+	/*
+	 * Ensure that lx_ptrace_exit() has been called to detach
+	 * ptrace(2) tracers and tracees.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == NULL);
+	VERIFY(lwpd->br_ptrace_accord == NULL);
+
+	lwp->lwp_brand = NULL;
+	kmem_free(lwpd, sizeof (struct lx_lwp_data));
+}
+
+/* Allocate brand data (and, when needed, a pid) for a soon-to-exist LWP. */
+void *
+lx_lwpdata_alloc(proc_t *p)
+{
+	struct pid *pidp = NULL;
+	pid_t newpid = 0;
+	lx_lwp_data_t *lwpd;
+	struct lx_pid *lpidp;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * Every LWP after the first needs a pid of its own to emulate Linux's
+	 * goofy thread model.  The allocation may turn out to be unnecessary
+	 * when a single-lwp process is being branded, but it cannot wait
+	 * until b_initlwp because p_lock is held there.
+	 */
+	if (p->p_lwpcnt > 0) {
+		newpid = pid_allocate(p, 0, 0);
+		if (newpid < 0)
+			return (NULL);
+		pidp = pid_find(newpid);
+	}
+
+	lwpd = kmem_zalloc(sizeof (*lwpd), KM_SLEEP);
+	lpidp = kmem_zalloc(sizeof (*lpidp), KM_SLEEP);
+	lpidp->l_pid = newpid;
+	lpidp->l_pidp = pidp;
+	lwpd->br_lpid = lpidp;
+	return (lwpd);
+}
+
+/*
+ * Free lwp brand data when lwp_create fails.  After lx_initlwp has tied the
+ * data to an lwp, lx_freelwp releases those resources instead.
+ */
+void
+lx_lwpdata_free(void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = lwpbd;
+	struct lx_pid *lpidp;
+
+	VERIFY(lwpd != NULL);
+	VERIFY(lwpd->br_lpid != NULL);
+	lpidp = lwpd->br_lpid;
+	if (lpidp->l_pidp != NULL)
+		(void) pid_rele(lpidp->l_pidp);
+	kmem_free(lpidp, sizeof (*lpidp));
+	kmem_free(lwpd, sizeof (*lwpd));
+}
+
+/*
+ * Attach the brand data from lx_lwpdata_alloc() to a new LWP: record its
+ * parent, give it an lx pid, and install the context and syscall hooks.
+ */
+void
+lx_initlwp(klwp_t *lwp, void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+	kthread_t *tp = lwptot(lwp);
+	proc_t *p = lwptoproc(lwp);
+	lx_zone_data_t *lxzdata;
+	vfs_t *cgrp;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(lwp->lwp_brand == NULL);
+
+	lwpd->br_exitwhy = CLD_EXITED;
+	lwpd->br_lwp = lwp;
+	lwpd->br_clear_ctidp = NULL;
+	lwpd->br_set_ctidp = NULL;
+	lwpd->br_signal = 0;
+	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+	/*
+	 * lwpd->br_affinitymask was zeroed by kmem_zalloc()
+	 * as was lwpd->br_scall_args and lwpd->br_args_size.
+	 */
+
+	/*
+	 * The first thread in a process has ppid set to the parent
+	 * process's pid, and ptid set to -1.  Subsequent threads in the
+	 * process have their ppid set to the pid of the thread that
+	 * created them, and their ptid to that thread's tid.
+	 */
+	if (tp->t_next == tp) {
+		lwpd->br_ppid = tp->t_procp->p_ppid;
+		lwpd->br_ptid = -1;
+	} else if (plwpd != NULL) {
+		bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
+		lwpd->br_ppid = plwpd->br_pid;
+		lwpd->br_ptid = curthread->t_tid;
+		/* The child inherits the fs/gsbase values from the parent */
+		lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
+		lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
+		lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
+		lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
+	} else {
+		/*
+		 * Oddball case: the parent thread isn't a Linux process.
+		 */
+		lwpd->br_ppid = 0;
+		lwpd->br_ptid = -1;
+	}
+	lwp->lwp_brand = lwpd;
+
+	/*
+	 * During lx_lwpdata_alloc, we must decide whether or not to
+	 * allocate a new pid to associate with the lwp. Since p_lock is not
+	 * held at that point, the only time we can guarantee a new pid isn't
+	 * needed is when p_lwpcnt == 0.  This is because other lwps won't be
+	 * present to race with us with regards to pid allocation.
+	 *
+	 * This means that in all other cases (where p_lwpcnt > 0), we expect
+	 * that lx_lwpdata_alloc will allocate a pid for us to use here, even
+	 * if it is unneeded.  If this process is undergoing an exec, for
+	 * example, the single existing lwp will not need a new pid when it is
+	 * rebranded.  In that case, lx_pid_assign will free the unneeded pid.
+	 */
+	VERIFY(lwpd->br_lpid->l_pidp != NULL || p->p_lwpcnt == 0);
+
+	lx_pid_assign(tp, lwpd->br_lpid);
+	lwpd->br_tgid = lwpd->br_pid;
+	/*
+	 * Having performed the lx pid assignment, the lpid reference is no
+	 * longer needed.  The underlying data will be freed during lx_freelwp.
+	 */
+	lwpd->br_lpid = NULL;
+
+	installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
+	    lx_save, NULL);
+
+	/* Install branded system call hooks for this LWP: */
+	lwp->lwp_brand_syscall = lx_syscall_enter;
+	lwp->lwp_brand_syscall_fast = lx_syscall_fast_enter;
+
+	/* The new LWP inherits the parent LWP cgroup ID. */
+	if (plwpd != NULL) {
+		lwpd->br_cgroupid = plwpd->br_cgroupid;
+	}
+	lxzdata = ztolxzd(p->p_zone);
+	mutex_enter(&lxzdata->lxzd_lock);
+	cgrp = lxzdata->lxzd_cgroup;
+	if (cgrp != NULL) {
+		VFS_HOLD(cgrp);
+		mutex_exit(&lxzdata->lxzd_lock);
+		ASSERT(lx_cgrp_initlwp != NULL);
+		(*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+		    lwpd->br_pid);
+		VFS_RELE(cgrp);
+	} else {
+		mutex_exit(&lxzdata->lxzd_lock);
+	}
+}
+
+void
+lx_initlwp_post(klwp_t *lwp)
+{
+	lx_lwp_data_t *parent = ttolxlwp(curthread);
+
+	/*
+	 * A branded creating LWP may have a ptrace(2) tracer to pass down.
+	 */
+	if (parent == NULL)
+		return;
+	lx_ptrace_inherit_tracer(parent, lwptolxlwp(lwp));
+}
+
+/*
+ * Seed a forked child's brand data from the parent LWP: parent pid/tid,
+ * TLS descriptors, native stack state and the CPU-bound flag.
+ *
+ * No locking is needed for either struct lx_lwp_data.  This always runs
+ * in the thread context of the source thread, and the destination
+ * thread is always newly created and not referred to from anywhere
+ * else.
+ */
+void
+lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
+{
+	struct lx_lwp_data *parent = srclwp->lwp_brand;
+	struct lx_lwp_data *child = dstlwp->lwp_brand;
+
+	child->br_ppid = parent->br_pid;
+	child->br_ptid = lwptot(srclwp)->t_tid;
+	bcopy(parent->br_tls, child->br_tls, sizeof (child->br_tls));
+
+	if (parent->br_stack_mode == LX_STACK_MODE_BRAND ||
+	    parent->br_stack_mode == LX_STACK_MODE_NATIVE) {
+		/*
+		 * An alternate stack is installed in the parent LWP, so give
+		 * the child the same mode, stack base and extent.
+		 */
+		child->br_stack_mode = parent->br_stack_mode;
+		child->br_ntv_stack = parent->br_ntv_stack;
+		child->br_ntv_stack_current = parent->br_ntv_stack_current;
+	} else {
+		/*
+		 * In any other mode the child's stack data is cleared.
+		 */
+		child->br_stack_mode = LX_STACK_MODE_PREINIT;
+		child->br_ntv_stack = 0;
+		child->br_ntv_stack_current = 0;
+	}
+
+	/*
+	 * BR_CPU_BOUND is the only lwp flag carried over to the child.
+	 */
+	child->br_lwp_flags = parent->br_lwp_flags & BR_CPU_BOUND;
+	child->br_scall_args = NULL;
+}
+
+/*
+ * Clear the lx TLS slots in the GDT when a Linux LWP is switched off CPU.
+ */
+/* ARGSUSED */
+static void
+lx_save(klwp_t *t)
+{
+	int slot;
+
+#if defined(__amd64)
+	reset_sregs();
+#endif
+	for (slot = GDT_TLSMIN; slot < GDT_TLSMIN + LX_TLSNUM; slot++)
+		gdt_update_usegd(slot, &null_udesc);
+}
+
+/*
+ * Context-switch "restore" handler.  When switching a Linux process on the
+ * CPU, load its GDT TLS entries from the descriptors kept in the LWP's
+ * brand data.
+ *
+ * For 64-bit code we don't have to worry about explicitly setting the
+ * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
+ * automatically in update_sregs if we are executing in user-land. If this
+ * is the case then pcb_rupdate should be set.
+ */
+static void
+lx_restore(klwp_t *t)
+{
+	struct lx_lwp_data *lwpd = lwptolxlwp(t);
+	int slot;
+
+	ASSERT(lwpd);
+
+	for (slot = 0; slot < LX_TLSNUM; slot++)
+		gdt_update_usegd(GDT_TLSMIN + slot, &lwpd->br_tls[slot]);
+}
+
+void
+lx_set_gdt(int entry, user_desc_t *descrp)
+{
+	/* Install the caller's descriptor in the given GDT slot. */
+	gdt_update_usegd(entry, descrp);
+}
+
+void
+lx_clear_gdt(int entry)
+{
+	gdt_update_usegd(entry, &null_udesc);	/* null descriptor */
+}
+
+longlong_t
+lx_nosys()
+{
+	return (set_errno(ENOSYS));	/* fail with ENOSYS */
+}
+
+/*
+ * Brand hook that vets a segment register value which is not one of the
+ * standard Solaris selectors, returning the value that should be used.
+ */
+/*ARGSUSED*/
+greg_t
+lx_fixsegreg(greg_t sr, model_t datamodel)
+{
+	uint16_t slot = SELTOIDX(sr);
+
+	ASSERT(sr == (sr & 0xffff));
+
+	/* A GDT selector in the TLS range is valid; return it with SEL_UPL. */
+	if (!SELISLDT(sr) && slot >= GDT_TLSMIN && slot <= GDT_TLSMAX)
+		return (sr | SEL_UPL);
+
+	/*
+	 * Anything else is forced into the LDT in ring 3 for 32-bit
+	 * processes.  64-bit processes get the null GDT selector since they
+	 * are not allowed to have a private LDT.  On i386 the datamodel
+	 * argument is not consulted.
+	 */
+#if defined(__amd64)
+	if (datamodel == DATAMODEL_ILP32)
+		return (sr | SEL_TI_LDT | SEL_UPL);
+	return (0);
+#elif defined(__i386)
+	datamodel = datamodel;  /* datamodel currently unused for 32-bit */
+	return (sr | SEL_TI_LDT | SEL_UPL);
+#endif	/* __amd64 */
+}
+
+/*
+ * Brand hook mapping the fsbase read from the register to the native fsbase
+ * the kernel uses to locate the ulwp_t.
+ */
+uintptr_t
+lx_fsbase(klwp_t *lwp, uintptr_t fsbase)
+{
+	lx_lwp_data_t *bd = lwp->lwp_brand;
+
+	/* Only in brand stack mode, with a native fsbase recorded, swap. */
+	if (bd->br_stack_mode == LX_STACK_MODE_BRAND &&
+	    bd->br_ntv_fsbase != NULL)
+		return (bd->br_ntv_fsbase);
+
+	return (fsbase);
+}
+
+/*
+ * lx stand-in for winfo(): describe child pp in *ip, using the signal
+ * designated in the lx process data rather than always SIGCLD.
+ */
+static void
+lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
+{
+	ASSERT(MUTEX_HELD(&pidlock));
+	bzero(ip, sizeof (*ip));
+	ip->si_pid = pp->p_pid;
+	ip->si_ctid = PRCTID(pp);
+	ip->si_zoneid = pp->p_zone->zone_id;
+	ip->si_signo = ltos_signo[dat->l_signal];
+	ip->si_code = pp->p_wcode;
+	ip->si_status = pp->p_wdata;
+	ip->si_utime = pp->p_utime;
+	ip->si_stime = pp->p_stime;
+}
+
+/* Fill in sqp for child cp and queue it to the parent; pidlock is held. */
+static void
+lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
+{
+	proc_t *parent = cp->p_parent;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	mutex_enter(&parent->p_lock);
+	/*
+	 * Linux does not queue SIGCHLD or any other non-RT signal, so
+	 * whatever signal we have is simply delivered as-is.
+	 */
+	ASSERT(sqp != NULL);
+	lx_winfo(cp, &sqp->sq_info, dat);
+	sigaddqa(parent, NULL, sqp);
+	mutex_exit(&parent->p_lock);
+}
+
+
+/*
+ * Brand specific code for exiting and sending a signal to the parent, as
+ * opposed to sigcld().  Must be called with pidlock held, and only for the
+ * CLD_EXITED, CLD_DUMPED and CLD_KILLED wait codes.  sqp is consumed here:
+ * it is either queued for the parent or released via siginfofree().
+ */
+void
+lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
+{
+	proc_t *pp = cp->p_parent;
+	lx_proc_data_t *lx_brand_data = ptolxproc(cp);
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	switch (cp->p_wcode) {
+	case CLD_EXITED:
+	case CLD_DUMPED:
+	case CLD_KILLED:
+			ASSERT(cp->p_stat == SZOMB);
+			/*
+			 * The broadcast on p_srwchan_cv is a kludge to
+			 * wakeup a possible thread in uadmin(A_SHUTDOWN).
+			 */
+			cv_broadcast(&cp->p_srwchan_cv);
+
+			/* Add to newstate list of the parent */
+			add_ns(pp, cp);
+
+			cv_broadcast(&pp->p_cv);
+			if ((pp->p_flag & SNOWAIT) ||
+			    PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
+				if (!(cp->p_pidflag & CLDWAITPID))
+					freeproc(cp);
+			} else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
+			    lx_brand_data->l_signal != 0) {
+				lx_post_exit_sig(cp, sqp, lx_brand_data);
+				sqp = NULL;
+			}
+			break;
+
+	case CLD_STOPPED:
+	case CLD_CONTINUED:
+	case CLD_TRAPPED:
+			panic("Should not be called in this case");
+	}
+
+	if (sqp)
+		siginfofree(sqp);
+}
+
+/*
+ * Decide whether child cp is eligible for the wait being performed by the
+ * current LWP.  The wait flags were stored beforehand by a separate syscall
+ * using the B_STORE_ARGS mechanism; with no such emulation in effect, or
+ * with __WALL set, every child passes.
+ *
+ * Otherwise the answer depends on whether the child is a "clone": in Linux
+ * terms, one that does not deliver SIGCHLD to its parent.  __WCLONE waits
+ * only on clones; without it, only normal children are waited on:
+ *
+ *                   default    __WCLONE
+ *   no SIGCHLD      -           X
+ *   SIGCHLD         X           -
+ *
+ * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
+ * process exit.
+ *
+ * More information on wait in lx brands can be found at
+ * usr/src/lib/brand/lx/lx_brand/common/wait.c.
+ */
+boolean_t
+lx_wait_filter(proc_t *pp, proc_t *cp)
+{
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+	int flags = lwpd->br_waitid_flags;
+	boolean_t ret;
+
+	if (!lwpd->br_waitid_emulate) {
+		return (B_TRUE);
+	}
+
+	mutex_enter(&cp->p_lock);
+	if ((flags & LX_WALL) != 0) {
+		/* __WALL: every child matches. */
+		ret = B_TRUE;
+	} else {
+		lx_proc_data_t *pd = ptolxproc(cp);
+		boolean_t wants_clone, sends_sigchld;
+
+		wants_clone = ((flags & LX_WCLONE) != 0) ? B_TRUE : B_FALSE;
+
+		/*
+		 * clone can pick an alternate signal for the child to deliver
+		 * to its parent on exit.  A child with no lx process data is
+		 * treated as delivering SIGCHLD.
+		 */
+		sends_sigchld = (pd == NULL ||
+		    pd->l_signal == stol_signo[SIGCHLD]) ? B_TRUE : B_FALSE;
+
+		/* Match when exactly one of the two holds (the XOR). */
+		ret = (wants_clone != sends_sigchld) ? B_TRUE : B_FALSE;
+	}
+	mutex_exit(&cp->p_lock);
+
+	return (ret);
+}
+
+void
+lx_ifname_convert(char *ifname, lx_if_action_t act)
+{
+	boolean_t to_native = (act == LX_IF_TONATIVE) ? B_TRUE : B_FALSE;
+	const char *from = to_native ? "lo" : "lo0";
+	const char *to = to_native ? "lo0" : "lo";
+
+	/* Only the loopback interface is named differently ("lo" vs "lo0"). */
+	if (strncmp(ifname, from, IFNAMSIZ) == 0)
+		(void) strlcpy(ifname, to, IFNAMSIZ);
+}
+
+void
+lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
+{
+	const uint64_t in = *flags;
+	uint64_t out;
+
+	/* These flags are passed through unchanged in either direction. */
+	out = in & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
+	    IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
+	    IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
+
+	/* Linux has different shift for multicast flag */
+	if (act == LX_IF_TONATIVE) {
+		out |= ((in & 0x1000) != 0) ? IFF_MULTICAST : 0;
+	} else {
+		out |= ((in & IFF_MULTICAST) != 0) ? 0x1000 : 0;
+	}
+	*flags = out;
+}
+
+/*
+ * Map an IPv6 address to the scope value reported in /proc/net/if_inet6.
+ */
+unsigned int
+lx_ipv6_scope_convert(const in6_addr_t *addr)
+{
+	/* The tests are ordered; the first classification that matches wins. */
+	if (IN6_IS_ADDR_V4COMPAT(addr))
+		return (LX_IPV6_ADDR_COMPATv4);
+	if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback))
+		return (LX_IPV6_ADDR_LOOPBACK);
+	if (IN6_IS_ADDR_LINKLOCAL(addr))
+		return (LX_IPV6_ADDR_LINKLOCAL);
+	if (IN6_IS_ADDR_SITELOCAL(addr))
+		return (LX_IPV6_ADDR_SITELOCAL);
+	/* Anything else is reported with a scope of zero. */
+	return (0x0000U);
+}
+
+
+void
+lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
+{
+	/* Never copy more of the link-layer address than sa_data can hold. */
+	int len = MIN(src->sdl_alen, sizeof (dst->sa_data));
+
+	/* Translate the native link type to a Linux ARPHRD value. */
+	if (src->sdl_type == DL_ETHER) {
+		dst->sa_family = LX_ARPHRD_ETHER;
+	} else if (src->sdl_type == DL_LOOP) {
+		dst->sa_family = LX_ARPHRD_LOOPBACK;
+	} else {
+		/* Any other link type is reported as void. */
+		dst->sa_family = LX_ARPHRD_VOID;
+	}
+
+	bcopy(LLADDR(src), dst->sa_data, len);
+	*size = len;
+}
+
+/*
+ * Brand hook which converts a native kernel siginfo to Linux values in place:
+ * the signal number, si_status, si_code, errno and pid are all rewritten.  It
+ * is similar to stol_ksiginfo_copyout() but does not copy anything out.
+ */
+void
+lx_sigfd_translate(k_siginfo_t *infop)
+{
+	struct zone *zone = curproc->p_zone;
+
+	infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
+	infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
+	infop->si_code = lx_stol_sigcode(infop->si_code);
+	infop->si_errno = lx_errno(infop->si_errno, EINVAL);
+
+	/* The zone's init process becomes pid 1, and zsched becomes pid 0. */
+	if (infop->si_pid == zone->zone_proc_initpid) {
+		infop->si_pid = 1;
+	} else if (infop->si_pid == zone->zone_zsched->p_pid) {
+		infop->si_pid = 0;
+	}
+}
+
+/*
+ * Convert a native k_siginfo_t to its Linux form and copy it out to "ulxsip".
+ */
+int
+stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+	lx_siginfo_t lsi;
+
+	bzero(&lsi, sizeof (lsi));
+	/* NOTE(review): fallback is native SIGCLD, not LX_SIGCHLD; confirm. */
+	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+	/* The remaining fields depend on the translated signal number. */
+	switch (lsi.lsi_signo) {
+	case LX_SIGPOLL:
+		lsi.lsi_band = sip->si_band;
+		lsi.lsi_fd = sip->si_fd;
+		break;
+	case LX_SIGCHLD:
+		lsi.lsi_pid = sip->si_pid;
+		if (sip->si_code > 0 && sip->si_code != CLD_EXITED) {
+			lsi.lsi_status = lx_stol_status(sip->si_status,
+			    SIGKILL);
+		} else {
+			lsi.lsi_status = sip->si_status;
+		}
+		lsi.lsi_utime = sip->si_utime;
+		lsi.lsi_stime = sip->si_stime;
+		break;
+	case LX_SIGILL:
+	case LX_SIGBUS:
+	case LX_SIGFPE:
+	case LX_SIGSEGV:
+		lsi.lsi_addr = sip->si_addr;
+		break;
+	default:
+		lsi.lsi_pid = sip->si_pid;
+		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+		break;
+	}
+
+	return ((copyout(&lsi, ulxsip, sizeof (lsi)) != 0) ?
+	    set_errno(EFAULT) : 0);
+}
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * As stol_ksiginfo_copyout(), but producing the 32-bit lx_siginfo32_t form.
+ */
+int
+stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+	lx_siginfo32_t lsi;
+
+	bzero(&lsi, sizeof (lsi));
+	/* NOTE(review): fallback is native SIGCLD, not LX_SIGCHLD; confirm. */
+	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+	/* The remaining fields depend on the translated signal number. */
+	switch (lsi.lsi_signo) {
+	case LX_SIGPOLL:
+		lsi.lsi_band = sip->si_band;
+		lsi.lsi_fd = sip->si_fd;
+		break;
+	case LX_SIGCHLD:
+		lsi.lsi_pid = sip->si_pid;
+		if (sip->si_code > 0 && sip->si_code != CLD_EXITED) {
+			lsi.lsi_status = lx_stol_status(sip->si_status,
+			    SIGKILL);
+		} else {
+			lsi.lsi_status = sip->si_status;
+		}
+		lsi.lsi_utime = sip->si_utime;
+		lsi.lsi_stime = sip->si_stime;
+		break;
+	case LX_SIGILL:
+	case LX_SIGBUS:
+	case LX_SIGFPE:
+	case LX_SIGSEGV:
+		lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
+		break;
+	default:
+		lsi.lsi_pid = sip->si_pid;
+		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+		break;
+	}
+
+	return ((copyout(&lsi, ulxsip, sizeof (lsi)) != 0) ?
+	    set_errno(EFAULT) : 0);
+}
+#endif
+
+/*
+ * Linux uses the original bounds of the argv array when determining the
+ * contents of /proc/<pid>/cmdline.  We mimic those bounds using argv[0] and
+ * envp[0] as the beginning and end, respectively.
+ */
+void
+lx_read_argv_bounds(proc_t *p)
+{
+	user_t *up = PTOU(p);
+	lx_proc_data_t *pd = ptolxproc(p);
+	uintptr_t addr_arg = up->u_argv;
+	uintptr_t addr_env = up->u_envp;
+	uintptr_t arg_start = 0, env_start = 0, env_end = 0;
+	int i = 0;
+
+	VERIFY(pd != NULL);
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * Use AT_SUN_PLATFORM in the aux vector to find the end of the envp
+	 * strings.  If no such entry is present, env_end remains zero.
+	 */
+	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
+		if (up->u_auxv[i].a_type == AT_SUN_PLATFORM) {
+			env_end = (uintptr_t)up->u_auxv[i].a_un.a_val;
+		}
+	}
+
+	mutex_exit(&p->p_lock);		/* not held across copyin() */
+#if defined(_LP64)
+	if (p->p_model != DATAMODEL_NATIVE) {
+		uint32_t buf32;
+		if (copyin((void *)addr_arg, &buf32, sizeof (buf32)) == 0) {
+			arg_start = (uintptr_t)buf32;
+		}
+		if (copyin((void *)addr_env, &buf32, sizeof (buf32)) == 0) {
+			env_start = (uintptr_t)buf32;
+		}
+	} else
+#endif /* defined(_LP64) */
+	{
+		uintptr_t buf;
+		if (copyin((void *)addr_arg, &buf, sizeof (buf)) == 0) {
+			arg_start = buf;
+		}
+		if (copyin((void *)addr_env, &buf, sizeof (buf)) == 0) {
+			env_start = buf;
+		}
+	}
+	mutex_enter(&p->p_lock);	/* retaken before updating pd */
+	pd->l_args_start = arg_start;
+	pd->l_envs_start = env_start;
+	pd->l_envs_end = env_end;
+}
+
+/*
+ * Given an LX LWP, determine where user register state is stored.  When the
+ * result is LX_REG_LOC_UCP, the ucontext_t address is returned through "ucp".
+ */
+lx_regs_location_t
+lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
+{
+	boolean_t in_lwp = B_FALSE;
+
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		/*
+		 * The LWP was stopped with the brand stack and register state
+		 * loaded, e.g. during a syscall emulated within the kernel.
+		 */
+		in_lwp = B_TRUE;
+	} else if (for_write) {
+		/*
+		 * Outside of the brand stack mode, the LWP register state may
+		 * not be written; only a saved ucontext_t (below) will do.
+		 */
+		in_lwp = B_FALSE;
+	} else if (lwpd->br_stack_mode == LX_STACK_MODE_PREINIT) {
+		/* The LWP may have been stopped by tracing on exec. */
+		if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
+		    lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
+			in_lwp = B_TRUE;
+		}
+	} else if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+		/* Check for a ptrace event stop induced by lx_exec. */
+		if (lwpd->br_ptrace_whystop == PR_BRAND &&
+		    lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
+			in_lwp = B_TRUE;
+		}
+	}
+
+	if (in_lwp) {
+		return (LX_REG_LOC_LWP);
+	}
+
+	if (lwpd->br_ptrace_stopucp != NULL) {
+		/*
+		 * The LWP was stopped in the usermode emulation library
+		 * but a ucontext_t for the preserved brand stack and
+		 * register state was provided.  Return the register state
+		 * from that ucontext_t.
+		 */
+		VERIFY(ucp != NULL);
+		*ucp = (void *)lwpd->br_ptrace_stopucp;
+		return (LX_REG_LOC_UCP);
+	}
+
+	return (LX_REG_LOC_UNAVAIL);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c
new file mode 100644
index 0000000000..40179bbdaf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_pid.c
@@ -0,0 +1,395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/var.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/brand.h>
+#include <sys/zone.h>
+#include <sys/lx_brand.h>
+
+#define	LINUX_PROC_FACTOR	8	/* factor down the hash table by this */
+static int hash_len = 4;		/* desired average hash chain length */
+static int hash_size;			/* no of buckets (a power of two) */
+
+static struct lx_pid **stol_pid_hash;	/* keyed by native pid and tid */
+static struct lx_pid **ltos_pid_hash;	/* keyed by Linux pid */
+
+#define	LTOS_HASH(pid)		((pid) & (hash_size - 1))
+#define	STOL_HASH(pid, tid)	(((pid) + (tid)) & (hash_size - 1))
+
+static kmutex_t hash_lock;		/* protects both hash tables */
+
+static void
+lx_pid_insert_hash(struct lx_pid *lpidp)
+{
+	struct lx_pid **sbucket, **lbucket;
+
+	ASSERT(MUTEX_HELD(&hash_lock));
+	/* Push the entry onto the head of its chain in each table. */
+	sbucket = &stol_pid_hash[STOL_HASH(lpidp->s_pid, lpidp->s_tid)];
+	lbucket = &ltos_pid_hash[LTOS_HASH(lpidp->l_pid)];
+	lpidp->stol_next = *sbucket;
+	lpidp->ltos_next = *lbucket;
+	*sbucket = lpidp;
+	*lbucket = lpidp;
+}
+
+static struct lx_pid *
+lx_pid_remove_hash(pid_t pid, id_t tid)
+{
+	struct lx_pid **linkp;
+	struct lx_pid *found = NULL;
+
+	ASSERT(MUTEX_HELD(&hash_lock));
+
+	/* Unlink the entry for (pid, tid) from the native-to-Linux table. */
+	for (linkp = &stol_pid_hash[STOL_HASH(pid, tid)]; *linkp != NULL;
+	    linkp = &(*linkp)->stol_next) {
+		if ((*linkp)->s_pid == pid && (*linkp)->s_tid == tid) {
+			found = *linkp;
+			*linkp = found->stol_next;
+			break;
+		}
+	}
+
+	/*
+	 * when called during error recovery the pid may already
+	 * be released
+	 */
+	if (found == NULL)
+		return (NULL);
+
+	/* Unlink the same entry from the Linux-to-native table. */
+	for (linkp = &ltos_pid_hash[LTOS_HASH(found->l_pid)]; *linkp != NULL;
+	    linkp = &(*linkp)->ltos_next) {
+		if (*linkp == found) {
+			*linkp = found->ltos_next;
+			break;
+		}
+	}
+
+	return (found);
+}
+
+/*
+ * Given a thread (a native pid/tid pair), assign it a Linux pid using "lpidp".
+ */
+void
+lx_pid_assign(kthread_t *t, struct lx_pid *lpidp)
+{
+	proc_t *p = ttoproc(t);
+	lx_lwp_data_t *lwpd = ttolxlwp(t);
+	pid_t s_pid = p->p_pid;
+	id_t s_tid = t->t_tid;
+
+	/*
+	 * When lx_initlwp is called from lx_setbrand, p_lwpcnt will already be
+	 * equal to 1. Since lx_initlwp is being called against an lwp that
+	 * already exists, an additional pid allocation is not necessary.
+	 *
+	 * We check for this by testing br_ppid == 0.
+	 */
+	if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) {
+		/*
+		 * Assign allocated pid to any thread other than the first.
+		 * The l_pid and l_pidp fields should be populated.
+		 */
+		VERIFY(lpidp->l_pidp != NULL);
+		VERIFY(lpidp->l_pid != 0);
+	} else {
+		/*
+		 * There are cases where a pid is speculatively allocated but
+		 * is not needed.  We are obligated to free it here.
+		 */
+		if (lpidp->l_pidp != NULL) {
+			(void) pid_rele(lpidp->l_pidp);
+		}
+		lpidp->l_pidp = NULL;
+		lpidp->l_pid = s_pid;
+	}
+
+	lpidp->s_pid = s_pid;
+	lpidp->s_tid = s_tid;
+	lpidp->l_start = t->t_start;
+
+	/*
+	 * Now put the pid into the linux-solaris and solaris-linux
+	 * conversion hash tables.
+	 */
+	mutex_enter(&hash_lock);
+	lx_pid_insert_hash(lpidp);
+	mutex_exit(&hash_lock);
+
+	lwpd->br_pid = lpidp->l_pid;
+}
+
+/*
+ * If we are exec()ing the process, this thread's tid is about to be reset
+ * to 1.  Make sure the Linux PID bookkeeping reflects that change.
+ */
+void
+lx_pid_reassign(kthread_t *t)
+{
+	proc_t *p = ttoproc(t);
+	struct pid *old_pidp;
+	struct lx_pid *lpidp;
+
+	ASSERT(p->p_lwpcnt == 1);
+
+	mutex_enter(&hash_lock);
+
+	/*
+	 * Clean up all the traces of this thread's 'fake' Linux PID.
+	 */
+	lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid);
+	ASSERT(lpidp != NULL);
+	old_pidp = lpidp->l_pidp;
+	lpidp->l_pidp = NULL;
+
+	/*
+	 * Now register this thread as (pid, 1), with a Linux pid equal to pid.
+	 */
+	lpidp->l_pid = p->p_pid;
+	lpidp->s_pid = p->p_pid;
+	lpidp->s_tid = 1;
+	lx_pid_insert_hash(lpidp);
+
+	mutex_exit(&hash_lock);
+
+	if (old_pidp)		/* released only after hash_lock is dropped */
+		(void) pid_rele(old_pidp);
+}
+
+/*
+ * Remove the mapping for a native pid/tid pair, if any, and free it.
+ */
+void
+lx_pid_rele(pid_t pid, id_t tid)
+{
+	struct lx_pid *lpidp;
+
+	mutex_enter(&hash_lock);
+	lpidp = lx_pid_remove_hash(pid, tid);
+	mutex_exit(&hash_lock);
+
+	if (lpidp == NULL)
+		return;
+
+	if (lpidp->l_pidp != NULL)
+		(void) pid_rele(lpidp->l_pidp);
+	kmem_free(lpidp, sizeof (*lpidp));
+}
+
+/*
+ * Given a Linux pid, return the native pid/tid pair through "s_pid" and
+ * "s_tid", either of which may be NULL.  Returns 0 on success, -1 otherwise.
+ */
+int
+lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid)
+{
+	struct lx_pid *hp;
+	boolean_t is_native;
+
+	if (l_pid == 1) {
+		/*
+		 * Linux pid 1 is the init process of the current zone; fail
+		 * if no init pid is recorded for the zone.
+		 */
+		pid_t initpid = curzone->zone_proc_initpid;
+
+		if (initpid == -1)
+			return (-1);
+		if (s_pid != NULL)
+			*s_pid = initpid;
+		if (s_tid != NULL)
+			*s_tid = 1;
+		return (0);
+	}
+
+	/*
+	 * Walk the hash chain for this Linux pid.  The outputs are filled in
+	 * while hash_lock is still held; afterwards "hp" serves only as an
+	 * indication of whether a match was found.
+	 */
+	mutex_enter(&hash_lock);
+	hp = ltos_pid_hash[LTOS_HASH(l_pid)];
+	while (hp != NULL && hp->l_pid != l_pid)
+		hp = hp->ltos_next;
+	if (hp != NULL) {
+		if (s_pid != NULL)
+			*s_pid = hp->s_pid;
+		if (s_tid != NULL)
+			*s_tid = hp->s_tid;
+	}
+	mutex_exit(&hash_lock);
+	if (hp != NULL)
+		return (0);
+
+	/*
+	 * We didn't find this pid in our translation table.
+	 * But this still could be the pid of a native process
+	 * running in the current zone so check for that here.
+	 *
+	 * Note that prfind() only searches for processes in the current zone.
+	 */
+	mutex_enter(&pidlock);
+	is_native = (prfind(l_pid) != NULL) ? B_TRUE : B_FALSE;
+	mutex_exit(&pidlock);
+	if (!is_native)
+		return (-1);
+
+	if (s_pid != NULL)
+		*s_pid = l_pid;
+	if (s_tid != NULL)
+		*s_tid = 0;
+	return (0);
+}
+
+/*
+ * Given an lwp, return the Linux pid of its parent.  If "ppidp" or "ptidp" is
+ * non-NULL, the parent's pid or tid (-1 when no thread applies) is stored too.
+ */
+pid_t
+lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+	struct lx_pid *hp;
+	pid_t zoneinit = curproc->p_zone->zone_proc_initpid;
+	pid_t lppid, ppid;
+
+	/*
+	 * Be sure not to return a parent pid that should be invisible
+	 * within this zone.
+	 */
+	ppid = ((p->p_flag & SZONETOP)
+	    ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+	/*
+	 * If the parent process's pid is the zone's init process, force it
+	 * to the Linux init pid value of 1.
+	 */
+	if (ppid == zoneinit)
+		ppid = 1;
+
+	/*
+	 * There are two cases in which the Linux definition of a 'parent'
+	 * matches that of Solaris:
+	 *
+	 * - if our tgid is the same as our PID, then we are either the
+	 *   first thread in the process or a CLONE_THREAD thread.
+	 *
+	 * - if the brand lwp value for ppid is 0, then we are either the
+	 *   child of a differently-branded process or a CLONE_PARENT thread.
+	 */
+	if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) {
+		if (ppidp != NULL)
+			*ppidp = ppid;
+		if (ptidp != NULL)
+			*ptidp = -1;
+		return (ppid);
+	}
+
+	/*
+	 * Set the default Linux parent pid to be the pid of the zone's init
+	 * process; this will get converted back to the Linux default of 1
+	 * later.
+	 */
+	lppid = zoneinit;
+
+	/*
+	 * If the process's parent isn't init, try and look up the Linux "pid"
+	 * corresponding to the process's parent.
+	 */
+	if (ppid != 1) {
+		/*
+		 * In all other cases, we are looking for the parent of this
+		 * specific thread, which in Linux refers to the thread that
+		 * clone()d it.   We stashed that thread's PID away when this
+		 * thread was created.
+		 */
+		mutex_enter(&hash_lock);
+		for (hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; hp;
+		    hp = hp->ltos_next) {
+			if (lwpd->br_ppid == hp->l_pid) {
+				/*
+				 * We found the PID we were looking for.  If it
+				 * is newer than this LWP, then the PID cached
+				 * in this LWP's brand structure has since
+				 * exited and been reused by another process.
+				 */
+				if (hp->l_start > lwptot(lwp)->t_start)
+					break;
+
+				lppid = lwpd->br_ppid;
+				if (ppidp != NULL)
+					*ppidp = hp->s_pid;
+				if (ptidp != NULL)
+					*ptidp = hp->s_tid;
+
+				break;
+			}
+		}
+		mutex_exit(&hash_lock);
+	}
+
+	if (lppid == zoneinit) {
+		lppid = 1;
+
+		if (ppidp != NULL)
+			*ppidp = lppid;
+		if (ptidp != NULL)
+			*ptidp = -1;
+	}
+
+	return (lppid);
+}
+
+void
+lx_pid_init(void)
+{
+	size_t sz;
+
+	hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR));
+	sz = sizeof (struct lx_pid *) * hash_size;	/* bytes per table */
+	stol_pid_hash = kmem_zalloc(sz, KM_SLEEP);
+	ltos_pid_hash = kmem_zalloc(sz, KM_SLEEP);
+	mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+lx_pid_fini(void)
+{
+	mutex_destroy(&hash_lock);	/* pairs with lx_pid_init() */
+	kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size);
+	kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
new file mode 100644
index 0000000000..0f521df61b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
@@ -0,0 +1,2564 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Emulation of the Linux ptrace(2) interface.
+ *
+ * OVERVIEW
+ *
+ * The Linux process model is somewhat different from the illumos native
+ * model.  One critical difference is that each Linux thread has a unique
+ * identifier in the pid namespace.  The lx brand assigns a pid to each LWP
+ * within the emulated process, giving the pid of the process itself to the
+ * first LWP.
+ *
+ * The Linux ptrace(2) interface allows for any LWP in a branded process to
+ * exert control over any other LWP within the same zone.  Control is exerted
+ * by the use of the ptrace(2) system call itself, which accepts a number of
+ * request codes.  Feedback on traced events is primarily received by the
+ * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system
+ * calls.  Many of the possible ptrace(2) requests will only succeed if the
+ * target LWP is in a "ptrace-stop" condition.
+ *
+ * HISTORY
+ *
+ * The brand support for ptrace(2) was originally built on top of the rich
+ * support for debugging and tracing provided through the illumos /proc
+ * interfaces, mounted at /native/proc within the zone.  The native legacy
+ * ptrace(3C) functionality was used as a starting point, but was generally
+ * insufficient for complete and precise emulation.  The extant legacy
+ * interface, and indeed our native SIGCLD and waitid(2) facilities, are
+ * focused on _process_ level concerns -- the Linux interface has been
+ * extended to be aware of LWPs as well.
+ *
+ * In order to allow us to focus on providing more complete and accurate
+ * emulation without extensive and undesirable changes to the native
+ * facilities, this second generation ptrace(2) emulation is mostly separate
+ * from any other tracing or debugging framework in the system.
+ *
+ * ATTACHING TRACERS TO TRACEES
+ *
+ * There are several ways that a child LWP may become traced by a tracer.
+ * To determine which attach method caused a tracee to become attached, one
+ * may inspect the "br_ptrace_attach" member of the LWP-specific brand data
+ * with the debugger.
+ *
+ * The first attach methods to consider are the attaching ptrace(2) requests:
+ *
+ *   PTRACE_TRACEME
+ *
+ *   If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee
+ *   to its parent LWP (br_ppid).  Using PTRACE_TRACEME does _not_ cause the
+ *   tracee to be held in a stop condition.  It is common practice for
+ *   consumers to raise(SIGSTOP) immediately afterward.
+ *
+ *   PTRACE_ATTACH
+ *
+ *   An LWP may attempt to trace any other LWP in this, or another, process.
+ *   We currently allow any attach where the process containing the tracer
+ *   LWP has permission to write to /proc for the process containing the
+ *   intended tracee.  This action also sends a SIGSTOP to the newly attached
+ *   tracee.
+ *
+ * The second class of attach methods are the clone(2)/fork(2) inheritance
+ * options that may be set on a tracee with PTRACE_SETOPTIONS:
+ *
+ *   PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE
+ *
+ *   If these options have been set on a tracee, then a fork(2), vfork(2) or
+ *   clone(2) respectively will cause the newly created LWP to be traced by
+ *   the same tracer.  The same set of ptrace(2) options will also be set on
+ *   the new child.
+ *
+ * The third class of attach method is the PTRACE_CLONE flag to clone(2).
+ * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is
+ * passed by the tracee as an argument to clone(2).
+ *
+ * DETACHING TRACEES
+ *
+ * Tracees can be detached by the tracer with the PTRACE_DETACH request.
+ * This request is only valid when the tracee is in a ptrace(2) stop
+ * condition, and is itself a restarting action.
+ *
+ * If the tracer exits without detaching all of its tracees, then all of the
+ * tracees are automatically detached and restarted.  If a tracee was in
+ * "signal-delivery-stop" at the time the tracer exited, the signal will be
+ * released to the child unless it is a SIGSTOP.  We drop this instance of
+ * SIGSTOP in order to prevent the child from becoming stopped by job
+ * control.
+ *
+ * ACCORD ALLOCATION AND MANAGEMENT
+ *
+ * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP
+ * and zero or more tracee LWPs.  It is explicitly illegal for a tracee to
+ * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME.
+ *
+ * An LWP starts out without an accord.  If a child of that LWP calls
+ * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses
+ * PTRACE_ATTACH, an accord will be allocated and stored on that LWP.  The
+ * accord structure is not released from that LWP until it arrives in
+ * lx_exitlwp(), as called by lwp_exit().  A new accord will not be
+ * allocated, even if one does not exist, once an LWP arrives in lx_exitlwp()
+ * and sets the LX_PTF_EXITING flag.  An LWP will have at most one accord
+ * structure throughout its entire lifecycle; once it has one, it has the
+ * same one until death.
+ *
+ * The accord is reference counted (lxpa_refcnt), starting at a count of one
+ * at creation to represent the link from the tracer LWP to its accord.  The
+ * accord is not freed until the reference count falls to zero.
+ *
+ * To make mutual exclusion between a detaching tracer and various notifying
+ * tracees simpler, the tracer will hold "pidlock" while it clears the
+ * accord members that point back to the tracer LWP and CV.
+ *
+ * SIGNALS AND JOB CONTROL
+ *
+ * Various actions, either directly ptrace(2) related or commonly associated
+ * with tracing, cause process- or thread-directed SIGSTOP signals to be sent
+ * to tracees.  These signals, and indeed any signal other than SIGKILL, can
+ * be suppressed by the tracer when using a restarting request (including
+ * PTRACE_DETACH) on a child.  The signal may also be substituted for a
+ * different signal.
+ *
+ * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer,
+ * it will induce the regular illumos native job control stop of the entire
+ * traced process.  This is at least passingly similar to the Linux "group
+ * stop" ptrace(2) condition.
+ *
+ * SYSTEM CALL TRACING
+ *
+ * The ptrace(2) interface enables the tracer to hold the tracee on entry and
+ * exit from system calls.  When a stopped tracee is restarted through the
+ * PTRACE_SYSCALL request, the LX_PTF_SYSCALL flag is set until the next
+ * system call boundary.  Whether this is a "syscall-entry-stop" or
+ * "syscall-exit-stop", the tracee is held and the tracer is notified via
+ * SIGCLD/waitpid(2) in the usual way.  The LX_PTF_SYSCALL flag is
+ * cleared after each stop; for ongoing system call tracing the tracee must
+ * be continuously restarted with PTRACE_SYSCALL.
+ *
+ * EVENT STOPS
+ *
+ * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are
+ * enabled by the tracer through PTRACE_SETOPTIONS.  Once enabled, the tracee
+ * will be stopped at the nominated points of interest and the tracer
+ * notified.  The tracer may request additional information about the event,
+ * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG.
+ *
+ * LOCK ORDERING RULES
+ *
+ * It is not safe, in general, to hold p_lock for two different processes at
+ * the same time.  This constraint is the primary reason for the existence
+ * (and complexity) of the ptrace(2) accord mechanism.
+ *
+ * In order to facilitate looking up accords by the "pid" of a tracer LWP,
+ * p_lock for the tracer process may be held while entering the accord mutex
+ * (lxpa_lock).  This mutex protects the accord flags and reference count.
+ * The reference count is manipulated through lx_ptrace_accord_hold() and
+ * lx_ptrace_accord_rele().
+ *
+ * DO NOT interact with the accord mutex (lxpa_lock) directly.  The
+ * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various
+ * book-keeping and lock ordering enforcement and MUST be used.
+ *
+ * It is NOT legal to take ANY p_lock while holding the accord mutex
+ * (lxpa_lock).  If the lxpa_tracees_lock is to be held concurrently with
+ * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock
+ * of any processes from the tracee list.
+ *
+ * It is NOT legal to take a tracee p_lock and then attempt to enter the
+ * accord mutex (or tracee list mutex) of its tracer.  When running as the
+ * tracee LWP, the tracee's hold will prevent the accord from being freed.
+ * Use of the LX_PTF_STOPPING or LX_PTF_CLONING flag in the LWP-specific brand
+ * data prevents an exiting tracer from altering the tracee until the tracee
+ * has come to an orderly stop, without requiring the tracee to hold its own
+ * p_lock the entire time it is stopping.
+ *
+ * It is not safe, in general, to enter "pidlock" while holding the p_lock of
+ * any process.  It is similarly illegal to hold any accord locks (lxpa_lock
+ * or lxpa_tracees_lock) while attempting to enter "pidlock".  As "pidlock" is
+ * a global mutex, it should be held for the shortest possible time.
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/procfs.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/wait.h>
+#include <sys/prsystm.h>
+#include <sys/note.h>
+
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_syscall.h>
+#include <lx_signum.h>
+
+
+typedef enum lx_ptrace_cont_flags_t {
+	LX_PTC_NONE = 0,
+	LX_PTC_SYSCALL = 1 << 0,
+	LX_PTC_SINGLESTEP = 1 << 1
+} lx_ptrace_cont_flags_t;
+
+
+extern int lx_user_regs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_regs_copyout(lx_lwp_data_t *, void *);
+extern int lx_ptrace_peekuser(lx_lwp_data_t *, uintptr_t, void *);
+extern int lx_ptrace_pokeuser(lx_lwp_data_t *, uintptr_t, void *);
+extern int lx_user_fpregs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_fpregs_copyout(lx_lwp_data_t *, void *);
+extern int lx_user_fpxregs_copyin(lx_lwp_data_t *, void *);
+extern int lx_user_fpxregs_copyout(lx_lwp_data_t *, void *);
+
+/*
+ * Macros for checking LWP state ("br_ptrace_flags") and the accord mutex:
+ */
+#define	LX_PTRACE_BUSY \
+	(LX_PTF_EXITING | LX_PTF_STOPPING | LX_PTF_CLONING)
+
+#define	VISIBLE(a)	(((a)->br_ptrace_flags & LX_PTF_EXITING) == 0)
+#define	TRACEE_BUSY(a)	(((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0)
+
+#define	ACCORD_HELD(a)	MUTEX_HELD(&(a)->lxpa_lock)
+
+static kcondvar_t lx_ptrace_busy_cv;
+static kmem_cache_t *lx_ptrace_accord_cache;	/* lx_ptrace_accord_t objects */
+
+/*
+ * Enter the accord mutex (lxpa_lock).  lxpa_tracees_lock must not be held.
+ */
+static void
+lx_ptrace_accord_enter(lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock));
+
+	mutex_enter(&accord->lxpa_lock);
+}
+
+/*
+ * Exit the accord mutex.  If the reference count has dropped to zero, the
+ * accord is freed instead, and must not be touched again by the caller.
+ */
+static void
+lx_ptrace_accord_exit(lx_ptrace_accord_t *accord)
+{
+	VERIFY(ACCORD_HELD(accord));
+
+	if (accord->lxpa_refcnt > 0) {
+		mutex_exit(&accord->lxpa_lock);
+		return;
+	}
+
+	/*
+	 * No references remain: verify the accord is unused, then free it.
+	 */
+	VERIFY(accord->lxpa_tracer == NULL);
+	VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock));
+	VERIFY(list_is_empty(&accord->lxpa_tracees));
+	VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE);
+
+	mutex_destroy(&accord->lxpa_lock);
+	mutex_destroy(&accord->lxpa_tracees_lock);
+
+	kmem_cache_free(lx_ptrace_accord_cache, accord);
+}
+
+/*
+ * Drop our reference to this accord; the accord lock must be held.  If this
+ * drops the count to zero, the next lx_ptrace_accord_exit() will free it.
+ */
+static void
+lx_ptrace_accord_rele(lx_ptrace_accord_t *accord)
+{
+	VERIFY(ACCORD_HELD(accord));
+
+	VERIFY(accord->lxpa_refcnt > 0);
+	accord->lxpa_refcnt--;
+}
+
+/*
+ * Place an additional hold on an accord.  The accord lock must be held.
+ */
+static void
+lx_ptrace_accord_hold(lx_ptrace_accord_t *accord)
+{
+	VERIFY(ACCORD_HELD(accord));
+
+	accord->lxpa_refcnt++;
+}
+
+/*
+ * Fetch the accord for this LWP.  If none exists and "allocate_one" is set, a
+ * new accord is allocated unless the LWP is already exiting.  Must be called
+ * with p_lock held for the process containing the target LWP.
+ *
+ * Returns 0 with lxpa_lock held, or ESRCH with *accordp set to NULL.
+ */
+static int
+lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp,
+    boolean_t allocate_one)
+{
+	lx_ptrace_accord_t *lxpa;
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * If this LWP does not have an accord, we wish to allocate and install
+	 * one.  NOTE(review): the KM_SLEEP allocation runs with p_lock held.
+	 */
+	if ((lxpa = lwpd->br_ptrace_accord) == NULL) {
+		if (!allocate_one || !VISIBLE(lwpd)) {
+			/*
+			 * Either we do not wish to allocate an accord, or this
+			 * LWP has already begun exiting from a ptrace
+			 * perspective.
+			 */
+			*accordp = NULL;
+			return (ESRCH);
+		}
+
+		lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP);
+		bzero(lxpa, sizeof (*lxpa));
+
+		/*
+		 * The initial reference count is 1 because we are referencing
+		 * it in from the soon-to-be tracer LWP.
+		 */
+		lxpa->lxpa_refcnt = 1;
+		mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t),
+		    offsetof(lx_lwp_data_t, br_ptrace_linkage));
+		lxpa->lxpa_cvp = &p->p_cv;
+
+		lxpa->lxpa_tracer = lwpd;
+		lwpd->br_ptrace_accord = lxpa;
+	}
+
+	/*
+	 * Lock the accord before returning it to the caller.
+	 */
+	lx_ptrace_accord_enter(lxpa);
+
+	/*
+	 * There should be at least one active reference to this accord,
+	 * otherwise it should have been freed.
+	 */
+	VERIFY(lxpa->lxpa_refcnt > 0);
+
+	*accordp = lxpa;
+	return (0);
+}
+
+/*
+ * Accords belong to the tracer LWP.  Get the accord for the tracer with Linux
+ * pid "lxpid", allocating one if need be.  To prevent deadlocks, the caller
+ * MUST NOT hold p_lock on its own or any other process.
+ *
+ * Returns 0 holding lxpa_lock; ESRCH if no such visible LWP; EPERM for self.
+ */
+static int
+lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp)
+{
+	int ret = ESRCH;
+	pid_t apid;
+	id_t atid;
+	proc_t *aproc;
+	kthread_t *athr;
+	klwp_t *alwp;
+	lx_lwp_data_t *alwpd;
+
+	VERIFY(MUTEX_NOT_HELD(&curproc->p_lock));
+
+	/*
+	 * Locate the process containing the tracer LWP based on its Linux pid
+	 * and lock it.
+	 */
+	if (lx_lpid_to_spair(lxpid, &apid, &atid) != 0 ||
+	    (aproc = sprlock(apid)) == NULL) {
+		return (ESRCH);
+	}
+
+	/*
+	 * Locate the tracer LWP itself and ensure that it is visible to
+	 * ptrace(2).
+	 */
+	if ((athr = idtot(aproc, atid)) == NULL ||
+	    (alwp = ttolwp(athr)) == NULL ||
+	    (alwpd = lwptolxlwp(alwp)) == NULL ||
+	    !VISIBLE(alwpd)) {
+		sprunlock(aproc);
+		return (ESRCH);
+	}
+
+	/*
+	 * We should not fetch our own accord this way.
+	 */
+	if (athr == curthread) {
+		sprunlock(aproc);
+		return (EPERM);
+	}
+
+	/*
+	 * Fetch (or allocate) the accord owned by this tracer LWP:
+	 */
+	ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE);
+
+	/*
+	 * Unlock the process and return.
+	 */
+	sprunlock(aproc);
+	return (ret);
+}
+
+/*
+ * Fetch the ptrace(2) accord of the current LWP in its role as a tracer,
+ * allocating one first if "allocate_one" is set.  The caller MUST NOT hold
+ * p_lock for the process containing this LWP.
+ *
+ * On success the accord is returned with its lock (lxpa_lock) held.
+ */
+static int
+lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one)
+{
+	klwp_t *self = ttolwp(curthread);
+	proc_t *proc = lwptoproc(self);
+	int err;
+
+	VERIFY(MUTEX_NOT_HELD(&proc->p_lock));
+
+	/*
+	 * The lookup runs under the p_lock of our own process.
+	 */
+	mutex_enter(&proc->p_lock);
+
+	/*
+	 * Fetch (or allocate) the accord; on success it comes back locked.
+	 */
+	err = lx_ptrace_accord_get_locked(self, accordp, allocate_one);
+
+	mutex_exit(&proc->p_lock);
+
+	return (err);
+}
+
+/*
+ * Set an LWP running again if it is currently in "ptrace-stop".  Because this
+ * function may induce sleep, the only mutex the caller may hold is p_lock for
+ * the process containing the LWP.
+ */
+static void
+lx_ptrace_restart_lwp(klwp_t *lwp)
+{
+	proc_t *tproc = lwptoproc(lwp);
+	kthread_t *tthr = lwptot(lwp);
+	lx_lwp_data_t *tlwpd = lwptolxlwp(lwp);
+
+	VERIFY(tthr != curthread);
+	VERIFY(MUTEX_HELD(&tproc->p_lock));
+
+	/* Exclude potential meddling from procfs. */
+	prbarrier(tproc);
+
+	thread_lock(tthr);
+	if (!BSTOPPED(tthr) || tthr->t_whystop != PR_BRAND) {
+		/* The LWP is no longer in "ptrace-stop"; leave it be. */
+		thread_unlock(tthr);
+		return;
+	}
+
+	/*
+	 * The LWP is still in "ptrace-stop", so set it running.
+	 */
+	tthr->t_schedflag |= TS_BSTART;
+	setrun_locked(tthr);
+
+	/* Clear stop reason. */
+	tlwpd->br_ptrace_whystop = 0;
+	tlwpd->br_ptrace_whatstop = 0;
+	tlwpd->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND);
+	thread_unlock(tthr);
+}
+
+static void
+lx_ptrace_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag,
+    pid_t *event_ppid, pid_t *event_pid)
+{
+	int lxsig;
+
+	/* All "ptrace-stop" conditions are reported as a trapped SIGCLD. */
+	bzero(ip, sizeof (*ip));
+	ip->si_signo = SIGCLD;
+	ip->si_code = CLD_TRAPPED;
+	ip->si_pid = remote->br_pid;
+
+	switch (remote->br_ptrace_whatstop) {
+	case LX_PR_SYSENTRY:
+	case LX_PR_SYSEXIT:
+		/*
+		 * Syscall stops report SIGTRAP, with 0x80 or'd in when the
+		 * LX_PTRACE_O_TRACESYSGOOD option is set on the tracee.
+		 */
+		ip->si_status = SIGTRAP;
+		if ((remote->br_ptrace_options &
+		    LX_PTRACE_O_TRACESYSGOOD) != 0) {
+			ip->si_status |= 0x80;
+		}
+		break;
+
+	case LX_PR_SIGNALLED:
+		/*
+		 * A signal number outside the valid range is reported as
+		 * though it were SIGTRAP.
+		 */
+		lxsig = remote->br_ptrace_stopsig;
+		ip->si_status = (lxsig >= 1 && lxsig < LX_NSIG) ?
+		    ltos_signo[lxsig] : SIGTRAP;
+		break;
+
+	case LX_PR_EVENT:
+		ip->si_status = SIGTRAP | remote->br_ptrace_event;
+		/*
+		 * Hand back the Linux pid of this LWP along with that of the
+		 * create event being dispatched, for callers that want to
+		 * unblock later ptrace(2) events which depend on this one.
+		 */
+		if (event_ppid != NULL) {
+			*event_ppid = remote->br_pid;
+		}
+		if (event_pid != NULL) {
+			*event_pid = (pid_t)remote->br_ptrace_eventmsg;
+		}
+		break;
+
+	default:
+		cmn_err(CE_PANIC, "unxpected stop subreason: %d",
+		    remote->br_ptrace_whatstop);
+	}
+
+	/*
+	 * Consume the pending event unless asked not to (e.g. for WNOWAIT).
+	 */
+	if (waitflag) {
+		remote->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND);
+	}
+}
+
+/*
+ * Notified by stop() of a PR_BRAND stop.  Entered and exited with p_lock held.
+ */
+void
+lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+	klwp_t *plwp = NULL;
+	proc_t *pp = NULL;
+	lx_lwp_data_t *parent;
+	boolean_t cldpend = B_TRUE;
+	boolean_t cldpost = B_FALSE;
+	sigqueue_t *sqp = NULL;
+
+	/*
+	 * We currently only care about LX-specific stop reasons.
+	 */
+	if (why != PR_BRAND)
+		return;
+
+	switch (what) {
+	case LX_PR_SYSENTRY:
+	case LX_PR_SYSEXIT:
+	case LX_PR_SIGNALLED:
+	case LX_PR_EVENT:
+		break;
+	default:
+		cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND"
+		    " stop: %d", (int)what);
+	}
+
+	/*
+	 * We should be holding the lock on our containing process.  The
+	 * STOPPING flag should have been set by lx_ptrace_stop() for all
+	 * PR_BRAND stops.
+	 */
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(lwpd->br_ptrace_flags & LX_PTF_STOPPING);
+	VERIFY((accord = lwpd->br_ptrace_tracer) != NULL);
+
+	/*
+	 * We must drop our process lock to take "pidlock".  The
+	 * LX_PTF_STOPPING flag protects us from an exiting tracer.
+	 */
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Allocate before we enter any mutexes.
+	 */
+	sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP);
+
+	/*
+	 * We take pidlock now, which excludes all callers of waitid() and
+	 * prevents a detaching tracer from clearing critical accord members.
+	 */
+	mutex_enter(&pidlock);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * Get the ptrace(2) "parent" process, to which we may send
+	 * a SIGCLD signal later.
+	 */
+	if ((parent = accord->lxpa_tracer) != NULL &&
+	    (plwp = parent->br_lwp) != NULL) {
+		pp = lwptoproc(plwp);
+	}
+
+	/*
+	 * Our tracer should not have been modified in our absence; the
+	 * LX_PTF_STOPPING flag prevents it.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == accord);
+
+	/*
+	 * Stash data for this stop condition in the LWP data while we hold
+	 * both pidlock and our p_lock.
+	 */
+	lwpd->br_ptrace_whystop = why;
+	lwpd->br_ptrace_whatstop = what;
+	lwpd->br_ptrace_flags |= LX_PTF_WAITPEND;
+
+	/*
+	 * If this event does not depend on an event from the parent LWP,
+	 * populate the siginfo_t for the event pending on this tracee LWP.
+	 */
+	if (!(lwpd->br_ptrace_flags & LX_PTF_PARENT_WAIT) && pp != NULL) {
+		cldpost = B_TRUE;
+		lx_ptrace_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL);
+	}
+
+	/*
+	 * Drop our p_lock so that we may lock the tracer.
+	 */
+	mutex_exit(&p->p_lock);
+	if (cldpost && pp != NULL) {
+		/*
+		 * Post the SIGCLD to the tracer.
+		 */
+		mutex_enter(&pp->p_lock);
+		if (!sigismember(&pp->p_sig, SIGCLD)) {
+			sigaddqa(pp, plwp->lwp_thread, sqp);
+			cldpend = B_FALSE;
+			sqp = NULL;
+		}
+		mutex_exit(&pp->p_lock);
+	}
+
+	/*
+	 * We re-take our process lock now.  The lock will be held until
+	 * the thread is actually marked stopped, so we will not race with
+	 * lx_ptrace_lock_if_stopped() or lx_waitid_helper().
+	 */
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * We clear the STOPPING flag; stop() continues to hold our p_lock
+	 * until our thread stop state is visible.
+	 */
+	lwpd->br_ptrace_flags &= ~LX_PTF_STOPPING;
+	lwpd->br_ptrace_flags |= LX_PTF_STOPPED;
+	if (cldpend) {
+		/*
+		 * No SIGCLD was queued above for this wait condition; note it.
+		 */
+		lwpd->br_ptrace_flags |= LX_PTF_CLDPEND;
+	}
+
+	/*
+	 * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will
+	 * be sleeping on this CV until LX_PTF_STOPPING is clear.  Wake it
+	 * now.
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+
+	/*
+	 * While still holding pidlock, we attempt to wake our tracer from a
+	 * potential waitid() slumber.
+	 */
+	if (accord->lxpa_cvp != NULL) {
+		cv_broadcast(accord->lxpa_cvp);
+	}
+
+	/*
+	 * We release pidlock and return as we were called: with our p_lock
+	 * held.
+	 */
+	mutex_exit(&pidlock);
+
+	if (sqp != NULL) {
+		kmem_free(sqp, sizeof (*sqp));
+	}
+}
+
+/*
+ * Restarting actions (e.g. PTRACE_CONT, PTRACE_SYSCALL or PTRACE_DETACH) are
+ * only permitted while the tracee LWP is in "ptrace-stop".  This check must
+ * ONLY be run on tracees of the current LWP.  Returns 0 with the tracee's
+ * p_lock held when it is stopped, or ESRCH (with no lock held) otherwise.
+ */
+static int
+lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote)
+{
+	klwp_t *tlwp = remote->br_lwp;
+	kthread_t *tthr = lwptot(tlwp);
+	proc_t *tproc = lwptoproc(tlwp);
+	int err = 0;
+
+	/*
+	 * We must never check that we, ourselves, are stopped.  We must also
+	 * own the accord and have its tracee list locked while we lock our
+	 * tracees.
+	 */
+	VERIFY(curthread != tthr);
+	VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock));
+	VERIFY(accord->lxpa_tracer == ttolxlwp(curthread));
+
+	mutex_enter(&tproc->p_lock);
+	if (!VISIBLE(remote)) {
+		/*
+		 * The tracee LWP is detaching itself as it exits and can no
+		 * longer be seen by ptrace(2).
+		 */
+		err = ESRCH;
+	} else {
+		/*
+		 * Only tracees of the current LWP may be checked.  This is
+		 * verified after visibility because an exiting tracee may no
+		 * longer be completely consistent.
+		 */
+		VERIFY(remote->br_ptrace_tracer == accord);
+
+		if ((remote->br_ptrace_flags & LX_PTF_STOPPED) == 0) {
+			/*
+			 * The tracee is not in "ptrace-stop".
+			 */
+			err = ESRCH;
+		}
+	}
+
+	/*
+	 * A stopped tracee is returned with its process lock still held so
+	 * that the caller may manipulate it; otherwise the lock is released.
+	 */
+	if (err != 0) {
+		mutex_exit(&tproc->p_lock);
+	}
+
+	return (err);
+}
+
+static int
+lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options)
+{
+	const uintptr_t unknown = options & ~LX_PTRACE_O_ALL;
+
+	/*
+	 * Refuse the request outright if it names any option we do not know;
+	 * otherwise the new set replaces the tracee's current options.
+	 */
+	if (unknown != 0) {
+		return (EINVAL);
+	}
+
+	remote->br_ptrace_options = (lx_ptrace_options_t)options;
+
+	return (0);
+}
+
+/* Copy the tracee's ptrace event message out to umsgp; 0 or EFAULT. */
+static int
+lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp)
+{
+	int error;
+#if defined(_SYSCALL32_IMPL)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		/* Non-native callers receive a 32-bit value. */
+		uint32_t tmp = remote->br_ptrace_eventmsg;
+		error = copyout(&tmp, umsgp, sizeof (uint32_t));
+	} else
+#endif
+	{
+		error = copyout(&remote->br_ptrace_eventmsg, umsgp,
+		    sizeof (ulong_t));
+	}
+	/* copyout() signals failure with -1, which is not an errno. */
+	return (error != 0 ? EFAULT : 0);
+}
+
+static int
+lx_ptrace_getsiginfo(lx_lwp_data_t *remote, void *usiginfo)
+{
+	klwp_t *tlwp = remote->br_lwp;
+	int lxsig;
+	int failed;
+
+	/*
+	 * Both a translatable current signal and its siginfo are required.
+	 */
+	lxsig = lx_stol_signo(tlwp->lwp_cursig, 0);
+	if (lxsig < 1 || tlwp->lwp_curinfo == NULL) {
+		return (EINVAL);
+	}
+
+#if defined(_SYSCALL32_IMPL)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		failed = (stol_ksiginfo32_copyout(&tlwp->lwp_curinfo->sq_info,
+		    usiginfo) != 0);
+	} else
+#endif
+	{
+		failed = (stol_ksiginfo_copyout(&tlwp->lwp_curinfo->sq_info,
+		    usiginfo) != 0);
+	}
+
+	return (failed ? EFAULT : 0);
+}
+
+
+/*
+ * Linux ptrace(2) PTRACE_CONT: restart a tracee which is in ptrace-stop.
+ */
+static int
+lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo)
+{
+	/*
+	 * Single-stepping is not currently supported; report that and refuse
+	 * the request.
+	 */
+	if ((flags & LX_PTC_SINGLESTEP) != 0) {
+		lx_unsupported("PTRACE_SINGLESTEP not currently implemented");
+		return (EINVAL);
+	}
+
+	/*
+	 * A tracer may suppress the signal being delivered or substitute a
+	 * different one.  When the tracee is in an appropriate ptrace(2)
+	 * "signal-delivery-stop", br_ptrace_stopsig becomes the new signal
+	 * number.
+	 *
+	 * Like much of the Linux ptrace(2) interface, this can fail silently
+	 * when the state machine is not aligned correctly.
+	 */
+	remote->br_ptrace_donesig = 0;
+	remote->br_ptrace_stopsig = signo;
+
+	/*
+	 * A PTRACE_SYSCALL restart arms the syscall-stop flag; any other
+	 * restart disarms it.
+	 */
+	if ((flags & LX_PTC_SYSCALL) != 0) {
+		remote->br_ptrace_flags |= LX_PTF_SYSCALL;
+	} else {
+		remote->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+	}
+
+	lx_ptrace_restart_lwp(remote->br_lwp);
+
+	return (0);
+}
+
+/*
+ * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface.
+ *
+ * The tracee "remote" is removed from the accord's tracee list, its ptrace
+ * state is cleared and it is set running again.  "signo" is recorded in
+ * br_ptrace_stopsig for the restart, and *release_hold is set to B_TRUE.
+ * This routine always returns 0.
+ *
+ * The caller must hold lxpa_tracees_lock and p_lock for the tracee's process.
+ */
+static int
+lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo,
+    boolean_t *release_hold)
+{
+	klwp_t *rlwp = remote->br_lwp;
+
+	/*
+	 * The tracee LWP was in "ptrace-stop" and we now hold its p_lock.
+	 * Detach the LWP from the accord and set it running.
+	 */
+	VERIFY(!TRACEE_BUSY(remote));
+	VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock));
+	remote->br_ptrace_flags &= ~(LX_PTF_SYSCALL | LX_PTF_INHERIT);
+	VERIFY(list_link_active(&remote->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, remote);
+
+	remote->br_ptrace_attach = LX_PTA_NONE;
+	remote->br_ptrace_tracer = NULL;
+	remote->br_ptrace_flags = 0;
+	*release_hold = B_TRUE;
+
+	/*
+	 * Decrement traced-lwp count for the process.
+	 */
+	ASSERT(MUTEX_HELD(&rlwp->lwp_procp->p_lock));
+	VERIFY(ptolxproc(rlwp->lwp_procp)->l_ptrace-- >= 1);
+
+	/*
+	 * The tracer may, as described in lx_ptrace_cont(), choose to suppress
+	 * or modify the delivered signal.
+	 */
+	remote->br_ptrace_stopsig = signo;
+	remote->br_ptrace_donesig = 0;
+
+	lx_ptrace_restart_lwp(rlwp);
+
+	return (0);
+}
+
+/*
+ * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2)
+ * interface.  It returns 0 on success, or else ESRCH or EPERM.
+ *
+ * This LWP is requesting to be attached as a tracer to another LWP -- the
+ * tracee.  If a ptrace accord to track the list of tracees has not yet been
+ * allocated, one will be allocated and attached to this LWP now.
+ *
+ * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the
+ * tracee LWP is then added to the "lxpa_tracees" list in the accord.  We drop
+ * locks between these two phases; the only consumer of trace events from this
+ * accord is this LWP, which obviously cannot be running waitpid(2) at the same
+ * time as this call to ptrace(2).
+ */
+static int
+lx_ptrace_attach(pid_t lx_pid)
+{
+	int error = ESRCH;
+	/*
+	 * Our (Tracer) LWP:
+	 */
+	lx_ptrace_accord_t *accord;
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+	/*
+	 * Remote (Tracee) LWP:
+	 */
+	pid_t rpid;
+	id_t rtid;
+	proc_t *rproc;
+	kthread_t *rthr;
+	klwp_t *rlwp;
+	lx_lwp_data_t *rlwpd;
+
+	if (lwpd->br_pid == lx_pid) {
+		/*
+		 * We cannot trace ourselves.
+		 */
+		return (EPERM);
+	}
+
+	/*
+	 * Ensure that we have an accord and obtain a lock on it.  This
+	 * routine should not fail because the LWP cannot make ptrace(2) system
+	 * calls after it has begun exiting.
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+	VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0);
+
+	/*
+	 * Place speculative hold in case the attach is successful.
+	 */
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Locate the process containing the tracee LWP based on its Linux pid
+	 * and lock it.
+	 */
+	if (lx_lpid_to_spair(lx_pid, &rpid, &rtid) != 0 ||
+	    (rproc = sprlock(rpid)) == NULL) {
+		/*
+		 * We could not find the target process.
+		 */
+		goto errout;
+	}
+
+	/*
+	 * Locate the tracee LWP.
+	 */
+	if ((rthr = idtot(rproc, rtid)) == NULL ||
+	    (rlwp = ttolwp(rthr)) == NULL ||
+	    (rlwpd = lwptolxlwp(rlwp)) == NULL ||
+	    !VISIBLE(rlwpd)) {
+		/*
+		 * The LWP could not be found, was not branded, or is not
+		 * visible to ptrace(2) at this time.
+		 */
+		goto unlock_errout;
+	}
+
+	/*
+	 * We now hold the lock on the tracee.  Attempt to install ourselves
+	 * as the tracer.
+	 */
+	if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc,
+	    NULL, VWRITE) != 0) {
+		/*
+		 * This process does not have permission to trace the remote
+		 * process.
+		 */
+		error = EPERM;
+	} else if (rlwpd->br_ptrace_tracer != NULL) {
+		/*
+		 * This LWP is already being traced.
+		 */
+		VERIFY(list_link_active(&rlwpd->br_ptrace_linkage));
+		VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE);
+		error = EPERM;
+	} else {
+		lx_proc_data_t *rprocd = ptolxproc(rproc);
+
+		/*
+		 * Bond the tracee to the accord.
+		 */
+		VERIFY0(rlwpd->br_ptrace_flags & LX_PTF_EXITING);
+		VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE);
+		rlwpd->br_ptrace_attach = LX_PTA_ATTACH;
+		rlwpd->br_ptrace_tracer = accord;
+
+		/*
+		 * We had no tracer, and are thus not in the tracees list.
+		 * It is safe to take the tracee list lock while we insert
+		 * ourselves.
+		 */
+		mutex_enter(&accord->lxpa_tracees_lock);
+		VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage));
+		list_insert_tail(&accord->lxpa_tracees, rlwpd);
+		/*
+		 * Bump traced-lwp count for the remote process.
+		 */
+		rprocd->l_ptrace++;
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * Send a thread-directed SIGSTOP.
+		 */
+		sigtoproc(rproc, rthr, SIGSTOP);
+
+
+		error = 0;
+	}
+
+unlock_errout:
+	/*
+	 * Unlock the process containing the tracee LWP.
+	 */
+	sprunlock(rproc);
+
+errout:
+	if (error != 0) {
+		/*
+		 * The attach was not successful.  Remove our speculative
+		 * hold.
+		 */
+		lx_ptrace_accord_enter(accord);
+		lx_ptrace_accord_rele(accord);
+		lx_ptrace_accord_exit(accord);
+	}
+
+	return (error);
+}
+
+/* Stash the clone option and inherit flag on the current LWP. */
+int
+lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	/* Validate before locking so that EINVAL never leaks p_lock. */
+	switch (option) {
+	case LX_PTRACE_O_TRACEFORK:
+	case LX_PTRACE_O_TRACEVFORK:
+	case LX_PTRACE_O_TRACECLONE:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	mutex_enter(&p->p_lock);
+	lwpd->br_ptrace_clone_option = option;
+	if (inherit_flag) {
+		lwpd->br_ptrace_flags |= LX_PTF_INHERIT;
+	} else {
+		lwpd->br_ptrace_flags &= ~LX_PTF_INHERIT;
+	}
+	mutex_exit(&p->p_lock);
+
+	return (0);
+}
+
+/*
+ * If the parent LWP (src) is being traced, attach the new LWP (dst) to the
+ * same accord.  The caller must hold p_lock for the process containing dst.
+ */
+void
+lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst)
+{
+	proc_t *srcp = lwptoproc(src->br_lwp);
+	proc_t *dstp = lwptoproc(dst->br_lwp);
+	lx_ptrace_accord_t *accord;
+	boolean_t is_fork = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&dstp->p_lock));
+	if (srcp != dstp) {
+		/*
+		 * In the case of being called via forklwp, some lock shuffling
+		 * is required.  The destination p_lock must be dropped to
+		 * avoid deadlocks when locking the source and manipulating
+		 * ptrace accord resources.
+		 */
+		is_fork = B_TRUE;
+		sprlock_proc(dstp);
+		mutex_exit(&dstp->p_lock);
+		mutex_enter(&srcp->p_lock);
+	}
+
+	if ((accord = src->br_ptrace_tracer) == NULL) {
+		/*
+		 * The source LWP does not have a tracer to inherit.
+		 */
+		goto out;
+	}
+
+	/*
+	 * There are two conditions to check when determining if the new
+	 * child should inherit the same tracer (and tracing options) as its
+	 * parent.  Either condition is sufficient to trigger inheritance.
+	 */
+	dst->br_ptrace_attach = LX_PTA_NONE;
+	if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) {
+		/*
+		 * Condition 1:
+		 * The clone(2), fork(2) and vfork(2) emulated system calls
+		 * populate "br_ptrace_clone_option" with the specific
+		 * ptrace(2) SETOPTIONS option that applies to this
+		 * operation.  If the relevant option has been enabled by the
+		 * tracer then we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS;
+
+	} else if ((src->br_ptrace_flags & LX_PTF_INHERIT) != 0) {
+		/*
+		 * Condition 2:
+		 * If the caller opted in to inheritance with the
+		 * PTRACE_CLONE flag to clone(2), the LX_PTF_INHERIT flag
+		 * will be set and we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE;
+	}
+
+	/*
+	 * These values only apply for the duration of a single clone(2), et
+	 * al, system call.
+	 */
+	src->br_ptrace_flags &= ~LX_PTF_INHERIT;
+	src->br_ptrace_clone_option = 0;
+
+	if (dst->br_ptrace_attach == LX_PTA_NONE) {
+		/*
+		 * No condition triggered inheritance.
+		 */
+		goto out;
+	}
+
+	/*
+	 * Set the LX_PTF_CLONING flag to prevent us from being detached
+	 * while our p_lock is dropped.
+	 */
+	src->br_ptrace_flags |= LX_PTF_CLONING;
+	mutex_exit(&srcp->p_lock);
+
+	/*
+	 * Hold the accord for the new LWP.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Install the tracer and copy the current PTRACE_SETOPTIONS options.
+	 */
+	dst->br_ptrace_tracer = accord;
+	dst->br_ptrace_options = src->br_ptrace_options;
+
+	/*
+	 * This flag prevents waitid() from seeing events for the new child
+	 * until the parent is able to post the relevant ptrace event to
+	 * the tracer.
+	 */
+	dst->br_ptrace_flags |= LX_PTF_PARENT_WAIT;
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	VERIFY(list_link_active(&src->br_ptrace_linkage));
+	VERIFY(!list_link_active(&dst->br_ptrace_linkage));
+	list_insert_tail(&accord->lxpa_tracees, dst);
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Relock our process and clear our busy flag.
+	 */
+	mutex_enter(&srcp->p_lock);
+	src->br_ptrace_flags &= ~LX_PTF_CLONING;
+
+	/*
+	 * Bump traced-lwp count for the process.
+	 */
+	ptolxproc(dstp)->l_ptrace++;
+
+	/*
+	 * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will
+	 * be sleeping on this CV until LX_PTF_CLONING is clear.  Wake it
+	 * now.
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+
+out:
+	if (is_fork) {
+		mutex_exit(&srcp->p_lock);
+		mutex_enter(&dstp->p_lock);
+		sprunprlock(dstp);
+	}
+}
+
+static int
+lx_ptrace_traceme(void)
+{
+	int error;
+	boolean_t did_attach = B_FALSE;
+	/*
+	 * Our (Tracee) LWP:
+	 */
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	/*
+	 * Remote (Tracer) LWP:
+	 */
+	lx_ptrace_accord_t *accord;
+
+	/*
+	 * PTRACE_TRACEME: we intend to be the tracee of our parent LWP.  Fetch
+	 * (or allocate) its accord, which is returned to us locked.
+	 */
+	if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL,
+	    NULL), &accord)) != 0) {
+		/*
+		 * Could not determine the Linux pid of the parent LWP, or
+		 * could not get the accord for that LWP.
+		 */
+		return (error);
+	}
+
+	/*
+	 * We now hold the accord lock.
+	 */
+	if (accord->lxpa_flags & LX_ACC_TOMBSTONE) {
+		/*
+		 * The accord is marked for death; give up now.
+		 */
+		lx_ptrace_accord_exit(accord);
+		return (ESRCH);
+	}
+
+	/*
+	 * Bump the reference count so that the accord is not freed.  We need
+	 * to drop the accord lock before we take our own p_lock.
+	 */
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * We now lock _our_ process and determine if we can install our parent
+	 * as our tracer.
+	 */
+	mutex_enter(&p->p_lock);
+	if (lwpd->br_ptrace_tracer != NULL) {
+		/*
+		 * This LWP is already being traced.
+		 */
+		VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE);
+		error = EPERM;
+	} else {
+		/*
+		 * Bond ourselves to the accord.  We already bumped the accord
+		 * reference count.
+		 */
+		VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE);
+		lwpd->br_ptrace_attach = LX_PTA_TRACEME;
+		lwpd->br_ptrace_tracer = accord;
+		did_attach = B_TRUE;
+		error = 0;
+
+		/*
+		 * Speculatively bump l_ptrace now before dropping p_lock.
+		 * It will be reverted if the tracee attachment fails.
+		 */
+		ptolxproc(p)->l_ptrace++;
+	}
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Lock the accord tracee list and add this LWP.  Once we are in the
+	 * tracee list, it is the responsibility of the tracer to detach us.
+	 */
+	if (error == 0) {
+		lx_ptrace_accord_enter(accord);
+		mutex_enter(&accord->lxpa_tracees_lock);
+
+		if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) {
+			/*
+			 * Put ourselves in the tracee list for this accord.
+			 */
+			VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+			list_insert_tail(&accord->lxpa_tracees, lwpd);
+			mutex_exit(&accord->lxpa_tracees_lock);
+			lx_ptrace_accord_exit(accord);
+
+			return (0);
+		}
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * The accord has been marked for death.  We must
+		 * untrace ourselves.
+		 */
+		error = ESRCH;
+		lx_ptrace_accord_exit(accord);
+
+		/*
+		 * Undo speculative increment of ptracer count.
+		 */
+		mutex_enter(&p->p_lock);
+		ptolxproc(p)->l_ptrace--;
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Our optimism was unjustified: We were unable to attach.  We need to
+	 * lock the process containing this LWP again in order to remove the
+	 * tracer.
+	 */
+	VERIFY(error != 0);
+	mutex_enter(&p->p_lock);
+	if (did_attach) {
+		/*
+		 * Verify that things were as we left them:
+		 */
+		VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+		VERIFY(lwpd->br_ptrace_tracer == accord);
+
+		lwpd->br_ptrace_attach = LX_PTA_NONE;
+		lwpd->br_ptrace_tracer = NULL;
+	}
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Remove our speculative hold on the accord, possibly causing it to be
+	 * freed in the process.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+
+	return (error);
+}
+
+static boolean_t
+lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what)
+{
+	klwp_t *self = lwpd->br_lwp;
+	boolean_t restore_nostop = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/* Flag this LWP as being on its way into "ptrace-stop". */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_STOPPING);
+	lwpd->br_ptrace_flags |= LX_PTF_STOPPING;
+
+	/*
+	 * For an exec event, lwp_nostop must be cleared across the stop to get
+	 * the signal delivered; it is put back once we return.
+	 */
+	if (lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC &&
+	    self->lwp_nostop == 1) {
+		self->lwp_nostop = 0;
+		restore_nostop = B_TRUE;
+	}
+
+	stop(PR_BRAND, what);
+
+	if (restore_nostop) {
+		VERIFY(self->lwp_nostop == 0);
+		self->lwp_nostop = 1;
+	}
+
+	/* We are back from "ptrace-stop" with our process lock held. */
+	lwpd->br_ptrace_flags &=
+	    ~(LX_PTF_STOPPING | LX_PTF_STOPPED | LX_PTF_CLDPEND);
+	lwpd->br_ptrace_stopucp = NULL;
+	cv_broadcast(&lx_ptrace_busy_cv);
+	mutex_exit(&p->p_lock);
+
+	return (B_TRUE);
+}
+
+int
+lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg,
+    uintptr_t ucp)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	/* Returns 0 if this LWP stopped or was sent a signal; else ESRCH. */
+	mutex_enter(&p->p_lock);
+	if (lwpd->br_ptrace_tracer == NULL) {
+		mutex_exit(&p->p_lock);
+		return (ESRCH);
+	}
+
+	if (!child) {
+		/*
+		 * Only the first event posted by a new process is to be held
+		 * until the matching parent event is dispatched, and only if
+		 * it is a "child" event.  This is not a child event, so we
+		 * clear the wait flag.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+	} else if (option == LX_PTRACE_O_TRACEVFORK) {
+		/*
+		 * For a child, we have to handle vfork as a special case. In
+		 * lx_ptrace_inherit_tracer() we set LX_PTF_PARENT_WAIT to
+		 * force events to be delayed until the parent posts its event.
+		 * This flag is cleared in lx_waitid_helper() to enforce a
+		 * "happens after" relationship. However, this obviously cannot
+		 * work for the vfork case. Thus, we clear our flag now so that
+		 * we can deliver the signal in lx_stop_notify(), if necessary.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+	}
+
+	if (!(lwpd->br_ptrace_options & option)) {
+		if (option == LX_PTRACE_O_TRACEEXEC) {
+			/*
+			 * Without PTRACE_O_TRACEEXEC, the Linux kernel will
+			 * send SIGTRAP to the process.
+			 */
+			sigtoproc(p, t, SIGTRAP);
+			mutex_exit(&p->p_lock);
+			return (0);
+		}
+
+		/*
+		 * The flag for this trace event is not enabled, so we will not
+		 * stop.
+		 */
+		mutex_exit(&p->p_lock);
+		return (ESRCH);
+	}
+
+	if (child) {
+		switch (option) {
+		case LX_PTRACE_O_TRACECLONE:
+		case LX_PTRACE_O_TRACEFORK:
+		case LX_PTRACE_O_TRACEVFORK:
+			/*
+			 * Send the child LWP a directed SIGSTOP.
+			 */
+			sigtoproc(p, t, SIGSTOP);
+			mutex_exit(&p->p_lock);
+			return (0);
+		default:
+			goto nostop;
+		}
+	}
+
+	lwpd->br_ptrace_eventmsg = msg;
+
+	switch (option) {
+	case LX_PTRACE_O_TRACECLONE:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE;
+		break;
+	case LX_PTRACE_O_TRACEEXEC:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC;
+		lwpd->br_ptrace_eventmsg = 0;
+		break;
+	case LX_PTRACE_O_TRACEEXIT:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT;
+		break;
+	case LX_PTRACE_O_TRACEFORK:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK;
+		break;
+	case LX_PTRACE_O_TRACEVFORK:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK;
+		break;
+	case LX_PTRACE_O_TRACEVFORKDONE:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE;
+		lwpd->br_ptrace_eventmsg = 0;
+		break;
+	default:
+		goto nostop;
+	}
+
+	/*
+	 * Userland may have passed in a ucontext_t pointer for
+	 * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped.
+	 */
+	lwpd->br_ptrace_stopucp = ucp;
+
+	/*
+	 * p_lock for the process containing the tracee will be dropped by
+	 * lx_ptrace_stop_common().
+	 */
+	return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH);
+
+nostop:
+	lwpd->br_ptrace_event = 0;
+	lwpd->br_ptrace_eventmsg = 0;
+	mutex_exit(&p->p_lock);
+	return (ESRCH);
+}
+
+boolean_t
+lx_ptrace_stop(ushort_t what)
+{
+	klwp_t *self = ttolwp(curthread);
+	proc_t *proc = lwptoproc(self);
+	lx_lwp_data_t *lwpd = lwptolxlwp(self);
+
+	VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT ||
+	    what == LX_PR_SIGNALLED);
+
+	/*
+	 * Cheap unlocked check first: with no accord there is nothing to do.
+	 */
+	if (lwpd->br_ptrace_tracer == NULL) {
+		return (B_FALSE);
+	}
+
+	/*
+	 * Repeat the check under p_lock.
+	 */
+	mutex_enter(&proc->p_lock);
+	if (lwpd->br_ptrace_tracer == NULL) {
+		VERIFY0(lwpd->br_ptrace_flags & LX_PTF_SYSCALL);
+		mutex_exit(&proc->p_lock);
+		return (B_FALSE);
+	}
+
+	/*
+	 * Anything other than LX_PR_SIGNALLED is a syscall-entry-stop or
+	 * syscall-exit-stop point (see the VERIFY above).
+	 */
+	if (what != LX_PR_SIGNALLED) {
+		if ((lwpd->br_ptrace_flags & LX_PTF_SYSCALL) == 0) {
+			/*
+			 * A system call stop has not been requested.
+			 */
+			mutex_exit(&proc->p_lock);
+			return (B_FALSE);
+		}
+
+		/*
+		 * PTRACE_SYSCALL covers only the next system call entry or
+		 * exit; the tracer must restart us with PTRACE_SYSCALL while
+		 * we are in ptrace-stop to stop at the following boundary.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+	}
+
+	/*
+	 * lx_ptrace_stop_common() drops p_lock for us before returning.
+	 */
+	return (lx_ptrace_stop_common(proc, lwpd, what));
+}
+
+int
+lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	int lx_sig;
+	/* Returns 0, or -1 to request a restart of signal processing. */
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * In very rare circumstances, a process which is almost completely
+	 * through proc_exit() may incur issig checks in the current thread via
+	 * clean-up actions.  The process will still be branded, but the thread
+	 * will have already been stripped of any LX-specific data on its way
+	 * to the grave.  Bail early if the brand data is missing.
+	 */
+	if (lwpd == NULL) {
+		return (0);
+	}
+
+	/*
+	 * If we do not have an accord, bail out now.  Additionally, if there
+	 * is no valid signal then we have no reason to stop.
+	 */
+	if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL ||
+	    (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) ||
+	    (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) {
+		if (lwp->lwp_cursig == 0) {
+			/*
+			 * If this lwp has no current signal, it means that any
+			 * signal ignorance enabled by br_ptrace_donesig has
+			 * already taken place (the signal was consumed).
+			 * By clearing donesig, we declare desire to ignore no
+			 * signals for accurate ptracing.
+			 */
+			lwpd->br_ptrace_donesig = 0;
+		}
+		return (0);
+	}
+
+	/*
+	 * We stash the signal on the LWP where our waitid_helper will find it
+	 * and enter the ptrace "signal-delivery-stop" condition.
+	 */
+	lwpd->br_ptrace_stopsig = lx_sig;
+	lwpd->br_ptrace_donesig = 0;
+	(void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * When we return, the signal may have been altered or suppressed.
+	 */
+	if (lwpd->br_ptrace_stopsig != lx_sig) {
+		int native_sig;
+		lx_sig = lwpd->br_ptrace_stopsig;
+
+		if (lx_sig >= LX_NSIG) {
+			lx_sig = 0;
+		}
+
+		/*
+		 * Translate signal from Linux signal number back to
+		 * an illumos native signal.
+		 */
+		if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig =
+		    ltos_signo[lx_sig]) < 1) {
+			/*
+			 * The signal is not deliverable.
+			 */
+			lwp->lwp_cursig = 0;
+			lwp->lwp_extsig = 0;
+			if (lwp->lwp_curinfo) {
+				siginfofree(lwp->lwp_curinfo);
+				lwp->lwp_curinfo = NULL;
+			}
+		} else {
+			/*
+			 * Alter the currently dispatching signal.
+			 */
+			if (native_sig == SIGKILL) {
+				/*
+				 * We mark ourselves the victim and request
+				 * a restart of signal processing.
+				 */
+				p->p_flag |= SKILLED;
+				p->p_flag &= ~SEXTKILLED;
+				return (-1);
+			}
+			lwp->lwp_cursig = native_sig;
+			lwp->lwp_extsig = 0;
+			if (lwp->lwp_curinfo != NULL) {
+				lwp->lwp_curinfo->sq_info.si_signo = native_sig;
+			}
+		}
+	}
+
+	lwpd->br_ptrace_donesig = lwp->lwp_cursig;
+	lwpd->br_ptrace_stopsig = 0;
+	return (0);
+}
+
+/*
+ * Decide whether the normal "ignore this signal" logic may be applied to
+ * "sig" for the given process and (optional) LWP.
+ *
+ * A tracer wants to see every signal sent to its tracee, so while a process
+ * has traced LWPs we report B_FALSE for any signal which maps to a valid
+ * Linux signal.  That keeps the signal in the tracee's sigqueue, where the
+ * tracer can inspect and possibly alter it.
+ *
+ * Once the tracer is done with the event, the signal must be ignorable again
+ * if it would have been ignored in the first place.  The most recent signal
+ * consumed by ptrace is recorded in br_ptrace_donesig, and a match against it
+ * makes us report B_TRUE.
+ */
+boolean_t
+lx_ptrace_sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+	lx_proc_data_t *procdata = ptolxproc(p);
+
+	/*
+	 * Nothing in this process is traced, or the signal has no Linux
+	 * counterpart: ptrace has no interest in it.
+	 */
+	if (procdata->l_ptrace == 0 || lx_stol_signo(sig, 0) == 0) {
+		return (B_TRUE);
+	}
+
+	/*
+	 * The process is being ptraced.  Without an LWP to consult there is
+	 * no record of a processed signal, so ignorance is bypassed.
+	 */
+	if (lwp == NULL) {
+		return (B_FALSE);
+	}
+
+	/*
+	 * Bypass ignorance unless this is the signal which the tracer has
+	 * already processed.
+	 */
+	if (lwptolxlwp(lwp)->br_ptrace_donesig == sig) {
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+static void
+lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd,
+    lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	lx_ptrace_accord_enter(accord);
+	/*
+	 * We are an exiting tracer LWP ("lwpd", within process "p") and
+	 * "accord" is the accord through which we traced other LWPs.  The
+	 * caller must not hold p_lock: this routine takes the p_lock of each
+	 * tracee process in turn, and finally our own.
+	 *
+	 * Mark this accord for death.  This means no new tracees can be
+	 * attached to this accord.
+	 */
+	VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE);
+	accord->lxpa_flags |= LX_ACC_TOMBSTONE;
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Walk the list of tracees, detaching them and setting them runnable
+	 * if they are stopped.  Each pass handles the current head of the
+	 * list, and the loop ends once the list is empty.
+	 */
+	for (;;) {
+		klwp_t *rlwp;
+		proc_t *rproc;
+		lx_lwp_data_t *remote;
+		kmutex_t *rmp;
+
+		mutex_enter(&accord->lxpa_tracees_lock);
+		if (list_is_empty(&accord->lxpa_tracees)) {
+			mutex_exit(&accord->lxpa_tracees_lock);
+			break;
+		}
+
+		/*
+		 * Fetch the first tracee LWP in the list and lock the process
+		 * which contains it.
+		 */
+		remote = list_head(&accord->lxpa_tracees);
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+		/*
+		 * The p_lock mutex persists beyond the life of the process
+		 * itself.  We save the address, here, to prevent the need to
+		 * dereference the proc_t after awaking from sleep.
+		 */
+		rmp = &rproc->p_lock;
+		mutex_enter(rmp);
+
+		if (TRACEE_BUSY(remote)) {
+			/*
+			 * This LWP is currently detaching itself on exit, or
+			 * mid-way through stop().  We must wait for this
+			 * action to be completed.  While we wait on the CV, we
+			 * must drop the accord tracee list lock.
+			 */
+			mutex_exit(&accord->lxpa_tracees_lock);
+			cv_wait(&lx_ptrace_busy_cv, rmp);
+
+			/*
+			 * While we were waiting, some state may have changed.
+			 * Restart the walk to be sure we don't miss anything.
+			 */
+			mutex_exit(rmp);
+			continue;
+		}
+
+		/*
+		 * We now hold p_lock on the process.  Remove the tracee from
+		 * the list.
+		 */
+		VERIFY(list_link_active(&remote->br_ptrace_linkage));
+		list_remove(&accord->lxpa_tracees, remote);
+
+		/*
+		 * Unlink the accord and clear our trace flags.
+		 */
+		remote->br_ptrace_attach = LX_PTA_NONE;
+		remote->br_ptrace_tracer = NULL;
+		remote->br_ptrace_flags = 0;
+
+		/*
+		 * Let go of the list lock before we restart the LWP.  We must
+		 * not hold any locks other than the process p_lock when
+		 * we call lx_ptrace_restart_lwp() as it will thread_lock
+		 * the tracee.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * Decrement traced-lwp count for the remote process.  The
+		 * decrement lives inside VERIFY(), which also checks that the
+		 * count was at least one beforehand.
+		 */
+		VERIFY(ptolxproc(rproc)->l_ptrace-- >= 1);
+
+		/*
+		 * Ensure that the LWP is not stopped on our account.
+		 */
+		lx_ptrace_restart_lwp(rlwp);
+
+		/*
+		 * Unlock the former tracee.
+		 */
+		mutex_exit(rmp);
+
+		/*
+		 * Drop the hold this tracee had on the accord.
+		 */
+		lx_ptrace_accord_enter(accord);
+		lx_ptrace_accord_rele(accord);
+		lx_ptrace_accord_exit(accord);
+	}
+
+	mutex_enter(&p->p_lock);
+	lwpd->br_ptrace_accord = NULL;
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Clean up and release our hold on the accord.  If we completely
+	 * detached all tracee LWPs, this will free the accord.  Otherwise, it
+	 * will be freed when they complete their cleanup.
+	 *
+	 * We hold "pidlock" while clearing these members for easy exclusion of
+	 * waitid(), etc.
+	 */
+	mutex_enter(&pidlock);
+	lx_ptrace_accord_enter(accord);
+	accord->lxpa_cvp = NULL;
+	accord->lxpa_tracer = NULL;
+	mutex_exit(&pidlock);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+static void
+lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd,
+    lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * We are the tracee LWP ("lwpd", within process "p"), detaching
+	 * ourselves from the tracer's "accord" as we exit.  The caller must
+	 * not hold p_lock.
+	 *
+	 * Lock the accord tracee list and then our containing process.
+	 */
+	mutex_enter(&accord->lxpa_tracees_lock);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * Remove our reference to the accord.  We will release our hold
+	 * later.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == accord);
+	lwpd->br_ptrace_attach = LX_PTA_NONE;
+	lwpd->br_ptrace_tracer = NULL;
+
+	/*
+	 * Remove this LWP from the accord tracee list:
+	 */
+	VERIFY(list_link_active(&lwpd->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, lwpd);
+
+	/*
+	 * Wake up any tracers waiting for us to detach from the accord.
+	 * (An exiting tracer sleeps on this CV in lx_ptrace_exit_tracer()
+	 * when it finds a busy tracee.)
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+
+	/*
+	 * Decrement traced-lwp count for the process.  The decrement lives
+	 * inside VERIFY(), which also checks that the count was at least one
+	 * beforehand.
+	 */
+	VERIFY(ptolxproc(p)->l_ptrace-- >= 1);
+
+	mutex_exit(&p->p_lock);
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Grab "pidlock" and wake the tracer if it is blocked in waitid().
+	 * lxpa_cvp is NULL once the tracer itself has exited, in which case
+	 * there is nobody to wake.
+	 */
+	mutex_enter(&pidlock);
+	if (accord->lxpa_cvp != NULL) {
+		cv_broadcast(accord->lxpa_cvp);
+	}
+	mutex_exit(&pidlock);
+
+	/*
+	 * Release our hold on the accord.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+/*
+ * ptrace(2) teardown for an exiting LWP, called from lx_exitlwp().
+ *
+ * An LWP can play two ptrace roles at once, and both are unwound here: if it
+ * is a tracee it is detached from its tracer's accord, and if it is a tracer
+ * every LWP it was tracing is detached and its own accord is cleaned up.
+ *
+ * Called with p_lock held.  The lock is dropped around each of the two
+ * detach steps and is held again on return.
+ */
+void
+lx_ptrace_exit(proc_t *p, klwp_t *lwp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *tracer_accord;
+	lx_ptrace_accord_t *own_accord;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * Flag the LWP as exiting as far as ptrace is concerned.  From here
+	 * on no new accord will be allocated for it, and it is invisible to
+	 * PTRACE_ATTACH/PTRACE_TRACEME.
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+	lwpd->br_ptrace_flags |= LX_PTF_EXITING;
+
+	/*
+	 * Tracee role: another LWP is tracing us, so detach from its accord.
+	 */
+	tracer_accord = lwpd->br_ptrace_tracer;
+	if (tracer_accord != NULL) {
+		mutex_exit(&p->p_lock);
+		lx_ptrace_exit_tracee(p, lwpd, tracer_accord);
+		mutex_enter(&p->p_lock);
+	}
+
+	/*
+	 * Tracer role: we own an accord, so release everything we were
+	 * tracing and dispose of it.  The pointer is read only now, after
+	 * p_lock has been re-taken.
+	 */
+	own_accord = lwpd->br_ptrace_accord;
+	if (own_accord != NULL) {
+		mutex_exit(&p->p_lock);
+		lx_ptrace_exit_tracer(p, lwpd, own_accord);
+		mutex_enter(&p->p_lock);
+	}
+}
+
+/*
+ * Called when a SIGCLD signal is dispatched so that we may enqueue another.
+ * Return 0 if we enqueued a signal, or -1 if not.
+ *
+ * The caller must hold pidlock, must not hold pp->p_lock, and must be running
+ * on an LWP that belongs to "pp".  On success, "sqp" has been filled in for
+ * one stopped tracee and queued with sigaddqa(); whenever -1 is returned,
+ * "sqp" has not been queued.
+ */
+int
+lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+	lx_lwp_data_t *remote;
+	klwp_t *rlwp;
+	proc_t *rproc;
+	boolean_t found = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&pidlock));
+	VERIFY(MUTEX_NOT_HELD(&pp->p_lock));
+	VERIFY(lwptoproc(lwp) == pp);
+
+	mutex_enter(&pp->p_lock);
+	if ((accord = lwpd->br_ptrace_accord) == NULL) {
+		/*
+		 * This LWP is not a tracer LWP, so there will be no
+		 * SIGCLD.
+		 */
+		mutex_exit(&pp->p_lock);
+		return (-1);
+	}
+	mutex_exit(&pp->p_lock);
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+
+		/*
+		 * Check if this LWP is in "ptrace-stop".  If in the correct
+		 * stop condition, lock the process containing the tracee LWP.
+		 * A non-zero return means the tracee was not locked, so it is
+		 * skipped without unlocking anything.
+		 */
+		if (lx_ptrace_lock_if_stopped(accord, remote) != 0) {
+			continue;
+		}
+
+		if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+			/*
+			 * This event depends on waitid() clearing out the
+			 * event of another LWP.  Skip it for now.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_CLDPEND)) {
+			/*
+			 * No SIGCLD is required for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+		    remote->br_ptrace_whystop == 0 ||
+		    remote->br_ptrace_whatstop == 0) {
+			/*
+			 * No (new) stop reason to post for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		/*
+		 * We found a process of interest.  Leave the process
+		 * containing the tracee LWP locked and break out of the loop.
+		 */
+		found = B_TRUE;
+		break;
+	}
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	if (!found) {
+		return (-1);
+	}
+
+	/*
+	 * Generate siginfo for this tracee LWP.  The tracee's p_lock is still
+	 * held from the loop above; CLDPEND is cleared before it is dropped.
+	 */
+	lx_ptrace_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL);
+	remote->br_ptrace_flags &= ~LX_PTF_CLDPEND;
+	mutex_exit(&rproc->p_lock);
+
+	mutex_enter(&pp->p_lock);
+	if (sigismember(&pp->p_sig, SIGCLD)) {
+		mutex_exit(&pp->p_lock);
+
+		mutex_enter(&rproc->p_lock);
+		remote->br_ptrace_flags |= LX_PTF_CLDPEND;
+		mutex_exit(&rproc->p_lock);
+
+		return (-1);
+	}
+	sigaddqa(pp, curthread, sqp);
+	mutex_exit(&pp->p_lock);
+
+	return (0);
+}
+
+/*
+ * Consume the next available ptrace(2) event queued against the accord for
+ * this LWP.  The event will be emitted as if through waitid(), and converted
+ * by lx_waitpid() and friends before the return to usermode.
+ *
+ * Must be called with pidlock held and without p_lock held for the calling
+ * process.  Only P_ALL, P_PID and P_PGID waits are serviced, and only when
+ * the calling LWP has br_waitid_emulate set.
+ *
+ * Returns 0 (after storing 0 in *rval) when an event was found and reported
+ * through "ip".  Returns -1 when there is nothing to report; in that case
+ * *brand_wants_wait is B_TRUE only if at least one tracee matched idtype/id
+ * but had no event ready, and B_FALSE otherwise.
+ */
+int
+lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
+    boolean_t *brand_wants_wait, int *rval)
+{
+	lx_ptrace_accord_t *accord;
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *local = lwptolxlwp(lwp);
+	lx_lwp_data_t *remote;
+	boolean_t found = B_FALSE;
+	klwp_t *rlwp = NULL;
+	proc_t *rproc = NULL;
+	pid_t event_pid = 0, event_ppid = 0;
+	boolean_t waitflag = !(options & WNOWAIT);
+	boolean_t target_found = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&pidlock));
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * By default, we do not expect waitid() to block on our account.
+	 */
+	*brand_wants_wait = B_FALSE;
+
+	if (!local->br_waitid_emulate) {
+		/*
+		 * This waitid() call is not expecting emulated results.
+		 */
+		return (-1);
+	}
+
+	switch (idtype) {
+	case P_ALL:
+	case P_PID:
+	case P_PGID:
+		break;
+	default:
+		/*
+		 * This idtype has no power here.
+		 */
+		return (-1);
+	}
+
+	if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) {
+		/*
+		 * This LWP does not have an accord; it cannot be tracing.
+		 */
+		return (-1);
+	}
+
+	/*
+	 * We do not need an additional hold on the accord as it belongs to
+	 * the running, tracer, LWP.
+	 */
+	lx_ptrace_accord_exit(accord);
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	if (list_is_empty(&accord->lxpa_tracees)) {
+		/*
+		 * Though it has an accord, there are currently no tracees in
+		 * the list for this LWP.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+		return (-1);
+	}
+
+	/*
+	 * Walk the list of tracees and determine if any of them have events to
+	 * report.
+	 */
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+
+		/*
+		 * We check to see if this LWP matches an id we are waiting for.
+		 * Any other idtype was already rejected above, so reaching
+		 * the default case is treated as a fatal error.
+		 */
+		switch (idtype) {
+		case P_ALL:
+			break;
+		case P_PID:
+			if (remote->br_pid != id)
+				continue;
+			break;
+		case P_PGID:
+			if (rproc->p_pgrp != id)
+				continue;
+			break;
+		default:
+			cmn_err(CE_PANIC, "unexpected idtype: %d", idtype);
+		}
+
+		/* This tracee matches provided idtype and id */
+		target_found = B_TRUE;
+
+		/*
+		 * Check if this LWP is in "ptrace-stop".  If in the correct
+		 * stop condition, lock the process containing the tracee LWP.
+		 * A non-zero return means the tracee was not locked, so it is
+		 * skipped without unlocking anything.
+		 */
+		if (lx_ptrace_lock_if_stopped(accord, remote) != 0) {
+			continue;
+		}
+
+		if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+			/*
+			 * This event depends on waitid() clearing out the
+			 * event of another LWP.  Skip it for now.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+		    remote->br_ptrace_whystop == 0 ||
+		    remote->br_ptrace_whatstop == 0) {
+			/*
+			 * No (new) stop reason to post for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		/*
+		 * We found a process of interest.  Leave the process
+		 * containing the tracee LWP locked and break out of the loop.
+		 */
+		found = B_TRUE;
+		break;
+	}
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	if (!found) {
+		/*
+		 * There were no events of interest, but we have tracees.
+		 * If any of the tracees matched the specified criteria, signal
+		 * to waitid() that it should block if the provided flags allow
+		 * for it.
+		 */
+		if (target_found) {
+			*brand_wants_wait = B_TRUE;
+		}
+
+		return (-1);
+	}
+
+	/*
+	 * Populate the signal information.  "waitflag" is false when the
+	 * caller passed WNOWAIT.  event_ppid and event_pid stay zero unless
+	 * lx_ptrace_winfo() stores values through the pointers passed here.
+	 */
+	lx_ptrace_winfo(remote, ip, waitflag, &event_ppid, &event_pid);
+
+	/*
+	 * Unlock the tracee.
+	 */
+	mutex_exit(&rproc->p_lock);
+
+	if (event_pid != 0 && event_ppid != 0) {
+		/*
+		 * We need to do another pass around the tracee list and
+		 * unblock any events that have a "happens after" relationship
+		 * with this event.  That is done by clearing
+		 * LX_PTF_PARENT_WAIT on every tracee whose br_pid and br_ppid
+		 * match the reported event.
+		 */
+		mutex_enter(&accord->lxpa_tracees_lock);
+		for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+		    remote = list_next(&accord->lxpa_tracees, remote)) {
+			rlwp = remote->br_lwp;
+			rproc = lwptoproc(rlwp);
+
+			mutex_enter(&rproc->p_lock);
+
+			if (remote->br_pid != event_pid ||
+			    remote->br_ppid != event_ppid) {
+				mutex_exit(&rproc->p_lock);
+				continue;
+			}
+
+			remote->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+			mutex_exit(&rproc->p_lock);
+		}
+		mutex_exit(&accord->lxpa_tracees_lock);
+	}
+
+	/*
+	 * If we are consuming this wait state, we remove the SIGCLD from
+	 * the queue and post another.  pidlock is dropped around those two
+	 * calls and re-taken before returning to the caller.
+	 */
+	if (waitflag) {
+		mutex_exit(&pidlock);
+		sigcld_delete(ip);
+		sigcld_repost();
+		mutex_enter(&pidlock);
+	}
+
+	*rval = 0;
+	return (0);
+}
+
+/*
+ * Read one word from the tracee's address space at "addr" and copy it out to
+ * the tracer's buffer at "data".
+ *
+ * The word is sizeof (long) bytes wide, or sizeof (uint32_t) on an _LP64
+ * kernel when get_udatamodel() reports a non-native data model.  "addr" must
+ * be aligned to that width.
+ *
+ * Entered and left with the tracee's p_lock held; the lock is released for
+ * the duration of the uread() call.
+ *
+ * Returns 0 on success, EINVAL for an unaligned address, EIO if the read
+ * fails, or EFAULT if the copyout fails.
+ */
+static int
+lx_ptrace_peek(lx_lwp_data_t *lwpd, uintptr_t addr, void *data)
+{
+	proc_t *tracee = lwptoproc(lwpd->br_lwp);
+	long word;
+	int width = sizeof (word);
+	int rderr;
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		width = sizeof (uint32_t);
+	}
+#endif
+
+	/* Reject any address that is not a multiple of the word width. */
+	if ((addr & (width - 1)) != 0) {
+		return (EINVAL);
+	}
+
+	mutex_exit(&tracee->p_lock);
+	rderr = uread(tracee, &word, width, addr);
+	mutex_enter(&tracee->p_lock);
+
+	if (rderr != 0) {
+		return (EIO);
+	}
+
+	return ((copyout(&word, data, width) != 0) ? EFAULT : 0);
+}
+
+/*
+ * Write the word "data" into the tracee's address space at "addr".
+ *
+ * The word is sizeof (uintptr_t) bytes wide, or sizeof (uint32_t) on an
+ * _LP64 kernel when get_udatamodel() reports a non-native data model.  "addr"
+ * must be aligned to that width.
+ *
+ * Entered and left with the tracee's p_lock held; the lock is released for
+ * the duration of the uwrite() call.
+ *
+ * Returns 0 on success, EINVAL for an unaligned address, or EIO if the write
+ * fails.
+ */
+static int
+lx_ptrace_poke(lx_lwp_data_t *lwpd, uintptr_t addr, uintptr_t data)
+{
+	proc_t *tracee = lwptoproc(lwpd->br_lwp);
+	int width = sizeof (data);
+	int wrerr;
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		width = sizeof (uint32_t);
+	}
+#endif
+
+	/* Reject any address that is not a multiple of the word width. */
+	if ((addr & (width - 1)) != 0) {
+		return (EINVAL);
+	}
+
+	mutex_exit(&tracee->p_lock);
+	wrerr = uwrite(tracee, &data, width, addr);
+	mutex_enter(&tracee->p_lock);
+
+	return ((wrerr != 0) ? EIO : 0);
+}
+
+/*
+ * PTRACE_KILL: post SIGKILL to the process which contains the tracee LWP.
+ * Always returns 0.
+ */
+static int
+lx_ptrace_kill(lx_lwp_data_t *lwpd)
+{
+	proc_t *victim = lwptoproc(lwpd->br_lwp);
+
+	sigtoproc(victim, NULL, SIGKILL);
+	return (0);
+}
+
+static int
+lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data)
+{
+	lx_lwp_data_t *local = ttolxlwp(curthread);
+	lx_ptrace_accord_t *accord;
+	lx_lwp_data_t *remote;
+	klwp_t *rlwp;
+	proc_t *rproc;
+	int error;
+	boolean_t found = B_FALSE;
+
+	/*
+	 * This is the common implementation behind lx_ptrace().  It returns
+	 * 0 on success or an errno value, which lx_ptrace() hands to
+	 * set_errno().
+	 *
+	 * PTRACE_TRACEME and PTRACE_ATTACH operations induce the tracing of
+	 * one LWP by another.  The target LWP must not be traced already.
+	 * Every other operation falls through and acts on an existing tracee
+	 * of the calling LWP.
+	 */
+	switch (ptrace_op) {
+	case LX_PTRACE_TRACEME:
+		return (lx_ptrace_traceme());
+
+	case LX_PTRACE_ATTACH:
+		return (lx_ptrace_attach(lxpid));
+	}
+
+	/*
+	 * Ensure that we have an accord and obtain a lock on it.  This routine
+	 * should not fail because the LWP cannot make ptrace(2) system calls
+	 * after it has begun exiting.
+	 */
+	VERIFY0(local->br_ptrace_flags & LX_PTF_EXITING);
+	VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0);
+
+	/*
+	 * The accord belongs to this (the tracer) LWP, and we have a hold on
+	 * it.  We drop the lock so that we can take other locks.
+	 */
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Does the tracee list contain the pid in question?
+	 */
+	mutex_enter(&accord->lxpa_tracees_lock);
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		if (remote->br_pid == lxpid) {
+			found = B_TRUE;
+			break;
+		}
+	}
+	if (!found) {
+		/*
+		 * The requested pid does not appear in the tracee list.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+		return (ESRCH);
+	}
+
+	/*
+	 * Attempt to lock the target LWP.
+	 */
+	if ((error = lx_ptrace_lock_if_stopped(accord, remote)) != 0) {
+		/*
+		 * The LWP was not in "ptrace-stop".
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+		return (error);
+	}
+
+	/*
+	 * The target LWP is in "ptrace-stop".  We have the containing process
+	 * locked.
+	 */
+	rlwp = remote->br_lwp;
+	rproc = lwptoproc(rlwp);
+
+
+	if (ptrace_op == LX_PTRACE_DETACH) {
+		boolean_t release_hold = B_FALSE;
+		error = lx_ptrace_detach(accord, remote, (int)data,
+		    &release_hold);
+		/*
+		 * Drop the lock on both the tracee process and the tracee list.
+		 */
+		mutex_exit(&rproc->p_lock);
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		if (release_hold) {
+			/*
+			 * Release a hold from the accord.
+			 */
+			lx_ptrace_accord_enter(accord);
+			lx_ptrace_accord_rele(accord);
+			lx_ptrace_accord_exit(accord);
+		}
+
+		return (error);
+	}
+
+	/*
+	 * The tracees lock is not needed for any of the other operations.
+	 * Drop it so further actions can avoid deadlock.
+	 */
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Process the ptrace(2) request.  The tracee's p_lock is held across
+	 * every handler call below; an unrecognised request yields EINVAL.
+	 */
+	switch (ptrace_op) {
+	case LX_PTRACE_CONT:
+		error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data);
+		break;
+
+	case LX_PTRACE_SYSCALL:
+		error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data);
+		break;
+
+	case LX_PTRACE_SINGLESTEP:
+		error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data);
+		break;
+
+	case LX_PTRACE_SETOPTIONS:
+		error = lx_ptrace_setoptions(remote, data);
+		break;
+
+	case LX_PTRACE_GETEVENTMSG:
+		error = lx_ptrace_geteventmsg(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_GETREGS:
+		error = lx_user_regs_copyout(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_SETREGS:
+		error = lx_user_regs_copyin(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_GETSIGINFO:
+		error = lx_ptrace_getsiginfo(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_PEEKTEXT:
+	case LX_PTRACE_PEEKDATA:
+		error = lx_ptrace_peek(remote, addr, (void *)data);
+		break;
+
+	case LX_PTRACE_POKETEXT:
+	case LX_PTRACE_POKEDATA:
+		error = lx_ptrace_poke(remote, addr, data);
+		break;
+
+	case LX_PTRACE_PEEKUSER:
+		error = lx_ptrace_peekuser(remote, addr, (void *)data);
+		break;
+
+	case LX_PTRACE_POKEUSER:
+		error = lx_ptrace_pokeuser(remote, addr, (void *)data);
+		break;
+
+	case LX_PTRACE_GETFPREGS:
+		error = lx_user_fpregs_copyout(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_SETFPREGS:
+		error = lx_user_fpregs_copyin(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_GETFPXREGS:
+		error = lx_user_fpxregs_copyout(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_SETFPXREGS:
+		error = lx_user_fpxregs_copyin(remote, (void *)data);
+		break;
+
+	case LX_PTRACE_KILL:
+		error = lx_ptrace_kill(remote);
+		break;
+
+	default:
+		error = EINVAL;
+	}
+
+	/*
+	 * Drop the lock on the tracee process.  (The tracee list lock was
+	 * already released before the request was processed.)
+	 */
+	mutex_exit(&rproc->p_lock);
+
+	return (error);
+}
+
+/*
+ * Entry point for the emulated ptrace(2) call.  The work is done by
+ * lx_ptrace_kernel(); a non-zero result from it is passed to set_errno() and
+ * that call's return value is returned, otherwise 0 is returned.
+ */
+int
+lx_ptrace(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data)
+{
+	int err = lx_ptrace_kernel(ptrace_op, lxpid, addr, data);
+
+	return ((err == 0) ? 0 : set_errno(err));
+}
+
+/*
+ * Set up the global ptrace emulation state: the condition variable used to
+ * wait for busy tracees, and the kmem cache from which accords are allocated.
+ * Undone by lx_ptrace_fini().
+ */
+void
+lx_ptrace_init(void)
+{
+	const size_t accord_size = sizeof (lx_ptrace_accord_t);
+
+	cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL);
+
+	/* No alignment, constructor, destructor or reclaim hooks are used. */
+	lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord",
+	    accord_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Tear down the global state created by lx_ptrace_init(): first the busy
+ * condition variable, then the accord kmem cache.
+ */
+void
+lx_ptrace_fini(void)
+{
+	/* Condition variable on which tracers wait for busy tracees. */
+	cv_destroy(&lx_ptrace_busy_cv);
+
+	/* Cache backing every lx_ptrace_accord_t. */
+	kmem_cache_destroy(lx_ptrace_accord_cache);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_signal.c b/usr/src/uts/common/brand/lx/os/lx_signal.c
new file mode 100644
index 0000000000..53e0cecc14
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_signal.c
@@ -0,0 +1,50 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/signal.h>
+#include <sys/sunddi.h>
+#include <lx_signum.h>
+
+/*
+ * Convert the Linux signal set "lsigp" into the native kernel signal set
+ * "ssigp".
+ *
+ * Each Linux signal from 1 through LX_NSIG that is a member of "lsigp" is
+ * translated through ltos_signo[]; members whose translation is not a
+ * positive number are dropped.  The result is then masked the way the
+ * original comment describes as emulating sigutok(): bits outside FILLSETn,
+ * or inside CANTMASKn, are cleared.
+ */
+void
+lx_ltos_sigset(lx_sigset_t *lsigp, k_sigset_t *ssigp)
+{
+	int lnum;
+
+	sigemptyset(ssigp);
+
+	for (lnum = 1; lnum <= LX_NSIG; lnum++) {
+		int native;
+
+		if (!lx_sigismember(lsigp, lnum)) {
+			continue;
+		}
+
+		native = ltos_signo[lnum];
+		if (native > 0) {
+			sigaddset(ssigp, native);
+		}
+	}
+
+	/* Emulate sigutok() restrictions */
+	ssigp->__sigbits[0] &= (FILLSET0 & ~CANTMASK0);
+	ssigp->__sigbits[1] &= (FILLSET1 & ~CANTMASK1);
+	ssigp->__sigbits[2] &= (FILLSET2 & ~CANTMASK2);
+}
+
+/*
+ * Convert the native kernel signal set "ssigp" into the Linux signal set
+ * "lsigp".
+ *
+ * "lsigp" is zeroed first.  Each native signal from 1 through NSIG - 1 that
+ * is a member of "ssigp" is translated through stol_signo[]; members whose
+ * translation is not a positive number are dropped.
+ */
+void
+lx_stol_sigset(k_sigset_t *ssigp, lx_sigset_t *lsigp)
+{
+	int native;
+
+	bzero(lsigp, sizeof (lx_sigset_t));
+
+	for (native = 1; native < NSIG; native++) {
+		int lnum;
+
+		if (!sigismember(ssigp, native)) {
+			continue;
+		}
+
+		lnum = stol_signo[native];
+		if (lnum > 0) {
+			lx_sigaddset(lsigp, lnum);
+		}
+	}
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c
new file mode 100644
index 0000000000..f48b043aa3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c
@@ -0,0 +1,1316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+#include <sys/syscall.h>
+#include <sys/proc.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/model.h>
+#include <sys/privregs.h>
+#include <sys/brand.h>
+#include <sys/machbrand.h>
+#include <sys/sdt.h>
+#include <sys/lx_syscalls.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_errno.h>
+
+
+/*
+ * Flags for sysent entries:
+ */
+#define	LX_SYS_NOSYS_REASON	0x07
+#define	LX_SYS_EBPARG6		0x08
+
+/*
+ * Flags that denote the specific reason we do not have a particular system
+ * call.  These reasons are only valid if the function is NULL.
+ */
+#define	NOSYS_USERMODE		0
+#define	NOSYS_NULL		1
+#define	NOSYS_NONE		2
+#define	NOSYS_NO_EQUIV		3
+#define	NOSYS_KERNEL		4
+#define	NOSYS_UNDOC		5
+#define	NOSYS_OBSOLETE		6
+#define	NOSYS_MAX		NOSYS_OBSOLETE
+
+#if NOSYS_MAX > LX_SYS_NOSYS_REASON
+#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON
+#endif
+
+/*
+ * Strings describing the reason we do not emulate a particular system call
+ * in the kernel.  The array is indexed by NOSYS_* reason code (see
+ * lx_syscall_unsup_msg()), so the entries must stay in the same order as
+ * those definitions.
+ */
+static char *nosys_reasons[] = {
+	NULL, /* NOSYS_USERMODE means this call is emulated in usermode */
+	"Not done yet",					/* NOSYS_NULL */
+	"No such Linux system call",			/* NOSYS_NONE */
+	"No equivalent illumos functionality",		/* NOSYS_NO_EQUIV */
+	"Reads/modifies Linux kernel state",		/* NOSYS_KERNEL */
+	"Undocumented and/or rarely used system call",	/* NOSYS_UNDOC */
+	"Unsupported, obsolete system call"		/* NOSYS_OBSOLETE */
+};
+
+
+#if defined(_LP64)
+/*
+ * System call handler table and entry count for Linux x86_64 (amd64).  These
+ * exist only on _LP64 kernels.  Neither the table (LX_NSYSCALLS + 1 entries)
+ * nor the count is given an initializer here.
+ */
+lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+int lx_nsysent64;
+#endif
+/*
+ * System call handler table and entry count for Linux x86 (i386).  Neither
+ * the table (LX_NSYSCALLS + 1 entries) nor the count is given an initializer
+ * here.
+ */
+lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+int lx_nsysent32;
+
+#if defined(_LP64)
+struct lx_vsyscall
+{
+	uintptr_t lv_addr;	/* initialized from an LX_VSYS_* constant */
+	uintptr_t lv_scnum;	/* the LX_SYS_* constant paired with lv_addr */
+} lx_vsyscalls[] = {
+	{ LX_VSYS_gettimeofday, LX_SYS_gettimeofday },
+	{ LX_VSYS_time, LX_SYS_time },
+	{ LX_VSYS_getcpu, LX_SYS_getcpu },
+	{ NULL, NULL }	/* NOTE(review): presumably the end-of-table marker */
+};
+#endif
+
+#if defined(__amd64)
+/*
+ * Collect the six system call arguments for "lwp" into "args" (amd64 kernel).
+ *
+ * Returns 0 on success.  If the arguments have to be fetched from user memory
+ * and that copyin fails, "args" is zeroed and the result of set_errno(EFAULT)
+ * is returned.
+ */
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
+{
+	struct regs *rp = lwptoregs(lwp);
+	uint32_t stashed[6];
+	int i;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		/*
+		 * 64-bit caller.  System call argument passing on amd64 is
+		 * not the same as function call argument passing: a function
+		 * receives its fourth argument in %rcx, but a system call
+		 * receives it in %r10.  The reason is that the syscall
+		 * instruction itself stores the %rip value in %rcx (and the
+		 * lower 32 bits of %rflags in %r11).
+		 *
+		 * Per Appendix A of the amd64 ABI (Linux conventions), a
+		 * system call has at most 6 arguments and none of them is
+		 * passed on the stack.
+		 */
+		args[0] = rp->r_rdi;
+		args[1] = rp->r_rsi;
+		args[2] = rp->r_rdx;
+		args[3] = rp->r_r10;
+		args[4] = rp->r_r8;
+		args[5] = rp->r_r9;
+		return (0);
+	}
+
+	/*
+	 * 32-bit caller.  Unless the call takes exactly six arguments and is
+	 * not flagged LX_SYS_EBPARG6 (sixth argument in %ebp), everything is
+	 * passed in registers.
+	 */
+	if (s->sy_narg != 6 || (s->sy_flags & LX_SYS_EBPARG6) != 0) {
+		args[0] = rp->r_rbx;
+		args[1] = rp->r_rcx;
+		args[2] = rp->r_rdx;
+		args[3] = rp->r_rsi;
+		args[4] = rp->r_rdi;
+		args[5] = rp->r_rbp;
+		return (0);
+	}
+
+	/*
+	 * Six-argument call: libc has stashed the arguments in memory at the
+	 * address held in %ebx, as 32-bit values.
+	 */
+	if (copyin((void *)rp->r_rbx, &stashed, sizeof (stashed)) != 0) {
+		/*
+		 * Zero the argument vector so the trace probe cannot expose
+		 * kernel memory.
+		 */
+		bzero(args, 6 * sizeof (uintptr_t));
+		return (set_errno(EFAULT));
+	}
+
+	/* Widen each 32-bit argument into its uintptr_t slot. */
+	for (i = 0; i < 6; i++) {
+		args[i] = stashed[i];
+	}
+
+	return (0);
+}
+
+#else	/* !__amd64 */
+
+/*
+ * Collect the six system call arguments for "lwp" into "args" (non-amd64
+ * kernel).
+ *
+ * Returns 0 on success.  If the arguments have to be fetched from user memory
+ * and that copyin fails, "args" is zeroed and the result of set_errno(EFAULT)
+ * is returned.
+ */
+static int
+lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args)
+{
+	struct regs *rp = lwptoregs(lwp);
+	const size_t argsz = 6 * sizeof (uintptr_t);
+
+	/*
+	 * Unless the call takes exactly six arguments and is not flagged
+	 * LX_SYS_EBPARG6 (sixth argument in %ebp), everything is passed in
+	 * registers.
+	 */
+	if (s->sy_narg != 6 || (s->sy_flags & LX_SYS_EBPARG6) != 0) {
+		args[0] = rp->r_ebx;
+		args[1] = rp->r_ecx;
+		args[2] = rp->r_edx;
+		args[3] = rp->r_esi;
+		args[4] = rp->r_edi;
+		args[5] = rp->r_ebp;
+		return (0);
+	}
+
+	/*
+	 * Six-argument call: libc has stashed the arguments in memory at the
+	 * address held in %ebx, so copy them in directly.
+	 */
+	if (copyin((void *)rp->r_ebx, args, argsz) != 0) {
+		/*
+		 * Zero the argument vector so the trace probe cannot expose
+		 * kernel memory.
+		 */
+		bzero(args, argsz);
+		return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Complete an emulated Linux system call for "lwp".
+ *
+ * "ret" is the handler's return value.  If lwp_errno is set, "ret" is
+ * replaced with the negated Linux errno.  The value is stored in the return
+ * register, the ptrace(2) syscall-exit-stop and the DTrace return probe are
+ * given their turn, lwp_errno is cleared, and the LWP is set up so that the
+ * registers are left as we arranged them.  Always returns 0.
+ */
+int
+lx_syscall_return(klwp_t *lwp, int syscall_num, long ret)
+{
+	lx_lwp_data_t *lxlwp = lwptolxlwp(lwp);
+	struct regs *regs = lwptoregs(lwp);
+	const int native_err = lwp->lwp_errno;
+
+	/*
+	 * Only a call that was interrupted keeps its restart flag.  For
+	 * anything else, clear it now, before lx_setcontext() can pass it to
+	 * usermode.
+	 */
+	if (native_err != EINTR) {
+		lxlwp->br_syscall_restart = B_FALSE;
+	}
+
+	/*
+	 * A failed call reports the negated Linux errno, translated from the
+	 * illumos value.
+	 */
+	if (native_err != 0) {
+		ret = -lx_errno(native_err, EINVAL);
+	}
+
+	/*
+	 * The result goes back through %eax for 32-bit Linux system calls
+	 * and through %rax for 64-bit ones.
+	 */
+	regs->r_r0 = ret;
+
+	/*
+	 * If PTRACE_SYSCALL asked for it, hold here for the ptrace(2)
+	 * "syscall-exit-stop" condition.  The tracer may modify the register
+	 * state while we are stopped.
+	 */
+	lx_ptrace_stop(LX_PR_SYSEXIT);
+
+	/*
+	 * Fire the DTrace "lx-syscall:::return" probe.
+	 */
+	lx_trace_sysreturn(syscall_num, ret);
+
+	/*
+	 * Reset errno for the next call.  "br_syscall_restart" and
+	 * "br_syscall_num" are deliberately left alone, since
+	 * "lx_savecontext()" may still use them in the signal delivery path.
+	 */
+	lwp->lwp_errno = 0;
+
+	lx_check_strict_failure(lxlwp);
+
+	/*
+	 * Take complete control of the registers on the way out of this
+	 * emulated Linux system call.
+	 */
+	lwp->lwp_eosys = JUSTRETURN;
+	curthread->t_post_sys = 1;
+	aston(curthread);
+
+	return (0);
+}
+
+/*
+ * Build a "NOSYS" message for a system call we do not handle and pass it to
+ * lx_unsupported().
+ *
+ * A NULL "s" means the system call number was out of bounds.  Otherwise
+ * "unsup_reason" must be a valid index into nosys_reasons[] (enforced with
+ * VERIFY), and the call is identified by name when the sysent entry has one,
+ * or by number when it does not.
+ */
+static void
+lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason)
+{
+	char msg[100];
+
+	if (s == NULL) {
+		(void) snprintf(msg, sizeof (msg), "NOSYS (%d): out of bounds",
+		    syscall_num);
+		lx_unsupported(msg);
+		return;
+	}
+
+	VERIFY(unsup_reason < (sizeof (nosys_reasons) /
+	    sizeof (*nosys_reasons)));
+
+	if (s->sy_name != NULL) {
+		(void) snprintf(msg, sizeof (msg), "NOSYS (%s): %s",
+		    s->sy_name, nosys_reasons[unsup_reason]);
+	} else {
+		(void) snprintf(msg, sizeof (msg), "NOSYS (%d): %s",
+		    syscall_num, nosys_reasons[unsup_reason]);
+	}
+
+	lx_unsupported(msg);
+}
+
+/*
+ * This function is used to override the processing of arguments and
+ * invocation of a handler for emulated system calls, installed on each
+ * branded LWP as "lwp_brand_syscall".  If this system call should use the
+ * native path, we return 1.  If we handled this system call (and have made
+ * arrangements with respect to post-return usermode register state) we
+ * return 0.
+ */
+int
+lx_syscall_enter(void)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int syscall_num;
+	int error;
+	long ret = 0;
+	lx_sysent_t *s;
+	uintptr_t args[6];
+	unsigned int unsup_reason;
+
+	/*
+	 * If we got here, we should have an LWP-specific brand data
+	 * structure.
+	 */
+	VERIFY(lwpd != NULL);
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/*
+		 * The lwp is not in BRAND execution mode, so we return
+		 * to the regular native system call path.
+		 */
+		DTRACE_PROBE(brand__lx__syscall__hook__skip);
+		return (1);
+	}
+
+	/*
+	 * Clear the restartable system call flag.  This flag will be set
+	 * in the system call handler if the call is a candidate for
+	 * a restart.  It will be saved by lx_setcontext() in the event
+	 * that we take a signal, and used in the signal handling path
+	 * to restart the system call iff SA_RESTART was set for this
+	 * signal.  Save the system call number so that we can store it
+	 * in the saved context if required.
+	 */
+	lwpd->br_syscall_restart = B_FALSE;
+	lwpd->br_syscall_num = (int)rp->r_r0;
+
+	/*
+	 * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by
+	 * PTRACE_SYSCALL.  The system call number and arguments may be
+	 * modified by the tracer.
+	 */
+	lx_ptrace_stop(LX_PR_SYSENTRY);
+
+	/*
+	 * Check that the system call number is within the bounds we expect.
+	 */
+	syscall_num = lwpd->br_syscall_num;
+	if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) {
+		lx_syscall_unsup_msg(NULL, syscall_num, 0);
+
+		(void) set_errno(ENOTSUP);
+		(void) lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+#if defined(_LP64)
+	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+		s = &lx_sysent64[syscall_num];
+	} else
+#endif
+	{
+		s = &lx_sysent32[syscall_num];
+	}
+
+	/*
+	 * Process the arguments for this system call and fire the DTrace
+	 * "lx-syscall:::entry" probe:
+	 */
+	error = lx_emulate_args(lwp, s, args);
+	lx_trace_sysenter(syscall_num, args);
+	if (error != 0) {
+		/*
+		 * Could not read and process the arguments.  Return the error
+		 * to the process.
+		 */
+		(void) set_errno(error);
+		(void) lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+	if (s->sy_callc != NULL) {
+		/*
+		 * Call the in-kernel handler for this Linux system call:
+		 */
+		lwpd->br_eosys = NORMALRETURN;
+		ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+		    args[5]);
+		if (lwpd->br_eosys == NORMALRETURN) {
+			(void) lx_syscall_return(lwp, syscall_num, ret);
+		}
+		return (0);
+	}
+
+	/*
+	 * There is no in-kernel handler.
+	 */
+	switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) {
+	case NOSYS_USERMODE:
+		/*
+		 * Pass to the usermode emulation routine.
+		 */
+#if defined(_LP64)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			lx_emulate_user32(lwp, syscall_num, args);
+		} else
+#endif
+		{
+			lx_emulate_user(lwp, syscall_num, args);
+		}
+		return (0);
+
+	default:
+		/*
+		 * We are not emulating this system call at all.
+		 */
+		lx_syscall_unsup_msg(s, syscall_num, unsup_reason);
+
+		(void) set_errno(ENOTSUP);
+		(void) lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+}
+
+#if defined(_LP64)
+/*
+ * Emulate vsyscall support.
+ *
+ * Linux magically maps a single page into the address space of each process,
+ * allowing them to make 'vsyscalls'.  Originally designed to counteract the
+ * perceived overhead of regular system calls, vsyscalls were implemented as
+ * code residing in userspace which could be called directly.  The userspace
+ * implementations of these vsyscalls have now been replaced by
+ * instructions which vector into the normal syscall path.
+ *
+ * Implementing vsyscalls on Illumos is complicated by the fact that the
+ * required static address region resides inside the kernel address space.
+ * Rather than mapping a user-accessible page into the KAS, a different
+ * approach is taken.  The vsyscall gate is emulated by interposing on
+ * pagefaults in trap().  An attempt to execute a known vsyscall address will
+ * result in emulating the appropriate system call rather than inducing a
+ * SIGSEGV.
+ */
+void
+lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum)
+{
+	struct regs *rp = lwptoregs(lwp);
+	uintptr_t raddr;
+
+	/* Fetch the return address from the process stack. */
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+	if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) {
+#if DEBUG
+		printf("lx_vsyscall_enter: bad brand stack at vsyscall "
+		    "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm,
+		    p->p_pid, (void *)rp->r_rsp);
+#endif
+
+		/*
+		 * The process jumped to the vsyscall address without a
+		 * correctly configured stack.  Terminate the process.
+		 */
+		exit(CLD_KILLED, SIGSEGV);
+		return;
+	}
+
+	DTRACE_PROBE1(brand__lx__vsyscall, int, scnum);
+
+	/*
+	 * Simulate vectoring into the syscall and pop the return address.
+	 */
+	rp->r_rax = scnum;
+	rp->r_rip = raddr;
+	rp->r_rsp += sizeof (uintptr_t);
+
+	(void) lx_syscall_enter();
+}
+
+/*
+ * Determine whether a fault at "addr" is a call to a known vsyscall entry
+ * point.  On a match, store its system call number in "scnum" and return
+ * B_TRUE; otherwise return B_FALSE and leave "scnum" untouched.
+ */
+boolean_t
+lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	int i;
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/* We only handle vsyscalls when running Linux code. */
+		return (B_FALSE);
+	}
+
+	if (addr < LX_VSYSCALL_ADDR ||
+	    addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) {
+		/* Ignore faults outside the vsyscall page. */
+		return (B_FALSE);
+	}
+
+	/* The table is terminated by an entry with a zero address. */
+	for (i = 0; lx_vsyscalls[i].lv_addr != 0; i++) {
+		if (addr == lx_vsyscalls[i].lv_addr) {
+			/* This is a valid vsyscall address. */
+			*scnum = lx_vsyscalls[i].lv_scnum;
+			return (B_TRUE);
+		}
+	}
+
+	lx_unsupported("bad vsyscall access");
+	return (B_FALSE);
+}
+#endif
+
+/*
+ * Fasttrap-like entry point for a small set of cheap emulated syscalls.  By
+ * skipping housekeeping such as mstate transitions, it should cut down on
+ * overhead for calls which would normally be fasttraps in a native process.
+ * Returns 0 if the call was handled here, or 1 to use the regular path.
+ */
+int
+lx_syscall_fast_enter(void)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int syscall_num, error;
+	lx_sysent_t *s;
+	uintptr_t args[6];
+	long ret = 0;
+
+	/* If we got here, we should have LWP-specific brand data. */
+	VERIFY(lwpd != NULL);
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/*
+		 * The lwp is not in BRAND execution mode, so we return to
+		 * the regular native system call path.
+		 */
+		DTRACE_PROBE(brand__lx__syscall__hook__skip);
+		return (1);
+	}
+	if (lwpd->br_ptrace_tracer != NULL) {
+		/*
+		 * Given that ptrace is the antithesis of "fast", return to the
+		 * regular system call path if we are being traced.
+		 */
+		return (1);
+	}
+
+	/* Only the syscalls listed below are handled on the fast path. */
+	syscall_num = (int)rp->r_r0;
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		switch (syscall_num) {
+		case LX_SYS32_gettimeofday:
+		case LX_SYS32_time:
+		case LX_SYS32_clock_gettime:
+		case LX_SYS32_getcpu:
+			s = &lx_sysent32[syscall_num];
+			break;
+		default:
+			return (1);
+		}
+	} else
+#endif
+	{
+		switch (syscall_num) {
+		case LX_SYS_gettimeofday:
+		case LX_SYS_time:
+		case LX_SYS_clock_gettime:
+		case LX_SYS_getcpu:
+#if defined(_LP64)
+			s = &lx_sysent64[syscall_num];
+#else
+			s = &lx_sysent32[syscall_num];
+#endif
+			break;
+		default:
+			return (1);
+		}
+	}
+
+	/*
+	 * The above syscall restrictions should ensure that we do not arrive
+	 * at this point without a suitable syscall planned.  Since the
+	 * lx_emulate_args routine can only fail for 6-arg syscalls, none of
+	 * which would be performed as a fasttrap, it is assumed to succeed.
+	 */
+	VERIFY(s->sy_callc != NULL);
+	VERIFY(s->sy_narg < 6);
+	(void) lx_emulate_args(lwp, s, args);
+	lx_trace_sysenter(syscall_num, args);
+	ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+	    args[5]);
+
+	/* Convert any illumos errno to a negated Linux errno and clear it. */
+	if ((error = lwp->lwp_errno) != 0) {
+		ret = -lx_errno(error, EINVAL);
+		lwp->lwp_errno = 0;
+	}
+	rp->r_r0 = ret;
+	lx_trace_sysreturn(syscall_num, ret);
+	lwp->lwp_eosys = JUSTRETURN;
+	return (0);
+}
+
+/*
+ * Linux defines system call numbers for 32-bit x86 in the file:
+ *   arch/x86/syscalls/syscall_32.tbl
+ * Each entry is: name, in-kernel handler (or NULL), flags, argument count.
+ * The array index of an entry is its Linux system call number.
+ */
+lx_sysent_t lx_sysent32[] = {
+	{"nosys",	NULL,			NOSYS_NONE,	0}, /*  0 */
+	{"exit",	NULL,			0,		1}, /*  1 */
+	{"fork",	NULL,			0,		0}, /*  2 */
+	{"read",	lx_read,		0,		3}, /*  3 */
+	{"write",	lx_write,		0,		3}, /*  4 */
+	{"open",	lx_open,		0,		3}, /*  5 */
+	{"close",	lx_close,		0,		1}, /*  6 */
+	{"waitpid",	lx_waitpid,		0,		3}, /*  7 */
+	{"creat",	NULL,			0,		2}, /*  8 */
+	{"link",	lx_link,		0,		2}, /*  9 */
+	{"unlink",	NULL,			0,		1}, /* 10 */
+	{"execve",	NULL,			0,		3}, /* 11 */
+	{"chdir",	NULL,			0,		1}, /* 12 */
+	{"time",	lx_time,		0,		1}, /* 13 */
+	{"mknod",	NULL,			0,		3}, /* 14 */
+	{"chmod",	lx_chmod,		0,		2}, /* 15 */
+	{"lchown16",	lx_lchown16,		0,		3}, /* 16 */
+	{"break",	NULL,			NOSYS_OBSOLETE,	0}, /* 17 */
+	{"stat",	NULL,			NOSYS_OBSOLETE,	0}, /* 18 */
+	{"lseek",	NULL,			0,		3}, /* 19 */
+	{"getpid",	lx_getpid,		0,		0}, /* 20 */
+	{"mount",	NULL,			0,		5}, /* 21 */
+	{"umount",	NULL,			0,		1}, /* 22 */
+	{"setuid16",	NULL,			0,		1}, /* 23 */
+	{"getuid16",	NULL,			0,		0}, /* 24 */
+	{"stime",	NULL,			0,		1}, /* 25 */
+	{"ptrace",	lx_ptrace,		0,		4}, /* 26 */
+	{"alarm",	NULL,			0,		1}, /* 27 */
+	{"fstat",	NULL,			NOSYS_OBSOLETE,	0}, /* 28 */
+	{"pause",	NULL,			0,		0}, /* 29 */
+	{"utime",	NULL,			0,		2}, /* 30 */
+	{"stty",	NULL,			NOSYS_OBSOLETE,	0}, /* 31 */
+	{"gtty",	NULL,			NOSYS_OBSOLETE,	0}, /* 32 */
+	{"access",	lx_access,		0,		2}, /* 33 */
+	{"nice",	NULL,			0,		1}, /* 34 */
+	{"ftime",	NULL,			NOSYS_OBSOLETE,	0}, /* 35 */
+	{"sync",	NULL,			0,		0}, /* 36 */
+	{"kill",	lx_kill,		0,		2}, /* 37 */
+	{"rename",	NULL,			0,		2}, /* 38 */
+	{"mkdir",	lx_mkdir,		0,		2}, /* 39 */
+	{"rmdir",	NULL,			0,		1}, /* 40 */
+	{"dup",		NULL,			0,		1}, /* 41 */
+	{"pipe",	lx_pipe,		0,		1}, /* 42 */
+	{"times",	NULL,			0,		1}, /* 43 */
+	{"prof",	NULL,			NOSYS_OBSOLETE,	0}, /* 44 */
+	{"brk",		lx_brk,			0,		1}, /* 45 */
+	{"setgid16",	NULL,			0,		1}, /* 46 */
+	{"getgid16",	NULL,			0,		0}, /* 47 */
+	{"signal",	NULL,			0,		2}, /* 48 */
+	{"geteuid16",	NULL,			0,		0}, /* 49 */
+	{"getegid16",	NULL,			0,		0}, /* 50 */
+	{"acct",	NULL,			NOSYS_NO_EQUIV,	0}, /* 51 */
+	{"umount2",	NULL,			0,		2}, /* 52 */
+	{"lock",	NULL,			NOSYS_OBSOLETE,	0}, /* 53 */
+	{"ioctl",	lx_ioctl,		0,		3}, /* 54 */
+	{"fcntl",	lx_fcntl,		0,		3}, /* 55 */
+	{"mpx",		NULL,			NOSYS_OBSOLETE,	0}, /* 56 */
+	{"setpgid",	NULL,			0,		2}, /* 57 */
+	{"ulimit",	NULL,			NOSYS_OBSOLETE,	0}, /* 58 */
+	{"olduname",	NULL,			NOSYS_OBSOLETE,	0}, /* 59 */
+	{"umask",	NULL,			0,		1}, /* 60 */
+	{"chroot",	NULL,			0,		1}, /* 61 */
+	{"ustat",	NULL,			NOSYS_OBSOLETE,	2}, /* 62 */
+	{"dup2",	NULL,			0,		2}, /* 63 */
+	{"getppid",	lx_getppid,		0,		0}, /* 64 */
+	{"getpgrp",	NULL,			0,		0}, /* 65 */
+	{"setsid",	NULL,			0,		0}, /* 66 */
+	{"sigaction",	NULL,			0,		3}, /* 67 */
+	{"sgetmask",	NULL,			NOSYS_OBSOLETE,	0}, /* 68 */
+	{"ssetmask",	NULL,			NOSYS_OBSOLETE,	0}, /* 69 */
+	{"setreuid16",	NULL,			0,		2}, /* 70 */
+	{"setregid16",	NULL,			0,		2}, /* 71 */
+	{"sigsuspend",	NULL,			0,		1}, /* 72 */
+	{"sigpending",	NULL,			0,		1}, /* 73 */
+	{"sethostname",	NULL,			0,		2}, /* 74 */
+	{"setrlimit",	lx_setrlimit,		0,		2}, /* 75 */
+	{"getrlimit",	lx_oldgetrlimit,	0,		2}, /* 76 */
+	{"getrusage",	NULL,			0,		2}, /* 77 */
+	{"gettimeofday", lx_gettimeofday,	0,		2}, /* 78 */
+	{"settimeofday", NULL,			0,		2}, /* 79 */
+	{"getgroups16",	NULL,			0,		2}, /* 80 */
+	{"setgroups16",	NULL,			0,		2}, /* 81 */
+	{"select",	NULL,			NOSYS_OBSOLETE,	0}, /* 82 */
+	{"symlink",	NULL,			0,		2}, /* 83 */
+	{"oldlstat",	NULL,			NOSYS_OBSOLETE,	0}, /* 84 */
+	{"readlink",	NULL,			0,		3}, /* 85 */
+	{"uselib",	NULL,			NOSYS_KERNEL,	0}, /* 86 */
+	{"swapon",	NULL,			NOSYS_KERNEL,	0}, /* 87 */
+	{"reboot",	NULL,			0,		4}, /* 88 */
+	{"readdir",	NULL,			0,		3}, /* 89 */
+	{"mmap",	NULL,			0,		6}, /* 90 */
+	{"munmap",	NULL,			0,		2}, /* 91 */
+	{"truncate",	NULL,			0,		2}, /* 92 */
+	{"ftruncate",	NULL,			0,		2}, /* 93 */
+	{"fchmod",	lx_fchmod,		0,		2}, /* 94 */
+	{"fchown16",	lx_fchown16,		0,		3}, /* 95 */
+	{"getpriority",	NULL,			0,		2}, /* 96 */
+	{"setpriority",	NULL,			0,		3}, /* 97 */
+	{"profil",	NULL,			NOSYS_NO_EQUIV,	0}, /* 98 */
+	{"statfs",	NULL,			0,		2}, /* 99 */
+	{"fstatfs",	NULL,			0,		2}, /* 100 */
+	{"ioperm",	NULL,			NOSYS_NO_EQUIV,	0}, /* 101 */
+	{"socketcall",	lx_socketcall,		0,		2}, /* 102 */
+	{"syslog",	NULL,			0,		3}, /* 103 */
+	{"setitimer",	NULL,			0,		3}, /* 104 */
+	{"getitimer",	NULL,			0,		2}, /* 105 */
+	{"stat",	lx_stat32,		0,		2}, /* 106 */
+	{"lstat",	lx_lstat32,		0,		2}, /* 107 */
+	{"fstat",	lx_fstat32,		0,		2}, /* 108 */
+	{"uname",	NULL,			NOSYS_OBSOLETE,	0}, /* 109 */
+	{"oldiopl",	NULL,			NOSYS_NO_EQUIV,	0}, /* 110 */
+	{"vhangup",	NULL,			0,		0}, /* 111 */
+	{"idle",	NULL,			NOSYS_NO_EQUIV,	0}, /* 112 */
+	{"vm86old",	NULL,			NOSYS_OBSOLETE,	0}, /* 113 */
+	{"wait4",	lx_wait4,		0,		4}, /* 114 */
+	{"swapoff",	NULL,			NOSYS_KERNEL,	0}, /* 115 */
+	{"sysinfo",	lx_sysinfo32,		0,		1}, /* 116 */
+	{"ipc",		NULL,			0,		5}, /* 117 */
+	{"fsync",	NULL,			0,		1}, /* 118 */
+	{"sigreturn",	NULL,			0,		1}, /* 119 */
+	{"clone",	NULL,			0,		5}, /* 120 */
+	{"setdomainname", NULL,			0,		2}, /* 121 */
+	{"uname",	lx_uname,		0,		1}, /* 122 */
+	{"modify_ldt",	lx_modify_ldt,		0,		3}, /* 123 */
+	{"adjtimex",	NULL,			0,		1}, /* 124 */
+	{"mprotect",	NULL,			0,		3}, /* 125 */
+	{"sigprocmask",	NULL,			0,		3}, /* 126 */
+	{"create_module", NULL,			NOSYS_KERNEL,	0}, /* 127 */
+	{"init_module",	NULL,			NOSYS_KERNEL,	0}, /* 128 */
+	{"delete_module", NULL,			NOSYS_KERNEL,	0}, /* 129 */
+	{"get_kernel_syms", NULL,		NOSYS_KERNEL,	0}, /* 130 */
+	{"quotactl",	NULL,			NOSYS_KERNEL,	0}, /* 131 */
+	{"getpgid",	NULL,			0,		1}, /* 132 */
+	{"fchdir",	NULL,			0,		1}, /* 133 */
+	{"bdflush",	NULL,			NOSYS_KERNEL,	0}, /* 134 */
+	{"sysfs",	NULL,			0,		3}, /* 135 */
+	{"personality",	lx_personality,		0,		1}, /* 136 */
+	{"afs_syscall",	NULL,			NOSYS_KERNEL,	0}, /* 137 */
+	{"setfsuid16",	NULL,			0,		1}, /* 138 */
+	{"setfsgid16",	NULL,			0,		1}, /* 139 */
+	{"llseek",	NULL,			0,		5}, /* 140 */
+	{"getdents",	lx_getdents_32,		0,		3}, /* 141 */
+	{"select",	lx_select,		0,		5}, /* 142 */
+	{"flock",	NULL,			0,		2}, /* 143 */
+	{"msync",	NULL,			0,		3}, /* 144 */
+	{"readv",	lx_readv,		0,		3}, /* 145 */
+	{"writev",	lx_writev,		0,		3}, /* 146 */
+	{"getsid",	NULL,			0,		1}, /* 147 */
+	{"fdatasync",	NULL,			0,		1}, /* 148 */
+	{"sysctl",	NULL,			0,		1}, /* 149 */
+	{"mlock",	NULL,			0,		2}, /* 150 */
+	{"munlock",	NULL,			0,		2}, /* 151 */
+	{"mlockall",	NULL,			0,		1}, /* 152 */
+	{"munlockall",	NULL,			0,		0}, /* 153 */
+	{"sched_setparam", NULL,		0,		2}, /* 154 */
+	{"sched_getparam", NULL,		0,		2}, /* 155 */
+	{"sched_setscheduler", NULL,		0,		3}, /* 156 */
+	{"sched_getscheduler", NULL,		0,		1}, /* 157 */
+	{"sched_yield",	lx_sched_yield,		0,		0}, /* 158 */
+	{"sched_get_priority_max", NULL,	0,		1}, /* 159 */
+	{"sched_get_priority_min", NULL,	0,		1}, /* 160 */
+	{"sched_rr_get_interval", NULL,	0,		2}, /* 161 */
+	{"nanosleep",	lx_nanosleep,		0,		2}, /* 162 */
+	{"mremap",	NULL,			0,		5}, /* 163 */
+	{"setresuid16",	lx_setresuid16,		0,		3}, /* 164 */
+	{"getresuid16",	NULL,			0,		3}, /* 165 */
+	{"vm86",	NULL,			NOSYS_NO_EQUIV,	0}, /* 166 */
+	{"query_module", NULL,			0,		5}, /* 167 */
+	{"poll",	lx_poll,		0,		3}, /* 168 */
+	{"nfsservctl",	NULL,			NOSYS_KERNEL,	0}, /* 169 */
+	{"setresgid16",	lx_setresgid16,		0,		3}, /* 170 */
+	{"getresgid16",	NULL,			0,		3}, /* 171 */
+	{"prctl",	lx_prctl,		0,		5}, /* 172 */
+	{"rt_sigreturn", NULL,			0,		0}, /* 173 */
+	{"rt_sigaction", NULL,			0,		4}, /* 174 */
+	{"rt_sigprocmask", NULL,		0,		4}, /* 175 */
+	{"rt_sigpending", NULL,			0,		2}, /* 176 */
+	{"rt_sigtimedwait", NULL,		0,		4}, /* 177 */
+	{"rt_sigqueueinfo", NULL,		0,		3}, /* 178 */
+	{"rt_sigsuspend", NULL,			0,		2}, /* 179 */
+	{"pread64",	lx_pread32,		0,		5}, /* 180 */
+	{"pwrite64",	lx_pwrite32,		0,		5}, /* 181 */
+	{"chown16",	lx_chown16,		0,		3}, /* 182 */
+	{"getcwd",	lx_getcwd,		0,		2}, /* 183 */
+	{"capget",	NULL,			0,		2}, /* 184 */
+	{"capset",	NULL,			0,		2}, /* 185 */
+	{"sigaltstack",	NULL,			0,		2}, /* 186 */
+	{"sendfile",	NULL,			0,		4}, /* 187 */
+	{"getpmsg",	NULL,			NOSYS_OBSOLETE,	0}, /* 188 */
+	{"putpmsg",	NULL,			NOSYS_OBSOLETE,	0}, /* 189 */
+	{"vfork",	NULL,			0,		0}, /* 190 */
+	{"getrlimit",	lx_getrlimit,		0,		2}, /* 191 */
+	{"mmap2",	NULL,			LX_SYS_EBPARG6,	6}, /* 192 */
+	{"truncate64",	NULL,			0,		3}, /* 193 */
+	{"ftruncate64",	NULL,			0,		3}, /* 194 */
+	{"stat64",	lx_stat64,		0,		2}, /* 195 */
+	{"lstat64",	lx_lstat64,		0,		2}, /* 196 */
+	{"fstat64",	lx_fstat64,		0,		2}, /* 197 */
+	{"lchown",	lx_lchown,		0,		3}, /* 198 */
+	{"getuid",	NULL,			0,		0}, /* 199 */
+	{"getgid",	NULL,			0,		0}, /* 200 */
+	{"geteuid",	NULL,			0,		0}, /* 201 */
+	{"getegid",	NULL,			0,		0}, /* 202 */
+	{"setreuid",	NULL,			0,		2}, /* 203 */
+	{"setregid",	NULL,			0,		2}, /* 204 */
+	{"getgroups",	NULL,			0,		2}, /* 205 */
+	{"setgroups",	NULL,			0,		2}, /* 206 */
+	{"fchown",	lx_fchown,		0,		3}, /* 207 */
+	{"setresuid",	lx_setresuid,		0,		3}, /* 208 */
+	{"getresuid",	NULL,			0,		3}, /* 209 */
+	{"setresgid",	lx_setresgid,		0,		3}, /* 210 */
+	{"getresgid",	NULL,			0,		3}, /* 211 */
+	{"chown",	lx_chown,		0,		3}, /* 212 */
+	{"setuid",	NULL,			0,		1}, /* 213 */
+	{"setgid",	NULL,			0,		1}, /* 214 */
+	{"setfsuid",	NULL,			0,		1}, /* 215 */
+	{"setfsgid",	NULL,			0,		1}, /* 216 */
+	{"pivot_root",	NULL,			NOSYS_KERNEL,	0}, /* 217 */
+	{"mincore",	NULL,			0,		3}, /* 218 */
+	{"madvise",	NULL,			0,		3}, /* 219 */
+	{"getdents64",	lx_getdents64,		0,		3}, /* 220 */
+	{"fcntl64",	lx_fcntl64,		0,		3}, /* 221 */
+	{"tux",		NULL,			NOSYS_NO_EQUIV,	0}, /* 222 */
+	{"security",	NULL,			NOSYS_NO_EQUIV,	0}, /* 223 */
+	{"gettid",	lx_gettid,		0,		0}, /* 224 */
+	{"readahead",	NULL,			NOSYS_NO_EQUIV,	0}, /* 225 */
+	{"setxattr",	lx_setxattr,		0,		5}, /* 226 */
+	{"lsetxattr",	lx_lsetxattr,		0,		5}, /* 227 */
+	{"fsetxattr",	lx_fsetxattr,		0,		5}, /* 228 */
+	{"getxattr",	lx_getxattr,		0,		4}, /* 229 */
+	{"lgetxattr",	lx_lgetxattr,		0,		4}, /* 230 */
+	{"fgetxattr",	lx_fgetxattr,		0,		4}, /* 231 */
+	{"listxattr",	lx_listxattr,		0,		3}, /* 232 */
+	{"llistxattr",	lx_llistxattr,		0,		3}, /* 233 */
+	{"flistxattr",	lx_flistxattr,		0,		3}, /* 234 */
+	{"removexattr",	lx_removexattr,		0,		2}, /* 235 */
+	{"lremovexattr", lx_lremovexattr,	0,		2}, /* 236 */
+	{"fremovexattr", lx_fremovexattr,	0,		2}, /* 237 */
+	{"tkill",	lx_tkill,		0,		2}, /* 238 */
+	{"sendfile64",	NULL,			0,		4}, /* 239 */
+	{"futex",	lx_futex,		LX_SYS_EBPARG6,	6}, /* 240 */
+	{"sched_setaffinity", NULL,		0,		3}, /* 241 */
+	{"sched_getaffinity", NULL,		0,		3}, /* 242 */
+	{"set_thread_area", lx_set_thread_area,	0,		1}, /* 243 */
+	{"get_thread_area", lx_get_thread_area,	0,		1}, /* 244 */
+	{"io_setup",	lx_io_setup,		0,		2}, /* 245 */
+	{"io_destroy",	NULL,			0,		1}, /* 246 */
+	{"io_getevents", NULL,			0,		5}, /* 247 */
+	{"io_submit",	NULL,			0,		3}, /* 248 */
+	{"io_cancel",	NULL,			0,		3}, /* 249 */
+	{"fadvise64",	NULL,			0,		4}, /* 250 */
+	{"nosys",	NULL,			0,		0}, /* 251 */
+	{"group_exit",	NULL,			0,		1}, /* 252 */
+	{"lookup_dcookie", NULL,		NOSYS_NO_EQUIV,	0}, /* 253 */
+	{"epoll_create", lx_epoll_create,	0,		1}, /* 254 */
+	{"epoll_ctl",	lx_epoll_ctl,		0,		4}, /* 255 */
+	{"epoll_wait",	lx_epoll_wait,		0,		4}, /* 256 */
+	{"remap_file_pages", NULL,		NOSYS_NO_EQUIV,	0}, /* 257 */
+	{"set_tid_address", lx_set_tid_address,	0,		1}, /* 258 */
+	{"timer_create", NULL,			0,		3}, /* 259 */
+	{"timer_settime", NULL,			0,		4}, /* 260 */
+	{"timer_gettime", NULL,			0,		2}, /* 261 */
+	{"timer_getoverrun", NULL,		0,		1}, /* 262 */
+	{"timer_delete", NULL,			0,		1}, /* 263 */
+	{"clock_settime", lx_clock_settime,	0,		2}, /* 264 */
+	{"clock_gettime", lx_clock_gettime,	0,		2}, /* 265 */
+	{"clock_getres", lx_clock_getres,	0,		2}, /* 266 */
+	{"clock_nanosleep", NULL,		0,		4}, /* 267 */
+	{"statfs64",	NULL,			0,		2}, /* 268 */
+	{"fstatfs64",	NULL,			0,		2}, /* 269 */
+	{"tgkill",	lx_tgkill,		0,		3}, /* 270 */
+
+/* The following system calls only exist in kernel 2.6 and greater: */
+	{"utimes",	NULL,			0,		2}, /* 271 */
+	{"fadvise64_64", NULL,			0,		4}, /* 272 */
+	{"vserver",	NULL,			NOSYS_NULL,	0}, /* 273 */
+	{"mbind",	NULL,			NOSYS_NULL,	0}, /* 274 */
+	{"get_mempolicy", NULL,			NOSYS_NULL,	0}, /* 275 */
+	{"set_mempolicy", NULL,			NOSYS_NULL,	0}, /* 276 */
+	{"mq_open",	NULL,			NOSYS_NULL,	0}, /* 277 */
+	{"mq_unlink",	NULL,			NOSYS_NULL,	0}, /* 278 */
+	{"mq_timedsend", NULL,			NOSYS_NULL,	0}, /* 279 */
+	{"mq_timedreceive", NULL,		NOSYS_NULL,	0}, /* 280 */
+	{"mq_notify",	NULL,			NOSYS_NULL,	0}, /* 281 */
+	{"mq_getsetattr", NULL,			NOSYS_NULL,	0}, /* 282 */
+	{"kexec_load",	NULL,			NOSYS_NULL,	0}, /* 283 */
+	{"waitid",	lx_waitid,		0,		4}, /* 284 */
+	{"sys_setaltroot", NULL,		NOSYS_NULL,	0}, /* 285 */
+	{"add_key",	NULL,			NOSYS_NULL,	0}, /* 286 */
+	{"request_key",	NULL,			NOSYS_NULL,	0}, /* 287 */
+	{"keyctl",	NULL,			NOSYS_NULL,	0}, /* 288 */
+	{"ioprio_set",	lx_ioprio_set,		0,		3}, /* 289 */
+	{"ioprio_get",	lx_ioprio_get,		0,		2}, /* 290 */
+	{"inotify_init", NULL,			0,		0}, /* 291 */
+	{"inotify_add_watch", NULL,		0,		3}, /* 292 */
+	{"inotify_rm_watch", NULL,		0,		2}, /* 293 */
+	{"migrate_pages", NULL,			NOSYS_NULL,	0}, /* 294 */
+	{"openat",	lx_openat,		0,		4}, /* 295 */
+	{"mkdirat",	lx_mkdirat,		0,		3}, /* 296 */
+	{"mknodat",	NULL,			0,		4}, /* 297 */
+	{"fchownat",	lx_fchownat,		0,		5}, /* 298 */
+	{"futimesat",	NULL,			0,		3}, /* 299 */
+	{"fstatat64",	lx_fstatat64,		0,		4}, /* 300 */
+	{"unlinkat",	NULL,			0,		3}, /* 301 */
+	{"renameat",	NULL,			0,		4}, /* 302 */
+	{"linkat",	lx_linkat,		0,		5}, /* 303 */
+	{"symlinkat",	NULL,			0,		3}, /* 304 */
+	{"readlinkat",	NULL,			0,		4}, /* 305 */
+	{"fchmodat",	lx_fchmodat,		0,		3}, /* 306 */
+	{"faccessat",	lx_faccessat,		0,		4}, /* 307 */
+	{"pselect6",	lx_pselect,		LX_SYS_EBPARG6,	6}, /* 308 */
+	{"ppoll",	lx_ppoll,		0,		5}, /* 309 */
+	{"unshare",	NULL,			NOSYS_NULL,	0}, /* 310 */
+	{"set_robust_list", lx_set_robust_list,	0,		2}, /* 311 */
+	{"get_robust_list", lx_get_robust_list,	0,		3}, /* 312 */
+	{"splice",	NULL,			NOSYS_NULL,	0}, /* 313 */
+	{"sync_file_range", lx_sync_file_range,	0,		4}, /* 314 */
+	{"tee",		NULL,			NOSYS_NULL,	0}, /* 315 */
+	{"vmsplice",	NULL,			NOSYS_NULL,	0}, /* 316 */
+	{"move_pages",	NULL,			NOSYS_NULL,	0}, /* 317 */
+	{"getcpu",	lx_getcpu,		0,		3}, /* 318 */
+	{"epoll_pwait",	lx_epoll_pwait,		0,		5}, /* 319 */
+	{"utimensat",	NULL,			0,		4}, /* 320 */
+	{"signalfd",	NULL,			0,		3}, /* 321 */
+	{"timerfd_create", NULL,		0,		2}, /* 322 */
+	{"eventfd",	NULL,			0,		1}, /* 323 */
+	{"fallocate",	lx_fallocate32,		LX_SYS_EBPARG6,	6}, /* 324 */
+	{"timerfd_settime", NULL,		0,		4}, /* 325 */
+	{"timerfd_gettime", NULL,		0,		2}, /* 326 */
+	{"signalfd4",	NULL,			0,		4}, /* 327 */
+	{"eventfd2",	NULL,			0,		2}, /* 328 */
+	{"epoll_create1", lx_epoll_create1,	0,		1}, /* 329 */
+	{"dup3",	NULL,			0,		3}, /* 330 */
+	{"pipe2",	lx_pipe2,		0,		2}, /* 331 */
+	{"inotify_init1", NULL,			0,		1}, /* 332 */
+	{"preadv",	lx_preadv32,		0,		5}, /* 333 */
+	{"pwritev",	lx_pwritev32,		0,		5}, /* 334 */
+	{"rt_tgsigqueueinfo", NULL,		0,		4}, /* 335 */
+	{"perf_event_open", NULL,		NOSYS_NULL,	0}, /* 336 */
+	{"recvmmsg",	NULL,			NOSYS_NULL,	0}, /* 337 */
+	{"fanotify_init", NULL,			NOSYS_NULL,	0}, /* 338 */
+	{"fanotify_mark", NULL,			NOSYS_NULL,	0}, /* 339 */
+	{"prlimit64",	lx_prlimit64,		0,		4}, /* 340 */
+	{"name_to_handle_at", NULL,		NOSYS_NULL,	0}, /* 341 */
+	{"open_by_handle_at", NULL,		NOSYS_NULL,	0}, /* 342 */
+	{"clock_adjtime", NULL,			NOSYS_NULL,	0}, /* 343 */
+	{"syncfs",	lx_syncfs,		0,		1}, /* 344 */
+	{"sendmmsg",	NULL,			NOSYS_NULL,	0}, /* 345 */
+	{"setns",	NULL,			NOSYS_NULL,	0}, /* 346 */
+	{"process_vm_readv", NULL,		NOSYS_NULL,	0}, /* 347 */
+	{"process_vm_writev", NULL,		NOSYS_NULL,	0}, /* 348 */
+	{"kcmp",	NULL,			NOSYS_NULL,	0}, /* 349 */
+	{"finit_module", NULL,			NOSYS_NULL,	0}, /* 350 */
+	{"sched_setattr", NULL,			NOSYS_NULL,	0}, /* 351 */
+	{"sched_getattr", NULL,			NOSYS_NULL,	0}, /* 352 */
+	{"renameat2",	NULL,			NOSYS_NULL,	0}, /* 353 */
+	{"seccomp",	NULL,			NOSYS_NULL,	0}, /* 354 */
+	{"getrandom",	lx_getrandom,		0,		3}, /* 355 */
+	{"memfd_create", NULL,			NOSYS_NULL,	0}, /* 356 */
+	{"bpf",		NULL,			NOSYS_NULL,	0}, /* 357 */
+	{"execveat",	NULL,			NOSYS_NULL,	0}, /* 358 */
+};
+
+#if defined(_LP64)
+/*
+ * Linux defines system call numbers for 64-bit x86 in the file:
+ *   arch/x86/syscalls/syscall_64.tbl
+ */
+lx_sysent_t lx_sysent64[] = {
+	{"read",	lx_read,		0,		3}, /* 0 */
+	{"write",	lx_write,		0,		3}, /* 1 */
+	{"open",	lx_open,		0,		3}, /* 2 */
+	{"close",	lx_close,		0,		1}, /* 3 */
+	{"stat",	lx_stat64,		0,		2}, /* 4 */
+	{"fstat",	lx_fstat64,		0,		2}, /* 5 */
+	{"lstat",	lx_lstat64,		0,		2}, /* 6 */
+	{"poll",	lx_poll,		0,		3}, /* 7 */
+	{"lseek",	NULL,			0,		3}, /* 8 */
+	{"mmap",	NULL,			0,		6}, /* 9 */
+	{"mprotect",	NULL,			0,		3}, /* 10 */
+	{"munmap",	NULL,			0,		2}, /* 11 */
+	{"brk",		lx_brk,			0,		1}, /* 12 */
+	{"rt_sigaction", NULL,			0,		4}, /* 13 */
+	{"rt_sigprocmask", NULL,		0,		4}, /* 14 */
+	{"rt_sigreturn", NULL,			0,		0}, /* 15 */
+	{"ioctl",	lx_ioctl,		0,		3}, /* 16 */
+	{"pread64",	lx_pread,		0,		4}, /* 17 */
+	{"pwrite64",	lx_pwrite,		0,		4}, /* 18 */
+	{"readv",	lx_readv,		0,		3}, /* 19 */
+	{"writev",	lx_writev,		0,		3}, /* 20 */
+	{"access",	lx_access,		0,		2}, /* 21 */
+	{"pipe",	lx_pipe,		0,		1}, /* 22 */
+	{"select",	lx_select,		0,		5}, /* 23 */
+	{"sched_yield",	lx_sched_yield,		0,		0}, /* 24 */
+	{"mremap",	NULL,			0,		5}, /* 25 */
+	{"msync",	NULL,			0,		3}, /* 26 */
+	{"mincore",	NULL,			0,		3}, /* 27 */
+	{"madvise",	NULL,			0,		3}, /* 28 */
+	{"shmget",	NULL,			0,		3}, /* 29 */
+	{"shmat",	NULL,			0,		4}, /* 30 */
+	{"shmctl",	NULL,			0,		3}, /* 31 */
+	{"dup",		NULL,			0,		1}, /* 32 */
+	{"dup2",	NULL,			0,		2}, /* 33 */
+	{"pause",	NULL,			0,		0}, /* 34 */
+	{"nanosleep",	lx_nanosleep,		0,		2}, /* 35 */
+	{"getitimer",	NULL,			0,		2}, /* 36 */
+	{"alarm",	NULL,			0,		1}, /* 37 */
+	{"setitimer",	NULL,			0,		3}, /* 38 */
+	{"getpid",	lx_getpid,		0,		0}, /* 39 */
+	{"sendfile",	NULL,			0,		4}, /* 40 */
+	{"socket",	lx_socket,		0,		3}, /* 41 */
+	{"connect",	lx_connect,		0,		3}, /* 42 */
+	{"accept",	lx_accept,		0,		3}, /* 43 */
+	{"sendto",	lx_sendto,		0,		6}, /* 44 */
+	{"recvfrom",	lx_recvfrom,		0,		6}, /* 45 */
+	{"sendmsg",	lx_sendmsg,		0,		3}, /* 46 */
+	{"recvmsg",	lx_recvmsg,		0,		3}, /* 47 */
+	{"shutdown",	NULL,			0,		2}, /* 48 */
+	{"bind",	lx_bind,		0,		3}, /* 49 */
+	{"listen",	NULL,			0,		2}, /* 50 */
+	{"getsockname",	lx_getsockname,		0,		3}, /* 51 */
+	{"getpeername",	lx_getpeername,		0,		3}, /* 52 */
+	{"socketpair",	NULL,			0,		4}, /* 53 */
+	{"setsockopt",	lx_setsockopt,		0,		5}, /* 54 */
+	{"getsockopt",	lx_getsockopt,		0,		5}, /* 55 */
+	{"clone",	NULL,			0,		5}, /* 56 */
+	{"fork",	NULL,			0,		0}, /* 57 */
+	{"vfork",	NULL,			0,		0}, /* 58 */
+	{"execve",	NULL,			0,		3}, /* 59 */
+	{"exit",	NULL,			0,		1}, /* 60 */
+	{"wait4",	lx_wait4,		0,		4}, /* 61 */
+	{"kill",	lx_kill,		0,		2}, /* 62 */
+	{"uname",	lx_uname,		0,		1}, /* 63 */
+	{"semget",	NULL,			0,		3}, /* 64 */
+	{"semop",	NULL,			0,		3}, /* 65 */
+	{"semctl",	NULL,			0,		4}, /* 66 */
+	{"shmdt",	NULL,			0,		1}, /* 67 */
+	{"msgget",	NULL,			0,		2}, /* 68 */
+	{"msgsnd",	NULL,			0,		4}, /* 69 */
+	{"msgrcv",	NULL,			0,		5}, /* 70 */
+	{"msgctl",	NULL,			0,		3}, /* 71 */
+	{"fcntl",	lx_fcntl64,		0,		3}, /* 72 */
+	{"flock",	NULL,			0,		2}, /* 73 */
+	{"fsync",	NULL,			0,		1}, /* 74 */
+	{"fdatasync",	NULL,			0,		1}, /* 75 */
+	{"truncate",	NULL,			0,		2}, /* 76 */
+	{"ftruncate",	NULL,			0,		2}, /* 77 */
+	{"getdents",	lx_getdents_64,		0,		3}, /* 78 */
+	{"getcwd",	lx_getcwd,		0,		2}, /* 79 */
+	{"chdir",	NULL,			0,		1}, /* 80 */
+	{"fchdir",	NULL,			0,		1}, /* 81 */
+	{"rename",	NULL,			0,		2}, /* 82 */
+	{"mkdir",	lx_mkdir,		0,		2}, /* 83 */
+	{"rmdir",	NULL,			0,		1}, /* 84 */
+	{"creat",	NULL,			0,		2}, /* 85 */
+	{"link",	lx_link,		0,		2}, /* 86 */
+	{"unlink",	NULL,			0,		1}, /* 87 */
+	{"symlink",	NULL,			0,		2}, /* 88 */
+	{"readlink",	NULL,			0,		3}, /* 89 */
+	{"chmod",	lx_chmod,		0,		2}, /* 90 */
+	{"fchmod",	lx_fchmod,		0,		2}, /* 91 */
+	{"chown",	lx_chown,		0,		3}, /* 92 */
+	{"fchown",	lx_fchown,		0,		3}, /* 93 */
+	{"lchown",	lx_lchown,		0,		3}, /* 94 */
+	{"umask",	NULL,			0,		1}, /* 95 */
+	{"gettimeofday", lx_gettimeofday,	0,		2}, /* 96 */
+	{"getrlimit",	lx_getrlimit,		0,		2}, /* 97 */
+	{"getrusage",	NULL,			0,		2}, /* 98 */
+	{"sysinfo",	lx_sysinfo64,		0,		1}, /* 99 */
+	{"times",	NULL,			0,		1}, /* 100 */
+	{"ptrace",	lx_ptrace,		0,		4}, /* 101 */
+	{"getuid",	NULL,			0,		0}, /* 102 */
+	{"syslog",	NULL,			0,		3}, /* 103 */
+	{"getgid",	NULL,			0,		0}, /* 104 */
+	{"setuid",	NULL,			0,		1}, /* 105 */
+	{"setgid",	NULL,			0,		1}, /* 106 */
+	{"geteuid",	NULL,			0,		0}, /* 107 */
+	{"getegid",	NULL,			0,		0}, /* 108 */
+	{"setpgid",	NULL,			0,		2}, /* 109 */
+	{"getppid",	lx_getppid,		0,		0}, /* 110 */
+	{"getpgrp",	NULL,			0,		0}, /* 111 */
+	{"setsid",	NULL,			0,		0}, /* 112 */
+	{"setreuid",	NULL,			0,		0}, /* 113 */
+	{"setregid",	NULL,			0,		0}, /* 114 */
+	{"getgroups",	NULL,			0,		2}, /* 115 */
+	{"setgroups",	NULL,			0,		2}, /* 116 */
+	{"setresuid",	lx_setresuid,		0,		3}, /* 117 */
+	{"getresuid",	NULL,			0,		3}, /* 118 */
+	{"setresgid",	lx_setresgid,		0,		3}, /* 119 */
+	{"getresgid",	NULL,			0,		3}, /* 120 */
+	{"getpgid",	NULL,			0,		1}, /* 121 */
+	{"setfsuid",	NULL,			0,		1}, /* 122 */
+	{"setfsgid",	NULL,			0,		1}, /* 123 */
+	{"getsid",	NULL,			0,		1}, /* 124 */
+	{"capget",	NULL,			0,		2}, /* 125 */
+	{"capset",	NULL,			0,		2}, /* 126 */
+	{"rt_sigpending", NULL,			0,		2}, /* 127 */
+	{"rt_sigtimedwait", NULL,		0,		4}, /* 128 */
+	{"rt_sigqueueinfo", NULL,		0,		3}, /* 129 */
+	{"rt_sigsuspend", NULL,			0,		2}, /* 130 */
+	{"sigaltstack",	NULL,			0,		2}, /* 131 */
+	{"utime",	NULL,			0,		2}, /* 132 */
+	{"mknod",	NULL,			0,		3}, /* 133 */
+	{"uselib",	NULL,			NOSYS_KERNEL,	0}, /* 134 */
+	{"personality",	lx_personality,		0,		1}, /* 135 */
+	{"ustat",	NULL,			NOSYS_OBSOLETE,	2}, /* 136 */
+	{"statfs",	NULL,			0,		2}, /* 137 */
+	{"fstatfs",	NULL,			0,		2}, /* 138 */
+	{"sysfs",	NULL,			0,		3}, /* 139 */
+	{"getpriority",	NULL,			0,		2}, /* 140 */
+	{"setpriority",	NULL,			0,		3}, /* 141 */
+	{"sched_setparam", NULL,		0,		2}, /* 142 */
+	{"sched_getparam", NULL,		0,		2}, /* 143 */
+	{"sched_setscheduler", NULL,		0,		3}, /* 144 */
+	{"sched_getscheduler", NULL,		0,		1}, /* 145 */
+	{"sched_get_priority_max", NULL,	0,		1}, /* 146 */
+	{"sched_get_priority_min", NULL,	0,		1}, /* 147 */
+	{"sched_rr_get_interval", NULL,		0,		2}, /* 148 */
+	{"mlock",	NULL,			0,		2}, /* 149 */
+	{"munlock",	NULL,			0,		2}, /* 150 */
+	{"mlockall",	NULL,			0,		1}, /* 151 */
+	{"munlockall",	NULL,			0,		0}, /* 152 */
+	{"vhangup",	NULL,			0,		0}, /* 153 */
+	{"modify_ldt",	lx_modify_ldt,		0,		3}, /* 154 */
+	{"pivot_root",	NULL,			NOSYS_KERNEL,	0}, /* 155 */
+	{"sysctl",	NULL,			0,		1}, /* 156 */
+	{"prctl",	lx_prctl,		0,		5}, /* 157 */
+	{"arch_prctl",	lx_arch_prctl,		0,		2}, /* 158 */
+	{"adjtimex",	NULL,			0,		1}, /* 159 */
+	{"setrlimit",	lx_setrlimit,		0,		2}, /* 160 */
+	{"chroot",	NULL,			0,		1}, /* 161 */
+	{"sync",	NULL,			0,		0}, /* 162 */
+	{"acct",	NULL,			NOSYS_NO_EQUIV,	0}, /* 163 */
+	{"settimeofday", NULL,			0,		2}, /* 164 */
+	{"mount",	NULL,			0,		5}, /* 165 */
+	{"umount2",	NULL,			0,		2}, /* 166 */
+	{"swapon",	NULL,			NOSYS_KERNEL,	0}, /* 167 */
+	{"swapoff",	NULL,			NOSYS_KERNEL,	0}, /* 168 */
+	{"reboot",	NULL,			0,		4}, /* 169 */
+	{"sethostname",	NULL,			0,		2}, /* 170 */
+	{"setdomainname", NULL,			0,		2}, /* 171 */
+	{"iopl",	NULL,			NOSYS_NO_EQUIV,	0}, /* 172 */
+	{"ioperm",	NULL,			NOSYS_NO_EQUIV,	0}, /* 173 */
+	{"create_module", NULL,			NOSYS_KERNEL,	0}, /* 174 */
+	{"init_module",	NULL,			NOSYS_KERNEL,	0}, /* 175 */
+	{"delete_module", NULL,			NOSYS_KERNEL,	0}, /* 176 */
+	{"get_kernel_syms", NULL,		NOSYS_KERNEL,	0}, /* 177 */
+	{"query_module", NULL,			0,		5}, /* 178 */
+	{"quotactl",	NULL,			NOSYS_KERNEL,	0}, /* 179 */
+	{"nfsservctl",	NULL,			NOSYS_KERNEL,	0}, /* 180 */
+	{"getpmsg",	NULL,			NOSYS_OBSOLETE,	0}, /* 181 */
+	{"putpmsg",	NULL,			NOSYS_OBSOLETE,	0}, /* 182 */
+	{"afs_syscall",	NULL,			NOSYS_KERNEL,	0}, /* 183 */
+	{"tux",		NULL,			NOSYS_NO_EQUIV,	0}, /* 184 */
+	{"security",	NULL,			NOSYS_NO_EQUIV,	0}, /* 185 */
+	{"gettid",	lx_gettid,		0,		0}, /* 186 */
+	{"readahead",	NULL,			NOSYS_NO_EQUIV,	0}, /* 187 */
+	{"setxattr",	lx_setxattr,		0,		5}, /* 188 */
+	{"lsetxattr",	lx_lsetxattr,		0,		5}, /* 189 */
+	{"fsetxattr",	lx_fsetxattr,		0,		5}, /* 190 */
+	{"getxattr",	lx_getxattr,		0,		4}, /* 191 */
+	{"lgetxattr",	lx_lgetxattr,		0,		4}, /* 192 */
+	{"fgetxattr",	lx_fgetxattr,		0,		4}, /* 193 */
+	{"listxattr",	lx_listxattr,		0,		3}, /* 194 */
+	{"llistxattr",	lx_llistxattr,		0,		3}, /* 195 */
+	{"flistxattr",	lx_flistxattr,		0,		3}, /* 196 */
+	{"removexattr",	lx_removexattr,		0,		2}, /* 197 */
+	{"lremovexattr", lx_lremovexattr,	0,		2}, /* 198 */
+	{"fremovexattr", lx_fremovexattr,	0,		2}, /* 199 */
+	{"tkill",	lx_tkill,		0,		2}, /* 200 */
+	{"time",	lx_time,		0,		1}, /* 201 */
+	{"futex",	lx_futex,		0,		6}, /* 202 */
+	{"sched_setaffinity", NULL,		0,		3}, /* 203 */
+	{"sched_getaffinity", NULL,		0,		3}, /* 204 */
+	{"set_thread_area", lx_set_thread_area, 0,		1}, /* 205 */
+	{"io_setup",	lx_io_setup,		0,		2}, /* 206 */
+	{"io_destroy",	NULL,			0,		1}, /* 207 */
+	{"io_getevents", NULL,			0,		5}, /* 208 */
+	{"io_submit",	NULL,			0,		3}, /* 209 */
+	{"io_cancel",	NULL,			0,		3}, /* 210 */
+	{"get_thread_area", lx_get_thread_area,	0,		1}, /* 211 */
+	{"lookup_dcookie", NULL,		NOSYS_NO_EQUIV,	0}, /* 212 */
+	{"epoll_create", lx_epoll_create,	0,		1}, /* 213 */
+	{"epoll_ctl_old", NULL,			NOSYS_NULL,	0}, /* 214 */
+	{"epoll_wait_old", NULL,		NOSYS_NULL,	0}, /* 215 */
+	{"remap_file_pages", NULL,		NOSYS_NO_EQUIV,	0}, /* 216 */
+	{"getdents64",	lx_getdents64,		0,		3}, /* 217 */
+	{"set_tid_address", lx_set_tid_address, 0,		1}, /* 218 */
+	{"restart_syscall", NULL,		NOSYS_NULL,	0}, /* 219 */
+	{"semtimedop",	NULL,			0,		4}, /* 220 */
+	{"fadvise64",	NULL,			0,		4}, /* 221 */
+	{"timer_create", NULL,			0,		3}, /* 222 */
+	{"timer_settime", NULL,			0,		4}, /* 223 */
+	{"timer_gettime", NULL,			0,		2}, /* 224 */
+	{"timer_getoverrun", NULL,		0,		1}, /* 225 */
+	{"timer_delete", NULL,			0,		1}, /* 226 */
+	{"clock_settime", lx_clock_settime,	0,		2}, /* 227 */
+	{"clock_gettime", lx_clock_gettime,	0,		2}, /* 228 */
+	{"clock_getres", lx_clock_getres,	0,		2}, /* 229 */
+	{"clock_nanosleep", NULL,		0,		4}, /* 230 */
+	{"exit_group",	NULL,			0,		1}, /* 231 */
+	{"epoll_wait",	lx_epoll_wait,		0,		4}, /* 232 */
+	{"epoll_ctl",	lx_epoll_ctl,		0,		4}, /* 233 */
+	{"tgkill",	lx_tgkill,		0,		3}, /* 234 */
+	{"utimes",	NULL,			0,		2}, /* 235 */
+	{"vserver",	NULL,			NOSYS_NULL,	0}, /* 236 */
+	{"mbind",	NULL,			NOSYS_NULL,	0}, /* 237 */
+	{"set_mempolicy", NULL,			NOSYS_NULL,	0}, /* 238 */
+	{"get_mempolicy", NULL,			NOSYS_NULL,	0}, /* 239 */
+	{"mq_open",	NULL,			NOSYS_NULL,	0}, /* 240 */
+	{"mq_unlink",	NULL,			NOSYS_NULL,	0}, /* 241 */
+	{"mq_timedsend", NULL,			NOSYS_NULL,	0}, /* 242 */
+	{"mq_timedreceive", NULL,		NOSYS_NULL,	0}, /* 243 */
+	{"mq_notify",	NULL,			NOSYS_NULL,	0}, /* 244 */
+	{"mq_getsetattr", NULL,			NOSYS_NULL,	0}, /* 245 */
+	{"kexec_load",	NULL,			NOSYS_NULL,	0}, /* 246 */
+	{"waitid",	lx_waitid,		0,		4}, /* 247 */
+	{"add_key",	NULL,			NOSYS_NULL,	0}, /* 248 */
+	{"request_key",	NULL,			NOSYS_NULL,	0}, /* 249 */
+	{"keyctl",	NULL,			NOSYS_NULL,	0}, /* 250 */
+	{"ioprio_set",	lx_ioprio_set,		0,		3}, /* 251 */
+	{"ioprio_get",	lx_ioprio_get,		0,		2}, /* 252 */
+	{"inotify_init", NULL,			0,		0}, /* 253 */
+	{"inotify_add_watch", NULL,		0,		3}, /* 254 */
+	{"inotify_rm_watch", NULL,		0,		2}, /* 255 */
+	{"migrate_pages", NULL,			NOSYS_NULL,	0}, /* 256 */
+	{"openat",	lx_openat,		0,		4}, /* 257 */
+	{"mkdirat",	lx_mkdirat,		0,		3}, /* 258 */
+	{"mknodat",	NULL,			0,		4}, /* 259 */
+	{"fchownat",	lx_fchownat,		0,		5}, /* 260 */
+	{"futimesat",	NULL,			0,		3}, /* 261 */
+	{"fstatat64",	lx_fstatat64,		0,		4}, /* 262 */
+	{"unlinkat",	NULL,			0,		3}, /* 263 */
+	{"renameat",	NULL,			0,		4}, /* 264 */
+	{"linkat",	lx_linkat,		0,		5}, /* 265 */
+	{"symlinkat",	NULL,			0,		3}, /* 266 */
+	{"readlinkat",	NULL,			0,		4}, /* 267 */
+	{"fchmodat",	lx_fchmodat,		0,		3}, /* 268 */
+	{"faccessat",	lx_faccessat,		0,		4}, /* 269 */
+	{"pselect6",	lx_pselect,		0,		6}, /* 270 */
+	{"ppoll",	lx_ppoll,		0,		5}, /* 271 */
+	{"unshare",	NULL,			NOSYS_NULL,	0}, /* 272 */
+	{"set_robust_list", lx_set_robust_list,	0,		2}, /* 273 */
+	{"get_robust_list", lx_get_robust_list,	0,		3}, /* 274 */
+	{"splice",	NULL,			NOSYS_NULL,	0}, /* 275 */
+	{"tee",		NULL,			NOSYS_NULL,	0}, /* 276 */
+	{"sync_file_range", lx_sync_file_range,	0,		4}, /* 277 */
+	{"vmsplice",	NULL,			NOSYS_NULL,	0}, /* 278 */
+	{"move_pages",	NULL,			NOSYS_NULL,	0}, /* 279 */
+	{"utimensat",	NULL,			0,		4}, /* 280 */
+	{"epoll_pwait",	lx_epoll_pwait,		0,		5}, /* 281 */
+	{"signalfd",	NULL,			0,		3}, /* 282 */
+	{"timerfd_create", NULL,		0,		2}, /* 283 */
+	{"eventfd",	NULL,			0,		1}, /* 284 */
+	{"fallocate",	lx_fallocate,		0,		4}, /* 285 */
+	{"timerfd_settime", NULL,		0,		4}, /* 286 */
+	{"timerfd_gettime", NULL,		0,		2}, /* 287 */
+	{"accept4",	lx_accept4,		0,		4}, /* 288 */
+	{"signalfd4",	NULL,			0,		4}, /* 289 */
+	{"eventfd2",	NULL,			0,		2}, /* 290 */
+	{"epoll_create1", lx_epoll_create1,	0,		1}, /* 291 */
+	{"dup3",	NULL,			0,		3}, /* 292 */
+	{"pipe2",	lx_pipe2,		0,		2}, /* 293 */
+	{"inotify_init1", NULL,			0,		1}, /* 294 */
+	{"preadv",	lx_preadv,		0,		4}, /* 295 */
+	{"pwritev",	lx_pwritev,		0,		4}, /* 296 */
+	{"rt_tgsigqueueinfo", NULL, 		0,		4}, /* 297 */
+	{"perf_event_open", NULL,		NOSYS_NULL,	0}, /* 298 */
+	{"recvmmsg",	NULL,			NOSYS_NULL,	0}, /* 299 */
+	{"fanotify_init", NULL,			NOSYS_NULL,	0}, /* 300 */
+	{"fanotify_mark", NULL,			NOSYS_NULL,	0}, /* 301 */
+	{"prlimit64",	lx_prlimit64,		0,		4}, /* 302 */
+	{"name_to_handle_at", NULL,		NOSYS_NULL,	0}, /* 303 */
+	{"open_by_handle_at", NULL,		NOSYS_NULL,	0}, /* 304 */
+	{"clock_adjtime", NULL,			NOSYS_NULL,	0}, /* 305 */
+	{"syncfs",	lx_syncfs,		0,		1}, /* 306 */
+	{"sendmmsg",	NULL,			NOSYS_NULL,	0}, /* 307 */
+	{"setns",	NULL,			NOSYS_NULL,	0}, /* 308 */
+	{"getcpu",	lx_getcpu,		0,		3}, /* 309 */
+	{"process_vm_readv", NULL,		NOSYS_NULL,	0}, /* 310 */
+	{"process_vm_writev", NULL,		NOSYS_NULL,	0}, /* 311 */
+	{"kcmp",	NULL,			NOSYS_NULL,	0}, /* 312 */
+	{"finit_module", NULL,			NOSYS_NULL,	0}, /* 313 */
+	{"sched_setattr", NULL,			NOSYS_NULL,	0}, /* 314 */
+	{"sched_getattr", NULL,			NOSYS_NULL,	0}, /* 315 */
+	{"renameat2", NULL,			NOSYS_NULL,	0}, /* 316 */
+	{"seccomp",	NULL,			NOSYS_NULL,	0}, /* 317 */
+	{"getrandom",	lx_getrandom,		0,		3}, /* 318 */
+	{"memfd_create", NULL,			NOSYS_NULL,	0}, /* 319 */
+	{"kexec_file_load", NULL,		NOSYS_NULL,	0}, /* 320 */
+	{"bpf",		NULL,			NOSYS_NULL,	0}, /* 321 */
+	{"execveat",	NULL,			NOSYS_NULL,	0}, /* 322 */
+
+	/* XXX TBD gap then x32 syscalls from 512 - 544 */
+};
+#endif
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
new file mode 100644
index 0000000000..131a061062
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
@@ -0,0 +1,350 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LX_PROC_H
+#define	_LX_PROC_H
+
+#ifdef _LXPROC_NATIVE_H
+#error Attempted to include branded lx_proc.h after native lxproc.h
+#endif
+
+#define	_LXPROC_BRANDED_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_proc.h: declarations, data structures and macros for lxprocfs
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/nvpair.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+/*
+ * Convert a vnode into the lxpr_mnt_t kept in its vfs's vfs_data
+ */
+#define	VTOLXPM(vp)	((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * Convert a vnode into the lxpr_node_t kept in its v_data
+ */
+#define	VTOLXP(vp)	((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * Convert an lxpr_node_t into its vnode
+ */
+#define	LXPTOV(lxpnp)	((lxpnp)->lxpr_vnode)
+
+/*
+ * Convert an lxpr_node_t into the zone recorded for its lxprocfs mount
+ */
+#define	LXPTOZ(lxpnp) \
+	(((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
+
+#define	LXPNSIZ		256	/* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define	LXPR_SDSIZE	16
+
+/*
+ * Node/file types for lx /proc files (directories and files contained
+ * therein).  lxpr_inode() folds these values into the inode numbers.
+ */
+typedef enum lxpr_nodetype {
+	LXPR_INVALID,		/* nodes start at 1	*/
+	LXPR_PROCDIR,		/* /proc		*/
+	LXPR_PIDDIR,		/* /proc/<pid>		*/
+	LXPR_PID_AUXV,		/* /proc/<pid>/auxv	*/
+	LXPR_PID_CGROUP,	/* /proc/<pid>/cgroup	*/
+	LXPR_PID_CMDLINE,	/* /proc/<pid>/cmdline	*/
+	LXPR_PID_COMM,		/* /proc/<pid>/comm	*/
+	LXPR_PID_CPU,		/* /proc/<pid>/cpu	*/
+	LXPR_PID_CURDIR,	/* /proc/<pid>/cwd	*/
+	LXPR_PID_ENV,		/* /proc/<pid>/environ	*/
+	LXPR_PID_EXE,		/* /proc/<pid>/exe	*/
+	LXPR_PID_LIMITS,	/* /proc/<pid>/limits	*/
+	LXPR_PID_LOGINUID,	/* /proc/<pid>/loginuid	*/
+	LXPR_PID_MAPS,		/* /proc/<pid>/maps	*/
+	LXPR_PID_MEM,		/* /proc/<pid>/mem	*/
+	LXPR_PID_MOUNTINFO,	/* /proc/<pid>/mountinfo */
+	LXPR_PID_OOM_SCR_ADJ,	/* /proc/<pid>/oom_score_adj	*/
+	LXPR_PID_PERSONALITY,	/* /proc/<pid>/personality	*/
+	LXPR_PID_ROOTDIR,	/* /proc/<pid>/root	*/
+	LXPR_PID_STAT,		/* /proc/<pid>/stat	*/
+	LXPR_PID_STATM,		/* /proc/<pid>/statm	*/
+	LXPR_PID_STATUS,	/* /proc/<pid>/status	*/
+	LXPR_PID_TASKDIR,	/* /proc/<pid>/task	*/
+	LXPR_PID_TASK_IDDIR,	/* /proc/<pid>/task/<tid>		*/
+	LXPR_PID_FDDIR,		/* /proc/<pid>/fd	*/
+	LXPR_PID_FD_FD,		/* /proc/<pid>/fd/nn	*/
+	LXPR_PID_TID_AUXV,	/* /proc/<pid>/task/<tid>/auxv		*/
+	LXPR_PID_TID_CGROUP,	/* /proc/<pid>/task/<tid>/cgroup	*/
+	LXPR_PID_TID_CMDLINE,	/* /proc/<pid>/task/<tid>/cmdline	*/
+	LXPR_PID_TID_COMM,	/* /proc/<pid>/task/<tid>/comm		*/
+	LXPR_PID_TID_CPU,	/* /proc/<pid>/task/<tid>/cpu		*/
+	LXPR_PID_TID_CURDIR,	/* /proc/<pid>/task/<tid>/cwd		*/
+	LXPR_PID_TID_ENV,	/* /proc/<pid>/task/<tid>/environ	*/
+	LXPR_PID_TID_EXE,	/* /proc/<pid>/task/<tid>/exe		*/
+	LXPR_PID_TID_LIMITS,	/* /proc/<pid>/task/<tid>/limits	*/
+	LXPR_PID_TID_LOGINUID,	/* /proc/<pid>/task/<tid>/loginuid	*/
+	LXPR_PID_TID_MAPS,	/* /proc/<pid>/task/<tid>/maps		*/
+	LXPR_PID_TID_MEM,	/* /proc/<pid>/task/<tid>/mem		*/
+	LXPR_PID_TID_MOUNTINFO,	/* /proc/<pid>/task/<tid>/mountinfo	*/
+	LXPR_PID_TID_OOM_SCR_ADJ, /* /proc/<pid>/task/<tid>/oom_score_adj */
+	LXPR_PID_TID_PERSONALITY, /* /proc/<pid>/task/<tid>/personality */
+	LXPR_PID_TID_ROOTDIR,	/* /proc/<pid>/task/<tid>/root		*/
+	LXPR_PID_TID_STAT,	/* /proc/<pid>/task/<tid>/stat		*/
+	LXPR_PID_TID_STATM,	/* /proc/<pid>/task/<tid>/statm		*/
+	LXPR_PID_TID_STATUS,	/* /proc/<pid>/task/<tid>/status	*/
+	LXPR_PID_TID_FDDIR,	/* /proc/<pid>/task/<tid>/fd		*/
+	LXPR_PID_TID_FD_FD,	/* /proc/<pid>/task/<tid>/fd/nn		*/
+	LXPR_CGROUPS,		/* /proc/cgroups	*/
+	LXPR_CMDLINE,		/* /proc/cmdline	*/
+	LXPR_CPUINFO,		/* /proc/cpuinfo	*/
+	LXPR_DEVICES,		/* /proc/devices	*/
+	LXPR_DISKSTATS,		/* /proc/diskstats	*/
+	LXPR_DMA,		/* /proc/dma		*/
+	LXPR_FILESYSTEMS,	/* /proc/filesystems	*/
+	LXPR_INTERRUPTS,	/* /proc/interrupts	*/
+	LXPR_IOPORTS,		/* /proc/ioports	*/
+	LXPR_KCORE,		/* /proc/kcore		*/
+	LXPR_KMSG,		/* /proc/kmsg		*/
+	LXPR_LOADAVG,		/* /proc/loadavg	*/
+	LXPR_MEMINFO,		/* /proc/meminfo	*/
+	LXPR_MODULES,		/* /proc/modules	*/
+	LXPR_MOUNTS,		/* /proc/mounts		*/
+	LXPR_NETDIR,		/* /proc/net		*/
+	LXPR_NET_ARP,		/* /proc/net/arp	*/
+	LXPR_NET_DEV,		/* /proc/net/dev	*/
+	LXPR_NET_DEV_MCAST,	/* /proc/net/dev_mcast	*/
+	LXPR_NET_IF_INET6,	/* /proc/net/if_inet6	*/
+	LXPR_NET_IGMP,		/* /proc/net/igmp	*/
+	LXPR_NET_IP_MR_CACHE,	/* /proc/net/ip_mr_cache */
+	LXPR_NET_IP_MR_VIF,	/* /proc/net/ip_mr_vif	*/
+	LXPR_NET_IPV6_ROUTE,	/* /proc/net/ipv6_route	*/
+	LXPR_NET_MCFILTER,	/* /proc/net/mcfilter	*/
+	LXPR_NET_NETSTAT,	/* /proc/net/netstat	*/
+	LXPR_NET_RAW,		/* /proc/net/raw	*/
+	LXPR_NET_ROUTE,		/* /proc/net/route	*/
+	LXPR_NET_RPC,		/* /proc/net/rpc	*/
+	LXPR_NET_RT_CACHE,	/* /proc/net/rt_cache	*/
+	LXPR_NET_SOCKSTAT,	/* /proc/net/sockstat	*/
+	LXPR_NET_SNMP,		/* /proc/net/snmp	*/
+	LXPR_NET_STAT,		/* /proc/net/stat	*/
+	LXPR_NET_TCP,		/* /proc/net/tcp	*/
+	LXPR_NET_TCP6,		/* /proc/net/tcp6	*/
+	LXPR_NET_UDP,		/* /proc/net/udp	*/
+	LXPR_NET_UDP6,		/* /proc/net/udp6	*/
+	LXPR_NET_UNIX,		/* /proc/net/unix	*/
+	LXPR_PARTITIONS,	/* /proc/partitions	*/
+	LXPR_SELF,		/* /proc/self		*/
+	LXPR_STAT,		/* /proc/stat		*/
+	LXPR_SWAPS,		/* /proc/swaps		*/
+	LXPR_SYSDIR,		/* /proc/sys/		*/
+	LXPR_SYS_FSDIR,		/* /proc/sys/fs/	*/
+	LXPR_SYS_FS_INOTIFYDIR,	/* /proc/sys/fs/inotify	*/
+	LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS,	/* inotify/max_queued_events */
+	LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES,	/* inotify/max_user_instances */
+	LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES,	/* inotify/max_user_watches */
+	LXPR_SYS_KERNELDIR,	/* /proc/sys/kernel/	*/
+	LXPR_SYS_KERNEL_CAPLCAP,	/* /proc/sys/kernel/cap_last_cap */
+	LXPR_SYS_KERNEL_COREPATT,	/* /proc/sys/kernel/core_pattern */
+	LXPR_SYS_KERNEL_HOSTNAME,	/* /proc/sys/kernel/hostname */
+	LXPR_SYS_KERNEL_MSGMNI,	/* /proc/sys/kernel/msgmni */
+	LXPR_SYS_KERNEL_NGROUPS_MAX,	/* /proc/sys/kernel/ngroups_max */
+	LXPR_SYS_KERNEL_OSREL,	/* /proc/sys/kernel/osrelease */
+	LXPR_SYS_KERNEL_PID_MAX,	/* /proc/sys/kernel/pid_max */
+	LXPR_SYS_KERNEL_RANDDIR,	/* /proc/sys/kernel/random */
+	LXPR_SYS_KERNEL_RAND_BOOTID, /* /proc/sys/kernel/random/boot_id */
+	LXPR_SYS_KERNEL_SEM,		/* /proc/sys/kernel/sem		*/
+	LXPR_SYS_KERNEL_SHMALL,		/* /proc/sys/kernel/shmall	*/
+	LXPR_SYS_KERNEL_SHMMAX,		/* /proc/sys/kernel/shmmax	*/
+	LXPR_SYS_KERNEL_SHMMNI,		/* /proc/sys/kernel/shmmni	*/
+	LXPR_SYS_KERNEL_THREADS_MAX,	/* /proc/sys/kernel/threads-max */
+	LXPR_SYS_NETDIR,		/* /proc/sys/net		*/
+	LXPR_SYS_NET_COREDIR,		/* /proc/sys/net/core		*/
+	LXPR_SYS_NET_CORE_SOMAXCON,	/* /proc/sys/net/core/somaxconn	*/
+	LXPR_SYS_NET_IPV4DIR,		/* /proc/sys/net/ipv4		*/
+	LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, /* .../net/ipv4/ip_local_port_range */
+	LXPR_SYS_NET_IPV4_TCP_FIN_TO,	/* /proc/sys/net/ipv4/tcp_fin_timeout */
+	LXPR_SYS_NET_IPV4_TCP_KA_INT,	/* .../net/ipv4/tcp_keepalive_intvl */
+	LXPR_SYS_NET_IPV4_TCP_KA_TIM,	/* .../net/ipv4/tcp_keepalive_time */
+	LXPR_SYS_NET_IPV4_TCP_SACK,	/* /proc/sys/net/ipv4/tcp_sack */
+	LXPR_SYS_NET_IPV4_TCP_WINSCALE,	/* .../net/ipv4/tcp_window_scaling */
+	LXPR_SYS_VMDIR,			/* /proc/sys/vm			*/
+	LXPR_SYS_VM_MAX_MAP_CNT,	/* /proc/sys/vm/max_map_count	*/
+	LXPR_SYS_VM_MINFR_KB,		/* /proc/sys/vm/min_free_kbytes	*/
+	LXPR_SYS_VM_NHUGEP,		/* /proc/sys/vm/nr_hugepages	*/
+	LXPR_SYS_VM_OVERCOMMIT_MEM,	/* /proc/sys/vm/overcommit_memory */
+	LXPR_SYS_VM_SWAPPINESS,		/* /proc/sys/vm/swappiness	*/
+	LXPR_UPTIME,		/* /proc/uptime		*/
+	LXPR_VERSION,		/* /proc/version	*/
+	LXPR_VMSTAT,		/* /proc/vmstat		*/
+	LXPR_NFILES		/* number of node types; must stay last */
+} lxpr_nodetype_t;
+
+
+/*
+ * Number of fds allowed for in the inode number calculation
+ * per process (if a process has more fds than this, inode numbers
+ * may be duplicated)
+ */
+#define	LXPR_FD_PERPROC 2000
+
+/*
+ * Linux sector size for /proc/diskstats
+ */
+#define	LXPR_SECTOR_SIZE	512
+
+/*
+ * External dirent characteristics: a node type paired with a name
+ */
+typedef struct {
+	lxpr_nodetype_t	d_type;
+	char		*d_name;
+} lxpr_dirent_t;
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+typedef struct lxpr_node {
+	lxpr_nodetype_t	lxpr_type;	/* type of this node		*/
+	vnode_t		*lxpr_vnode;	/* vnode for the node		*/
+	vnode_t		*lxpr_parent;	/* parent directory		*/
+	vnode_t		*lxpr_realvp;	/* real vnode, file in dirs	*/
+	timestruc_t	lxpr_time;	/* creation etc time for file	*/
+	mode_t		lxpr_mode;	/* file mode bits		*/
+	uid_t		lxpr_uid;	/* file owner			*/
+	gid_t		lxpr_gid;	/* file group owner		*/
+	pid_t		lxpr_pid;	/* pid (1 = zone init, 0 = zsched) */
+	uint_t		lxpr_desc;	/* addl. descriptor (fd or tid)	*/
+	ino_t		lxpr_ino;	/* node id			*/
+} lxpr_node_t;
+
+struct zone;    /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+	lxpr_node_t	*lxprm_node;	/* node at root of proc mount */
+	struct zone	*lxprm_zone;	/* zone for this mount (see LXPTOZ) */
+	ldi_ident_t	lxprm_li;	/* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t	*lxpr_vnodeops;
+extern int		nproc_highbit;	/* highbit(v.v_nproc)		*/
+
+typedef struct mounta	mounta_t;
+
+extern void lxpr_initnodecache(void);	/* create the node kmem cache */
+extern void lxpr_fininodecache(void);	/* destroy the node kmem cache */
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern boolean_t lxpr_is_writable(lxpr_nodetype_t);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+extern vnode_t *lxpr_lookup_fdnode(vnode_t *, const char *);
+extern int lxpr_readlink_fdnode(lxpr_node_t *, char *, size_t);
+
+typedef struct lxpr_uiobuf {
+	uio_t *uiop;		/* uio that flushed output is moved into */
+	char *buffer;		/* staging buffer (follows this struct) */
+	uint32_t buffsize;	/* size of buffer in bytes */
+	char *pos;		/* next free byte in buffer */
+	size_t beg;		/* output offset of buffer[0] */
+	int error;		/* latched error; non-zero stops output */
+} lxpr_uiobuf_t;
+
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+extern int lxpr_core_path_l2s(const char *, char *, size_t);
+extern int lxpr_core_path_s2l(const char *, char *, size_t);
+
+typedef enum lxpr_zombok {
+	NO_ZOMB = 0,	/* lxpr_lock() rejects exiting/zombie processes */
+	ZOMB_OK		/* lxpr_lock() skips that filter */
+} zombok_t;
+
+proc_t *lxpr_lock(pid_t, zombok_t);
+void lxpr_unlock(proc_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#ifndef islower	/* 'a'..'z' range test; evaluates x twice */
+#define	islower(x)	(((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z'))
+#endif
+#ifndef toupper	/* built on islower(); evaluates x more than once */
+#define	toupper(x)	(islower(x) ? (x) - 'a' + 'A' : (x))
+#endif
+
+#endif /* _LX_PROC_H */
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
new file mode 100644
index 0000000000..c12118a3ea
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
@@ -0,0 +1,851 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * lx_prsubr.c: Various functions for the /lxproc vnodeops.
+ */
+
+#include <sys/varargs.h>
+
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+#include <sys/zfs_ioctl.h>
+
+#include "lx_proc.h"
+
+#define	LXPRCACHE_NAME "lxbpr_cache"	/* name given to lxpr_node_cache */
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;	/* backs lxpr_node_t allocations */
+
+int lx_pr_bufsize = 4000;	/* output buffer bytes per lxpr_uiobuf */
+
+struct lxpr_zfs_ds {		/* NOTE(review): users are not in view here */
+	list_node_t	ds_link;
+	char		ds_name[MAXPATHLEN];
+	uint64_t	ds_cookie;
+};
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+	/* One allocation: the lxpr_uiobuf header, then the output buffer. */
+	int outsz = lx_pr_bufsize;
+	struct lxpr_uiobuf *ub;
+
+	ub = kmem_alloc(sizeof (*ub) + outsz, KM_SLEEP);
+	ub->uiop = uiop;
+	ub->buffer = (char *)(ub + 1);
+	ub->pos = ub->buffer;
+	ub->buffsize = outsz;
+	ub->beg = 0;
+	ub->error = 0;
+
+	return (ub);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+	/* Callers must hand back a flushed buffer (pos rewound to start). */
+	ASSERT(uiobuf != NULL);
+	ASSERT(uiobuf->pos == uiobuf->buffer);
+	kmem_free(uiobuf, sizeof (*uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+	uiobuf->uiop->uio_offset = (off_t)offset;	/* uio only */
+}
+
+boolean_t
+lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf)
+{
+	/* Report whether FNONBLOCK is set in the uio's file mode flags. */
+	return ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0 ?
+	    B_TRUE : B_FALSE);
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+	ASSERT(uiobuf->error == 0);
+	/* Latch the error; write/printf/flush produce no output after it. */
+	uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+	off_t off = uiobuf->uiop->uio_offset;
+	caddr_t uaddr = uiobuf->buffer;
+	size_t beg = uiobuf->beg;
+	size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+	if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		ASSERT(off >= beg);
+		/* Move out only staged bytes lying past the uio offset. */
+		if (beg + size > off && off >= 0)
+			uiobuf->error =
+			    uiomove(uaddr + (off - beg), size - (off - beg),
+			    UIO_READ, uiobuf->uiop);
+		/* Account for this chunk even if none of it was moved. */
+		uiobuf->beg += size;
+	}
+	/* Rewind so the staging buffer can be refilled. */
+	uiobuf->pos = uaddr;
+
+	return (uiobuf->error);
+}
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+	/* Stage data chunk by chunk until an error or the uio is full. */
+	while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		uintptr_t used = (uintptr_t)uiobuf->pos -
+		    (uintptr_t)uiobuf->buffer;
+		uintptr_t space = (uintptr_t)uiobuf->buffsize - used;
+		size_t chunk = (space >= size) ? size : space;
+
+		/* Take as much as fits in the staging buffer. */
+		bcopy(buf, uiobuf->pos, chunk);
+		uiobuf->pos += chunk;
+		/* Everything staged: leave it for a later flush. */
+		if (chunk == size)
+			return;
+
+		/* Buffer is full: push it out and carry on with the rest. */
+		(void) lxpr_uiobuf_flush(uiobuf);
+		buf += chunk;
+		size -= chunk;
+	}
+}
+
+#define	TYPBUFFSIZE 256
+
+/* printf into the uiobuf; nothing is emitted once output has stopped. */
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+	va_list args;
+	char buff[TYPBUFFSIZE];
+	int len;
+	char *buffer;
+
+	/* Can we still do any output */
+	if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+		return;
+
+	/* Try using stack allocated buffer */
+	va_start(args, fmt);
+	len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+	va_end(args);
+	if (len < TYPBUFFSIZE) {
+		lxpr_uiobuf_write(uiobuf, buff, len);
+		return;
+	}
+
+	/*
+	 * Too long for the stack buffer; format again into an exact-size
+	 * allocation.  The first vsnprintf() left args indeterminate, so
+	 * the list must be restarted rather than reused.
+	 */
+	buffer = kmem_alloc(len + 1, KM_SLEEP);
+	va_start(args, fmt);
+	(void) vsnprintf(buffer, len + 1, fmt, args);
+	va_end(args);
+	lxpr_uiobuf_write(uiobuf, buffer, len);
+	kmem_free(buffer, len + 1);
+}
+
+/*
+ * lxpr_lock(): look up a process by pid and return it with p_lock held and
+ * P_PR_LOCK set.  Returns NULL if it is not found, is in SIDL, or (for
+ * NO_ZOMB) is exiting or a zombie.
+ */
+proc_t *
+lxpr_lock(pid_t pid, zombok_t zombie_ok)
+{
+	proc_t *p;
+	kmutex_t *mp;
+	pid_t find_pid;
+	uint_t flags;
+
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	for (;;) {
+		mutex_enter(&pidlock);
+
+		/*
+		 * If the pid is 1, we really want the zone's init process;
+		 * if 0 we want zsched.
+		 */
+		if (pid == 1) {
+			find_pid = curproc->p_zone->zone_proc_initpid;
+		} else if (pid == 0) {
+			find_pid = curproc->p_zone->zone_zsched->p_pid;
+		} else {
+			find_pid = pid;
+		}
+		p = prfind(find_pid);
+
+		if (p == NULL || p->p_stat == SIDL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait().  Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+
+		mutex_exit(&pidlock);
+
+		/*
+		 * Filter out exiting or zombie processes, if requested.
+		 */
+		if (zombie_ok == NO_ZOMB &&
+		    ((p->p_flag & SEXITING) || p->p_stat == SZOMB)) {
+			mutex_exit(mp);
+			return (NULL);
+		}
+
+		flags = p->p_proc_flag & (P_PR_LOCK | P_PR_EXEC);
+		if (flags == 0) {
+			break;
+		} else if (flags == P_PR_EXEC && p == curproc) {
+			/*
+			 * Forward progress with (only) the PR_EXEC flag is
+			 * safe if a process is accessing resources in its own
+			 * piddir.  Executing its own /proc/<pid>/exe symlink
+			 * is one potential example.
+			 *
+			 * For all other processes, it is necessary to wait
+			 * until the exec is completed.
+			 */
+			break;
+		}
+		/* P_PR_LOCK or P_PR_EXEC is set: wait, then look up again. */
+		cv_wait(&pr_pid_cv[p->p_slot], mp);
+		mutex_exit(mp);
+	}
+
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+	return (p);
+}
+
+/*
+ * lxpr_unlock(): undo lxpr_lock().  Signals the process's pr_pid_cv
+ * slot, clears P_PR_LOCK and drops p_lock.  The caller must hold p_lock
+ * (and not pidlock) with P_PR_LOCK set.
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	cv_signal(&pr_pid_cv[p->p_slot]);
+	p->p_proc_flag &= ~P_PR_LOCK;
+	mutex_exit(&p->p_lock);
+	THREAD_KPRI_RELEASE();
+}
+
+void
+lxpr_initnodecache(void)
+{
+	lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+	    sizeof (lxpr_node_t), 0,	/* one lxpr_node_t per object */
+	    lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache(void)
+{
+	kmem_cache_destroy(lxpr_node_cache);	/* undo lxpr_initnodecache() */
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+	lxpr_node_t	*node = buf;
+	vnode_t		*vnp = vn_alloc(kmflags);
+
+	/* Pair the node with a vnode; construction fails without one. */
+	node->lxpr_vnode = vnp;
+	if (vnp != NULL) {
+		(void) vn_setops(vnp, lxpr_vnodeops);
+		vnp->v_data = node;
+		return (0);
+	}
+	return (-1);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+	lxpr_node_t	*node = buf;
+	/* Release the vnode the constructor attached to this node. */
+	vn_free(node->lxpr_vnode);
+}
+
+/*
+ * lxpr_inode(): derive the inode number of an lxproc node from its type,
+ * pid and descriptor.  The pids 1 and 0 stand for the current zone's
+ * init and zsched and are translated to those processes' pids first.
+ * Every result is an offset from maxpid.
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc)
+{
+	int slice;
+
+	/* Translate the zone-relative aliases. */
+	if (pid == 0) {
+		pid = curproc->p_zone->zone_zsched->p_pid;
+	} else if (pid == 1) {
+		pid = curproc->p_zone->zone_proc_initpid;
+	}
+
+	/* These directory types have formulas of their own. */
+	if (type == LXPR_PIDDIR)
+		return (maxpid + pid + 1);
+	if (type == LXPR_PID_TASK_IDDIR)
+		return (maxpid + (desc * 10));
+	if (type == LXPR_PROCDIR)
+		return (maxpid + 2);
+
+	/* Everything else lives in a per-process slice of the number space. */
+	slice = maxpid + 2 + (pid * (LXPR_FD_PERPROC + LXPR_NFILES));
+	if (type == LXPR_PID_FD_FD)
+		return (slice + LXPR_NFILES + desc);
+	return (slice + type);
+}
+
+/*
+ * lxpr_parentinode(): inode number of the directory containing lxpnp.
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+	lxpr_node_t *owner = lxpnp;
+
+	/*
+	 * The root's parent is the mounted-on inode, so the root reports
+	 * its own number; every other node defers to lxpr_parent.
+	 */
+	if (lxpnp->lxpr_type != LXPR_PROCDIR)
+		owner = VTOLXP(lxpnp->lxpr_parent);
+	return (owner->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc)
+{
+	lxpr_node_t *lxpnp;
+	vnode_t *vp;
+	user_t *up;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below)
+	 */
+	gethrestime(&now);
+	lxpnp->lxpr_type = type;
+	lxpnp->lxpr_realvp = NULL;
+	lxpnp->lxpr_parent = dp;
+	lxpnp->lxpr_desc = desc;
+	VN_HOLD(dp);
+	if (p != NULL) {
+		if (p->p_pid == curproc->p_zone->zone_proc_initpid) {
+			lxpnp->lxpr_pid = 1;
+		} else if (p->p_pid == curproc->p_zone->zone_zsched->p_pid) {
+			lxpnp->lxpr_pid = 0;
+		} else {
+			lxpnp->lxpr_pid = p->p_pid;
+		}
+
+		lxpnp->lxpr_time = PTOU(p)->u_start;
+		lxpnp->lxpr_uid = crgetruid(p->p_cred);
+		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc);
+	} else {
+		/* Pretend files without a proc belong to sched */
+		lxpnp->lxpr_pid = 0;
+		lxpnp->lxpr_time = now;
+		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+	}
+
+	/* initialize the vnode data */
+	vp = lxpnp->lxpr_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Do node specific stuff
+	 */
+	if (lxpr_is_writable(type)) {
+		/* These two have different modes; handled later. */
+		if (type != LXPR_PID_FD_FD && type != LXPR_PID_TID_FD_FD) {
+			vp->v_type = VREG;
+			lxpnp->lxpr_mode = 0644;
+			return (lxpnp);
+		}
+	}
+
+	switch (type) {
+	case LXPR_PROCDIR:
+		vp->v_flag |= VROOT;
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
+		break;
+
+	case LXPR_PID_CURDIR:
+		ASSERT(p != NULL);
+
+		/*
+		 * Zombie check.  p_stat is officially protected by pidlock,
+		 * but we can't grab pidlock here because we already hold
+		 * p_lock.  Luckily if we look at the process exit code
+		 * we see that p_stat only transitions from SRUN to SZOMB
+		 * while p_lock is held.  Aside from this, the only other
+		 * p_stat transition that we need to be aware about is
+		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
+		 * ignores nodes in the SIDL state so we'll never get a node
+		 * that isn't already in the SRUN state.
+		 */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp = up->u_cdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_ROOTDIR:
+		ASSERT(p != NULL);
+		/* Zombie check.  see locking comment above */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp =
+			    up->u_rdir != NULL ? up->u_rdir : rootdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_EXE:
+		ASSERT(p != NULL);
+		lxpnp->lxpr_realvp = p->p_exec;
+		if (lxpnp->lxpr_realvp != NULL) {
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;
+		break;
+
+	case LXPR_SELF:
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_TASKDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
+		break;
+
+	case LXPR_PID_TASK_IDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
+		break;
+
+	case LXPR_PID_FD_FD:
+	case LXPR_PID_TID_FD_FD:
+		ASSERT(p != NULL);
+		/* lxpr_realvp is set after we return */
+		lxpnp->lxpr_mode = 0700;	/* read-write-exe owner only */
+		vp->v_type = VLNK;
+		break;
+
+	case LXPR_PID_FDDIR:
+	case LXPR_PID_TID_FDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0500;	/* read-search by owner only */
+		break;
+
+	case LXPR_PIDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0511;
+		break;
+
+	case LXPR_NETDIR:
+	case LXPR_SYSDIR:
+	case LXPR_SYS_FSDIR:
+	case LXPR_SYS_FS_INOTIFYDIR:
+	case LXPR_SYS_KERNELDIR:
+	case LXPR_SYS_KERNEL_RANDDIR:
+	case LXPR_SYS_NETDIR:
+	case LXPR_SYS_NET_COREDIR:
+	case LXPR_SYS_NET_IPV4DIR:
+	case LXPR_SYS_VMDIR:
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by all */
+		break;
+
+	case LXPR_PID_ENV:
+	case LXPR_PID_MEM:
+		ASSERT(p != NULL);
+		/*FALLTHRU*/
+	case LXPR_KCORE:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0400;	/* read-only by owner only */
+		break;
+
+	default:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0444;	/* read-only by all */
+		break;
+	}
+
+	return (lxpnp);
+}
+
+
+/*
+ * Tear down a node created by lxpr_getnode(): drop the vnode holds the node
+ * carries and hand its memory back to lxpr_node_cache.
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+	vnode_t *realvp, *parentvp;
+
+	ASSERT(lxpnp != NULL);
+	ASSERT(LXPTOV(lxpnp) != NULL);
+
+	realvp = lxpnp->lxpr_realvp;
+	parentvp = lxpnp->lxpr_parent;
+
+	/* Drop the hold on the underlying (real) vnode, if there is one. */
+	if (realvp != NULL)
+		VN_RELE(realvp);
+
+	/* Drop the hold on the parent directory vnode, if there is one. */
+	if (parentvp != NULL)
+		VN_RELE(parentvp);
+
+	/* Nothing else hangs off the node; return it to the cache. */
+	kmem_cache_free(lxpr_node_cache, lxpnp);
+}
+
+/*
+ * Look up /proc/<pid>/fd/<name>: returns the vnode of a new LXPR_PID_FD_FD
+ * node whose lxpr_realvp holds the file open at that descriptor in the owning
+ * process, or NULL if the name is not a plain decimal fd, the process cannot
+ * be locked or is system-owned, or the descriptor is not open.
+ */
+vnode_t *
+lxpr_lookup_fdnode(vnode_t *dvp, const char *name)
+{
+	lxpr_node_t *dirnode = VTOLXP(dvp);
+	lxpr_node_t *fdnode;
+	char *end = NULL;
+	long val;
+	int fd;
+	proc_t *p;
+	vnode_t *realvp = NULL;
+	file_t *fp;
+	uf_entry_t *ufp;
+	uf_info_t *fip;
+
+	ASSERT(dirnode->lxpr_type == LXPR_PID_FDDIR ||
+	    dirnode->lxpr_type == LXPR_PID_TID_FDDIR);
+
+	if (ddi_strtol(name, &end, 10, &val) != 0)
+		return (NULL);
+
+	/*
+	 * ddi_strtol allows leading spaces and trailing garbage; insist on
+	 * a name made up of nothing but a decimal number.
+	 */
+	if (name[0] < '0' || name[0] > '9' || *end != '\0')
+		return (NULL);
+
+	fd = (int)val;
+	if (fd < 0)
+		return (NULL);
+
+	/* Lock the owner process */
+	if ((p = lxpr_lock(dirnode->lxpr_pid, NO_ZOMB)) == NULL)
+		return (NULL);
+
+	/* Not applicable to processes which are system-owned. */
+	if ((p->p_flag & SSYS) != 0 || p->p_as == &kas) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+
+	fdnode = lxpr_getnode(dvp, LXPR_PID_FD_FD, p, fd);
+
+	/*
+	 * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+	 * going away while we dereference into fi_list.
+	 */
+	fip = P_FINFO(p);
+	mutex_exit(&p->p_lock);
+	mutex_enter(&fip->fi_lock);
+	if (fd < fip->fi_nfiles) {
+		UF_ENTER(ufp, fip, fd);
+		fp = ufp->uf_file;
+		if (fp != NULL) {
+			realvp = fp->f_vnode;
+			VN_HOLD(realvp);
+		}
+		UF_EXIT(ufp);
+	}
+	mutex_exit(&fip->fi_lock);
+	mutex_enter(&p->p_lock);
+
+	if (realvp == NULL) {
+		lxpr_unlock(p);
+		lxpr_freenode(fdnode);
+		return (NULL);
+	}
+
+	/* Record the held vnode; lxpr_freenode() releases it with the node. */
+	fdnode->lxpr_realvp = realvp;
+
+	/*
+	 * Linux presents sockets and pipes as symlinks with made-up targets.
+	 * When lxpr_readlink_fdnode() can produce such a target, report the
+	 * type as VNON to bypass link-following elsewhere in the vfs system.
+	 */
+	if (lxpr_readlink_fdnode(fdnode, NULL, 0) == 0)
+		LXPTOV(fdnode)->v_type = VNON;
+
+	lxpr_unlock(p);
+	ASSERT(LXPTOV(fdnode) != NULL);
+	return (LXPTOV(fdnode));
+}
+
+/*
+ * Build the fake Linux symlink target ("socket:[ino]" or "pipe:[ino]") for an
+ * fd node; returns 0 if built (bp may be NULL to just probe) or -1 otherwise.
+ */
+int
+lxpr_readlink_fdnode(lxpr_node_t *lxpnp, char *bp, size_t len)
+{
+	const char *format;
+	vnode_t *rvp = lxpnp->lxpr_realvp;
+	vattr_t attr;
+
+	switch (rvp->v_type) {
+	case VSOCK:
+		format = "socket:[%lu]";
+		break;
+	case VFIFO:
+		format = "pipe:[%lu]";
+		break;
+	default:
+		return (-1);
+	}
+
+	attr.va_mask = AT_NODEID;	/* only the inode number is wanted */
+	if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0)
+		return (-1);
+
+	if (bp != NULL)
+		(void) snprintf(bp, len, format, (ino_t)attr.va_nodeid);
+	return (0);
+}
+
+/*
+ * Translate a Linux core_pattern path to a native Illumos one, by replacing
+ * the appropriate % escape sequences.  Unrecognised % escapes are doubled so
+ * they land literally in the path (to mimic Linux); a lone trailing '%' is
+ * dropped and literal text that does not fit is cut short.  Returns 0, or
+ * EINVAL if outsz is 0 or there is too little room left for an escape.
+ */
+int
+lxpr_core_path_l2s(const char *inp, char *outp, size_t outsz)
+{
+	size_t i = 0, j = 0;
+	char x;
+
+	if (outsz == 0)		/* "outsz - 1" below would wrap around */
+		return (EINVAL);
+
+	while (j < outsz - 1) {
+		x = inp[i++];
+		if (x == '\0')
+			break;
+		if (x != '%') {
+			outp[j++] = x;
+			continue;
+		}
+
+		x = inp[i++];
+		if (x == '\0')
+			break;
+
+		/* Make sure we have enough space in the output buffer. */
+		if (j + 2 >= outsz - 1)
+			return (EINVAL);
+
+		switch (x) {
+		case 'E':
+			if (j + 4 >= outsz - 1)
+				return (EINVAL);
+			outp[j++] = '%';
+			outp[j++] = 'd';
+			outp[j++] = '%';
+			outp[j++] = 'f';
+			break;
+		case 'e':
+		case 'h':
+			outp[j++] = '%';
+			outp[j++] = (x == 'e' ? 'f' : 'n');
+			break;
+		case 'p':
+		case 'g':
+		case 'u':
+		case 't':
+		case '%':
+			outp[j++] = '%';
+			outp[j++] = x;
+			break;
+		default:
+			/* No translation, make it literal. */
+			if (j + 3 >= outsz - 1)
+				return (EINVAL);
+			outp[j++] = '%';
+			outp[j++] = '%';
+			outp[j++] = x;
+			break;
+		}
+	}
+
+	outp[j] = '\0';
+	return (0);
+}
+
+/*
+ * Translate an Illumos core pattern path back to Linux format, dropping any
+ * escape with no Linux equivalent and cutting short literal text that does
+ * not fit.  Returns 0, or EINVAL if outsz is 0 or an escape lacks room.
+ */
+int
+lxpr_core_path_s2l(const char *inp, char *outp, size_t outsz)
+{
+	size_t i = 0, j = 0;
+	char x;
+
+	if (outsz == 0)		/* "outsz - 1" below would wrap around */
+		return (EINVAL);
+
+	while (j < outsz - 1) {
+		if ((x = inp[i++]) == '\0')
+			break;
+		if (x != '%') {
+			outp[j++] = x;
+			continue;
+		}
+
+		if ((x = inp[i++]) == '\0')
+			break;
+
+		/* Make sure we have enough space in the output buffer. */
+		if (j + 2 >= outsz - 1)
+			return (EINVAL);
+
+		switch (x) {
+		case 'd':
+			/* No Linux equivalent unless it's %d%f. */
+			if (inp[i] == '%' && inp[i + 1] == 'f') {
+				i += 2;
+				outp[j++] = '%';
+				outp[j++] = 'E';
+			}
+			break;
+		case 'f':
+		case 'n':
+			outp[j++] = '%';
+			outp[j++] = (x == 'f' ? 'e' : 'h');
+			break;
+		case 'p':
+		case 'P':
+		case 'g':
+		case 'u':
+		case 't':
+		case '%':
+			outp[j++] = '%';
+			outp[j++] = (x == 'P' ? 'p' : x);
+			break;
+		default:
+			/* No translation. */
+			break;
+		}
+	}
+
+	outp[j] = '\0';
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
new file mode 100644
index 0000000000..b4dc5091c2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
@@ -0,0 +1,377 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_prvfsops.c: vfs operations for the lx brand /proc filesystem (lx_proc).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+#include "lx_proc.h"
+
+/* Module level parameters */
+static int	lxprocfstype;		/* fstype passed to lxpr_init() */
+static dev_t	lxprocdev;		/* pseudo device made in lxpr_init() */
+static kmutex_t	lxpr_mount_lock;	/* taken by mount and unmount */
+
+int nproc_highbit;	/* highbit(v.v_proc), set by lxpr_init() */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_proc",
+	lxpr_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information for the kernel; used by _init/_info/_fini.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlfs, NULL
+};
+
+/* Module load entry point: install the module described by modlinkage. */
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/* Module information entry point. */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Module unload entry point.  If mod_remove() refuses, its error is returned
+ * and nothing is torn down; otherwise the node cache, ops vectors and mount
+ * lock are released.
+ */
+int
+_fini(void)
+{
+	int error = mod_remove(&modlinkage);
+
+	if (error != 0)
+		return (error);
+
+	/* Destroy the lxpr_node cache. */
+	lxpr_fininodecache();
+
+	/* Clean out the vfsops and vnodeops. */
+	(void) vfs_freevfsops_by_type(lxprocfstype);
+	vn_freevnodeops(lxpr_vnodeops);
+
+	mutex_destroy(&lxpr_mount_lock);
+
+	return (0);
+}
+
+/*
+ * Filesystem-type initialization, reached through vfw.  Registers the vfs
+ * and vnode operations, picks the pseudo device number and creates the node
+ * cache.  A failed registration undoes the earlier steps and is returned.
+ */
+static int
+lxpr_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxpr_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = lxpr_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = lxpr_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = lxpr_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = lxpr_statvfs },
+		NULL,			NULL
+	};
+	extern const fs_operation_def_t lxpr_vnodeops_template[];
+	int error;
+	major_t dev;
+
+	nproc_highbit = highbit(v.v_proc);
+	lxprocfstype = fstype;
+	ASSERT(lxprocfstype != 0);
+
+	mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Associate VFS ops vector with this fstype. */
+	error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+		mutex_destroy(&lxpr_mount_lock);
+		return (error);
+	}
+
+	/* Set up vnode ops vector too. */
+	error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+		mutex_destroy(&lxpr_mount_lock);
+		return (error);
+	}
+
+	/*
+	 * Assign a unique "device" number (used by stat(2)).  Failing to
+	 * get one is not fatal: warn and carry on with major 0.
+	 */
+	if ((dev = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+		dev = 0;
+	}
+
+	/* Make the pseudo device */
+	lxprocdev = makedevice(dev, 0);
+
+	/* Initialise cache for lxpr_nodes */
+	lxpr_initnodecache();
+
+	return (0);
+}
+
+/*
+ * Mount lx_proc on mvp.  The caller must pass secpolicy_fs_mount() (EPERM),
+ * mvp must be a directory (ENOTDIR), the calling process must be in an lx
+ * branded zone (ENOTSUP) and, unless MS_OVERLAY is given, mvp must be neither
+ * busy nor a filesystem root (EBUSY).  On success the per-mount state (LDI
+ * ident, zone hold and root node) is hung off vfsp->vfs_data, and is torn
+ * down again by lxpr_unmount().
+ */
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+	lxpr_mnt_t *lxpr_mnt;
+	zone_t *zone = curproc->p_zone;
+	ldi_ident_t li;
+	int err;
+
+	/* must be root to mount */
+	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+		return (EPERM);
+
+	/* mount point must be a directory */
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/* Mounting lx_proc is not allowed outside an LX zone. */
+	if (zone->zone_brand != &lx_brand) {
+		return (ENOTSUP);
+	}
+
+	/* Having the resource be anything but "lxproc" doesn't make sense */
+	vfs_setresource(vfsp, "lxproc", 0);
+
+	lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+	if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+		kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+		return (err);
+	}
+	lxpr_mnt->lxprm_li = li;
+
+	mutex_enter(&lxpr_mount_lock);
+
+	/*
+	 * Ensure we don't allow overlaying mounts.  Everything acquired so
+	 * far, including the LDI ident, must be given back on this path.
+	 */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		mutex_exit(&lxpr_mount_lock);
+		ldi_ident_release(li);
+		kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/* Hold a zone reference for access to the lxzd structure. */
+	zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+	/*
+	 * Allocate the first vnode and arbitrarily set the parent vnode to the
+	 * mounted over directory
+	 */
+	lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+	/* Correctly set the fs for the root node */
+	lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+	vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+	vfsp->vfs_bsize = DEV_BSIZE;
+	vfsp->vfs_fstype = lxprocfstype;
+	vfsp->vfs_data = (caddr_t)lxpr_mnt;
+	vfsp->vfs_dev = lxprocdev;
+
+	mutex_exit(&lxpr_mount_lock);
+
+	return (0);
+}
+
+/*
+ * Unmount lx_proc.  Fails with EPERM if the caller may not unmount, ENOTSUP
+ * for a forced unmount and EBUSY if the root vnode's hold count exceeds one;
+ * otherwise releases the root node, zone hold, LDI ident and mount structure
+ * that lxpr_mount() attached to the vfs.
+ */
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+	lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+	vnode_t *rootvp;
+	int holds;
+
+	ASSERT(lxpr_mnt != NULL);
+	rootvp = LXPTOV(lxpr_mnt->lxprm_node);
+
+	mutex_enter(&lxpr_mount_lock);
+
+	/* must be root to unmount */
+	if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+		mutex_exit(&lxpr_mount_lock);
+		return (EPERM);
+	}
+
+	/* forced unmount is not supported by this file system */
+	if ((flag & MS_FORCE) != 0) {
+		mutex_exit(&lxpr_mount_lock);
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Ensure that no vnodes are in use on this mount point.  The count
+	 * is sampled under v_lock but checked after dropping it.
+	 */
+	mutex_enter(&rootvp->v_lock);
+	holds = rootvp->v_count;
+	mutex_exit(&rootvp->v_lock);
+	if (holds > 1) {
+		mutex_exit(&lxpr_mount_lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * purge the dnlc cache for vnode entries
+	 * associated with this file system
+	 */
+	(void) dnlc_purge_vfsp(vfsp, 0);
+
+	/* free up the lxprnode, then the rest of the per-mount state */
+	lxpr_freenode(lxpr_mnt->lxprm_node);
+	zone_rele(lxpr_mnt->lxprm_zone);
+
+	ldi_ident_release(lxpr_mnt->lxprm_li);
+
+	kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+	mutex_exit(&lxpr_mount_lock);
+
+	return (0);
+}
+
+/* VFS_ROOT: hand back the mount's root vnode in *vpp, with a fresh hold. */
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	vnode_t *rootvp = LXPTOV(((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node);
+
+	VN_HOLD(rootvp);
+	*vpp = rootvp;
+	return (0);
+}
+
+/*
+ * VFS_STATVFS: report fixed geometry plus file counts computed from v.v_proc
+ * and nproc; lx_proc reports no blocks.
+ */
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+	extern uint_t nproc;
+	dev32_t fsid32;
+	int nfree = v.v_proc - nproc;
+
+	/* Start from all zeroes; the block counts and f_fstr tail stay 0. */
+	bzero((caddr_t)sp, sizeof (*sp));
+	sp->f_bsize	= DEV_BSIZE;
+	sp->f_frsize	= DEV_BSIZE;
+	sp->f_files	= (fsfilcnt64_t)v.v_proc + 2;
+	sp->f_ffree	= (fsfilcnt64_t)nfree;
+	sp->f_favail	= (fsfilcnt64_t)nfree;
+	(void) cmpldev(&fsid32, vfsp->vfs_dev);
+	sp->f_fsid	= fsid32;
+	sp->f_flag	= vf_to_stf(vfsp->vfs_flag);
+	sp->f_namemax	= 64;		/* quite arbitrary */
+
+	/* It is guaranteed that vsw_name will fit in f_basetype */
+	(void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+
+	/* We know f_fstr is 32 chars; "/proc" goes in at offsets 0 and 6. */
+	(void) strcpy(sp->f_fstr, "/proc");
+	(void) strcpy(&sp->f_fstr[6], "/proc");
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
new file mode 100644
index 0000000000..262339c31c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c
@@ -0,0 +1,7085 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_proc -- a Linux-compatible /proc for the LX brand
+ *
+ * We have -- confusingly -- two implementations of Linux /proc.  One is to
+ * support native (but Linux-borne) programs that wish to view the native
+ * system through the Linux /proc model; the other -- this one -- is to
+ * support Linux binaries via the LX brand.  These two implementations differ
+ * greatly in their aspirations (and their willingness to bend the truth
+ * of the system to accommodate those aspirations); they should not be unified.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <lx_signum.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <lx_auxv.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/fcntl.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+#include <sys/param.h>
+#include <sys/utsname.h>
+#include <sys/rctl.h>
+#include <sys/kstat.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_types.h>
+#include <sys/brand.h>
+#include <sys/cred_impl.h>
+#include <sys/tihdr.h>
+#include <sys/corectl.h>
+#include <inet/ip.h>
+#include <inet/ip_ire.h>
+#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
+#include <inet/ipclassifier.h>
+#include <sys/socketvar.h>
+#include <fs/sockfs/socktpi.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+extern int prreadargv(proc_t *, char *, size_t, size_t *);
+extern int prreadenvv(proc_t *, char *, size_t, size_t *);
+extern int prreadbuf(proc_t *, uintptr_t, uint8_t *, size_t, size_t *);
+
+#include "lx_proc.h"
+
+extern pgcnt_t swapfs_minfree;
+extern time_t boot_time;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxpr_init() in lx_prvfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+    caller_context_t *);
+static int lxpr_create(struct vnode *, char *, struct vattr *, enum vcexcl,
+    int, struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_write(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_space(vnode_t *, int, flock64_t *, int, offset_t, cred_t *,
+    caller_context_t *);
+static int lxpr_setattr(vnode_t *, vattr_t *, int, cred_t *,
+    caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+    caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+    pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+    pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+    caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_kdir_randdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_netdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_net_coredir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_net_ipv4dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_sys_vmdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_kdir_randdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_netdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_net_coredir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_sys_vmdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cgroups(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_devices(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_filesystems(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_vmstat(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_auxv(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_cgroup(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_comm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_loginuid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_oom_scr_adj(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_personality(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *,
+    lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *,
+    lxpr_uiobuf_t *);
+static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *,
+    lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_caplcap(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_corepatt(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_osrel(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_sem(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmall(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_shmmni(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_core_somaxc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *,
+    lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_minfr_kb(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_nhpages(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_sys_vm_swappiness(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static int lxpr_write_pid_loginuid(lxpr_node_t *, uio_t *, cred_t *,
+    caller_context_t *);
+static int lxpr_write_sys_net_core_somaxc(lxpr_node_t *, uio_t *, cred_t *,
+    caller_context_t *);
+static int lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *, uio_t *,
+    cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, uio_t *, cred_t *,
+    caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, uio_t *,
+    cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, uio_t *,
+    cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *, uio_t *,
+    cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *, uio_t *,
+    cred_t *, caller_context_t *);
+static int lxpr_write_sys_kernel_corepatt(lxpr_node_t *, uio_t *, cred_t *,
+    caller_context_t *);
+
+/*
+ * Simple unit conversions (a kbyte here is 1024 bytes)
+ */
+#define	btok(x)	((x) >> 10)			/* bytes to kbytes */
+#define	ptok(x)	((x) << (PAGESHIFT - 10))	/* pages to kbytes */
+
+#define	ttolxlwp(t)	((struct lx_lwp_data *)ttolwpbrand(t))
+
+extern rctl_hndl_t rc_process_semmsl;
+extern rctl_hndl_t rc_process_semopm;
+extern rctl_hndl_t rc_zone_semmni;
+
+extern rctl_hndl_t rc_zone_msgmni;
+extern rctl_hndl_t rc_zone_shmmax;
+extern rctl_hndl_t rc_zone_shmmni;
+#define	FOURGB	4294967295	/* 2^32 - 1, i.e. one short of 4 GiB */
+
+/*
+ * The maximum length of the concatenation of argument vector strings we
+ * will return to the user via the branded procfs. Likewise for the env vector.
+ */
+int lxpr_maxargvlen = 4096;
+int lxpr_maxenvvlen = 4096;
+
+/*
+ * The lx /proc vnode operations template (see vn_make_ops() in lxpr_init())
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = lxpr_open },
+	VOPNAME_CLOSE,		{ .vop_close = lxpr_close },
+	VOPNAME_READ,		{ .vop_read = lxpr_read },
+	VOPNAME_WRITE,		{ .vop_write = lxpr_write },
+	VOPNAME_GETATTR,	{ .vop_getattr = lxpr_getattr },
+	VOPNAME_ACCESS,		{ .vop_access = lxpr_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = lxpr_lookup },
+	VOPNAME_CREATE,		{ .vop_create = lxpr_create },
+	VOPNAME_READDIR,	{ .vop_readdir = lxpr_readdir },
+	VOPNAME_READLINK,	{ .vop_readlink = lxpr_readlink },
+	VOPNAME_SPACE,		{ .vop_space = lxpr_space },
+	VOPNAME_SETATTR,	{ .vop_setattr = lxpr_setattr },
+	VOPNAME_FSYNC,		{ .error = lxpr_sync },
+	VOPNAME_SEEK,		{ .error = lxpr_sync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = lxpr_inactive },
+	VOPNAME_CMP,		{ .vop_cmp = lxpr_cmp },
+	VOPNAME_REALVP,		{ .vop_realvp = lxpr_realvp },
+	NULL,			NULL
+};
+
+
+/*
+ * Contents of the top-level lx /proc directory: each entry pairs a node type
+ * with the file name it is presented under.
+ */
+static lxpr_dirent_t lx_procdir[] = {
+	{ LXPR_CGROUPS,		"cgroups" },
+	{ LXPR_CMDLINE,		"cmdline" },
+	{ LXPR_CPUINFO,		"cpuinfo" },
+	{ LXPR_DEVICES,		"devices" },
+	{ LXPR_DISKSTATS,	"diskstats" },
+	{ LXPR_DMA,		"dma" },
+	{ LXPR_FILESYSTEMS,	"filesystems" },
+	{ LXPR_INTERRUPTS,	"interrupts" },
+	{ LXPR_IOPORTS,		"ioports" },
+	{ LXPR_KCORE,		"kcore" },
+	{ LXPR_KMSG,		"kmsg" },
+	{ LXPR_LOADAVG,		"loadavg" },
+	{ LXPR_MEMINFO,		"meminfo" },
+	{ LXPR_MODULES,		"modules" },
+	{ LXPR_MOUNTS,		"mounts" },
+	{ LXPR_NETDIR,		"net" },
+	{ LXPR_PARTITIONS,	"partitions" },
+	{ LXPR_SELF,		"self" },
+	{ LXPR_STAT,		"stat" },
+	{ LXPR_SWAPS,		"swaps" },
+	{ LXPR_SYSDIR,		"sys" },
+	{ LXPR_UPTIME,		"uptime" },
+	{ LXPR_VERSION,		"version" },
+	{ LXPR_VMSTAT,		"vmstat" }
+};
+
+/* Number of entries in lx_procdir. */
+#define	PROCDIRFILES	(sizeof (lx_procdir) / sizeof (lx_procdir[0]))
+
+/*
+ * Contents of an lx /proc/<pid> directory.  Entries are in name order except
+ * for "fd", which is listed last.
+ */
+static lxpr_dirent_t piddir[] = {
+	{ LXPR_PID_AUXV,	"auxv" },
+	{ LXPR_PID_CGROUP,	"cgroup" },
+	{ LXPR_PID_CMDLINE,	"cmdline" },
+	{ LXPR_PID_COMM,	"comm" },
+	{ LXPR_PID_CPU,		"cpu" },
+	{ LXPR_PID_CURDIR,	"cwd" },
+	{ LXPR_PID_ENV,		"environ" },
+	{ LXPR_PID_EXE,		"exe" },
+	{ LXPR_PID_LIMITS,	"limits" },
+	{ LXPR_PID_LOGINUID,	"loginuid" },
+	{ LXPR_PID_MAPS,	"maps" },
+	{ LXPR_PID_MEM,		"mem" },
+	{ LXPR_PID_MOUNTINFO,	"mountinfo" },
+	{ LXPR_PID_OOM_SCR_ADJ,	"oom_score_adj" },
+	{ LXPR_PID_PERSONALITY,	"personality" },
+	{ LXPR_PID_ROOTDIR,	"root" },
+	{ LXPR_PID_STAT,	"stat" },
+	{ LXPR_PID_STATM,	"statm" },
+	{ LXPR_PID_STATUS,	"status" },
+	{ LXPR_PID_TASKDIR,	"task" },
+	{ LXPR_PID_FDDIR,	"fd" }
+};
+
+/* Number of entries in piddir. */
+#define	PIDDIRFILES	(sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * Contents of an lx /proc/<pid>/task/<tid> directory.
+ *
+ * The names mirror piddir minus "task".  Only auxv, comm, oom_score_adj,
+ * stat and status use thread-specific LXPR_PID_TID_* node types here; the
+ * remaining entries reuse the per-process LXPR_PID_* types.
+ */
+static lxpr_dirent_t tiddir[] = {
+	{ LXPR_PID_TID_AUXV,	"auxv" },
+	{ LXPR_PID_CGROUP,	"cgroup" },
+	{ LXPR_PID_CMDLINE,	"cmdline" },
+	{ LXPR_PID_TID_COMM,	"comm" },
+	{ LXPR_PID_CPU,		"cpu" },
+	{ LXPR_PID_CURDIR,	"cwd" },
+	{ LXPR_PID_ENV,		"environ" },
+	{ LXPR_PID_EXE,		"exe" },
+	{ LXPR_PID_LIMITS,	"limits" },
+	{ LXPR_PID_LOGINUID,	"loginuid" },
+	{ LXPR_PID_MAPS,	"maps" },
+	{ LXPR_PID_MEM,		"mem" },
+	{ LXPR_PID_MOUNTINFO,	"mountinfo" },
+	{ LXPR_PID_TID_OOM_SCR_ADJ,	"oom_score_adj" },
+	{ LXPR_PID_PERSONALITY,	"personality" },
+	{ LXPR_PID_ROOTDIR,	"root" },
+	{ LXPR_PID_TID_STAT,	"stat" },
+	{ LXPR_PID_STATM,	"statm" },
+	{ LXPR_PID_TID_STATUS,	"status" },
+	{ LXPR_PID_FDDIR,	"fd" }
+};
+
+/* Number of entries in tiddir. */
+#define	TIDDIRFILES	(sizeof (tiddir) / sizeof (tiddir[0]))
+
+/*
+ * All-ones 64-bit value.
+ * NOTE(review): presumably the Linux RLIM_INFINITY encoding -- confirm.
+ */
+#define	LX_RLIM_INFINITY	0xFFFFFFFFFFFFFFFF
+
+/*
+ * True when the rctl value x has both RCTL_LOCAL_MAXIMAL and
+ * RCTL_GLOBAL_INFINITE set in its rcv_flagaction.  The argument is
+ * parenthesised so that any expression yielding the structure (for example
+ * a dereferenced pointer) expands correctly.
+ */
+#define	RCTL_INFINITE(x) \
+	(((x).rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+	((x).rcv_flagaction & RCTL_GLOBAL_INFINITE))
+
+/*
+ * One row of the resource-limit table: the limit's display name, the unit
+ * string shown for it, and the name of the rctl the value is sourced from.
+ */
+typedef struct lxpr_rlimtab {
+	char	*rlim_name;	/* limit name */
+	char	*rlim_unit;	/* limit unit */
+	char	*rlim_rctl;	/* rctl source */
+} lxpr_rlimtab_t;
+
+/*
+ * Resource limits and the rctl each one is taken from.  "Max file locks"
+ * has no rctl source (NULL).
+ * NOTE(review): "Max msgqueue size" is labelled in bytes but is sourced from
+ * process.max-msg-messages -- confirm the unit mismatch is intended.
+ */
+static lxpr_rlimtab_t lxpr_rlimtab[] = {
+	{ "Max cpu time",	"seconds",	"process.max-cpu-time" },
+	{ "Max file size",	"bytes",	"process.max-file-size" },
+	{ "Max data size",	"bytes",	"process.max-data-size" },
+	{ "Max stack size",	"bytes",	"process.max-stack-size" },
+	{ "Max core file size",	"bytes",	"process.max-core-size" },
+	{ "Max resident set",	"bytes",	"zone.max-physical-memory" },
+	{ "Max processes",	"processes",	"zone.max-lwps" },
+	{ "Max open files",	"files",	"process.max-file-descriptor" },
+	{ "Max locked memory",	"bytes",	"zone.max-locked-memory" },
+	{ "Max address space",	"bytes",	"process.max-address-space" },
+	{ "Max file locks",	"locks",	NULL },
+	{ "Max pending signals",	"signals",
+		"process.max-sigqueue-size" },
+	{ "Max msgqueue size",	"bytes",	"process.max-msg-messages" }
+};
+
+/* Number of rows in lxpr_rlimtab. */
+#define	LX_RLIM_TAB_LEN	(sizeof (lxpr_rlimtab) / sizeof (lxpr_rlimtab[0]))
+
+
+/*
+ * Contents of the lx /proc/net directory.
+ */
+static lxpr_dirent_t netdir[] = {
+	{ LXPR_NET_ARP,		"arp" },
+	{ LXPR_NET_DEV,		"dev" },
+	{ LXPR_NET_DEV_MCAST,	"dev_mcast" },
+	{ LXPR_NET_IF_INET6,	"if_inet6" },
+	{ LXPR_NET_IGMP,	"igmp" },
+	{ LXPR_NET_IP_MR_CACHE,	"ip_mr_cache" },
+	{ LXPR_NET_IP_MR_VIF,	"ip_mr_vif" },
+	{ LXPR_NET_IPV6_ROUTE,	"ipv6_route" },
+	{ LXPR_NET_MCFILTER,	"mcfilter" },
+	{ LXPR_NET_NETSTAT,	"netstat" },
+	{ LXPR_NET_RAW,		"raw" },
+	{ LXPR_NET_ROUTE,	"route" },
+	{ LXPR_NET_RPC,		"rpc" },
+	{ LXPR_NET_RT_CACHE,	"rt_cache" },
+	{ LXPR_NET_SOCKSTAT,	"sockstat" },
+	{ LXPR_NET_SNMP,	"snmp" },
+	{ LXPR_NET_STAT,	"stat" },
+	{ LXPR_NET_TCP,		"tcp" },
+	{ LXPR_NET_TCP6,	"tcp6" },
+	{ LXPR_NET_UDP,		"udp" },
+	{ LXPR_NET_UDP6,	"udp6" },
+	{ LXPR_NET_UNIX,	"unix" }
+};
+
+/* Number of entries in netdir. */
+#define	NETDIRFILES	(sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * Contents of the /proc/sys directory; every entry is itself a directory
+ * type (fs, kernel, net, vm).
+ */
+static lxpr_dirent_t sysdir[] = {
+	{ LXPR_SYS_FSDIR,	"fs" },
+	{ LXPR_SYS_KERNELDIR,	"kernel" },
+	{ LXPR_SYS_NETDIR,	"net" },
+	{ LXPR_SYS_VMDIR,	"vm" },
+};
+
+/* Number of entries in sysdir. */
+#define	SYSDIRFILES	(sizeof (sysdir) / sizeof (sysdir[0]))
+
+/*
+ * Contents of the /proc/sys/fs directory.
+ */
+static lxpr_dirent_t sys_fsdir[] = {
+	{ LXPR_SYS_FS_INOTIFYDIR,	"inotify" },
+};
+
+/* Number of entries in sys_fsdir. */
+#define	SYS_FSDIRFILES (sizeof (sys_fsdir) / sizeof (sys_fsdir[0]))
+
+/*
+ * Contents of the /proc/sys/fs/inotify directory.
+ */
+static lxpr_dirent_t sys_fs_inotifydir[] = {
+	{ LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS,	"max_queued_events" },
+	{ LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES,	"max_user_instances" },
+	{ LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES,		"max_user_watches" },
+};
+
+/* Number of entries in sys_fs_inotifydir. */
+#define	SYS_FS_INOTIFYDIRFILES \
+	(sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0]))
+
+/*
+ * Contents of the /proc/sys/kernel directory.  "random" is a subdirectory
+ * (LXPR_SYS_KERNEL_RANDDIR); the other entries are files.
+ */
+static lxpr_dirent_t sys_kerneldir[] = {
+	{ LXPR_SYS_KERNEL_CAPLCAP,	"cap_last_cap" },
+	{ LXPR_SYS_KERNEL_COREPATT,	"core_pattern" },
+	{ LXPR_SYS_KERNEL_HOSTNAME,	"hostname" },
+	{ LXPR_SYS_KERNEL_MSGMNI,	"msgmni" },
+	{ LXPR_SYS_KERNEL_NGROUPS_MAX,	"ngroups_max" },
+	{ LXPR_SYS_KERNEL_OSREL,	"osrelease" },
+	{ LXPR_SYS_KERNEL_PID_MAX,	"pid_max" },
+	{ LXPR_SYS_KERNEL_RANDDIR,	"random" },
+	{ LXPR_SYS_KERNEL_SEM,		"sem" },
+	{ LXPR_SYS_KERNEL_SHMALL,	"shmall" },
+	{ LXPR_SYS_KERNEL_SHMMAX,	"shmmax" },
+	{ LXPR_SYS_KERNEL_SHMMNI,	"shmmni" },
+	{ LXPR_SYS_KERNEL_THREADS_MAX,	"threads-max" },
+};
+
+/* Number of entries in sys_kerneldir. */
+#define	SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0]))
+
+/*
+ * Contents of the /proc/sys/kernel/random directory.
+ */
+static lxpr_dirent_t sys_randdir[] = {
+	{ LXPR_SYS_KERNEL_RAND_BOOTID,	"boot_id" },
+};
+
+/* Number of entries in sys_randdir. */
+#define	SYS_RANDDIRFILES (sizeof (sys_randdir) / sizeof (sys_randdir[0]))
+
+/*
+ * Contents of the /proc/sys/net directory; both entries are directories.
+ */
+static lxpr_dirent_t sys_netdir[] = {
+	{ LXPR_SYS_NET_COREDIR,		"core" },
+	{ LXPR_SYS_NET_IPV4DIR,		"ipv4" },
+};
+
+/* Number of entries in sys_netdir. */
+#define	SYS_NETDIRFILES (sizeof (sys_netdir) / sizeof (sys_netdir[0]))
+
+/*
+ * Contents of the /proc/sys/net/core directory.
+ */
+static lxpr_dirent_t sys_net_coredir[] = {
+	{ LXPR_SYS_NET_CORE_SOMAXCON,	"somaxconn" },
+};
+
+/* Number of entries in sys_net_coredir. */
+#define	SYS_NET_COREDIRFILES \
+	(sizeof (sys_net_coredir) / sizeof (sys_net_coredir[0]))
+
+/*
+ * Contents of the /proc/sys/net/ipv4 directory.
+ * See the Linux ip(7) & tcp(7) man pages for descriptions and the illumos
+ * ip(7p) & tcp(7p) man pages for the native descriptions.
+ * Every file listed here also has a write handler registered in wr_tab.
+ */
+static lxpr_dirent_t sys_net_ipv4dir[] = {
+	{ LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, "ip_local_port_range" },
+	{ LXPR_SYS_NET_IPV4_TCP_FIN_TO,	"tcp_fin_timeout" },
+	{ LXPR_SYS_NET_IPV4_TCP_KA_INT,	"tcp_keepalive_intvl" },
+	{ LXPR_SYS_NET_IPV4_TCP_KA_TIM,	"tcp_keepalive_time" },
+	{ LXPR_SYS_NET_IPV4_TCP_SACK,	"tcp_sack" },
+	{ LXPR_SYS_NET_IPV4_TCP_WINSCALE, "tcp_window_scaling" },
+};
+
+/* Number of entries in sys_net_ipv4dir. */
+#define	SYS_NET_IPV4DIRFILES \
+	(sizeof (sys_net_ipv4dir) / sizeof (sys_net_ipv4dir[0]))
+
+/*
+ * Contents of the /proc/sys/vm directory.
+ */
+static lxpr_dirent_t sys_vmdir[] = {
+	{ LXPR_SYS_VM_MAX_MAP_CNT,	"max_map_count" },
+	{ LXPR_SYS_VM_MINFR_KB,		"min_free_kbytes" },
+	{ LXPR_SYS_VM_NHUGEP,		"nr_hugepages" },
+	{ LXPR_SYS_VM_OVERCOMMIT_MEM,	"overcommit_memory" },
+	{ LXPR_SYS_VM_SWAPPINESS,	"swappiness" },
+};
+
+/* Number of entries in sys_vmdir. */
+#define	SYS_VMDIRFILES (sizeof (sys_vmdir) / sizeof (sys_vmdir[0]))
+
+/*
+ * Table for standard writable files. Non-standard writable files not in this
+ * table can be handled explicitly as special cases.
+ * This table drives lxpr_is_writable, lxpr_write, and lxpr_create.
+ * Note that the entries LXPR_PID_FD_FD and LXPR_PID_TID_FD_FD exist in the
+ * table both to verify writability and to satisfy opening with O_CREAT.
+ *
+ * The table is terminated by an LXPR_INVALID entry.  Several entries have a
+ * NULL write function: lxpr_is_writable() still reports those types as
+ * writable, since it only compares wft_type.
+ * NOTE(review): how lxpr_write treats a NULL wft_wrf is not visible here --
+ * confirm before adding new NULL entries.
+ */
+typedef struct wftab {
+	lxpr_nodetype_t	wft_type;	/* file entry type */
+	int		(*wft_wrf)(lxpr_node_t *, struct uio *, cred_t *,
+			    caller_context_t *); /* write function */
+} wftab_t;
+
+static wftab_t wr_tab[] = {
+	{LXPR_PID_FD_FD, NULL},
+	{LXPR_PID_LOGINUID, lxpr_write_pid_loginuid},
+	{LXPR_PID_OOM_SCR_ADJ, NULL},
+	{LXPR_PID_TID_FD_FD, NULL},
+	{LXPR_PID_TID_OOM_SCR_ADJ, NULL},
+	{LXPR_SYS_KERNEL_COREPATT, lxpr_write_sys_kernel_corepatt},
+	{LXPR_SYS_KERNEL_SHMALL, NULL},
+	{LXPR_SYS_KERNEL_SHMMAX, NULL},
+	{LXPR_SYS_NET_CORE_SOMAXCON, lxpr_write_sys_net_core_somaxc},
+	{LXPR_SYS_NET_IPV4_IP_LPORT_RANGE,
+	    lxpr_write_sys_net_ipv4_ip_lport_range},
+	{LXPR_SYS_NET_IPV4_TCP_FIN_TO, lxpr_write_sys_net_ipv4_tcp_fin_to},
+	{LXPR_SYS_NET_IPV4_TCP_KA_INT, lxpr_write_sys_net_ipv4_tcp_ka_int},
+	{LXPR_SYS_NET_IPV4_TCP_KA_TIM, lxpr_write_sys_net_ipv4_tcp_ka_tim},
+	{LXPR_SYS_NET_IPV4_TCP_SACK, lxpr_write_sys_net_ipv4_tcp_sack},
+	{LXPR_SYS_NET_IPV4_TCP_WINSCALE, lxpr_write_sys_net_ipv4_tcp_winscale},
+	{LXPR_SYS_VM_OVERCOMMIT_MEM, NULL},
+	{LXPR_SYS_VM_SWAPPINESS, NULL},
+	{LXPR_INVALID, NULL}
+};
+
+/*
+ * Report whether a node type is one of the standard writable proc files,
+ * i.e. whether it appears in wr_tab ahead of the LXPR_INVALID terminator.
+ * Non-standard writable files may be handled separately by callers.
+ */
+boolean_t
+lxpr_is_writable(lxpr_nodetype_t type)
+{
+	wftab_t *wfp;
+
+	/* Walk the table until the sentinel entry is reached. */
+	for (wfp = wr_tab; wfp->wft_type != LXPR_INVALID; wfp++) {
+		if (wfp->wft_type == type)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ *
+ * Opens for writing fail with EPERM unless lxpr_is_writable() accepts the
+ * node type.  A node with no underlying vnode (lxpr_realvp == NULL) needs no
+ * further work.  Otherwise the open is forwarded to the underlying vnode
+ * and, on success, *vpp is switched to that vnode and the hold on the lx
+ * proc vnode is released.
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t		*vp = *vpp;
+	lxpr_node_t	*lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t	type = lxpnp->lxpr_type;
+	vnode_t		*rvp = lxpnp->lxpr_realvp;
+	int		error;
+
+	/* Restrict writes to certain files */
+	if ((flag & FWRITE) && !lxpr_is_writable(type))
+		return (EPERM);
+
+	/* No underlying file: nothing more to open. */
+	if (rvp == NULL)
+		return (0);
+
+	/*
+	 * When opening an underlying file through an fd entry, only regular
+	 * files, fifos and sockets are allowed; anything else is rejected.
+	 * Other node types with a real vnode (such as the current or root
+	 * directory) are opened unconditionally.
+	 */
+	if (type == LXPR_PID_FD_FD) {
+		switch (rvp->v_type) {
+		case VREG:
+		case VSOCK:
+			break;
+		case VFIFO:
+			/*
+			 * This flag lets the fifo open know that we're using
+			 * proc/fd to open a fd which we already have open.
+			 * Otherwise, the fifo might reject an open if the
+			 * other end has closed.
+			 */
+			flag |= FKLYR;
+			break;
+		default:
+			return (EACCES);
+		}
+	}
+
+	/*
+	 * Need to hold rvp since VOP_OPEN() may release it.
+	 */
+	VN_HOLD(rvp);
+	error = VOP_OPEN(&rvp, flag, cr, ct);
+	if (error != 0) {
+		VN_RELE(rvp);
+		return (error);
+	}
+
+	/* Hand back the underlying vnode and drop the hold on ours. */
+	*vpp = rvp;
+	VN_RELE(vp);
+
+	return (0);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ *
+ * There is nothing to tear down, so this always returns 0.
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_nodetype_t	type = VTOLXP(vp)->lxpr_type;
+
+	/*
+	 * The close for these node types is performed on the realvp, so we
+	 * should never be called for them.
+	 */
+	ASSERT(type != LXPR_PID_FD_FD &&
+	    type != LXPR_PID_CURDIR &&
+	    type != LXPR_PID_ROOTDIR &&
+	    type != LXPR_PID_EXE);
+
+	return (0);
+}
+
+/*
+ * Array of read functions, indexed by lx /proc file type.
+ *
+ * Entries must stay in node-type order.  Entry 0 (invalid) is NULL.
+ * /proc/kmsg is listed as lxpr_read_invalid because lxpr_read() handles that
+ * node type itself.
+ */
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+	NULL,				/* invalid		*/
+	lxpr_read_isdir,		/* /proc		*/
+	lxpr_read_isdir,		/* /proc/<pid>		*/
+	lxpr_read_pid_auxv,		/* /proc/<pid>/auxv	*/
+	lxpr_read_pid_cgroup,		/* /proc/<pid>/cgroup	*/
+	lxpr_read_pid_cmdline,		/* /proc/<pid>/cmdline	*/
+	lxpr_read_pid_comm,		/* /proc/<pid>/comm	*/
+	lxpr_read_empty,		/* /proc/<pid>/cpu	*/
+	lxpr_read_invalid,		/* /proc/<pid>/cwd	*/
+	lxpr_read_pid_env,		/* /proc/<pid>/environ	*/
+	lxpr_read_invalid,		/* /proc/<pid>/exe	*/
+	lxpr_read_pid_limits,		/* /proc/<pid>/limits	*/
+	lxpr_read_pid_loginuid,		/* /proc/<pid>/loginuid	*/
+	lxpr_read_pid_maps,		/* /proc/<pid>/maps	*/
+	lxpr_read_empty,		/* /proc/<pid>/mem	*/
+	lxpr_read_pid_mountinfo,	/* /proc/<pid>/mountinfo */
+	lxpr_read_pid_oom_scr_adj,	/* /proc/<pid>/oom_score_adj */
+	lxpr_read_pid_personality,	/* /proc/<pid>/personality */
+	lxpr_read_invalid,		/* /proc/<pid>/root	*/
+	lxpr_read_pid_stat,		/* /proc/<pid>/stat	*/
+	lxpr_read_pid_statm,		/* /proc/<pid>/statm	*/
+	lxpr_read_pid_status,		/* /proc/<pid>/status	*/
+	lxpr_read_isdir,		/* /proc/<pid>/task	*/
+	lxpr_read_isdir,		/* /proc/<pid>/task/nn	*/
+	lxpr_read_isdir,		/* /proc/<pid>/fd	*/
+	lxpr_read_fd,			/* /proc/<pid>/fd/nn	*/
+	lxpr_read_pid_auxv,		/* /proc/<pid>/task/<tid>/auxv	*/
+	lxpr_read_pid_cgroup,		/* /proc/<pid>/task/<tid>/cgroup */
+	lxpr_read_pid_cmdline,		/* /proc/<pid>/task/<tid>/cmdline */
+	lxpr_read_pid_comm,		/* /proc/<pid>/task/<tid>/comm	*/
+	lxpr_read_empty,		/* /proc/<pid>/task/<tid>/cpu	*/
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/cwd	*/
+	lxpr_read_pid_env,		/* /proc/<pid>/task/<tid>/environ */
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/exe	*/
+	lxpr_read_pid_limits,		/* /proc/<pid>/task/<tid>/limits */
+	lxpr_read_pid_loginuid,		/* /proc/<pid>/task/<tid>/loginuid */
+	lxpr_read_pid_maps,		/* /proc/<pid>/task/<tid>/maps	*/
+	lxpr_read_empty,		/* /proc/<pid>/task/<tid>/mem	*/
+	lxpr_read_pid_mountinfo,	/* /proc/<pid>/task/<tid>/mountinfo */
+	lxpr_read_pid_oom_scr_adj,	/* /proc/<pid>/task/<tid>/oom_scr_adj */
+	lxpr_read_pid_personality,	/* /proc/<pid>/task/<tid>/personality */
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/root	*/
+	lxpr_read_pid_tid_stat,		/* /proc/<pid>/task/<tid>/stat	*/
+	lxpr_read_pid_statm,		/* /proc/<pid>/task/<tid>/statm	*/
+	lxpr_read_pid_tid_status,	/* /proc/<pid>/task/<tid>/status */
+	lxpr_read_isdir,		/* /proc/<pid>/task/<tid>/fd	*/
+	lxpr_read_fd,			/* /proc/<pid>/task/<tid>/fd/nn	*/
+	lxpr_read_cgroups,		/* /proc/cgroups	*/
+	lxpr_read_empty,		/* /proc/cmdline	*/
+	lxpr_read_cpuinfo,		/* /proc/cpuinfo	*/
+	lxpr_read_devices,		/* /proc/devices	*/
+	lxpr_read_diskstats,		/* /proc/diskstats	*/
+	lxpr_read_empty,		/* /proc/dma		*/
+	lxpr_read_filesystems,		/* /proc/filesystems	*/
+	lxpr_read_empty,		/* /proc/interrupts	*/
+	lxpr_read_empty,		/* /proc/ioports	*/
+	lxpr_read_empty,		/* /proc/kcore		*/
+	lxpr_read_invalid,		/* /proc/kmsg -- see lxpr_read() */
+	lxpr_read_loadavg,		/* /proc/loadavg	*/
+	lxpr_read_meminfo,		/* /proc/meminfo	*/
+	lxpr_read_empty,		/* /proc/modules	*/
+	lxpr_read_mounts,		/* /proc/mounts		*/
+	lxpr_read_isdir,		/* /proc/net		*/
+	lxpr_read_net_arp,		/* /proc/net/arp	*/
+	lxpr_read_net_dev,		/* /proc/net/dev	*/
+	lxpr_read_net_dev_mcast,	/* /proc/net/dev_mcast	*/
+	lxpr_read_net_if_inet6,		/* /proc/net/if_inet6	*/
+	lxpr_read_net_igmp,		/* /proc/net/igmp	*/
+	lxpr_read_net_ip_mr_cache,	/* /proc/net/ip_mr_cache */
+	lxpr_read_net_ip_mr_vif,	/* /proc/net/ip_mr_vif	*/
+	lxpr_read_net_ipv6_route,	/* /proc/net/ipv6_route	*/
+	lxpr_read_net_mcfilter,		/* /proc/net/mcfilter	*/
+	lxpr_read_net_netstat,		/* /proc/net/netstat	*/
+	lxpr_read_net_raw,		/* /proc/net/raw	*/
+	lxpr_read_net_route,		/* /proc/net/route	*/
+	lxpr_read_net_rpc,		/* /proc/net/rpc	*/
+	lxpr_read_net_rt_cache,		/* /proc/net/rt_cache	*/
+	lxpr_read_net_sockstat,		/* /proc/net/sockstat	*/
+	lxpr_read_net_snmp,		/* /proc/net/snmp	*/
+	lxpr_read_net_stat,		/* /proc/net/stat	*/
+	lxpr_read_net_tcp,		/* /proc/net/tcp	*/
+	lxpr_read_net_tcp6,		/* /proc/net/tcp6	*/
+	lxpr_read_net_udp,		/* /proc/net/udp	*/
+	lxpr_read_net_udp6,		/* /proc/net/udp6	*/
+	lxpr_read_net_unix,		/* /proc/net/unix	*/
+	lxpr_read_partitions,		/* /proc/partitions	*/
+	lxpr_read_invalid,		/* /proc/self		*/
+	lxpr_read_stat,			/* /proc/stat		*/
+	lxpr_read_swaps,		/* /proc/swaps		*/
+	lxpr_read_invalid,		/* /proc/sys		*/
+	lxpr_read_invalid,		/* /proc/sys/fs		*/
+	lxpr_read_invalid,		/* /proc/sys/fs/inotify	*/
+	lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */
+	lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */
+	lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */
+	lxpr_read_invalid,		/* /proc/sys/kernel	*/
+	lxpr_read_sys_kernel_caplcap,	/* /proc/sys/kernel/cap_last_cap */
+	lxpr_read_sys_kernel_corepatt,	/* /proc/sys/kernel/core_pattern */
+	lxpr_read_sys_kernel_hostname,	/* /proc/sys/kernel/hostname */
+	lxpr_read_sys_kernel_msgmni,	/* /proc/sys/kernel/msgmni */
+	lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */
+	lxpr_read_sys_kernel_osrel,	/* /proc/sys/kernel/osrelease */
+	lxpr_read_sys_kernel_pid_max,	/* /proc/sys/kernel/pid_max */
+	lxpr_read_invalid,		/* /proc/sys/kernel/random */
+	lxpr_read_sys_kernel_rand_bootid, /* /proc/sys/kernel/random/boot_id */
+	lxpr_read_sys_kernel_sem,	/* /proc/sys/kernel/sem */
+	lxpr_read_sys_kernel_shmall,	/* /proc/sys/kernel/shmall */
+	lxpr_read_sys_kernel_shmmax,	/* /proc/sys/kernel/shmmax */
+	lxpr_read_sys_kernel_shmmni,	/* /proc/sys/kernel/shmmni */
+	lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */
+	lxpr_read_invalid,		/* /proc/sys/net	*/
+	lxpr_read_invalid,		/* /proc/sys/net/core	*/
+	lxpr_read_sys_net_core_somaxc,	/* /proc/sys/net/core/somaxconn	*/
+	lxpr_read_invalid,		/* /proc/sys/net/ipv4	*/
+	lxpr_read_sys_net_ipv4_ip_lport_range, /* ../ipv4/ip_local_port_range */
+	lxpr_read_sys_net_ipv4_tcp_fin_to, /* .../ipv4/tcp_fin_timeout */
+	lxpr_read_sys_net_ipv4_tcp_ka_int, /* .../ipv4/tcp_keepalive_intvl */
+	lxpr_read_sys_net_ipv4_tcp_ka_tim, /* .../ipv4/tcp_keepalive_time */
+	lxpr_read_sys_net_ipv4_tcp_sack, /* .../ipv4/tcp_sack */
+	lxpr_read_sys_net_ipv4_tcp_winscale, /* .../ipv4/tcp_window_scaling */
+	lxpr_read_invalid,		/* /proc/sys/vm	*/
+	lxpr_read_sys_vm_max_map_cnt,	/* /proc/sys/vm/max_map_count */
+	lxpr_read_sys_vm_minfr_kb,	/* /proc/sys/vm/min_free_kbytes */
+	lxpr_read_sys_vm_nhpages,	/* /proc/sys/vm/nr_hugepages */
+	lxpr_read_sys_vm_overcommit_mem, /* /proc/sys/vm/overcommit_memory */
+	lxpr_read_sys_vm_swappiness,	/* /proc/sys/vm/swappiness */
+	lxpr_read_uptime,		/* /proc/uptime		*/
+	lxpr_read_version,		/* /proc/version	*/
+	lxpr_read_vmstat,		/* /proc/vmstat		*/
+};
+
+/*
+ * Array of lookup functions, indexed by lx /proc file type.
+ *
+ * Entries must stay in node-type order.  Entry 0 (invalid) is NULL.  Only
+ * directory node types have a real lookup routine; every other type uses
+ * lxpr_lookup_not_a_dir.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+	NULL,				/* invalid		*/
+	lxpr_lookup_procdir,		/* /proc		*/
+	lxpr_lookup_piddir,		/* /proc/<pid>		*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/auxv	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cgroup	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cmdline	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/comm	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cpu	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cwd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/environ	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/exe	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/limits	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/loginuid	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/maps	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/mem	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/mountinfo */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/oom_score_adj */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/personality */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/root	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/stat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/statm	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/status	*/
+	lxpr_lookup_taskdir,		/* /proc/<pid>/task	*/
+	lxpr_lookup_task_tid_dir,	/* /proc/<pid>/task/nn	*/
+	lxpr_lookup_fddir,		/* /proc/<pid>/fd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/fd/nn	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/auxv	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/cgroup */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/cmdline */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/comm	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/cpu	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/cwd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/environ */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/exe	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/limits */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/loginuid */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/maps	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/mem	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/mountinfo */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/oom_scr_adj */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/personality */
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/root	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/stat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/statm	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/status */
+	lxpr_lookup_fddir,		/* /proc/<pid>/task/<tid>/fd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/task/<tid>/fd/nn	*/
+	lxpr_lookup_not_a_dir,		/* /proc/cgroups	*/
+	lxpr_lookup_not_a_dir,		/* /proc/cmdline	*/
+	lxpr_lookup_not_a_dir,		/* /proc/cpuinfo	*/
+	lxpr_lookup_not_a_dir,		/* /proc/devices	*/
+	lxpr_lookup_not_a_dir,		/* /proc/diskstats	*/
+	lxpr_lookup_not_a_dir,		/* /proc/dma		*/
+	lxpr_lookup_not_a_dir,		/* /proc/filesystems	*/
+	lxpr_lookup_not_a_dir,		/* /proc/interrupts	*/
+	lxpr_lookup_not_a_dir,		/* /proc/ioports	*/
+	lxpr_lookup_not_a_dir,		/* /proc/kcore		*/
+	lxpr_lookup_not_a_dir,		/* /proc/kmsg		*/
+	lxpr_lookup_not_a_dir,		/* /proc/loadavg	*/
+	lxpr_lookup_not_a_dir,		/* /proc/meminfo	*/
+	lxpr_lookup_not_a_dir,		/* /proc/modules	*/
+	lxpr_lookup_not_a_dir,		/* /proc/mounts		*/
+	lxpr_lookup_netdir,		/* /proc/net		*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/arp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/dev	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/dev_mcast	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/if_inet6	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/igmp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_cache */
+	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_vif	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/ipv6_route	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/mcfilter	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/netstat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/raw	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/route	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/rpc	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/rt_cache	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/sockstat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/snmp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/stat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/tcp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/tcp6	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/udp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/udp6	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/unix	*/
+	lxpr_lookup_not_a_dir,		/* /proc/partitions	*/
+	lxpr_lookup_not_a_dir,		/* /proc/self		*/
+	lxpr_lookup_not_a_dir,		/* /proc/stat		*/
+	lxpr_lookup_not_a_dir,		/* /proc/swaps		*/
+	lxpr_lookup_sysdir,		/* /proc/sys		*/
+	lxpr_lookup_sys_fsdir,		/* /proc/sys/fs		*/
+	lxpr_lookup_sys_fs_inotifydir,	/* /proc/sys/fs/inotify	*/
+	lxpr_lookup_not_a_dir,		/* .../inotify/max_queued_events */
+	lxpr_lookup_not_a_dir,		/* .../inotify/max_user_instances */
+	lxpr_lookup_not_a_dir,		/* .../inotify/max_user_watches */
+	lxpr_lookup_sys_kerneldir,	/* /proc/sys/kernel	*/
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/cap_last_cap */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/core_pattern */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/hostname */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/msgmni */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/ngroups_max */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/osrelease */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/pid_max */
+	lxpr_lookup_sys_kdir_randdir,	/* /proc/sys/kernel/random */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/random/boot_id */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/sem */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/shmall */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/shmmax */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/shmmni */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/kernel/threads-max */
+	lxpr_lookup_sys_netdir,		/* /proc/sys/net */
+	lxpr_lookup_sys_net_coredir,	/* /proc/sys/net/core */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/net/core/somaxconn */
+	lxpr_lookup_sys_net_ipv4dir,	/* /proc/sys/net/ipv4 */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/ip_local_port_range */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/tcp_fin_timeout */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/tcp_keepalive_intvl */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/tcp_keepalive_time */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/tcp_sack */
+	lxpr_lookup_not_a_dir,		/* .../net/ipv4/tcp_window_scaling */
+	lxpr_lookup_sys_vmdir,		/* /proc/sys/vm */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/vm/max_map_count */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/vm/min_free_kbytes */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/vm/nr_hugepages */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/vm/overcommit_memory */
+	lxpr_lookup_not_a_dir,		/* /proc/sys/vm/swappiness */
+	lxpr_lookup_not_a_dir,		/* /proc/uptime		*/
+	lxpr_lookup_not_a_dir,		/* /proc/version	*/
+	lxpr_lookup_not_a_dir,		/* /proc/vmstat		*/
+};
+
+/*
+ * Array of readdir functions, indexed by lx /proc file type.
+ *
+ * Entries must stay in node-type order.  Entry 0 (invalid) is NULL.  Only
+ * directory node types have a real readdir routine; every other type uses
+ * lxpr_readdir_not_a_dir.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+	NULL,				/* invalid		*/
+	lxpr_readdir_procdir,		/* /proc		*/
+	lxpr_readdir_piddir,		/* /proc/<pid>		*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/auxv	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cgroup	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cmdline	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/comm	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cpu	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cwd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/environ	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/exe	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/limits	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/loginuid	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/maps	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/mem	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/mountinfo */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/oom_score_adj */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/personality */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/root	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/stat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/statm	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/status	*/
+	lxpr_readdir_taskdir,		/* /proc/<pid>/task	*/
+	lxpr_readdir_task_tid_dir,	/* /proc/<pid>/task/nn	*/
+	lxpr_readdir_fddir,		/* /proc/<pid>/fd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/fd/nn	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/auxv	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/cgroup */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/cmdline */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/comm	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/cpu	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/cwd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/environ */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/exe	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/limits */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/loginuid */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/maps	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/mem	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/mountinfo */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/oom_scr_adj */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/personality */
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/root	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/stat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/statm	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/status */
+	lxpr_readdir_fddir,		/* /proc/<pid>/task/<tid>/fd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/task/<tid>/fd/nn	*/
+	lxpr_readdir_not_a_dir,		/* /proc/cgroups	*/
+	lxpr_readdir_not_a_dir,		/* /proc/cmdline	*/
+	lxpr_readdir_not_a_dir,		/* /proc/cpuinfo	*/
+	lxpr_readdir_not_a_dir,		/* /proc/devices	*/
+	lxpr_readdir_not_a_dir,		/* /proc/diskstats	*/
+	lxpr_readdir_not_a_dir,		/* /proc/dma		*/
+	lxpr_readdir_not_a_dir,		/* /proc/filesystems	*/
+	lxpr_readdir_not_a_dir,		/* /proc/interrupts	*/
+	lxpr_readdir_not_a_dir,		/* /proc/ioports	*/
+	lxpr_readdir_not_a_dir,		/* /proc/kcore		*/
+	lxpr_readdir_not_a_dir,		/* /proc/kmsg		*/
+	lxpr_readdir_not_a_dir,		/* /proc/loadavg	*/
+	lxpr_readdir_not_a_dir,		/* /proc/meminfo	*/
+	lxpr_readdir_not_a_dir,		/* /proc/modules	*/
+	lxpr_readdir_not_a_dir,		/* /proc/mounts		*/
+	lxpr_readdir_netdir,		/* /proc/net		*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/arp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev_mcast	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/if_inet6	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/igmp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_cache */
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_vif	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/ipv6_route	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/mcfilter	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/netstat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/raw	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/route	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/rpc	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/rt_cache	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/sockstat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/snmp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/stat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/tcp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/tcp6	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/udp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/udp6	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/unix	*/
+	lxpr_readdir_not_a_dir,		/* /proc/partitions	*/
+	lxpr_readdir_not_a_dir,		/* /proc/self		*/
+	lxpr_readdir_not_a_dir,		/* /proc/stat		*/
+	lxpr_readdir_not_a_dir,		/* /proc/swaps		*/
+	lxpr_readdir_sysdir,		/* /proc/sys		*/
+	lxpr_readdir_sys_fsdir,		/* /proc/sys/fs		*/
+	lxpr_readdir_sys_fs_inotifydir,	/* /proc/sys/fs/inotify	*/
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_queued_events */
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_user_instances */
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_user_watches	*/
+	lxpr_readdir_sys_kerneldir,	/* /proc/sys/kernel	*/
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/cap_last_cap */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/core_pattern */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/hostname */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/msgmni */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/ngroups_max */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/osrelease */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/pid_max */
+	lxpr_readdir_sys_kdir_randdir,	/* /proc/sys/kernel/random */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/random/boot_id */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/sem */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/shmall */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/shmmax */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/shmmni */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/threads-max */
+	lxpr_readdir_sys_netdir,	/* /proc/sys/net */
+	lxpr_readdir_sys_net_coredir,	/* /proc/sys/net/core */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/net/core/somaxconn */
+	lxpr_readdir_sys_net_ipv4dir,	/* /proc/sys/net/ipv4 */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/ip_local_port_range */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/tcp_fin_timeout */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/tcp_keepalive_intvl */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/tcp_keepalive_time */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/tcp_sack */
+	lxpr_readdir_not_a_dir,		/* .../net/ipv4/tcp_window_scaling */
+	lxpr_readdir_sys_vmdir,		/* /proc/sys/vm */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/vm/max_map_count */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/vm/min_free_kbytes */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/vm/nr_hugepages */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/vm/overcommit_memory */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/vm/swappiness */
+	lxpr_readdir_not_a_dir,		/* /proc/uptime		*/
+	lxpr_readdir_not_a_dir,		/* /proc/version	*/
+	lxpr_readdir_not_a_dir,		/* /proc/vmstat		*/
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * All lx procfs files are human-readable text rather than binary structures,
+ * so one read path serves both 32-bit and 64-bit readers.  LXPR_KMSG needs an
+ * LDI handle on /dev/log; every other node type dispatches through
+ * lxpr_read_function[].  All exits release the uiobuf and any open handle.
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+	int error;
+
+	ASSERT(type < LXPR_NFILES);
+
+	if (type == LXPR_KMSG) {
+		ldi_ident_t	li = VTOLXPM(vp)->lxprm_li;
+		ldi_handle_t	ldih;
+		struct strioctl	str;
+		int		rv;
+
+		/*
+		 * Open the zone's console device using the layered driver
+		 * interface.
+		 */
+		if ((error =
+		    ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0)
+			goto out;
+
+		/*
+		 * Send an ioctl to the underlying console device, letting it
+		 * know we're interested in getting console messages.
+		 */
+		str.ic_cmd = I_CONSLOG;
+		str.ic_timout = 0;
+		str.ic_len = 0;
+		str.ic_dp = NULL;
+		if ((error = ldi_ioctl(ldih, I_STR,
+		    (intptr_t)&str, FKIOCTL, cr, &rv)) == 0)
+			lxpr_read_kmsg(lxpnp, uiobuf, ldih);
+		/* Always close the handle; report the first failure seen. */
+		rv = ldi_close(ldih, FREAD, cr);
+		if (error == 0)
+			error = rv;
+		if (error != 0)
+			goto out;
+	} else {
+		lxpr_read_function[type](lxpnp, uiobuf);
+	}
+
+	error = lxpr_uiobuf_flush(uiobuf);
+out:
+	lxpr_uiobuf_free(uiobuf);
+	return (error);
+}
+
+/*
+ * lxpr_read_isdir(), lxpr_read_invalid(), lxpr_read_empty()
+ *
+ * Handlers for special-case reads:
+ * - lxpr_read_isdir(): trying to read a directory; sets EISDIR
+ * - lxpr_read_invalid(): invalid file (used to mean a file that should be
+ *   implemented, but isn't yet); sets EINVAL
+ * - lxpr_read_empty(): empty file; writes nothing and sets no error
+ * NOTE(review): no handler in this group implements a blocking "wait" read.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_auxv(): emit the aux vector as filtered by lx_auxv_stol()
+ */
+static void
+lxpr_read_pid_auxv(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	lx_proc_data_t *pd;
+	lx_elf_data_t *edp = NULL;
+	int i, cnt;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_AUXV ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_AUXV);
+
+	p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB);
+	/* A failed lookup produces an empty read; no error is recorded. */
+	if (p == NULL) {
+		return;
+	}
+	if ((pd = ptolxproc(p)) == NULL) {
+		/* Emit a single AT_NULL record for non-branded processes */
+		auxv_t buf;
+
+		bzero(&buf, sizeof (buf));
+		lxpr_unlock(p);
+		lxpr_uiobuf_write(uiobuf, (char *)&buf, sizeof (buf));
+		return;
+	} else {
+		edp = &pd->l_elf_data;
+	}
+	/* Keep entries lx_auxv_stol() accepts, stopping after AT_NULL. */
+	if (p->p_model == DATAMODEL_NATIVE) {
+		auxv_t buf[__KERN_NAUXV_IMPL];
+
+		/*
+		 * Because a_type is only of size int (not long), the buffer
+		 * contents must be zeroed first to ensure cleanliness.
+		 */
+		bzero(buf, sizeof (buf));
+		for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+			if (lx_auxv_stol(&p->p_user.u_auxv[i],
+			    &buf[cnt], edp) == 0) {
+				cnt++;
+			}
+			if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+				break;
+			}
+		}
+		lxpr_unlock(p);
+		lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else {
+		auxv32_t buf[__KERN_NAUXV_IMPL];
+
+		for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+			auxv_t temp;
+			/* Convert, then narrow into the 32-bit record. */
+			if (lx_auxv_stol(&p->p_user.u_auxv[i],
+			    &temp, edp) == 0) {
+				buf[cnt].a_type = (int)temp.a_type;
+				buf[cnt].a_un.a_val = (int)temp.a_un.a_val;
+				cnt++;
+			}
+			if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+				break;
+			}
+		}
+		lxpr_unlock(p);
+		lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+	}
+#endif /* defined(_SYSCALL32_IMPL) */
+}
+
+/*
+ * lxpr_read_pid_cgroup(): report a fixed cgroup membership for a process
+ */
+static void
+lxpr_read_pid_cgroup(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *proc;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_CGROUP ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_CGROUP);
+
+	/* The lookup only confirms that the process (or zombie) exists. */
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) != NULL) {
+		lxpr_unlock(proc);
+		/* basic stub, 3rd field will need to be populated */
+		lxpr_uiobuf_printf(uiobuf, "1:name=systemd:/\n");
+		return;
+	}
+
+	lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+static void
+lxpr_copy_cmdline(proc_t *p, lx_proc_data_t *pd, lxpr_uiobuf_t *uiobuf)
+{
+	uio_t *uiop = uiobuf->uiop;
+	char *buf = uiobuf->buffer;
+	int bsz = uiobuf->buffsize;
+	boolean_t env_overflow = B_FALSE;
+	uintptr_t pos = pd->l_args_start + uiop->uio_offset;
+	uintptr_t estart = pd->l_envs_start;
+	uintptr_t eend = pd->l_envs_end;
+	size_t chunk, copied;
+	int err = 0;
+	/* Copy argv strings (and any env overflow) directly to the uio. */
+	/* Do not bother with data beyond the end of the envp strings area. */
+	if (pos > eend) {
+		return;
+	}
+	mutex_exit(&p->p_lock);	/* dropped for the copies; retaken on exit */
+
+	/*
+	 * If the starting or ending bounds are outside the argv strings area,
+	 * check to see if the process has overwritten the terminating NUL.
+	 * If not, no data needs to be copied from outside the argv area.
+	 */
+	if (pos >= estart || (pos + uiop->uio_resid) >= estart) {
+		uint8_t term;
+		if (uread(p, &term, sizeof (term), estart - 1) != 0) {
+			err = EFAULT;
+		} else if (term != 0) {
+			env_overflow = B_TRUE;
+		}
+	}
+
+	/* Data between l_args_start and estart-1 can be copied freely. */
+	while (pos < estart && uiop->uio_resid > 0 && err == 0) {
+		chunk = MIN(estart - pos, uiop->uio_resid);
+		chunk = MIN(chunk, bsz);
+
+		if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) != 0 ||
+		    copied != chunk) {
+			err = EFAULT;
+			break;
+		}
+		err = uiomove(buf, copied, UIO_READ, uiop);
+		pos += copied;
+	}
+
+	/*
+	 * Onward from estart, data is copied as a contiguous string.  To
+	 * protect env data from potential snooping, only one buffer-sized copy
+	 * is allowed to avoid complex seek logic.
+	 */
+	if (err == 0 && env_overflow && pos == estart && uiop->uio_resid > 0) {
+		chunk = MIN(eend - pos, uiop->uio_resid);
+		chunk = MIN(chunk, bsz);
+		if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) == 0) {
+			int len = strnlen(buf, copied);
+			if (len > 0) {
+				err = uiomove(buf, len, UIO_READ, uiop);
+			}
+		}
+	}
+
+	uiobuf->error = err;
+	/* reset any uiobuf state */
+	uiobuf->pos = uiobuf->buffer;
+	uiobuf->beg = 0;
+
+	mutex_enter(&p->p_lock);
+}
+
+/*
+ * lxpr_read_pid_cmdline(): emit the argument strings of a process
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	const size_t argsz = lxpr_maxargvlen;
+	lx_proc_data_t *lxpd;
+	proc_t *proc;
+	char *argbuf;
+	size_t len;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE);
+
+	argbuf = kmem_alloc(argsz, KM_SLEEP);
+
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB)) == NULL) {
+		kmem_free(argbuf, argsz);
+		return;
+	}
+
+	lxpd = ptolxproc(proc);
+	if (lxpd == NULL || lxpd->l_args_start == 0 ||
+	    lxpd->l_envs_start == 0 || lxpd->l_envs_end == 0) {
+		/* No Linux-style bounds: fall back to prreadargv(). */
+		int rc = prreadargv(proc, argbuf, argsz, &len);
+
+		lxpr_unlock(proc);
+		if (rc == 0) {
+			lxpr_uiobuf_write(uiobuf, argbuf, len);
+		} else {
+			lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		}
+	} else {
+		/* Use Linux-style argv bounds if possible. */
+		lxpr_copy_cmdline(proc, lxpd, uiobuf);
+		lxpr_unlock(proc);
+	}
+	kmem_free(argbuf, argsz);
+}
+
+/*
+ * lxpr_read_pid_comm(): emit the command name of a process
+ */
+static void
+lxpr_read_pid_comm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	char comm[MAXCOMLEN + 1];
+	proc_t *proc;
+
+	VERIFY(lxpnp->lxpr_type == LXPR_PID_COMM ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_COMM);
+
+	/*
+	 * Because prctl(PR_SET_NAME) does not set custom names for threads
+	 * (vs processes), there is no need for special handling here.
+	 */
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) != NULL) {
+		strlcpy(comm, proc->p_user.u_comm, sizeof (comm));
+		lxpr_unlock(proc);
+		lxpr_uiobuf_printf(uiobuf, "%s\n", comm);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+	}
+}
+
+/*
+ * lxpr_read_pid_env(): emit the environment strings of a process
+ *
+ * A process that cannot be locked yields an empty read; a failure from
+ * prreadenvv() is reported to the reader as EINVAL.
+ */
+static void
+lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	const size_t envsz = lxpr_maxenvvlen;
+	proc_t *proc;
+	char *envbuf;
+	size_t len;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV);
+
+	/* Allocate before taking the process lock. */
+	envbuf = kmem_alloc(envsz, KM_SLEEP);
+
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB)) != NULL) {
+		int rc = prreadenvv(proc, envbuf, envsz, &len);
+
+		lxpr_unlock(proc);
+		if (rc == 0) {
+			lxpr_uiobuf_write(uiobuf, envbuf, len);
+		} else {
+			lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		}
+	}
+	kmem_free(envbuf, envsz);
+}
+
+/*
+ * lxpr_read_pid_limits(): ulimit file, built from the process rctls
+ */
+static void
+lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	rctl_qty_t cur[LX_RLIM_TAB_LEN], max[LX_RLIM_TAB_LEN];
+	int i;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_LIMITS);
+
+	p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB);
+	if (p == NULL) {
+		return;
+	}
+	/* Gather every limit while the process is locked; print afterwards. */
+	for (i = 0; i < LX_RLIM_TAB_LEN; i++) {
+		char *kname = lxpr_rlimtab[i].rlim_rctl;
+		rctl_val_t nval, *oval = NULL;
+		rctl_hndl_t hndl;
+
+		/* default to unlimited for resources without an analog */
+		cur[i] = RLIM_INFINITY;
+		max[i] = RLIM_INFINITY;
+		if (kname == NULL || (hndl = rctl_hndl_lookup(kname)) == -1) {
+			continue;
+		}
+		while (rctl_local_get(hndl, oval, &nval, p) == 0) {
+			oval = &nval;
+			switch (nval.rcv_privilege) {
+			case RCPRIV_BASIC:	/* shown as the soft limit */
+				if (!RCTL_INFINITE(nval))
+					cur[i] = nval.rcv_value;
+				break;
+			case RCPRIV_PRIVILEGED:	/* shown as the hard limit */
+				if (!RCTL_INFINITE(nval))
+					max[i] = nval.rcv_value;
+				break;
+			}
+		}
+	}
+	lxpr_unlock(p);
+
+	lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n",
+	    "Limit", "Soft Limit", "Hard Limit", "Units");
+	for (i = 0; i < LX_RLIM_TAB_LEN; i++) {
+		lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name);
+		if (cur[i] == RLIM_INFINITY || cur[i] == LX_RLIM_INFINITY) {
+			lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited");
+		} else {
+			lxpr_uiobuf_printf(uiobuf, " %-20lu", cur[i]);
+		}
+		if (max[i] == RLIM_INFINITY || max[i] == LX_RLIM_INFINITY) {
+			lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited");
+		} else {
+			lxpr_uiobuf_printf(uiobuf, " %-20lu", max[i]);
+		}
+		lxpr_uiobuf_printf(uiobuf, " %-10s\n",
+		    lxpr_rlimtab[i].rlim_unit);
+	}
+}
+
+/*
+ * lxpr_read_pid_loginuid(): loginuid file, printed unsigned as Linux does
+ */
+static void
+lxpr_read_pid_loginuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	lx_proc_data_t *pd;
+	uid_t lu = 0;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_LOGINUID);
+
+	p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	if ((pd = ptolxproc(p)) != NULL) {
+		lu = pd->l_loginuid;
+	}
+	lxpr_unlock(p);
+	/* uid_t is unsigned; "%d" would show an unset loginuid as -1. */
+	lxpr_uiobuf_printf(uiobuf, "%u", lu);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ *
+ * Segments are snapshotted while the address space is locked and formatted
+ * after lxpr_unlock(), so the proc_t must not be touched while printing.
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	lx_proc_data_t *lxpd;
+	struct as *as;
+	struct seg *seg;
+	char *buf;
+	int buflen = MAXPATHLEN;
+	model_t model;
+	struct print_data {
+		uintptr_t saddr;
+		uintptr_t eaddr;
+		int type;
+		char prot[5];
+		uintptr_t offset;
+		vnode_t *vp;
+		char *name_override;
+		struct print_data *next;
+	} *print_head = NULL;
+	struct print_data **print_tail = &print_head, *pbuf, *pnext;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_MAPS);
+
+	p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB);
+	if (p == NULL)
+		return;
+
+	as = p->p_as;
+	lxpd = ptolxproc(p);
+	model = p->p_model;
+
+	if (as == &kas) {
+		lxpr_unlock(p);
+		return;
+	}
+
+	mutex_exit(&p->p_lock);
+
+	/* Iterate over all segments in the address space */
+	AS_LOCK_ENTER(as, RW_READER);
+	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+		vnode_t *vp;
+		uint_t protbits;
+
+		pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+		pbuf->saddr = (uintptr_t)seg->s_base;
+		pbuf->eaddr = pbuf->saddr + seg->s_size;
+		pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+		/*
+		 * Cheat and only use the protection bits of the first page
+		 * in the segment
+		 */
+		(void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+		(void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+		if (protbits & PROT_READ)	   pbuf->prot[0] = 'r';
+		if (protbits & PROT_WRITE)	   pbuf->prot[1] = 'w';
+		if (protbits & PROT_EXEC)	   pbuf->prot[2] = 'x';
+		if (pbuf->type & MAP_SHARED)	   pbuf->prot[3] = 's';
+		else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+		if (seg->s_ops == &segvn_ops &&
+		    SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+		    vp != NULL && vp->v_type == VREG) {
+			VN_HOLD(vp);
+			pbuf->vp = vp;
+		} else {
+			pbuf->vp = NULL;
+		}
+
+		pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr);
+
+		pbuf->name_override = NULL;
+		if (lxpd != NULL) {
+			if (pbuf->saddr == lxpd->l_vdso) {
+				pbuf->name_override = "[vdso]";
+			} else if (pbuf->saddr == p->p_user.u_commpagep) {
+				pbuf->name_override = "[vvar]";
+			}
+		}
+
+		pbuf->next = NULL;
+		*print_tail = pbuf;
+		print_tail = &pbuf->next;
+	}
+	AS_LOCK_EXIT(as);
+	mutex_enter(&p->p_lock);
+	lxpr_unlock(p);
+
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
+	/* print the data we've extracted */
+	for (pbuf = print_head; pbuf != NULL; pbuf = pnext) {
+		vattr_t vattr;
+		int maj = 0, min = 0;
+		ino_t inode = 0;
+
+		*buf = '\0';
+		if (pbuf->name_override != NULL) {
+			(void) strlcpy(buf, pbuf->name_override, buflen);
+		} else if (pbuf->vp != NULL) {
+			vattr.va_mask = AT_FSID | AT_NODEID;
+			if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+			    NULL) == 0) {
+				maj = getmajor(vattr.va_fsid);
+				min = getminor(vattr.va_fsid);
+				inode = vattr.va_nodeid;
+			}
+			(void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+		}
+		/* Release the walk's hold, even if the name was overridden. */
+		if (pbuf->vp != NULL)
+			VN_RELE(pbuf->vp);
+
+		if (model == DATAMODEL_LP64) {
+			lxpr_uiobuf_printf(uiobuf,
+			    "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n",
+			    pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+			    maj, min, inode, *buf != '\0' ? " " : "", buf);
+		} else {
+			lxpr_uiobuf_printf(uiobuf,
+			    "%08x-%08x %s %08x %02x:%02x %llu%s%s\n",
+			    (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr,
+			    pbuf->prot, (uint32_t)pbuf->offset, maj, min,
+			    inode, *buf != '\0' ? " " : "", buf);
+		}
+
+		pnext = pbuf->next;
+		kmem_free(pbuf, sizeof (*pbuf));
+	}
+
+	kmem_free(buf, buflen);
+}
+
+/*
+ * Make a mount entry resemble Linux output; non-zero return means skip it.
+ */
+static int
+lxpr_clean_mntent(char **mntpt, char **fstype, char **resource)
+{
+	const char *mp = *mntpt, *fs = *fstype;
+
+	/* Entries that have no Linux counterpart are hidden. */
+	if (strcmp(mp, "/var/ld") == 0 || strcmp(fs, "objfs") == 0 ||
+	    strcmp(fs, "mntfs") == 0 || strcmp(fs, "ctfs") == 0 ||
+	    strncmp(mp, "/native/", 8) == 0)
+		return (1);
+
+	/* Substitute the Linux names for brand-specific filesystems. */
+	if (strcmp(fs, "tmpfs") == 0)
+		*resource = "tmpfs";
+	else if (strcmp(fs, "lx_proc") == 0)
+		*resource = *fstype = "proc";
+	else if (strcmp(fs, "lx_sysfs") == 0)
+		*resource = *fstype = "sysfs";
+	else if (strcmp(fs, "lx_devfs") == 0)
+		*resource = *fstype = "devtmpfs";
+	else if (strcmp(fs, "lx_cgroup") == 0)
+		*resource = *fstype = "cgroup";
+	else if (strcmp(fs, "lxautofs") == 0)
+		*fstype = "autofs";
+
+	return (0);
+}
+
+
+typedef struct lxpr_mount_entry {
+	list_node_t	lme_link;	/* linkage on the enumeration list */
+	uint_t		lme_id;		/* mount ID reported to the reader */
+	uint_t		lme_parent_id;	/* parent mount ID (root ID, or 0) */
+	refstr_t	*lme_mntpt;	/* held mount point string */
+	refstr_t	*lme_resource;	/* held resource string */
+	uint_t		lme_flag;	/* VFS_* flags; VFS_RDONLY is used */
+	int		lme_fstype;	/* index into vfssw[] */
+	dev_t		lme_dev;	/* device number to report */
+	boolean_t	lme_force;	/* exempt from the skip checks */
+} lxpr_mount_entry_t;
+
+static int lxpr_zfs_fstype = -1;	/* vfssw[] index of "zfs"; -1 if unset */
+
+#define	LXPR_ROOT_MOUNT_ID	15	/* arbitrary ID for the root mount */
+
+/*
+ * Build a list of lxpr_mount_entry_t describing the mounts visible in 'zone':
+ * the zone root first, then each zone mount not flagged VFS_NOMNTTAB, then a
+ * dummy read-only entry for /native/usr.  The caller owns the returned list
+ * and must release each entry's two refstr holds before freeing it.
+ */
+static list_t *
+lxpr_enumerate_mounts(zone_t *zone)
+{
+	vfs_t *vfsp, *rvfsp, *vfslist;
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	list_t *result;
+	lxpr_mount_entry_t *lme;
+	lx_virt_disk_t *vd;
+	uint_t root_id = LXPR_ROOT_MOUNT_ID, mount_id = root_id + 1;
+	char tmppath[MAXPATHLEN];
+
+	result = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(result, sizeof (lxpr_mount_entry_t),
+	    offsetof(lxpr_mount_entry_t, lme_link));
+
+	ASSERT(zone != global_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	vfs_list_read_lock();
+	vfsp = vfslist = zone->zone_vfslist;
+
+	/*
+	 * If the zone has a root entry, it will be the first in the list.
+	 * Conjure one up if needed.
+	 */
+	if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+	    zone->zone_rootpath) != 0) {
+		rvfsp = zone->zone_rootvp->v_vfsp;
+	} else {
+		rvfsp = vfslist;
+		vfsp = vfslist->vfs_zone_next;
+		if (vfsp == vfslist)	/* lone root entry: nothing to walk */
+			vfsp = NULL;
+	}
+
+	lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+	lme->lme_id = root_id;
+	lme->lme_parent_id = 0;
+	lme->lme_mntpt = refstr_alloc(zone->zone_rootpath);
+	lme->lme_flag = rvfsp->vfs_flag;
+	lme->lme_fstype = rvfsp->vfs_fstype;
+	lme->lme_force = B_TRUE;
+
+	lme->lme_resource = NULL;
+	for (vd = list_head(lxzd->lxzd_vdisks); vd != NULL;
+	    vd = list_next(lxzd->lxzd_vdisks, vd)) {
+		if (vd->lxvd_type == LXVD_ZFS_DS &&
+		    vd->lxvd_real_dev == rvfsp->vfs_dev) {
+			(void) snprintf(tmppath, sizeof (tmppath),
+			    "%sdev/%s", zone->zone_rootpath, vd->lxvd_name);
+			lme->lme_resource = refstr_alloc(tmppath);
+			lme->lme_dev = vd->lxvd_emul_dev;
+			break;
+		}
+	}
+	if (lme->lme_resource == NULL) {
+		lme->lme_resource = refstr_alloc(zone->zone_rootpath);
+		lme->lme_dev = rvfsp->vfs_dev;
+	}
+	list_insert_head(result, lme);
+
+	do {
+		if (vfsp == NULL)
+			break;
+		/* Skip mounts we shouldn't show */
+		if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) {
+			vfsp = vfsp->vfs_zone_next;
+			continue;
+		}
+
+		lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+		lme->lme_id = mount_id++;
+		lme->lme_parent_id = root_id;
+		lme->lme_mntpt = vfsp->vfs_mntpt;
+		refstr_hold(vfsp->vfs_mntpt);
+		lme->lme_flag = vfsp->vfs_flag;
+		lme->lme_fstype = vfsp->vfs_fstype;
+		lme->lme_force = B_FALSE;
+		/* Reuse tmppath rather than a second MAXPATHLEN stack buffer. */
+		lme->lme_resource = NULL;
+		for (vd = list_head(lxzd->lxzd_vdisks); vd != NULL;
+		    vd = list_next(lxzd->lxzd_vdisks, vd)) {
+			if (vd->lxvd_type == LXVD_ZFS_DS &&
+			    vd->lxvd_real_dev == vfsp->vfs_dev) {
+				(void) snprintf(tmppath, sizeof (tmppath),
+				    "%sdev/%s",
+				    zone->zone_rootpath, vd->lxvd_name);
+				lme->lme_resource = refstr_alloc(tmppath);
+				lme->lme_dev = vd->lxvd_emul_dev;
+				break;
+			}
+		}
+		if (lme->lme_resource == NULL) {
+			lme->lme_resource = vfsp->vfs_resource;
+			refstr_hold(vfsp->vfs_resource);
+			lme->lme_dev = vfsp->vfs_dev;
+		}
+		list_insert_tail(result, lme);
+		vfsp = vfsp->vfs_zone_next;
+	} while (vfsp != vfslist);
+
+	vfs_list_unlock();
+
+	/* Add a single dummy entry for /native/usr */
+	lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP);
+	lme->lme_id = mount_id++;
+	lme->lme_parent_id = root_id;
+	lme->lme_flag = VFS_RDONLY;
+	lme->lme_dev = makedevice(0, 1);
+	(void) snprintf(tmppath, sizeof (tmppath),
+	    "%snative/usr", zone->zone_rootpath);
+	lme->lme_mntpt = refstr_alloc(tmppath);
+	lme->lme_resource = lme->lme_mntpt;
+	refstr_hold(lme->lme_mntpt);
+	if (lxpr_zfs_fstype == -1) {
+		vfssw_t *zfssw = vfs_getvfssw("zfs");
+		VERIFY(zfssw != NULL);
+		lxpr_zfs_fstype = ((uintptr_t)zfssw - (uintptr_t)vfssw) /
+		    sizeof (vfssw[0]);
+		VERIFY(&vfssw[lxpr_zfs_fstype] == zfssw);
+	}
+	lme->lme_fstype = lxpr_zfs_fstype;
+	lme->lme_force = B_TRUE;
+	list_insert_tail(result, lme);
+
+	return (result);
+}
+
+/*
+ * lxpr_read_pid_mountinfo(): one line per mount from lxpr_enumerate_mounts()
+ */
+static void
+lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	list_t *mounts;
+	lxpr_mount_entry_t *lme;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO);
+
+	mounts = lxpr_enumerate_mounts(zone);
+
+	/*
+	 * now we can run through what we've extracted without holding
+	 * vfs_list_read_lock()
+	 */
+	lme = (lxpr_mount_entry_t *)list_remove_head(mounts);
+	while (lme != NULL) {
+		char *resource, *mntpt, *fstype, *rwflag;
+		vnode_t *vp;
+		int error;
+
+		mntpt = (char *)refstr_value(lme->lme_mntpt);
+		resource = (char *)refstr_value(lme->lme_resource);
+		/* Entries without a usable mount point are skipped. */
+		if (mntpt == NULL || mntpt[0] == '\0') {
+			goto nextp;
+		}
+		mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+		error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+		if (error != 0) {
+			goto nextp;
+		} else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) {
+			VN_RELE(vp);
+			goto nextp;
+		}
+		VN_RELE(vp);
+
+		if (resource != NULL && resource[0] != '\0') {
+			if (resource[0] == '/') {
+				resource = ZONE_PATH_VISIBLE(resource, zone) ?
+				    ZONE_PATH_TRANSLATE(resource, zone) : mntpt;
+			}
+		} else {
+			resource = "none";
+		}
+
+		/* Make things look more like Linux. */
+		fstype = vfssw[lme->lme_fstype].vsw_name;
+		if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 &&
+		    !lme->lme_force) {
+			goto nextp;
+		}
+		rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro";
+
+		/*
+		 * XXX parent ID is not tracked correctly here. Currently we
+		 * always assume the parent ID is the root ID.
+		 */
+		lxpr_uiobuf_printf(uiobuf,
+		    "%d %d %d:%d / %s %s - %s %s %s\n",
+		    lme->lme_id, lme->lme_parent_id,
+		    getmajor(lme->lme_dev), getminor(lme->lme_dev),
+		    mntpt, rwflag, fstype, resource, rwflag);
+		/* Printed or skipped, each entry drops its holds below. */
+nextp:
+		refstr_rele(lme->lme_mntpt);
+		refstr_rele(lme->lme_resource);
+		kmem_free(lme, sizeof (lxpr_mount_entry_t));
+		lme = (lxpr_mount_entry_t *)list_remove_head(mounts);
+	}
+
+	list_destroy(mounts);
+	kmem_free(mounts, sizeof (list_t));
+}
+
+/*
+ * lxpr_read_pid_oom_scr_adj(): report the oom_score_adj of a process
+ */
+static void
+lxpr_read_pid_oom_scr_adj(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *proc;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_OOM_SCR_ADJ ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_OOM_SCR_ADJ);
+
+	/* The lookup only checks that the process still exists. */
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) != NULL) {
+		lxpr_unlock(proc);
+		/* always 0 */
+		lxpr_uiobuf_printf(uiobuf, "0\n");
+		return;
+	}
+
+	lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/*
+ * lxpr_read_pid_personality(): report the personality of a process
+ */
+static void
+lxpr_read_pid_personality(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_proc_data_t *pd;
+	unsigned int pers;
+	proc_t *proc;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_PERSONALITY);
+
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	/*
+	 * Report native processes as having the SunOS personality; branded
+	 * ones carry their own value.
+	 */
+	pd = ptolxproc(proc);
+	pers = (pd != NULL) ? pd->l_personality : LX_PER_SUNOS;
+	lxpr_unlock(proc);
+
+	lxpr_uiobuf_printf(uiobuf, "%08x\n", pers);
+}
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	size_t vpages = 0, rpages = 0;
+	struct as *asp;
+	proc_t *proc;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_STATM);
+
+	if ((proc = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	/*
+	 * p_lock is dropped while the address space is examined and retaken
+	 * before the process is released.  Processes on kas report zeroes.
+	 */
+	asp = proc->p_as;
+	mutex_exit(&proc->p_lock);
+	if (asp != &kas) {
+		AS_LOCK_ENTER(asp, RW_READER);
+		vpages = btopr(asp->a_resvsize);
+		rpages = rm_asrss(asp);
+		AS_LOCK_EXIT(asp);
+	}
+	mutex_enter(&proc->p_lock);
+	lxpr_unlock(proc);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu %lu %lu %lu %lu %lu %lu\n",
+	    vpages, rpages, 0l, rpages, 0l, 0l, 0l);
+}
+
+/*
+ * Look for either the main thread (lookup_id is 0) or the specified thread.
+ * A thread found by the walk is returned with thread_lock() held.  If the
+ * main thread is wanted but absent, fall back to prchoose() for any thread.
+ */
+static kthread_t *
+lxpr_get_thread(proc_t *p, uint_t lookup_id)
+{
+	kthread_t *t;
+	uint_t emul_tid;
+	lx_lwp_data_t *lwpd;
+	pid_t pid = p->p_pid;
+	pid_t init_pid = curproc->p_zone->zone_proc_initpid;
+	boolean_t branded = (p->p_brand == &lx_brand);
+
+	/* get specified thread */
+	if ((t = p->p_tlist) == NULL)
+		return (NULL);	/* the process has no threads */
+
+	do {
+		if (lookup_id == 0 && t->t_tid == 1) {
+			thread_lock(t);
+			return (t);
+		}
+
+		lwpd = ttolxlwp(t);
+		if (branded && lwpd != NULL) {
+			if (pid == init_pid && lookup_id == 1) {
+				emul_tid = t->t_tid;
+			} else {
+				emul_tid = lwpd->br_pid;
+			}
+		} else {
+			/*
+			 * Make only the first (assumed to be main) thread
+			 * visible for non-branded processes.
+			 */
+			emul_tid = p->p_pid;
+		}
+		if (emul_tid == lookup_id) {
+			thread_lock(t);
+			return (t);
+		}
+	} while ((t = t->t_forw) != p->p_tlist);
+
+	if (lookup_id == 0)
+		return (prchoose(p));
+	return (NULL);	/* no thread matches lookup_id */
+}
+
+/*
+ * Lookup the real pid for procs 0 or 1.
+ *
+ * Linux pid 1 maps to the zone's init process and pid 0 to its zsched; any
+ * other pid is returned unchanged.
+ */
+static pid_t
+get_real_pid(pid_t p)
+{
+	switch (p) {
+	case 1:
+		return (curproc->p_zone->zone_proc_initpid);
+	case 0:
+		return (curproc->p_zone->zone_zsched->p_pid);
+	default:
+		return (p);
+	}
+}
+
+/*
+ * pid/tid common code to read status file; lookup_id 0 selects main thread
+ */
+static void
+lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf,
+    uint_t lookup_id)
+{
+	proc_t		*p;
+	kthread_t	*t;
+	user_t		*up;
+	cred_t		*cr;
+	const gid_t	*groups;
+	struct as	*as;
+	char		*status;
+	pid_t		pid, ppid;
+	k_sigset_t	current, ignore, handle;
+	int		i, lx_sig, lwpcnt, ngroups;
+	pid_t		real_pid;
+	char		buf_comm[MAXCOMLEN + 1];
+	rlim64_t	fdlim;
+	size_t		vsize = 0, nlocked = 0, rss = 0, stksize = 0;
+	boolean_t	printsz = B_FALSE;
+
+	real_pid = get_real_pid(lxpnp->lxpr_pid);
+	p = lxpr_lock(real_pid, ZOMB_OK);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	pid = p->p_pid;
+
+	/*
+	 * Convert pid to the Linux default of 1 if we're the zone's init
+	 * process or if we're the zone's zsched the pid is 0.
+	 */
+	if (pid == curproc->p_zone->zone_proc_initpid) {
+		pid = 1;
+		ppid = 0;	/* parent pid for init is 0 */
+	} else if (pid == curproc->p_zone->zone_zsched->p_pid) {
+		pid = 0;	/* zsched is pid 0 */
+		ppid = 0;	/* parent pid for zsched is itself */
+	} else {
+		/*
+		 * Make sure not to reference parent PIDs that reside outside
+		 * the zone
+		 */
+		ppid = ((p->p_flag & SZONETOP)
+		    ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+		/*
+		 * Convert ppid to the Linux default of 1 if our parent is the
+		 * zone's init process
+		 */
+		if (ppid == curproc->p_zone->zone_proc_initpid)
+			ppid = 1;
+	}
+
+	t = lxpr_get_thread(p, lookup_id);
+	if (t != NULL) {
+		switch (t->t_state) {
+		case TS_SLEEP:
+			status = "S (sleeping)";
+			break;
+		case TS_RUN:
+		case TS_ONPROC:
+			status = "R (running)";
+			break;
+		case TS_ZOMB:
+			status = "Z (zombie)";
+			break;
+		case TS_STOPPED:
+			status = "T (stopped)";
+			break;
+		default:
+			status = "! (unknown)";
+			break;
+		}
+		thread_unlock(t);
+	} else {
+		if (lookup_id != 0) {
+			/* we can't find this specific thread */
+			lxpr_uiobuf_seterr(uiobuf, EINVAL);
+			lxpr_unlock(p);
+			return;
+		}
+
+		/*
+		 * there is a hole in the exit code, where a proc can have
+		 * no threads but it is yet to be flagged SZOMB. We will
+		 * assume we are about to become a zombie
+		 */
+		status = "Z (zombie)";
+	}
+
+	up = PTOU(p);
+	mutex_enter(&p->p_crlock);
+	crhold(cr = p->p_cred);
+	mutex_exit(&p->p_crlock);
+
+	strlcpy(buf_comm, up->u_comm, sizeof (buf_comm));
+	fdlim = p->p_fno_ctl;
+	lwpcnt = p->p_lwpcnt;
+
+	/*
+	 * Gather memory information
+	 */
+	as = p->p_as;
+	if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) &&
+	    (as != &kas)) {
+		mutex_exit(&p->p_lock);
+		AS_LOCK_ENTER(as, RW_READER);
+		vsize = as->a_resvsize;
+		rss = rm_asrss(as);
+		AS_LOCK_EXIT(as);
+		mutex_enter(&p->p_lock);
+
+		nlocked = p->p_locked_mem;
+		stksize = p->p_stksize;
+		printsz = B_TRUE;
+	}
+
+	/*
+	 * Gather signal information
+	 */
+	sigemptyset(&current);
+	sigemptyset(&ignore);
+	sigemptyset(&handle);
+	for (i = 1; i < NSIG; i++) {
+		lx_sig = stol_signo[i];
+
+		if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) {
+			if (sigismember(&p->p_sig, i))
+				sigaddset(&current, lx_sig);
+
+			if (up->u_signal[i - 1] == SIG_IGN)
+				sigaddset(&ignore, lx_sig);
+			else if (up->u_signal[i - 1] != SIG_DFL)
+				sigaddset(&handle, lx_sig);
+		}
+	}
+	lxpr_unlock(p);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "Name:\t%s\n"
+	    "State:\t%s\n"
+	    "Tgid:\t%d\n"
+	    "Pid:\t%d\n"
+	    "PPid:\t%d\n"
+	    "TracerPid:\t%d\n"
+	    "Uid:\t%u\t%u\t%u\t%u\n"
+	    "Gid:\t%u\t%u\t%u\t%u\n"
+	    "FDSize:\t%d\n"
+	    "Groups:\t",
+	    buf_comm,
+	    status,
+	    pid, /* thread group id - same as pid */
+	    (lookup_id == 0) ? pid : lxpnp->lxpr_desc,
+	    ppid,
+	    0,	/* TracerPid is always reported as 0 */
+	    crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+	    crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+	    fdlim);	/* NOTE(review): rlim64_t passed for "%d" -- confirm */
+	ngroups = crgetngroups(cr);
+	groups  = crgetgroups(cr);
+	for (i = 0; i < ngroups; i++) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "%u ",
+		    groups[i]);
+	}
+	crfree(cr);
+	if (printsz) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "\n"
+		    "VmSize:\t%8lu kB\n"
+		    "VmLck:\t%8lu kB\n"
+		    "VmRSS:\t%8lu kB\n"
+		    "VmData:\t%8lu kB\n"
+		    "VmStk:\t%8lu kB\n"
+		    "VmExe:\t%8lu kB\n"
+		    "VmLib:\t%8lu kB",
+		    btok(vsize),
+		    btok(nlocked),
+		    ptok(rss),
+		    0l,	/* VmData is always reported as 0 */
+		    btok(stksize),
+		    ptok(rss),	/* NOTE(review): VmExe reuses RSS -- confirm */
+		    0l);	/* VmLib is always reported as 0 */
+	}
+	lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u\n", lwpcnt);
+	lxpr_uiobuf_printf(uiobuf,
+	    "SigPnd:\t%08x%08x\n"
+	    "SigBlk:\t%08x%08x\n"
+	    "SigIgn:\t%08x%08x\n"
+	    "SigCgt:\t%08x%08x\n",
+	    current.__sigbits[1], current.__sigbits[0],
+	    0, 0, /* signals blocked on per thread basis */
+	    ignore.__sigbits[1], ignore.__sigbits[0],
+	    handle.__sigbits[1], handle.__sigbits[0]);
+	/* Report only the full bounding set for now */
+	lxpr_uiobuf_printf(uiobuf,
+	    "CapInh:\t%016x\n"
+	    "CapPrm:\t%016x\n"
+	    "CapEff:\t%016x\n"
+	    "CapBnd:\t%016llx\n",
+	    0, 0, 0, 0x1fffffffffLL);
+}
+
+/*
+ * lxpr_read_pid_status(): status file for a process (lookup_id of 0)
+ */
+static void
+lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS);
+
+	lxpr_read_status_common(lxpnp, uiobuf, 0);
+}
+
+/*
+ * lxpr_read_pid_tid_status(): status file for the thread named by lxpr_desc
+ */
+static void
+lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STATUS);
+	lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc);
+}
+
+/*
+ * Same logic as the lx devfs lxd_pts_devt_translator: the native minor
+ * number is split into a major offset from LX_PTS_MAJOR_MIN and a minor
+ * below LX_MAXMIN.
+ */
+static dev_t
+lxpr_xlate_pts_dev(dev_t dev)
+{
+	const minor_t native_min = getminor(dev);
+	const int lmaj = LX_PTS_MAJOR_MIN + (native_min / LX_MAXMIN);
+	const int lmin = native_min % LX_MAXMIN;
+
+	return (LX_MAKEDEVICE(lmaj, lmin));
+}
+
+/*
+ * pid/tid common code to read stat file
+ *
+ * Emits the single-line, space-separated stat record (39 fields).  A
+ * lookup_id of 0 selects the process-wide view: field 1 is the (possibly
+ * remapped) pid and utime/stime come from the proc.  Any other lookup_id names
+ * a specific thread: field 1 is the node's lxpr_desc and utime/stime come from
+ * that thread's LMS_USER/LMS_SYSTEM microstate accounting, converted to ticks.
+ *
+ * The zone's init process is reported with pid 1 and zsched with pid 0; both
+ * get fixed values for their parent, process group and session fields.
+ *
+ * EINVAL is set on the uiobuf, and nothing is printed, if the process cannot
+ * be locked or if a specific thread was requested but not found.
+ */
+static void
+lxpr_read_stat_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf,
+    uint_t lookup_id)
+{
+	proc_t *p;
+	kthread_t *t;
+	struct as *as;
+	char stat;
+	pid_t pid, ppid, pgpid, spid;
+	gid_t psgid;
+	dev_t psdev;
+	size_t rss, vsize;
+	int nice, pri, lwpcnt;
+	caddr_t wchan, stackbase;
+	processorid_t cpu;
+	pid_t real_pid;
+	clock_t utime, stime, cutime, cstime, ticks, boottime;
+	char buf_comm[MAXCOMLEN + 1];
+	rlim64_t vmem_ctl;
+
+	real_pid = get_real_pid(lxpnp->lxpr_pid);
+	p = lxpr_lock(real_pid, ZOMB_OK);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	/*
+	 * Set Linux defaults if we're the zone's init process
+	 */
+	pid = p->p_pid;
+	if (pid == curproc->p_zone->zone_proc_initpid) {
+		pid = 1;		/* PID for init */
+		ppid = 0;		/* parent PID for init is 0 */
+		pgpid = 0;		/* process group for init is 0 */
+		psgid = (gid_t)-1;	/* credential GID for init is -1 */
+		spid = 0;		/* session id for init is 0 */
+		psdev = 0;		/* session device for init is 0 */
+	} else if (pid == curproc->p_zone->zone_zsched->p_pid) {
+		pid = 0;		/* PID for zsched */
+		ppid = 0;		/* parent PID for zsched is 0 */
+		pgpid = 0;		/* process group for zsched is 0 */
+		psgid = (gid_t)-1;	/* credential GID for zsched is -1 */
+		spid = 0;		/* session id for zsched is 0 */
+		psdev = 0;		/* session device for zsched is 0 */
+	} else {
+		/*
+		 * Make sure not to reference parent PIDs that reside outside
+		 * the zone
+		 */
+		ppid = ((p->p_flag & SZONETOP) ?
+		    curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+		/*
+		 * Convert ppid to the Linux default of 1 if our parent is the
+		 * zone's init process
+		 */
+		if (ppid == curproc->p_zone->zone_proc_initpid)
+			ppid = 1;
+
+		pgpid = p->p_pgrp;
+
+		mutex_enter(&p->p_splock);
+		mutex_enter(&p->p_sessp->s_lock);
+		spid = p->p_sessp->s_sid;
+		psdev = lxpr_xlate_pts_dev(p->p_sessp->s_dev);
+		if (p->p_sessp->s_cred)
+			psgid = crgetgid(p->p_sessp->s_cred);
+		else
+			psgid = crgetgid(p->p_cred);
+
+		mutex_exit(&p->p_sessp->s_lock);
+		mutex_exit(&p->p_splock);
+	}
+
+	if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+	    (p->p_as == &kas)) {
+		stackbase = 0;
+	} else {
+		/* from prgetstackbase() */
+		stackbase = p->p_usrstack - p->p_stksize;
+	}
+
+	utime = stime = 0;
+	t = lxpr_get_thread(p, lookup_id);
+	if (t != NULL) {
+		klwp_t *lwp = ttolwp(t);
+		struct mstate *ms = &lwp->lwp_mstate;
+		hrtime_t utm, stm;
+
+		switch (t->t_state) {
+		case TS_SLEEP:
+			stat = 'S';
+			break;
+		case TS_RUN:
+		case TS_ONPROC:
+			stat = 'R';
+			break;
+		case TS_ZOMB:
+			stat = 'Z';
+			break;
+		case TS_STOPPED:
+			stat = 'T';
+			break;
+		default:
+			stat = '!';
+			break;
+		}
+
+		if (CL_DONICE(t, NULL, 0, &nice) != 0)
+			nice = 0;
+
+		pri = t->t_pri;
+		wchan = t->t_wchan;
+		cpu = t->t_cpu->cpu_id;
+
+		utm = ms->ms_acct[LMS_USER];
+		stm = ms->ms_acct[LMS_SYSTEM];
+
+		thread_unlock(t);
+
+		/* convert unscaled high-res time to nanoseconds */
+		scalehrtime(&utm);
+		scalehrtime(&stm);
+
+		/* Linux /proc expects these values in ticks */
+		utime = (clock_t)NSEC_TO_TICK(utm);
+		stime = (clock_t)NSEC_TO_TICK(stm);
+	} else {
+		if (lookup_id != 0) {
+			/* we can't find this specific thread */
+			lxpr_uiobuf_seterr(uiobuf, EINVAL);
+			lxpr_unlock(p);
+			return;
+		}
+
+		/* Only zombies have no threads */
+		stat = 'Z';
+		nice = 0;
+		pri = 0;
+		wchan = 0;
+		cpu = 0;
+	}
+	as = p->p_as;
+	mutex_exit(&p->p_lock);
+	if (as != &kas) {
+		AS_LOCK_ENTER(as, RW_READER);
+		vsize = as->a_resvsize;
+		rss = rm_asrss(as);
+		AS_LOCK_EXIT(as);
+	} else {
+		vsize = 0;
+		rss = 0;
+	}
+	mutex_enter(&p->p_lock);
+
+	if (lookup_id == 0) {
+		/* process */
+		utime = p->p_utime;
+		stime = p->p_stime;
+	} else {
+		/* tid: utime & stime for the thread set in block above */
+	}
+	cutime = p->p_cutime;
+	cstime = p->p_cstime;
+	lwpcnt = p->p_lwpcnt;
+	vmem_ctl = p->p_vmem_ctl;
+	strlcpy(buf_comm, p->p_user.u_comm, sizeof (buf_comm));
+	ticks = p->p_user.u_ticks;	/* lbolt at process start */
+	/* adjust ticks to account for zone boot time */
+	boottime = LXPTOZ(lxpnp)->zone_zsched->p_user.u_ticks;
+	ticks -= boottime;
+	lxpr_unlock(p);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%d "					/* 1 */
+	    "(%s) %c %d %d %d %d %d "			/* 2-8 */
+	    "%lu %lu %lu %lu %lu "			/* 9-13 */
+	    "%lu %lu %ld %ld "				/* 14-17 */
+	    "%d %d %d "					/* 18-20 */
+	    "%lu "					/* 21 */
+	    "%lu "					/* 22 */
+	    "%lu %ld %llu "				/* 23-25 */
+	    "%lu %lu %llu "				/* 26-28 */
+	    "%lu %lu "					/* 29-30 */
+	    "%lu %lu %lu %lu "				/* 31-34 */
+	    "%lu "					/* 35 */
+	    "%lu %lu "					/* 36-37 */
+	    "%d "					/* 38 */
+	    "%d"					/* 39 */
+	    "\n",
+	    (lookup_id == 0) ? pid : lxpnp->lxpr_desc,	/* 1 */
+	    buf_comm, stat, ppid, pgpid, spid, psdev, psgid, /* 2-8 */
+	    0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+	    utime, stime, cutime, cstime,		/* 14-17 */
+	    pri, nice, lwpcnt,				/* 18-20 */
+	    0l, /* itrealvalue (time before next SIGALRM) 21 */
+	    ticks,					/* 22 */
+	    vsize, rss, vmem_ctl,			/* 23-25 */
+	    0l, 0l, stackbase, /* startcode, endcode, startstack 26-28 */
+	    0l, 0l,				/* kstkesp, kstkeip 29-30 */
+	    0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch 31-34 */
+	    wchan,					/* 35 */
+	    0l, 0l,					/* nswap,cnswap 36-37 */
+	    0,						/* exit_signal	38 */
+	    cpu						/* 39 */);
+}
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ *
+ * Process-wide variant: calls lxpr_read_stat_common() with a lookup id of 0.
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+	lxpr_read_stat_common(lxpnp, uiobuf, 0);
+}
+
+/*
+ * lxpr_read_pid_tid_stat(): pid/tid stat file
+ *
+ * Per-thread variant: calls lxpr_read_stat_common() with the node's lxpr_desc
+ * value as the lookup id.
+ */
+static void
+lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STAT);
+	lxpr_read_stat_common(lxpnp, uiobuf, lxpnp->lxpr_desc);
+}
+
+/*
+ * lxpr_read_net_arp(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * Per-interface counters, filled in from named kstats by lxpr_kstat_ifstat().
+ * The kstat name each field is loaded from is noted alongside it.
+ */
+struct lxpr_ifstat {
+	uint64_t rx_bytes;	/* "rbytes64" */
+	uint64_t rx_packets;	/* "ipackets64" */
+	uint64_t rx_errors;	/* "ierrors" */
+	uint64_t rx_drop;	/* "norcvbuf" */
+	uint64_t tx_bytes;	/* "obytes64" */
+	uint64_t tx_packets;	/* "opackets64" */
+	uint64_t tx_errors;	/* "oerrors" */
+	uint64_t tx_drop;	/* "noxmtbuf" */
+	uint64_t collisions;	/* "collisions" */
+	uint64_t rx_multicast;	/* "multircv" */
+};
+
+/*
+ * Take a snapshot of a kstat's data.
+ *
+ * The kstat is looked up by module/instance/name when byname is B_TRUE, or by
+ * kn->ks_kid otherwise; both lookups are passed the caller's zone id.  On
+ * success the returned buffer was obtained from kmem_alloc(); *size receives
+ * its allocation size (needed by the caller's kmem_free()) and *num receives
+ * the kstat's ks_ndata.  NULL is returned, with *size and *num left untouched,
+ * if the kstat cannot be held, is flagged invalid, is still larger than the
+ * buffer after the final attempt, or fails to snapshot.
+ */
+static void *
+lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num)
+{
+	kstat_t *kp;
+	int i, nrec = 0;
+	size_t bufsize;
+	void *buf = NULL;
+
+	if (byname == B_TRUE) {
+		kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+		    kn->ks_name, getzoneid());
+	} else {
+		kp = kstat_hold_bykid(kn->ks_kid, getzoneid());
+	}
+	if (kp == NULL) {
+		return (NULL);
+	}
+	if (kp->ks_flags & KSTAT_FLAG_INVALID) {
+		kstat_rele(kp);
+		return (NULL);
+	}
+
+	bufsize = kp->ks_data_size + 1;
+	kstat_rele(kp);
+
+	/*
+	 * The kstat in question is released so that kmem_alloc(KM_SLEEP) is
+	 * performed without it held.  After the alloc, the kstat is reacquired
+	 * and its size is checked again. If the buffer is no longer large
+	 * enough, the alloc and check are repeated; at most two attempts are
+	 * made before giving up.
+	 */
+	for (i = 0; i < 2; i++) {
+		buf = kmem_alloc(bufsize, KM_SLEEP);
+
+		/* Check if bufsize still appropriate */
+		if (byname == B_TRUE) {
+			kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+			    kn->ks_name, getzoneid());
+		} else {
+			kp = kstat_hold_bykid(kn->ks_kid, getzoneid());
+		}
+		if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) {
+			if (kp != NULL) {
+				kstat_rele(kp);
+			}
+			kmem_free(buf, bufsize);
+			return (NULL);
+		}
+		KSTAT_ENTER(kp);
+		(void) KSTAT_UPDATE(kp, KSTAT_READ);
+		if (bufsize < kp->ks_data_size) {
+			kmem_free(buf, bufsize);
+			buf = NULL;
+			bufsize = kp->ks_data_size + 1;
+			KSTAT_EXIT(kp);
+			kstat_rele(kp);
+			continue;
+		} else {
+			if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) {
+				kmem_free(buf, bufsize);
+				buf = NULL;
+			}
+			nrec = kp->ks_ndata;
+			KSTAT_EXIT(kp);
+			kstat_rele(kp);
+			break;
+		}
+	}
+
+	if (buf != NULL) {
+		*size = bufsize;
+		*num = nrec;
+	}
+	return (buf);
+}
+
+/*
+ * Load the interface counters in *ifs from the named kstat described by kn.
+ * *ifs is zeroed first, so counters with no matching kstat entry read as 0.
+ * Returns 0 on success, or -1 if the kstat data could not be read.
+ */
+static int
+lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs)
+{
+	kstat_named_t *tbl, *ent;
+	size_t tblsz;
+	int idx, cnt;
+
+	bzero(ifs, sizeof (*ifs));
+
+	/*
+	 * Search by name instead of by kid since there's a small window to
+	 * race against kstats being added/removed.
+	 */
+	tbl = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &tblsz, &cnt);
+	if (tbl == NULL)
+		return (-1);
+
+	for (idx = 0; idx < cnt; idx++) {
+		ent = &tbl[idx];
+
+		if (strncmp(ent->name, "rbytes64", KSTAT_STRLEN) == 0) {
+			ifs->rx_bytes = ent->value.ui64;
+		} else if (strncmp(ent->name, "ipackets64",
+		    KSTAT_STRLEN) == 0) {
+			ifs->rx_packets = ent->value.ui64;
+		} else if (strncmp(ent->name, "ierrors", KSTAT_STRLEN) == 0) {
+			ifs->rx_errors = ent->value.ui32;
+		} else if (strncmp(ent->name, "norcvbuf", KSTAT_STRLEN) == 0) {
+			ifs->rx_drop = ent->value.ui32;
+		} else if (strncmp(ent->name, "multircv", KSTAT_STRLEN) == 0) {
+			ifs->rx_multicast = ent->value.ui32;
+		} else if (strncmp(ent->name, "obytes64", KSTAT_STRLEN) == 0) {
+			ifs->tx_bytes = ent->value.ui64;
+		} else if (strncmp(ent->name, "opackets64",
+		    KSTAT_STRLEN) == 0) {
+			ifs->tx_packets = ent->value.ui64;
+		} else if (strncmp(ent->name, "oerrors", KSTAT_STRLEN) == 0) {
+			ifs->tx_errors = ent->value.ui32;
+		} else if (strncmp(ent->name, "noxmtbuf", KSTAT_STRLEN) == 0) {
+			ifs->tx_drop = ent->value.ui32;
+		} else if (strncmp(ent->name, "collisions",
+		    KSTAT_STRLEN) == 0) {
+			ifs->collisions = ent->value.ui32;
+		}
+	}
+
+	kmem_free(tbl, tblsz);
+	return (0);
+}
+
+/*
+ * lxpr_read_net_dev(): per-interface traffic counters.
+ *
+ * Prints the two fixed header lines, then reads the data of the kstat with
+ * kid 0 and treats it as an array of kstat_t.  Starting at index 1, each
+ * entry whose module is "link" or "lo" gets one output line built from
+ * lxpr_kstat_ifstat(); entries whose counters cannot be read are skipped.
+ * Columns with no counterpart in struct lxpr_ifstat are printed as 0.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	kstat_t *ksr;
+	kstat_t ks0;
+	int i, nidx;
+	size_t sidx;
+	struct lxpr_ifstat ifs;
+
+	lxpr_uiobuf_printf(uiobuf, "Inter-|   Receive                   "
+	    "                             |  Transmit\n");
+	lxpr_uiobuf_printf(uiobuf, " face |bytes    packets errs drop fifo"
+	    " frame compressed multicast|bytes    packets errs drop fifo"
+	    " colls carrier compressed\n");
+
+	ks0.ks_kid = 0;
+	ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx);
+	if (ksr == NULL)
+		return;
+
+	for (i = 1; i < nidx; i++) {
+		if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 ||
+		    strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) {
+			if (lxpr_kstat_ifstat(&ksr[i], &ifs) != 0)
+				continue;
+
+			/* Overwriting the name is ok in the local snapshot */
+			lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE);
+			lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu "
+			    "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u "
+			    "%5lu %7u %10u\n",
+			    ksr[i].ks_name,
+			    ifs.rx_bytes, ifs.rx_packets,
+			    ifs.rx_errors, ifs.rx_drop,
+			    0, 0, 0, ifs.rx_multicast,
+			    ifs.tx_bytes, ifs.tx_packets,
+			    ifs.tx_errors, ifs.tx_drop,
+			    0, ifs.collisions, 0, 0);
+		}
+	}
+
+	kmem_free(ksr, sidx);
+}
+
+/*
+ * lxpr_read_net_dev_mcast(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * Render the 16 bytes of an IPv6 address as 32 lowercase hex digits, with no
+ * separators, followed by a terminating NUL.  buf must hold 33 bytes.
+ */
+static void
+lxpr_inet6_out(const in6_addr_t *addr, char buf[33])
+{
+	static const char hexdigits[] = "0123456789abcdef";
+	const uint8_t *octet = addr->s6_addr;
+	char *out = buf;
+	int n;
+
+	for (n = 0; n < 16; n++, octet++) {
+		/* high nibble first, then low nibble */
+		*out++ = hexdigits[*octet >> 4];
+		*out++ = hexdigits[*octet & 0xf];
+	}
+	*out = '\0';
+}
+
+/*
+ * lxpr_read_net_if_inet6(): IPv6 interface addresses.
+ *
+ * Walks every IPv6 ill in the current netstack (under ips_ill_g_lock held as
+ * reader) and prints one line per ipif: the local address as 32 hex digits,
+ * then the interface index, prefix length, converted scope and flags in hex,
+ * and finally the interface name after lx_ifname_convert().  Nothing is
+ * printed if no netstack can be obtained.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	ip_stack_t *ipst;
+	ill_t *ill;
+	ipif_t *ipif;
+	ill_walk_context_t	ctx;
+	char ifname[LIFNAMSIZ], ip6out[33];
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return;
+	ipst = ns->netstack_ip;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	ill = ILL_START_WALK_V6(&ctx, ipst);
+
+	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+		for (ipif = ill->ill_ipif; ipif != NULL;
+		    ipif = ipif->ipif_next) {
+			uint_t index = ill->ill_phyint->phyint_ifindex;
+			int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
+			unsigned int scope = lx_ipv6_scope_convert(
+			    &ipif->ipif_v6lcl_addr);
+			/* Always report PERMANENT flag */
+			int flag = 0x80;
+
+			(void) snprintf(ifname, LIFNAMSIZ, "%s", ill->ill_name);
+			lx_ifname_convert(ifname, LX_IF_FROMNATIVE);
+			lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out);
+
+			lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x"
+			    " %8s\n", ip6out, index, plen, scope, flag, ifname);
+		}
+	}
+	rw_exit(&ipst->ips_ill_g_lock);
+	netstack_rele(ns);
+}
+
+/*
+ * lxpr_read_net_igmp(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_ip_mr_cache(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_ip_mr_vif(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * Walker callback (see lxpr_read_net_ipv6_route()) that prints one IPv6 route
+ * line: destination address and prefix length, an all-zero second
+ * address/length pair, the gateway address, then four hex words (0, the IRE
+ * reference count, 0, flags) and the converted interface name.  The name is
+ * empty when the IRE has no ill.
+ */
+static void
+lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf)
+{
+	uint32_t flags;
+	char name[IFNAMSIZ];
+	char ipv6addr[33];
+
+	lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr);
+	lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr,
+	    ip_mask_to_plen_v6(&ire->ire_mask_v6));
+
+	/* punt on this for now */
+	lxpr_uiobuf_printf(uiobuf, "%s %02x ",
+	    "00000000000000000000000000000000", 0);
+
+	lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr);
+	lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr);
+
+	flags = ire->ire_flags &
+	    (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED);
+	/* Linux's RTF_LOCAL equivalent */
+	if (ire->ire_metrics.iulp_local)
+		flags |= 0x80000000;
+
+	if (ire->ire_ill != NULL) {
+		ill_get_name(ire->ire_ill, name, sizeof (name));
+		lx_ifname_convert(name, LX_IF_FROMNATIVE);
+	} else {
+		name[0] = '\0';
+	}
+
+	lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n",
+	    0, /* metric */
+	    ire->ire_refcnt,
+	    0,
+	    flags,
+	    name);
+}
+
+/*
+ * lxpr_read_net_ipv6_route(): IPv6 routing table.
+ *
+ * Runs lxpr_format_route_ipv6() over every IPv6 IRE in the current netstack.
+ * Nothing is printed if no netstack can be obtained.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	ip_stack_t *ipst;
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return;
+	ipst = ns->netstack_ip;
+
+	/*
+	 * LX branded zones are expected to have exclusive IP stack, hence
+	 * using ALL_ZONES as the zoneid filter.
+	 */
+	ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst);
+
+	netstack_rele(ns);
+}
+
+/*
+ * lxpr_read_net_mcfilter(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_netstat(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_raw(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * True when the IRE type has any of the listed type bits set; such routes are
+ * left out of the IPv4 route listing.  The argument is parenthesized so that
+ * compound expressions (e.g. "a | b") are masked as a whole.
+ */
+#define	LXPR_SKIP_ROUTE(type)	\
+	(((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \
+	IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & (type)) != 0)
+
+/*
+ * Walker callback (see lxpr_read_net_route()) that prints one tab-separated
+ * IPv4 route line: interface name, destination, gateway, flags, three zero
+ * columns, netmask, two more zero columns and the route's iulp_rtt metric.
+ * IREs matching LXPR_SKIP_ROUTE() or marked ire_testhidden are not printed.
+ * The interface name is "*" when neither the IRE nor any IRE on its
+ * ire_dep_parent chain has an ill.
+ */
+static void
+lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf)
+{
+	uint32_t flags;
+	char name[IFNAMSIZ];
+	ill_t *ill;
+	ire_t *nire;
+	ipif_t *ipif;
+	ipaddr_t gateway;
+
+	if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0)
+		return;
+
+	/* These route flags have direct Linux equivalents */
+	flags = ire->ire_flags &
+	    (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED);
+
+	/*
+	 * Search for a suitable IRE for naming purposes.
+	 * On Linux, the default route is typically associated with the
+	 * interface used to access gateway.  The default IRE on Illumos
+	 * typically lacks an ill reference but its parent might have one.
+	 */
+	nire = ire;
+	do {
+		ill = nire->ire_ill;
+		nire = nire->ire_dep_parent;
+	} while (ill == NULL && nire != NULL);
+	if (ill != NULL) {
+		ill_get_name(ill, name, sizeof (name));
+		lx_ifname_convert(name, LX_IF_FROMNATIVE);
+	} else {
+		name[0] = '*';
+		name[1] = '\0';
+	}
+
+	/*
+	 * Linux suppresses the gateway address for directly connected
+	 * interface networks.  To emulate this behavior, we walk all addresses
+	 * of a given route interface.  If one matches the gateway, it is
+	 * displayed as NULL.
+	 */
+	gateway = ire->ire_gateway_addr;
+	if ((ill = ire->ire_ill) != NULL) {
+		for (ipif = ill->ill_ipif; ipif != NULL;
+		    ipif = ipif->ipif_next) {
+			if (ipif->ipif_lcl_addr == gateway) {
+				gateway = 0;
+				break;
+			}
+		}
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+	    "%d\t%08X\t%d\t%u\t%u\n",
+	    name,
+	    ire->ire_addr,
+	    gateway,
+	    flags, 0, 0,
+	    0, /* priority */
+	    ire->ire_mask,
+	    0, 0, /* mss, window */
+	    ire->ire_metrics.iulp_rtt);
+}
+
+/*
+ * lxpr_read_net_route(): IPv4 routing table.
+ *
+ * Prints the column header, then runs lxpr_format_route_ipv4() over every
+ * IPv4 IRE in the current netstack.  Only the header is printed if no
+ * netstack can be obtained.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	ip_stack_t *ipst;
+
+	lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t"
+	    "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n");
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return;
+	ipst = ns->netstack_ip;
+
+	/*
+	 * LX branded zones are expected to have exclusive IP stack, hence
+	 * using ALL_ZONES as the zoneid filter.
+	 */
+	ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst);
+
+	netstack_rele(ns);
+}
+
+/*
+ * lxpr_read_net_rpc(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_rt_cache(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_net_sockstat(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * Description of one protocol section of the snmp file.  lst_proto is the
+ * name compared against the ks_name of "mib2"-class kstats; lst_fields is a
+ * NULL-terminated list of named-kstat field names, in the order they are
+ * printed.  Both are capitalized (first letter only) on output.
+ */
+typedef struct lxpr_snmp_table {
+	const char *lst_proto;
+	const char *lst_fields[];
+} lxpr_snmp_table_t;
+
+static lxpr_snmp_table_t lxpr_snmp_ip = { "ip",
+	{
+	"forwarding", "defaultTTL", "inReceives", "inHdrErrors",
+	"inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards",
+	"inDelivers", "outRequests", "outDiscards", "outNoRoutes",
+	"reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs",
+	"fragFails", "fragCreates",
+	NULL
+	}
+};
+static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp",
+	{
+	"inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds",
+	"inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", "inEchoReps",
+	"inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps",
+	"outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds",
+	"outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos",
+	"outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks",
+	"outAddrMaskReps",
+	NULL
+	}
+};
+static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp",
+	{
+	"rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens",
+	"passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs",
+	"outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors",
+	NULL
+	}
+};
+static lxpr_snmp_table_t lxpr_snmp_udp = { "udp",
+	{
+	"inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors",
+	"sndbufErrors", "inCsumErrors",
+	NULL
+	}
+};
+
+/* NULL-terminated list of the sections above, in output order */
+static lxpr_snmp_table_t *lxpr_net_snmptab[] = {
+	&lxpr_snmp_ip,
+	&lxpr_snmp_icmp,
+	&lxpr_snmp_tcp,
+	&lxpr_snmp_udp,
+	NULL
+};
+
+/*
+ * Print one protocol's section of the snmp file as two lines: a header line
+ * of "<Proto>:" followed by the capitalized field names from the table, then
+ * a value line of "<Proto>:" followed by the matching named-kstat values.
+ *
+ * Fields with no matching kstat entry, or whose entry is not one of the
+ * 32/64-bit integer types, are printed as 0 so that the value line always
+ * carries the same number of columns as the header line.  Nothing is printed
+ * if the kstat described by kn cannot be read.
+ */
+static void
+lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table,
+    kstat_t *kn)
+{
+	kstat_named_t *klist;
+	char upname[KSTAT_STRLEN], upfield[KSTAT_STRLEN];
+	int i, j, num;
+	size_t size;
+
+	klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num);
+	if (klist == NULL)
+		return;
+
+	/*
+	 * Print the header line, fields capitalized.  strlcpy() is used so
+	 * that the local copies are always NUL-terminated.
+	 */
+	(void) strlcpy(upname, table->lst_proto, sizeof (upname));
+	upname[0] = toupper(upname[0]);
+	lxpr_uiobuf_printf(uiobuf, "%s:", upname);
+	for (i = 0; table->lst_fields[i] != NULL; i++) {
+		(void) strlcpy(upfield, table->lst_fields[i],
+		    sizeof (upfield));
+		upfield[0] = toupper(upfield[0]);
+		lxpr_uiobuf_printf(uiobuf, " %s", upfield);
+	}
+	lxpr_uiobuf_printf(uiobuf, "\n%s:", upname);
+
+	/* Then loop back through to print the value line. */
+	for (i = 0; table->lst_fields[i] != NULL; i++) {
+		kstat_named_t *kpoint = NULL;
+		for (j = 0; j < num; j++) {
+			if (strncmp(klist[j].name, table->lst_fields[i],
+			    KSTAT_STRLEN) == 0) {
+				kpoint = &klist[j];
+				break;
+			}
+		}
+		if (kpoint == NULL) {
+			/* Output 0 for unknown fields */
+			lxpr_uiobuf_printf(uiobuf, " 0");
+		} else {
+			switch (kpoint->data_type) {
+			case KSTAT_DATA_INT32:
+				lxpr_uiobuf_printf(uiobuf, " %d",
+				    kpoint->value.i32);
+				break;
+			case KSTAT_DATA_UINT32:
+				lxpr_uiobuf_printf(uiobuf, " %u",
+				    kpoint->value.ui32);
+				break;
+			case KSTAT_DATA_INT64:
+				lxpr_uiobuf_printf(uiobuf, " %ld",
+				    kpoint->value.l);
+				break;
+			case KSTAT_DATA_UINT64:
+				lxpr_uiobuf_printf(uiobuf, " %lu",
+				    kpoint->value.ul);
+				break;
+			default:
+				/* Keep the columns aligned with the header */
+				lxpr_uiobuf_printf(uiobuf, " 0");
+				break;
+			}
+		}
+	}
+	lxpr_uiobuf_printf(uiobuf, "\n");
+	kmem_free(klist, size);
+}
+
+/*
+ * lxpr_read_net_snmp(): protocol MIB counters.
+ *
+ * Reads the data of the kstat with kid 0 and treats it as an array of
+ * kstat_t.  For each table in lxpr_net_snmptab, in order, the first entry of
+ * class "mib2" whose name equals the table's lst_proto is printed with
+ * lxpr_kstat_print_tab().  Tables with no matching kstat are skipped.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	kstat_t *ksr;
+	kstat_t ks0;
+	lxpr_snmp_table_t **table = lxpr_net_snmptab;
+	int i, t, nidx;
+	size_t sidx;
+
+	ks0.ks_kid = 0;
+	ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx);
+	if (ksr == NULL)
+		return;
+
+	for (t = 0; table[t] != NULL; t++) {
+		for (i = 0; i < nidx; i++) {
+			if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0)
+				continue;
+			if (strncmp(ksr[i].ks_name, table[t]->lst_proto,
+			    KSTAT_STRLEN) == 0) {
+				lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i]);
+				break;
+			}
+		}
+	}
+	kmem_free(ksr, sidx);
+}
+
+/*
+ * lxpr_read_net_stat(): placeholder; nothing is written to uiobuf.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * Map a native TCPS_* connection state onto the numeric state reported in
+ * the Linux tcp/tcp6 tables.  The numbering is derived from the enum located
+ * in the Linux kernel sources: include/net/tcp_states.h
+ */
+static int
+lxpr_convert_tcp_state(int st)
+{
+	/* No translation for TCPS_IDLE, TCPS_BOUND or anything else */
+	int lx_state = 0;
+
+	switch (st) {
+	case TCPS_ESTABLISHED:
+		lx_state = 1;
+		break;
+	case TCPS_SYN_SENT:
+		lx_state = 2;
+		break;
+	case TCPS_SYN_RCVD:
+		lx_state = 3;
+		break;
+	case TCPS_FIN_WAIT_1:
+		lx_state = 4;
+		break;
+	case TCPS_FIN_WAIT_2:
+		lx_state = 5;
+		break;
+	case TCPS_TIME_WAIT:
+		lx_state = 6;
+		break;
+	case TCPS_CLOSED:
+		lx_state = 7;
+		break;
+	case TCPS_CLOSE_WAIT:
+		lx_state = 8;
+		break;
+	case TCPS_LAST_ACK:
+		lx_state = 9;
+		break;
+	case TCPS_LISTEN:
+		lx_state = 10;
+		break;
+	case TCPS_CLOSING:
+		lx_state = 11;
+		break;
+	default:
+		break;
+	}
+
+	return (lx_state);
+}
+
+/*
+ * Emit the tcp (ipver == IPV4_VERSION) or tcp6 (ipver == IPV6_VERSION) table:
+ * a header line, then one line for every TCP conn_t found in the current
+ * netstack's global connection hash whose IP version matches ipver.  Only the
+ * header is printed if no netstack can be obtained.
+ *
+ * The queue column is "tx_queue:rx_queue", matching the header: the unsent
+ * byte count is printed first and the received-but-unread count second.
+ */
+static void
+lxpr_format_tcp(lxpr_uiobuf_t *uiobuf, ushort_t ipver)
+{
+	int i, sl = 0;
+	connf_t *connfp;
+	conn_t *connp;
+	netstack_t *ns;
+	ip_stack_t *ipst;
+
+	ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION);
+	if (ipver == IPV4_VERSION) {
+		lxpr_uiobuf_printf(uiobuf, "  sl  local_address rem_address   "
+		    "st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout "
+		    "inode\n");
+	} else {
+		lxpr_uiobuf_printf(uiobuf, "  sl  "
+		    "local_address                         "
+		    "remote_address                        "
+		    "st tx_queue rx_queue tr tm->when retrnsmt   "
+		    "uid  timeout inode\n");
+	}
+	/*
+	 * Due to differences between the Linux and illumos TCP
+	 * implementations, some data will be omitted from the output here.
+	 *
+	 * Valid fields:
+	 *  - local_address
+	 *  - remote_address
+	 *  - st
+	 *  - tx_queue
+	 *  - rx_queue
+	 *  - uid
+	 *  - inode
+	 *
+	 * Omitted/invalid fields
+	 *  - tr
+	 *  - tm->when
+	 *  - retrnsmt
+	 *  - timeout
+	 */
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return;
+	ipst = ns->netstack_ip;
+
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipst->ips_ipcl_globalhash_fanout[i];
+		connp = NULL;
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) {
+			tcp_t *tcp;
+			vattr_t attr;
+			sonode_t *so = (sonode_t *)connp->conn_upper_handle;
+			vnode_t *vp = (so != NULL) ? so->so_vnode : NULL;
+			if (connp->conn_ipversion != ipver)
+				continue;
+			tcp = connp->conn_tcp;
+			if (ipver == IPV4_VERSION) {
+				lxpr_uiobuf_printf(uiobuf,
+				    "%4d: %08X:%04X %08X:%04X ",
+				    ++sl,
+				    connp->conn_laddr_v4,
+				    ntohs(connp->conn_lport),
+				    connp->conn_faddr_v4,
+				    ntohs(connp->conn_fport));
+			} else {
+				lxpr_uiobuf_printf(uiobuf, "%4d: "
+				    "%08X%08X%08X%08X:%04X "
+				    "%08X%08X%08X%08X:%04X ",
+				    ++sl,
+				    connp->conn_laddr_v6.s6_addr32[0],
+				    connp->conn_laddr_v6.s6_addr32[1],
+				    connp->conn_laddr_v6.s6_addr32[2],
+				    connp->conn_laddr_v6.s6_addr32[3],
+				    ntohs(connp->conn_lport),
+				    connp->conn_faddr_v6.s6_addr32[0],
+				    connp->conn_faddr_v6.s6_addr32[1],
+				    connp->conn_faddr_v6.s6_addr32[2],
+				    connp->conn_faddr_v6.s6_addr32[3],
+				    ntohs(connp->conn_fport));
+			}
+
+			/* fetch the simulated inode for the socket */
+			if (vp == NULL ||
+			    VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0)
+				attr.va_nodeid = 0;
+
+			lxpr_uiobuf_printf(uiobuf,
+			    "%02X %08X:%08X %02X:%08X %08X "
+			    "%5u %8d %lu %d %p %u %u %u %u %d\n",
+			    lxpr_convert_tcp_state(tcp->tcp_state),
+			    tcp->tcp_unsent, tcp->tcp_rcv_cnt, /* tx/rx queue */
+			    0, 0, /* tr, when */
+			    0, /* per-connection rexmits aren't tracked today */
+			    connp->conn_cred->cr_uid,
+			    0, /* timeout */
+			    /* inode + more */
+			    (ino_t)attr.va_nodeid, 0, NULL, 0, 0, 0, 0, 0);
+		}
+	}
+	netstack_rele(ns);
+}
+
+/*
+ * lxpr_read_net_tcp(): IPv4 TCP connection table, via lxpr_format_tcp().
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_format_tcp(uiobuf, IPV4_VERSION);
+}
+
+/*
+ * lxpr_read_net_tcp6(): IPv6 TCP connection table, via lxpr_format_tcp().
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_format_tcp(uiobuf, IPV6_VERSION);
+}
+
+/*
+ * Emit the udp (ipver == IPV4_VERSION) or udp6 (ipver == IPV6_VERSION) table:
+ * a header line, then one line for every UDP conn_t found in the current
+ * netstack's global connection hash whose IP version matches ipver.  Only the
+ * header is printed if no netstack can be obtained.
+ */
+static void
+lxpr_format_udp(lxpr_uiobuf_t *uiobuf, ushort_t ipver)
+{
+	int i, sl = 0;
+	connf_t *connfp;
+	conn_t *connp;
+	netstack_t *ns;
+	ip_stack_t *ipst;
+
+	ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION);
+	if (ipver == IPV4_VERSION) {
+		lxpr_uiobuf_printf(uiobuf, "  sl  local_address rem_address"
+		    "   st tx_queue rx_queue tr tm->when retrnsmt   uid"
+		    "  timeout inode ref pointer drops\n");
+	} else {
+		lxpr_uiobuf_printf(uiobuf, "  sl  "
+		    "local_address                         "
+		    "remote_address                        "
+		    "st tx_queue rx_queue tr tm->when retrnsmt   "
+		    "uid  timeout inode ref pointer drops\n");
+	}
+	/*
+	 * Due to differences between the Linux and illumos UDP
+	 * implementations, some data will be omitted from the output here.
+	 *
+	 * Valid fields:
+	 *  - local_address
+	 *  - remote_address
+	 *  - st: limited
+	 *  - uid
+	 *  - inode
+	 *
+	 * Omitted/invalid fields
+	 *  - tx_queue
+	 *  - rx_queue
+	 *  - tr
+	 *  - tm->when
+	 *  - retrnsmt
+	 *  - timeout
+	 */
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return;
+	ipst = ns->netstack_ip;
+
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipst->ips_ipcl_globalhash_fanout[i];
+		connp = NULL;
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) {
+			udp_t *udp;
+			int state = 0;
+			vattr_t attr;
+			sonode_t *so = (sonode_t *)connp->conn_upper_handle;
+			vnode_t *vp = (so != NULL) ? so->so_vnode : NULL;
+			if (connp->conn_ipversion != ipver)
+				continue;
+			udp = connp->conn_udp;
+			if (ipver == IPV4_VERSION) {
+				lxpr_uiobuf_printf(uiobuf,
+				    "%4d: %08X:%04X %08X:%04X ",
+				    ++sl,
+				    connp->conn_laddr_v4,
+				    ntohs(connp->conn_lport),
+				    connp->conn_faddr_v4,
+				    ntohs(connp->conn_fport));
+			} else {
+				lxpr_uiobuf_printf(uiobuf, "%4d: "
+				    "%08X%08X%08X%08X:%04X "
+				    "%08X%08X%08X%08X:%04X ",
+				    ++sl,
+				    connp->conn_laddr_v6.s6_addr32[0],
+				    connp->conn_laddr_v6.s6_addr32[1],
+				    connp->conn_laddr_v6.s6_addr32[2],
+				    connp->conn_laddr_v6.s6_addr32[3],
+				    ntohs(connp->conn_lport),
+				    connp->conn_faddr_v6.s6_addr32[0],
+				    connp->conn_faddr_v6.s6_addr32[1],
+				    connp->conn_faddr_v6.s6_addr32[2],
+				    connp->conn_faddr_v6.s6_addr32[3],
+				    ntohs(connp->conn_fport));
+			}
+
+			switch (udp->udp_state) {
+			case TS_UNBND:
+			case TS_IDLE:
+				state = 7;
+				break;
+			case TS_DATA_XFER:
+				state = 1;
+				break;
+			}
+
+			/* fetch the simulated inode for the socket */
+			if (vp == NULL ||
+			    VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0)
+				attr.va_nodeid = 0;
+
+			lxpr_uiobuf_printf(uiobuf,
+			    "%02X %08X:%08X %02X:%08X %08X "
+			    "%5u %8d %lu %d %p %d\n",
+			    state,
+			    0, 0, /* rx/tx queue */
+			    0, 0, /* tr, when */
+			    0, /* retrans */
+			    connp->conn_cred->cr_uid,
+			    0, /* timeout */
+			    /* inode, ref, pointer, drops */
+			    (ino_t)attr.va_nodeid, 0, NULL, 0);
+		}
+	}
+	netstack_rele(ns);
+}
+
+/*
+ * lxpr_read_net_udp(): IPv4 UDP socket table, via lxpr_format_udp().
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_format_udp(uiobuf, IPV4_VERSION);
+}
+
+/*
+ * lxpr_read_net_udp6(): IPv6 UDP socket table, via lxpr_format_udp().
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_format_udp(uiobuf, IPV6_VERSION);
+}
+
+/*
+ * lxpr_read_net_unix(): socket table built from the global socklist.
+ *
+ * Prints a header line, then, with socklist.sl_lock held, one line for each
+ * sonode on the list that has a non-zero so_count and belongs to the caller's
+ * zone.  Each line carries the sonode address, its so_count, a constant 0
+ * protocol, flags, the converted socket type and status, the vnode's node id
+ * (0 if it cannot be fetched) and, when the socket has a local or else a
+ * foreign address, the sa_data of that address as the path.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	sonode_t *so;
+	zoneid_t zoneid = getzoneid();
+
+	lxpr_uiobuf_printf(uiobuf, "Num       RefCount Protocol Flags    Type "
+	    "St Inode Path\n");
+
+	mutex_enter(&socklist.sl_lock);
+	for (so = socklist.sl_list; so != NULL;
+	    so = _SOTOTPI(so)->sti_next_so) {
+		vnode_t *vp = so->so_vnode;
+		vattr_t attr;
+		sotpi_info_t *sti;
+		const char *name = NULL;
+		int status = 0;
+		int type = 0;
+		int flags = 0;
+
+		/* Only process active sonodes in this zone */
+		if (so->so_count == 0 || so->so_zoneid != zoneid)
+			continue;
+
+		/*
+		 * Grab the inode, if possible.
+		 * This must be done before entering so_lock.
+		 */
+		if (vp == NULL ||
+		    VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0)
+			attr.va_nodeid = 0;
+
+		mutex_enter(&so->so_lock);
+		sti = _SOTOTPI(so);
+
+		if (sti->sti_laddr_sa != NULL &&
+		    sti->sti_laddr_len > 0) {
+			name = sti->sti_laddr_sa->sa_data;
+		} else if (sti->sti_faddr_sa != NULL &&
+		    sti->sti_faddr_len > 0) {
+			name = sti->sti_faddr_sa->sa_data;
+		}
+
+		/*
+		 * Derived from enum values in Linux kernel source:
+		 * include/uapi/linux/net.h
+		 */
+		if ((so->so_state & SS_ISDISCONNECTING) != 0) {
+			status = 4;
+		} else if ((so->so_state & SS_ISCONNECTING) != 0) {
+			status = 2;
+		} else if ((so->so_state & SS_ISCONNECTED) != 0) {
+			status = 3;
+		} else {
+			status = 1;
+			/* Add ACC flag for stream-type server sockets */
+			if (so->so_type != SOCK_DGRAM &&
+			    sti->sti_laddr_sa != NULL)
+				flags |= 0x10000;
+		}
+
+		/* Convert to Linux type */
+		switch (so->so_type) {
+		case SOCK_DGRAM:
+			type = 2;
+			break;
+		case SOCK_SEQPACKET:
+			type = 5;
+			break;
+		default:
+			type = 1;
+		}
+
+		lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu",
+		    so,
+		    so->so_count,
+		    0, /* proto, always 0 */
+		    flags,
+		    type,
+		    status,
+		    (ino_t)attr.va_nodeid);
+
+		/*
+		 * Due to shortcomings in the abstract socket emulation, they
+		 * cannot be properly represented here (as @<path>).
+		 *
+		 * This will be the case until they are better implemented.
+		 */
+		if (name != NULL)
+			lxpr_uiobuf_printf(uiobuf, " %s\n", name);
+		else
+			lxpr_uiobuf_printf(uiobuf, "\n");
+		mutex_exit(&so->so_lock);
+	}
+	mutex_exit(&socklist.sl_lock);
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced, unless we're open non-blocking, in which case we return after
+ * 1ms.
+ *
+ * Messages are fetched from the LDI handle lh with ldi_getmsg().  The text is
+ * taken from the message's continuation block (b_cont) and is emitted with
+ * the LX_KMSG_PRI prefix.  If ldi_getmsg() fails, nothing is written.
+ */
+
+#define	LX_KMSG_PRI	"<0>"
+
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh)
+{
+	mblk_t		*mp;
+	timestruc_t	to;
+	timestruc_t	*tp = NULL;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_KMSG);
+
+	if (lxpr_uiobuf_nonblock(uiobuf)) {
+		to.tv_sec = 0;
+		to.tv_nsec = 1000000; /* 1msec */
+		tp = &to;
+	}
+
+	if (ldi_getmsg(lh, &mp, tp) == 0) {
+		/*
+		 * lx procfs doesn't like successive reads to the same file
+		 * descriptor unless we do an explicit rewind each time.
+		 */
+		lxpr_uiobuf_seek(uiobuf, 0);
+
+		lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+		    mp->b_cont->b_rptr);
+
+		freemsg(mp);
+	}
+}
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file.  We do just
+ * enough for uptime and other simple lxproc readers to work.
+ *
+ * The output is a single line holding the 1, 5 and 15 minute load averages
+ * (two decimal places each), "<runnable>/<lwps>", and a final field which is
+ * always reported as 0.
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ulong_t avenrun1;
+	ulong_t avenrun5;
+	ulong_t avenrun15;
+	ulong_t avenrun1_cs;
+	ulong_t avenrun5_cs;
+	ulong_t avenrun15_cs;
+	int loadavg[3];
+	int *loadbuf;
+	cpupart_t *cp;
+	zone_t *zone = LXPTOZ(lxpnp);
+
+	uint_t nrunnable = 0;
+	rctl_qty_t nlwps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * Need to add up values over all CPU partitions. If pools are active,
+	 * only report the values of the zone's partition, which by definition
+	 * includes the current CPU.
+	 */
+	if (pool_pset_enabled()) {
+		psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+		ASSERT(curproc->p_zone != &zone0);
+		cp = CPU->cpu_part;
+
+		nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+		(void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+		loadbuf = &loadavg[0];
+	} else {
+		cp = cp_list_head;
+		do {
+			nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+		} while ((cp = cp->cp_next) != cp_list_head);
+
+		loadbuf = zone == global_zone ?
+		    &avenrun[0] : zone->zone_avenrun;
+	}
+
+	/*
+	 * If we're in the non-global zone, we'll report the total number of
+	 * LWPs in the zone for the "nproc" parameter of /proc/loadavg,
+	 * otherwise will just use nthread (which will include kernel threads,
+	 * but should be good enough for lxproc).
+	 */
+	nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+	mutex_exit(&cpu_lock);
+
+	/*
+	 * The load values are fixed point with FSHIFT fractional bits: the
+	 * integer part is the value shifted down, and the two-digit fraction
+	 * is the low bits scaled by 100.
+	 */
+	avenrun1 = loadbuf[0] >> FSHIFT;
+	avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+	avenrun5 = loadbuf[1] >> FSHIFT;
+	avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+	avenrun15 = loadbuf[2] >> FSHIFT;
+	avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
+
+	/*
+	 * Every conversion below matches the type of its argument: the
+	 * averages are ulong_t, nrunnable is uint_t and nlwps is widened
+	 * explicitly so that it can be printed with %llu.
+	 */
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu.%02lu %lu.%02lu %lu.%02lu %u/%llu %d\n",
+	    avenrun1, avenrun1_cs,
+	    avenrun5, avenrun5_cs,
+	    avenrun15, avenrun15_cs,
+	    nrunnable, (u_longlong_t)nlwps, 0);
+}
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ *
+ * For the global zone, or a zone whose physical memory cap is UINT64_MAX,
+ * the system-wide physmem/freemem values are reported.  Otherwise the zone's
+ * cap is reported as the total and (cap - usage) as free, floored at zero.
+ * Swap is handled the same way using the zone's swap cap.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	int global = zone == global_zone;
+	long total_mem, free_mem, total_swap;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+
+	if (global || zone->zone_phys_mem_ctl == UINT64_MAX) {
+		total_mem = physmem * PAGESIZE;
+		free_mem = freemem * PAGESIZE;
+	} else {
+		/*
+		 * Snapshot both values once so that total and free are
+		 * derived from the same cap.  A zone may exceed its cap, in
+		 * which case the difference would go negative and be printed
+		 * as an enormous unsigned value; report zero free instead.
+		 */
+		uint64_t cap = zone->zone_phys_mem_ctl;
+		uint64_t used = zone->zone_phys_mem;
+
+		total_mem = cap;
+		free_mem = (used < cap) ? (cap - used) : 0;
+	}
+
+	if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
+		total_swap = k_anoninfo.ani_max * PAGESIZE;
+	} else {
+		mutex_enter(&zone->zone_mem_lock);
+		total_swap = zone->zone_max_swap_ctl;
+		mutex_exit(&zone->zone_mem_lock);
+	}
+
+	/*
+	 * SwapFree
+	 * On illumos we reserve swap up front, whereas on Linux they just
+	 * wing it and kill a random process if they run out of backing store
+	 * for virtual memory. Our swap reservation doesn't translate to that
+	 * model, so just inform the caller that no swap is being used.
+	 */
+	lxpr_uiobuf_printf(uiobuf,
+	    "MemTotal:  %8lu kB\n"
+	    "MemFree:   %8lu kB\n"
+	    "MemShared: %8u kB\n"
+	    "Buffers:   %8u kB\n"
+	    "Cached:    %8u kB\n"
+	    "SwapCached:%8u kB\n"
+	    "Active:    %8u kB\n"
+	    "Inactive:  %8u kB\n"
+	    "HighTotal: %8u kB\n"
+	    "HighFree:  %8u kB\n"
+	    "LowTotal:  %8u kB\n"
+	    "LowFree:   %8u kB\n"
+	    "SwapTotal: %8lu kB\n"
+	    "SwapFree:  %8lu kB\n",
+	    btok(total_mem),				/* MemTotal */
+	    btok(free_mem),				/* MemFree */
+	    0,						/* MemShared */
+	    0,						/* Buffers */
+	    0,						/* Cached */
+	    0,						/* SwapCached */
+	    0,						/* Active */
+	    0,						/* Inactive */
+	    0,						/* HighTotal */
+	    0,						/* HighFree */
+	    btok(total_mem),				/* LowTotal */
+	    btok(free_mem),				/* LowFree */
+	    btok(total_swap),				/* SwapTotal */
+	    btok(total_swap));				/* SwapFree */
+}
+
+/*
+ * lxpr_read_mounts(): emit one "<resource> <mntpt> <fstype> <rw|ro> 0 0" line
+ * for each mount entry returned by lxpr_enumerate_mounts() that survives the
+ * filtering below.  Every entry is consumed and freed, as is the list itself.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	list_t *mlist = lxpr_enumerate_mounts(zone);
+	lxpr_mount_entry_t *ent;
+
+	/*
+	 * The entries were extracted up front, so we can walk them here
+	 * without holding vfs_list_read_lock().
+	 */
+	for (ent = list_remove_head(mlist); ent != NULL;
+	    ent = list_remove_head(mlist)) {
+		char *mntpt = (char *)refstr_value(ent->lme_mntpt);
+		char *resource = (char *)refstr_value(ent->lme_resource);
+		char *fstype;
+		vnode_t *vp;
+		boolean_t is_root;
+
+		/* Entries without a mount point are not reported. */
+		if (mntpt == NULL || mntpt[0] == '\0')
+			goto release;
+
+		/*
+		 * The mount point must resolve, and must be the root of a
+		 * filesystem unless the entry is being forced.
+		 */
+		mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+		if (lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0)
+			goto release;
+		is_root = ((vp->v_flag & VROOT) != 0) ? B_TRUE : B_FALSE;
+		VN_RELE(vp);
+		if (!is_root && !ent->lme_force)
+			goto release;
+
+		/*
+		 * An absent resource is shown as "none"; a path resource is
+		 * translated if visible in the zone, else replaced by the
+		 * mount point.
+		 */
+		if (resource == NULL || resource[0] == '\0') {
+			resource = "none";
+		} else if (resource[0] == '/') {
+			resource = ZONE_PATH_VISIBLE(resource, zone) ?
+			    ZONE_PATH_TRANSLATE(resource, zone) : mntpt;
+		}
+
+		/*  Make things look more like Linux. */
+		fstype = vfssw[ent->lme_fstype].vsw_name;
+		if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 &&
+		    !ent->lme_force) {
+			goto release;
+		}
+
+		lxpr_uiobuf_printf(uiobuf, "%s %s %s %s 0 0\n",
+		    resource, mntpt, fstype,
+		    ((ent->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro");
+
+release:
+		refstr_rele(ent->lme_mntpt);
+		refstr_rele(ent->lme_resource);
+		kmem_free(ent, sizeof (lxpr_mount_entry_t));
+	}
+
+	list_destroy(mlist);
+	kmem_free(mlist, sizeof (list_t));
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * /proc/partitions has shrunk over the years to just four columns: major
+ * number, minor number, number of blocks (always reported as 0 here) and
+ * partition name.
+ *
+ * It is provided because some consumers use it to make sense of
+ * /proc/diskstats, and because "fdisk -l" and a few other tools look here to
+ * find all disks on the system.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_zone_data_t *zdata;
+	lx_virt_disk_t *disk;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS);
+
+	/* The header is emitted even when there is nothing to list. */
+	lxpr_uiobuf_printf(uiobuf, "major minor  #blocks  name\n\n");
+
+	if ((zdata = ztolxzd(curproc->p_zone)) == NULL)
+		return;
+	ASSERT(zdata->lxzd_vdisks != NULL);
+
+	/* One line per virtual disk known to the zone. */
+	for (disk = list_head(zdata->lxzd_vdisks); disk != NULL;
+	    disk = list_next(zdata->lxzd_vdisks, disk)) {
+		lxpr_uiobuf_printf(uiobuf, "%4d  %7d %10d %s\n",
+		    getmajor(disk->lxvd_emul_dev),
+		    getminor(disk->lxvd_emul_dev),
+		    0, disk->lxvd_name);
+	}
+}
+
+/*
+ * lxpr_read_devices(): emit a fixed "devices" listing.
+ *
+ * There aren't many actual devices inside a zone but we want to provide the
+ * major numbers for the pseudo devices that do exist, including our pts/ptm
+ * device, as well as the zvol virtual disk device. We simply hardcode the
+ * emulated major numbers that are used elsewhere in the code and that match
+ * the expected Linux major numbers. See lx devfs where some of the major
+ * numbers have no defined constants.
+ *
+ * The output is a "Character devices:" section (/dev/tty, /dev/console and
+ * /dev/ptmx under LX_TTY_MAJOR, then ptm and pts) followed by a
+ * "Block devices:" section containing only zvol.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_devices(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_DEVICES);
+
+	lxpr_uiobuf_printf(uiobuf, "Character devices:\n");
+	lxpr_uiobuf_printf(uiobuf, "%3d /dev/tty\n", LX_TTY_MAJOR);
+	lxpr_uiobuf_printf(uiobuf, "%3d /dev/console\n", LX_TTY_MAJOR);
+	lxpr_uiobuf_printf(uiobuf, "%3d /dev/ptmx\n", LX_TTY_MAJOR);
+	lxpr_uiobuf_printf(uiobuf, "%3d ptm\n", LX_PTM_MAJOR);
+	lxpr_uiobuf_printf(uiobuf, "%3d pts\n", LX_PTS_MAJOR_MIN);
+
+	lxpr_uiobuf_printf(uiobuf, "\nBlock devices:\n");
+	lxpr_uiobuf_printf(uiobuf, "%3d zvol\n", LX_MAJOR_DISK);
+}
+
+/*
+ * lxpr_read_diskstats():
+ *
+ * Emit one line per virtual disk of the zone owning this lxproc mount.  See
+ * the block comment above the per-device output-generating line for the
+ * details of the format.  Nothing is emitted if the zone has no lx brand data
+ * or if its zone_vfs kstat cannot be read.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	lx_zone_data_t *lxzd;
+	kstat_t kn;
+	int num;
+	zone_vfs_kstat_t *kip;
+	size_t size;
+	lx_virt_disk_t *vd;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS);
+
+	lxzd = ztolxzd(zone);
+	if (lxzd == NULL)
+		return;
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	/*
+	 * Use the zone_vfs kstat, which is a superset of a kstat_io_t, since
+	 * it tracks IO at the zone level.  Both the name and the instance
+	 * are taken from the zone that owns this lxproc mount, so that the
+	 * lookup key is self-consistent even when the reader is running in a
+	 * different zone.
+	 */
+	(void) strlcpy(kn.ks_module, "zone_vfs", sizeof (kn.ks_module));
+	(void) strlcpy(kn.ks_name, zone->zone_name, sizeof (kn.ks_name));
+	kn.ks_instance = zone->zone_id;
+
+	kip = (zone_vfs_kstat_t *)lxpr_kstat_read(&kn, B_TRUE, &size, &num);
+	if (kip == NULL)
+		return;
+
+	/*
+	 * The fields read below are those of a zone_vfs_kstat_t, so that is
+	 * the minimum amount of data we must have been handed.
+	 */
+	if (size < sizeof (zone_vfs_kstat_t)) {
+		kmem_free(kip, size);
+		return;
+	}
+
+	/*
+	 * Because the zone vfs stats are tracked at the zone level we use
+	 * the same kstat for the zone's virtual disk (the zpool) and any
+	 * zvols that might also be visible within the zone.
+	 */
+	vd = list_head(lxzd->lxzd_vdisks);
+	while (vd != NULL) {
+		/*
+		 * /proc/diskstats is defined to have one line of output for
+		 * each block device, with each line containing the following
+		 * 14 fields:
+		 *
+		 *	1 - major number
+		 *	2 - minor number
+		 *	3 - device name
+		 *	4 - reads completed successfully
+		 *	5 - reads merged
+		 *	6 - sectors read
+		 *	7 - time spent reading (ms)
+		 *	8 - writes completed
+		 *	9 - writes merged
+		 *	10 - sectors written
+		 *	11 - time spent writing (ms)
+		 *	12 - I/Os currently in progress
+		 *	13 - time spent doing I/Os (ms)
+		 *	14 - weighted time spent doing I/Os (ms)
+		 *
+		 * One small hiccup:  we don't actually keep track of time
+		 * spent reading vs. time spent writing -- we keep track of
+		 * time waiting vs. time actually performing I/O.  While we
+		 * could divide the total time by the I/O mix (making the
+		 * obviously wrong assumption that I/O operations all take the
+		 * same amount of time), this has the undesirable side-effect
+		 * of moving backwards.  Instead, we report the total time
+		 * (read + write) for all three stats (read, write, total).
+		 * This is also a lie of sorts, but it should be more
+		 * immediately clear to the user that reads and writes are
+		 * each being double-counted as the other.
+		 *
+		 * Since certain consumers interpret the major/minor numbers to
+		 * infer device names, some translation is required to avoid
+		 * output which results in totally unexpected results.
+		 */
+
+		lxpr_uiobuf_printf(uiobuf, "%4d %7d %s ",
+		    getmajor(vd->lxvd_emul_dev),
+		    getminor(vd->lxvd_emul_dev),
+		    vd->lxvd_name);
+
+		if (vd->lxvd_type == LXVD_ZFS_DS) {
+			/*
+			 * Use the zone-wide vfs stats for any zfs datasets
+			 * represented via virtual devices.
+			 */
+#define	KV(N)	kip->zv_ ## N.value.ui64
+#define	NS_PER_MS	(uint64_t)(NANOSEC / MILLISEC)
+			lxpr_uiobuf_printf(uiobuf,
+			    "%llu %llu %llu %llu "
+			    "%llu %llu %llu %llu "
+			    "%llu %llu %llu\n",
+			    (uint64_t)KV(reads), 0LL,
+			    KV(nread) / (uint64_t)LXPR_SECTOR_SIZE,
+			    (KV(rtime) + KV(wtime)) / NS_PER_MS,
+			    (uint64_t)KV(writes), 0LL,
+			    KV(nwritten) / (uint64_t)LXPR_SECTOR_SIZE,
+			    (KV(rtime) + KV(wtime)) / NS_PER_MS,
+			    (uint64_t)(KV(rcnt) + KV(wcnt)),
+			    (KV(rtime) + KV(wtime)) / NS_PER_MS,
+			    (KV(rlentime) + KV(wlentime)) / NS_PER_MS);
+#undef	KV
+#undef	NS_PER_MS
+		} else {
+			/*
+			 * Report nearly-zeroed statistics for other devices.
+			 *
+			 * Since iostat will ignore devices which report no
+			 * successful reads or writes, a single read of one
+			 * sector, taking 1ms, is reported.
+			 */
+			lxpr_uiobuf_printf(uiobuf,
+			    "1 0 1 1 0 0 0 0 0 0 0\n");
+		}
+
+		vd = list_next(lxzd->lxzd_vdisks, vd);
+	}
+
+	kmem_free(kip, size);
+}
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file.
+ *
+ * The line has the form
+ *	<sysname> version <release> (<compiler> version <x.y.z>) <version>
+ * where release and version come from the zone's lx brand data unless the
+ * calling process has a per-process uname override.  If the zone has no lx
+ * brand data, the zone-level strings are left empty.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	char release[LX_KERN_RELEASE_MAX];
+	char version[LX_KERN_VERSION_MAX];
+
+	release[0] = '\0';
+	version[0] = '\0';
+
+	/*
+	 * Other readers in this file tolerate a zone without lx brand data;
+	 * do the same here rather than dereferencing a NULL pointer.
+	 */
+	if (lxzd != NULL) {
+		mutex_enter(&lxzd->lxzd_lock);
+		(void) strlcpy(release, lxzd->lxzd_kernel_release,
+		    sizeof (release));
+		(void) strlcpy(version, lxzd->lxzd_kernel_version,
+		    sizeof (version));
+		mutex_exit(&lxzd->lxzd_lock);
+	}
+
+	/* Use per-process overrides, if specified */
+	if (lxpd != NULL && lxpd->l_uname_release[0] != '\0') {
+		(void) strlcpy(release, lxpd->l_uname_release,
+		    sizeof (release));
+	}
+	if (lxpd != NULL && lxpd->l_uname_version[0] != '\0') {
+		(void) strlcpy(version, lxpd->l_uname_version,
+		    sizeof (version));
+	}
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%s version %s (%s version %d.%d.%d) %s\n",
+	    LX_UNAME_SYSNAME, release,
+#if defined(__GNUC__)
+	    "gcc", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__,
+#else
+	    "cc", 1, 0, 0,
+#endif
+	    version);
+}
+
+/*
+ * lxpr_read_vmstat(): read the contents of the "vmstat" file.
+ *
+ * Sums the page-in/out and swap-in/out counters over the CPUs reachable from
+ * the current CPU's partition list (following the per-partition links when
+ * pools are enabled, the global CPU links otherwise) and reports them.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_vmstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+
+	ulong_t pgpgin_cum    = 0;
+	ulong_t pgpgout_cum   = 0;
+	ulong_t pgswapout_cum = 0;
+	ulong_t pgswapin_cum  = 0;
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+	/* Calculate cumulative stats */
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/* Only count CPUs which are present and active. */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+		pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+		pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+		pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+		/*
+		 * The advance to the next CPU lives in the loop condition so
+		 * that the "continue" above also moves on; otherwise a
+		 * skipped CPU would either end the walk early or spin
+		 * forever with cpu_lock held.
+		 */
+	} while ((cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next) !=
+	    cpstart);
+	mutex_exit(&cpu_lock);
+
+	/*
+	 * Needless to say, the metrics presented by vmstat are very specific
+	 * to the internals of the Linux kernel.  There is little per-zone
+	 * information which can be translated in a meaningful way to fit the
+	 * expected fields.  For the time being, the output is kept sparse.
+	 */
+	lxpr_uiobuf_printf(uiobuf,
+	    "pgpgin %lu\n"
+	    "pgpgout %lu\n"
+	    "pswpin %lu\n"
+	    "pswpout %lu\n",
+	    pgpgin_cum,
+	    pgpgout_cum,
+	    pgswapin_cum,
+	    pgswapout_cum);
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ * Emits an aggregate "cpu" line, one "cpuN" line per CPU, and then the page,
+ * swap, intr, ctxt, btime and processes lines.  When the emulated kernel
+ * release compares greater than "2.4" the cpu lines carry three extra
+ * columns and procs_running/procs_blocked are appended.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	ulong_t idle_cum = 0;
+	ulong_t sys_cum  = 0;
+	ulong_t user_cum = 0;
+	ulong_t irq_cum = 0;
+	ulong_t cpu_nrunnable_cum = 0;
+	ulong_t w_io_cum = 0;
+
+	ulong_t pgpgin_cum    = 0;
+	ulong_t pgpgout_cum   = 0;
+	ulong_t pgswapout_cum = 0;
+	ulong_t pgswapin_cum  = 0;
+	ulong_t intr_cum = 0;
+	ulong_t pswitch_cum = 0;
+	ulong_t forks_cum = 0;
+	hrtime_t msnsecs[NCMSTATES];
+	/* is the emulated release > 2.4 */
+	boolean_t newer_than24 = lx_kern_release_cmp(LXPTOZ(lxpnp), "2.4") > 0;
+	/* temporary variable since scalehrtime modifies data in place */
+	hrtime_t tmptime;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	/* Calculate cumulative stats */
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		int i;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		get_cpu_mstate(cp, msnsecs);
+
+		idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+		sys_cum  += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+		user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+		pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+		pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+		pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+		pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+
+		if (newer_than24) {
+			cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+			w_io_cum += CPU_STATS(cp, sys.iowait);
+			for (i = 0; i < NCMSTATES; i++) {
+				tmptime = cp->cpu_intracct[i];
+				scalehrtime(&tmptime);
+				irq_cum += NSEC_TO_TICK(tmptime);
+			}
+		}
+
+		for (i = 0; i < PIL_MAX; i++)
+			intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+		pswitch_cum += CPU_STATS(cp, sys.pswitch);
+		forks_cum += CPU_STATS(cp, sys.sysfork);
+		forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+		/*
+		 * The advance to the next CPU lives in the loop condition so
+		 * that the "continue" above also moves on; otherwise a
+		 * skipped CPU would either end the walk early or spin
+		 * forever with cpu_lock held.
+		 */
+	} while ((cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next) !=
+	    cpstart);
+
+	if (newer_than24) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "cpu %lu %lu %lu %lu %lu %lu %lu\n",
+		    user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L);
+	} else {
+		lxpr_uiobuf_printf(uiobuf,
+		    "cpu %lu %lu %lu %lu\n",
+		    user_cum, 0L, sys_cum, idle_cum);
+	}
+
+	/*
+	 * Do per processor stats.  The walk above ended with cp back at
+	 * cpstart, so this loop covers the same CPUs again.
+	 */
+	do {
+		int i;
+
+		ulong_t idle_ticks;
+		ulong_t sys_ticks;
+		ulong_t user_ticks;
+		ulong_t irq_ticks = 0;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		get_cpu_mstate(cp, msnsecs);
+
+		idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+		sys_ticks  = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+		user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+		for (i = 0; i < NCMSTATES; i++) {
+			tmptime = cp->cpu_intracct[i];
+			scalehrtime(&tmptime);
+			irq_ticks += NSEC_TO_TICK(tmptime);
+		}
+
+		if (newer_than24) {
+			lxpr_uiobuf_printf(uiobuf,
+			    "cpu%d %lu %lu %lu %lu %lu %lu %lu\n",
+			    cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks,
+			    0L, irq_ticks, 0L);
+		} else {
+			lxpr_uiobuf_printf(uiobuf,
+			    "cpu%d %lu %lu %lu %lu\n",
+			    cp->cpu_id,
+			    user_ticks, 0L, sys_ticks, idle_ticks);
+		}
+
+		/* As above: advance in the condition so "continue" is safe. */
+	} while ((cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next) !=
+	    cpstart);
+
+	mutex_exit(&cpu_lock);
+
+	if (newer_than24) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "page %lu %lu\n"
+		    "swap %lu %lu\n"
+		    "intr %lu\n"
+		    "ctxt %lu\n"
+		    "btime %lu\n"
+		    "processes %lu\n"
+		    "procs_running %lu\n"
+		    "procs_blocked %lu\n",
+		    pgpgin_cum, pgpgout_cum,
+		    pgswapin_cum, pgswapout_cum,
+		    intr_cum,
+		    pswitch_cum,
+		    boot_time,
+		    forks_cum,
+		    cpu_nrunnable_cum,
+		    w_io_cum);
+	} else {
+		lxpr_uiobuf_printf(uiobuf,
+		    "page %lu %lu\n"
+		    "swap %lu %lu\n"
+		    "intr %lu\n"
+		    "ctxt %lu\n"
+		    "btime %lu\n"
+		    "processes %lu\n",
+		    pgpgin_cum, pgpgout_cum,
+		    pgswapin_cum, pgswapout_cum,
+		    intr_cum,
+		    pswitch_cum,
+		    boot_time,
+		    forks_cum);
+	}
+}
+
+/*
+ * lxpr_read_swaps():
+ *
+ * Swap files and partitions are not supported, but some programs look here
+ * simply to confirm that the system has swap.  We therefore present the
+ * entire swap cap as a single swap partition with nothing in use; see
+ * lxpr_read_meminfo for why used swap is reported as 0.
+ *
+ * The formatting must stay identical to the Linux implementation so that
+ * consumers do not break. See swap_show() in mm/swapfile.c.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zp = curzone;
+	uint64_t swap_total_kb;
+	uint64_t swap_used_kb = 0;
+
+	/* Sizes are reported in units of 1 kb (2^10). */
+	if (zp != global_zone && zp->zone_max_swap_ctl != UINT64_MAX) {
+		/* A capped non-global zone reports its own swap cap. */
+		mutex_enter(&zp->zone_mem_lock);
+		swap_total_kb = zp->zone_max_swap_ctl >> 10;
+		mutex_exit(&zp->zone_mem_lock);
+	} else {
+		swap_total_kb = (k_anoninfo.ani_max * PAGESIZE) >> 10;
+	}
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	lxpr_uiobuf_printf(uiobuf, "%-40s%s\t%llu\t%llu\t%d\n",
+	    "/dev/swap", "partition", swap_total_kb, swap_used_kb, -1);
+}
+
+/*
+ * inotify tunables exported via /proc.
+ */
+extern int inotify_maxevents;
+extern int inotify_maxinstances;
+extern int inotify_maxwatches;
+/*
+ * Emit the current value of the inotify_maxevents tunable as a decimal
+ * number followed by a newline.
+ */
+static void
+lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents);
+}
+/*
+ * Emit the current value of the inotify_maxinstances tunable as a decimal
+ * number followed by a newline.
+ */
+static void
+lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances);
+}
+/*
+ * Emit the current value of the inotify_maxwatches tunable as a decimal
+ * number followed by a newline.
+ */
+static void
+lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches);
+}
+/*
+ * Emit the constant LX_CAP_MAX_VALID as a decimal number followed by a
+ * newline.  NOTE(review): presumably the highest supported Linux capability
+ * number (cap_last_cap) -- confirm against the LX_CAP_MAX_VALID definition.
+ */
+static void
+lxpr_read_sys_kernel_caplcap(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_CAPLCAP);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", LX_CAP_MAX_VALID);
+}
+
+/*
+ * Emit the zone's default core file path, translated by
+ * lxpr_core_path_s2l().  An empty line is emitted instead when per-process
+ * core paths are not enabled, when no default path is set, or when the
+ * translation fails.
+ */
+static void
+lxpr_read_sys_kernel_corepatt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	struct core_globals *cg;
+	char tr[MAXPATHLEN];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT);
+
+	cg = zone_getspecific(core_zone_key, curproc->p_zone);
+	ASSERT(cg != NULL);
+
+	/* If core dumps are disabled, fall through to the empty string. */
+	if ((cg->core_options & CC_PROCESS_PATH) != 0) {
+		corectl_path_t *ccp = cg->core_default_path;
+		refstr_t *rp;
+
+		/* Take our own hold on the path string while locked. */
+		mutex_enter(&ccp->ccp_mtx);
+		rp = ccp->ccp_path;
+		if (rp != NULL)
+			refstr_hold(rp);
+		mutex_exit(&ccp->ccp_mtx);
+
+		if (rp != NULL) {
+			bzero(tr, sizeof (tr));
+			if (lxpr_core_path_s2l(refstr_value(rp), tr,
+			    sizeof (tr)) == 0) {
+				refstr_rele(rp);
+				lxpr_uiobuf_printf(uiobuf, "%s\n", tr);
+				return;
+			}
+			refstr_rele(rp);
+		}
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "\n");
+}
+/*
+ * Emit the string returned by uts_nodename() followed by a newline.
+ */
+static void
+lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME);
+	lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename());
+}
+
+/*
+ * Emit the enforced value of the rc_zone_msgmni resource control for the
+ * calling process's zone, truncated to an unsigned int.
+ */
+static void
+lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	rctl_qty_t limit;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI);
+
+	/* The rctl lookup is performed with p_lock held. */
+	mutex_enter(&pp->p_lock);
+	limit = rctl_enforced_value(rc_zone_msgmni,
+	    pp->p_zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)limit);
+}
+/*
+ * Emit the current value of ngroups_max as a decimal number followed by a
+ * newline.
+ */
+static void
+lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max);
+}
+
+/*
+ * Emit the emulated kernel release ("osrelease") for an lx-branded zone, or
+ * an empty line for any other zone.
+ *
+ * This must be the release string, i.e. the same value that
+ * lxpr_read_version() places after "version", and not the kernel version
+ * string.
+ */
+static void
+lxpr_read_sys_kernel_osrel(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_zone_data_t *br_data;
+	char release[LX_KERN_RELEASE_MAX];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_OSREL);
+	br_data = ztolxzd(curproc->p_zone);
+	if (curproc->p_zone->zone_brand == &lx_brand) {
+		/* Copy out under the lock; print after dropping it. */
+		mutex_enter(&br_data->lxzd_lock);
+		(void) strlcpy(release, br_data->lxzd_kernel_release,
+		    sizeof (release));
+		mutex_exit(&br_data->lxzd_lock);
+
+		lxpr_uiobuf_printf(uiobuf, "%s\n", release);
+	} else {
+		lxpr_uiobuf_printf(uiobuf, "\n");
+	}
+}
+/*
+ * Emit the current value of maxpid as a decimal number followed by a
+ * newline.
+ */
+static void
+lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid);
+}
+
+/*
+ * Emit the zone's boot ID, generating it on first use.  The ID is five
+ * hyphen-separated groups of 8, 4, 4, 4 and 12 hex digits and is cached in
+ * the zone's lx brand data.  Zones of any other brand report "0".
+ */
+static void
+lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/*
+	 * This file isn't documented on the Linux proc(5) man page but
+	 * according to the blog of the author of systemd/journald (the
+	 * consumer), he says:
+	 *    boot_id: A random ID that is regenerated on each boot. As such it
+	 *    can be used to identify the local machine's current boot. It's
+	 *    universally available on any recent Linux kernel. It's a good and
+	 *    safe choice if you need to identify a specific boot on a specific
+	 *    booted kernel.
+	 *
+	 * We'll just generate a random ID if necessary. On Linux the format
+	 * appears to resemble a uuid but since it is not documented to be a
+	 * uuid, we don't worry about that.
+	 */
+	lx_zone_data_t *br_data;
+	char bootid[LX_BOOTID_LEN];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_BOOTID);
+
+	if (curproc->p_zone->zone_brand != &lx_brand) {
+		lxpr_uiobuf_printf(uiobuf, "0\n");
+		return;
+	}
+
+	br_data = ztolxzd(curproc->p_zone);
+	mutex_enter(&br_data->lxzd_lock);
+	if (br_data->lxzd_bootid[0] == '\0') {
+		int i;
+
+		for (i = 0; i < 5; i++) {
+			/*
+			 * Start from a known value so that a failure of
+			 * random_get_bytes() cannot leak stack contents.
+			 */
+			u_longlong_t n = 0;
+			char s[32];
+
+			(void) random_get_bytes((uint8_t *)&n, sizeof (n));
+
+			/*
+			 * Format with the group's minimum width and then cut
+			 * the string to exactly that many digits.
+			 */
+			switch (i) {
+			case 0:	(void) snprintf(s, sizeof (s), "%08llx", n);
+				s[8] = '\0';
+				break;
+			case 4:	(void) snprintf(s, sizeof (s), "%012llx", n);
+				s[12] = '\0';
+				break;
+			default: (void) snprintf(s, sizeof (s), "%04llx", n);
+				s[4] = '\0';
+				break;
+			}
+			if (i > 0)
+				(void) strlcat(br_data->lxzd_bootid, "-",
+				    sizeof (br_data->lxzd_bootid));
+			(void) strlcat(br_data->lxzd_bootid, s,
+			    sizeof (br_data->lxzd_bootid));
+		}
+	}
+	(void) strlcpy(bootid, br_data->lxzd_bootid, sizeof (bootid));
+	mutex_exit(&br_data->lxzd_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%s\n", bootid);
+}
+
+/*
+ * Emit the four semaphore limits derived from the calling process's resource
+ * controls.  semmns has no resource control of its own, so it is computed as
+ * semmsl * semmni, saturating at ULLONG_MAX if the product does not fit.
+ */
+static void
+lxpr_read_sys_kernel_sem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	rctl_qty_t vmsl, vopm, vmni, vmns;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SEM);
+
+	mutex_enter(&pp->p_lock);
+	vmsl = rctl_enforced_value(rc_process_semmsl, pp->p_rctls, pp);
+	vopm = rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
+	vmni = rctl_enforced_value(rc_zone_semmni, pp->p_zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	/*
+	 * Detect overflow before multiplying.  Comparing the wrapped product
+	 * against its operands is not reliable: it misses some overflows and
+	 * wrongly saturates when one operand is zero.
+	 */
+	if (vmni != 0 && vmsl > ULLONG_MAX / vmni) {
+		vmns = ULLONG_MAX;
+	} else {
+		vmns = vmsl * vmni;
+	}
+
+	/*
+	 * Format: semmsl semmns semopm semmni
+	 *  - semmsl: Limit semaphores in a semaphore set.
+	 *  - semmns: Limit semaphores in all semaphore sets
+	 *  - semopm: Limit operations in a single semop call
+	 *  - semmni: Limit number of semaphore sets
+	 */
+	lxpr_uiobuf_printf(uiobuf, "%llu\t%llu\t%llu\t%llu\n",
+	    (u_longlong_t)vmsl, (u_longlong_t)vmns,
+	    (u_longlong_t)vopm, (u_longlong_t)vmni);
+}
+
+/*
+ * Emit the enforced value of the rc_zone_shmmax resource control for the
+ * calling process's zone, converted from bytes to pages and truncated to an
+ * unsigned int.
+ */
+static void
+lxpr_read_sys_kernel_shmall(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	rctl_qty_t limit;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMALL);
+
+	mutex_enter(&pp->p_lock);
+	limit = rctl_enforced_value(rc_zone_shmmax,
+	    pp->p_zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	/* value is in pages */
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)btop(limit));
+}
+
+/*
+ * Emit the enforced value of the rc_zone_shmmax resource control for the
+ * calling process's zone, capped at FOURGB and printed as an unsigned int.
+ */
+static void
+lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	rctl_qty_t limit;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX);
+
+	mutex_enter(&pp->p_lock);
+	limit = rctl_enforced_value(rc_zone_shmmax,
+	    pp->p_zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	/* Clamp before narrowing to the printed type. */
+	if (limit > FOURGB) {
+		limit = FOURGB;
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)limit);
+}
+
+/*
+ * Emit the enforced value of the rc_zone_shmmni resource control for the
+ * calling process's zone, capped at FOURGB and printed as an unsigned int.
+ */
+static void
+lxpr_read_sys_kernel_shmmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	rctl_qty_t limit;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMNI);
+
+	mutex_enter(&pp->p_lock);
+	limit = rctl_enforced_value(rc_zone_shmmni,
+	    pp->p_zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	/* Clamp before narrowing to the printed type. */
+	if (limit > FOURGB) {
+		limit = FOURGB;
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)limit);
+}
+
+/*
+ * Emit the LWP limit (zone_nlwps_ctl) of the calling process's zone.
+ *
+ * The value is narrowed explicitly and printed with a matching conversion,
+ * as the other resource-control readers in this file do, rather than
+ * handing a wider integer to "%d".
+ */
+static void
+lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t limit = curproc->p_zone->zone_nlwps_ctl;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)limit);
+}
+
+/*
+ * Emit the current netstack's tcps_conn_req_max_q value, or SOMAXCONN if no
+ * netstack can be obtained.
+ */
+static void
+lxpr_read_sys_net_core_somaxc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q);
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_printf(uiobuf, "%d\n", SOMAXCONN);
+	}
+}
+
+/*
+ * ip_local_port_range
+ *
+ * The low and high bounds of the local port range, tab separated.
+ * integers; default: 32768 61000
+ *
+ * illumos: tcp_smallest_anon_port & tcp_largest_anon_port
+ * Not in tcp(7p) man page.
+ *
+ * The read fails with ENXIO if the current netstack cannot be obtained.
+ */
+static void
+lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\t%d\n",
+		    tcps->tcps_smallest_anon_port,
+		    tcps->tcps_largest_anon_port);
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+	}
+}
+
+/*
+ * tcp_fin_timeout
+ *
+ * How many seconds to wait for a final FIN packet before the socket is
+ * forcibly closed. This is strictly a violation of the TCP specification,
+ * but required to prevent denial-of-service attacks.
+ * integer; default: 60;
+ *
+ * illumos: tcp_fin_wait_2_flush_interval
+ * Not in tcp(7p) man page but see comment in uts/common/inet/tcp/tcp_input.c
+ * in the tcp_input_data() function on the use of tcp_fin_wait_2_flush_interval.
+ * The value is in milliseconds, so it is divided by 1000 here.
+ *
+ * The read fails with ENXIO if the current netstack cannot be obtained.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\n",
+		    tcps->tcps_fin_wait_2_flush_interval / 1000);
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+	}
+}
+
+/*
+ * tcp_keepalive_intvl
+ *
+ * The number of seconds between TCP keep-alive probes. default: 75
+ * Linux retries tcp_keepalive_probes (9) times before timing out.
+ *
+ * illumos:
+ * We have tcp_ka_rinterval but there is no corresponding tcps_* tunable for
+ * it. The closest is tcps_keepalive_abort_interval, the time threshold in
+ * milliseconds for aborting a TCP connection. Linux retries 9 times (giving
+ * a total of 11.25 minutes), so we emulate this by converting
+ * tcps_keepalive_abort_interval to seconds and dividing it by 9.
+ *
+ * The read fails with ENXIO if the current netstack cannot be obtained.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\n",
+		    (tcps->tcps_keepalive_abort_interval / 1000) / 9);
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+	}
+}
+
+/*
+ * tcp_keepalive_time
+ *
+ * The number of seconds a connection needs to be idle before TCP begins
+ * sending out keep-alive probes. The default value is 7200 seconds (2 hours).
+ *
+ * illumos: tcp_keepalive_interval
+ * The interval for sending out the first probe, in milliseconds (converted
+ * to seconds here). The default is two hours.
+ *
+ * The read fails with ENXIO if the current netstack cannot be obtained.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\n",
+		    (tcps->tcps_keepalive_interval / 1000));
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+	}
+}
+
+/*
+ * tcp_sack
+ *
+ * Enable RFC 2018 TCP Selective Acknowledgements. Boolean, default: enabled
+ *
+ * illumos: tcp_sack_permitted
+ * tcp_sack_permitted 0 == disabled, 1 == no initiate but accept,
+ * 2 == initiate and accept. default is 2.  Any non-zero setting is reported
+ * as 1.
+ *
+ * The read fails with ENXIO if the current netstack cannot be obtained.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK);
+
+	if ((stack = netstack_get_current()) != NULL) {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		lxpr_uiobuf_printf(uiobuf, "%d\n",
+		    (tcps->tcps_sack_permitted  == 0 ? 0 : 1));
+		netstack_rele(stack);
+	} else {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+	}
+}
+
+/*
+ * tcp_window_scaling
+ *
+ * RFC 1323 TCP window scaling. This feature allows the use of a large window
+ * (> 64K) on a TCP connection. Boolean; default: enabled
+ *
+ * illumos: tcp_wscale_always
+ * If tcp_wscale_always is set to 1, the window scale option will always be
+ * set when connecting to a remote system. If tcp_wscale_always is 0, the
+ * window scale option will be set only if the user has requested a send or
+ * receive window larger than 64K. The default value is 1.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *stack;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE);
+
+	if ((stack = netstack_get_current()) == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = stack->netstack_tcp;
+	/* Reported as stored; no unit or range conversion is applied. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_wscale_always);
+	netstack_rele(stack);
+}
+
+static void
+lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MAX_MAP_CNT);
+	/* Mappings are not limited here, so report a fixed, large limit. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n", 16777215);	/* 2^24 - 1 */
+}
+
+static void
+lxpr_read_sys_vm_minfr_kb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MINFR_KB);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", 0);	/* fixed value: always 0 */
+}
+
+static void
+lxpr_read_sys_vm_nhpages(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_NHUGEP);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", 0);	/* fixed value: always 0 */
+}
+
+static void
+lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_OVERCOMMIT_MEM);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", 0);	/* fixed value: always 0 */
+}
+
+static void
+lxpr_read_sys_vm_swappiness(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_SWAPPINESS);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", 0);	/* fixed value: always 0 */
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * format is: "%lu.%02lu %lu.%02lu\n", uptime_secs, idle_secs
+ * Use fixed point arithmetic to get 2 decimal places
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	ulong_t idle_cum = 0;
+	ulong_t cpu_count = 0;
+	ulong_t idle_s;
+	ulong_t idle_cs;
+	ulong_t up_s;
+	ulong_t up_cs;
+	hrtime_t birthtime;
+	hrtime_t centi_sec = 10000000;  /* 10^7 */
+
+	ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+	/* Calculate cumulative stats */
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/*
+		 * Don't count CPUs that aren't even in the system or aren't
+		 * up yet.  The skip must not be a "continue": in a do/while
+		 * that jumps to the loop test without advancing cp, ending
+		 * the walk early or spinning forever with cpu_lock held.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) != 0) {
+			idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+			idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+			cpu_count += 1;
+		}
+
+		if (pools_enabled)
+			cp = cp->cpu_next_part;
+		else
+			cp = cp->cpu_next;
+	} while (cp != cpstart);
+	mutex_exit(&cpu_lock);
+
+	/* Getting the Zone zsched process startup time */
+	birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+	up_cs = (gethrtime() - birthtime) / centi_sec;
+	up_s = up_cs / 100;
+	up_cs %= 100;
+
+	ASSERT(cpu_count > 0);
+	idle_cum /= cpu_count;
+	idle_s = idle_cum / hz;
+	idle_cs = idle_cum % hz;
+	idle_cs *= 100;
+	idle_cs /= hz;
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu.%02lu %lu.%02lu\n", up_s, up_cs, idle_s, idle_cs);
+}
+
+static const char *amd_x_edx[] = {	/* AMD, leaf 0x80000001 %edx bits */
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"syscall",
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"mp",
+	"nx",	NULL,	"mmxext", NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	"lm",	"3dnowext", "3dnow"
+};
+
+static const char *amd_x_ecx[] = {	/* AMD, leaf 0x80000001 %ecx bits */
+	"lahf_lm", NULL, "svm", NULL,
+	"altmovcr8"
+};
+
+static const char *tm_x_edx[] = {	/* X86_VENDOR_TM, extended %edx bits */
+	"recovery", "longrun", NULL, "lrti"
+};
+
+/*
+ * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx".
+ */
+static const char *intc_x_edx[] = {	/* Intel, leaf 0x80000001 %edx bits */
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"syscall",
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	"nx",	NULL,	NULL,   NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	"lm",   NULL,   NULL
+};
+
+static const char *intc_edx[] = {	/* leaf 1 %edx bits, any vendor */
+	"fpu",	"vme",	"de",	"pse",
+	"tsc",	"msr",	"pae",	"mce",
+	"cx8",	"apic",	 NULL,	"sep",
+	"mtrr",	"pge",	"mca",	"cmov",
+	"pat",	"pse36", "pn",	"clflush",
+	NULL,	"dts",	"acpi",	"mmx",
+	"fxsr",	"sse",	"sse2",	"ss",
+	"ht",	"tm",	"ia64",	"pbe"
+};
+
+/*
+ * Linux reports "sse3" as "pni" (Prescott New Instructions).
+ */
+static const char *intc_ecx[] = {	/* leaf 1 %ecx bits, any vendor */
+	"pni",	NULL,	NULL, "monitor",
+	"ds_cpl", NULL,	NULL, "est",
+	"tm2",	NULL,	"cid", NULL,
+	NULL,	"cx16",	"xtpr"
+};
+
+/*
+ * Report a list of each cgroup subsystem supported by our emulated cgroup fs.
+ * This needs to exist for systemd to run but for now we don't report any
+ * cgroup subsystems as being installed. The commented example below shows
+ * how to print a subsystem entry.
+ */
+static void
+lxpr_read_cgroups(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{	/* emits the column header line only; no subsystem rows follow */
+	lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
+	    "#subsys_name", "hierarchy", "num_cgroups", "enabled");
+
+	/*
+	 * lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
+	 *   "cpu,cpuacct", "2", "1", "1");
+	 */
+}
+
+/*
+ * lxpr_read_cpuinfo(): emit one Linux-style "cpuinfo" stanza per CPU.
+ */
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	int i;
+	uint32_t bits;
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	const char **fp;
+	char brandstr[CPU_IDSTRLEN];
+	struct cpuid_regs cpr;
+	int maxeax;
+	int std_ecx, std_edx, ext_ecx, ext_edx;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/* Leaf 0 returns the maximum standard cpuid leaf in eax. */
+		cpr.cp_eax = 0;
+		(void) cpuid_insn(cp, &cpr);
+		maxeax = cpr.cp_eax;
+
+		/* Get standard x86 feature flags. */
+		cpr.cp_eax = 1;
+		(void) cpuid_insn(cp, &cpr);
+		std_ecx = cpr.cp_ecx;
+		std_edx = cpr.cp_edx;
+
+		/* Now get extended feature flags. */
+		cpr.cp_eax = 0x80000001;
+		(void) cpuid_insn(cp, &cpr);
+		ext_ecx = cpr.cp_ecx;
+		ext_edx = cpr.cp_edx;
+
+		(void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+		lxpr_uiobuf_printf(uiobuf,
+		    "processor\t: %d\n"
+		    "vendor_id\t: %s\n"
+		    "cpu family\t: %d\n"
+		    "model\t\t: %d\n"
+		    "model name\t: %s\n"
+		    "stepping\t: %d\n"
+		    "cpu MHz\t\t: %u.%03u\n",
+		    cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+		    cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+		    (uint32_t)(cpu_freq_hz / 1000000),
+		    ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+		lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+		    getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+		if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+			/*
+			 * 'siblings' is used for HT-style threads
+			 */
+			lxpr_uiobuf_printf(uiobuf,
+			    "physical id\t: %lu\n"
+			    "siblings\t: %u\n",
+			    pg_plat_hw_instance_id(cp, PGHW_CHIP),
+			    cpuid_get_ncpu_per_chip(cp));
+		}
+
+		/*
+		 * Since we're relatively picky about running on older hardware,
+		 * we can be somewhat cavalier about the answers to these ones.
+		 *
+		 * In fact, given the hardware we support, we just say:
+		 *
+		 *	fdiv_bug	: no	(if we're on a 64-bit kernel)
+		 *	hlt_bug		: no
+		 *	f00f_bug	: no
+		 *	coma_bug	: no
+		 *	wp		: yes	(write protect in supervsr mode)
+		 */
+		lxpr_uiobuf_printf(uiobuf,
+		    "fdiv_bug\t: %s\n"
+		    "hlt_bug \t: no\n"
+		    "f00f_bug\t: no\n"
+		    "coma_bug\t: no\n"
+		    "fpu\t\t: %s\n"
+		    "fpu_exception\t: %s\n"
+		    "cpuid level\t: %d\n"
+		    "flags\t\t:",
+#if defined(__i386)
+		    fpu_pentium_fdivbug ? "yes" : "no",
+#else
+		    "no",
+#endif /* __i386 */
+		    fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+		    maxeax);
+
+		/*
+		 * Bit i of each register maps to entry i of the matching name
+		 * table.  Use 1U so that testing bit 31 is well defined.
+		 */
+		for (bits = std_edx, fp = intc_edx, i = 0;
+		    i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++)
+			if ((bits & (1U << i)) != 0 && *fp)
+				lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+		/*
+		 * name additional features where appropriate
+		 */
+		switch (x86_vendor) {
+		case X86_VENDOR_Intel:
+			for (bits = ext_edx, fp = intc_x_edx, i = 0;
+			    i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1U << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+
+		case X86_VENDOR_AMD:
+			for (bits = ext_edx, fp = amd_x_edx, i = 0;
+			    i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1U << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+			for (bits = ext_ecx, fp = amd_x_ecx, i = 0;
+			    i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]);
+			    fp++, i++)
+				if ((bits & (1U << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+
+		case X86_VENDOR_TM:
+			for (bits = ext_edx, fp = tm_x_edx, i = 0;
+			    i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1U << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+		default:
+			break;
+		}
+
+		for (bits = std_ecx, fp = intc_ecx, i = 0;
+		    i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++)
+			if ((bits & (1U << i)) != 0 && *fp)
+				lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+		lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+		if (pools_enabled)
+			cp = cp->cpu_next_part;
+		else
+			cp = cp->cpu_next;
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+	lxpr_uiobuf_seterr(uiobuf, EFAULT);	/* no data: always an error */
+}
+
+/*
+ * Report a list of file systems loaded in the kernel. We only report the ones
+ * which we support and which may be checked by various components to see if
+ * they are loaded.
+ */
+static void
+lxpr_read_filesystems(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	static const char *fstypes[] =
+	    { "autofs", "cgroup", "nfs", "proc", "sysfs", "tmpfs" };
+	int i;
+
+	for (i = 0; i < sizeof (fstypes) / sizeof (fstypes[0]); i++)
+		lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", fstypes[i]);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	register lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	extern uint_t nproc;
+	int error;
+
+	/*
+	 * With ATTR_REAL, report the attributes of the underlying vnode
+	 * (when there is one), but keep the lx /proc mode and a zero size
+	 * for fd files; see the LXPR_PID_FD_FD adjustment below.
+	 */
+	if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+		vnode_t *rvp = lxpnp->lxpr_realvp;
+
+		/*
+		 * withhold the attributes unless the access check passes
+		 */
+		if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+			return (error);
+		}
+
+		/*
+		 * now fetch the real vnode's attributes
+		 */
+		if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+			return (error);
+		}
+
+		/*
+		 * if it's a file in lx /proc/pid/fd/xx then set its
+		 * mode and keep it looking like a symlink, fifo or socket
+		 */
+		if (type == LXPR_PID_FD_FD) {
+			vap->va_mode = lxpnp->lxpr_mode;
+			vap->va_type = lxpnp->lxpr_realvp->v_type;
+			vap->va_size = 0;
+			vap->va_nlink = 1;
+		}
+		return (0);
+	}
+
+	/* Default attributes, that may be overridden below */
+	bzero(vap, sizeof (*vap));
+	vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+	vap->va_nlink = 1;
+	vap->va_type = vp->v_type;
+	vap->va_mode = lxpnp->lxpr_mode;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	vap->va_blksize = DEV_BSIZE;
+	vap->va_uid = lxpnp->lxpr_uid;
+	vap->va_gid = lxpnp->lxpr_gid;
+	vap->va_nodeid = lxpnp->lxpr_ino;
+
+	switch (type) {
+	case LXPR_PROCDIR:
+		vap->va_nlink = nproc + 2 + PROCDIRFILES;
+		vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+		break;
+	case LXPR_PIDDIR:
+		vap->va_nlink = PIDDIRFILES;
+		vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+		break;
+	case LXPR_PID_TASK_IDDIR:
+		vap->va_nlink = TIDDIRFILES;
+		vap->va_size = TIDDIRFILES * LXPR_SDSIZE;
+		break;
+	case LXPR_SELF:
+		vap->va_uid = crgetruid(curproc->p_cred);
+		vap->va_gid = crgetrgid(curproc->p_cred);
+		break;
+	case LXPR_PID_FD_FD:
+	case LXPR_PID_TID_FD_FD:
+		/*
+		 * Restore VLNK type for lstat-type activity (see
+		 * lxpr_readlink), then fall through to the default case.
+		 */
+		if ((flags & FOLLOW) == 0)
+			vap->va_type = VLNK;
+	default:
+		break;
+	}
+
+	vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+	return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	int shift = 0;
+	proc_t *tp;
+
+	/* Writes are refused except on the node types marked writable */
+	if ((mode & VWRITE) && !lxpr_is_writable(type)) {
+		return (EROFS);
+	}
+
+	/*
+	 * If this is a restricted file, check access to the owning process.
+	 */
+	switch (type) {
+	case LXPR_PIDDIR:
+		return (0);	/* no further checks for a pid directory */
+	case LXPR_PID_CURDIR:
+	case LXPR_PID_ENV:
+	case LXPR_PID_EXE:
+	case LXPR_PID_LIMITS:
+	case LXPR_PID_MAPS:
+	case LXPR_PID_MEM:
+	case LXPR_PID_ROOTDIR:
+	case LXPR_PID_FDDIR:
+	case LXPR_PID_FD_FD:
+	case LXPR_PID_TID_FDDIR:
+	case LXPR_PID_TID_FD_FD:
+		if ((tp = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) == NULL)
+			return (ENOENT);
+		if (tp != curproc && secpolicy_proc_access(cr) != 0 &&
+		    priv_proc_cred_perm(cr, tp, NULL, mode) != 0) {
+			lxpr_unlock(tp);
+			return (EACCES);
+		}
+		lxpr_unlock(tp);	/* FALLTHROUGH to the generic checks */
+	default:
+		break;
+	}
+
+	if (lxpnp->lxpr_realvp != NULL) {
+		/*
+		 * For these we use the underlying vnode's accessibility.
+		 */
+		return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+	}
+
+	/* If user is root allow access regardless of permission bits */
+	if (secpolicy_proc_access(cr) == 0)
+		return (0);
+
+	/*
+	 * Access check is based on only one of owner, group, public.  If not
+	 * owner, then check group.  If not a member of the group, then check
+	 * public access.
+	 */
+	if (crgetuid(cr) != lxpnp->lxpr_uid) {
+		shift += 3;
+		if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+			shift += 3;
+	}
+
+	mode &= ~(lxpnp->lxpr_mode << shift);	/* clear the granted bits */
+
+	if (mode == 0)
+		return (0);
+
+	return (EACCES);
+}
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+	return (NULL);	/* lxpr_lookup() turns a NULL result into ENOENT */
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+	lxpr_node_t *dnode = VTOLXP(dp);
+	lxpr_nodetype_t dtype = dnode->lxpr_type;
+	vnode_t *found;
+	int err;
+
+	ASSERT(dp->v_type == VDIR);
+	ASSERT(dtype < LXPR_NFILES);
+
+	/*
+	 * Nodes backed by a real vnode have their lookups performed on that
+	 * vnode, so they must never be seen here.
+	 */
+	ASSERT(dtype != LXPR_PID_FD_FD &&
+	    dtype != LXPR_PID_CURDIR &&
+	    dtype != LXPR_PID_ROOTDIR);
+
+	/* Searching a directory requires execute permission on it. */
+	err = lxpr_access(dp, VEXEC, 0, cr, ct);
+	if (err != 0)
+		return (err);
+
+	/* ".." resolves to the parent recorded in the node. */
+	if (strcmp(comp, "..") == 0) {
+		found = dnode->lxpr_parent;
+		VN_HOLD(found);
+		*vpp = found;
+		return (0);
+	}
+
+	/* An empty component or "." names the directory being searched. */
+	if (dp->v_type == VDIR &&
+	    (comp[0] == '\0' || strcmp(comp, ".") == 0)) {
+		VN_HOLD(dp);
+		*vpp = dp;
+		return (0);
+	}
+
+	/* Everything else is resolved by the per-type lookup routine. */
+	found = lxpr_lookup_function[dtype](dp, comp);
+	*vpp = found;
+	if (found == NULL)
+		return (ENOENT);
+	return (0);
+}
+
+/*
+ * Do a sequential search on the given directory table
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+    lxpr_dirent_t *dirtab, int dirtablen)
+{
+	int idx;
+
+	for (idx = 0; idx < dirtablen; idx++) {
+		lxpr_node_t *node;
+		if (strcmp(dirtab[idx].d_name, comp) != 0)
+			continue;
+		/* First match wins: build a node of the entry's type. */
+		node = lxpr_getnode(dp, dirtab[idx].d_type, p, 0);
+		ASSERT(LXPTOV(node) != NULL);
+		return (LXPTOV(node));
+	}
+	return (NULL);
+}
+
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+	lxpr_node_t *dnode = VTOLXP(dp);
+	vnode_t *found;
+	proc_t *proc;
+
+	ASSERT(dnode->lxpr_type == LXPR_PIDDIR);
+
+	/* The process stays locked while the child node is created. */
+	if ((proc = lxpr_lock(dnode->lxpr_pid, ZOMB_OK)) == NULL)
+		return (NULL);
+
+	found = lxpr_lookup_common(dp, comp, proc, piddir, PIDDIRFILES);
+	lxpr_unlock(proc);
+	return (found);
+}
+
+/*
+ * Lookup one of the process's task ID's.
+ */
+static vnode_t *
+lxpr_lookup_taskdir(vnode_t *dp, char *comp)
+{
+	lxpr_node_t *dlxpnp = VTOLXP(dp);
+	lxpr_node_t *lxpnp;
+	proc_t *p;
+	pid_t real_pid;
+	uint_t tid;
+	int c;
+	kthread_t *t;
+
+	ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR);
+
+	/*
+	 * convert the string rendition of the filename to a thread ID
+	 */
+	tid = 0;
+	while ((c = *comp++) != '\0') {
+		uint_t otid;
+		if (c < '0' || c > '9')
+			return (NULL);
+
+		otid = tid;
+		tid = 10 * tid + c - '0';
+		/* unsigned wrap-around: the name is too large to be a tid */
+		if (tid / 10 != otid)
+			return (NULL);
+	}
+
+	/*
+	 * get the proc to work with and lock it
+	 */
+	real_pid = get_real_pid(dlxpnp->lxpr_pid);
+	p = lxpr_lock(real_pid, NO_ZOMB);
+	if (p == NULL)
+		return (NULL);
+
+	/*
+	 * Bail if this is a system process.
+	 */
+	if ((p->p_flag & SSYS) || (p->p_as == &kas)) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+
+	if (p->p_brand == &lx_brand) {
+		t = lxpr_get_thread(p, tid);
+	} else {
+		/*
+		 * Only the main thread is visible for non-branded processes.
+		 */
+		t = p->p_tlist;
+		if (tid != p->p_pid || t == NULL) {
+			t = NULL;
+		} else {
+			thread_lock(t);
+		}
+	}
+	if (t == NULL) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+	thread_unlock(t);	/* only the thread's existence was needed */
+
+	/*
+	 * Allocate and fill in a new lx /proc taskid node.
+	 * Instead of the last arg being a fd, it is a tid.
+	 */
+	lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid);
+	dp = LXPTOV(lxpnp);
+	ASSERT(dp != NULL);
+	lxpr_unlock(p);
+	return (dp);
+}
+
+/*
+ * Lookup a named entry (from tiddir[]) inside one task ID directory.
+ */
+static vnode_t *
+lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp)
+{
+	lxpr_node_t *dlxpnp = VTOLXP(dp);
+	lxpr_node_t *lxpnp;
+	proc_t *p;
+	pid_t real_pid;
+	kthread_t *t;
+	int i;
+
+	ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR);
+
+	/*
+	 * get the proc to work with and lock it
+	 */
+	real_pid = get_real_pid(dlxpnp->lxpr_pid);
+	p = lxpr_lock(real_pid, NO_ZOMB);
+	if ((p == NULL))
+		return (NULL);
+
+	/*
+	 * Bail if this is a system process.
+	 */
+	if ((p->p_flag & SSYS) || (p->p_as == &kas)) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+
+	/* need to confirm tid (kept in the directory's lxpr_desc) is there */
+	t = lxpr_get_thread(p, dlxpnp->lxpr_desc);
+	if (t == NULL) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+	thread_unlock(t);
+
+	/*
+	 * allocate and fill in the new node, passing the tid along to it
+	 */
+	for (i = 0; i < TIDDIRFILES; i++) {
+		if (strcmp(tiddir[i].d_name, comp) == 0) {
+			lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p,
+			    dlxpnp->lxpr_desc);
+			dp = LXPTOV(lxpnp);
+			ASSERT(dp != NULL);
+			lxpr_unlock(p);
+			return (dp);
+		}
+	}
+
+	lxpr_unlock(p);	/* no tiddir entry has that name */
+	return (NULL);
+}
+
+/*
+ * Lookup one of the open files in a per-process or per-task fd directory.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+	lxpr_node_t *dlxpnp = VTOLXP(dp);
+
+	ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR ||
+	    dlxpnp->lxpr_type == LXPR_PID_TID_FDDIR);
+
+	return (lxpr_lookup_fdnode(dp, comp));	/* does all of the work */
+}
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+	vnode_t *found;
+
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+	found = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+	return (found);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+	/*
+	 * Every name in this directory is fixed except for the pid names,
+	 * which come and go with the processes.  A leading digit therefore
+	 * means a pid lookup.  ("self" is a symlink and needs no check.)
+	 */
+	if (*comp >= '0' && *comp <= '9') {
+		uint_t upid = 0;
+		lxpr_node_t *lxpnp = NULL;
+		proc_t *p;
+		int c;
+
+		/*
+		 * Convert the name to a pid.  Reject names with a non-digit
+		 * ("1x" must not alias some other pid) and values that wrap
+		 * or do not fit in a pid_t.
+		 */
+		while ((c = *comp++) != '\0') {
+			uint_t opid = upid;
+
+			if (c < '0' || c > '9')
+				return (NULL);
+			upid = 10 * upid + c - '0';
+			if (upid / 10 != opid || (pid_t)upid < 0)
+				return (NULL);
+		}
+
+		/* No lockable process with that pid: nothing to look up. */
+		p = lxpr_lock((pid_t)upid, ZOMB_OK);
+		if (p == NULL)
+			return (NULL);
+		if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			lxpr_unlock(p);
+			return (NULL);
+		}
+
+		/* allocate and fill in a new lx /proc node */
+		lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0);
+		lxpr_unlock(p);
+		dp = LXPTOV(lxpnp);
+		ASSERT(dp != NULL);
+		return (dp);
+	}
+
+	/* Lookup fixed names */
+	return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES));
+}
+
+static vnode_t *	/* node for comp from the sysdir table, else NULL */
+lxpr_lookup_sysdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_kerneldir[], else NULL */
+lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir,
+	    SYS_KERNELDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_randdir[], else NULL */
+lxpr_lookup_sys_kdir_randdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNEL_RANDDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_randdir,
+	    SYS_RANDDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_netdir[], else NULL */
+lxpr_lookup_sys_netdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NETDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_netdir,
+	    SYS_NETDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_net_coredir[], else NULL */
+lxpr_lookup_sys_net_coredir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_COREDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_net_coredir,
+	    SYS_NET_COREDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_net_ipv4dir[], else NULL */
+lxpr_lookup_sys_net_ipv4dir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_IPV4DIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_net_ipv4dir,
+	    SYS_NET_IPV4DIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_vmdir[], else NULL */
+lxpr_lookup_sys_vmdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_VMDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_vmdir,
+	    SYS_VMDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_fsdir[], else NULL */
+lxpr_lookup_sys_fsdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir,
+	    SYS_FSDIRFILES));
+}
+
+static vnode_t *	/* node for comp from sys_fs_inotifydir[], else NULL */
+lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR);
+	return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir,
+	    SYS_FS_INOTIFYDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	lxpr_node_t *dnode = VTOLXP(dp);
+	lxpr_nodetype_t dtype = dnode->lxpr_type;
+	off_t off = uiop->uio_offset;
+	ssize_t resid = uiop->uio_resid;
+	int err;
+
+	ASSERT(dp->v_type == VDIR);
+	ASSERT(dtype < LXPR_NFILES);
+
+	/*
+	 * Nodes backed by a real vnode have readdir performed on that vnode,
+	 * so they must never be seen here.
+	 */
+	ASSERT(dtype != LXPR_PID_FD_FD &&
+	    dtype != LXPR_PID_CURDIR &&
+	    dtype != LXPR_PID_ROOTDIR);
+
+	/* Listing a directory requires read permission on it. */
+	err = lxpr_access(dp, VREAD, 0, cr, ct);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Reject negative offsets and empty or negative request sizes.
+	 */
+	if (off < 0 || resid <= 0)
+		return (EINVAL);
+
+	/*
+	 * Entries sit at multiples of LXPR_SDSIZE; other offsets name nothing.
+	 */
+	if ((off % LXPR_SDSIZE) != 0)
+		return (ENOENT);
+
+	return (lxpr_readdir_function[dtype](dnode, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	return (ENOTDIR);	/* nothing is read; uiop and eofp are untouched */
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+    lxpr_dirent_t *dirtab, int dirtablen)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* resid on entry, to detect "nothing returned yet" */
+	ssize_t uresid;
+
+	oresid = uiop->uio_resid;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Satisfy user request, one entry per LXPR_SDSIZE slot of offset
+	 */
+	while ((uresid = uiop->uio_resid) > 0) {
+		int dirindex;
+		off_t uoffset;
+		int reclen;
+		int error;
+
+		uoffset = uiop->uio_offset;
+		dirindex  = (uoffset / LXPR_SDSIZE) - 2;	/* skip ".", ".." */
+
+		if (uoffset == 0) {
+
+			dirent->d_ino = lxpnp->lxpr_ino;
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '\0';
+			reclen = DIRENT64_RECLEN(1);
+
+		} else if (uoffset == LXPR_SDSIZE) {
+
+			dirent->d_ino = lxpr_parentinode(lxpnp);
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '.';
+			dirent->d_name[2] = '\0';
+			reclen = DIRENT64_RECLEN(2);
+
+		} else if (dirindex >= 0 && dirindex < dirtablen) {
+			int slen = strlen(dirtab[dirindex].d_name);
+
+			dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+			    lxpnp->lxpr_pid, 0);
+
+			VERIFY(slen < LXPNSIZ);
+			(void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+			reclen = DIRENT64_RECLEN(slen);
+
+		} else {
+			/* Ran out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+		 */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid) {
+				return (EINVAL);
+			}
+			break;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount.  But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user.  So we set
+		 * uiop->uio_offset separately, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			return (error);
+
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+	}
+
+	/* Have run out of space, but could have just done last table entry */
+	if (eofp) {
+		*eofp =
+		    (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+	return (0);
+}
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+	off_t uoffset;
+	zoneid_t zoneid, pzoneid;
+	pid_t pid;
+	int error;
+	int ceof;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+	oresid = uiop->uio_resid;
+	zoneid = LXPTOZ(lxpnp)->zone_id;
+
+	/*
+	 * We return directory entries in the order: "." and ".." then the
+	 * unique lxproc files, then the directories corresponding to the
+	 * running processes.  We have defined this as the ordering because
+	 * it allows us to more easily keep track of where we are between calls
+	 * to getdents().  If the number of processes changes between calls
+	 * then we can't lose track of where we are in the lxproc files.
+	 */
+
+	/* Do the fixed entries */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir,
+	    PROCDIRFILES);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		return (error);
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/* Do the process entries */
+	while ((uresid = uiop->uio_resid) > 0) {
+		proc_t *p;
+		int len;
+		int reclen;
+		int i;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop when entire proc table has been examined.
+		 */
+		i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+		if (i < 0 || i >= v.v_proc) {
+			/* Run out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+		mutex_enter(&pidlock);
+
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, a PID of 0,
+		 * and anything the security policy doesn't allow
+		 * us to look at.
+		 */
+		if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+		    p->p_pid == 0 ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			mutex_exit(&pidlock);
+			goto next;
+		}
+		ASSERT(p->p_stat != 0);
+
+		/*
+		 * Snapshot what is needed while pidlock is still held; p is
+		 * not locked, so it must not be touched once pidlock drops.
+		 */
+		pid = p->p_pid;
+		pzoneid = p->p_zone->zone_id;
+		mutex_exit(&pidlock);
+
+		/*
+		 * If this /proc was mounted in the global zone, view
+		 * all procs; otherwise, only view zone member procs.
+		 */
+		if (zoneid != GLOBAL_ZONEID && pzoneid != zoneid)
+			goto next;
+
+		/*
+		 * Linux convention: the zone's init is pid 1, zsched is pid 0.
+		 */
+		if (pid == curproc->p_zone->zone_proc_initpid)
+			pid = 1;
+		else if (pid == curproc->p_zone->zone_zsched->p_pid)
+			pid = 0;
+
+		dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+		 */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid)
+				return (EINVAL);
+			break;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount.  But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user.  So we set
+		 * uiop->uio_offset separately, in the increment of this for
+		 * the loop, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			return (error);
+next:
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+	}
+
+	if (eofp != NULL) {
+		*eofp = (uiop->uio_offset >=
+		    ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+
+	return (0);
+}
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	pid_t lxpid = lxpnp->lxpr_pid;
+	pid_t target;
+	proc_t *proc;
+	int gone;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+	/*
+	 * Nothing is listed once the process is gone or while it is SIDL.
+	 */
+	mutex_enter(&pidlock);
+	if (lxpid == 1)
+		target = curproc->p_zone->zone_proc_initpid;
+	else if (lxpid == 0)
+		target = curproc->p_zone->zone_zsched->p_pid;
+	else
+		target = lxpid;
+	proc = prfind(target);
+	gone = (proc == NULL || proc->p_stat == SIDL);
+	mutex_exit(&pidlock);
+
+	if (gone)
+		return (ENOENT);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES));
+}
+
+static int	/* lists ".", ".." and the netdir[] entries */
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_PID_TASKDIR nodes.
+ *
+ * The fixed entries are emitted through lxpr_readdir_common() and are
+ * followed by one numerically named entry per thread of the target process.
+ * For a process which is not lx-branded only a single entry, named after the
+ * process id, is produced.  Zombies, exiting processes and system processes
+ * get the fixed entries only.
+ *
+ * Directory offsets advance in units of LXPR_SDSIZE; offset slots 0 and 1
+ * are skipped when mapping an offset to a thread index (see the "- 2"
+ * below).
+ *
+ * Returns 0 on success, ENOENT if the process cannot be locked or is in the
+ * SIDL state, EINVAL if the next entry does not fit and nothing at all has
+ * been returned by this call, or the error from lxpr_readdir_common() or
+ * uiomove().
+ */
+static int
+lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+	off_t uoffset;
+	int error, ceof, tiddirsize, tasknum;
+	proc_t *p;
+	pid_t real_pid;
+	kthread_t *t;
+	boolean_t branded;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR);
+
+	oresid = uiop->uio_resid;
+
+	real_pid = get_real_pid(lxpnp->lxpr_pid);
+	p = lxpr_lock(real_pid, ZOMB_OK);
+	if (p == NULL) {
+		return (ENOENT);
+	}
+	if (p->p_stat == SIDL) {
+		lxpr_unlock(p);
+		return (ENOENT);
+	}
+
+	/*
+	 * Just emit static entries for system processes and zombies.
+	 */
+	if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+	    (p->p_as == &kas)) {
+		lxpr_unlock(p);
+		return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0));
+	}
+
+	/*
+	 * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+	 * going away while we iterate over its threads.
+	 */
+	tiddirsize = p->p_lwpcnt;
+	branded = (p->p_brand == &lx_brand);
+	mutex_exit(&p->p_lock);
+
+	/* Do the fixed entries (in this case just "." & "..") */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		goto out;
+
+	if ((t = p->p_tlist) == NULL) {
+		if (eofp != NULL)
+			*eofp = 1;
+		goto out;
+	}
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Loop until user's request is satisfied or until all threads have
+	 * been returned.  tasknum counts the threads walked so far, so that
+	 * the walk can be advanced to the thread selected by the offset.
+	 */
+	for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) {
+		int i, reclen, len;
+		uint_t emul_tid;
+		lx_lwp_data_t *lwpd;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop at the end of the thread list
+		 */
+		i = (uoffset / LXPR_SDSIZE) - 2;
+		if (i < 0 || i >= tiddirsize) {
+			if (eofp) {
+				*eofp = 1;
+			}
+			goto out;
+		}
+
+		/* not yet at the thread the offset refers to; keep walking */
+		if (i != tasknum)
+			goto next;
+
+		if (!branded) {
+			/*
+			 * Emulating the goofy linux task model is impossible
+			 * to do for native processes.  We can compromise by
+			 * presenting only the main thread to the consumer.
+			 */
+			emul_tid = p->p_pid;
+		} else {
+			if ((lwpd = ttolxlwp(t)) == NULL) {
+				goto next;
+			}
+			emul_tid = lwpd->br_pid;
+			/*
+			 * Convert pid to Linux default of 1 if we're the
+			 * zone's init.
+			 */
+			if (emul_tid == curproc->p_zone->zone_proc_initpid)
+				emul_tid = 1;
+		}
+
+		dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, lxpnp->lxpr_pid,
+		    emul_tid);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid)
+				error = EINVAL;
+			goto out;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount.  But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user.  So we set
+		 * uiop->uio_offset separately, in the increment of this for
+		 * the loop, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			goto out;
+
+next:
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+
+		/*
+		 * Done once the thread list wraps back to its head, or after
+		 * the single entry emitted for an unbranded process.
+		 */
+		if ((t = t->t_forw) == p->p_tlist || !branded) {
+			if (eofp != NULL)
+				*eofp = 1;
+			goto out;
+		}
+	}
+
+	if (eofp != NULL)
+		*eofp = 0;
+
+out:
+	/* p_lock was dropped above; retake it before lxpr_unlock() */
+	mutex_enter(&p->p_lock);
+	lxpr_unlock(p);
+	return (error);
+}
+
+/*
+ * Readdir handler for LXPR_PID_TASK_IDDIR nodes.
+ *
+ * Both the process and the specific thread (identified by lxpr_desc) that
+ * the node refers to must still exist for the directory to be listed;
+ * ENOENT is returned if either has gone away.  The listing itself is
+ * produced by lxpr_readdir_common() from the tiddir table.
+ */
+static int
+lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	proc_t *p;
+	pid_t real_pid;
+	kthread_t *t;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR);
+
+	mutex_enter(&pidlock);
+
+	real_pid = get_real_pid(lxpnp->lxpr_pid);
+	p = prfind(real_pid);
+
+	/* can't read its contents if it died */
+	if (p == NULL || p->p_stat == SIDL) {
+		mutex_exit(&pidlock);
+		return (ENOENT);
+	}
+
+	mutex_exit(&pidlock);
+
+	/* need to confirm tid is still there */
+	t = lxpr_get_thread(p, lxpnp->lxpr_desc);
+	if (t == NULL) {
+		/*
+		 * We can't find this specific thread.  Report that as an
+		 * error rather than returning 0, which the caller would take
+		 * to be a successful read of an empty directory.
+		 */
+		return (ENOENT);
+	}
+	thread_unlock(t);
+
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_PID_FDDIR and LXPR_PID_TID_FDDIR nodes.
+ *
+ * The fixed entries are emitted through lxpr_readdir_common() and are
+ * followed by one entry, named by descriptor number, for every slot in the
+ * process's file list whose uf_file is not NULL.  Exiting, exited and system
+ * processes get the fixed entries only.  fi_lock is held from the point the
+ * table size is sampled until the function is done with the list.
+ *
+ * Returns 0 on success, ENOENT if the process cannot be locked, EINVAL if
+ * the next entry does not fit and nothing at all has been returned by this
+ * call, or the error from lxpr_readdir_common() or uiomove().
+ */
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+	off_t uoffset;
+	int error, ceof, fddirsize;
+	proc_t *p;
+	uf_info_t *fip;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR ||
+	    lxpnp->lxpr_type == LXPR_PID_TID_FDDIR);
+
+	oresid = uiop->uio_resid;
+
+	/* can't read its contents if it died */
+	p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK);
+	if (p == NULL)
+		return (ENOENT);
+
+	/*
+	 * For exiting/exited processes or those belonging to the system, only
+	 * emit the fixed entries.
+	 */
+	if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) ||
+	    (p->p_as == &kas)) {
+		lxpr_unlock(p);
+		return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0));
+	}
+
+	/*
+	 * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+	 * going away while we iterate over its fi_list.
+	 */
+	mutex_exit(&p->p_lock);
+
+	/* Get open file info */
+	fip = (&(p)->p_user.u_finfo);
+	mutex_enter(&fip->fi_lock);
+	fddirsize = fip->fi_nfiles;
+
+	/* Do the fixed entries (in this case just "." & "..") */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		goto out;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Loop until user's request is satisfied or until
+	 * all file descriptors have been examined.  The offset is advanced
+	 * by LXPR_SDSIZE per slot in the loop increment, overriding whatever
+	 * uiomove() did to it.
+	 */
+	for (; (uresid = uiop->uio_resid) > 0;
+	    uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+		int reclen;
+		int fd;
+		int len;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop at the end of the fd list
+		 */
+		fd = (uoffset / LXPR_SDSIZE) - 2;
+		if (fd < 0 || fd >= fddirsize) {
+			if (eofp) {
+				*eofp = 1;
+			}
+			goto out;
+		}
+
+		/* unused slot: no entry, but the offset still advances */
+		if (fip->fi_list[fd].uf_file == NULL)
+			continue;
+
+		dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid)
+				error = EINVAL;
+			goto out;
+		}
+
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			goto out;
+	}
+
+	if (eofp != NULL) {
+		*eofp =
+		    (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+
+out:
+	mutex_exit(&fip->fi_lock);
+	/* p_lock was dropped above; retake it before lxpr_unlock() */
+	mutex_enter(&p->p_lock);
+	lxpr_unlock(p);
+	return (error);
+}
+
+/*
+ * Readdir handler for LXPR_SYSDIR nodes.  All of the work is done by
+ * lxpr_readdir_common(), which is handed the sysdir table and SYSDIRFILES
+ * (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_FSDIR nodes.  All of the work is done by
+ * lxpr_readdir_common(), which is handed the sys_fsdir table and
+ * SYS_FSDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir,
+	    SYS_FSDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_FS_INOTIFYDIR nodes.  All of the work is
+ * done by lxpr_readdir_common(), which is handed the sys_fs_inotifydir table
+ * and SYS_FS_INOTIFYDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fs_inotifydir,
+	    SYS_FS_INOTIFYDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_KERNELDIR nodes.  All of the work is done by
+ * lxpr_readdir_common(), which is handed the sys_kerneldir table and
+ * SYS_KERNELDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir,
+	    SYS_KERNELDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_KERNEL_RANDDIR nodes.  All of the work is
+ * done by lxpr_readdir_common(), which is handed the sys_randdir table and
+ * SYS_RANDDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_kdir_randdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RANDDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_randdir,
+	    SYS_RANDDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_NETDIR nodes.  All of the work is done by
+ * lxpr_readdir_common(), which is handed the sys_netdir table and
+ * SYS_NETDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NETDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_netdir,
+	    SYS_NETDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_NET_COREDIR nodes.  All of the work is done
+ * by lxpr_readdir_common(), which is handed the sys_net_coredir table and
+ * SYS_NET_COREDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_net_coredir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_COREDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_coredir,
+	    SYS_NET_COREDIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_NET_IPV4DIR nodes.  All of the work is done
+ * by lxpr_readdir_common(), which is handed the sys_net_ipv4dir table and
+ * SYS_NET_IPV4DIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4DIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_ipv4dir,
+	    SYS_NET_IPV4DIRFILES));
+}
+
+/*
+ * Readdir handler for LXPR_SYS_VMDIR nodes.  All of the work is done by
+ * lxpr_readdir_common(), which is handed the sys_vmdir table and
+ * SYS_VMDIRFILES (presumably that table's entry count).
+ */
+static int
+lxpr_readdir_sys_vmdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_VMDIR);
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_vmdir,
+	    SYS_VMDIRFILES));
+}
+
+/*
+ * Minimal character-class tests for the parsers below.  Only the ASCII
+ * decimal digits, and only space, tab and newline, are recognized.  Both
+ * are macros which evaluate their argument more than once, so the argument
+ * must not have side effects.
+ */
+#define	isdigit(c)	((c) >= '0' && (c) <= '9')
+#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
+
+/*
+ * Obtain a numeric value from the null-terminated input string.
+ * We don't have strtok in the kernel, so tokenize this ourselves and
+ * validate the input.
+ *
+ * Leading whitespace is skipped and the token must consist solely of decimal
+ * digits followed by whitespace or the end of the string.  On success the
+ * delimiter following the digits is overwritten with '\0' (so the input
+ * buffer is modified), the value is stored through pv, and *ep is set to
+ * where scanning should resume: just past the delimiter, or at the
+ * terminating '\0' when the token ended the string.  Either of pv and ep may
+ * be NULL.
+ *
+ * Returns 0 on success or EINVAL for an empty, malformed or unconvertible
+ * token.  When ddi_strtol() rejects the token the input is restored.
+ */
+static int
+lxpr_tokenize_num(char *str, long *pv, char **ep)
+{
+	char *pstart, *pc, c, *endptr;
+	long v;
+
+	for (pc = str; isspace(*pc); pc++)
+		;
+
+	for (pstart = pc; isdigit(*pc); pc++)
+		;
+	if (pc == pstart || (!isspace(*pc) && *pc != '\0'))
+		return (EINVAL);
+	c = *pc;
+	*pc = '\0';
+
+	if (ddi_strtol(pstart, &endptr, 10, &v) != 0 || *endptr != '\0') {
+		*pc = c;
+		return (EINVAL);
+	}
+
+	if (pv != NULL)
+		*pv = v;
+	if (ep != NULL) {
+		/*
+		 * Only step over a real delimiter.  If the token ran up to
+		 * the string's own terminator, stepping past it would leave
+		 * the caller inspecting memory beyond the end of the string
+		 * (and, for a completely full buffer, beyond the buffer).
+		 */
+		*ep = (c == '\0') ? pc : pc + 1;
+	}
+
+	return (0);
+}
+
+/*
+ * Common code for writable nodes backed by a single TCP module property.
+ *
+ * The written text (at most sizeof (val) - 1 bytes, written at offset 0) has
+ * one trailing newline removed, is optionally rewritten in place by the
+ * xlate callback, and is then handed to the setter of the property named by
+ * prop in the current netstack.  A zero-length write succeeds without doing
+ * anything.  Returns 0 on success, the uiomove() error, or EINVAL for any
+ * other failure.
+ */
+/* ARGSUSED */
+static int
+lxpr_write_tcp_property(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct, char *prop,
+    int (*xlate)(char *, int))
+{
+	char val[16];	/* big enough for a uint numeric string */
+	size_t nbytes;
+	netstack_t *stack;
+	mod_prop_info_t *tbl, *info;
+	int rv;
+
+	if (uio->uio_loffset != 0)
+		return (EINVAL);
+
+	nbytes = uio->uio_resid;
+	if (nbytes == 0)
+		return (0);
+	if (nbytes >= sizeof (val))
+		return (EINVAL);
+
+	bzero(val, sizeof (val));
+	if ((rv = uiomove(val, nbytes, UIO_WRITE, uio)) != 0)
+		return (rv);
+
+	/* drop a single trailing newline, then insist on some input */
+	if (val[nbytes - 1] == '\n')
+		val[nbytes - 1] = '\0';
+	if (val[0] == '\0')
+		return (EINVAL);
+
+	if ((stack = netstack_get_current()) == NULL)
+		return (EINVAL);
+
+	/* everything from here on fails with EINVAL unless the set works */
+	rv = EINVAL;
+	if (xlate == NULL || xlate(val, sizeof (val)) == 0) {
+		tbl = stack->netstack_tcp->tcps_propinfo_tbl;
+		info = mod_prop_lookup(tbl, prop, MOD_PROTO_TCP);
+		if (info != NULL &&
+		    info->mpi_setf(stack, cr, info, NULL, val, 0) == 0)
+			rv = 0;
+	}
+
+	netstack_rele(stack);
+	return (rv);
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_CORE_SOMAXCON nodes: the written value is
+ * passed, untranslated (no xlate callback), to the TCP property
+ * "_conn_req_max_q" by lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_core_somaxc(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+	    "_conn_req_max_q", NULL));
+}
+
+/*
+ * Translation callback: rewrite, in place, the single decimal number of
+ * seconds held in val (a buffer of size bytes) as the equivalent number of
+ * milliseconds.  Returns EINVAL if val is not exactly one number or if the
+ * result does not fit in the buffer, and 0 otherwise.
+ */
+static int
+lxpr_xlate_sec2ms(char *val, int size)
+{
+	char *rest;
+	long secs;
+
+	if (lxpr_tokenize_num(val, &secs, &rest) != 0 || *rest != '\0')
+		return (EINVAL);
+
+	return (snprintf(val, size, "%ld", secs * 1000) >= size ? EINVAL : 0);
+}
+
+/*
+ * Translation callback: rewrite, in place, the single decimal number of
+ * seconds held in val (a buffer of size bytes) as that number of
+ * milliseconds multiplied by 9.  Returns EINVAL if val is not exactly one
+ * number or if the result does not fit in the buffer, and 0 otherwise.
+ */
+static int
+lxpr_xlate_ka_intvl(char *val, int size)
+{
+	char *rest;
+	long secs;
+
+	if (lxpr_tokenize_num(val, &secs, &rest) != 0 || *rest != '\0')
+		return (EINVAL);
+
+	if (snprintf(val, size, "%ld", secs * 1000 * 9) < size)
+		return (0);
+
+	return (EINVAL);
+}
+
+/*
+ * Translation callback: val must hold exactly the number 0 or 1, which is
+ * rewritten in place as 0 or 2 respectively.  Returns EINVAL for any other
+ * input or if the result does not fit in the size byte buffer.
+ */
+static int
+lxpr_xlate_sack(char *val, int size)
+{
+	char *rest;
+	long onoff;
+	int native;
+
+	if (lxpr_tokenize_num(val, &onoff, &rest) != 0 || *rest != '\0')
+		return (EINVAL);
+
+	/* see comment on lxpr_read_sys_net_ipv4_tcp_sack */
+	switch (onoff) {
+	case 0:
+		native = 0;
+		break;
+	case 1:
+		native = 2;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	if (snprintf(val, size, "%d", native) >= size)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * We expect two port numbers on a line as input for the range, and we have to
+ * set two properties on the netstack_tcp, so we can't reuse
+ * lxpr_write_tcp_property.
+ *
+ * The write must start at offset 0 and be shorter than the local buffer; a
+ * zero-length write succeeds without doing anything.  The line must hold
+ * exactly two decimal numbers, low then high, with low <= high <= 65535 and
+ * low no smaller than the stack's tcps_smallest_nonpriv_port.  Both the
+ * "smallest_anon_port" and "largest_anon_port" setters are always invoked;
+ * EINVAL is returned if either of them fails, so a failure may leave only
+ * one of the two properties updated.
+ */
+static int
+lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	int res;
+	size_t olen;
+	char vals[32];	/* big enough for a line w/ 2 16-bit numeric strings */
+	char *ep;
+	long low, high;
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+	mod_prop_info_t *ptbl = NULL;
+	mod_prop_info_t *pinfo = NULL;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE);
+
+	if (uio->uio_loffset != 0)
+		return (EINVAL);
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	olen = uio->uio_resid;
+	if (olen > sizeof (vals) - 1)
+		return (EINVAL);
+
+	bzero(vals, sizeof (vals));
+	res = uiomove(vals, olen, UIO_WRITE, uio);
+	if (res != 0)
+		return (res);
+
+	if (lxpr_tokenize_num(vals, &low, &ep) != 0)
+		return (EINVAL);
+
+	if (lxpr_tokenize_num(ep, &high, &ep) != 0)
+		return (EINVAL);
+
+	/*
+	 * Make sure no other tokens are on the line: only whitespace may
+	 * follow the second number.  Nothing is overwritten while checking,
+	 * so that a stray trailing character cannot slip through.
+	 */
+	for (; isspace(*ep); ep++)
+		;
+	if (*ep != '\0')
+		return (EINVAL);
+
+	if (low > high || high > 65535)
+		return (EINVAL);
+
+	ns = netstack_get_current();
+	if (ns == NULL)
+		return (EINVAL);
+
+	tcps = ns->netstack_tcp;
+	if (low < tcps->tcps_smallest_nonpriv_port) {
+		netstack_rele(ns);
+		return (EINVAL);
+	}
+
+	ptbl = ns->netstack_tcp->tcps_propinfo_tbl;
+
+	/* vals is reused to format each bound for its property setter */
+	(void) snprintf(vals, sizeof (vals), "%ld", low);
+	pinfo = mod_prop_lookup(ptbl, "smallest_anon_port", MOD_PROTO_TCP);
+	if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+		res = EINVAL;
+
+	(void) snprintf(vals, sizeof (vals), "%ld", high);
+	pinfo = mod_prop_lookup(ptbl, "largest_anon_port", MOD_PROTO_TCP);
+	if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0)
+		res = EINVAL;
+
+	netstack_rele(ns);
+	return (res);
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_IPV4_TCP_FIN_TO nodes: the written value is
+ * translated by lxpr_xlate_sec2ms() and applied to the TCP property
+ * "_fin_wait_2_flush_interval" by lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+	    "_fin_wait_2_flush_interval", lxpr_xlate_sec2ms));
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_IPV4_TCP_KA_INT nodes: the written value is
+ * translated by lxpr_xlate_ka_intvl() and applied to the TCP property
+ * "_keepalive_abort_interval" by lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+	    "_keepalive_abort_interval", lxpr_xlate_ka_intvl));
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_IPV4_TCP_KA_TIM nodes: the written value is
+ * translated by lxpr_xlate_sec2ms() and applied to the TCP property
+ * "_keepalive_interval" by lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct,
+	    "_keepalive_interval", lxpr_xlate_sec2ms));
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_IPV4_TCP_SACK nodes: the written value is
+ * translated by lxpr_xlate_sack() and applied to the TCP property "sack" by
+ * lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "sack",
+	    lxpr_xlate_sack));
+}
+
+/*
+ * Write handler for LXPR_SYS_NET_IPV4_TCP_WINSCALE nodes: the written value
+ * is passed, untranslated (no xlate callback), to the TCP property
+ * "_wscale_always" by lxpr_write_tcp_property().
+ */
+static int
+lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE);
+	return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "_wscale_always",
+	    NULL));
+}
+
+/*
+ * Write handler for LXPR_SYS_KERNEL_COREPATT nodes.
+ *
+ * The caller must pass secpolicy_coreadm().  The written text (written at
+ * offset 0 and shorter than MAXPATHLEN) has one trailing newline removed and
+ * is converted by lxpr_core_path_l2s(); the result replaces the default core
+ * path of the current process's zone and CC_PROCESS_PATH is added to the
+ * zone's core options.  Patterns beginning with '|' are rejected.  A
+ * zero-length write succeeds without doing anything.
+ *
+ * Returns 0 on success, EPERM, EINVAL, or the error from uiomove() or
+ * lxpr_core_path_l2s().
+ */
+/* ARGSUSED */
+static int
+lxpr_write_sys_kernel_corepatt(lxpr_node_t *lxpnp, struct uio *uio,
+    struct cred *cr, caller_context_t *ct)
+{
+	zone_t *zone = curproc->p_zone;
+	struct core_globals *cg;
+	refstr_t *rp, *nrp;
+	corectl_path_t *ccp;
+	char val[MAXPATHLEN];
+	char valtr[MAXPATHLEN];
+	size_t olen;
+	int error;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT);
+
+	cg = zone_getspecific(core_zone_key, zone);
+	ASSERT(cg != NULL);
+
+	if (secpolicy_coreadm(cr) != 0)
+		return (EPERM);
+
+	if (uio->uio_loffset != 0)
+		return (EINVAL);
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	olen = uio->uio_resid;
+	if (olen > sizeof (val) - 1)
+		return (EINVAL);
+
+	bzero(val, sizeof (val));
+	error = uiomove(val, olen, UIO_WRITE, uio);
+	if (error != 0)
+		return (error);
+
+	if (val[olen - 1] == '\n')
+		val[olen - 1] = '\0';
+
+	if (val[0] == '|')
+		return (EINVAL);
+
+	if ((error = lxpr_core_path_l2s(val, valtr, sizeof (valtr))) != 0)
+		return (error);
+
+	nrp = refstr_alloc(valtr);
+
+	/*
+	 * refstr_alloc() hands back a string which already carries one
+	 * reference, and that reference is transferred to ccp_path here.
+	 * Taking a further hold would leave a reference nobody ever drops,
+	 * leaking the string once the path is next replaced.
+	 */
+	ccp = cg->core_default_path;
+	mutex_enter(&ccp->ccp_mtx);
+	rp = ccp->ccp_path;
+	ccp->ccp_path = nrp;
+	cg->core_options |= CC_PROCESS_PATH;
+	mutex_exit(&ccp->ccp_mtx);
+
+	/* drop the reference ccp_path held on the path it replaced */
+	if (rp != NULL)
+		refstr_rele(rp);
+
+	return (0);
+}
+
+/*
+ * Write handler for LXPR_PID_LOGINUID nodes.
+ *
+ * The written text (written at offset 0 and at most sizeof (val) - 1 bytes)
+ * must hold exactly one decimal number, which is stored as l_loginuid in the
+ * lx process data of the target process, if it has any.  A zero-length write
+ * succeeds without doing anything.
+ *
+ * Returns 0 on success, EINVAL for malformed input, ENXIO if the process
+ * cannot be locked, or the error from uiomove().
+ */
+/* ARGSUSED */
+static int
+lxpr_write_pid_loginuid(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr,
+    caller_context_t *ct)
+{
+	int error;
+	size_t olen;
+	char val[16];	/* big enough for a uint numeric string */
+	char *ep;
+	long u;
+	proc_t *p;
+	lx_proc_data_t *pd;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID);
+
+	if (uio->uio_loffset != 0)
+		return (EINVAL);
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	olen = uio->uio_resid;
+	if (olen > sizeof (val) - 1)
+		return (EINVAL);
+
+	bzero(val, sizeof (val));
+	error = uiomove(val, olen, UIO_WRITE, uio);
+	if (error != 0)
+		return (error);
+
+	if (lxpr_tokenize_num(val, &u, &ep) != 0)
+		return (EINVAL);
+	if (*ep != '\0')
+		return (EINVAL);
+
+	if ((p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB)) == NULL)
+		return (ENXIO);
+
+	/*
+	 * NOTE(review): u is narrowed to uid_t without a range check, so an
+	 * over-large value is silently truncated -- confirm this is intended.
+	 */
+	if ((pd = ptolxproc(p)) != NULL) {
+		pd->l_loginuid = (uid_t)u;
+	}
+	lxpr_unlock(p);
+
+	return (0);
+}
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK()
+ *
+ * For a node with a realvp the link text is the path of that vnode as
+ * produced by vnodetopath(); should that fail for an LXPR_PID_FD_FD node,
+ * lxpr_readlink_fdnode() is given the chance to generate the text instead.
+ * Without a realvp only LXPR_SELF can be resolved (to the caller's pid as
+ * seen from Linux); the cwd, root and exe links fail with EACCES and
+ * everything else with EINVAL.
+ *
+ * Returns 0 on success or an errno value.
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+	char bp[MAXPATHLEN + 1];
+	size_t buflen = sizeof (bp);
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	vnode_t *rvp = lxpnp->lxpr_realvp;
+	pid_t pid;
+	int error = 0;
+
+	/*
+	 * Linux does something very "clever" for /proc/<pid>/fd/<num> entries.
+	 * Open FDs are represented as symlinks, the link contents
+	 * corresponding to the open resource.  For plain files or devices,
+	 * this isn't absurd since one can dereference the symlink to query
+	 * the underlying resource.  For sockets or pipes, it becomes ugly in a
+	 * hurry.  To maintain this human-readable output, those FD symlinks
+	 * point to bogus targets such as "socket:[<inodenum>]".  This requires
+	 * circumventing vfs since the stat/lstat behavior on those FD entries
+	 * will be unusual. (A stat must retrieve information about the open
+	 * socket or pipe.  It cannot fail because the link contents point to
+	 * an absent file.)
+	 *
+	 * To accomplish this, lxpr_getnode returns a vnode typed VNON for FD
+	 * entries.  This bypasses code paths which would normally
+	 * short-circuit on symlinks and allows us to emulate the vfs behavior
+	 * expected by /proc consumers.
+	 */
+	if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD)
+		return (EINVAL);
+
+	/* Try to produce a symlink name for anything that has a realvp */
+	if (rvp != NULL) {
+		if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0)
+			return (error);
+		if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) {
+			/*
+			 * Special handling possible for /proc/<pid>/fd/<num>
+			 * Generate <type>:[<inode>] links, if allowed.
+			 * If that is not possible either, the vnodetopath()
+			 * error is the one reported.
+			 */
+			if (lxpnp->lxpr_type != LXPR_PID_FD_FD ||
+			    lxpr_readlink_fdnode(lxpnp, bp, buflen) != 0) {
+				return (error);
+			}
+		}
+	} else {
+		switch (lxpnp->lxpr_type) {
+		case LXPR_SELF:
+			/*
+			 * Convert pid to the Linux default of 1 if we're the
+			 * zone's init process or 0 if zsched.
+			 */
+			if (curproc->p_pid ==
+			    curproc->p_zone->zone_proc_initpid) {
+				pid = 1;
+			} else if (curproc->p_pid ==
+			    curproc->p_zone->zone_zsched->p_pid) {
+				pid = 0;
+			} else {
+				pid = curproc->p_pid;
+			}
+
+			/*
+			 * Don't need to check result as every possible int
+			 * will fit within MAXPATHLEN bytes.
+			 */
+			(void) snprintf(bp, buflen, "%d", pid);
+			break;
+		case LXPR_PID_CURDIR:
+		case LXPR_PID_ROOTDIR:
+		case LXPR_PID_EXE:
+			return (EACCES);
+		default:
+			/*
+			 * Need to return error so that nothing thinks
+			 * that the symlink is empty and hence "."
+			 */
+			return (EINVAL);
+		}
+	}
+
+	/* copy the link data (without its terminating NUL) to user space */
+	return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
+
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Nothing references the vnode any longer, so hand the lxpr node behind it
+ * to lxpr_freenode() for disposal.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+
+	lxpr_freenode(lxpnp);
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC()
+ * Takes no arguments and unconditionally reports success.
+ */
+static int
+lxpr_sync()
+{
+	/*
+	 * Nothing to sync but this function must never fail
+	 */
+	return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP()
+ * Each vnode is first resolved through any chain of lxpr nodes which have a
+ * realvp.  If either result is still an lxpr vnode the two are compared by
+ * identity; otherwise the comparison is delegated to VOP_CMP().
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+	vnode_t *under;
+
+	for (;;) {
+		if (!vn_matchops(vp1, lxpr_vnodeops))
+			break;
+		if ((under = VTOLXP(vp1)->lxpr_realvp) == NULL)
+			break;
+		vp1 = under;
+	}
+
+	for (;;) {
+		if (!vn_matchops(vp2, lxpr_vnodeops))
+			break;
+		if ((under = VTOLXP(vp2)->lxpr_realvp) == NULL)
+			break;
+		vp2 = under;
+	}
+
+	if (!vn_matchops(vp1, lxpr_vnodeops) &&
+	    !vn_matchops(vp2, lxpr_vnodeops))
+		return (VOP_CMP(vp1, vp2, ct));
+
+	return (vp1 == vp2);
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP()
+ * Yields the node's realvp, itself resolved one step further through
+ * VOP_REALVP() when that succeeds, or the vnode passed in if the node has no
+ * realvp.  Always returns 0.
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+	vnode_t *under = VTOLXP(vp)->lxpr_realvp;
+	vnode_t *deeper;
+
+	if (under == NULL) {
+		*vpp = vp;
+		return (0);
+	}
+
+	deeper = under;
+	if (VOP_REALVP(under, &deeper, ct) == 0)
+		under = deeper;
+
+	*vpp = under;
+	return (0);
+}
+
+/*
+ * Write entry point.  The node's type is looked up in wr_tab (terminated by
+ * an LXPR_INVALID entry); if the first matching entry supplies a handler the
+ * write is passed to it.  Any other write is silently discarded while
+ * reporting that all of it was consumed.
+ */
+static int
+lxpr_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t	*lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t	type = lxpnp->lxpr_type;
+	int idx = 0;
+
+	/* stop at the first entry for this type, or at the terminator */
+	while (wr_tab[idx].wft_type != LXPR_INVALID &&
+	    wr_tab[idx].wft_type != type)
+		idx++;
+
+	if (wr_tab[idx].wft_type != LXPR_INVALID &&
+	    wr_tab[idx].wft_wrf != NULL)
+		return (wr_tab[idx].wft_wrf(lxpnp, uiop, cr, ct));
+
+	/* pretend we wrote the whole thing */
+	uiop->uio_offset += uiop->uio_resid;
+	uiop->uio_resid = 0;
+	return (0);
+}
+
+/*
+ * Needed for writable files which are first "truncated".  Only F_FREESP is
+ * accepted, and it does nothing beyond checking for write access.
+ */
+static int
+lxpr_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+    cred_t *cred, caller_context_t *ct)
+{
+	if (cmd != F_FREESP)
+		return (EINVAL);
+
+	return (lxpr_access(vp, VWRITE, 0, cred, ct));
+}
+
+/*
+ * Needed for writable files which are first "truncated".  The only request
+ * supported is one whose mask is exactly AT_SIZE, and it does nothing beyond
+ * checking for write access.
+ */
+static int
+lxpr_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	if (vap->va_mask != AT_SIZE)
+		return (EINVAL);
+
+	return (lxpr_access(vp, VWRITE, 0, cr, ct));
+}
+
+/*
+ * We need to allow open with O_CREAT for the writable files.
+ *
+ * Nothing is ever created.  If nm names an existing node in dvp whose type
+ * lxpr_is_writable() accepts, and the caller passes the access check for
+ * mode, that node's held vnode is returned through vpp.  Otherwise an error
+ * is returned: EEXIST for an exclusive create, EPERM for an empty name or a
+ * non-directory dvp, the lxpr_access() error, or ENOENT.
+ */
+/*ARGSUSED7*/
+static int
+lxpr_create(vnode_t *dvp, char *nm, vattr_t *vap, enum vcexcl exclusive,
+    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+    vsecattr_t *vsecp)
+{
+	lxpr_node_t *lxpnp = VTOLXP(dvp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	vnode_t *vp = NULL;
+	int error;
+
+	ASSERT(type < LXPR_NFILES);
+
+	/*
+	 * restrict create permission to owner or root
+	 */
+	if ((error = lxpr_access(dvp, VEXEC, 0, cr, ct)) != 0) {
+		return (error);
+	}
+
+	if (*nm == '\0')
+		return (EPERM);
+
+	if (dvp->v_type != VDIR)
+		return (EPERM);
+
+	if (exclusive == EXCL)
+		return (EEXIST);
+
+	/*
+	 * No writable files in top-level proc dir. We check this to avoid
+	 * getting a non-proc node via "..".
+	 */
+	if (type != LXPR_PROCDIR &&
+	    lxpr_lookup(dvp, nm, &vp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) {
+		lxpr_nodetype_t ftype = VTOLXP(vp)->lxpr_type;
+		/* drop the lookup's hold on anything that is not writable */
+		if (!lxpr_is_writable(ftype)) {
+			VN_RELE(vp);
+			vp = NULL;
+		}
+	}
+
+	if (vp != NULL) {
+		ASSERT(vp->v_type != VDIR);
+
+		/* confirm permissions against existing file */
+		if ((error = lxpr_access(vp, mode, 0, cr, ct)) != 0) {
+			VN_RELE(vp);
+			return (error);
+		}
+
+		*vpp = vp;
+		return (0);
+	}
+
+	/*
+	 * Linux proc does not allow creation of additional, non-subsystem
+	 * specific files inside the hierarchy.  ENOENT is tossed when such
+	 * actions are attempted.
+	 */
+	return (ENOENT);
+}
diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h
new file mode 100644
index 0000000000..17b19895f4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h
@@ -0,0 +1,511 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LX_AUTOFS_H
+#define	_LX_AUTOFS_H
+
+/*
+ * The lxautofs filesystem and driver exist to emulate the Linux autofs
+ * filesystem and /dev/autofs device (this code emulates both). The
+ * purpose is to provide support for the Linux "automount" automounter.
+ *
+ * The device ioctls map fairly closely to the filesystem ioctls. The device
+ * ioctls have superseded the filesystem ioctls and the automounter will
+ * use the device ioctls if the device exists.
+ *
+ * The device ioctls are used by the automounter to perform recovery
+ * in cases where the automounter is restarted while mounts are present. It
+ * also allows for better management operations when a filesystem is mounted
+ * on top of an autofs mountpoint, as in the case of an NFS direct mount on
+ * top of an autofs mount.
+ *
+ *
+ * +++ Linux automounter background.
+ *
+ * Linux has two automounters: "amd" (not used in any popular, modern distro)
+ * and "automount".
+ *
+ * "automount" is the normal Linux automounter.  It utilizes a kernel
+ * filesystem (autofs) and device (/dev/autofs) to provide its functionality.
+ * Basically, it mounts the autofs filesystem at any automounter controlled
+ * mountpoint. This filesystem then intercepts and redirects lookup operations
+ * to the userland automounter process via a pipe. The pipe to the automounter
+ * is established via a mount option when the autofs filesystem is mounted or
+ * via the setpipefd ioctl if the automounter restarts. When the automounter
+ * receives a request via this pipe, it does lookups (or unmounts) to whatever
+ * backing store it's configured to use, does mkdir operations on the autofs
+ * filesystem, mounts remote NFS filesystems on any directories it manages or
+ * just created, and signals the autofs device via an ioctl to let it know
+ * that the lookup (or expire) can continue. Other management operations (such
+ * as querying expiration for unmounting) are performed using the autofs device.
+ *
+ *
+ * +++ Linux autofs documentation.
+ *
+ * Within the Linux src tree, see the file:
+ * Documentation/filesystems/autofs4-mount-control.txt
+ * This documents some of the autofs behavior and the device driver ioctls.
+ *
+ * The following URL (https://lwn.net/Articles/606960/) documents autofs in
+ * general. This patch was targeted for Documentation/filesystems/autofs4.txt,
+ * but seems to have never integrated into the Linux src tree.
+ *
+ *
+ * +++ Linux autofs (and automount daemon) notes
+ *
+ * Since we're mimicking the behavior of the Linux autofs filesystem and
+ * device, we document some of the observed behavior here.
+ *
+ * There are multiple versions of the autofs filesystem kernel API protocol
+ * and modern implementations of the user-land automount daemon would depend
+ * on v5, although the filesystem API has been superseded by the driver ioctl
+ * API, which is roughly similar.
+ *
+ * We'll describe the filesystem ioctls first, since support for those was
+ * implemented first. The device ioctls roughly correspond to the filesystem
+ * ioctls and were implemented last, but the automounter will use those
+ * ioctls, instead of the filesystem ioctls, when the device is present.
+ *
+ * Our original autofs implementation was developed in the mid-2000s around the
+ * v2 protocol, but that is currently obsolete. Our current implementation is
+ * based around the v5 protocol API. There was no autofs device support at that
+ * time.
+ *
+ * The automounter supports 3 different, mutually exclusive, mount options for
+ * each mountpoint:
+ *   - indirect (this was all you got with the v2 support)
+ *   - direct
+ *   - offset
+ *
+ * An 'indirect' mountpoint is managed with dynamic mounts below that
+ * mountpoint. For example, if '/home' were an indirect autofs mount, then
+ * accessing a username under /home would traverse the 'lookup' code described
+ * below, cause a local subdirectory to be created, and a mount, usually NFS,
+ * onto that username subdirectory.
+ *
+ * A 'direct' mountpoint is an autofs mountpoint which will trigger the
+ * mounting of another filesystem overtop that mountpoint when accessed.
+ *
+ * An 'offset' mountpoint behaves like a 'direct' mountpoint but it is
+ * created dynamically by the automounter underneath an 'indirect' mountpoint.
+ * For example, if '/net' were an indirect autofs mountpoint and the host
+ * 'jurassic' exported two NFS filesystems; '/var/crash' and '/var/core', then
+ * accessing '/net/jurassic' would trigger the automounter to create two
+ * subdirectories; '/net/jurassic/var/crash' and '/net/jurassic/var/core'. The
+ * automounter would then mount an autofs offset mount onto each one of these
+ * directories. Accessing either of those directories would then trigger
+ * automounter to perform another mount on top, as is done with a 'direct'
+ * mount.
+ *
+ * General behavior
+ *
+ * A) Autofs allows root owned, non-automounter processes to create
+ * directories in the autofs filesystem.  The autofs filesystem treats the
+ * automounter's process group as special, but it doesn't prevent root
+ * processes outside of the automounter's process group from creating new
+ * directories in the autofs filesystem.
+ *
+ * B) Autofs doesn't allow creation of any non-directory entries in the
+ * autofs filesystem.  No entity can create files (e.g. /bin/touch or
+ * VOP_CREATE/VOP_SYMLINK/etc.)  The only entries that can exist within
+ * the autofs filesystem are directories.
+ *
+ * C) Autofs only intercepts vop lookup operations.  Notably, it does _not_
+ * intercept and re-direct vop readdir operations.  This means that the
+ * observed behavior of the Linux automounter can be considerably different
+ * from that of the illumos automounter.  Specifically, on illumos if an autofs
+ * mountpoint is mounted _without_ the -nobrowse option then if a user does
+ * an ls operation (which translates into a vop readdir operation) then the
+ * automounter will intercept that operation and list all the possible
+ * directories and mountpoints without actually mounting any filesystems.
+ * Essentially, all automounter managed mountpoints on Linux will behave
+ * like "-nobrowse" mountpoints on illumos.  Here's an example to illustrate
+ * this.  If /ws was mounted on illumos without the -nobrowse option and an
+ * auto_ws yp map was setup as the backing store for this mountpoint, then an
+ * "ls /ws" would list all the keys in the map as valid directories, but an
+ * "ls /ws" on Linux would list an empty directory.
+ *
+ * D) NFS mounts are performed by the automount process.  When the automount
+ * process gets a redirected lookup request, it determines _all_ the
+ * possible remote mountpoints for that request, creates directory paths
+ * via mkdir, and mounts the remote filesystems on the newly created paths.
+ * This is described in the offset mount example above. Once the automounter
+ * completed the mounts it would signal the autofs filesystem (via an ioctl)
+ * that the lookup could continue.
+ *
+ * E.1) Autofs only redirects vop lookup operations for path entries that
+ * don't already exist in the autofs filesystem.  So for the example above,
+ * an initial (after the start of the automounter) "ls /net/jurassic" would
+ * result in a request to the automounter.  A subsequent "ls /net/jurassic"
+ * would not result in a request to the automounter.  Even if
+ * /net/jurassic/var/crash and /net/jurassic/var/core were manually unmounted
+ * after the initial "ls /net/jurassic", a subsequent "ls /net/jurassic"
+ * would not result in a new request to the automounter.
+ *
+ * E.2) Autofs lookup requests that are sent to the automounter only include
+ * the root directory path component.  So for example, after starting up
+ * the automounter if a user were to do a "ls /net/jurassic/var/crash", the
+ * initial lookup request actually sent to the automounter would just be for
+ * "jurassic" (the same request as if the user had done "ls /net/jurassic").
+ * After the initial mounting of the two offset mounts onto crash and core the
+ * lookup would continue and a final lookup request would be sent to the
+ * automounter for "crash" (but this would be on a different vfs from the
+ * /net vfs).
+ *
+ * E.3) The two statements above aren't entirely true.  The Linux
+ * autofs filesystem will also redirect lookup operations for leaf
+ * directories that don't have a filesystem mounted on them.  Using the
+ * example above, if a user did a "ls /net/jurassic", then manually
+ * unmounted /net/jurassic/var/crash, and then did an "ls
+ * /net/jurassic/var/crash", this would result in a request for
+ * "jurassic/var/crash" being sent to the automounter.  The strange thing
+ * (a Linux bug perhaps) is that the automounter won't do anything with this
+ * request and the lookup will fail.
+ *
+ * F) The autofs filesystem communication protocol (what ioctls it supports
+ * and what data it passes to the automount process) is versioned. The
+ * userland automount daemon (as of version v5.0.7) expects v5 of the protocol
+ * (by running the AUTOFS_IOC_PROTOSUBVER ioctl), and exits if that is not
+ * supported. For v2-v5 the structure passed through the pipe always begins
+ * with a common header followed by different fields depending on the packet
+ * type. In addition the different versions support additional ioctls.
+ *
+ * v2 - basic lookup request
+ * v3 - adds expiring (umounting)
+ * v4 - adds expire multi
+ * v5 - adds missing indirect, expire indirect, missing direct & expire direct.
+ *      Defines a new protocol structure layout.
+ *      The v5 'missing indirect' and 'missing direct' ioctls are analogous to
+ *      the v2 'missing' ioctl. These ioctls are used to initiate a mount via
+ *	a lookup. The 'expire' ioctls are used by the automounter to query if
+ *	it is possible to unmount the filesystem. 'direct' and 'indirect'
+ *	refer to the mount option type that the automounter performed and
+ *	correlate to an automounter direct or indirect map mountpoint.
+ *
+ * G) The automounter periodically issues an 'expire' ioctl to autofs to
+ * obtain the name of a mountpoint which the automounter can unmount.
+ * Unmounting is discussed in more detail below.
+ *
+ * H) The device ioctls roughly correspond to the filesystem ioctls, but
+ * instead of being tied to an autofs mountpoint vnode, they can be called any
+ * time. The argument structure uses either a path or an autofs pipe file
+ * descriptor to indicate what is being operated on.
+ *
+ * +++ lxautofs notes
+ *
+ * 1) In general, the lxautofs filesystem tries to mimic the behavior of the
+ * Linux autofs filesystem with the following exceptions:
+ *
+ * 	1.1) We don't bother to implement the E.3 functionality listed above
+ * 	since it doesn't appear to be of any use.
+ *
+ * 	1.2) We only fully implement v2 and v5 of the autofs protocol.
+ *
+ * 2) In general, the approach taken for lxautofs is to keep it as simple
+ * as possible and to minimize its memory usage.  To do this all information
+ * about the contents of the lxautofs filesystem are mirrored in the
+ * underlying filesystem that lxautofs is mounted on and most vop operations
+ * are simply passed onto this underlying filesystem.  This means we don't
+ * have to implement most of the complex operations that a full filesystem
+ * normally has to implement.  It also means that most of our filesystem state
+ * (wrt the contents of the filesystem) doesn't actually have to be stored
+ * in memory, we can simply go to the underlying filesystem to get it when
+ * it's requested.  For the purposes of discussion, we'll call the underlying
+ * filesystem the "backing store."
+ *
+ * The backing store is actually a directory called ".lxautofs" which is created
+ * in the directory where the lxautofs filesystem is mounted. When the
+ * lxautofs filesystem is unmounted this backing store directory is deleted.
+ * If this directory exists at mount time (perhaps the system crashed while a
+ * previous lxautofs instance was mounted at the same location) it will be
+ * deleted. There are a few implications of using a backing store worth
+ * mentioning.
+ *
+ * 	2.1) lxautofs can't be mounted on a read only filesystem.  If this
+ * 	proves to be a problem we can probably move the location of the
+ * 	backing store.
+ *
+ * 	2.2) If the backing store filesystem runs out of space then the
+ * 	automounter process won't be able to create more directories and mount
+ * 	new filesystems.  Of course, strange failures usually happen when
+ * 	filesystems run out of space.
+ *
+ * 3) Why aren't we using gfs?  gfs has two different usage models.
+ *
+ * 	3.1) I'm my own filesystem but I'm using gfs to help with managing
+ * 	readdir operations.
+ *
+ * 	3.2) I'm a gfs filesystem and gfs is managing all my vnodes
+ *
+ * We're not using the 3.1 interfaces because we don't implement readdir
+ * ourselves.  We pass all readdir operations onto the backing store
+ * filesystem and utilize its readdir implementation.
+ *
+ * We're not using the 3.2 interfaces because they are really designed for
+ * in memory filesystems where all of the filesystem state is stored in
+ * memory.  They don't lend themselves to filesystems where part of the
+ * state is in memory and part of the state is on disk.
+ *
+ * For more information on gfs take a look at the block comments in the
+ * top of gfs.c
+ *
+ * 4) Unmounting
+ *
+ * The automounter has a timeout associated with each mount. It informs autofs
+ * of this timeout using the LX_AUTOFS_DEV_IOC_TIMEOUT_CMD ioctl after autofs
+ * has been mounted on the mountpoint.
+ *
+ * After the automounter has mounted something associated with the mountpoint
+ * then periodically (<timeout>/4 seconds) the automounter will issue the
+ * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl on the autofs mount. autofs is expected
+ * to respond with an underlying mountpoint entry which is a candidate for
+ * unmounting. The automounter will attempt to unmount the filesystem
+ * (which may fail if it is busy, since this is obviously racy) and then
+ * acknowledge the expire ioctl. The successful acknowledgement is independent
+ * of the success of unmounting the underlying filesystem.
+ *
+ * Unmount handling varies based on which type of mount the autofs was mounted
+ * with (indirect, direct or offset).
+ *
+ * To support 'indirect' mount expiration, the autofs vfs keeps track of the
+ * filesystems mounted immediately under the autofs mountpoint (in
+ * lav_mnt_list) after a lookup has completed successfully. Upon receipt of the
+ * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl, autofs removes the first element from
+ * the list, attempts to check if it is busy and if not, returns that mountpoint
+ * over the fifo (if busy the entry is added to the end of the list). When the
+ * ioctl is acknowledged, if the mountpoint still exists, that means the unmount
+ * failed and the entry is added at the back of the list. If there are no
+ * elements or the first one is busy, EAGAIN is returned for the 'expire' ioctl
+ * and the automounter will check again in <timeout>/4 seconds.
+ *
+ * For example, if /home is an autofs indirect mount, then there are typically
+ * many different {username}-specific NFS mounts under that /home autofs mount.
+ * autofs uses the lav_mnt_list to respond to 'expire' ioctls in a round-robin
+ * fashion so that the automounter can unmount user file systems that aren't in
+ * use.
+ *
+ * Expiring 'direct' mounts is similar, but since there is only a single mount,
+ * the lav_mnt_list only will have at most one entry if there is a filesystem
+ * mounted overtop of the autofs mount.
+ *
+ * Expiring 'offset' mounts is more complicated because there are at least
+ * two different autofs VFSs involved (the top-level and one for each offset
+ * mount underneath). The actual offset mount is handled exactly like a 'direct'
+ * mount. The top-level is an indirect mount and is handled in a similar way
+ * as described above for indirect mounts, but special handling is needed for
+ * each offset mount below.
+ *
+ * This can be explained using the same 'jurassic' example described earlier
+ * (/net is an autofs 'indirect' mount and the host 'jurassic' has two exported
+ * file systems; /var/crash and /var/core). If the user accesses
+ * /net/jurassic/var/crash then the automounter would setup the system so that
+ * the following mounts exist:
+ *   - /net (the original autofs indirect mount which triggers everything)
+ *   - /net/jurassic/var/crash (autofs offset mount)
+ *   - /net/jurassic/var/crash (NFS mount on top of the autofs offset mount)
+ *   - /net/jurassic/var/core (autofs offset mount)
+ *
+ * For expiration the automounter will issue the LX_AUTOFS_IOC_EXPIRE_MULTI
+ * ioctl on each autofs vfs for which something is mounted, so we would receive
+ * an expire ioctl on /net and another on /net/jurassic/var/crash. The vfs for
+ * /net will be tracking "jurassic", but we detect it is busy and won't do
+ * anything at first. The vfs for "crash" will work like a direct mount and
+ * acknowledge the expire ioctl to the automounter once that filesystem times
+ * out and is no longer busy. The automounter will then unmount the "crash"
+ * NFS mount.
+ *
+ * Once the "crash" NFS mount has been unmounted by the automounter, we're left
+ * with the two autofs offset mounts under jurassic. The automounter will not
+ * try to unmount either of those, so we have to do that. Once we get another
+ * expire ioctl on /net and check "jurassic", we'll see there are only autofs
+ * mounts under /net/jurassic. We umount those using the lx_autofs_umount_offset
+ * function and respond to the automounter expire ioctl with "jurassic", in the
+ * same way as we would for any other indirect mount.
+ *
+ * 5) Recovery
+ *
+ * If the automounter is restarted for any reason, it needs to cope with
+ * pre-existing autofs mounts, as well as other automount-initiated mounts (e.g.
+ * a direct mount on top of an autofs mountpoint). The automounter uses the
+ * /proc/mounts file to correlate mounts to the managed mountpoints. It then
+ * uses the /dev/autofs device to openmount each of the autofs devices and
+ * reinitialize them using the various dev ioctls (timeout, requester, etc.).
+ *
+ * In general, the automounter will closemount the mountpoint once it's done,
+ * but it doesn't in the case of an offset mountpoint with nothing mounted
+ * on top. In this case the automounter expects autofs to expire that mountpoint
+ * before it will closemount (so things can subsequently cleanup). We handle
+ * this special case in the expire code path.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Note that the name of the actual file system is lxautofs, not lx_autofs, but
+ * the code uses lx_autofs to prefix the various names. This is because file
+ * system names are limited to 8 characters.
+ */
+#define	LX_AUTOFS_NAME			"lxautofs"
+
+#define	LX_AUTOFS_MINORNAME		"autofs"
+
+/*
+ * Mount options supported.
+ */
+#define	LX_MNTOPT_FD			"fd"
+#define	LX_MNTOPT_PGRP			"pgrp"
+#define	LX_MNTOPT_MINPROTO		"minproto"
+#define	LX_MNTOPT_MAXPROTO		"maxproto"
+#define	LX_MNTOPT_INDIRECT		"indirect"
+#define	LX_MNTOPT_DIRECT		"direct"
+#define	LX_MNTOPT_OFFSET		"offset"
+
+/*
+ * Version/subversion of the Linux kernel automount protocol we support.
+ *
+ * We fully support v2 and v5. We'll return ENOTSUP for all of the ioctls we
+ * don't yet handle.
+ */
+#define	LX_AUTOFS_PROTO_VERS5		5
+#define	LX_AUTOFS_PROTO_SUBVERSION	2
+#define	LX_AUTOFS_PROTO_VERS2		2
+
+/* Packet types; carried in the laph_type field of lx_autofs_pkt_hdr_t. */
+typedef enum laph_ptype {
+	LX_AUTOFS_PTYPE_MISSING,	/* 0 */
+	LX_AUTOFS_PTYPE_EXPIRE,		/* 1 */
+	LX_AUTOFS_PTYPE_EXPIRE_MULTI,	/* 2 */
+	LX_AUTOFS_PTYPE_MISSING_INDIR,	/* 3 */
+	LX_AUTOFS_PTYPE_EXPIRE_INDIR,	/* 4 */
+	LX_AUTOFS_PTYPE_MISSING_DIRECT,	/* 5 */
+	LX_AUTOFS_PTYPE_EXPIRE_DIRECT	/* 6 */
+} laph_ptype_t;
+
+/*
+ * Common header for all versions of the protocol.
+ */
+typedef struct lx_autofs_pkt_hdr {
+	int		laph_protover;	/* protocol version number */
+	laph_ptype_t	laph_type;	/* packet type (LX_AUTOFS_PTYPE_*) */
+	int		laph_id;	/* every pkt must have a unique id */
+} lx_autofs_pkt_hdr_t;
+
+/*
+ * Command structure sent to automount process from lxautofs via a pipe.
+ * This structure is the same for v2-v4 of the automount protocol
+ * (the communication pipe is established at mount time).
+ */
+typedef struct lx_autofs_v2_pkt {
+	lx_autofs_pkt_hdr_t lap_hdr;
+	int	lap_name_len;	/* excludes newline and NUL */
+	char	lap_name[256];	/* path component to look up */
+} lx_autofs_v2_pkt_t;
+
+/* v4 multi-expire packet; starts with the common packet header. */
+typedef struct lx_autofs_v4_exp_pkt {
+	lx_autofs_pkt_hdr_t lape_hdr;
+	int lape_len;	/* NOTE(review): presumably length of lape_name */
+	char lape_name[MAXNAMELEN];
+} lx_autofs_v4_exp_pkt_t;
+
+/* v5 packet; the common packet header followed by the v5-only fields. */
+typedef struct lx_autofs_v5_pkt {
+	lx_autofs_pkt_hdr_t lap_hdr;
+	uint32_t lap_dev;
+	uint64_t lap_ino;
+	uint32_t lap_uid;
+	uint32_t lap_gid;
+	uint32_t lap_pid;
+	uint32_t lap_tgid;
+	uint32_t lap_name_len;
+	char	lap_name[256];
+} lx_autofs_v5_pkt_t;
+
+union lx_autofs_pkt {
+	lx_autofs_v2_pkt_t	lap_v2;	/* layout used for v2-v4 */
+	lx_autofs_v5_pkt_t	lap_v5;	/* layout used for v5 */
+};
+
+#define	lap_protover	lap_v2.lap_hdr.laph_protover
+#define	lap_type	lap_v2.lap_hdr.laph_type
+#define	lap_id		lap_v2.lap_hdr.laph_id
+
+/*
+ * Ioctls fully supported (v2 protocol).
+ */
+#define	LX_AUTOFS_IOC_READY		0x00009360 /* arg: int */
+#define	LX_AUTOFS_IOC_FAIL		0x00009361 /* arg: int */
+#define	LX_AUTOFS_IOC_CATATONIC		0x00009362 /* arg: <none> */
+
+/*
+ * Ioctls supported (v3/v4 protocol).
+ */
+#define	LX_AUTOFS_IOC_PROTOVER		0x80049363 /* arg: int */
+#define	LX_AUTOFS_IOC_SETTIMEOUT	0xc0089364 /* arg: ulong_t */
+
+/*
+ * Ioctls not supported (v3/v4 protocol).
+ */
+					/* arg: lx_autofs_v3_exp_pkt_t * */
+#define	LX_AUTOFS_IOC_EXPIRE		0x81109365
+
+/*
+ * Ioctls supported (v5 protocol).
+ */
+#define	LX_AUTOFS_IOC_PROTOSUBVER	0x80049367 /* arg: int */
+#define	LX_AUTOFS_IOC_ASKUMOUNT		0x80049370 /* arg: int */
+#define	LX_AUTOFS_IOC_EXPIRE_MULTI	0x40049366 /* arg: int */
+#define	LX_AUTOFS_IOC_EXPIRE_INDIRECT	LX_AUTOFS_IOC_EXPIRE_MULTI
+#define	LX_AUTOFS_IOC_EXPIRE_DIRECT	LX_AUTOFS_IOC_EXPIRE_MULTI
+
+/*
+ * autofs device ioctls
+ */
+#define	LX_AUTOFS_DEV_IOC_VERSION_CMD		0xc0189371
+#define	LX_AUTOFS_DEV_IOC_PROTOVER_CMD		0xc0189372
+#define	LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD	0xc0189373
+#define	LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD		0xc0189374
+#define	LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD	0xc0189375
+#define	LX_AUTOFS_DEV_IOC_READY_CMD		0xc0189376
+#define	LX_AUTOFS_DEV_IOC_FAIL_CMD		0xc0189377
+#define	LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD		0xc0189378
+#define	LX_AUTOFS_DEV_IOC_CATATONIC_CMD		0xc0189379
+#define	LX_AUTOFS_DEV_IOC_TIMEOUT_CMD		0xc018937a
+#define	LX_AUTOFS_DEV_IOC_REQUESTER_CMD		0xc018937b
+#define	LX_AUTOFS_DEV_IOC_EXPIRE_CMD		0xc018937c
+#define	LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD		0xc018937d
+#define	LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD	0xc018937e
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_AUTOFS_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h
new file mode 100644
index 0000000000..39ea96d1fe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LX_AUTOFS_IMPL_H
+#define	_LX_AUTOFS_IMPL_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/file.h>
+#include <sys/id_space.h>
+#include <sys/modhash.h>
+#include <sys/vnode.h>
+
+#include <sys/lx_autofs.h>
+
+/*
+ * Space key.
+ * Used to persist data across lx_autofs filesystem module unloads.
+ */
+#define	LX_AUTOFS_SPACE_KEY_UDEV	LX_AUTOFS_NAME "_udev"
+
+/*
+ * Name of the backing store directory.
+ */
+#define	LX_AUTOFS_BS_DIR		"." LX_AUTOFS_NAME
+
+#define	LX_AUTOFS_VFS_ID_HASH_SIZE	15
+#define	LX_AUTOFS_VFS_PATH_HASH_SIZE	15
+#define	LX_AUTOFS_VFS_VN_HASH_SIZE	15
+
+enum lx_autofs_mnttype	{ LXAMT_NONE, LXAMT_INDIR, LXAMT_DIRECT, LXAMT_OFFSET };
+
+typedef struct lx_autofs_mntent {
+	list_node_t	lxafme_lst;	/* presumably lav_mnt_list linkage */
+	uint64_t	lxafme_ts;	/* time stamp */
+	uint_t		lxafme_len;	/* presumably len of lxafme_path */
+	char		*lxafme_path;
+} lx_autofs_mntent_t;
+
+/*
+ * VFS data object; the state kept for one lxautofs vfs.
+ */
+typedef struct lx_autofs_vfs {
+	/* Info about the underlying filesystem and backing store. */
+	vnode_t		*lav_mvp;
+	char		*lav_bs_name;
+	vnode_t		*lav_bs_vp;
+
+	/* Info about the automounter process managing this filesystem. */
+	int		lav_fd;
+	pid_t		lav_pgrp;
+	file_t		*lav_fifo_wr;
+	file_t		*lav_fifo_rd;
+
+	/* The mount's dev and ino values for v5 protocol msg */
+	uint64_t	lav_dev;
+	u_longlong_t	lav_ino;
+
+	/* options from the mount */
+	enum lx_autofs_mnttype lav_mnttype;
+	int		lav_min_proto;
+
+	/*
+	 * ioctl-set timeout value. The automounter will perform an expire
+	 * ioctl every timeout/4 seconds. We use this to expire a mount once
+	 * it is inactive for the full timeout.
+	 */
+	ulong_t		lav_timeout;
+
+	/* ioctl-set catatonic value (prevents future mounts). */
+	boolean_t	lav_catatonic;
+
+	/* Mount initiator's uid/gid for recovery handling. */
+	uid_t		lav_uid;
+	gid_t		lav_gid;
+
+	/* Each automount request needs a unique id. */
+	id_space_t	*lav_ids;
+
+	/* All remaining structure members are protected by lav_lock. */
+	kmutex_t	lav_lock;
+	/* openmount counter */
+	int		lav_openmnt_cnt;
+
+
+	/* Hashes to keep track of outstanding automounter requests. */
+	mod_hash_t	*lav_path_hash;
+	mod_hash_t	*lav_id_hash;
+
+	/* We need to keep track of all our vnodes. */
+	vnode_t		*lav_root;
+	mod_hash_t	*lav_vn_hash;
+
+	/* list of current mounts */
+	list_t		lav_mnt_list;
+} lx_autofs_vfs_t;
+
+enum lx_autofs_callres	{ LXACR_NONE, LXACR_READY, LXACR_FAIL };
+
+/*
+ * Structure to keep track of automounter requests sent to user-land.
+ */
+typedef struct lx_autofs_automnt_req {
+	/* Packet that gets sent to the automounter. */
+	union lx_autofs_pkt laar_pkt;
+	int		laar_pkt_size;
+
+	/* Reference count.  Always updated atomically. */
+	uint_t		laar_ref;
+
+	/*
+	 * Fields to keep track and sync threads waiting on a lookup.
+	 * Fields are protected by laar_lock.
+	 */
+	kmutex_t	laar_lock;
+	kcondvar_t	laar_cv;
+	int		laar_complete;
+
+	enum lx_autofs_callres laar_result;
+} lx_autofs_automnt_req_t;
+
+/*
+ * Generic stack structure.
+ */
+typedef struct stack_elem {
+	list_node_t	se_list;	/* presumably the stack's list linkage */
+	caddr_t		se_ptr1;	/* NOTE(review): meaning set by user? */
+	caddr_t		se_ptr2;
+	caddr_t		se_ptr3;
+} stack_elem_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_AUTOFS_IMPL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
new file mode 100644
index 0000000000..4906e444f1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -0,0 +1,680 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _LX_BRAND_H
+#define	_LX_BRAND_H
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#include <sys/zone.h>
+#include <sys/ksocket.h>
+#include <sys/vfs.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	LX_BRANDNAME	"lx"
+
+/*
+ * Brand uname info
+ */
+#define	LX_UNAME_SYSNAME	"Linux"
+#define	LX_UNAME_RELEASE_2_6	"2.6.18"
+#define	LX_UNAME_RELEASE_2_4	"2.4.21"
+#define	LX_UNAME_VERSION	"BrandZ virtual linux"
+#define	LX_UNAME_MACHINE32	"i686"
+#define	LX_UNAME_MACHINE64	"x86_64"
+
+#define	LX_LIB_PATH32	"/native/usr/lib/lx_brand.so.1"
+#define	LX_LIB_PATH64	"/native/usr/lib/amd64/lx_brand.so.1"
+
+#define	LX_VDSO_PATH32	"/native/usr/lib/brand/lx/lx_vdso.so.1"
+#define	LX_VDSO_PATH64	"/native/usr/lib/brand/lx/amd64/lx_vdso.so.1"
+
+#if defined(_LP64)
+#define	LX_LIB_PATH		LX_LIB_PATH64
+#define	LX_UNAME_MACHINE	LX_UNAME_MACHINE64
+#define	LX_VDSO_PATH		LX_VDSO_PATH64
+#else
+#define	LX_LIB_PATH		LX_LIB_PATH32
+#define	LX_UNAME_MACHINE	LX_UNAME_MACHINE32
+#define	LX_VDSO_PATH		LX_VDSO_PATH32
+#endif
+
+/*
+ * This must be large enough for both the 32-bit table and 64-bit table.
+ */
+#define	LX_NSYSCALLS		358
+
+/* Highest capability we know about */
+#define	LX_CAP_MAX_VALID	36
+
+/*
+ * brand(2) subcommands
+ *
+ * Everything >= 128 is a brand-specific subcommand.
+ * > 192 is reserved for in-kernel emulated system calls.
+ */
+#define	B_LPID_TO_SPAIR		128
+#define	B_GET_CURRENT_CONTEXT	129
+#define	B_EMULATION_DONE	130
+/* formerly B_PTRACE_KERNEL	131 */
+#define	B_SET_AFFINITY_MASK	132
+#define	B_GET_AFFINITY_MASK	133
+#define	B_PTRACE_CLONE_BEGIN	134
+#define	B_PTRACE_STOP_FOR_OPT	135
+#define	B_UNSUPPORTED		136
+#define	B_STORE_ARGS		137
+#define	B_GETPID		138
+#define	B_JUMP_TO_LINUX		139
+#define	B_SET_THUNK_PID		140
+#define	B_EXIT_AS_SIG		141
+#define	B_HELPER_WAITID		142
+#define	B_HELPER_CLONE		143
+#define	B_HELPER_SETGROUPS	144
+#define	B_HELPER_SIGQUEUE	145
+#define	B_HELPER_TGSIGQUEUE	146
+#define	B_SET_NATIVE_STACK	147
+#define	B_SIGEV_THREAD_ID	148
+#define	B_OVERRIDE_KERN_VER	149
+/* formerly B_NOTIFY_VDSO_LOC	150 */
+#define	B_GET_PERSONALITY	151
+
+#ifndef _ASM
+/*
+ * Support for Linux PTRACE_SETOPTIONS handling.
+ */
+typedef enum lx_ptrace_options {
+	LX_PTRACE_O_TRACESYSGOOD =	0x0001,
+	LX_PTRACE_O_TRACEFORK =		0x0002,
+	LX_PTRACE_O_TRACEVFORK =	0x0004,
+	LX_PTRACE_O_TRACECLONE =	0x0008,
+	LX_PTRACE_O_TRACEEXEC =		0x0010,
+	LX_PTRACE_O_TRACEVFORKDONE =	0x0020,
+	LX_PTRACE_O_TRACEEXIT =		0x0040,
+	LX_PTRACE_O_TRACESECCOMP =	0x0080
+} lx_ptrace_options_t;
+
+#define	LX_PTRACE_O_ALL							\
+	(LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | 		\
+	LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | 		\
+	LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE |		\
+	LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP)
+#endif /* !_ASM */
+
+/* siginfo si_status for traced events */
+#define	LX_PTRACE_EVENT_FORK		0x100
+#define	LX_PTRACE_EVENT_VFORK		0x200
+#define	LX_PTRACE_EVENT_CLONE		0x300
+#define	LX_PTRACE_EVENT_EXEC		0x400
+#define	LX_PTRACE_EVENT_VFORK_DONE	0x500
+#define	LX_PTRACE_EVENT_EXIT		0x600
+#define	LX_PTRACE_EVENT_SECCOMP		0x700
+
+/*
+ * Brand-private values for the "pr_what" member of lwpstatus, for use with the
+ * PR_BRAND stop reason.  These reasons are validated in lx_stop_notify();
+ * update it if you add new reasons here.
+ */
+#define	LX_PR_SYSENTRY		1
+#define	LX_PR_SYSEXIT		2
+#define	LX_PR_SIGNALLED		3
+#define	LX_PR_EVENT		4
+
+
+#define	LX_VERSION_1		1
+#define	LX_VERSION		LX_VERSION_1
+
+#define	LX_ATTR_KERN_RELEASE	ZONE_ATTR_BRAND_ATTRS
+#define	LX_ATTR_KERN_VERSION	(ZONE_ATTR_BRAND_ATTRS + 1)
+
+/*
+ * Aux vector containing phdr of Linux executable and ehdr of interpreter
+ * (if any), both of which are used by lx_librtld_db to ascertain r_debug.
+ * We repurpose the 4th brand-specific aux vector slot for the Linux
+ * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library).
+ */
+#define	AT_SUN_BRAND_LX_PHDR		AT_SUN_BRAND_AUX1
+#define	AT_SUN_BRAND_LX_INTERP		AT_SUN_BRAND_AUX2
+#define	AT_SUN_BRAND_LX_CLKTCK		AT_SUN_BRAND_AUX3
+#define	AT_SUN_BRAND_LX_SYSINFO_EHDR	AT_SUN_BRAND_AUX4
+
+/* Aux vectors containing real/effective user/group IDs */
+#define	AT_LX_UID		11
+#define	AT_LX_EUID		12
+#define	AT_LX_GID		13
+#define	AT_LX_EGID		14
+/* Aux vector containing hz value */
+#define	AT_CLKTCK	17
+/* Aux vector containing secure boolean */
+#define	AT_SECURE	23
+/* Aux vector containing vDSO addr */
+#define	AT_SYSINFO_EHDR	33
+
+/*
+ * Usermode emulation routines are run on an alternate stack allocated by
+ * the brand library.  Every LWP in a process will incur this overhead beyond
+ * the regular thread stack:
+ */
+#define	LX_NATIVE_STACK_PAGE_COUNT	64
+
+/*
+ * When returning in a new child process created with vfork(2) (or CLONE_VFORK)
+ * we discard some of the native stack to prevent corruption of the parent
+ * emulation state.
+ */
+#define	LX_NATIVE_STACK_VFORK_GAP	0x3000
+
+#ifndef	_ASM
+
+extern struct brand lx_brand;
+
+typedef struct lx_brand_registration {
+	uint_t lxbr_version;		/* version number */
+	void *lxbr_handler;		/* base address of handler */
+	uint32_t lxbr_flags;		/* LX_PROC_* registration flags */
+} lx_brand_registration_t;
+
+typedef struct lx_brand_registration32 {
+	uint_t lxbr_version;		/* version number */
+	uint32_t lxbr_handler;		/* base address of handler */
+	uint32_t lxbr_flags;		/* LX_PROC_* registration flags */
+} lx_brand_registration32_t;
+
+#endif /* !_ASM */
+
+/*
+ * GDT usage
+ */
+#define	GDT_TLSMIN	(GDT_BRANDMIN)
+#define	GDT_TLSMAX	(GDT_TLSMIN + 2)
+#define	LX_TLSNUM	(GDT_TLSMAX - GDT_TLSMIN)
+
+#ifndef _ASM
+
+/*
+ * Stores information needed by the lx linker to launch the main
+ * lx executable.
+ */
+typedef struct lx_elf_data64 {
+	uintptr_t	ed_phdr;
+	uintptr_t	ed_phent;
+	uintptr_t	ed_phnum;
+	uintptr_t	ed_entry;
+	uintptr_t	ed_base;
+	uintptr_t	ed_ldentry;
+} lx_elf_data64_t;
+
+typedef struct lx_elf_data32 {
+	uint32_t	ed_phdr;
+	uint32_t	ed_phent;
+	uint32_t	ed_phnum;
+	uint32_t	ed_entry;
+	uint32_t	ed_base;
+	uint32_t	ed_ldentry;
+} lx_elf_data32_t;
+
+#if defined(_LP64)
+typedef lx_elf_data64_t lx_elf_data_t;
+#else
+typedef lx_elf_data32_t lx_elf_data_t;
+#endif
+
+typedef enum lx_proc_flags {
+	/* flags configurable via brandsys() and members of LX_PROC_ALL */
+	LX_PROC_INSTALL_MODE	= 0x01,
+	LX_PROC_STRICT_MODE	= 0x02,
+	/* internal flags */
+	LX_PROC_CHILD_DEATHSIG	= 0x04,
+	LX_PROC_AIO_USED	= 0x08
+} lx_proc_flags_t;
+
+#define	LX_PROC_ALL	(LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE)
+
+/* Maximum length for fields of LX uname */
+#define	LX_SYS_UTS_LN	65
+
+/* Max. length of kernel release string */
+#define	LX_KERN_RELEASE_MAX	LX_SYS_UTS_LN
+#define	LX_KERN_VERSION_MAX	LX_SYS_UTS_LN
+
+#ifdef	_KERNEL
+
+/*
+ * Entry points for cgroup integration.
+ */
+extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+#define	LX_RLFAKE_LOCKS		0
+#define	LX_RLFAKE_NICE		1
+#define	LX_RLFAKE_RTPRIO	2
+#define	LX_RLFAKE_RTTIME	3
+
+#define	LX_RLFAKE_NLIMITS	4
+
+#define	LX_RLIM64_INFINITY	(~0ULL)
+
+typedef struct {
+	uint64_t	rlim_cur;
+	uint64_t	rlim_max;
+} lx_rlimit64_t;
+
+typedef struct lx_proc_data {
+	uintptr_t l_handler;	/* address of user-space handler */
+	pid_t l_ppid;		/* pid of originating parent proc */
+	uid_t l_loginuid;	/* /proc/{pid}/loginuid */
+	int64_t l_ptrace;	/* count of process lwps observed by ptrace */
+	lx_elf_data_t l_elf_data; /* ELF data for linux executable */
+	/* signal to deliver to parent when this thread group dies */
+	int l_signal;
+	/* native signal to deliver to process when parent dies */
+	int l_parent_deathsig;
+	lx_proc_flags_t l_flags;
+
+	lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS];
+
+	/* original start/end bounds of arg/env string data */
+	uintptr_t l_args_start;
+	uintptr_t l_envs_start;
+	uintptr_t l_envs_end;
+
+	/* Override zone-wide settings for uname release and version */
+	char l_uname_release[LX_KERN_RELEASE_MAX];
+	char l_uname_version[LX_KERN_VERSION_MAX];
+
+	/* Linux process personality */
+	unsigned int l_personality;
+
+	/* VDSO location */
+	uintptr_t l_vdso;
+} lx_proc_data_t;
+
+#endif	/* _KERNEL */
+
+/*
+ * Linux process personality(2) flags stored in l_personality
+ */
+#define	LX_PER_UNAME26			0x0020000
+#define	LX_PER_ADDR_NO_RANDOMIZE	0x0040000
+#define	LX_PER_FDPIC_FUNCPTRS		0x0080000
+#define	LX_PER_MMAP_PAGE_ZERO		0x0100000
+#define	LX_PER_ADDR_COMPAT_LAYOUT	0x0200000
+#define	LX_PER_READ_IMPLIES_EXEC	0x0400000
+#define	LX_PER_ADDR_LIMIT_32BIT		0x0800000
+#define	LX_PER_SHORT_INODE		0x1000000
+#define	LX_PER_WHOLE_SECONDS		0x2000000
+#define	LX_PER_STICKY_TIMEOUTS		0x4000000
+#define	LX_PER_ADDR_LIMIT_3GB		0x8000000
+
+#define	LX_PER_LINUX	0x00
+#define	LX_PER_SUNOS	(0x06 | LX_PER_STICKY_TIMEOUTS)
+#define	LX_PER_MASK	0xff
+
+/*
+ * A data type big enough to bitmap all Linux possible cpus.
+ * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages
+ * for sched_setaffinity() and sched_getaffinity().
+ */
+#define	LX_NCPU		(1024)
+#define	LX_AFF_ULONGS	(LX_NCPU / (8 * sizeof (ulong_t)))
+typedef ulong_t lx_affmask_t[LX_AFF_ULONGS];
+
+/* Length of proc boot_id string */
+#define	LX_BOOTID_LEN	37
+
+/*
+ * Flag values for uc_brand_data[0] in the ucontext_t:
+ */
+#define	LX_UC_STACK_NATIVE	0x00001
+#define	LX_UC_STACK_BRAND	0x00002
+#define	LX_UC_RESTORE_NATIVE_SP	0x00010
+#define	LX_UC_FRAME_IS_SYSCALL	0x00100
+#define	LX_UC_RESTART_SYSCALL	0x01000
+#define	LX_UC_IGNORE_LINK	0x10000
+
+#ifdef	_KERNEL
+
+typedef struct lx_lwp_data lx_lwp_data_t;
+
+/*
+ * Flag values for "lxpa_flags" on a ptrace(2) accord.
+ */
+typedef enum lx_accord_flags {
+	LX_ACC_TOMBSTONE = 0x01
+} lx_accord_flags_t;
+
+/*
+ * Flag values for "br_ptrace_flags" in the LWP-specific data.
+ */
+typedef enum lx_ptrace_flags {
+	LX_PTF_SYSCALL = 0x01,
+	LX_PTF_EXITING = 0x02,
+	LX_PTF_STOPPING = 0x04,
+	LX_PTF_INHERIT = 0x08,
+	LX_PTF_STOPPED = 0x10,
+	LX_PTF_PARENT_WAIT = 0x20,
+	LX_PTF_CLDPEND = 0x40,
+	LX_PTF_CLONING = 0x80,
+	LX_PTF_WAITPEND = 0x100
+} lx_ptrace_flags_t;
+
+/*
+ * A ptrace(2) accord represents the relationship between a tracer LWP and the
+ * set of LWPs that it is tracing: the tracees.  This data structure belongs
+ * primarily to the tracer, but is reference counted so that it may be freed by
+ * whoever references it last.
+ */
+typedef struct lx_ptrace_accord {
+	kmutex_t		lxpa_lock;
+	uint_t			lxpa_refcnt;
+	lx_accord_flags_t	lxpa_flags;
+
+	/*
+	 * The tracer must hold "pidlock" while clearing these fields for
+	 * exclusion of waitid(), etc.
+	 */
+	lx_lwp_data_t		*lxpa_tracer;
+	kcondvar_t		*lxpa_cvp;
+
+	/*
+	 * The "lxpa_tracees_lock" mutex protects the tracee list.
+	 */
+	kmutex_t		lxpa_tracees_lock;
+	list_t			lxpa_tracees;
+} lx_ptrace_accord_t;
+
+/*
+ * These values are stored in the per-LWP data for a tracee when it is attached
+ * to a tracer.  They record the method that was used to attach.
+ */
+typedef enum lx_ptrace_attach {
+	LX_PTA_NONE = 0x00,		/* not attached */
+	LX_PTA_ATTACH = 0x01,		/* due to tracer using PTRACE_ATTACH */
+	LX_PTA_TRACEME = 0x02,		/* due to child using PTRACE_TRACEME */
+	LX_PTA_INHERIT_CLONE = 0x04,	/* due to PTRACE_CLONE clone(2) flag */
+	LX_PTA_INHERIT_OPTIONS = 0x08	/* due to PTRACE_SETOPTIONS options */
+} lx_ptrace_attach_t;
+
+typedef enum lx_stack_mode {
+	LX_STACK_MODE_PREINIT = 0,
+	LX_STACK_MODE_INIT,
+	LX_STACK_MODE_NATIVE,
+	LX_STACK_MODE_BRAND
+} lx_stack_mode_t;
+
+struct lx_pid {
+	pid_t	s_pid;			/* the SunOS pid and ... */
+	id_t	s_tid;			/* ... tid pair */
+	pid_t	l_pid;			/* the corresponding linux pid */
+	time_t	l_start;		/* birthday of this pid */
+	struct pid *l_pidp;
+	struct lx_pid *stol_next;	/* link in stol hash table */
+	struct lx_pid *ltos_next;	/* link in ltos hash table */
+};
+
+/*
+ * lx-specific data in the klwp_t
+ */
+struct lx_lwp_data {
+	uint_t	br_lwp_flags;		/* misc. flags */
+	klwp_t	*br_lwp;		/* back pointer to container lwp */
+	int	br_signal;		/* signal to send to parent when */
+					/* clone()'ed child terminates */
+	int	br_exitwhy;		/* reason for thread (process) exit */
+	int	br_exitwhat;		/* exit code / killing signal */
+	lx_affmask_t br_affinitymask;	/* bitmask of CPU sched affinities */
+	struct user_desc br_tls[LX_TLSNUM];
+			/* descriptors used by libc for TLS */
+	ulong_t	br_lx_fsbase;		/* lx fsbase for 64-bit thread ptr */
+	ulong_t	br_ntv_fsbase;		/* native fsbase 64-bit thread ptr */
+	ulong_t	br_lx_gsbase;		/* lx user-land gsbase */
+	ulong_t	br_ntv_gsbase;		/* native user-land gsbase */
+	pid_t	br_pid;			/* converted pid for this thread */
+	pid_t	br_tgid;		/* thread group ID for this thread */
+	pid_t	br_ppid;		/* parent pid for this thread */
+	id_t	br_ptid;		/* parent tid for this thread */
+	void	*br_clear_ctidp;	/* clone thread id ptr */
+	void	*br_set_ctidp;		/* clone thread id ptr */
+	void	*br_robust_list;	/* robust lock list, if any */
+
+	/*
+	 * The following struct is used by some system calls to pass extra
+	 * flags into the kernel without impinging on the namespace for
+	 * illumos.
+	 */
+	void	*br_scall_args;
+	int	br_args_size; /* size in bytes of br_scall_args */
+
+	boolean_t br_waitid_emulate;
+	int br_waitid_flags;
+
+	lx_ptrace_flags_t br_ptrace_flags; /* ptrace flags for this LWP */
+	lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */
+	lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */
+
+	lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */
+	lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */
+	lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */
+	list_node_t br_ptrace_linkage;	/* linkage for lxpa_tracees list */
+
+	ushort_t br_ptrace_whystop; 	/* stop reason, 0 for no stop */
+	ushort_t br_ptrace_whatstop;	/* stop sub-reason */
+
+	int32_t br_ptrace_stopsig;	/* stop signal, 0 for no signal */
+	/*
+	 * Track the last (native) signal number processed by a ptrace.
+	 * This allows the tracee to properly handle ignored signals after
+	 * the tracer has been notified and the tracee restarted.
+	 */
+	int32_t br_ptrace_donesig;
+	uintptr_t br_ptrace_stopucp;	/* usermode ucontext_t pointer */
+
+	uint_t	br_ptrace_event;
+	ulong_t	br_ptrace_eventmsg;
+
+	int	br_syscall_num;		/* current system call number */
+	boolean_t br_syscall_restart;	/* should restart on EINTR */
+
+	/*
+	 * Store the LX_STACK_MODE for this LWP, and the current extent of the
+	 * native (emulation) stack.  This is similar, in principle, to the
+	 * sigaltstack mechanism for signal handling.  We also use this mode
+	 * flag to determine how to process system calls from this LWP.
+	 */
+	lx_stack_mode_t	br_stack_mode;
+	uintptr_t br_ntv_stack;
+	uintptr_t br_ntv_stack_current;
+
+	/*
+	 * If this pid is set, we return it with getpid().  This allows the
+	 * thunking server to interpose on the pid returned to the Linux
+	 * syslog software.
+	 */
+	pid_t	br_lx_thunk_pid;
+
+	/*
+	 * If strict mode is enabled (via LX_STRICT in the environment), any
+	 * call to lx_unsupported() will set this boolean to B_TRUE.  This will
+	 * cause us to drop SIGSYS on the LWP as it attempts to return to
+	 * usermode.
+	 */
+	boolean_t br_strict_failure;
+
+	/*
+	 * Some syscalls emulated in-kernel still call back out to the
+	 * userspace emulation for certain functions.  When that is the case,
+	 * the syscall_return logic must be bypassed at the end of the
+	 * in-kernel syscall code.  The NORMALRETURN and JUSTRETURN constants
+	 * are used to choose the behavior.
+	 */
+	char br_eosys;
+
+	/*
+	 * Hold a pre-allocated lx_pid structure to be used during lx_initlwp.
+	 */
+	struct lx_pid *br_lpid;
+
+	/*
+	 * ID of the cgroup this thread belongs to.
+	 */
+	uint_t br_cgroupid;
+};
+
+/*
+ * Upper limit on br_args_size, low because this value can persist until
+ * overridden with another value, and the size is given from userland.
+ */
+#define	LX_BR_ARGS_SIZE_MAX	(1024)
+
+/*
+ * brand specific data
+ *
+ * We currently only support a single cgroup mount in an lx zone so we only have
+ * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever
+ * enhanced to support different mounts with different subsystem controllers.
+ */
+typedef struct lx_zone_data {
+	kmutex_t lxzd_lock;			/* protects all members */
+	char lxzd_kernel_release[LX_KERN_RELEASE_MAX];
+	char lxzd_kernel_version[LX_KERN_VERSION_MAX];
+	ksocket_t lxzd_ioctl_sock;
+	char lxzd_bootid[LX_BOOTID_LEN];	/* procfs boot_id */
+	vfs_t *lxzd_cgroup;			/* cgroup for this zone */
+	list_t *lxzd_vdisks;			/* virtual disks (zvols) */
+	dev_t lxzd_zfs_dev;			/* major num for zfs */
+} lx_zone_data_t;
+
+#define	BR_CPU_BOUND	0x0001
+
+#define	ttolxlwp(t)	((struct lx_lwp_data *)ttolwpbrand(t))
+#define	lwptolxlwp(l)	((struct lx_lwp_data *)lwptolwpbrand(l))
+#define	ttolxproc(t)	\
+	(((t)->t_procp->p_brand == &lx_brand) ? \
+	(struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL)
+#define	ptolxproc(p)	\
+	(((p)->p_brand == &lx_brand) ? \
+	(struct lx_proc_data *)(p)->p_brand_data : NULL)
+#define	ztolxzd(z)		\
+	(((z)->zone_brand == &lx_brand) ?  \
+	(lx_zone_data_t *)(z)->zone_brand_data : NULL)
+
+/* Macro for converting to system call arguments. */
+#define	LX_ARGS(scall) ((struct lx_##scall##_args *)\
+	(ttolxlwp(curthread)->br_scall_args))
+
+typedef enum lx_virt_disk_type {
+	LXVD_NONE,
+	LXVD_ZFS_DS,
+	LXVD_ZVOL
+} lx_virt_disk_type_t;
+
+typedef struct lx_virt_disk {
+	list_node_t		lxvd_link;
+	char			lxvd_name[MAXNAMELEN];
+	lx_virt_disk_type_t	lxvd_type;
+	dev_t			lxvd_emul_dev;
+	dev_t			lxvd_real_dev;
+	uint64_t		lxvd_volsize;
+	uint64_t		lxvd_blksize;
+	char			lxvd_real_name[MAXPATHLEN];
+} lx_virt_disk_t;
+
+/*
+ * Determine the upper bound on the system call number:
+ */
+#if defined(_LP64)
+#define	LX_MAX_SYSCALL(lwp)						\
+	((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ?			\
+	    lx_nsysent64 : lx_nsysent32)
+#else
+#define	LX_MAX_SYSCALL(lwp)	lx_nsysent32
+#endif
+
+extern int lx_kern_release_cmp(zone_t *, const char *);
+
+extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t);
+extern void lx_divert(klwp_t *, uintptr_t);
+extern int lx_runexe(klwp_t *, void *);
+extern void lx_switch_to_native(klwp_t *);
+
+extern int lx_syscall_enter(void);
+extern int lx_syscall_return(klwp_t *, int, long);
+
+extern int lx_syscall_fast_enter(void);
+
+extern void lx_trace_sysenter(int, uintptr_t *);
+extern void lx_trace_sysreturn(int, long);
+
+extern void lx_emulate_user(klwp_t *, int, uintptr_t *);
+#if defined(_SYSCALL32_IMPL)
+extern void lx_emulate_user32(klwp_t *, int, uintptr_t *);
+#endif
+
+extern int lx_debug;
+#define	lx_print	if (lx_debug) printf
+
+extern void lx_pid_assign(kthread_t *, struct lx_pid *);
+extern void lx_pid_reassign(kthread_t *);
+extern void lx_pid_rele(pid_t, id_t);
+extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *);
+extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *);
+extern void lx_pid_init(void);
+extern void lx_pid_fini(void);
+
+/*
+ * In-Kernel Linux System Call Description.
+ */
+typedef struct lx_sysent {
+	char	*sy_name;
+	long	(*sy_callc)();
+	char	sy_flags;
+	char	sy_narg;
+} lx_sysent_t;
+
+#if defined(_LP64)
+extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1];
+extern int lx_nsysent64;
+#endif
+extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1];
+extern int lx_nsysent32;
+
+#endif	/* _KERNEL */
+#endif /* _ASM */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_BRAND_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h
new file mode 100644
index 0000000000..f82c6b867d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h
@@ -0,0 +1,161 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_LX_FCNTL_H
+#define	_SYS_LX_FCNTL_H
+
+#include <sys/vnode.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Lx open/fcntl flags
+ */
+#define	LX_O_RDONLY		00
+#define	LX_O_WRONLY		01
+#define	LX_O_RDWR		02
+#define	LX_O_ACCMODE		(LX_O_RDONLY | LX_O_WRONLY | LX_O_RDWR)
+#define	LX_O_CREAT		0100
+#define	LX_O_EXCL		0200
+#define	LX_O_NOCTTY		0400
+#define	LX_O_TRUNC		01000
+#define	LX_O_APPEND		02000
+#define	LX_O_NONBLOCK		04000
+#define	LX_O_NDELAY		LX_O_NONBLOCK
+#define	LX_O_SYNC		010000
+#define	LX_O_FSYNC		LX_O_SYNC
+#define	LX_O_ASYNC		020000
+#define	LX_O_DIRECT		040000
+#define	LX_O_LARGEFILE		0100000
+#define	LX_O_DIRECTORY		0200000
+#define	LX_O_NOFOLLOW		0400000
+#define	LX_O_CLOEXEC		02000000
+#define	LX_O_PATH		010000000
+
+#define	LX_F_DUPFD		0
+#define	LX_F_GETFD		1
+#define	LX_F_SETFD		2
+#define	LX_F_GETFL		3
+#define	LX_F_SETFL		4
+#define	LX_F_GETLK		5
+#define	LX_F_SETLK		6
+#define	LX_F_SETLKW		7
+#define	LX_F_SETOWN		8
+#define	LX_F_GETOWN		9
+#define	LX_F_SETSIG		10
+#define	LX_F_GETSIG		11
+
+#define	LX_F_GETLK64		12
+#define	LX_F_SETLK64		13
+#define	LX_F_SETLKW64		14
+
+#define	LX_F_SETLEASE		1024
+#define	LX_F_GETLEASE		1025
+#define	LX_F_NOTIFY		1026
+#define	LX_F_CANCELLK		1029
+#define	LX_F_DUPFD_CLOEXEC	1030
+#define	LX_F_SETPIPE_SZ		1031
+#define	LX_F_GETPIPE_SZ		1032
+
+#define	LX_F_RDLCK		0
+#define	LX_F_WRLCK		1
+#define	LX_F_UNLCK		2
+
+/* Test for emulated O_PATH setting in file_t flags */
+#define	LX_IS_O_PATH(f)		(((f)->f_flag & (FREAD|FWRITE)) == 0)
+
+extern int lx_vp_at(int, char *, vnode_t **, int);
+
+/*
+ * Lx flock codes.
+ */
+#define	LX_NAME_MAX		255
+#define	LX_LOCK_SH		1	/* shared */
+#define	LX_LOCK_EX		2	/* exclusive */
+#define	LX_LOCK_NB		4	/* non-blocking */
+#define	LX_LOCK_UN		8	/* unlock */
+
+/*
+ * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value.
+ * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by
+ * faccessat.
+ */
+#define	LX_AT_FDCWD		(-100)
+#define	LX_AT_SYMLINK_NOFOLLOW	0x100
+#define	LX_AT_REMOVEDIR		0x200
+#define	LX_AT_EACCESS		0x200
+#define	LX_AT_SYMLINK_FOLLOW	0x400
+#define	LX_AT_NO_AUTOMOUNT	0x800
+#define	LX_AT_EMPTY_PATH	0x1000
+
+typedef struct lx_flock {
+	short		l_type;
+	short		l_whence;
+	long		l_start;
+	long		l_len;
+	int		l_pid;
+} lx_flock_t;
+
+typedef struct lx_flock64 {
+	short		l_type;
+	short		l_whence;
+	long long	l_start;
+	long long	l_len;
+	int		l_pid;
+} lx_flock64_t;
+
+#if defined(_KERNEL)
+
+/*
+ * 64-bit kernel view of 32-bit usermode structs.
+ */
+#pragma pack(4)
+typedef struct lx_flock32 {
+	int16_t		l_type;
+	int16_t		l_whence;
+	int32_t		l_start;
+	int32_t		l_len;
+	int32_t		l_pid;
+} lx_flock32_t;
+
+typedef struct lx_flock64_32 {
+	int16_t		l_type;
+	int16_t		l_whence;
+	int64_t		l_start;
+	int64_t		l_len;
+	int32_t		l_pid;
+} lx_flock64_32_t;
+#pragma pack()
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LX_FCNTL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h
new file mode 100644
index 0000000000..a400b3bd83
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_LX_FUTEX_H
+#define	_SYS_LX_FUTEX_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	FUTEX_WAIT		0
+#define	FUTEX_WAKE		1
+#define	FUTEX_FD		2
+#define	FUTEX_REQUEUE		3
+#define	FUTEX_CMP_REQUEUE	4
+#define	FUTEX_WAKE_OP		5
+#define	FUTEX_LOCK_PI		6
+#define	FUTEX_UNLOCK_PI		7
+#define	FUTEX_TRYLOCK_PI	8
+#define	FUTEX_WAIT_BITSET	9
+#define	FUTEX_WAKE_BITSET	10
+#define	FUTEX_WAIT_REQUEUE_PI	11
+#define	FUTEX_CMP_REQUEUE_PI	12
+#define	FUTEX_MAX_CMD		FUTEX_CMP_REQUEUE_PI
+
+/*
+ * Flags that can be OR'd into a futex operation.
+ */
+#define	FUTEX_CMD_MASK		0x007f
+#define	FUTEX_PRIVATE_FLAG	0x0080
+#define	FUTEX_CLOCK_REALTIME	0x0100
+
+#define	FUTEX_BITSET_MATCH_ANY	0xffffffff
+/*
+ * FUTEX_WAKE_OP operations
+ */
+#define	FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
+#define	FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
+#define	FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
+#define	FUTEX_OP_ANDN		3	/* *(int *)UADDR2 &= ~OPARG; */
+#define	FUTEX_OP_XOR		4	/* *(int *)UADDR2 ^= OPARG; */
+
+/*
+ * FUTEX_WAKE_OP comparison operations
+ */
+#define	FUTEX_OP_CMP_EQ		0	/* if (oldval == CMPARG) wake */
+#define	FUTEX_OP_CMP_NE		1	/* if (oldval != CMPARG) wake */
+#define	FUTEX_OP_CMP_LT		2	/* if (oldval < CMPARG) wake */
+#define	FUTEX_OP_CMP_LE		3	/* if (oldval <= CMPARG) wake */
+#define	FUTEX_OP_CMP_GT		4	/* if (oldval > CMPARG) wake */
+#define	FUTEX_OP_CMP_GE		5	/* if (oldval >= CMPARG) wake */
+
+/*
+ * The encoding of the FUTEX_WAKE_OP operation in 32 bits:
+ *
+ *	+--+-- - --+-- - --+-- - --+-- - --+
+ *	|S |OP     |CMP    |OPARG  |CMPARG |
+ *	+--+-- - --+-- - --+-- - --+-- - --+
+ *	|31|30 - 28|27 - 24|23 - 12|11 -  0|
+ *
+ * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG.
+ * (Yes, this whole thing is entirely absurd -- see the block comment in
+ * lx_futex.c for an explanation of this nonsense.)  Macros to extract the
+ * various components from the operation, given the above encoding:
+ */
+#define	FUTEX_OP_OP(x)		(((x) >> 28) & 7)
+#define	FUTEX_OP_CMP(x)		(((x) >> 24) & 15)
+#define	FUTEX_OP_OPARG(x)	(((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \
+				((((x) << 8) >> 20)))
+#define	FUTEX_OP_CMPARG(x)	(((x) << 20) >> 20)
+
+#ifdef _KERNEL
+
+#define	FUTEX_WAITERS			0x80000000
+#define	FUTEX_OWNER_DIED		0x40000000
+#define	FUTEX_TID_MASK			0x3fffffff
+
+#define	FUTEX_ROBUST_LOCK_PI		1
+#define	FUTEX_ROBUST_LIST_LIMIT		2048
+
+extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout,
+    uintptr_t addr2, int val2);
+extern void lx_futex_init(void);
+extern int lx_futex_fini(void);
+extern long lx_set_robust_list(void *listp, size_t len);
+extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp);
+extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid);
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LX_FUTEX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h
new file mode 100644
index 0000000000..03b9d43038
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef	_LX_IMPL_H
+#define	_LX_IMPL_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t,
+    ulong_t, ulong_t);
+
+
+extern lx_systrace_f *lx_systrace_entry_ptr;
+extern lx_systrace_f *lx_systrace_return_ptr;
+
+extern void lx_brand_systrace_enable(void);
+extern void lx_brand_systrace_disable(void);
+
+extern void lx_unsupported(char *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_IMPL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
new file mode 100644
index 0000000000..825933e86c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_LINUX_LDT_H
+#define	_SYS_LINUX_LDT_H
+
+#include <sys/segments.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct ldt_info {
+	uint_t	entry_number;
+	uint_t	base_addr;
+	uint_t	limit;
+	uint_t	seg_32bit:1,
+		contents:2,
+		read_exec_only:1,
+		limit_in_pages:1,
+		seg_not_present:1,
+		useable:1;
+};
+
+#define	LDT_INFO_EMPTY(info)						\
+	((info)->base_addr == 0 && (info)->limit == 0 &&		\
+	(info)->contents == 0 && (info)->read_exec_only == 1 &&		\
+	(info)->seg_32bit == 0 && (info)->limit_in_pages == 0 &&	\
+	(info)->seg_not_present == 1 && (info)->useable == 0)
+
+#if defined(__amd64)	/* SETMODE carries no ';' -- callers supply it */
+#define	SETMODE(desc)	((desc)->usd_long = SDP_SHORT)
+#else
+#define	SETMODE(desc)
+#endif
+/* Fill *desc from an ldt_info; both arguments are evaluated repeatedly. */
+#define	LDT_INFO_TO_DESC(info, desc)	{				\
+	USEGD_SETBASE(desc, (info)->base_addr);				\
+	USEGD_SETLIMIT(desc, (info)->limit);				\
+	(desc)->usd_type = ((info)->contents << 2) |			\
+	    (((info)->read_exec_only ^ 1) << 1) | 0x10;			\
+	(desc)->usd_dpl = SEL_UPL;					\
+	(desc)->usd_p = (info)->seg_not_present ^ 1;			\
+	(desc)->usd_def32 = (info)->seg_32bit;				\
+	(desc)->usd_gran = (info)->limit_in_pages;			\
+	(desc)->usd_avl = (info)->useable;				\
+	SETMODE(desc);							\
+}
+
+#define	DESC_TO_LDT_INFO(desc, info)	{				\
+	bzero((info), sizeof (*(info)));				\
+	(info)->base_addr = USEGD_GETBASE(desc);			\
+	(info)->limit = USEGD_GETLIMIT(desc);				\
+	(info)->seg_not_present = (desc)->usd_p ^ 1;			\
+	(info)->contents = ((desc)->usd_type >> 2) & 3;			\
+	(info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1;	\
+	(info)->seg_32bit = (desc)->usd_def32;				\
+	(info)->limit_in_pages = (desc)->usd_gran;			\
+	(info)->useable = (desc)->usd_avl;				\
+}
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LINUX_LDT_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h
new file mode 100644
index 0000000000..7c1e50362c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h
@@ -0,0 +1,117 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS__LX_MISC_H
+#define	_SYS__LX_MISC_H
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <sys/siginfo.h>
+#include <sys/lx_brand.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+extern void lx_setrval(klwp_t *, int, int);
+extern void lx_exec();
+extern void lx_exitlwp(klwp_t *);
+extern void lx_freelwp(klwp_t *);
+extern void *lx_lwpdata_alloc(proc_t *);
+extern void lx_lwpdata_free(void *);
+extern void lx_initlwp(klwp_t *, void *);
+extern void lx_initlwp_post(klwp_t *);
+extern void lx_forklwp(klwp_t *, klwp_t *);
+
+extern void lx_set_gdt(int, user_desc_t *);
+extern void lx_clear_gdt(int);
+
+extern longlong_t lx_nosys();
+
+extern greg_t lx_fixsegreg(greg_t, model_t);
+extern uintptr_t lx_fsbase(klwp_t *, uintptr_t);
+extern void lx_exit_with_sig(proc_t *, sigqueue_t *);
+extern boolean_t lx_wait_filter(proc_t *, proc_t *);
+extern void lx_sigfd_translate(k_siginfo_t *);
+extern int stol_ksiginfo_copyout(k_siginfo_t *, void *);
+#if defined(_SYSCALL32_IMPL)
+extern int stol_ksiginfo32_copyout(k_siginfo_t *, void *);
+#endif
+extern void lx_read_argv_bounds(proc_t *p);
+
+typedef enum lx_regs_location {
+	LX_REG_LOC_UNAVAIL,
+	LX_REG_LOC_LWP,
+	LX_REG_LOC_UCP
+} lx_regs_location_t;
+
+extern lx_regs_location_t lx_regs_location(lx_lwp_data_t *, void **, boolean_t);
+
+
+typedef enum lx_if_action {
+	LX_IF_FROMNATIVE,
+	LX_IF_TONATIVE
+} lx_if_action_t;
+
+/* Linux ARP protocol hardware identifiers */
+#define	LX_ARPHRD_ETHER		1	/* Ethernet */
+#define	LX_ARPHRD_LOOPBACK	772	/* Loopback */
+#define	LX_ARPHRD_VOID		0xffff	/* Unknown */
+
+/* IPv6 address scope values used in /proc/net/if_inet6 */
+#define	LX_IPV6_ADDR_LOOPBACK	0x0010U
+#define	LX_IPV6_ADDR_LINKLOCAL	0x0020U
+#define	LX_IPV6_ADDR_SITELOCAL	0x0040U
+#define	LX_IPV6_ADDR_COMPATv4	0x0080U
+
+extern void lx_ifname_convert(char *, lx_if_action_t);
+extern void lx_ifflags_convert(uint64_t *, lx_if_action_t);
+extern unsigned int lx_ipv6_scope_convert(const in6_addr_t *);
+extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *,
+    int *);
+
+extern boolean_t lx_ptrace_stop(ushort_t);
+extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t);
+extern void lx_ptrace_init(void);
+extern void lx_ptrace_fini(void);
+extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *,
+    int *);
+extern void lx_ptrace_exit(proc_t *, klwp_t *);
+extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *);
+extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t);
+extern int lx_ptrace_set_clone_inherit(int, boolean_t);
+extern int lx_sigcld_repost(proc_t *, sigqueue_t *);
+extern int lx_ptrace_issig_stop(proc_t *, klwp_t *);
+extern boolean_t lx_ptrace_sig_ignorable(proc_t *, klwp_t *, int);
+
+extern int lx_helper_clone(int64_t *, int, void *, void *, void *);
+extern int lx_helper_setgroups(int, gid_t *);
+extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *);
+extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *);
+
+extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *);
+extern void lx_vsyscall_enter(proc_t *, klwp_t *, int);
+
+extern void lx_check_strict_failure(lx_lwp_data_t *);
+
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS__LX_MISC_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
new file mode 100644
index 0000000000..74bbc939a3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_PTM_LINUX_H
+#define	_SYS_PTM_LINUX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	LX_PTM_DRV		"lx_ptm"
+#define	LX_PTM_MINOR_NODE	"lx_ptmajor"
+
+#define	LX_PTM_DEV_TO_PTS(dev)	(getminor(dev) - 1)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_PTM_LINUX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_sched.h b/usr/src/uts/common/brand/lx/sys/lx_sched.h
new file mode 100644
index 0000000000..b0ae748f3c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_sched.h
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LINUX_SCHED_H
+#define	_SYS_LINUX_SCHED_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/procset.h>
+#include <sys/priocntl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Linux scheduler policies.
+ */
+#define	LX_SCHED_OTHER		0
+#define	LX_SCHED_FIFO		1
+#define	LX_SCHED_RR		2
+
+#define	LX_PRI_MAX		99
+
+typedef	int l_pid_t;
+
+struct lx_sched_param {
+	int	lx_sched_prio;
+};
+
+extern int sched_setprocset(procset_t *, l_pid_t);
+extern long do_priocntlsys(int, procset_t *, void *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LINUX_SCHED_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
new file mode 100644
index 0000000000..9f606b614f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LX_SIGINFO_H
+#define	_LX_SIGINFO_H
+
+#include <sys/lx_types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_siginfo_t lsi_code values
+ *
+ *	LX_SI_ASYNCNL:	Sent by asynch name lookup completion
+ *	LX_SI_DETHREAD:	Sent by execve() killing subsidiary threads
+ *	LX_SI_SIGIO:	Sent by queued SIGIO
+ *	LX_SI_ASYNCIO:	Sent by asynchronous I/O completion
+ *	LX_SI_MESGQ:	Sent by real time message queue state change
+ *	LX_SI_TIMER:	Sent by timer expiration
+ *	LX_SI_QUEUE:	Sent by sigqueue
+ *	LX_SI_USER:	Sent by kill, sigsend, raise, etc.
+ *	LX_SI_KERNEL:	Sent by kernel
+ *	LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to
+ *	    illumos errors, if there is no translation available, this value
+ *	    should be used. This value should have no meaning as an si_code in
+ *	    illumos or Linux.
+ *
+ * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by
+ * BrandZ.
+ */
+#define	LX_SI_CODE_NOT_EXIST	(-61)
+#define	LX_SI_ASYNCNL		(-60)
+#define	LX_SI_DETHREAD		(-7)
+#define	LX_SI_TKILL		(-6)	/* Sent by tkill()/tgkill() */
+#define	LX_SI_SIGIO		(-5)
+#define	LX_SI_ASYNCIO		(-4)
+#define	LX_SI_MESGQ		(-3)
+#define	LX_SI_TIMER		(-2)
+#define	LX_SI_QUEUE		(-1)
+#define	LX_SI_USER		(0)
+#define	LX_SI_KERNEL		(0x80)
+
+#define	LX_SI_MAX_SIZE		128
+#define	LX_SI_PAD_SIZE_32	((LX_SI_MAX_SIZE / sizeof (int)) - 3)
+#define	LX_SI_PAD_SIZE_64	((LX_SI_MAX_SIZE / sizeof (int)) - 4)
+
+#if defined(_LP64)
+/*
+ * Because of the odd number (3) of ints before the union, we need to account
+ * for the smaller padding needed on x64 due to the union being offset to an 8
+ * byte boundary.
+ */
+#define	LX_SI_PAD_SIZE		LX_SI_PAD_SIZE_64
+#else
+#define	LX_SI_PAD_SIZE		LX_SI_PAD_SIZE_32
+#endif
+
+typedef struct lx_siginfo {
+	int lsi_signo;
+	int lsi_errno;
+	int lsi_code;
+	union {
+		int _pad[LX_SI_PAD_SIZE];
+
+		struct {
+			pid_t _pid;
+			lx_uid16_t _uid;
+		} _kill;
+
+		struct {
+			uint_t _timer1;
+			uint_t _timer2;
+		} _timer;
+
+		struct {
+			pid_t _pid;
+			lx_uid16_t _uid;
+			union sigval _sigval;
+		} _rt;
+
+		struct {
+			pid_t _pid;
+			lx_uid16_t _uid;
+			int _status;
+			clock_t _utime;
+			clock_t _stime;
+		} _sigchld;
+
+		struct {
+			void *_addr;
+		} _sigfault;
+
+		struct {
+			int _band;
+			int _fd;
+		} _sigpoll;
+	} _sifields;
+} lx_siginfo_t;
+
+#if defined(_KERNEL) && defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit "lx_siginfo_t" object.
+ */
+#pragma pack(4)
+typedef struct lx_siginfo32 {
+	int lsi_signo;
+	int lsi_errno;
+	int lsi_code;
+	union {
+		int _pad[LX_SI_PAD_SIZE_32];
+
+		struct {
+			pid32_t _pid;
+			lx_uid16_t _uid;
+		} _kill;
+
+		struct {
+			uint_t _timer1;
+			uint_t _timer2;
+		} _timer;
+
+		struct {
+			pid32_t _pid;
+			lx_uid16_t _uid;
+			union sigval32 _sigval;
+		} _rt;
+
+		struct {
+			pid32_t _pid;
+			lx_uid16_t _uid;
+			int _status;
+			clock32_t _utime;
+			clock32_t _stime;
+		} _sigchld;
+
+		struct {
+			caddr32_t _addr;
+		} _sigfault;
+
+		struct {
+			int _band;
+			int _fd;
+		} _sigpoll;
+	} _sifields;
+} lx_siginfo32_t;
+#pragma pack()
+#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */
+
+#define	lsi_pid		_sifields._kill._pid
+#define	lsi_uid		_sifields._kill._uid
+#define	lsi_status	_sifields._sigchld._status
+#define	lsi_utime	_sifields._sigchld._utime
+#define	lsi_stime	_sifields._sigchld._stime
+#define	lsi_value	_sifields._rt._sigval
+#define	lsi_int		_sifields._rt._sigval.sivalx_int
+#define	lsi_ptr		_sifields._rt._sigval.sivalx_ptr
+#define	lsi_addr	_sifields._sigfault._addr
+#define	lsi_band	_sifields._sigpoll._band
+#define	lsi_fd		_sifields._sigpoll._fd
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_SIGINFO_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_signal.h b/usr/src/uts/common/brand/lx/sys/lx_signal.h
new file mode 100644
index 0000000000..552c36238b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_signal.h
@@ -0,0 +1,32 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LX_SIGNAL_H
+#define	_LX_SIGNAL_H
+
+#include <lx_signum.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern void lx_ltos_sigset(lx_sigset_t *, k_sigset_t *);
+extern void lx_stol_sigset(k_sigset_t *, lx_sigset_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_SIGNAL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_socket.h b/usr/src/uts/common/brand/lx/sys/lx_socket.h
new file mode 100644
index 0000000000..eb9826eebe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_socket.h
@@ -0,0 +1,434 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _SYS_LX_SOCKET_H
+#define	_SYS_LX_SOCKET_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Linux address family definitions
+ * Some of these are not supported
+ */
+#define	LX_AF_UNSPEC		0  /* Unspecified */
+#define	LX_AF_UNIX		1  /* local file/pipe name */
+#define	LX_AF_INET		2  /* IP protocol family */
+#define	LX_AF_AX25		3  /* Amateur Radio AX.25 */
+#define	LX_AF_IPX		4  /* Novell Internet Protocol */
+#define	LX_AF_APPLETALK		5  /* Appletalk */
+#define	LX_AF_NETROM		6  /* Amateur radio */
+#define	LX_AF_BRIDGE		7  /* Multiprotocol bridge */
+#define	LX_AF_ATMPVC		8  /* ATM PVCs */
+#define	LX_AF_X25		9  /* X.25 */
+#define	LX_AF_INET6		10 /* IPV 6 */
+#define	LX_AF_ROSE		11 /* Amateur Radio X.25 */
+#define	LX_AF_DECNET		12 /* DECnet */
+#define	LX_AF_NETBEUI		13 /* 802.2LLC */
+#define	LX_AF_SECURITY		14 /* Security callback */
+#define	LX_AF_KEY		15 /* key management */
+#define	LX_AF_ROUTE		16 /* Alias to emulate 4.4BSD */
+#define	LX_AF_NETLINK		LX_AF_ROUTE
+#define	LX_AF_PACKET		17 /* Packet family */
+#define	LX_AF_ASH		18 /* Ash ? */
+#define	LX_AF_ECONET		19 /* Acorn Econet */
+#define	LX_AF_ATMSVC		20 /* ATM SVCs */
+#define	LX_AF_SNA		22 /* Linux SNA */
+#define	LX_AF_IRDA		23 /* IRDA sockets */
+#define	LX_AF_PPPOX		24 /* PPPoX sockets */
+#define	LX_AF_WANPIPE		25 /* Wanpipe API sockets */
+#define	LX_AF_LLC		26
+/* gap in Linux defines for 27 and 28 */
+#define	LX_AF_CAN		29
+#define	LX_AF_TIPC		30
+#define	LX_AF_BLUETOOTH		31 /* Bluetooth sockets */
+#define	LX_AF_IUCV		32
+#define	LX_AF_RXRPC		33
+
+/* limit of AF mappings */
+#define	LX_AF_MAX		LX_AF_RXRPC
+
+#define	AF_NOTSUPPORTED		(-1)	/* negative: never a valid LX_AF_* */
+#define	AF_INVAL		(-2)	/* negative: never a valid LX_AF_* */
+
+/*
+ * Options for use with [gs]etsockopt at the SOL_SOCKET level.
+ */
+#define	LX_SOL_SOCKET				1
+
+#define	LX_SCM_RIGHTS				1
+#define	LX_SCM_CRED				2
+
+#define	LX_SO_DEBUG				1
+#define	LX_SO_REUSEADDR				2
+#define	LX_SO_TYPE				3
+#define	LX_SO_ERROR				4
+#define	LX_SO_DONTROUTE				5
+#define	LX_SO_BROADCAST				6
+#define	LX_SO_SNDBUF				7
+#define	LX_SO_RCVBUF				8
+#define	LX_SO_KEEPALIVE				9
+#define	LX_SO_OOBINLINE				10
+#define	LX_SO_NO_CHECK				11
+#define	LX_SO_PRIORITY				12
+#define	LX_SO_LINGER				13
+#define	LX_SO_BSDCOMPAT				14
+#define	LX_SO_REUSEPORT				15
+/*
+ * For Linux see unix(7) man page SO_PASSCRED description. For Illumos see
+ * socket.h(3HEAD) man page SO_RECVUCRED description.
+ */
+#define	LX_SO_PASSCRED				16
+#define	LX_SO_PEERCRED				17
+#define	LX_SO_RCVLOWAT				18
+#define	LX_SO_SNDLOWAT				19
+#define	LX_SO_RCVTIMEO				20
+#define	LX_SO_SNDTIMEO				21
+/* Security levels - as per NRL IPv6 - don't actually do anything */
+#define	LX_SO_SECURITY_AUTHENTICATION		22
+#define	LX_SO_SECURITY_ENCRYPTION_TRANSPORT	23
+#define	LX_SO_SECURITY_ENCRYPTION_NETWORK	24
+#define	LX_SO_BINDTODEVICE			25
+/* Socket filtering */
+#define	LX_SO_ATTACH_FILTER			26
+#define	LX_SO_DETACH_FILTER			27
+#define	LX_SO_PEERNAME				28
+#define	LX_SO_TIMESTAMP				29
+#define	LX_SCM_TIMESTAMP			LX_SO_TIMESTAMP
+#define	LX_SO_ACCEPTCONN			30
+
+#define	LX_SO_PEERSEC				31
+#define	LX_SO_SNDBUFFORCE			32
+#define	LX_SO_RCVBUFFORCE			33
+#define	LX_SO_PASSSEC				34
+#define	LX_SO_TIMESTAMPNS			35
+#define	LX_SCM_TIMESTAMPNS			LX_SO_TIMESTAMPNS
+#define	LX_SO_MARK				36
+#define	LX_SO_TIMESTAMPING			37
+#define	LX_SCM_TIMESTAMPING			LX_SO_TIMESTAMPING
+#define	LX_SO_PROTOCOL				38
+#define	LX_SO_DOMAIN				39
+#define	LX_SO_RXQ_OVFL				40
+#define	LX_SO_WIFI_STATUS			41
+#define	LX_SCM_WIFI_STATUS			LX_SO_WIFI_STATUS
+#define	LX_SO_PEEK_OFF				42
+#define	LX_SO_NOFCS				43
+#define	LX_SO_LOCK_FILTER			44
+#define	LX_SO_SELECT_ERR_QUEUE			45
+#define	LX_SO_BUSY_POLL				46
+#define	LX_SO_MAX_PACING_RATE			47
+#define	LX_SO_BPF_EXTENSIONS			48
+
+/*
+ * Options for use with [gs]etsockopt at the RAW level.
+ * IPPROTO_RAW
+ */
+#define	LX_ICMP_FILTER				1
+
+/*
+ * Options for use with [gs]etsockopt at the PACKET level.
+ * SOL_PACKET
+ */
+#define	LX_SOL_PACKET				263
+
+#define	LX_PACKET_ADD_MEMBERSHIP		1
+#define	LX_PACKET_DROP_MEMBERSHIP		2
+#define	LX_PACKET_RECV_OUTPUT			3
+#define	LX_PACKET_RX_RING			5
+#define	LX_PACKET_STATISTICS			6
+
+/*
+ * Options for use with [gs]etsockopt at the NETLINK level.
+ * SOL_NETLINK
+ */
+#define	LX_SOL_NETLINK				270
+
+/*
+ * Linux socket type definitions
+ */
+#define	LX_SOCK_STREAM		1	/* Connection-based byte streams */
+#define	LX_SOCK_DGRAM		2	/* Connectionless, datagram */
+#define	LX_SOCK_RAW		3	/* Raw protocol interface */
+#define	LX_SOCK_RDM		4	/* Reliably-delivered message */
+#define	LX_SOCK_SEQPACKET	5	/* Sequenced packet stream */
+#define	LX_SOCK_PACKET		10	/* Linux specific */
+#define	LX_SOCK_MAX		11
+
+/*
+ * The Linux socket type can be or-ed with other flags (e.g. SOCK_CLOEXEC).
+ */
+#define	LX_SOCK_TYPE_MASK	0xf
+
+/*
+ * Linux flags for socket, socketpair and accept4. These are or-ed into the
+ * socket type value. In the Linux net.h header these come from fcntl.h (note
+ * that they are in octal in the Linux header).
+ */
+#define	LX_SOCK_CLOEXEC		0x80000
+#define	LX_SOCK_NONBLOCK	0x800
+
+#define	SOCK_NOTSUPPORTED	(-1)	/* negative: never a valid LX_SOCK_* */
+#define	SOCK_INVAL		(-2)	/* negative: never a valid LX_SOCK_* */
+
+/*
+ * PF_PACKET protocol definitions.
+ */
+#define	LX_ETH_P_802_3	0x0001
+#define	LX_ETH_P_ALL	0x0003
+#define	LX_ETH_P_802_2	0x0004
+#define	LX_ETH_P_IP	0x0800
+#define	LX_ETH_P_ARP	0x0806
+#define	LX_ETH_P_IPV6	0x86DD
+
+/*
+ * IP Protocol levels. Some of these match the Illumos IPPROTO_* values.
+ */
+#define	LX_IPPROTO_IP		0
+#define	LX_IPPROTO_ICMP		1
+#define	LX_IPPROTO_IGMP		2
+#define	LX_IPPROTO_TCP		6
+#define	LX_IPPROTO_UDP		17
+#define	LX_IPPROTO_IPV6		41
+#define	LX_IPPROTO_ICMPV6	58
+#define	LX_IPPROTO_RAW		255
+
+/*
+ * Options for use with [gs]etsockopt at the IP level.
+ * IPPROTO_IP
+ */
+#define	LX_IP_TOS		1
+#define	LX_IP_TTL		2
+#define	LX_IP_HDRINCL		3
+#define	LX_IP_OPTIONS		4
+#define	LX_IP_ROUTER_ALERT	5
+#define	LX_IP_RECVOPTS		6
+#define	LX_IP_RETOPTS		7
+#define	LX_IP_PKTINFO		8
+#define	LX_IP_PKTOPTIONS	9
+#define	LX_IP_MTU_DISCOVER	10
+#define	LX_IP_RECVERR		11
+#define	LX_IP_RECVTTL		12
+#define	LX_IP_RECVTOS		13
+#define	LX_IP_MTU		14
+#define	LX_IP_FREEBIND		15
+#define	LX_IP_IPSEC_POLICY	16
+#define	LX_IP_XFRM_POLICY	17
+#define	LX_IP_PASSSEC		18
+#define	LX_IP_TRANSPARENT	19
+#define	LX_IP_ORIGDSTADDR	20
+#define	LX_IP_MINTTL		21
+#define	LX_IP_NODEFRAG		22
+/* Linux apparently leaves a gap here */
+#define	LX_IP_MULTICAST_IF	32
+#define	LX_IP_MULTICAST_TTL	33
+#define	LX_IP_MULTICAST_LOOP	34
+#define	LX_IP_ADD_MEMBERSHIP	35
+#define	LX_IP_DROP_MEMBERSHIP	36
+#define	LX_IP_UNBLOCK_SOURC	37	/* sic: Linux IP_UNBLOCK_SOURCE */
+#define	LX_IP_BLOCK_SOURCE	38
+#define	LX_IP_ADD_SOURCE_MEMBERSHIP 39
+#define	LX_IP_DROP_SOURCE_MEMBERSHIP 40
+#define	LX_IP_MSFILTER		41
+#define	LX_MCAST_JOIN_GROUP	42
+#define	LX_MCAST_BLOCK_SOURCE	43
+#define	LX_MCAST_UNBLOCK_SOURCE	44
+#define	LX_MCAST_LEAVE_GROUP	45
+#define	LX_MCAST_JOIN_SOURCE_GROUP 46
+#define	LX_MCAST_LEAVE_SOURCE_GROUP 47
+#define	LX_MCAST_MSFILTER	48
+#define	LX_IP_MULTICAST_ALL	49
+#define	LX_IP_UNICAST_IF	50
+
+/*
+ * LX_IP_MTU_DISCOVER values
+ */
+#define	LX_IP_PMTUDISC_DONT		0
+#define	LX_IP_PMTUDISC_WANT		1
+#define	LX_IP_PMTUDISC_DO		2
+#define	LX_IP_PMTUDISC_PROBE		3
+#define	LX_IP_PMTUDISC_INTERFACE	4
+#define	LX_IP_PMTUDISC_OMIT		5
+
+/*
+ * Options for use with [gs]etsockopt at the IPv6 level.
+ * IPPROTO_IPV6
+ */
+
+#define	LX_IPV6_ADDRFORM	1
+#define	LX_IPV6_2292PKTINFO	2
+#define	LX_IPV6_2292HOPOPTS	3
+#define	LX_IPV6_2292DSTOPTS	4
+#define	LX_IPV6_2292RTHDR	5
+#define	LX_IPV6_2292PKTOPTIONS	6
+#define	LX_IPV6_CHECKSUM	7
+#define	LX_IPV6_2292HOPLIMIT	8
+#define	LX_IPV6_NEXTHOP		9
+#define	LX_IPV6_AUTHHDR		10
+#define	LX_IPV6_UNICAST_HOPS	16
+#define	LX_IPV6_MULTICAST_IF	17
+#define	LX_IPV6_MULTICAST_HOPS	18
+#define	LX_IPV6_MULTICAST_LOOP	19
+#define	LX_IPV6_JOIN_GROUP	20
+#define	LX_IPV6_LEAVE_GROUP	21
+#define	LX_IPV6_ROUTER_ALERT	22
+#define	LX_IPV6_MTU_DISCOVER	23
+#define	LX_IPV6_MTU		24
+#define	LX_IPV6_RECVERR		25
+#define	LX_IPV6_V6ONLY		26
+#define	LX_IPV6_JOIN_ANYCAST	27
+#define	LX_IPV6_LEAVE_ANYCAST	28
+#define	LX_IPV6_IPSEC_POLICY	34
+#define	LX_IPV6_XFRM_POLICY	35
+
+#define	LX_IPV6_RECVPKTINFO	49
+#define	LX_IPV6_PKTINFO		50
+#define	LX_IPV6_RECVHOPLIMIT	51
+#define	LX_IPV6_HOPLIMIT	52
+#define	LX_IPV6_RECVHOPOPTS	53
+#define	LX_IPV6_HOPOPTS		54
+#define	LX_IPV6_RTHDRDSTOPTS	55
+#define	LX_IPV6_RECVRTHDR	56
+#define	LX_IPV6_RTHDR		57
+#define	LX_IPV6_RECVDSTOPTS	58
+#define	LX_IPV6_DSTOPTS		59
+#define	LX_IPV6_RECVTCLASS	66
+#define	LX_IPV6_TCLASS		67
+
+/*
+ * Options for use with [gs]etsockopt at the ICMPv6 level.
+ * IPPROTO_ICMPV6
+ */
+
+#define	LX_ICMP6_FILTER		1
+
+/*
+ * Options for use with [gs]etsockopt at the TCP level.
+ * IPPROTO_TCP
+ */
+#define	LX_TCP_NODELAY		1  /* Don't delay send to coalesce packets  */
+#define	LX_TCP_MAXSEG		2  /* Set maximum segment size  */
+#define	LX_TCP_CORK		3  /* Control sending of partial frames  */
+#define	LX_TCP_KEEPIDLE		4  /* Start keepalives after this period */
+#define	LX_TCP_KEEPINTVL	5  /* Interval between keepalives */
+#define	LX_TCP_KEEPCNT		6  /* Number of keepalives before death */
+#define	LX_TCP_SYNCNT		7  /* Number of SYN retransmits */
+#define	LX_TCP_LINGER2		8  /* Life time of orphaned FIN-WAIT-2 state */
+#define	LX_TCP_DEFER_ACCEPT	9  /* Wake up listener only when data arrive */
+#define	LX_TCP_WINDOW_CLAMP	10 /* Bound advertised window */
+#define	LX_TCP_INFO		11 /* Information about this connection. */
+#define	LX_TCP_QUICKACK		12 /* Block/reenable quick ACKs.  */
+#define	LX_TCP_CONGESTION	13 /* Congestion control algorithm */
+#define	LX_TCP_MD5SIG		14 /* TCP MD5 Signature (RFC2385) */
+#define	LX_TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts on thin streams */
+#define	LX_TCP_THIN_DUPACK	17 /* Fast retrans. after 1 dupack */
+#define	LX_TCP_USER_TIMEOUT	18 /* How long for loss retry before timeout */
+#define	LX_TCP_REPAIR		19 /* TCP socket under repair */
+#define	LX_TCP_REPAIR_QUEUE	20
+#define	LX_TCP_QUEUE_SEQ	21
+#define	LX_TCP_REPAIR_OPTIONS	22
+#define	LX_TCP_FASTOPEN		23 /* Enable FastOpen on listeners */
+#define	LX_TCP_TIMESTAMP	24
+#define	LX_TCP_NOTSENT_LOWAT	25 /* limit number of unsent bytes */
+
+/*
+ * Options for use with [gs]etsockopt at the IGMP level.
+ * IPPROTO_IGMP
+ */
+#define	LX_IGMP_MINLEN				8
+#define	LX_IGMP_MAX_HOST_REPORT_DELAY		10
+#define	LX_IGMP_HOST_MEMBERSHIP_QUERY		0x11
+#define	LX_IGMP_HOST_MEMBERSHIP_REPORT		0x12
+#define	LX_IGMP_DVMRP				0x13
+#define	LX_IGMP_PIM				0x14
+#define	LX_IGMP_TRACE				0x15
+#define	LX_IGMP_HOST_NEW_MEMBERSHIP_REPORT	0x16
+#define	LX_IGMP_HOST_LEAVE_MESSAGE		0x17
+#define	LX_IGMP_MTRACE_RESP			0x1e
+#define	LX_IGMP_MTRACE				0x1f
+
+/*
+ * Linux socket flags for use with recv(2)/send(2)/recvmsg(2)/sendmsg(2)
+ */
+#define	LX_MSG_OOB		0x1
+#define	LX_MSG_PEEK		0x2
+#define	LX_MSG_DONTROUTE	0x4
+#define	LX_MSG_CTRUNC		0x8
+#define	LX_MSG_PROXY		0x10
+#define	LX_MSG_TRUNC		0x20
+#define	LX_MSG_DONTWAIT		0x40
+#define	LX_MSG_EOR		0x80
+#define	LX_MSG_WAITALL		0x100
+#define	LX_MSG_FIN		0x200
+#define	LX_MSG_SYN		0x400
+#define	LX_MSG_CONFIRM		0x800
+#define	LX_MSG_RST		0x1000
+#define	LX_MSG_ERRQUEUE		0x2000
+#define	LX_MSG_NOSIGNAL		0x4000
+#define	LX_MSG_MORE		0x8000
+#define	LX_MSG_WAITFORONE	0x10000
+#define	LX_MSG_FASTOPEN		0x20000000
+#define	LX_MSG_CMSG_CLOEXEC	0x40000000
+
+typedef struct lx_msghdr {
+	void		*msg_name;	/* optional address */
+	socklen_t	msg_namelen;	/* size of address */
+	struct iovec	*msg_iov;	/* scatter/gather array */
+	size_t		msg_iovlen;	/* # elements in msg_iov */
+	void		*msg_control;	/* ancillary data */
+	size_t		msg_controllen;	/* ancillary data buffer len */
+	int		msg_flags;	/* flags on received message */
+} lx_msghdr_t;
+
+
+#if defined(_LP64)
+
+typedef struct lx_msghdr32 {
+	caddr32_t	msg_name;	/* optional address */
+	uint32_t	msg_namelen;	/* size of address */
+	caddr32_t	msg_iov;	/* scatter/gather array */
+	int32_t		msg_iovlen;	/* # elements in msg_iov */
+	caddr32_t	msg_control;	/* ancillary data */
+	uint32_t	msg_controllen;	/* ancillary data buffer len */
+	int32_t		msg_flags;	/* flags on received message */
+} lx_msghdr32_t;
+
+#endif
+
+typedef struct lx_sockaddr_in6 {
+	sa_family_t	sin6_family;
+	in_port_t	sin6_port;
+	uint32_t	sin6_flowinfo;
+	struct in6_addr	sin6_addr;
+	uint32_t	sin6_scope_id;  /* Depends on scope of sin6_addr */
+	/* one 32-bit field shorter than illumos */
+} lx_sockaddr_in6_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LX_SOCKET_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
new file mode 100644
index 0000000000..64084b77f1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _SYS_LINUX_SYSCALLS_H
+#define	_SYS_LINUX_SYSCALLS_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef	_KERNEL
+
+extern long lx_accept();
+extern long lx_accept4();
+extern long lx_access();
+extern long lx_arch_prctl();
+extern long lx_bind();
+extern long lx_brk();
+extern long lx_chmod();
+extern long lx_chown();
+extern long lx_chown16();
+extern long lx_clock_getres();
+extern long lx_clock_gettime();
+extern long lx_clock_settime();
+extern long lx_close();
+extern long lx_connect();
+extern long lx_epoll_create();
+extern long lx_epoll_create1();
+extern long lx_epoll_ctl();
+extern long lx_epoll_pwait();
+extern long lx_epoll_wait();
+extern long lx_faccessat();
+extern long lx_fallocate();
+extern long lx_fallocate32();
+extern long lx_fchmod();
+extern long lx_fchmodat();
+extern long lx_fchown();
+extern long lx_fchown16();
+extern long lx_fchownat();
+extern long lx_fcntl();
+extern long lx_fcntl64();
+extern long lx_fgetxattr();
+extern long lx_flistxattr();
+extern long lx_fremovexattr();
+extern long lx_fsetxattr();
+extern long lx_fstat32();
+extern long lx_fstat64();
+extern long lx_fstatat64();
+extern long lx_futex();
+extern long lx_get_robust_list();
+extern long lx_get_thread_area();
+extern long lx_getcpu();
+extern long lx_getcwd();
+extern long lx_getdents_32();
+extern long lx_getdents_64();
+extern long lx_getdents64();
+extern long lx_getpeername();
+extern long lx_getsockname();
+extern long lx_getpid();
+extern long lx_getppid();
+extern long lx_getrandom();
+extern long lx_getrlimit();
+extern long lx_getsockopt();
+extern long lx_gettid();
+extern long lx_gettimeofday();
+extern long lx_getxattr();
+extern long lx_io_setup();
+extern long lx_ioctl();
+extern long lx_ioprio_get();
+extern long lx_ioprio_set();
+extern long lx_kill();
+extern long lx_lchown();
+extern long lx_lchown16();
+extern long lx_lgetxattr();
+extern long lx_link();
+extern long lx_linkat();
+extern long lx_llistxattr();
+extern long lx_lremovexattr();
+extern long lx_lsetxattr();
+extern long lx_lstat32();
+extern long lx_lstat64();
+extern long lx_listxattr();
+extern long lx_mkdir();
+extern long lx_mkdirat();
+extern long lx_modify_ldt();
+extern long lx_nanosleep();
+extern long lx_oldgetrlimit();
+extern long lx_open();
+extern long lx_openat();
+extern long lx_personality();
+extern long lx_pipe();
+extern long lx_pipe2();
+extern long lx_poll();
+extern long lx_ppoll();
+extern long lx_pread();
+extern long lx_pread32();
+extern long lx_preadv();
+extern long lx_preadv32();
+extern long lx_prctl();
+extern long lx_prlimit64();
+extern long lx_pselect();
+extern long lx_ptrace();
+extern long lx_pwrite();
+extern long lx_pwrite32();
+extern long lx_pwritev();
+extern long lx_pwritev32();
+extern long lx_read();
+extern long lx_readv();
+extern long lx_recv();
+extern long lx_recvmsg();
+extern long lx_recvfrom();
+extern long lx_sched_getparam();
+extern long lx_sched_getscheduler();
+extern long lx_sched_rr_get_interval();
+extern long lx_sched_setparam();
+extern long lx_sched_setscheduler();
+extern long lx_sched_yield();
+extern long lx_select();
+extern long lx_send();
+extern long lx_sendmsg();
+extern long lx_sendto();
+extern long lx_set_robust_list();
+extern long lx_set_thread_area();
+extern long lx_set_tid_address();
+extern long lx_setresgid();
+extern long lx_setresgid16();
+extern long lx_setresuid();
+extern long lx_setresuid16();
+extern long lx_setrlimit();
+extern long lx_setxattr();
+extern long lx_setsockopt();
+extern long lx_socket();
+extern long lx_socketcall();
+extern long lx_stat32();
+extern long lx_stat64();
+extern long lx_sync_file_range();
+extern long lx_syncfs();
+extern long lx_sysinfo32();
+extern long lx_sysinfo64();
+extern long lx_removexattr();
+extern long lx_tgkill();
+extern long lx_time();
+extern long lx_tkill();
+extern long lx_uname();
+extern long lx_wait4();
+extern long lx_waitid();
+extern long lx_waitpid();
+extern long lx_write();
+extern long lx_writev();
+
+#if defined(_LP64)
+/*
+ * Linux vsyscall addresses:
+ */
+#define	LX_VSYS_gettimeofday	(uintptr_t)0xffffffffff600000
+#define	LX_VSYS_time		(uintptr_t)0xffffffffff600400
+#define	LX_VSYS_getcpu		(uintptr_t)0xffffffffff600800
+
+#define	LX_VSYSCALL_ADDR		(uintptr_t)0xffffffffff600000
+#define	LX_VSYSCALL_SIZE		(uintptr_t)0x1000
+#endif
+
+#endif	/* _KERNEL */
+
+/*
+ * System call numbers for revectoring:
+ */
+
+#if defined(__amd64)
+#define	LX_SYS_close		3
+#define	LX_SYS_gettimeofday	96
+#define	LX_SYS_time		201
+#define	LX_SYS_io_setup		206
+#define	LX_SYS_clock_gettime	228
+#define	LX_SYS_getcpu		309
+
+#define	LX_SYS32_close		6
+#define	LX_SYS32_gettimeofday	78
+#define	LX_SYS32_time		13
+#define	LX_SYS32_clock_gettime	265
+#define	LX_SYS32_io_setup	245
+#define	LX_SYS32_getcpu		318
+#elif defined(__i386)
+#define	LX_SYS_close		6
+#define	LX_SYS_gettimeofday	78
+#define	LX_SYS_time		13
+#define	LX_SYS_clock_gettime	265
+#define	LX_SYS_io_setup		245
+#define	LX_SYS_getcpu		318
+#else
+#error "Architecture not supported"
+#endif /* defined(__amd64) */
+
+/*
+ * The current code in the VDSO operates under the expectation that it will be
+ * mapped at a fixed offset from the comm page.  This simplifies the act of
+ * locating said page without any other reference.  The VDSO must fit within
+ * this offset, matching the same value as COMM_PAGE_ALIGN.
+ * See: uts/i86pc/sys/comm_page.h
+ */
+#define	LX_VDSO_SIZE		0x4000
+#define	LX_VDSO_ADDR_MASK	~(LX_VDSO_SIZE - 1)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LINUX_SYSCALLS_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h
new file mode 100644
index 0000000000..90363c8939
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_types.h
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_LX_TYPES_H
+#define	_SYS_LX_TYPES_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifndef	_KERNEL
+
+#define	SHRT_MIN	(-32768)	/* min value of a "short int" */
+#define	SHRT_MAX	32767		/* max value of a "short int" */
+#define	USHRT_MAX	65535		/* max of "unsigned short int" */
+#define	INT_MIN		(-2147483647-1) /* min value of an "int" */
+#define	INT_MAX		2147483647	/* max value of an "int" */
+#define	UINT_MAX	4294967295U	/* max value of an "unsigned int" */
+
+#ifndef LLONG_MAX
+#define	LLONG_MAX	9223372036854775807LL
+#endif
+
+#if defined(_LP64)
+#define	LONG_MAX	9223372036854775807L
+#define	ULONG_MAX	18446744073709551615UL
+#else
+#define	LONG_MAX	2147483647L	/* max value of a 32-bit "long int" */
+#define	ULONG_MAX	4294967295UL	/* max value of a 32-bit "ulong int" */
+#endif
+
+#endif /* !_KERNEL */
+
+
+typedef	uint64_t	lx_dev_t;
+typedef	uint16_t	lx_dev16_t;
+typedef	uint32_t	lx_ino_t;
+typedef	uint64_t	lx_ino64_t;
+typedef	uint32_t	lx_uid_t;
+typedef	uint16_t	lx_uid16_t;
+typedef	uint32_t	lx_gid_t;
+typedef	uint16_t	lx_gid16_t;
+typedef	uint32_t	lx_off_t;
+typedef	uint64_t	lx_off64_t;
+typedef	uint32_t	lx_blksize_t;
+typedef	uint32_t	lx_blkcnt_t;
+typedef	uint64_t	lx_blkcnt64_t;
+typedef	uint32_t	lx_mode_t;
+typedef	uint16_t	lx_mode16_t;
+
+/*
+ * Linux mangles major/minor numbers into dev_t differently than SunOS.
+ */
+#ifdef _LP64
+#define	LX_MAKEDEVICE(maj, min) \
+	(((min) & 0xff) | (((maj) & 0xfff) << 8) | \
+	((uint64_t)((min) & ~0xff) << 12) | ((uint64_t)((maj) & ~0xfff) << 32))
+
+#define	LX_GETMAJOR(lx_dev)	((((lx_dev) >> 8) & 0xfff) | \
+	((((uint64_t)(lx_dev)) >> 32) & ~0xfff))
+
+#else
+#define	LX_MAKEDEVICE(maj, min) \
+	(((min) & 0xff) | (((maj) & 0xfff) << 8) | (((min) & ~0xff) << 12))
+
+#define	LX_GETMAJOR(lx_dev)	(((lx_dev) >> 8) & 0xfff)
+#endif
+
+#define	LX_GETMINOR(lx_dev)	(((lx_dev) & 0xff) | (((lx_dev) >> 12) & ~0xff))
+/* Linux supports 20 bits for the minor, and 12 bits for the major number */
+#define	LX_MAXMIN	0xfffff
+#define	LX_MAXMAJ	0xfff
+
+/*
+ * Certain Linux tools care deeply about major/minor number mapping.
+ * Map virtual disks (zfs datasets, zvols, etc) into a safe reserved range.
+ */
+#define	LX_MAJOR_DISK	203
+
+/* LX ptm driver major/minor number */
+#define	LX_PTM_MAJOR		5
+#define	LX_PTM_MINOR		2
+
+/* LX pts driver major number range */
+#define	LX_PTS_MAJOR_MIN	136
+#define	LX_PTS_MAJOR_MAX	143
+
+/* LX tty/cons driver major number */
+#define	LX_TTY_MAJOR		5
+
+#define	LX_UID16_TO_UID32(uid16)	\
+	(((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16))
+
+#define	LX_GID16_TO_GID32(gid16)     \
+	(((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16))
+
+/* Overflow values default to NFS nobody. */
+
+#define	UID16_OVERFLOW	((lx_uid16_t)65534)
+#define	GID16_OVERFLOW	((lx_gid16_t)65534)
+
+/*
+ * All IDs with high word non-zero are converted to default overflow values to
+ * avoid inadvertent truncation to zero (root) (!).
+ */
+#define	LX_UID32_TO_UID16(uid32)	\
+	((((uid32) & 0xffff0000) == 0)  ? ((lx_uid16_t)(uid32)) : \
+	    (((uid32) == ((lx_uid_t)-1)) ? ((lx_uid16_t)-1) : UID16_OVERFLOW))
+
+#define	LX_GID32_TO_GID16(gid32)	\
+	((((gid32) & 0xffff0000) == 0)  ? ((lx_gid16_t)(gid32)) : \
+	    (((gid32) == ((lx_gid_t)-1)) ? ((lx_gid16_t)-1) : GID16_OVERFLOW))
+
+#define	LX_32TO64(lo, hi)	\
+	((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32)))
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LX_TYPES_H */
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_access.c b/usr/src/uts/common/brand/lx/syscall/lx_access.c
new file mode 100644
index 0000000000..24805a5e96
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_access.c
@@ -0,0 +1,224 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
+ *     All Rights Reserved
+ *
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ *
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred_impl.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/file.h>
+#include <fs/fs_subr.h>
+#include <c2/audit.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * Determine accessibility of file.
+ */
+
+#define	E_OK	010	/* use effective ids */
+#define	R_OK	004
+#define	W_OK	002
+#define	X_OK	001
+
+/*
+ * Convert Linux LX_AT_* flags to SunOS AT_* flags.  Only flags that are also
+ * present in 'allow' are translated, which lets EACCESS/REMOVEDIR be told
+ * apart even though they share a value on Linux.
+ *
+ * Some callers pass other bits in 'lflag'.  When 'enforce' is set, any bit
+ * left untranslated makes this return -EINVAL; otherwise such bits are simply
+ * ignored.  See lx_fchmodat for another example of this type of behavior.
+ */
+static int
+ltos_at_flag(int lflag, int allow, boolean_t enforce)
+{
+	int sflag = 0;
+
+	if ((lflag & LX_AT_EACCESS) && (allow & AT_EACCESS)) {
+		lflag &= ~LX_AT_EACCESS;
+		sflag |= AT_EACCESS;
+	}
+
+	if ((lflag & LX_AT_REMOVEDIR) && (allow & AT_REMOVEDIR)) {
+		lflag &= ~LX_AT_REMOVEDIR;
+		sflag |= AT_REMOVEDIR;
+	}
+
+	if ((lflag & LX_AT_SYMLINK_NOFOLLOW) && (allow & AT_SYMLINK_NOFOLLOW)) {
+		lflag &= ~LX_AT_SYMLINK_NOFOLLOW;
+		sflag |= AT_SYMLINK_NOFOLLOW;
+	}
+
+	/* right now solaris doesn't have a _FOLLOW flag, so use a fake one */
+	if ((lflag & LX_AT_SYMLINK_FOLLOW) && (allow & LX_AT_SYMLINK_FOLLOW)) {
+		lflag &= ~LX_AT_SYMLINK_FOLLOW;
+		sflag |= LX_AT_SYMLINK_FOLLOW;
+	}
+
+	/* If lflag is not zero then some flags did not hit the above code. */
+	if (enforce && lflag)
+		return (-EINVAL);
+
+	return (sflag);
+}
+
+/*
+ * For illumos, access() does this:
+ *    If the process has appropriate privileges, an implementation may indicate
+ *    success for X_OK even if none of the execute file permission bits are set.
+ *
+ * But for Linux, access() does this:
+ *    If the calling process is privileged (i.e., its real UID is zero), then
+ *    an X_OK check is successful for a regular file if execute permission is
+ *    enabled for any of the file owner, group, or other.
+ *
+ * Linux used to behave more like illumos on older kernels:
+ *    In  kernel  2.4 (and earlier) there is some strangeness in the handling
+ *    of X_OK tests for superuser.  If all categories of  execute  permission
+ *    are  disabled for a nondirectory file, then the only access() test that
+ *    returns -1 is when mode is specified as just X_OK; if R_OK or  W_OK  is
+ *    also  specified in mode, then access() returns 0 for such files.
+ *
+ * So we need to handle the case where a privileged process is checking for
+ * X_OK but none of the execute bits are set on the file. We'll keep the old
+ * 2.4 behavior for 2.4 emulation but use the new behavior for any other
+ * kernel rev.
+ */
+static int
+lx_common_access(char *fname, int fmode, vnode_t *startvp)
+{
+	vnode_t *vp;
+	cred_t *tmpcr;
+	int error;
+	int mode;
+	cred_t *cr;
+	int estale_retry = 0;
+
+	if (fmode & ~(E_OK|R_OK|W_OK|X_OK))
+		return (EINVAL);
+
+	mode = ((fmode & (R_OK|W_OK|X_OK)) << 6);
+
+	cr = CRED();
+
+	/* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */
+	if ((fmode & E_OK) != 0 ||
+	    (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid)) {
+		tmpcr = cr;
+		crhold(tmpcr);
+	} else {
+		tmpcr = crdup(cr);
+		tmpcr->cr_uid = cr->cr_ruid;
+		tmpcr->cr_gid = cr->cr_rgid;
+		tmpcr->cr_ruid = cr->cr_uid;
+		tmpcr->cr_rgid = cr->cr_gid;
+	}
+
+lookup:
+	if ((error = lookupnameatcred(fname, UIO_USERSPACE, FOLLOW, NULLVPP,
+	    &vp, startvp, tmpcr)) != 0) {
+		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
+			goto lookup;
+		crfree(tmpcr);
+		return (error);
+	}
+
+	if (mode != 0) {
+		error = VOP_ACCESS(vp, mode, 0, tmpcr, NULL);
+		if (error != 0) {
+			if ((error == ESTALE) &&
+			    fs_need_estale_retry(estale_retry++)) {
+				VN_RELE(vp);
+				goto lookup;
+			}
+
+		} else if ((fmode & X_OK) != 0 && cr->cr_ruid == 0 &&
+		    lx_kern_release_cmp(curproc->p_zone, "2.4.0") > 0) {
+			/* check for incorrect execute success */
+			vattr_t va;
+
+			va.va_mask = AT_MODE;
+			if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) == 0) {
+				mode_t m = VTTOIF(va.va_type) | va.va_mode;
+
+				if ((m & S_IFMT) == S_IFREG &&
+				    !(m & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+					/* no execute bits set in the mode */
+					error = EACCES;
+				}
+			}
+		}
+	}
+
+	crfree(tmpcr);
+	VN_RELE(vp);
+	return (error);
+}
+
+long
+lx_faccessat(int atfd, char *fname, int fmode, int flag)
+{
+	vnode_t *startvp;
+	int error;
+
+	if (atfd == LX_AT_FDCWD)
+		atfd = AT_FDCWD;
+
+	if ((flag = ltos_at_flag(flag, AT_EACCESS, B_FALSE)) < 0)
+		return (set_errno(EINVAL));
+
+	if (fname == NULL)
+		return (set_errno(EFAULT));
+	if ((error = fgetstartvp(atfd, fname, &startvp)) != 0)
+		return (set_errno(error));
+	if (AU_AUDITING() && startvp != NULL)
+		audit_setfsat_path(1);
+
+	/* Do not allow E_OK unless AT_EACCESS flag is set */
+	if ((flag & AT_EACCESS) == 0)
+		fmode &= ~E_OK;
+
+	error = lx_common_access(fname, fmode, startvp);
+	if (startvp != NULL)
+		VN_RELE(startvp);
+	if (error)
+		return (set_errno(error));
+	return (0);
+}
+
+long
+lx_access(char *fname, int fmode)
+{
+	return (lx_faccessat(LX_AT_FDCWD, fname, fmode, 0));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_aio.c b/usr/src/uts/common/brand/lx/syscall/lx_aio.c
new file mode 100644
index 0000000000..12f37ea4c7
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_aio.c
@@ -0,0 +1,45 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/brand.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_syscalls.h>
+
+
+long
+lx_io_setup(unsigned int nr_events, void **ctxp)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uintptr_t uargs[2] = {(uintptr_t)nr_events, (uintptr_t)ctxp};
+
+	mutex_enter(&curproc->p_lock);
+	lxpd->l_flags |= LX_PROC_AIO_USED;
+	mutex_exit(&curproc->p_lock);
+
+	ttolxlwp(curthread)->br_eosys = JUSTRETURN;
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		lx_emulate_user32(ttolwp(curthread), LX_SYS32_io_setup, uargs);
+	} else
+#endif
+	{
+		lx_emulate_user(ttolwp(curthread), LX_SYS_io_setup, uargs);
+	}
+	/* NOTREACHED */
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
new file mode 100644
index 0000000000..19a7577ac0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+
+/*
+ * The brk() system call needs to be in-kernel because Linux expects a call to
+ * brk(0) to return the current breakpoint.  In Solaris, the process breakpoint
+ * is setup and managed by libc.  Due to the way we link our libraries and the
+ * need for Linux to manage its own breakpoint, this has to remain in the
+ * kernel.
+ */
+extern int brk(caddr_t);
+
+long
+lx_brk(caddr_t nva)
+{
+	proc_t *p = curproc;
+	klwp_t *lwp = ttolwp(curthread);
+
+	if (nva != 0) {
+		(void) brk(nva);
+
+		/*
+		 * Despite claims to the contrary in the manpage, when Linux
+		 * brk() fails, errno is left unchanged.
+		 */
+		lwp->lwp_errno = 0;
+	}
+	return ((long)(p->p_brkbase + p->p_brksize));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
new file mode 100644
index 0000000000..7783b97cb0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/thread.h>
+#include <sys/klwp.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+long
+lx_vn_chmod(vnode_t *vp, int mode)
+{
+	vattr_t vattr;
+
+	vattr.va_mode = mode & MODEMASK;
+	vattr.va_mask = AT_MODE;
+
+	if (vn_is_readonly(vp)) {
+		return (EROFS);
+	}
+	return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL));
+}
+
+static long
+lx_fchmodat_wrapper(int fd, char *path, int mode)
+{
+	long error;
+	vnode_t *vp;
+
+	if ((error = lx_vp_at(fd, path, &vp, 0)) != 0) {
+		lx_proc_data_t *pd = ttolxproc(curthread);
+
+		/*
+		 * If the process is in "install mode", return success
+		 * if the operation failed due to an absent file.
+		 */
+		if (error == ENOENT &&
+		    (pd->l_flags & LX_PROC_INSTALL_MODE)) {
+			return (0);
+		}
+		return (set_errno(error));
+	}
+
+	error = lx_vn_chmod(vp, mode);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fchmodat(int fd, char *path, int mode)
+{
+	return (lx_fchmodat_wrapper(fd, path, mode));
+}
+
+long
+lx_fchmod(int fd, int mode)
+{
+	file_t *fp;
+	vnode_t *vp;
+	long error;
+
+	/*
+	 * In order to do proper O_PATH handling, lx_fchmod cannot leverage
+	 * lx_fchmodat with a NULL path since the desired behavior differs.
+	 */
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	if (LX_IS_O_PATH(fp)) {
+		releasef(fd);
+		return (set_errno(EBADF));
+	}
+	vp = fp->f_vnode;
+	VN_HOLD(vp);
+	releasef(fd);
+
+	error = lx_vn_chmod(vp, mode);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_chmod(char *path, int mode)
+{
+	return (lx_fchmodat_wrapper(LX_AT_FDCWD, path, mode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c
new file mode 100644
index 0000000000..830fba0a73
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c
@@ -0,0 +1,180 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/zone.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_types.h>
+
+long
+lx_vn_chown(vnode_t *vp, uid_t uid, gid_t gid)
+{
+	vattr_t vattr;
+	zone_t *zone = crgetzone(CRED());
+
+	if ((uid != (uid_t)-1 && !VALID_UID(uid, zone)) ||
+	    (gid != (gid_t)-1 && !VALID_GID(gid, zone))) {
+		return (EINVAL);
+	}
+	vattr.va_uid = uid;
+	vattr.va_gid = gid;
+	vattr.va_mask = 0;
+	if (vattr.va_uid != -1)
+		vattr.va_mask |= AT_UID;
+	if (vattr.va_gid != -1)
+		vattr.va_mask |= AT_GID;
+
+	if (vn_is_readonly(vp)) {
+		return (EROFS);
+	}
+	return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL));
+}
+
+long
+lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag)
+{
+	long error;
+	vnode_t *vp;
+
+	if ((error = lx_vp_at(fd, path, &vp, native_flag)) != 0) {
+		lx_proc_data_t *pd = ttolxproc(curthread);
+
+		/*
+		 * If the process is in "install mode", return success
+		 * if the operation failed due to an absent file.
+		 */
+		if (error == ENOENT &&
+		    (pd->l_flags & LX_PROC_INSTALL_MODE)) {
+			return (0);
+		}
+		return (set_errno(error));
+	}
+
+	error = lx_vn_chown(vp, uid, gid);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fchown_wrapper(int fd, uid_t uid, gid_t gid)
+{
+	file_t *fp;
+	vnode_t *vp;
+	long error;
+
+	/*
+	 * In order to do proper O_PATH handling, lx_fchown cannot leverage
+	 * lx_fchownat with a NULL path since the desired behavior differs.
+	 */
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	if (LX_IS_O_PATH(fp)) {
+		releasef(fd);
+		return (set_errno(EBADF));
+	}
+	vp = fp->f_vnode;
+	VN_HOLD(vp);
+	releasef(fd);
+
+	error = lx_vn_chown(vp, uid, gid);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag)
+{
+	int native_flag = 0;
+
+	if (flag & LX_AT_EMPTY_PATH) {
+		char c;
+
+		/*
+		 * According to fchownat(2), when AT_EMPTY_PATH is set: "if
+		 * path is an empty string, operate on the file referred to by
+		 * fd".  We pass NULL in place of the empty string, which
+		 * causes fchownat() to operate on the fd we passed without an
+		 * additional lookup.
+		 */
+		if (copyin(path, &c, sizeof (c)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		if (c == '\0') {
+			path = NULL;
+		}
+
+		flag &= ~LX_AT_EMPTY_PATH;
+	}
+	if (flag & LX_AT_SYMLINK_NOFOLLOW) {
+		flag &= ~LX_AT_SYMLINK_NOFOLLOW;
+		native_flag |= AT_SYMLINK_NOFOLLOW;
+	}
+	if (flag != 0) {
+		return (set_errno(EINVAL));
+	}
+
+	return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag));
+}
+
+long
+lx_fchown(int fd, uid_t uid, gid_t gid)
+{
+	return (lx_fchown_wrapper(fd, uid, gid));
+}
+
+long
+lx_lchown(char *path, uid_t uid, gid_t gid)
+{
+	return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid,
+	    AT_SYMLINK_NOFOLLOW));
+}
+
+long
+lx_chown(char *path, uid_t uid, gid_t gid)
+{
+	return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0));
+}
+
+long
+lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid)
+{
+	return (lx_fchown_wrapper(fd, LX_UID16_TO_UID32(uid),
+	    LX_GID16_TO_GID32(gid)));
+}
+
+long
+lx_lchown16(char *path, lx_uid16_t uid, lx_gid16_t gid)
+{
+	return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid),
+	    LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW));
+}
+
+long
+lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid)
+{
+	return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid),
+	    LX_GID16_TO_GID32(gid), 0));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
new file mode 100644
index 0000000000..50cdeaeab9
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ldt.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+#include <lx_syscall.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+
+/*
+ * Our lwp has already been created at this point, so this routine is
+ * responsible for setting up all the state needed to track this as a
+ * linux cloned thread.
+ */
+/* ARGSUSED */
+int
+lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp)
+{
+	struct lx_lwp_data *lwpd = ttolxlwp(curthread);
+	struct lx_proc_data *lproc = ttolxproc(curthread);
+	struct ldt_info info;
+	struct user_desc descr;
+	int tls_index;
+	int entry = -1;
+	int signo;
+
+	signo = flags & LX_CSIGNAL;
+	if (signo < 0 || signo > LX_NSIG)
+		return (set_errno(EINVAL));
+
+	if (!(flags & LX_CLONE_THREAD)) {
+		lproc->l_signal = signo;
+	} else {
+		if (flags & LX_CLONE_SETTLS) {
+			if (get_udatamodel() == DATAMODEL_ILP32) {
+				if (copyin((caddr_t)tls, &info, sizeof (info)))
+					return (set_errno(EFAULT));
+
+				if (LDT_INFO_EMPTY(&info))
+					return (set_errno(EINVAL));
+
+				entry = info.entry_number;
+				if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+					return (set_errno(EINVAL));
+
+				tls_index = entry - GDT_TLSMIN;
+
+				/*
+				 * Convert the user-space structure into a real
+				 * x86 descriptor and copy it into this LWP's
+				 * TLS array.  We also load it into the GDT.
+				 */
+				LDT_INFO_TO_DESC(&info, &descr);
+				bcopy(&descr, &lwpd->br_tls[tls_index],
+				    sizeof (descr));
+				lx_set_gdt(entry, &lwpd->br_tls[tls_index]);
+			} else {
+				/*
+				 * Set the Linux %fsbase for this LWP.  We will
+				 * restore it the next time we return to Linux
+				 * via setcontext()/lx_restorecontext().
+				 */
+				lwpd->br_lx_fsbase = (uintptr_t)tls;
+			}
+		}
+
+		lwpd->br_clear_ctidp =
+		    (flags & LX_CLONE_CHILD_CLEARTID) ?  ctidp : NULL;
+
+		if (signo && ! (flags & LX_CLONE_DETACH))
+			lwpd->br_signal = signo;
+		else
+			lwpd->br_signal = 0;
+
+		/* LX_CLONE_THREAD is always set here; see the test above. */
+		lwpd->br_tgid = curthread->t_procp->p_pid;
+
+		if (flags & LX_CLONE_PARENT)
+			lwpd->br_ppid = 0;
+
+		if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) &&
+		    (suword32(ctidp, lwpd->br_pid) != 0)) {
+			if (entry >= 0)
+				lx_clear_gdt(entry);
+			return (set_errno(EFAULT));
+		}
+		if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) &&
+		    (suword32(ptidp, lwpd->br_pid) != 0)) {
+			if (entry >= 0)
+				lx_clear_gdt(entry);
+			return (set_errno(EFAULT));
+		}
+	}
+
+	*rval = lwpd->br_pid;
+	return (0);
+}
+
+long
+lx_set_tid_address(int *tidp)
+{
+	struct lx_lwp_data *lwpd = ttolxlwp(curthread);
+	long rv;
+
+	lwpd->br_clear_ctidp = tidp;
+
+	if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) {
+		rv = 1;
+	} else {
+		rv = lwpd->br_pid;
+	}
+
+	return (rv);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_close.c b/usr/src/uts/common/brand/lx/syscall/lx_close.c
new file mode 100644
index 0000000000..8df0cbbe2f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_close.c
@@ -0,0 +1,57 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/brand.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_syscalls.h>
+
+
+extern int close(int);
+
+long
+lx_close(int fdes)
+{
+	uintptr_t uargs[1] = {(uintptr_t)fdes};
+	lx_proc_data_t *pd = ptolxproc(curproc);
+	boolean_t native_ok;
+
+	/* Sample the AIO flag from the brand data while holding p_lock. */
+	mutex_enter(&curproc->p_lock);
+	native_ok = ((pd->l_flags & LX_PROC_AIO_USED) == 0);
+	mutex_exit(&curproc->p_lock);
+
+	/* LX_PROC_AIO_USED is clear: a plain in-kernel close suffices. */
+	if (native_ok)
+		return (close(fdes));
+
+	/*
+	 * Otherwise hand the call to the userspace emulation, which lets libc
+	 * keep its state for any AIO contexts consistent.
+	 */
+	ttolxlwp(curthread)->br_eosys = JUSTRETURN;
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		lx_emulate_user32(ttolwp(curthread), LX_SYS32_close, uargs);
+	} else
+#endif
+	{
+		lx_emulate_user(ttolwp(curthread), LX_SYS_close, uargs);
+	}
+	/* NOTREACHED */
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c
new file mode 100644
index 0000000000..ec8b7576d8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c
@@ -0,0 +1,35 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+
+/*
+ * We support neither the second argument (NUMA node), nor the third (obsolete
+ * pre-2.6.24 caching functionality which was ultimately broken).
+ */
+long
+lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3)
+{
+	unsigned int id = curthread->t_cpu->cpu_id;
+
+	if (copyout(&id, cpu, sizeof (id)) == 0)
+		return (0);
+
+	return (set_errno(EFAULT));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_epoll.c b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c
new file mode 100644
index 0000000000..62a0eccf4b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c
@@ -0,0 +1,272 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/epoll.h>
+#include <sys/devpoll.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/vnode.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_signal.h>
+
+static major_t devpoll_major = 0;	/* recorded by lx_epoll_create1() */
+
+static boolean_t
+lx_epoll_isvalid(file_t *fp)
+{
+	vnode_t *vn = fp->f_vnode;
+
+	/* Accept only a character device whose major equals devpoll_major. */
+	return ((vn->v_type == VCHR &&
+	    getmajor(vn->v_rdev) == devpoll_major) ? B_TRUE : B_FALSE);
+}
+
+long
+lx_epoll_create1(int flags)
+{
+	int err, fd, rv;
+	int fmode = FREAD | FWRITE;
+	boolean_t cloexec = B_FALSE;
+	vnode_t *vp = NULL;
+	file_t *fp = NULL;
+
+	if (flags & EPOLL_CLOEXEC) {
+		cloexec = B_TRUE;
+		flags &= ~EPOLL_CLOEXEC;
+	}
+	if (flags != 0) {
+		/* EPOLL_CLOEXEC is the only flag accepted at this time */
+		return (set_errno(EINVAL));
+	}
+	/* Reserve an fd and file_t, then open the poll device beneath them. */
+	if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) {
+		err = EMFILE;
+		goto error;
+	}
+	if (ldi_vp_from_name("/devices/pseudo/poll@0:poll", &vp) != 0) {
+		err = ENOENT;
+		goto error;
+	}
+	if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) {
+		goto error;
+	}
+	err = VOP_IOCTL(vp, DP_EPOLLCOMPAT, 0, fmode, CRED(), &rv, NULL);
+	if (err != 0) {
+		(void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL);
+		goto error;
+	}
+	/* Record the device major that lx_epoll_isvalid() compares against. */
+	devpoll_major = getmajor(vp->v_rdev);
+	/* Attach the vnode to the file_t, then publish it with setf(). */
+	fp->f_vnode = vp;
+	mutex_exit(&fp->f_tlock);
+	setf(fd, fp);
+	if (cloexec) {
+		f_setfd(fd, FD_CLOEXEC);
+	}
+	return (fd);
+
+error:
+	if (fp != NULL) {
+		setf(fd, NULL);
+		unfalloc(fp);
+	}
+	if (vp != NULL) {
+		VN_RELE(vp);
+	}
+	return (set_errno(err));
+}
+
+long
+lx_epoll_create(int size)
+{
+	/* The size hint must be positive; it is not otherwise used. */
+	if (size > 0)
+		return (lx_epoll_create1(0));
+
+	return (set_errno(EINVAL));
+}
+
+
+/* Match libc: bits lx_epoll_ctl() drops, and bits it remaps to POLL* */
+#define	EPOLLIGNORED 	(EPOLLMSG | EPOLLWAKEUP)
+#define	EPOLLSWIZZLED	\
+	(EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM)
+
+long
+lx_epoll_ctl(int fd, int op, int pfd, void *event)
+{
+	epoll_event_t epevent;
+	dvpoll_epollfd_t dpevent[2];
+	file_t *fp;
+	iovec_t aiov;
+	uio_t auio;
+	uint32_t events, ev = 0;
+	int error = 0, i = 0;
+
+	dpevent[i].dpep_pollfd.fd = pfd;
+	switch (op) {
+	case EPOLL_CTL_DEL:
+		dpevent[i].dpep_pollfd.events = POLLREMOVE;
+		break;
+
+	case EPOLL_CTL_MOD:
+		/*
+		 * In the modify case, we pass down two entries for pfd:  one
+		 * to remove the event and another (built below) to add it back.
+		 */
+		dpevent[i++].dpep_pollfd.events = POLLREMOVE;
+		dpevent[i].dpep_pollfd.fd = pfd;
+		/* FALLTHROUGH */
+
+	case EPOLL_CTL_ADD:
+		if (copyin(event, &epevent, sizeof (epevent)) != 0)
+			return (set_errno(EFAULT));
+
+		/*
+		 * Mask off the events that we ignore, and then swizzle the
+		 * events for which our values differ from their epoll(7)
+		 * equivalents.
+		 */
+		events = epevent.events;
+		ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED);
+
+		if (events & EPOLLRDHUP)
+			ev |= POLLRDHUP;
+		if (events & EPOLLET)
+			ev |= POLLET;
+		if (events & EPOLLONESHOT)
+			ev |= POLLONESHOT;
+		if (events & EPOLLWRNORM)
+			ev |= POLLWRNORM;
+		if (events & EPOLLWRBAND)
+			ev |= POLLWRBAND;
+
+		dpevent[i].dpep_data = epevent.data.u64;
+		dpevent[i].dpep_pollfd.events = ev;
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+	/* fd itself must pass lx_epoll_isvalid() before anything is written. */
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	} else if (!lx_epoll_isvalid(fp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+	/* Submit the i + 1 entries as a single kernel-space write. */
+	aiov.iov_base = (void *)dpevent;
+	aiov.iov_len = sizeof (dvpoll_epollfd_t) * (i + 1);
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = aiov.iov_len;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_loffset = 0;
+	auio.uio_fmode = fp->f_flag;
+
+	error = VOP_WRITE(fp->f_vnode, &auio, 1, fp->f_cred, NULL);
+
+	releasef(fd);
+	if (error)
+		return (set_errno(error));
+	return (0);
+}
+
+long
+lx_epoll_wait(int fd, void *events, int maxevents, int timeout)
+{
+	struct dvpoll dp;
+	file_t *filp;
+	int nready = 0, err, ioflag;
+
+	/* A non-positive event count is rejected before touching the fd. */
+	if (maxevents <= 0)
+		return (set_errno(EINVAL));
+
+	filp = getf(fd);
+	if (filp == NULL)
+		return (set_errno(EBADF));
+	if (!lx_epoll_isvalid(filp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+
+	/* Describe the request and issue DP_POLL on the descriptor's vnode. */
+	dp.dp_fds = (pollfd_t *)events;
+	dp.dp_nfds = maxevents;
+	dp.dp_timeout = timeout;
+	ioflag = filp->f_flag | DATAMODEL_NATIVE | FKIOCTL;
+	err = VOP_IOCTL(filp->f_vnode, DP_POLL, (uintptr_t)&dp, ioflag,
+	    filp->f_cred, &nready, NULL);
+	releasef(fd);
+
+	return ((err != 0) ? set_errno(err) : nready);
+}
+
+long
+lx_epoll_pwait(int fd, void *events, int maxevents, int timeout, void *sigmask)
+{
+	struct dvpoll dp;
+	k_sigset_t kmask;
+	file_t *filp;
+	int nready = 0, err, ioflag;
+
+	/* A non-positive event count is rejected before touching the fd. */
+	if (maxevents <= 0)
+		return (set_errno(EINVAL));
+
+	filp = getf(fd);
+	if (filp == NULL)
+		return (set_errno(EBADF));
+	if (!lx_epoll_isvalid(filp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+
+	/* No mask means a NULL set; otherwise copy in and convert it. */
+	dp.dp_setp = NULL;
+	if (sigmask != NULL) {
+		lx_sigset_t lmask;
+
+		if (copyin(sigmask, &lmask, sizeof (lmask)) != 0) {
+			releasef(fd);
+			return (set_errno(EFAULT));
+		}
+		lx_ltos_sigset(&lmask, &kmask);
+		dp.dp_setp = (sigset_t *)&kmask;
+	}
+
+	dp.dp_fds = (pollfd_t *)events;
+	dp.dp_nfds = maxevents;
+	dp.dp_timeout = timeout;
+	ioflag = filp->f_flag | DATAMODEL_NATIVE | FKIOCTL;
+	err = VOP_IOCTL(filp->f_vnode, DP_PPOLL, (uintptr_t)&dp, ioflag,
+	    filp->f_cred, &nready, NULL);
+	releasef(fd);
+
+	return ((err != 0) ? set_errno(err) : nready);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c
new file mode 100644
index 0000000000..338e4399fe
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c
@@ -0,0 +1,251 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/nbmlock.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/sdt.h>
+
+extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+
+#define	LX_FALLOC_FL_KEEP_SIZE		0x01
+#define	LX_FALLOC_FL_PUNCH_HOLE		0x02
+#define	LX_FALLOC_FL_NO_HIDE_STALE	0x04
+#define	LX_FALLOC_FL_COLLAPSE_RANGE	0x08
+#define	LX_FALLOC_FL_ZERO_RANGE		0x10
+/* Every mode bit lx_fallocate() recognizes; any other bit is EINVAL. */
+#define	LX_FALLOC_VALID	(LX_FALLOC_FL_KEEP_SIZE | LX_FALLOC_FL_PUNCH_HOLE | \
+	LX_FALLOC_FL_NO_HIDE_STALE | LX_FALLOC_FL_COLLAPSE_RANGE | \
+	LX_FALLOC_FL_ZERO_RANGE)
+/* Recognized bits that lx_fallocate() refuses with EOPNOTSUPP. */
+#define	LX_FALLOC_UNSUPP	(LX_FALLOC_FL_NO_HIDE_STALE | \
+	LX_FALLOC_FL_COLLAPSE_RANGE)
+
+long
+lx_fallocate(int fd, int mode, off_t offset, off_t len)
+{
+	int error = 0;
+	file_t *fp;
+	vnode_t *vp;
+	int64_t tot;
+	struct flock64 bf;
+	vattr_t vattr;
+	u_offset_t f_offset;
+	boolean_t in_crit = B_FALSE;
+
+	/*
+	 * Error checking is in a specific order to make LTP happy.
+	 * NOTE(review): offset + len is summed before either is range-checked.
+	 */
+	tot = offset + len;
+	if (tot > (LLONG_MAX / (int64_t)1024))
+		return (set_errno(EFBIG));
+
+	if (mode & LX_FALLOC_UNSUPP)
+		return (set_errno(EOPNOTSUPP));
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	if ((fp->f_flag & FWRITE) == 0) {
+		error = EBADF;
+		goto done;
+	}
+
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = EINVAL;
+		goto done;
+	}
+
+	if (offset < 0 || len <= 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	if (tot < 0LL) {
+		error = EFBIG;
+		goto done;
+	}
+
+	if ((mode & ~LX_FALLOC_VALID) != 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * If KEEP_SIZE is the only flag then we don't actually do any work.
+	 */
+	if (mode == LX_FALLOC_FL_KEEP_SIZE)
+		goto done;
+
+	bzero(&bf, sizeof (bf));
+	/* Fetch the current size; each mode below compares it against tot. */
+	vattr.va_mask = AT_SIZE;
+	if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+		goto done;
+
+	if (mode == 0) {
+		/* Nothing to do if not extending the file */
+		if (vattr.va_size >= tot)
+			goto done;
+
+		/* Extend the file. */
+		bf.l_start = (off64_t)tot;
+		bf.l_len = (off64_t)0;
+
+	} else if (mode & LX_FALLOC_FL_PUNCH_HOLE) {
+		/*
+		 * Deallocate space in the file.
+		 */
+		if ((mode & LX_FALLOC_FL_KEEP_SIZE) == 0) {
+			/* this flag is required with punch hole */
+			error = EINVAL;
+			goto done;
+		}
+
+		if (mode &
+		    ~(LX_FALLOC_FL_PUNCH_HOLE | LX_FALLOC_FL_KEEP_SIZE)) {
+			error = EINVAL;
+			goto done;
+		}
+
+		/* Make sure we don't extend since keep_size is set. */
+		if (vattr.va_size < tot) {
+			if (offset > vattr.va_size)
+				goto done;
+			len = (off_t)vattr.va_size - offset;
+		}
+
+		bf.l_start = (off64_t)offset;
+		bf.l_len = (off64_t)len;
+
+	} else if (mode & LX_FALLOC_FL_ZERO_RANGE) {
+		/*
+		 * Zero out the space in the file.
+		 */
+		if (mode &
+		    ~(LX_FALLOC_FL_ZERO_RANGE | LX_FALLOC_FL_KEEP_SIZE)) {
+			error = EINVAL;
+			goto done;
+		}
+
+		/* Make sure we don't extend when keep_size is set. */
+		if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+			if (offset > vattr.va_size)
+				goto done;
+			len = vattr.va_size - offset;
+		}
+
+		bf.l_start = (off64_t)offset;
+		bf.l_len = (off64_t)len;
+	} else {
+		/* We should have already handled all flags */
+		VERIFY(0);
+	}
+
+	/*
+	 * Check for locks in the range.
+	 */
+	f_offset = fp->f_offset;
+	error = flock_check(vp, &bf, f_offset, MAXOFF_T);
+	if (error != 0)
+		goto done;
+
+	/*
+	 * Check for conflicting non-blocking mandatory locks.
+	 * We need to get the size again under nbl_start_crit.
+	 */
+	if (nbl_need_check(vp)) {
+		u_offset_t	begin;
+		ssize_t		length;
+
+		nbl_start_crit(vp, RW_READER);
+		in_crit = B_TRUE;
+		vattr.va_mask = AT_SIZE;
+		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+			goto done;
+
+		/*
+		 * Make sure we don't extend when keep_size is set.
+		 */
+		if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+			ASSERT(mode & (LX_FALLOC_FL_PUNCH_HOLE |
+			    LX_FALLOC_FL_ZERO_RANGE));
+
+			/*
+			 * If the range now starts at or beyond EOF we can
+			 * short-circuit the rest of the work, otherwise
+			 * adjust bf for the vop_space call.
+			 */
+			if (offset >= vattr.va_size)
+				goto done;
+			len = vattr.va_size - offset;
+			bf.l_len = (off64_t)len;
+		}
+
+		if (offset > vattr.va_size) {
+			begin = vattr.va_size;
+			length = offset - vattr.va_size;
+		} else {
+			begin = offset;
+			length = vattr.va_size - offset;
+		}
+
+		if (nbl_conflict(vp, NBL_WRITE, begin, length, 0, NULL)) {
+			error = EACCES;
+			goto done;
+		}
+	}
+	/* Every mode ends in the same F_FREESP; only the range in bf differs. */
+	error = VOP_SPACE(vp, F_FREESP, &bf, 0, f_offset, fp->f_cred, NULL);
+
+done:
+	if (in_crit)
+		nbl_end_crit(vp);
+
+	releasef(fd);
+	if (error != 0)
+		return (set_errno(error));
+
+	return (0);
+}
+
+long
+lx_fallocate32(int fd, int mode, uint32_t offl, uint32_t offh, uint32_t lenl,
+    uint32_t lenh)
+{
+	uint64_t offset, len;
+
+	/*
+	 * From 32-bit callers, Linux passes the 64-bit offset and len by
+	 * concatenating consecutive arguments. We must perform the same
+	 * conversion here.
+	 *
+	 * The halves are joined as unsigned 64-bit values so that a high word
+	 * with its top bit set is never shifted into the sign bit of a signed
+	 * type.  Range validation is left entirely to lx_fallocate().
+	 */
+	offset = ((uint64_t)offh << 32) | (uint64_t)offl;
+	len = ((uint64_t)lenh << 32) | (uint64_t)lenl;
+
+	return (lx_fallocate(fd, mode, (off_t)offset, (off_t)len));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
new file mode 100644
index 0000000000..2699b9bac7
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
@@ -0,0 +1,644 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/cmn_err.h>
+#include <sys/pathname.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_socket.h>
+#include <sys/fs/fifonode.h>
+#include <sys/strsubr.h>
+#include <sys/stream.h>
+
+extern int fcntl(int, int, intptr_t);
+extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+
+
+int
+lx_vp_at(int fd, char *upath, vnode_t **vpp, int flag)
+{
+	vnode_t *dirvp;
+	int follow, err;
+
+	/* Translate the Linux "current directory" fd into the native one. */
+	err = fgetstartvp((fd == LX_AT_FDCWD) ? AT_FDCWD : fd, upath, &dirvp);
+	if (err != 0)
+		return (err);
+
+	if (upath == NULL) {
+		/*
+		 * No path: the result is the starting vnode itself, returned
+		 * with the hold that fgetstartvp() established.
+		 */
+		*vpp = dirvp;
+		VERIFY(*vpp);
+		return (0);
+	}
+
+	/* Resolve the user path relative to the starting vnode. */
+	follow = (flag == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW;
+	err = lookupnameat(upath, UIO_USERSPACE, follow, NULLVPP, vpp, dirvp);
+
+	/* The starting vnode, when there is one, is no longer needed. */
+	if (dirvp != NULL)
+		VN_RELE(dirvp);
+
+	return (err);
+}
+
+#define	LTOS_FLOCK(l, s)						\
+{									\
+	s->l_type = ltos_type(l->l_type);				\
+	s->l_whence = l->l_whence;					\
+	s->l_start = l->l_start;					\
+	s->l_len = l->l_len;						\
+	s->l_sysid = 0;			/* not defined in Linux */	\
+	s->l_pid = (pid_t)l->l_pid;					\
+}
+
+#define	STOL_FLOCK(s, l)						\
+{									\
+	l->l_type = stol_type(s->l_type);				\
+	l->l_whence = s->l_whence;					\
+	l->l_start = s->l_start;					\
+	l->l_len = s->l_len;						\
+	l->l_pid = (int)s->l_pid;					\
+}
+
+static short
+ltos_type(short l_type)
+{
+	/*
+	 * Map a Linux lock type to its native value; -1 if unrecognized.
+	 */
+	if (l_type == LX_F_RDLCK)
+		return (F_RDLCK);
+	if (l_type == LX_F_WRLCK)
+		return (F_WRLCK);
+	if (l_type == LX_F_UNLCK)
+		return (F_UNLCK);
+	return (-1);
+}
+
+static short
+stol_type(short l_type)
+{
+	/*
+	 * Map a native lock type to its Linux value.  Anything other than
+	 * the three known types yields 0.
+	 */
+	if (l_type == F_RDLCK)
+		return (LX_F_RDLCK);
+	if (l_type == F_WRLCK)
+		return (LX_F_WRLCK);
+	if (l_type == F_UNLCK)
+		return (LX_F_UNLCK);
+	return (0);
+}
+
+static void
+ltos_flock(struct lx_flock *l, struct flock64 *s)
+{
+	LTOS_FLOCK(l, s)	/* macro body is a braced block; no ';' needed */
+}
+
+static void
+stol_flock(struct flock64 *s, struct lx_flock *l)
+{
+	STOL_FLOCK(s, l)	/* native -> Linux, field by field */
+}
+
+static void
+ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s)
+{
+	LTOS_FLOCK(l, s)	/* same field copy, different source type */
+}
+
+static void
+stol_flock64(struct flock64 *s, struct lx_flock64_32 *l)
+{
+	STOL_FLOCK(s, l)	/* same field copy, different target type */
+}
+
+static int
+lx_fcntl_getfl(int fd)
+{
+	int nflags, lxflags;
+
+	nflags = fcntl(fd, F_GETFL, 0);
+	if (ttolwp(curthread)->lwp_errno != 0)
+		return (ttolwp(curthread)->lwp_errno);
+
+	/* Access mode first: anything not read-only/write-only is RDWR. */
+	switch (nflags & O_ACCMODE) {
+	case O_RDONLY:
+		lxflags = LX_O_RDONLY;
+		break;
+	case O_WRONLY:
+		lxflags = LX_O_WRONLY;
+		break;
+	default:
+		lxflags = LX_O_RDWR;
+		break;
+	}
+
+	/* O_NDELAY != O_NONBLOCK, so we need to check for both */
+	lxflags |= (nflags & O_NDELAY) ? LX_O_NDELAY : 0;
+	lxflags |= (nflags & O_NONBLOCK) ? LX_O_NONBLOCK : 0;
+	lxflags |= (nflags & O_APPEND) ? LX_O_APPEND : 0;
+	lxflags |= (nflags & O_SYNC) ? LX_O_SYNC : 0;
+	lxflags |= (nflags & O_LARGEFILE) ? LX_O_LARGEFILE : 0;
+	lxflags |= (nflags & FASYNC) ? LX_O_ASYNC : 0;
+
+	return (lxflags);
+}
+
+static int
+lx_fcntl_setfl(int fd, ulong_t arg)
+{
+	int nflags = 0;
+
+	/*
+	 * Translate each Linux status flag we honor into its native bit;
+	 * any other bit in arg is dropped.
+	 *
+	 * LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one.
+	 */
+	nflags |= (arg & LX_O_NDELAY) ? O_NONBLOCK : 0;
+	nflags |= (arg & LX_O_APPEND) ? O_APPEND : 0;
+	nflags |= (arg & LX_O_SYNC) ? O_SYNC : 0;
+	nflags |= (arg & LX_O_LARGEFILE) ? O_LARGEFILE : 0;
+	nflags |= (arg & LX_O_ASYNC) ? FASYNC : 0;
+
+	/* Hand the translated flags to the native F_SETFL. */
+	return (fcntl(fd, F_SETFL, nflags));
+}
+
+/* Upper bound for F_SETPIPE_SZ; Linux's unprivileged default is 1MB */
+static int lx_pipe_max_size = 1048576;
+
+static int
+lx_fcntl_pipesz(int fd, int cmd, ulong_t arg)
+{
+	file_t *fp;
+	vnode_t *vp;
+	stdata_t *str;
+	int err = 0, res = 0;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VFIFO || vp->v_op != fifo_vnodeops) {
+		err = EBADF;
+		goto out;
+	}
+	VERIFY((str = vp->v_stream) != NULL);
+	/* The size is modeled as the read queue's high-water mark (QHIWAT). */
+	if (cmd == LX_F_SETPIPE_SZ) {
+		stdata_t *mate;
+		intptr_t val = arg;
+
+		if (val < PAGESIZE || val > lx_pipe_max_size) {
+			err = EINVAL;
+			goto out;
+		}
+		if (!STRMATED(str)) {
+			err = strqset(RD(str->sd_wrq), QHIWAT, 0, val);
+			goto out;
+		}
+
+		/*
+		 * Ensure consistent order so the set operation is always
+		 * attempted on the "higher" (by address) stream first.
+		 */
+		if (str > str->sd_mate) {
+			VERIFY((mate = str->sd_mate) != NULL);
+		} else {
+			mate = str;
+			VERIFY((str = mate->sd_mate) != NULL);
+		}
+
+		/*
+		 * While it is unfortunate that an error could occur for the
+		 * latter half of the stream pair, there is little to be done
+		 * about it aside from reporting the failure.
+		 */
+		if ((err = strqset(RD(str->sd_wrq), QHIWAT, 0, val)) != 0) {
+			goto out;
+		}
+		err = strqset(RD(mate->sd_wrq), QHIWAT, 0, val);
+	} else if (cmd == LX_F_GETPIPE_SZ) {
+		size_t val;
+
+		err = strqget(RD(str->sd_wrq), QHIWAT, 0, &val);
+		res = val;	/* discarded below when err != 0 */
+	} else {
+		/* NOTREACHED */
+		ASSERT(0);
+	}
+
+out:
+	releasef(fd);
+	if (err != 0) {
+		return (set_errno(err));
+	}
+	return (res);
+}
+
+static int
+lx_fcntl_common(int fd, int cmd, ulong_t arg)
+{
+	int		rc = 0;
+	pid_t		pid;
+	int		error;
+	int		rv;
+	int32_t		flag;
+	file_t		*fp;
+
+	/*
+	 * Start with errno clear: fcntl() sets it, lx_fcntl_getfl() tests it.
+	 */
+	ttolwp(curthread)->lwp_errno = 0;
+
+	switch (cmd) {
+	case LX_F_SETSIG:
+	case LX_F_GETSIG:
+	case LX_F_SETLEASE:
+	case LX_F_GETLEASE:
+	case LX_F_NOTIFY:
+	case LX_F_CANCELLK:
+		{
+			char buf[80];
+
+			(void) snprintf(buf, sizeof (buf),
+			    "unsupported fcntl command: %d", cmd);
+			lx_unsupported(buf);
+		}
+		return (set_errno(ENOTSUP));
+
+	case LX_F_DUPFD:
+		rc = fcntl(fd, F_DUPFD, arg);
+		break;
+
+	case LX_F_DUPFD_CLOEXEC:
+		rc = fcntl(fd, F_DUPFD_CLOEXEC, arg);
+		break;
+
+	case LX_F_GETFD:
+		rc = fcntl(fd, F_GETFD, 0);
+		break;
+
+	case LX_F_SETFD:
+		rc = fcntl(fd, F_SETFD, arg);
+		break;
+
+	case LX_F_GETFL:
+		rc = lx_fcntl_getfl(fd);
+		break;
+
+	case LX_F_SETFL:
+		rc = lx_fcntl_setfl(fd, arg);
+		break;
+
+	case LX_F_SETOWN:
+		pid = (pid_t)arg;
+		if (pid == 1) {
+			/* Setown for the init process uses the real pid. */
+			pid = curzone->zone_proc_initpid;
+		}
+
+		if ((fp = getf(fd)) == NULL)
+			return (set_errno(EBADF));
+
+		rv = 0;
+
+		flag = fp->f_flag | get_udatamodel() | FKIOCTL;
+		error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid,
+		    flag, CRED(), &rv, NULL);
+		releasef(fd);
+		if (error != 0) {
+			/*
+			 * On illumos F_SETOWN is only defined for sockets, but
+			 * some apps hardcode to do this fcntl on other devices
+			 * (e.g. /dev/tty) to setup signal handling. If the
+			 * app is only setting itself to be the signal
+			 * handler, we pretend to succeed.
+			 */
+			if (error != EINVAL ||
+			    curthread->t_procp->p_pid != pid) {
+				return (set_errno(error));
+			}
+		}
+
+		rc = 0;
+		break;
+
+	case LX_F_GETOWN:
+		if ((fp = getf(fd)) == NULL)
+			return (set_errno(EBADF));
+
+		rv = 0;
+		/* NOTE(review): pid is set only if the FIOGETOWN handler stores it. */
+		flag = fp->f_flag | get_udatamodel() | FKIOCTL;
+		error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid,
+		    flag, CRED(), &rv, NULL);
+		releasef(fd);
+		if (error != 0)
+			return (set_errno(error));
+
+		if (pid == curzone->zone_proc_initpid) {
+			/* Getown for the init process returns 1. */
+			pid = 1;
+		}
+
+		rc = pid;
+		break;
+
+	case LX_F_SETPIPE_SZ:
+	case LX_F_GETPIPE_SZ:
+		rc = lx_fcntl_pipesz(fd, cmd, arg);
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	return (rc);
+}
+
+static int
+lx_fcntl_lock_cmd_to_s(int lx_cmd)
+{
+	/*
+	 * Translate a Linux locking command; any other value trips VERIFY.
+	 */
+	if (lx_cmd == LX_F_GETLK)
+		return (F_GETLK);
+	if (lx_cmd == LX_F_SETLK)
+		return (F_SETLK);
+	if (lx_cmd == LX_F_SETLKW)
+		return (F_SETLKW);
+	if (lx_cmd == LX_F_GETLK64)
+		return (F_GETLK64);
+	if (lx_cmd == LX_F_SETLK64)
+		return (F_SETLK64);
+	if (lx_cmd == LX_F_SETLKW64)
+		return (F_SETLKW64);
+	VERIFY(0);
+	/*NOTREACHED*/
+	return (0);
+}
+
+/*
+ * Record locking for lx_fcntl()/lx_fcntl64().  The native fcntl code cannot be
+ * reused because it does its own copyin/copyout of the flock structure, and we
+ * must convert between the Linux and native layouts ourselves.  The 64-bit
+ * commands are accepted from ILP32 callers only.
+ */
+static int
+lx_fcntl_lock(int fd, int lx_cmd, void *arg)
+{
+	int cmd;
+	int error = 0;
+	file_t *fp;
+	vnode_t *vp;
+	int flag;
+	offset_t maxoffset;
+	u_offset_t offset;
+	model_t datamodel;
+	lx_flock_t lxflk;
+	lx_flock64_32_t lxflk64;
+	struct flock64 bf;
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	maxoffset = MAXOFF_T;
+	datamodel = DATAMODEL_NATIVE;
+#if defined(_SYSCALL32_IMPL)
+	if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32)
+		maxoffset = MAXOFF32_T;
+#endif
+	vp = fp->f_vnode;
+	flag = fp->f_flag;
+	offset = fp->f_offset;
+
+	cmd = lx_fcntl_lock_cmd_to_s(lx_cmd);
+
+	switch (cmd) {
+	case F_GETLK:
+	case F_SETLK:
+	case F_SETLKW:
+		if (datamodel == DATAMODEL_NATIVE) {
+			if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			lx_flock32_t lxflk32;
+
+			if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) {
+				error = EFAULT;
+				break;
+			}
+
+			lxflk.l_type = lxflk32.l_type;
+			lxflk.l_whence = lxflk32.l_whence;
+			lxflk.l_start = (off64_t)lxflk32.l_start;
+			lxflk.l_len = (off64_t)lxflk32.l_len;
+			lxflk.l_pid = lxflk32.l_pid;
+		}
+#endif /* _SYSCALL32_IMPL */
+
+		ltos_flock(&lxflk, &bf);
+
+		if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+			break;
+
+		if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+		    fp->f_cred, NULL)) != 0)
+			break;
+
+		if (cmd != F_GETLK)
+			break;
+
+		/*
+		 * The command is GETLK, return result.
+		 */
+		stol_flock(&bf, &lxflk);
+
+		/*
+		 * If no lock is found, only the type field is changed.
+		 */
+		if (lxflk.l_type == LX_F_UNLCK) {
+			/* l_type always first entry, always a short */
+			if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type,
+			    sizeof (lxflk.l_type)))
+				error = EFAULT;
+			break;
+		}
+
+		if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+			error = EOVERFLOW;
+			break;
+		}
+
+		if (datamodel == DATAMODEL_NATIVE) {
+			if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			lx_flock32_t lxflk32;
+
+			if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) {
+				error = EOVERFLOW;
+				break;
+			}
+
+			lxflk32.l_type = lxflk.l_type;
+			lxflk32.l_whence = lxflk.l_whence;
+			lxflk32.l_start = lxflk.l_start;
+			lxflk32.l_len = lxflk.l_len;
+			lxflk32.l_pid = lxflk.l_pid;
+
+			if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#endif /* _SYSCALL32_IMPL */
+		break;
+
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+		/*
+		 * Large File support is only used for ILP32 apps.
+		 */
+		if (datamodel != DATAMODEL_ILP32) {
+			error = EINVAL;
+			break;
+		}
+
+		if (cmd == F_GETLK64)
+			cmd = F_GETLK;
+		else if (cmd == F_SETLK64)
+			cmd = F_SETLK;
+		else if (cmd == F_SETLKW64)
+			cmd = F_SETLKW;
+
+		if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) {
+			error = EFAULT;
+			break;
+		}
+
+		ltos_flock64(&lxflk64, &bf);
+
+		if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0)
+			break;
+
+		if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+		    fp->f_cred, NULL)) != 0)
+			break;
+
+		if (cmd != F_GETLK)
+			break;
+
+		/*
+		 * The command is GETLK, return result.
+		 */
+		stol_flock64(&bf, &lxflk64);
+
+		/*
+		 * If no lock is found, only the type field is changed.
+		 */
+		if (lxflk64.l_type == LX_F_UNLCK) {
+			/* l_type always first entry, always a short */
+			if (copyout(&lxflk64.l_type,
+			    &((lx_flock64_t *)arg)->l_type,
+			    sizeof (lxflk64.l_type)))
+				error = EFAULT;
+			break;
+		}
+
+		/*
+		 * No range check against maxoffset here: lxflk64 is the
+		 * large-file structure, so offsets past MAXOFF32_T are valid.
+		 */
+
+		if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) {
+			error = EFAULT;
+			break;
+		}
+		break;
+	}
+
+	releasef(fd);
+	if (error)
+		return (set_errno(error));
+
+	return (0);
+}
+
+long
+lx_fcntl(int fd, int cmd, intptr_t arg)
+{
+	/*
+	 * The 64-bit fcntl commands must go through fcntl64(), so they are
+	 * refused outright on this entry point.
+	 */
+	if (cmd == LX_F_GETLK64 || cmd == LX_F_SETLK64 ||
+	    cmd == LX_F_SETLKW64) {
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Record locks need their structure converted; all else is generic.
+	 */
+	if (cmd == LX_F_GETLK || cmd == LX_F_SETLK || cmd == LX_F_SETLKW)
+		return (lx_fcntl_lock(fd, cmd, (void *)arg));
+	return (lx_fcntl_common(fd, cmd, arg));
+}
+
+long
+lx_fcntl64(int fd, int cmd, intptr_t arg)
+{
+	/*
+	 * Both the plain and the 64-bit locking commands are accepted on
+	 * this entry point and share one implementation; every other
+	 * command goes to the common handler.
+	 */
+	if (cmd == LX_F_GETLK || cmd == LX_F_SETLK || cmd == LX_F_SETLKW ||
+	    cmd == LX_F_GETLK64 || cmd == LX_F_SETLK64 ||
+	    cmd == LX_F_SETLKW64) {
+		return (lx_fcntl_lock(fd, cmd, (void *)arg));
+	}
+
+	return (lx_fcntl_common(fd, cmd, (ulong_t)arg));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
new file mode 100644
index 0000000000..e7648e1fc3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
@@ -0,0 +1,1104 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/priv.h>
+#include <sys/mman.h>
+#include <sys/timer.h>
+#include <sys/condvar.h>
+#include <sys/inttypes.h>
+#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_impl.h>
+
+/*
+ * Futexes are a Linux-specific implementation of inter-process mutexes.
+ * They are designed to use shared memory for simple, uncontested
+ * operations, and rely on the kernel to resolve any contention issues.
+ *
+ * Most of the information in this section comes from the paper "Futexes
+ * Are Tricky", by Ulrich Drepper.  This paper is currently available at:
+ * http://people.redhat.com/~drepper/futex.pdf.
+ *
+ * A futex is itself a 4-byte integer, which must be 4-byte aligned.  The
+ * value of this integer is expected to be modified using user-level atomic
+ * operations.  The futex(4) design itself does not impose any semantic
+ * constraints on the value stored in the futex; it is up to the
+ * application to define its own protocol.
+ *
+ * When the application decides that kernel intervention is required, it
+ * will use the futex(2) system call.  There are 5 different operations
+ * that can be performed on a futex, using this system call.  Since this
+ * interface has evolved over time, there are several different prototypes
+ * available to the user.  Fortunately, there is only a single kernel-level
+ * interface:
+ *
+ * long sys_futex(void *futex1, int cmd, int val1,
+ * 	struct timespec	*timeout, void *futex2, int val2)
+ *
+ * The kernel-level operations that may be performed on a futex are:
+ *
+ * FUTEX_WAIT
+ *
+ *	Atomically verify that futex1 contains the value val1.  If it
+ *	doesn't, return EWOULDBLOCK.  If it does contain the expected
+ *	value, the thread will sleep until somebody performs a FUTEX_WAKE
+ *	on the futex.  The caller may also specify a timeout, indicating
+ *	the maximum time the thread should sleep.  If the timer expires,
+ *	the call returns ETIMEDOUT.  If the thread is awoken with a signal,
+ *	the call returns EINTR.  Otherwise, the call returns 0.
+ *
+ * FUTEX_WAKE
+ *
+ *	Wake up val1 processes that are waiting on futex1.  The call
+ *	returns the number of blocked threads that were woken up.
+ *
+ * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET
+ *
+ *	Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument
+ *	denoting a bit vector, with wakers only waking waiters that match
+ *	in one or more bits.  These semantics are dubious enough, but the
+ *	interface has an inconsistency that is glaring even by the
+ *	embarrassingly low standards that Linux sets for itself:  the timeout
+ *	argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for
+ *	FUTEX_WAIT.  And as if that weren't enough unnecessary complexity,
+ *	the caller may specify this absolute timeout to be against either
+ *	CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET,
+ *	of course!
+ *
+ * FUTEX_WAKE_OP
+ *
+ *	The implementation of a conditional variable in terms of futexes
+ *	actually uses two futexes:  one to assure sequential access and one to
+ *	represent the condition variable.  This implementation gives rise to a
+ *	particular performance problem whereby a thread is awoken on the futex
+ *	that represents the condition variable only to have to (potentially)
+ *	immediately wait on the futex that protects the condition variable.
+ *	(Do not confuse the futex that serves to protect the condition variable
+ *	with the pthread_mutex_t associated with pthread_cond_t -- which
+ *	represents a third futex.)  To (over)solve this problem, FUTEX_WAKE_OP
+ *	was invented, which performs an atomic compare-and-exchange on a
+ *	second address in a specified fashion (that is, with a specified
+ *	operation).  Here are the possible operations (OPARG is defined
+ *	to be a 12-bit value embedded in the operation):
+ *
+ *	- FUTEX_OP_SET: Sets the value at the second address to OPARG
+ *	- FUTEX_OP_ADD: Adds the value to OPARG
+ *	- FUTEX_OP_OR: OR's the value with OPARG
+ *	- FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG
+ *	- FUTEX_OP_XOR: XOR's the value with OPARG
+ *
+ *	After this compare-and-exchange on the second address, a FUTEX_WAKE is
+ *	performed on the first address and -- if the compare-and-exchange
+ *	matches a specified result based on a specified comparison operation --
+ *	a FUTEX_WAKE is performed on the second address.  Here are the possible
+ *	comparison operations:
+ *
+ *	- FUTEX_OP_CMP_EQ: If old value is CMPARG, wake
+ *	- FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake
+ *	- FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake
+ *	- FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake
+ *	- FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake
+ *	- FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake
+ *
+ *	As a practical matter, the only way that this is used (or, some might
+ *	argue, is usable) is by the implementation of pthread_cond_signal(),
+ *	which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the
+ *	futex that protects the condition variable and wake the futex that
+ *	represents the condition variable.  The second wake-up is conditional
+ *	because the futex that protects the condition variable (rather than the
+ *	one that represents it) may or may not have waiters.  Given that this
+ *	is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for
+ *	five different kinds of operations and six different kinds of
+ *	comparison operations, in practice only one is used.  (Namely, setting
+ *	to 0 and waking if the old value is greater than 1 -- which denotes
+ *	that waiters are present and the wakeup should be performed.) Moreover,
+ *	because FUTEX_WAKE_OP does not (and cannot) optimize anything in the
+ *	case that the pthread_mutex_t associated with the pthread_cond_t is
+ *	held at the time of a pthread_cond_signal(), this entire mechanism is
+ *	essentially for naught in this case.  As one can imagine (and can
+ *	verify on just about any source base that uses pthread_cond_signal()),
+ *	it is overwhelmingly the common case that the lock associated with the
+ *	pthread_cond_t is held at the time of pthread_cond_signal(), assuring
+ *	that the problem that all of this complexity was designed to solve
+ *	isn't, in fact, solved because the signalled thread simply wakes up
+ *	only to block again on the held mutex.  Cue a slow clap!
+ *
+ * FUTEX_CMP_REQUEUE
+ *
+ *	If the value stored in futex1 matches that passed in in val2, wake
+ *	up val1 processes that are waiting on futex1.  Otherwise, return
+ *	EAGAIN.
+ *
+ *	If there are more than val1 threads waiting on the futex, remove
+ *	the remaining threads from this futex, and requeue them on futex2.
+ *	The caller can limit the number of threads being requeued by
+ *	encoding an integral numerical value in the position usually used
+ *	for the timeout pointer.
+ *
+ *	The call returns the number of blocked threads that were woken up
+ *	or requeued.
+ *
+ * FUTEX_REQUEUE
+ *
+ *	 Identical to FUTEX_CMP_REQUEUE except that it does not use val2.
+ *	 This command has been declared broken and obsolete, but we still
+ *	 need to support it.
+ *
+ * FUTEX_FD
+ *
+ *	Return a file descriptor, which can be used to refer to the futex.
+ *	This operation was broken by design, and was blessedly removed in
+ *	Linux 2.6.26 ("because it was inherently racy"); it should go without
+ *	saying that we don't support this operation.
+ */
+
+/*
+ * This structure is used to track all the threads currently waiting on a
+ * futex.  There is one fwaiter_t for each blocked thread.  We store all
+ * fwaiter_t's in a hash structure, indexed by the memid_t of the integer
+ * containing the futex's value.
+ *
+ * At the moment, all fwaiter_t's for a single futex are simply dumped into
+ * the hash bucket.  If futex contention ever becomes a hot path, we can
+ * chain a single futex's waiters together.
+ *
+ * futex_wait() declares its fwaiter_t as a local variable, clears fw_woken
+ * before sleeping, and sleeps on fw_cv.  The wake paths (futex_wake(),
+ * futex_wake_op() and futex_requeue()) remove the entry from its chain, set
+ * fw_woken to 1 and signal fw_cv.
+ */
+typedef struct fwaiter {
+	memid_t		fw_memid;	/* memid of the user-space futex */
+	kcondvar_t	fw_cv;		/* cond var */
+	struct fwaiter	*fw_next;	/* hash queue */
+	struct fwaiter	*fw_prev;	/* hash queue */
+	uint32_t	fw_bits;	/* bits waiting on */
+	volatile int	fw_woken;	/* set to 1 by a waker */
+} fwaiter_t;
+
+/*
+ * The structure of the robust_list, as set with the set_robust_list() system
+ * call.  See lx_futex_robust_exit(), below, for details.
+ *
+ * lx_futex_robust_exit() copies this structure in from user space, so its
+ * layout describes user memory.
+ */
+typedef struct futex_robust_list {
+	uintptr_t	frl_head;	/* list of robust locks held */
+	uint64_t	frl_offset;	/* offset of lock word within a lock */
+	uintptr_t	frl_pending;	/* pending operation */
+} futex_robust_list_t;
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * The form of the robust list head that is copied in when the caller's data
+ * model is not DATAMODEL_NATIVE; every member is 32 bits wide and the
+ * structure is packed on 4-byte boundaries.
+ */
+#pragma pack(4)
+typedef struct futex_robust_list32 {
+	uint32_t	frl_head;	/* list of robust locks held */
+	uint32_t	frl_offset;	/* offset of lock word within a lock */
+	uint32_t	frl_pending;	/* pending operation */
+} futex_robust_list32_t;
+#pragma pack()
+
+#endif
+
+/*
+ * Copy both words of the memid pointed to by (s) into the memid pointed to
+ * by (d).
+ *
+ * NOTE(review): this expands to a bare brace block rather than a
+ * do { } while (0) statement, so it must not be used as the unbraced body of
+ * an if that has an else branch.
+ */
+#define	MEMID_COPY(s, d) \
+	{ (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; }
+/* Evaluates to non-zero when both words of the two memids are equal. */
+#define	MEMID_EQUAL(s, d) \
+	((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1])
+
+/*
+ * Because collisions on this hash table can be a source of negative
+ * scalability, we make it pretty large: 4,096 entries -- 64K.  If this
+ * size is found to be insufficient, the size should be made dynamic.
+ * (Making it dynamic will be delicate because the per-chain locking will
+ * necessitate memory retiring or similar; see the 2008 ACM Queue article
+ * "Real-world concurrency" for details on this technique.)
+ */
+/* log2 of the number of hash chains, and the number of hash chains */
+#define	HASH_SHIFT_SZ	12
+#define	HASH_SIZE	(1 << HASH_SHIFT_SZ)
+/*
+ * Map a memid to a chain index:  each of the two memid words is shifted
+ * right by 3, by 3 + HASH_SHIFT_SZ and by 3 + 2 * HASH_SHIFT_SZ bits, the
+ * six results are summed, and the sum is masked down to the table size.
+ */
+#define	HASH_FUNC(id)						\
+	((((uintptr_t)((id)->val[1]) >> 3) +			\
+	((uintptr_t)((id)->val[1]) >> (3 + HASH_SHIFT_SZ)) +		\
+	((uintptr_t)((id)->val[1]) >> (3 + 2 * HASH_SHIFT_SZ)) +	\
+	((uintptr_t)((id)->val[0]) >> 3) +				\
+	((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) +		\
+	((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) &	\
+	(HASH_SIZE - 1))
+
+/*
+ * We place the per-chain lock next to the pointer to the chain itself.
+ * When compared to an array of orthogonal locks, this reduces false sharing
+ * (though adjacent entries can still be falsely shared -- just not as many),
+ * while having the additional bonus of increasing locality.
+ */
+typedef struct futex_hash {
+	kmutex_t fh_lock;	/* protects fh_waiters and the chain */
+	fwaiter_t *fh_waiters;	/* head of the doubly-linked waiter chain */
+} futex_hash_t;
+
+/* The table of chains; the locks are set up in lx_futex_init(). */
+static futex_hash_t futex_hash[HASH_SIZE];
+
+/*
+ * Insert a waiter at the head of the hash chain selected by its memid.  The
+ * lock for that chain must already be held.
+ */
+static void
+futex_hashin(fwaiter_t *fwp)
+{
+	futex_hash_t *bucket = &futex_hash[HASH_FUNC(&fwp->fw_memid)];
+	fwaiter_t *head;
+
+	ASSERT(MUTEX_HELD(&bucket->fh_lock));
+
+	head = bucket->fh_waiters;
+	if (head != NULL)
+		head->fw_prev = fwp;
+
+	fwp->fw_next = head;
+	fwp->fw_prev = NULL;
+	bucket->fh_waiters = fwp;
+}
+
+/*
+ * Unlink a waiter from the hash chain selected by its memid and clear its
+ * link pointers.  The lock for that chain must already be held.
+ */
+static void
+futex_hashout(fwaiter_t *fwp)
+{
+	futex_hash_t *bucket = &futex_hash[HASH_FUNC(&fwp->fw_memid)];
+	fwaiter_t *before = fwp->fw_prev;
+	fwaiter_t *after = fwp->fw_next;
+
+	ASSERT(MUTEX_HELD(&bucket->fh_lock));
+
+	if (before != NULL)
+		before->fw_next = after;
+	if (after != NULL)
+		after->fw_prev = before;
+	if (bucket->fh_waiters == fwp)
+		bucket->fh_waiters = after;
+
+	fwp->fw_next = NULL;
+	fwp->fw_prev = NULL;
+}
+
+/*
+ * Go to sleep until somebody does a WAKE operation on this futex, we get a
+ * signal, or the timeout expires.
+ *
+ * memid identifies the futex for hashing, addr is the user address of the
+ * futex word, val is the value the word must still hold for us to sleep,
+ * timeout (if not NULL) is handed to cv_waituntil_sig(), and bits is the
+ * bitset recorded for matching by wakers.
+ *
+ * Returns 0 if we were woken by a waker; otherwise returns the result of
+ * set_errno() for EFAULT (the word could not be read), EWOULDBLOCK (the word
+ * did not hold val), ETIMEDOUT or EINTR.
+ *
+ * NOTE(review): we sleep on, and later hash ourselves out under, the lock of
+ * the chain computed here from the original memid.  futex_requeue() can
+ * rewrite fw_memid and move the waiter to a different chain; in that case
+ * the futex_hashout() below would operate on a chain whose lock is not the
+ * one we hold.  Confirm whether that path needs its own handling.
+ */
+static int
+futex_wait(memid_t *memid, caddr_t addr,
+    int val, timespec_t *timeout, uint32_t bits)
+{
+	kthread_t *t = curthread;
+	int err, ret;
+	int32_t curval;
+	fwaiter_t fw;
+	int index;
+
+	/*
+	 * The LMS_USER_LOCK micro state becomes valid if we sleep; otherwise
+	 * our time will accrue against LMS_SYSTEM.  Use of this micro state
+	 * is modelled on lwp_mutex_timedlock(), a native analogue of
+	 * futex_wait().
+	 */
+	(void) new_mstate(t, LMS_USER_LOCK);
+
+	fw.fw_woken = 0;
+	fw.fw_bits = bits;
+
+	MEMID_COPY(memid, &fw.fw_memid);
+	cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL);
+
+	index = HASH_FUNC(&fw.fw_memid);
+	mutex_enter(&futex_hash[index].fh_lock);
+
+	/*
+	 * The futex word is read with the chain lock held, so the value
+	 * check and the insertion into the chain are not separated by a
+	 * wake on this chain.
+	 */
+	if (fuword32(addr, (uint32_t *)&curval)) {
+		err = set_errno(EFAULT);
+		goto out;
+	}
+	if (curval != val) {
+		err = set_errno(EWOULDBLOCK);
+		goto out;
+	}
+
+	futex_hashin(&fw);
+
+	/*
+	 * Sleep until a waker sets fw_woken or the wait fails; a negative
+	 * return from cv_waituntil_sig() is treated as a timeout and a zero
+	 * return as an interrupting signal.
+	 */
+	err = 0;
+	while ((fw.fw_woken == 0) && (err == 0)) {
+		ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash[index].fh_lock,
+		    timeout, timechanged);
+		if (ret < 0) {
+			err = set_errno(ETIMEDOUT);
+		} else if (ret == 0) {
+			/*
+			 * According to signal(7), a futex(2) call with the
+			 * FUTEX_WAIT operation is restartable.
+			 */
+			ttolxlwp(t)->br_syscall_restart = B_TRUE;
+			err = set_errno(EINTR);
+		}
+	}
+
+	/*
+	 * The futex is normally hashed out in wakeup.  If we timed out or
+	 * got a signal, we need to hash it out here instead.
+	 */
+	if (fw.fw_woken == 0)
+		futex_hashout(&fw);
+
+out:
+	mutex_exit(&futex_hash[index].fh_lock);
+
+	return (err);
+}
+
+/*
+ * Wake at most wake_threads of the threads blocked on the futex identified
+ * by memid, considering only waiters whose recorded bitset shares at least
+ * one bit with mask.  Returns the number of waiters woken.
+ */
+static int
+futex_wake(memid_t *memid, int wake_threads, uint32_t mask)
+{
+	futex_hash_t *bucket = &futex_hash[HASH_FUNC(memid)];
+	fwaiter_t *cur, *following;
+	int nwoken = 0;
+
+	mutex_enter(&bucket->fh_lock);
+
+	cur = bucket->fh_waiters;
+	while (cur != NULL && nwoken < wake_threads) {
+		/* Saved first:  futex_hashout() clears the waiter's links. */
+		following = cur->fw_next;
+
+		if (MEMID_EQUAL(&cur->fw_memid, memid) &&
+		    (cur->fw_bits & mask) != 0) {
+			futex_hashout(cur);
+			cur->fw_woken = 1;
+			cv_signal(&cur->fw_cv);
+			nwoken++;
+		}
+
+		cur = following;
+	}
+
+	mutex_exit(&bucket->fh_lock);
+
+	return (nwoken);
+}
+
+/*
+ * Perform the FUTEX_WAKE_OP modification of the 32-bit word at the user
+ * address addr, as encoded in val3, and evaluate the encoded comparison
+ * against the value the word held before it was modified.
+ *
+ * The new value is derived from the OLD value of the word:  FUTEX_OP_SET
+ * stores OPARG, while FUTEX_OP_ADD, FUTEX_OP_OR, FUTEX_OP_ANDN and
+ * FUTEX_OP_XOR combine the old value with OPARG.  The store is retried with
+ * atomic_cas_32() until it succeeds against an unchanged old value.
+ *
+ * Returns 1 if the comparison holds and 0 if it does not.  On failure the
+ * result of set_errno() is returned (which the caller detects as a negative
+ * value):  EFAULT if addr is not a user address or the access faults, EINVAL
+ * if the operation or the comparison is not recognized.
+ */
+static int
+futex_wake_op_execute(int32_t *addr, int32_t val3)
+{
+	int32_t op = FUTEX_OP_OP(val3);
+	int32_t cmp = FUTEX_OP_CMP(val3);
+	int32_t cmparg = FUTEX_OP_CMPARG(val3);
+	int32_t oparg, oldval, newval;
+	label_t ljb;
+	int rval;
+
+	if ((uintptr_t)addr >= KERNELBASE)
+		return (set_errno(EFAULT));
+
+	if (on_fault(&ljb))
+		return (set_errno(EFAULT));
+
+	oparg = FUTEX_OP_OPARG(val3);
+
+	do {
+		oldval = *addr;
+
+		switch (op) {
+		case FUTEX_OP_SET:
+			newval = oparg;
+			break;
+
+		case FUTEX_OP_ADD:
+			/* Add as unsigned to get well-defined wraparound. */
+			newval = (int32_t)((uint32_t)oldval +
+			    (uint32_t)oparg);
+			break;
+
+		case FUTEX_OP_OR:
+			newval = oldval | oparg;
+			break;
+
+		case FUTEX_OP_ANDN:
+			newval = oldval & ~oparg;
+			break;
+
+		case FUTEX_OP_XOR:
+			newval = oldval ^ oparg;
+			break;
+
+		default:
+			no_fault();
+			return (set_errno(EINVAL));
+		}
+	} while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval);
+
+	no_fault();
+
+	switch (cmp) {
+	case FUTEX_OP_CMP_EQ:
+		rval = (oldval == cmparg);
+		break;
+
+	case FUTEX_OP_CMP_NE:
+		rval = (oldval != cmparg);
+		break;
+
+	case FUTEX_OP_CMP_LT:
+		rval = (oldval < cmparg);
+		break;
+
+	case FUTEX_OP_CMP_LE:
+		rval = (oldval <= cmparg);
+		break;
+
+	case FUTEX_OP_CMP_GT:
+		rval = (oldval > cmparg);
+		break;
+
+	case FUTEX_OP_CMP_GE:
+		rval = (oldval >= cmparg);
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	return (rval);
+}
+
+/*
+ * Implementation of FUTEX_WAKE_OP:  with the locks of both hash chains held,
+ * apply the operation encoded in val3 to the word at addr2, wake waiters on
+ * the futex identified by memid (stopping once wake_threads have been woken)
+ * and, if the encoded comparison held, wake waiters on the futex identified
+ * by memid2 (stopping once wake_threads2 have been woken).  Returns the
+ * total number of waiters woken.
+ */
+static int
+futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2,
+    int wake_threads, int wake_threads2, int val3)
+{
+	int index1 = HASH_FUNC(memid);
+	int index2 = HASH_FUNC(memid2);
+	int lo = index1, hi = index2;
+	kmutex_t *first, *second;
+	fwaiter_t *cur, *following;
+	int nwoken = 0, nwoken2 = 0, wake;
+
+	/* Always take the lower-indexed chain lock first. */
+	if (index1 > index2) {
+		lo = index2;
+		hi = index1;
+	}
+	first = &futex_hash[lo].fh_lock;
+	second = (lo != hi) ? &futex_hash[hi].fh_lock : NULL;
+
+	mutex_enter(first);
+	if (second != NULL)
+		mutex_enter(second);
+
+	/* LINTED: alignment */
+	wake = futex_wake_op_execute((int32_t *)addr2, val3);
+
+	if (wake >= 0) {
+		cur = futex_hash[index1].fh_waiters;
+		while (cur != NULL) {
+			following = cur->fw_next;
+			if (MEMID_EQUAL(&cur->fw_memid, memid)) {
+				futex_hashout(cur);
+				cur->fw_woken = 1;
+				cv_signal(&cur->fw_cv);
+				if (++nwoken >= wake_threads)
+					break;
+			}
+			cur = following;
+		}
+
+		if (wake != 0) {
+			cur = futex_hash[index2].fh_waiters;
+			while (cur != NULL) {
+				following = cur->fw_next;
+				if (MEMID_EQUAL(&cur->fw_memid, memid2)) {
+					futex_hashout(cur);
+					cur->fw_woken = 1;
+					cv_signal(&cur->fw_cv);
+					if (++nwoken2 >= wake_threads2)
+						break;
+				}
+				cur = following;
+			}
+
+			nwoken += nwoken2;
+		}
+	}
+
+	if (second != NULL)
+		mutex_exit(second);
+	mutex_exit(first);
+
+	return (nwoken);
+}
+
+/*
+ * Wake up to wake_threads waiting on the futex at memid.  If there are
+ * more than that many threads waiting, requeue up to requeue_threads of the
+ * remaining threads on the futex at requeue_memid.
+ *
+ * If cmpval is not NULL, the 32-bit word at the user address addr is read
+ * first and must equal *cmpval; otherwise nothing is woken or requeued and
+ * the call fails with EAGAIN (or EFAULT if the word cannot be read).
+ *
+ * Returns the number of waiters woken plus the number requeued, or the
+ * result of set_errno() on failure.
+ */
+static int
+futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
+	ulong_t requeue_threads, caddr_t addr, int *cmpval)
+{
+	fwaiter_t *fwp, *next;
+	int index1, index2;
+	int ret = 0;
+	ulong_t requeued = 0;
+	int32_t curval;
+	kmutex_t *l1, *l2;
+
+	/*
+	 * To ensure that we don't miss a wakeup if the value of cmpval
+	 * changes, we need to grab locks on both the original and new hash
+	 * buckets.  To avoid deadlock, we always grab the lower-indexed
+	 * lock first.
+	 */
+	index1 = HASH_FUNC(memid);
+	index2 = HASH_FUNC(requeue_memid);
+
+	if (index1 == index2) {
+		l1 = &futex_hash[index1].fh_lock;
+		l2 = NULL;
+	} else if (index1 < index2) {
+		l1 = &futex_hash[index1].fh_lock;
+		l2 = &futex_hash[index2].fh_lock;
+	} else {
+		l1 = &futex_hash[index2].fh_lock;
+		l2 = &futex_hash[index1].fh_lock;
+	}
+
+	mutex_enter(l1);
+	if (l2 != NULL)
+		mutex_enter(l2);
+
+	if (cmpval != NULL) {
+		if (fuword32(addr, (uint32_t *)&curval)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
+	for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) {
+		/*
+		 * Stop as soon as both quotas are used up.  Checking before
+		 * touching the waiter (rather than after requeueing it)
+		 * means that a requeue limit of zero requeues nobody.
+		 */
+		if (ret >= wake_threads && requeued >= requeue_threads)
+			break;
+
+		next = fwp->fw_next;
+		if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+			continue;
+
+		futex_hashout(fwp);
+		if (ret < wake_threads) {
+			fwp->fw_woken = 1;
+			cv_signal(&fwp->fw_cv);
+		} else {
+			/*
+			 * Rehash the waiter under the new futex; it is
+			 * inserted at the head of its chain, so this walk
+			 * will not visit it again.
+			 */
+			MEMID_COPY(requeue_memid, &fwp->fw_memid);
+			futex_hashin(fwp);
+			requeued++;
+		}
+		ret++;
+	}
+
+out:
+	if (l2 != NULL)
+		mutex_exit(l2);
+	mutex_exit(l1);
+
+	if (ret < 0)
+		return (set_errno(-ret));
+	return (ret);
+}
+
+/*
+ * Copy in the timeout provided by the application and produce an absolute
+ * timeout against the current hrestime.  Sadly, this is complicated by the
+ * different timeout semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET:  the
+ * former supplies a relative time, while the latter supplies an absolute
+ * time against the given clock.  (See the block comment at the top of the
+ * file for commentary on this inanity.)
+ *
+ * Returns 0 on success, EFAULT if the timeout cannot be copied in, or EINVAL
+ * if itimerspecfix() rejects it.
+ */
+static int
+get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd, int clock)
+{
+	timestruc_t now;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(lx_timeout, timeout, sizeof (timestruc_t)) != 0)
+			return (EFAULT);
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		timestruc32_t timeout32;
+		if (copyin(lx_timeout, &timeout32,
+		    sizeof (timestruc32_t)) != 0)
+			return (EFAULT);
+		timeout->tv_sec = (time_t)timeout32.tv_sec;
+		timeout->tv_nsec = timeout32.tv_nsec;
+	}
+#endif
+	if (itimerspecfix(timeout))
+		return (EINVAL);
+
+	if (cmd != FUTEX_WAIT) {
+		/*
+		 * This is a FUTEX_WAIT_BITSET operation, whose timeout is
+		 * absolute against the specified clock.  For any clock
+		 * other than CLOCK_MONOTONIC there is nothing to do.
+		 */
+		if (clock != CLOCK_MONOTONIC)
+			return (0);
+
+		/*
+		 * Subtract the current monotonic time to turn the absolute
+		 * timeout back into a relative one, clamping a timeout that
+		 * is already in the past to zero.
+		 */
+		hrt2ts(gethrtime(), &now);
+		timespecsub(timeout, &now);
+
+		if (timeout->tv_sec < 0) {
+			timeout->tv_sec = 0;
+			timeout->tv_nsec = 0;
+		}
+	}
+
+	/*
+	 * At this point we hold a relative time; add it to the current time
+	 * to derive the absolute timeout.
+	 */
+	gethrestime(&now);
+	timespecadd(timeout, &now);
+
+	return (0);
+}
+
+/*
+ * The futex() system call.  addr is the user address of the futex word, op
+ * carries the command together with the FUTEX_PRIVATE_FLAG and
+ * FUTEX_CLOCK_REALTIME flags, and the meaning of val, lx_timeout, addr2 and
+ * val3 depends on the command (see the block comment at the top of the
+ * file).
+ *
+ * Returns the result of the selected operation, or the result of
+ * set_errno() on failure:  EINVAL for a misaligned address, an out-of-range
+ * command or a zero bitset; ENOSYS for FUTEX_FD, the PI commands and
+ * FUTEX_CLOCK_REALTIME on a command other than FUTEX_WAIT_BITSET; and
+ * whatever error get_timeout() or as_getmemid() report.
+ */
+long
+lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout,
+	uintptr_t addr2, int val3)
+{
+	struct as *as = curproc->p_as;
+	memid_t memid, memid2;
+	timestruc_t timeout;
+	timestruc_t *tptr = NULL;
+	int val2 = 0;
+	int rval = 0;
+	int cmd = op & FUTEX_CMD_MASK;
+	int private = op & FUTEX_PRIVATE_FLAG;
+	char dmsg[32];
+
+	/* must be aligned on int boundary */
+	if (addr & 0x3)
+		return (set_errno(EINVAL));
+
+	/* Sanity check the futex command */
+	if (cmd < 0 || cmd > FUTEX_MAX_CMD)
+		return (set_errno(EINVAL));
+
+	if (cmd == FUTEX_FD) {
+		/*
+		 * FUTEX_FD was sentenced to death for grievous crimes of
+		 * semantics against humanity; it has been ripped out of Linux
+		 * and will never be supported by us.
+		 */
+		(void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+		lx_unsupported(dmsg);
+		return (set_errno(ENOSYS));
+	}
+
+	switch (cmd) {
+	case FUTEX_LOCK_PI:
+	case FUTEX_UNLOCK_PI:
+	case FUTEX_TRYLOCK_PI:
+	case FUTEX_WAIT_REQUEUE_PI:
+	case FUTEX_CMP_REQUEUE_PI:
+		/*
+		 * These are operations that we don't currently support, but
+		 * may well need to in the future.  For now, callers need to
+		 * deal with these being missing -- but if and as that changes,
+		 * they may well need to be implemented.
+		 */
+		(void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+		lx_unsupported(dmsg);
+		return (set_errno(ENOSYS));
+	}
+
+	if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) {
+		/*
+		 * Linux only allows FUTEX_CLOCK_REALTIME to be set on the
+		 * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands.
+		 */
+		return (set_errno(ENOSYS));
+	}
+
+	/* Copy in the timeout structure from userspace. */
+	if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET) &&
+	    lx_timeout != 0) {
+		rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd,
+		    op & FUTEX_CLOCK_REALTIME ? CLOCK_REALTIME :
+		    CLOCK_MONOTONIC);
+
+		if (rval != 0)
+			return (set_errno(rval));
+		tptr = &timeout;
+	}
+
+	/*
+	 * A bitset of zero can never match anything:  a waiter recording it
+	 * could not be woken by any waker, and a waker passing it could not
+	 * wake any waiter.  Linux rejects it with EINVAL, and so do we.
+	 */
+	if ((cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAKE_BITSET) &&
+	    val3 == 0) {
+		return (set_errno(EINVAL));
+	}
+
+	switch (cmd) {
+	case FUTEX_REQUEUE:
+	case FUTEX_CMP_REQUEUE:
+	case FUTEX_WAKE_OP:
+		/*
+		 * lx_timeout is nominally a pointer to a userspace address.
+		 * For several commands, however, it actually contains
+		 * an additional integer parameter.  This is horrible, and
+		 * the people who did this to us should be sorry.
+		 */
+		val2 = (int)lx_timeout;
+	}
+
+	/*
+	 * Translate the process-specific, user-space futex virtual
+	 * address(es) to a universal memid.  If the private bit is set, we
+	 * can just use our as plus the virtual address, saving quite a bit
+	 * of effort.
+	 */
+	if (private) {
+		memid.val[0] = (uintptr_t)as;
+		memid.val[1] = (uintptr_t)addr;
+	} else {
+		rval = as_getmemid(as, (void *)addr, &memid);
+		if (rval != 0)
+			return (set_errno(rval));
+	}
+
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_WAKE_OP) {
+		if (addr2 & 0x3)
+			return (set_errno(EINVAL));
+
+		if (private) {
+			memid2.val[0] = (uintptr_t)as;
+			memid2.val[1] = (uintptr_t)addr2;
+		} else {
+			rval = as_getmemid(as, (void *)addr2, &memid2);
+			if (rval)
+				return (set_errno(rval));
+		}
+	}
+
+	switch (cmd) {
+	case FUTEX_WAIT:
+		rval = futex_wait(&memid, (void *)addr, val,
+		    tptr, FUTEX_BITSET_MATCH_ANY);
+		break;
+
+	case FUTEX_WAIT_BITSET:
+		rval = futex_wait(&memid, (void *)addr, val, tptr, val3);
+		break;
+
+	case FUTEX_WAKE:
+		rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY);
+		break;
+
+	case FUTEX_WAKE_BITSET:
+		rval = futex_wake(&memid, val, val3);
+		break;
+
+	case FUTEX_WAKE_OP:
+		rval = futex_wake_op(&memid, (void *)addr2, &memid2,
+		    val, val2, val3);
+		break;
+
+	case FUTEX_CMP_REQUEUE:
+		/*
+		 * The comparison is made against the first futex:  the
+		 * requeue only proceeds if the word at addr still holds
+		 * val3.
+		 */
+		rval = futex_requeue(&memid, &memid2, val,
+		    val2, (void *)addr, &val3);
+		break;
+
+	case FUTEX_REQUEUE:
+		/*
+		 * Identical to FUTEX_CMP_REQUEUE, but without any
+		 * comparison -- so no comparison value is passed.
+		 */
+		rval = futex_requeue(&memid, &memid2, val,
+		    val2, NULL, NULL);
+		break;
+	}
+
+	return (rval);
+}
+
+/*
+ * Does the dirty work of actually dropping a held robust lock in the event
+ * of the untimely death of the owner; see lx_futex_robust_exit(), below.
+ *
+ * addr is the user address of the lock word and tid is the ID of the exiting
+ * thread.  If the lock word does not name tid as its owner, nothing is done.
+ * Otherwise the word is atomically rewritten to keep only FUTEX_WAITERS and
+ * to set FUTEX_OWNER_DIED, and one waiter is woken.
+ *
+ * The user accesses here are of the "noerr" variety and the caller is
+ * expected to have a fault handler in place, as lx_futex_robust_exit() does.
+ */
+static void
+lx_futex_robust_drop(uintptr_t addr, uint32_t tid)
+{
+	memid_t memid;
+	uint32_t oldval, newval;
+
+	VERIFY(addr + sizeof (uint32_t) < KERNELBASE);
+
+	do {
+		fuword32_noerr((void *)addr, &oldval);
+
+		if ((oldval & FUTEX_TID_MASK) != tid)
+			return;
+
+		newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+	} while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval);
+
+	/*
+	 * We have now denoted that this lock's owner is dead; we need to
+	 * wake any waiters.  Waiters may be hashed under either of two
+	 * keys:  the memid from as_getmemid(), or -- for those that waited
+	 * with FUTEX_PRIVATE_FLAG -- the (as, address) pair that lx_futex()
+	 * builds.  We don't know which was used, so we try the former first
+	 * and fall back to the latter if nobody was woken.
+	 */
+	if (as_getmemid(curproc->p_as, (void *)addr, &memid) == 0 &&
+	    futex_wake(&memid, 1, FUTEX_BITSET_MATCH_ANY) > 0)
+		return;
+
+	memid.val[0] = (uintptr_t)curproc->p_as;
+	memid.val[1] = (uintptr_t)addr;
+	(void) futex_wake(&memid, 1, FUTEX_BITSET_MATCH_ANY);
+}
+
+/*
+ * Called when a thread is exiting.  The role of the kernel is very clearly
+ * spelled out in the Linux design document entitled robust-futex-ABI.txt:
+ * we must (carefully!) iterate over the list of held locks pointed to by
+ * the robust list head; for each lock, we'll check to see if the calling
+ * (exiting) thread is the owner, and if so, denote that the lock is dead
+ * and wake any waiters.  (The "pending" field of the head points to a lock
+ * that is in transition; it should be dropped if held.)  If there are any
+ * errors through here at all (including memory operations), we abort the
+ * entire operation.
+ *
+ * addr is the user address of the robust list head and tid is the ID of the
+ * exiting thread.  The walk ends when it returns to the head or after
+ * FUTEX_ROBUST_LIST_LIMIT entries, whichever comes first.
+ */
+void
+lx_futex_robust_exit(uintptr_t addr, uint32_t tid)
+{
+	futex_robust_list_t list;
+	uintptr_t entry, next;
+	model_t model = get_udatamodel();
+	int length = 0;
+	label_t ljb;
+
+	/* Any fault on a user access below lands here and ends the walk. */
+	if (on_fault(&ljb))
+		return;
+
+	if (addr + sizeof (futex_robust_list_t) >= KERNELBASE)
+		goto out;
+
+	if (model == DATAMODEL_NATIVE) {
+		copyin_noerr((void *)addr, &list, sizeof (list));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else {
+		futex_robust_list32_t list32;
+
+		copyin_noerr((void *)addr, &list32, sizeof (list32));
+		list.frl_head = list32.frl_head;
+		list.frl_offset = list32.frl_offset;
+		list.frl_pending = list32.frl_pending;
+	}
+#endif
+
+	/*
+	 * Strip off the PI bit, if any.
+	 */
+	entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI;
+
+	while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) {
+		if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE)
+			goto out;
+
+		if (model == DATAMODEL_NATIVE) {
+			fulword_noerr((void *)entry, &next);
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			uint32_t next32;
+			fuword32_noerr((void *)entry, &next32);
+			next = next32;
+		}
+#endif
+
+		/*
+		 * Drop the robust mutex -- but only if our pending lock didn't
+		 * somehow sneak on there.
+		 */
+		if (entry != list.frl_pending)
+			lx_futex_robust_drop(entry + list.frl_offset, tid);
+
+		/*
+		 * As with the head, strip the PI bit from the link.  (Note
+		 * that this is the robust-list flag, FUTEX_ROBUST_LOCK_PI,
+		 * and not the FUTEX_LOCK_PI futex command.)
+		 */
+		entry = next & ~FUTEX_ROBUST_LOCK_PI;
+	}
+
+	/*
+	 * Finally, drop the pending lock if there is one.
+	 */
+	if (list.frl_pending != 0 && list.frl_pending +
+	    list.frl_offset + sizeof (uint32_t) < KERNELBASE)
+		lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid);
+
+out:
+	no_fault();
+}
+
+/*
+ * The set_robust_list() system call:  record listp as the calling LWP's
+ * robust list head.  len must equal the size of the robust list head
+ * structure for the caller's data model, or the call fails with EINVAL.
+ */
+long
+lx_set_robust_list(void *listp, size_t len)
+{
+	proc_t *p = curproc;
+	struct lx_lwp_data *lwpd = lwptolxlwp(ttolwp(curthread));
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (len != sizeof (futex_robust_list_t))
+			return (set_errno(EINVAL));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else if (len != sizeof (futex_robust_list32_t)) {
+		return (set_errno(EINVAL));
+	}
+#endif
+
+	/*
+	 * The value is stored with the process locked so that this is
+	 * serialized against a racing lx_get_robust_list().  (sprunlock()
+	 * is what drops p_lock.)
+	 */
+	mutex_enter(&p->p_lock);
+	sprlock_proc(p);
+	lwpd->br_robust_list = listp;
+	sprunlock(p);
+
+	return (0);
+}
+
+/*
+ * The get_robust_list() system call:  store the robust list head recorded
+ * for the thread named by pid (or for the calling thread, if pid is 0) at
+ * listp, and the size of the head structure for the caller's data model at
+ * lenp.  Fails with ESRCH if the target cannot be found or is unsuitable,
+ * EPERM if the caller may not examine it, and EFAULT if either result cannot
+ * be stored.
+ */
+long
+lx_get_robust_list(pid_t pid, void **listp, size_t *lenp)
+{
+	model_t model = get_udatamodel();
+	proc_t *target;
+	lx_lwp_data_t *target_lwpd;
+	void *list;
+	int err = 0;
+
+	if (pid == 0) {
+		/*
+		 * A pid of 0 denotes the current thread; we lock the current
+		 * process even though it isn't strictly necessary (we can't
+		 * race with set_robust_list() because a thread may only set
+		 * its robust list on itself).
+		 */
+		target = curproc;
+		target_lwpd = lwptolxlwp(ttolwp(curthread));
+		mutex_enter(&curproc->p_lock);
+		sprlock_proc(target);
+	} else {
+		pid_t spid;
+		id_t stid;
+		kthread_t *thr;
+		klwp_t *lwp;
+
+		/*
+		 * We couldn't find the specified process.
+		 */
+		if (lx_lpid_to_spair(pid, &spid, &stid) != 0)
+			return (set_errno(ESRCH));
+		if ((target = sprlock(spid)) == NULL)
+			return (set_errno(ESRCH));
+
+		/*
+		 * From here on the target is locked.  We fail with ESRCH if
+		 * the target process does not match our data model, or we
+		 * can't find the LWP, or the target process is not branded.
+		 */
+		err = ESRCH;
+		if (target->p_model != model)
+			goto out;
+		if ((thr = idtot(target, stid)) == NULL)
+			goto out;
+		if ((lwp = ttolwp(thr)) == NULL)
+			goto out;
+		if ((target_lwpd = lwptolxlwp(lwp)) == NULL)
+			goto out;
+		err = 0;
+	}
+
+	if (curproc != target &&
+	    priv_proc_cred_perm(curproc->p_cred, target, NULL, VREAD) != 0) {
+		/*
+		 * We don't have the permission to examine the target.
+		 */
+		err = EPERM;
+		goto out;
+	}
+
+	list = target_lwpd->br_robust_list;
+
+out:
+	sprunlock(target);
+
+	if (err != 0)
+		return (set_errno(err));
+
+	if (model == DATAMODEL_NATIVE) {
+		if (sulword(listp, (uintptr_t)list) != 0)
+			return (set_errno(EFAULT));
+
+		if (sulword(lenp, sizeof (futex_robust_list_t)) != 0)
+			return (set_errno(EFAULT));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else {
+		if (suword32(listp, (uint32_t)(uintptr_t)list) != 0)
+			return (set_errno(EFAULT));
+
+		if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0)
+			return (set_errno(EFAULT));
+	}
+#endif
+
+	return (0);
+}
+
+/*
+ * Initialize the lock of every chain in the futex hash table.
+ */
+void
+lx_futex_init(void)
+{
+	futex_hash_t *bucket;
+
+	for (bucket = futex_hash; bucket < &futex_hash[HASH_SIZE]; bucket++)
+		mutex_init(&bucket->fh_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Check whether the futex hash table is idle.  Returns EBUSY as soon as a
+ * chain with a waiter on it is found, and 0 if every chain is empty.
+ */
+int
+lx_futex_fini(void)
+{
+	int i;
+
+	for (i = 0; i < HASH_SIZE; i++) {
+		int busy;
+
+		mutex_enter(&futex_hash[i].fh_lock);
+		busy = (futex_hash[i].fh_waiters != NULL);
+		mutex_exit(&futex_hash[i].fh_lock);
+
+		if (busy)
+			return (EBUSY);
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c
new file mode 100644
index 0000000000..7fcc594d81
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c
@@ -0,0 +1,50 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+
+/*
+ * lx_getcwd() - unlike the native call, the Linux syscall returns the number
+ * of bytes copied out to 'buf', counting the terminating NUL byte.
+ */
+long
+lx_getcwd(char *buf, int size)
+{
+	char cwdpath[MAXPATHLEN + 1];
+	vnode_t *cdir = PTOU(curproc)->u_cdir;
+	int err;
+	int nbytes;
+
+	/* Hold the cwd vnode only for the duration of the path lookup. */
+	VN_HOLD(cdir);
+	err = vnodetopath(NULL, cdir, cwdpath, sizeof (cwdpath), CRED());
+	VN_RELE(cdir);
+	if (err != 0)
+		return (set_errno(err));
+
+	/* The length reported to the caller includes the NUL terminator. */
+	nbytes = strlen(cwdpath) + 1;
+	if (nbytes > size) {
+		return (set_errno(ERANGE));
+	} else if (copyout(cwdpath, buf, nbytes) != 0) {
+		return (set_errno(EFAULT));
+	}
+
+	return (nbytes);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c
new file mode 100644
index 0000000000..102d521e02
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c
@@ -0,0 +1,350 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/filio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/inttypes.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+
+#include <sys/lx_types.h>
+#include <sys/lx_misc.h>
+
+#define	LX_NAMEMAX	256	/* names are cut to LX_NAMEMAX - 1 bytes */
+
+#define	LX_GETDENTS_MAX_BUFSZ	65536	/* cap on the translation buffers */
+
+/*
+ * Because the Linux dirent has an extra field (d_type), it's possible that
+ * each entry will be 8 bytes larger (and aligned to 8 bytes) due to padding.
+ * To prevent overrun during translation, the illumos-native buffer is sized
+ * pessimistically, at one struct dirent per padded Linux record.
+ */
+#define	LTOS_GETDENTS_BUFSZ(bufsz, datasz)	\
+	(((bufsz) / (((datasz) + 15) & ~7)) * sizeof (struct dirent))
+
+/*
+ * A record must be long enough to hold the d_name string ('l' bytes), its NUL
+ * terminator and the d_type field; it is then padded to an 8-byte boundary.
+ */
+#define	LX_RECLEN(l, t)	\
+	((offsetof(t, d_name) + 2 + (l) + 7) & ~7)
+
+/*
+ * Number of bytes from the end of the d_name string to the end of the record.
+ * Zeroing that span also NUL-terminates d_name.
+ */
+#define	LX_ZEROLEN(l, t)	\
+	(LX_RECLEN(l, t) -	\
+	((offsetof(t, d_name) + (l))))
+
+/* The output format of getdents differs if the caller is 32 or 64 bit. */
+struct lx_dirent_32 {
+	uint32_t	d_ino;
+	int32_t		d_off;
+	ushort_t	d_reclen;	/* computed with LX_RECLEN() */
+	char		d_name[1];	/* name is written past this array */
+	uchar_t		d_type;		/* never assigned by name in this file */
+};
+
+struct lx_dirent_64 {
+	uint64_t	d_ino;
+	int64_t		d_off;
+	ushort_t	d_reclen;	/* computed with LX_RECLEN() */
+	char		d_name[1];	/* name is written past this array */
+	uchar_t		d_type;		/* never assigned by name in this file */
+};
+
+/*
+ * Common getdents code: read native dirents from 'fd' via VOP_READDIR, let
+ * 'outcb' convert them to the Linux layout ('lx_size' is that struct's size),
+ * and copy at most 'count' bytes out to 'uptr'.  Returns the bytes copied out.
+ */
+static long
+lx_getdents_common(int fd, caddr_t uptr, size_t count,
+    unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int))
+{
+	vnode_t *vp;
+	file_t *fp;
+	struct uio auio;
+	struct iovec aiov;
+	int error, sbufsz, lbufsz, bufsz;
+	void *lbuf, *sbuf;
+	size_t outb = 0;
+
+	if (count < lx_size) {
+		return (set_errno(EINVAL));
+	}
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VDIR) {
+		releasef(fd);
+		return (set_errno(ENOTDIR));
+	}
+	if (!(fp->f_flag & FREAD)) {
+		releasef(fd);
+		return (set_errno(EBADF));
+	}
+
+	if (count > LX_GETDENTS_MAX_BUFSZ) {
+		/*
+		 * If the target buffer passed to us is huge, keep the
+		 * translation buffers moderate in size.  Iteration will be
+		 * used to fill the request.
+		 */
+		lbufsz = LX_GETDENTS_MAX_BUFSZ;
+		sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size);
+	} else if (count < (lx_size + MAXPATHLEN)) {
+		/*
+		 * If the target buffer is tiny, allocate a Linux-format buffer
+		 * big enough to hold at least one max-length row while keeping
+		 * the illumos-format buffer pessimistic in size.
+		 * Assuming the buffer is truly tiny, it's likely that the
+		 * result will not fit and an EINVAL will be tossed.
+		 */
+		lbufsz = (lx_size + MAXPATHLEN);
+		sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)),
+		    sizeof (struct dirent));
+	} else {
+		lbufsz = count;
+		sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size);
+	}
+	bufsz = sbufsz;
+	lbuf = kmem_alloc(lbufsz, KM_SLEEP);
+	sbuf = kmem_alloc(sbufsz, KM_SLEEP);
+
+	aiov.iov_base = sbuf;
+	aiov.iov_len = sbufsz;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_loffset = fp->f_offset;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = sbufsz;
+	auio.uio_fmode = 0;
+	auio.uio_extflg = UIO_COPY_CACHED;
+
+	/*
+	 * Since we use a conservative buffer allocation for the differing
+	 * struct sizing and Linux places fewer limits on getdents buffers in
+	 * general, there's a chance we'll undershoot on the record count.
+	 * When this happens, we can simply repeat the READDIR operation until
+	 * the available records are exhausted or we've filled the user buffer.
+	 */
+	while (1) {
+		int at_eof = 0, res;	/* at_eof stays 0 if READDIR omits it */
+		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
+		error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0);
+		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
+		if (error != 0 || auio.uio_resid == bufsz) {
+			break;	/* error, or this pass read nothing */
+		}
+		res = outcb(sbuf, lbuf, bufsz - auio.uio_resid);
+		VERIFY(res <= lbufsz);
+		if (res == 0) {
+			/* no records to copyout from this batch */
+			break;
+		} else if (res > count) {
+			/*
+			 * For very small buffer sizes, it's possible that a
+			 * single record is too large due to a long filename.
+			 */
+			error = EINVAL;
+			break;
+		}
+		VERIFY(outb + res <= count);
+		if (copyout(lbuf, (void *)(uptr + outb), res) != 0) {
+			error = EFAULT;
+			break;
+		}
+		outb += res;
+
+		if (at_eof != 0 || (count - outb) < (lx_size + MAXPATHLEN)) {
+			/*
+			 * If there are no records left or the remaining buffer
+			 * space is not large enough to hold a max-length
+			 * filename, do not continue iteration.
+			 */
+			break;
+		}
+
+		/*
+		 * We undershot the request buffer.
+		 * Reset for another READDIR, taking care not to overshoot.
+		 */
+		bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size));
+		auio.uio_resid = bufsz;
+		aiov.iov_len = bufsz;
+		aiov.iov_base = sbuf;
+	}
+
+	kmem_free(lbuf, lbufsz);
+	kmem_free(sbuf, sbufsz);
+
+	if (error) {
+		releasef(fd);
+		return (set_errno(error));
+	}
+	fp->f_offset = auio.uio_loffset;	/* commit the new position */
+	releasef(fd);
+	return (outb);
+}
+
+static int
+lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len)
+{
+	int total = 0;		/* bytes of Linux records produced so far */
+
+	/* Walk the native records in sbuf, emitting one Linux record each. */
+	while (len > 0) {
+		struct dirent *src = (struct dirent *)sbuf;
+		struct lx_dirent_32 *dst = (struct lx_dirent_32 *)lbuf;
+		int nlen = MIN(strlen(src->d_name), LX_NAMEMAX - 1);
+		ushort_t lreclen;
+
+		lreclen = (ushort_t)LX_RECLEN(nlen, struct lx_dirent_32);
+		dst->d_ino = src->d_ino;
+		dst->d_off = src->d_off;
+		dst->d_reclen = lreclen;
+		(void) strncpy(dst->d_name, src->d_name, nlen);
+		dst->d_name[nlen] = 0;
+		/* Clear everything after the name: NUL, padding and d_type. */
+		bzero(dst->d_name + nlen,
+		    LX_ZEROLEN(nlen, struct lx_dirent_32));
+
+		/* Advance each cursor by its own format's record length. */
+		len -= src->d_reclen;
+		sbuf += src->d_reclen;
+		total += lreclen;
+		lbuf += lreclen;
+	}
+	return (total);
+}
+
+static int
+lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len)
+{
+	int total = 0;		/* bytes of Linux records produced so far */
+
+	/* Walk the native records in sbuf, emitting one Linux record each. */
+	while (len > 0) {
+		struct dirent *src = (struct dirent *)sbuf;
+		struct lx_dirent_64 *dst = (struct lx_dirent_64 *)lbuf;
+		int nlen = MIN(strlen(src->d_name), LX_NAMEMAX - 1);
+		ushort_t lreclen;
+
+		lreclen = (ushort_t)LX_RECLEN(nlen, struct lx_dirent_64);
+		dst->d_ino = src->d_ino;
+		dst->d_off = src->d_off;
+		dst->d_reclen = lreclen;
+		(void) strncpy(dst->d_name, src->d_name, nlen);
+		dst->d_name[nlen] = 0;
+		/* Clear everything after the name: NUL, padding and d_type. */
+		bzero(dst->d_name + nlen,
+		    LX_ZEROLEN(nlen, struct lx_dirent_64));
+
+		/* Advance each cursor by its own format's record length. */
+		len -= src->d_reclen;
+		sbuf += src->d_reclen;
+		total += lreclen;
+		lbuf += lreclen;
+	}
+	return (total);
+}
+
+long
+lx_getdents_32(int fd, caddr_t buf, size_t count)
+{
+	unsigned int sz = sizeof (struct lx_dirent_32);	/* 32-bit callers */
+	return (lx_getdents_common(fd, buf, count, sz, lx_getdents_format32));
+}
+
+long
+lx_getdents_64(int fd, caddr_t buf, size_t count)
+{
+	unsigned int sz = sizeof (struct lx_dirent_64);	/* 64-bit callers */
+	return (lx_getdents_common(fd, buf, count, sz, lx_getdents_format64));
+}
+
+struct lx_dirent64 {
+	uint64_t	d_ino;
+	int64_t		d_off;
+	ushort_t	d_reclen;	/* computed with LX_RECLEN64() */
+	uchar_t		d_type;		/* precedes the name in this layout */
+	char		d_name[1];	/* name is written past this array */
+};
+/* Record length: header, name and NUL terminator, rounded up to 8 bytes. */
+#define	LX_RECLEN64(namelen)	\
+	((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7)
+/* Number of bytes from the end of the name to the end of the record. */
+#define	LX_ZEROLEN64(namelen)	\
+	(LX_RECLEN64(namelen) -	\
+	((offsetof(struct lx_dirent64, d_name) + (namelen))))
+
+static int
+lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len)
+{
+	int total = 0;		/* bytes of Linux records produced so far */
+
+	/* Walk the native records in sbuf, emitting one Linux record each. */
+	while (len > 0) {
+		struct dirent *src = (struct dirent *)sbuf;
+		struct lx_dirent64 *dst = (struct lx_dirent64 *)lbuf;
+		int nlen = MIN(strlen(src->d_name), LX_NAMEMAX - 1);
+		ushort_t lreclen = (ushort_t)LX_RECLEN64(nlen);
+
+		dst->d_ino = src->d_ino;
+		dst->d_off = src->d_off;
+		dst->d_reclen = lreclen;
+		dst->d_type = 0;	/* d_type is always written as 0 */
+		(void) strncpy(dst->d_name, src->d_name, nlen);
+		dst->d_name[nlen] = 0;
+		/* Clear the NUL terminator slot and any alignment padding. */
+		bzero(dst->d_name + nlen, LX_ZEROLEN64(nlen));
+
+		/* Advance each cursor by its own format's record length. */
+		len -= src->d_reclen;
+		sbuf += src->d_reclen;
+		total += lreclen;
+		lbuf += lreclen;
+	}
+	return (total);
+}
+
+
+long
+lx_getdents64(int fd, caddr_t buf, size_t count)
+{
+	unsigned int sz = sizeof (struct lx_dirent64);	/* getdents64 layout */
+	return (lx_getdents_common(fd, buf, count, sz, lx_getdents64_format));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
new file mode 100644
index 0000000000..c2506f52c5
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/*
+ * Return the Linux-visible pid of the caller: 1 for the zone's init process,
+ * otherwise br_lx_thunk_pid when it is non-zero, else br_tgid.
+ */
+long
+lx_getpid(void)
+{
+	lx_lwp_data_t *lxlwp = ttolxlwp(curthread);
+
+	/*
+	 * The zone's init process is always reported as pid 1.
+	 */
+	if (curproc->p_pid == curproc->p_zone->zone_proc_initpid)
+		return (1);
+
+	VERIFY(lxlwp != NULL);
+
+	/* A non-zero thunk pid takes precedence over the thread group id. */
+	if (lxlwp->br_lx_thunk_pid != 0)
+		return (lxlwp->br_lx_thunk_pid);
+
+	return (lxlwp->br_tgid);
+}
+
+/*
+ * Return the parent pid that lx_lwp_ppid() reports for the calling LWP.
+ */
+long
+lx_getppid(void)
+{
+	return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL));
+}
+
+/*
+ * Return the thread id, which is kept in the br_pid field of the calling
+ * thread's lx LWP data.
+ */
+long
+lx_gettid(void)
+{
+	/* No NULL check: the caller is assumed to be an lx-branded LWP. */
+	return (ttolxlwp(curthread)->br_pid);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c
new file mode 100644
index 0000000000..acc4073483
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c
@@ -0,0 +1,33 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/*
+ * From "uts/common/syscall/getrandom.c":
+ */
+extern int getrandom(void *, size_t, int);
+
+long
+lx_getrandom(void *bufp, size_t buflen, int flags)
+{
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+
+	/* Per signal(7), getrandom(2) is restartable; mark this call so. */
+	lwpd->br_syscall_restart = B_TRUE;
+
+	return (getrandom(bufp, buflen, flags));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c
new file mode 100644
index 0000000000..baa41f52fa
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c
@@ -0,0 +1,296 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/zone.h>
+#include <sys/cred_impl.h>
+#include <sys/policy.h>
+
+typedef ushort_t	l_uid16_t;	/* ids taken by the *16 entry points */
+typedef ushort_t	l_gid16_t;
+typedef uint_t		l_uid_t;
+typedef uint_t		l_gid_t;
+/* Widen a 16-bit uid; the 16-bit -1 value maps to the 32-bit -1. */
+#define	LINUX_UID16_TO_UID32(uid16)	\
+	(((uid16) == (l_uid16_t)-1) ? ((l_uid_t)-1) : (l_uid_t)(uid16))
+/* Same widening rule, applied to group ids. */
+#define	LINUX_GID16_TO_GID32(gid16)     \
+	(((gid16) == (l_gid16_t)-1) ? ((l_gid_t)-1) : (l_gid_t)(gid16))
+/* Linux group-count limit; see lx_helper_setgroups() for how it is used. */
+#define	LX_NGROUPS_MAX	32
+extern int setgroups(int, gid_t *);
+
+/*
+ * This function is based on setreuid in common/syscall/uid.c and exists
+ * because illumos does not have a way to explicitly set the saved uid (suid)
+ * from any other system call.  An id passed as -1 is left unchanged.
+ */
+long
+lx_setresuid(l_uid_t ruid, l_uid_t euid, l_uid_t suid)
+{
+	proc_t	*p;
+	int	error = 0;
+	int	do_nocd = 0;	/* set if any of the three ids changes */
+	int	uidchge = 0;	/* set once upcount_inc(ruid) was done */
+	uid_t	oldruid = ruid;	/* uid whose count is dropped at the end */
+	cred_t	*cr, *newcr;
+	zoneid_t zoneid = getzoneid();
+
+	if ((ruid != -1 && (ruid > MAXUID)) ||
+	    (euid != -1 && (euid > MAXUID)) ||
+	    (suid != -1 && (suid > MAXUID))) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Need to pre-allocate the new cred structure before grabbing
+	 * the p_crlock mutex.
+	 */
+	newcr = cralloc();
+
+	p = ttoproc(curthread);
+
+retry:
+	mutex_enter(&p->p_crlock);
+	cr = p->p_cred;
+
+	if (ruid != -1 &&
+	    ruid != cr->cr_ruid && ruid != cr->cr_uid &&
+	    ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) {
+		error = EPERM;
+	} else if (euid != -1 &&
+	    euid != cr->cr_ruid && euid != cr->cr_uid &&
+	    euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) {
+		error = EPERM;
+	} else if (suid != -1 &&
+	    suid != cr->cr_ruid && suid != cr->cr_uid &&
+	    suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) {
+		error = EPERM;
+	} else {
+		if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) {
+			/*
+			 * The ruid of the process is going to change. In order
+			 * to avoid a race condition involving the
+			 * process count associated with the newly given ruid,
+			 * we increment the count before assigning the
+			 * credential to the process.
+			 * To do that, we'll have to take pidlock, so we first
+			 * release p_crlock.
+			 */
+			mutex_exit(&p->p_crlock);
+			uidchge = 1;
+			mutex_enter(&pidlock);
+			upcount_inc(ruid, zoneid);
+			mutex_exit(&pidlock);
+			/*
+			 * As we released p_crlock we can't rely on the cr
+			 * we read. So retry the whole thing.
+			 */
+			goto retry;
+		}
+		crhold(cr);
+		crcopy_to(cr, newcr);
+		p->p_cred = newcr;
+
+		if (euid != -1)
+			newcr->cr_uid = euid;
+		if (suid != -1)
+			newcr->cr_suid = suid;
+		if (ruid != -1) {
+			oldruid = newcr->cr_ruid;
+			newcr->cr_ruid = ruid;
+			ASSERT(ruid != oldruid ? uidchge : 1);
+		}
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_uid != newcr->cr_uid ||
+		    cr->cr_ruid != newcr->cr_ruid ||
+		    cr->cr_suid != newcr->cr_suid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	/*
+	 * We decrement the number of processes associated with the oldruid
+	 * to match the increment above, even if the ruid of the process
+	 * did not change or an error occurred (oldruid == ruid).
+	 */
+	if (uidchge) {
+		ASSERT(oldruid != -1 && ruid != -1);
+		mutex_enter(&pidlock);
+		upcount_dec(oldruid, zoneid);
+		mutex_exit(&pidlock);
+	}
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+long
+lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16)
+{
+	/*
+	 * Widen each 16-bit id (a 16-bit -1 stays -1) and defer to the 32-bit
+	 * implementation.
+	 */
+	l_uid_t ruid = LINUX_UID16_TO_UID32(ruid16);
+	l_uid_t euid = LINUX_UID16_TO_UID32(euid16);
+	l_uid_t suid = LINUX_UID16_TO_UID32(suid16);
+	return (lx_setresuid(ruid, euid, suid));
+}
+
+/*
+ * Based on setregid in common/syscall/gid.c; an id of -1 is left unchanged.
+ */
+long
+lx_setresgid(l_gid_t rgid, l_gid_t egid, l_gid_t sgid)
+{
+	proc_t	*p;
+	int	error = 0;
+	int	do_nocd = 0;	/* set if any of the three ids changes */
+	cred_t	*cr, *newcr;
+
+	if ((rgid != -1 && (rgid > MAXUID)) ||
+	    (egid != -1 && (egid > MAXUID)) ||
+	    (sgid != -1 && (sgid > MAXUID))) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Need to pre-allocate the new cred structure before grabbing
+	 * the p_crlock mutex.
+	 */
+	newcr = cralloc();
+
+	p = ttoproc(curthread);
+	mutex_enter(&p->p_crlock);
+	cr = p->p_cred;
+
+	if (rgid != -1 &&
+	    rgid != cr->cr_rgid && rgid != cr->cr_gid &&
+	    rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (egid != -1 &&
+	    egid != cr->cr_rgid && egid != cr->cr_gid &&
+	    egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (sgid != -1 &&
+	    sgid != cr->cr_rgid && sgid != cr->cr_gid &&
+	    sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else {
+		crhold(cr);
+		crcopy_to(cr, newcr);
+		p->p_cred = newcr;
+
+		if (egid != -1)
+			newcr->cr_gid = egid;
+		if (sgid != -1)
+			newcr->cr_sgid = sgid;
+		if (rgid != -1)
+			newcr->cr_rgid = rgid;
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_gid != newcr->cr_gid ||
+		    cr->cr_rgid != newcr->cr_rgid ||
+		    cr->cr_sgid != newcr->cr_sgid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+long
+lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16)
+{
+	/*
+	 * Widen each 16-bit id (a 16-bit -1 stays -1) and defer to the 32-bit
+	 * implementation.
+	 */
+	l_gid_t rgid = LINUX_GID16_TO_GID32(rgid16);
+	l_gid_t egid = LINUX_GID16_TO_GID32(egid16);
+	l_gid_t sgid = LINUX_GID16_TO_GID32(sgid16);
+	return (lx_setresgid(rgid, egid, sgid));
+}
+
+/*
+ * Linux defines NGROUPS_MAX as 32 whereas illumos has only 16.  As a hack so
+ * that tests may proceed, DEBUG kernels clamp counts in between to the limit.
+ */
+long
+lx_helper_setgroups(int ngroups, gid_t *grouplist)
+{
+#ifdef DEBUG
+	if (ngroups <= LX_NGROUPS_MAX && ngroups > ngroups_max)
+		ngroups = ngroups_max;
+#endif /* DEBUG */
+
+	return (setgroups(ngroups, grouplist));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
new file mode 100644
index 0000000000..2bd5da9961
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c
@@ -0,0 +1,1741 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/termio.h>
+#include <sys/termios.h>
+#include <sys/ptyvar.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <sys/sockio.h>
+#include <sys/stropts.h>
+#include <sys/ptms.h>
+#include <sys/cred.h>
+#include <sys/cred_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/lx_misc.h>
+#include <sys/lx_ptm.h>
+#include <sys/sunddi.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/session.h>
+#include <sys/kmem.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/if_arp.h>
+#include <sys/ioccom.h>
+#include <sys/dtrace.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <sys/lx_autofs.h>
+#include <sys/netstack.h>
+#include <inet/ip.h>
+#include <inet/ip_if.h>
+#include <sys/dkio.h>
+#include <sys/sdt.h>
+
+/*
+ * Linux ioctl types (bits 8-15 of the command values defined below)
+ */
+#define	LX_IOC_TYPE_HD		0x03
+#define	LX_IOC_TYPE_BLK		0x12
+#define	LX_IOC_TYPE_FD		0x54
+#define	LX_IOC_TYPE_DTRACE	0x68
+#define	LX_IOC_TYPE_SOCK	0x89
+#define	LX_IOC_TYPE_AUTOFS	0x93
+
+/*
+ * Supported ioctls, listed by Linux command value
+ */
+#define	LX_HDIO_GETGEO		0x0301	/* type LX_IOC_TYPE_HD */
+#define	LX_BLKGETSIZE		0x1260	/* type LX_IOC_TYPE_BLK from here */
+#define	LX_BLKSSZGET		0x1268
+#define	LX_BLKGETSIZE64		0x80081272
+#define	LX_TCGETS		0x5401	/* type LX_IOC_TYPE_FD from here */
+#define	LX_TCSETS		0x5402
+#define	LX_TCSETSW		0x5403
+#define	LX_TCSETSF		0x5404
+#define	LX_TCGETA		0x5405
+#define	LX_TCSETA		0x5406
+#define	LX_TCSETAW		0x5407
+#define	LX_TCSETAF		0x5408
+#define	LX_TCSBRK		0x5409
+#define	LX_TCXONC		0x540a
+#define	LX_TCFLSH		0x540b
+#define	LX_TIOCEXCL		0x540c
+#define	LX_TIOCNXCL		0x540d
+#define	LX_TIOCSCTTY		0x540e
+#define	LX_TIOCGPGRP		0x540f
+#define	LX_TIOCSPGRP		0x5410
+#define	LX_TIOCOUTQ		0x5411
+#define	LX_TIOCSTI		0x5412
+#define	LX_TIOCGWINSZ		0x5413
+#define	LX_TIOCSWINSZ		0x5414
+#define	LX_TIOCMGET		0x5415
+#define	LX_TIOCMBIS		0x5416
+#define	LX_TIOCMBIC		0x5417
+#define	LX_TIOCMSET		0x5418
+#define	LX_TIOCGSOFTCAR		0x5419
+#define	LX_TIOCSSOFTCAR		0x541a
+#define	LX_FIONREAD		0x541b
+#define	LX_TIOCPKT		0x5420
+#define	LX_FIONBIO		0x5421
+#define	LX_TIOCNOTTY		0x5422
+#define	LX_TIOCSETD		0x5423
+#define	LX_TIOCGETD		0x5424
+#define	LX_TCSBRKP		0x5425
+#define	LX_TIOCGSID		0x5429
+#define	LX_TIOCGPTN		0x80045430
+#define	LX_TIOCSPTLCK		0x40045431
+#define	LX_FIONCLEX		0x5450
+#define	LX_FIOCLEX		0x5451
+#define	LX_FIOASYNC		0x5452
+#define	LX_FIOSETOWN		0x8901	/* type LX_IOC_TYPE_SOCK from here */
+#define	LX_SIOCSPGRP		0x8902
+#define	LX_FIOGETOWN		0x8903
+#define	LX_SIOCGPGRP		0x8904
+#define	LX_SIOCATMARK		0x8905
+#define	LX_SIOCGSTAMP		0x8906
+#define	LX_SIOCADDRT		0x890b
+#define	LX_SIOCDELRT		0x890c
+#define	LX_SIOCRTMSG		0x890d
+#define	LX_SIOCGIFNAME		0x8910
+#define	LX_SIOCSIFLINK		0x8911
+#define	LX_SIOCGIFCONF		0x8912
+#define	LX_SIOCGIFFLAGS		0x8913
+#define	LX_SIOCSIFFLAGS		0x8914
+#define	LX_SIOCGIFADDR		0x8915
+#define	LX_SIOCSIFADDR		0x8916
+#define	LX_SIOCGIFDSTADDR	0x8917
+#define	LX_SIOCSIFDSTADDR	0x8918
+#define	LX_SIOCGIFBRDADDR	0x8919
+#define	LX_SIOCSIFBRDADDR	0x891a
+#define	LX_SIOCGIFNETMASK	0x891b
+#define	LX_SIOCSIFNETMASK	0x891c
+#define	LX_SIOCGIFMETRIC	0x891d
+#define	LX_SIOCSIFMETRIC	0x891e
+#define	LX_SIOCGIFMEM		0x891f
+#define	LX_SIOCSIFMEM		0x8920
+#define	LX_SIOCGIFMTU		0x8921
+#define	LX_SIOCSIFMTU		0x8922
+#define	LX_SIOCSIFNAME		0x8923
+#define	LX_SIOCSIFHWADDR	0x8924
+#define	LX_SIOCGIFENCAP		0x8925
+#define	LX_SIOCSIFENCAP		0x8926
+#define	LX_SIOCGIFHWADDR	0x8927
+#define	LX_SIOCGIFSLAVE		0x8929
+#define	LX_SIOCSIFSLAVE		0x8930
+#define	LX_SIOCADDMULTI		0x8931
+#define	LX_SIOCDELMULTI		0x8932
+#define	LX_SIOCGIFINDEX		0x8933
+#define	LX_SIOCSIFPFLAGS	0x8934
+#define	LX_SIOCGIFPFLAGS	0x8935
+#define	LX_SIOCDIFADDR		0x8936
+#define	LX_SIOCSIFHWBROADCAST	0x8937
+#define	LX_SIOCGIFCOUNT		0x8938
+#define	LX_SIOCGIFBR		0x8940
+#define	LX_SIOCSIFBR		0x8941
+#define	LX_SIOCGIFTXQLEN	0x8942
+#define	LX_SIOCSIFTXQLEN	0x8943
+#define	LX_SIOCETHTOOL		0x8946
+#define	LX_SIOCGMIIPHY		0x8947
+#define	LX_SIOCGMIIREG		0x8948
+#define	LX_SIOCSMIIREG		0x8949
+#define	LX_SIOCWANDEV		0x894a
+#define	LX_SIOCOUTQNSD		0x894b
+#define	LX_SIOCDARP		0x8953
+#define	LX_SIOCGARP		0x8954
+#define	LX_SIOCSARP		0x8955
+#define	LX_SIOCDRARP		0x8960
+#define	LX_SIOCGRARP		0x8961
+#define	LX_SIOCSRARP		0x8962
+#define	LX_SIOCGIFMAP		0x8970
+#define	LX_SIOCSIFMAP		0x8971
+#define	LX_SIOCADDDLCI		0x8980
+#define	LX_SIOCDELDLCI		0x8981
+#define	LX_SIOCGIFVLAN		0x8982
+#define	LX_SIOCSIFVLAN		0x8983
+#define	LX_SIOCBONDENSLAVE	0x8990
+#define	LX_SIOCBONDRELEASE	0x8991
+#define	LX_SIOCBONDSETHWADDR	0x8992
+#define	LX_SIOCBONDSLAVEINFOQUERY 0x8993
+#define	LX_SIOCBONDINFOQUERY	0x8994
+#define	LX_SIOCBONDCHANGEACTIVE	0x8995
+#define	LX_SIOCBRADDBR		0x89a0
+#define	LX_SIOCBRDELBR		0x89a1
+#define	LX_SIOCBRADDIF		0x89a2
+#define	LX_SIOCBRDELIF		0x89a3
+#define	LX_SIOCSHWTSTAMP	0x89b0
+#define	LX_SIOCGHWTSTAMP	0x89b1
+#define	LX_SIOCDEVPRIVATE	0x89f0
+#define	LX_SIOCPROTOPRIVATE	0x89e0
+/* File flags for ioctl calls: with the user data model, or with FKIOCTL. */
+#define	FLUSER(fp)	((fp)->f_flag | get_udatamodel())
+#define	FLFAKE(fp)	((fp)->f_flag | FKIOCTL)
+
+/*
+ * LX_NCC must differ from LX_NCCS: although the termio and termios structures
+ * look similar, they are fundamentally different in size and have different
+ * members.
+ */
+#define	LX_NCC	8	/* c_cc[] entries in struct lx_termio */
+#define	LX_NCCS	19	/* c_cc[] entries in struct lx_termios */
+
+struct lx_termio {
+	unsigned short c_iflag;		/* input mode flags */
+	unsigned short c_oflag;		/* output mode flags */
+	unsigned short c_cflag;		/* control mode flags */
+	unsigned short c_lflag;		/* local mode flags */
+	unsigned char c_line;		/* line discipline */
+	unsigned char c_cc[LX_NCC];	/* control characters */
+};
+
+struct lx_termios {
+	uint32_t c_iflag;		/* input mode flags */
+	uint32_t c_oflag;		/* output mode flags */
+	uint32_t c_cflag;		/* control mode flags */
+	uint32_t c_lflag;		/* local mode flags */
+	unsigned char c_line;		/* line discipline */
+	unsigned char c_cc[LX_NCCS];	/* control characters */
+};
+
+/*
+ * c_cc indices which are valid for both lx_termio and lx_termios (< LX_NCC)
+ */
+#define	LX_VINTR	0
+#define	LX_VQUIT	1
+#define	LX_VERASE	2
+#define	LX_VKILL	3
+#define	LX_VEOF		4
+#define	LX_VTIME	5
+#define	LX_VMIN		6
+#define	LX_VSWTC	7
+
+/*
+ * c_cc indices which are valid only for lx_termios (>= LX_NCC)
+ */
+#define	LX_VSTART	8
+#define	LX_VSTOP	9
+#define	LX_VSUSP	10
+#define	LX_VEOL		11
+#define	LX_VREPRINT	12
+#define	LX_VDISCARD	13
+#define	LX_VWERASE	14
+#define	LX_VLNEXT	15
+#define	LX_VEOL2	16
+
+/*
+ * Defaults needed for SunOS to Linux format conversion.
+ * See INIT_C_CC in linux-stable/include/asm-generic/termios.h
+ */
+#define	LX_DEF_VTIME	0
+#define	LX_DEF_VMIN	1
+#define	LX_DEF_VEOF	'\004'
+#define	LX_DEF_VEOL	0
+
+/* VSD key for lx_cc information */
+static uint_t lx_ioctl_vsd = 0;
+/* Maps a Linux pid to a native pid/tid pair; callers treat non-zero as failure. */
+extern int lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid);
+
+/* Terminal helpers */
+
+static void
+l2s_termios(struct lx_termios *l_tios, struct termios *s_tios)
+{
+	ASSERT((l_tios != NULL) && (s_tios != NULL));
+	/* Start from an all-zero native termios; uncopied fields stay 0. */
+	bzero(s_tios, sizeof (*s_tios));
+
+	s_tios->c_iflag = l_tios->c_iflag;
+	s_tios->c_oflag = l_tios->c_oflag;
+	s_tios->c_cflag = l_tios->c_cflag;
+	s_tios->c_lflag = l_tios->c_lflag;
+	/* Only the c_cc pair that applies to the ICANON setting is copied. */
+	if (s_tios->c_lflag & ICANON) {
+		s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF];
+		s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL];
+	} else {
+		s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN];
+		s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME];
+	}
+	/* The remaining control characters are copied unconditionally. */
+	s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2];
+	s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE];
+	s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL];
+	s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT];
+	s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT];
+	s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE];
+	s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR];
+	s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT];
+	s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC];
+	s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART];
+	s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP];
+	s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP];
+	s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD];
+}
+
+static void
+l2s_termio(struct lx_termio *l_tio, struct termio *s_tio)
+{
+	ASSERT((l_tio != NULL) && (s_tio != NULL));
+	/* Start from an all-zero native termio; uncopied fields stay 0. */
+	bzero(s_tio, sizeof (*s_tio));
+
+	s_tio->c_iflag = l_tio->c_iflag;
+	s_tio->c_oflag = l_tio->c_oflag;
+	s_tio->c_cflag = l_tio->c_cflag;
+	s_tio->c_lflag = l_tio->c_lflag;
+	/* lx_termio has no VEOL slot, so only VEOF is copied under ICANON. */
+	if (s_tio->c_lflag & ICANON) {
+		s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF];
+	} else {
+		s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN];
+		s_tio->c_cc[VTIME] = l_tio->c_cc[LX_VTIME];
+	}
+	/* The remaining control characters are copied unconditionally. */
+	s_tio->c_cc[VINTR] = l_tio->c_cc[LX_VINTR];
+	s_tio->c_cc[VQUIT] = l_tio->c_cc[LX_VQUIT];
+	s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE];
+	s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL];
+	s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC];
+}
+
+static void
+termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio)
+{
+	ASSERT(l_tios != NULL && lio != NULL);
+
+	/* Start from zero, then record the four c_cc slots lx_cc keeps. */
+	bzero(lio, sizeof (struct lx_cc));
+	lio->vtime = l_tios->c_cc[LX_VTIME];
+	lio->vmin = l_tios->c_cc[LX_VMIN];
+	lio->veol = l_tios->c_cc[LX_VEOL];
+	lio->veof = l_tios->c_cc[LX_VEOF];
+}
+
+static void
+termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio)
+{
+	ASSERT(l_tio != NULL && lio != NULL);
+
+	/* lx_termio carries no VEOL slot, so veol is recorded as zero. */
+	bzero(lio, sizeof (struct lx_cc));
+	lio->vtime = l_tio->c_cc[LX_VTIME];
+	lio->vmin = l_tio->c_cc[LX_VMIN];
+	lio->veol = 0;
+	lio->veof = l_tio->c_cc[LX_VEOF];
+}
+
+static void
+s2l_termios(struct termios *s_tios, struct lx_termios *l_tios)
+{
+	ASSERT(s_tios != NULL && l_tios != NULL);
+
+	bzero(l_tios, sizeof (struct lx_termios));
+
+	l_tios->c_lflag = s_tios->c_lflag;
+	l_tios->c_cflag = s_tios->c_cflag;
+	l_tios->c_oflag = s_tios->c_oflag;
+	l_tios->c_iflag = s_tios->c_iflag;
+
+	/*
+	 * SunOS aliases VMIN/VTIME with VEOF/VEOL in the termio/termios c_cc
+	 * array, because ICANON makes their use mutually exclusive.  Linux
+	 * keeps four separate slots, so the pair that the native side cannot
+	 * supply for the current ICANON setting is filled in with the Linux
+	 * defaults.
+	 *
+	 * Values stored via the lx_cc mechanism may override these defaults
+	 * later.
+	 */
+	if ((s_tios->c_lflag & ICANON) == 0) {
+		l_tios->c_cc[LX_VEOF] = LX_DEF_VEOF;
+		l_tios->c_cc[LX_VEOL] = LX_DEF_VEOL;
+		l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN];
+		l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME];
+	} else {
+		l_tios->c_cc[LX_VMIN] = LX_DEF_VMIN;
+		l_tios->c_cc[LX_VTIME] = LX_DEF_VTIME;
+		l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF];
+		l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL];
+	}
+
+	/* The remaining control characters map across one-to-one. */
+	l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR];
+	l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT];
+	l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE];
+	l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL];
+	l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH];
+	l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART];
+	l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP];
+	l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP];
+	l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT];
+	l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD];
+	l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE];
+	l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT];
+	l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2];
+}
+
+/*
+ * Fill in a Linux termio from a native termio.  The destination is zeroed,
+ * the mode flag words are copied unchanged, and the control characters are
+ * moved from their native slots to the Linux ones.  Whichever of VEOF or
+ * VMIN/VTIME is not selected by ICANON is given its Linux default.
+ */
+static void
+s2l_termio(struct termio *s_tio, struct lx_termio *l_tio)
+{
+	ASSERT((s_tio != NULL) && (l_tio != NULL));
+
+	bzero(l_tio, sizeof (*l_tio));
+
+	l_tio->c_lflag = s_tio->c_lflag;
+	l_tio->c_cflag = s_tio->c_cflag;
+	l_tio->c_oflag = s_tio->c_oflag;
+	l_tio->c_iflag = s_tio->c_iflag;
+
+	if ((s_tio->c_lflag & ICANON) == 0) {
+		/* non-canonical: real VMIN/VTIME, default VEOF */
+		l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN];
+		l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME];
+		l_tio->c_cc[LX_VEOF] = LX_DEF_VEOF;
+	} else {
+		/* canonical: real VEOF, default VMIN/VTIME */
+		l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF];
+		l_tio->c_cc[LX_VTIME] = LX_DEF_VTIME;
+		l_tio->c_cc[LX_VMIN] = LX_DEF_VMIN;
+	}
+
+	/* These characters are transferred regardless of mode. */
+	l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR];
+	l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT];
+	l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE];
+	l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL];
+	l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH];
+}
+
+/*
+ * Remember a set of Linux control characters for a vnode.
+ *
+ * Linux expects the termio/termios control characters to be preserved more
+ * strictly than illumos does.  To keep up the appearance that they are, a
+ * copy is kept as vnode-specific data under the lx_ioctl_vsd key.  The first
+ * call for a vnode allocates the record; later calls overwrite it in place.
+ */
+static void
+set_lx_cc(vnode_t *vp, struct lx_cc *lio)
+{
+	struct lx_cc *saved;
+
+	mutex_enter(&vp->v_vsd_lock);
+	saved = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd);
+	if (saved != NULL) {
+		/* A record already exists; just refresh its contents. */
+		bcopy(lio, saved, sizeof (struct lx_cc));
+		mutex_exit(&vp->v_vsd_lock);
+		return;
+	}
+
+	saved = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP);
+	bcopy(lio, saved, sizeof (struct lx_cc));
+	(void) vsd_set(vp, lx_ioctl_vsd, saved);
+	mutex_exit(&vp->v_vsd_lock);
+}
+
+/*
+ * Fetch the Linux control characters previously saved for a vnode under the
+ * lx_ioctl_vsd key.  Returns 0 and fills in *lio when a record exists;
+ * returns 1 and leaves *lio untouched when nothing has been saved.
+ */
+static int
+get_lx_cc(vnode_t *vp, struct lx_cc *lio)
+{
+	struct lx_cc *saved;
+	int missing;
+
+	mutex_enter(&vp->v_vsd_lock);
+	saved = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd);
+	missing = (saved == NULL) ? 1 : 0;
+	if (!missing)
+		bcopy(saved, lio, sizeof (*lio));
+	mutex_exit(&vp->v_vsd_lock);
+
+	return (missing);
+}
+
+/* Socket helpers */
+
+/*
+ * Linux-layout ifreq for 32-bit callers: the interface name followed by a
+ * union holding a sockaddr.  The ifreq translators use its size when moving
+ * requests for processes that are not DATAMODEL_LP64.
+ */
+typedef struct lx_ifreq32 {
+	char	ifr_name[IFNAMSIZ];
+	union {
+		struct	sockaddr ifru_addr;
+	};
+} lx_ifreq32_t;
+
+/*
+ * Linux-layout ifreq for 64-bit callers.  Same leading members as the 32-bit
+ * form, with the union widened by a filler array so that the structure has
+ * the size Linux uses.
+ */
+typedef struct lx_ifreq64 {
+	char	ifr_name[IFNAMSIZ];
+	union {
+		struct	sockaddr ifru_addr;
+		/* pad this out to the Linux size */
+		uint64_t	ifmap[3];
+	};
+} lx_ifreq64_t;
+
+/*
+ * Linux-layout ifconf for 32-bit callers: a byte length and a 32-bit user
+ * address of the ifreq buffer.
+ */
+typedef struct lx_ifconf32 {
+	int32_t	if_len;
+	caddr32_t if_buf;
+} lx_ifconf32_t;
+
+/*
+ * Linux-layout ifconf for 64-bit callers: a byte length and a native-width
+ * user address of the ifreq buffer.
+ */
+typedef struct lx_ifconf64 {
+	int32_t	if_len;
+	caddr_t if_buf;
+} lx_ifconf64_t;
+
+
+/* Generic translators */
+
+/*
+ * Translator for ioctls that need no argument conversion: hand the native
+ * command and the caller's argument straight to the vnode.  Returns 0 on
+ * success; otherwise the error is posted with set_errno().
+ */
+static int
+ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	int rval;
+	int err;
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rval,
+	    NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/*
+ * FIONBIO: turn non-blocking mode on or off according to the int32 read from
+ * the caller's argument (non-zero enables it).
+ *
+ * Returns 0 on success.  Fails with EFAULT if the argument cannot be copied
+ * in, or with the error from VOP_SETFL.
+ */
+static int
+ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp;
+	int32_t iflag, flags;
+	int error;
+
+	if (copyin((caddr_t)arg, &iflag, sizeof (iflag)))
+		return (set_errno(EFAULT));
+
+	mutex_enter(&fp->f_tlock);
+	vp = fp->f_vnode;
+	flags = fp->f_flag;
+	/* Linux sets NONBLOCK instead of FIONBIO */
+	if (iflag)
+		flags |= FNONBLOCK;
+	else
+		flags &= ~FNONBLOCK;
+	/*
+	 * Push the flag down.  The new flags are only recorded on the file
+	 * once the vnode has accepted them; otherwise the caller would be
+	 * told the request failed while f_flag claims that it took effect.
+	 */
+	error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL);
+	if (error == 0)
+		fp->f_flag = flags;
+	mutex_exit(&fp->f_tlock);
+	return ((error != 0) ? set_errno(error) : 0);
+}
+
+/*
+ * FIONREAD: report how many bytes are available to read.
+ *
+ * For regular files and directories the answer is computed here as the file
+ * size minus the current offset, capped at INT_MAX, and copied out as an
+ * int32_t (the type FIONREAD is defined in terms of).  Every other vnode type
+ * has the native FIONREAD issued against it with the caller's argument.
+ */
+static int
+ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp = fp->f_vnode;
+	struct vattr va;
+	int32_t nbytes;
+	int err;
+	int rval;
+
+	if (vp->v_type != VREG && vp->v_type != VDIR) {
+		err = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred,
+		    &rval, NULL);
+		if (err)
+			return (set_errno(err));
+		return (0);
+	}
+
+	va.va_mask = AT_SIZE;
+	err = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	nbytes = MIN(va.va_size - fp->f_offset, INT_MAX);
+	if (copyout(&nbytes, (caddr_t)arg, sizeof (nbytes)))
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * hard disk-related translators
+ *
+ * Note that the normal disk ioctls only work for VCHR devices. See spec_ioctl
+ * which will return ENOTTY for a VBLK device. However, fdisk, etc. expect to
+ * work with block devices.
+ *
+ * We expect a zvol to be the primary block device we're interacting with and
+ * we use the zone's lxzd_vdisks list to handle zvols specifically.
+ */
+
+/*
+ * Disk geometry record that ict_hdgetgeo() fills in and copies out to the
+ * caller.  Note that 'heads' and 'sectors' are a single byte each and
+ * 'cylinders' is a short, so large values must be reduced to fit.
+ */
+typedef struct lx_hd_geom {
+	unsigned char heads;
+	unsigned char sectors;
+	unsigned short cylinders;
+	unsigned long start;
+} lx_hd_geom_t;
+
+/*
+ * Search the zone's virtual disk list for a zvol whose real device number is
+ * 'dev'.  Returns the matching entry, or NULL when there is none.
+ */
+static lx_virt_disk_t *
+lx_lookup_zvol(lx_zone_data_t *lxzd, dev_t dev)
+{
+	lx_virt_disk_t *cur;
+
+	for (cur = list_head(lxzd->lxzd_vdisks); cur != NULL;
+	    cur = list_next(lxzd->lxzd_vdisks, cur)) {
+		if (cur->lxvd_type != LXVD_ZVOL)
+			continue;
+		if (cur->lxvd_real_dev == dev)
+			return (cur);
+	}
+
+	return (NULL);
+}
+
+/*
+ * See zvol_ioctl() which always fails for DKIOCGGEOM. The geometry for a
+ * zvol (or really any modern disk) is made up, so we do that here as well.
+ *
+ * For a zvol on the zone's virtual disk list the geometry is synthesized from
+ * the volume size; a zvol that is not on the list reports all zeros.  Any
+ * other device is asked for its geometry with DKIOCGGEOM.
+ *
+ * Returns 0 on success.  Fails with EINVAL unless the vnode is a character or
+ * block device, with the VOP_IOCTL error if the native query fails, and with
+ * EFAULT if the result cannot be copied out.
+ */
+static int
+ict_hdgetgeo(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	lx_hd_geom_t lx_geom;
+	lx_zone_data_t *lxzd;
+
+	if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	/*
+	 * This structure is copied out to userland, so clear all of it up
+	 * front; otherwise any padding between members would expose stale
+	 * kernel stack contents.
+	 */
+	bzero(&lx_geom, sizeof (lx_geom));
+
+	if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+		lx_virt_disk_t *vd;
+
+		vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev);
+		/*
+		 * A missing entry should only happen if this is a new zvol.
+		 * That case, and a zero block size (which cannot be divided
+		 * by), report the zeroed geometry.
+		 */
+		if (vd != NULL && vd->lxvd_blksize != 0) {
+			diskaddr_t tot, n;
+
+			tot = vd->lxvd_volsize / vd->lxvd_blksize;
+
+			/*
+			 * Since the 'sectors' value is only one byte we make
+			 * up heads/cylinder values to get things to fit.
+			 * We roundup the number of heads to ensure we don't
+			 * overflow the sectors due to truncation.
+			 *
+			 * Each value is clamped to the width of its field
+			 * before it is stored.  Storing the raw count would
+			 * silently truncate it, and a heads value that
+			 * truncates to zero would make the division below
+			 * trap.
+			 */
+			n = (tot / 0xff) + 1;
+			lx_geom.heads = MIN(n, 0xff);
+			lx_geom.cylinders = MIN(n, 0xffff);
+			lx_geom.sectors = MIN(tot / lx_geom.heads, 0xff);
+			lx_geom.start = 0;
+		}
+	} else {
+		int res, rv;
+		struct dk_geom geom;
+
+		res = VOP_IOCTL(fp->f_vnode, DKIOCGGEOM, (intptr_t)&geom,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+		if (res > 0)
+			return (set_errno(res));
+
+		lx_geom.heads = geom.dkg_nhead;
+		lx_geom.sectors = geom.dkg_nsect;
+		lx_geom.cylinders = geom.dkg_ncyl;
+		lx_geom.start = 0;
+	}
+
+	if (curproc->p_model != DATAMODEL_LP64) {
+		/*
+		 * 'start' is an unsigned long, which is only four bytes wide
+		 * for a 32-bit caller.  Copy out the narrower layout so that
+		 * we do not write past the end of the caller's structure.
+		 */
+		struct {
+			unsigned char heads;
+			unsigned char sectors;
+			unsigned short cylinders;
+			uint32_t start;
+		} lx_geom32;
+
+		lx_geom32.heads = lx_geom.heads;
+		lx_geom32.sectors = lx_geom.sectors;
+		lx_geom32.cylinders = lx_geom.cylinders;
+		lx_geom32.start = 0;
+
+		if (copyout(&lx_geom32, (caddr_t)arg, sizeof (lx_geom32)))
+			return (set_errno(EFAULT));
+		return (0);
+	}
+
+	if (copyout(&lx_geom, (caddr_t)arg, sizeof (lx_geom)))
+		return (set_errno(EFAULT));
+	return (0);
+}
+
+/*
+ * Per the Linux sd(4) man page, get the number of sectors. The linux/fs.h
+ * header says it's 512 byte blocks.
+ *
+ * The result is an unsigned long in the caller's data model.  For a 32-bit
+ * caller a count that does not fit in 32 bits fails with EFBIG rather than
+ * being truncated.
+ *
+ * Returns 0 on success.  Fails with EINVAL unless the vnode is a character or
+ * block device, with the VOP_IOCTL error if the native query fails, and with
+ * EFAULT if the result cannot be copied out.
+ */
+static int
+ict_blkgetsize(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	diskaddr_t tot;
+	lx_zone_data_t *lxzd;
+
+	if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+		lx_virt_disk_t *vd;
+
+		vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev);
+		if (vd == NULL) {
+			/* should only happen if new zvol */
+			tot = 0;
+		} else {
+			tot = vd->lxvd_volsize / 512;
+		}
+	} else {
+		int res, rv;
+		struct dk_minfo minfo;
+
+		res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+		if (res > 0)
+			return (set_errno(res));
+
+		/* Scale the capacity when the device's blocks exceed 512. */
+		tot = minfo.dki_capacity;
+		if (minfo.dki_lbsize > 512) {
+			uint_t bsize = minfo.dki_lbsize / 512;
+
+			tot *= bsize;
+		}
+	}
+
+	if (curproc->p_model != DATAMODEL_LP64) {
+		/*
+		 * A 32-bit caller supplies a four byte unsigned long; copying
+		 * out sizeof (long) kernel bytes would overrun it.
+		 */
+		uint32_t tot32 = (uint32_t)tot;
+
+		if ((diskaddr_t)tot32 != tot)
+			return (set_errno(EFBIG));
+		if (copyout(&tot32, (caddr_t)arg, sizeof (tot32)))
+			return (set_errno(EFAULT));
+		return (0);
+	}
+
+	if (copyout(&tot, (caddr_t)arg, sizeof (long)))
+		return (set_errno(EFAULT));
+	return (0);
+}
+
+/*
+ * Report the sector size, i.e. the logical block size, as a uint_t.
+ *
+ * A zvol on the zone's virtual disk list reports its recorded block size and
+ * a zvol that is not on the list reports 0.  Other devices are queried with
+ * DKIOCGMEDIAINFO.
+ */
+static int
+ict_blkgetssize(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp = fp->f_vnode;
+	lx_zone_data_t *lxzd;
+	uint_t ssize;
+
+	if (vp->v_type != VCHR && vp->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	if (getmajor(vp->v_rdev) != getmajor(lxzd->lxzd_zfs_dev)) {
+		struct dk_minfo mi;
+		int err, rval;
+
+		err = VOP_IOCTL(vp, DKIOCGMEDIAINFO, (intptr_t)&mi,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rval, NULL);
+		if (err > 0)
+			return (set_errno(err));
+
+		ssize = (uint_t)mi.dki_lbsize;
+	} else {
+		lx_virt_disk_t *vd = lx_lookup_zvol(lxzd, vp->v_rdev);
+
+		/* a missing entry should only happen if new zvol */
+		ssize = (vd != NULL) ? (uint_t)vd->lxvd_blksize : 0;
+	}
+
+	if (copyout(&ssize, (caddr_t)arg, sizeof (ssize)))
+		return (set_errno(EFAULT));
+	return (0);
+}
+
+/*
+ * Report the device size as a uint64_t. The linux/fs.h header says it's in
+ * bytes.
+ *
+ * A zvol on the zone's virtual disk list reports its recorded volume size and
+ * a zvol that is not on the list reports 0.  Other devices are queried with
+ * DKIOCGMEDIAINFO and the size is capacity times logical block size.
+ */
+static int
+ict_blkgetsize64(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp = fp->f_vnode;
+	lx_zone_data_t *lxzd;
+	uint64_t nbytes;
+
+	if (vp->v_type != VCHR && vp->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	if (getmajor(vp->v_rdev) != getmajor(lxzd->lxzd_zfs_dev)) {
+		struct dk_minfo mi;
+		int err, rval;
+
+		err = VOP_IOCTL(vp, DKIOCGMEDIAINFO, (intptr_t)&mi,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rval, NULL);
+		if (err > 0)
+			return (set_errno(err));
+
+		nbytes = mi.dki_capacity * mi.dki_lbsize;
+	} else {
+		lx_virt_disk_t *vd = lx_lookup_zvol(lxzd, vp->v_rdev);
+
+		/* a missing entry should only happen if new zvol */
+		if (vd != NULL)
+			nbytes = vd->lxvd_volsize;
+		else
+			nbytes = 0;
+	}
+
+	if (copyout(&nbytes, (caddr_t)arg, sizeof (uint64_t)))
+		return (set_errno(EFAULT));
+	return (0);
+}
+
+/* Terminal-related translators */
+
+/*
+ * TCSETS, TCSETSW and TCSETSF: apply a Linux termios to the terminal.
+ *
+ * The caller's structure is converted to the native layout and handed to the
+ * vnode.  On success the Linux VEOF/VEOL/VMIN/VTIME values are also saved
+ * against the vnode so that a later get can return them as they were set.
+ */
+static int
+ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct lx_termios	lx_tios;
+	struct termios		native_tios;
+	struct lx_cc		saved_cc;
+	int			err, rval;
+
+	ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF);
+
+	if (copyin((struct lx_termios *)arg, &lx_tios, sizeof (lx_tios)) != 0)
+		return (set_errno(EFAULT));
+
+	termios2lx_cc(&lx_tios, &saved_cc);
+	l2s_termios(&lx_tios, &native_tios);
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&native_tios,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err)
+		return (set_errno(err));
+
+	/* Only remember the characters once the terminal accepted them. */
+	set_lx_cc(fp->f_vnode, &saved_cc);
+	return (0);
+}
+
+/*
+ * TCSETA, TCSETAW and TCSETAF: apply a Linux termio to the terminal.
+ *
+ * The caller's structure is converted to the native layout and handed to the
+ * vnode.  On success the Linux VEOF/VMIN/VTIME values are also saved against
+ * the vnode so that a later get can return them as they were set.
+ */
+static int
+ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct lx_termio	lx_tio;
+	struct termio		native_tio;
+	struct lx_cc		saved_cc;
+	int			err, rval;
+
+	ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF);
+
+	if (copyin((struct lx_termio *)arg, &lx_tio, sizeof (lx_tio)) != 0)
+		return (set_errno(EFAULT));
+
+	l2s_termio(&lx_tio, &native_tio);
+	termio2lx_cc(&lx_tio, &saved_cc);
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&native_tio,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err)
+		return (set_errno(err));
+
+	/* Only remember the characters once the terminal accepted them. */
+	set_lx_cc(fp->f_vnode, &saved_cc);
+	return (0);
+}
+
+/*
+ * TCGETS for the lx_ptm device.  Rather than querying the device, this
+ * reports the default terminal modes, which are read from the "ttymodes"
+ * property of the root node and converted to the Linux layout.
+ *
+ * Returns 0 on success.  Fails with EIO if the property cannot be looked up
+ * and with EFAULT if the result cannot be copied out.
+ */
+static int
+ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct lx_termios	l_tios;
+	struct termios		s_tios, *s_tiosd;
+	uint_t			s_tiosl;
+
+	/*
+	 * Get termios defaults.  The failure must go through set_errno() like
+	 * every other error path; returning the bare errno value would be
+	 * seen by the caller as a successful, positive result.
+	 */
+	if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
+	    DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd,
+	    &s_tiosl) != DDI_SUCCESS)
+		return (set_errno(EIO));
+	ASSERT(s_tiosl == sizeof (*s_tiosd));
+	bcopy(s_tiosd, &s_tios, sizeof (s_tios));
+	ddi_prop_free(s_tiosd);
+
+	/* Now munge the data to how Linux wants it. */
+	s2l_termios(&s_tios, &l_tios);
+	if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TCGETS for ordinary terminals: fetch the native termios from the vnode,
+ * convert it to the Linux layout, and overlay any VEOF/VEOL/VMIN/VTIME values
+ * that an earlier set saved against the vnode.
+ */
+static int
+ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct lx_termios	lx_tios;
+	struct termios		native_tios;
+	struct lx_cc		saved_cc;
+	int			err, rval;
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&native_tios,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err)
+		return (set_errno(err));
+
+	/* Rearrange the native data into the Linux layout. */
+	s2l_termios(&native_tios, &lx_tios);
+
+	/* Previously stored Linux characters take precedence. */
+	if (get_lx_cc(fp->f_vnode, &saved_cc) == 0) {
+		lx_tios.c_cc[LX_VMIN] = saved_cc.vmin;
+		lx_tios.c_cc[LX_VTIME] = saved_cc.vtime;
+		lx_tios.c_cc[LX_VEOF] = saved_cc.veof;
+		lx_tios.c_cc[LX_VEOL] = saved_cc.veol;
+	}
+
+	if (copyout(&lx_tios, (struct lx_termios *)arg, sizeof (lx_tios)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TCGETS entry point.  Files whose device major matches the LX_PTM_DRV driver
+ * are handled by ict_tcgets_ptm(); everything else by ict_tcgets_native().
+ */
+static int
+ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	major_t ptm_major = ddi_name_to_major(LX_PTM_DRV);
+
+	if (getmajor(fp->f_vnode->v_rdev) != ptm_major)
+		return (ict_tcgets_native(fp, cmd, arg, lxcmd));
+
+	return (ict_tcgets_ptm(fp, cmd, arg, lxcmd));
+}
+
+/*
+ * TCGETA: fetch the native termio from the vnode, convert it to the Linux
+ * layout, and overlay any VEOF/VMIN/VTIME values that an earlier set saved
+ * against the vnode.
+ */
+static int
+ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct lx_termio	lx_tio;
+	struct termio		native_tio;
+	struct lx_cc		saved_cc;
+	int			err, rval;
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&native_tio,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err)
+		return (set_errno(err));
+
+	s2l_termio(&native_tio, &lx_tio);
+
+	/* Previously stored Linux characters take precedence. */
+	if (get_lx_cc(fp->f_vnode, &saved_cc) == 0) {
+		lx_tio.c_cc[LX_VMIN] = saved_cc.vmin;
+		lx_tio.c_cc[LX_VTIME] = saved_cc.vtime;
+		lx_tio.c_cc[LX_VEOF] = saved_cc.veof;
+	}
+
+	if (copyout(&lx_tio, (void *)arg, sizeof (lx_tio)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TIOCSPGRP: set the terminal's foreground process group.  The id supplied by
+ * the caller is a Linux pid, so it is translated to the illumos pid before
+ * the native ioctl is issued.  Fails with EFAULT if the id cannot be read and
+ * with EPERM if it cannot be translated.
+ */
+static int
+ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	pid_t linux_pid, native_pid, native_tid;
+	int err, rval;
+
+	if (copyin((pid_t *)arg, &linux_pid, sizeof (linux_pid)) < 0)
+		return (set_errno(EFAULT));
+
+	if (lx_lpid_to_spair(linux_pid, &native_pid, &native_tid) < 0)
+		return (set_errno(EPERM));
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&native_pid,
+	    fp->f_flag | FKIOCTL, fp->f_cred, &rval, NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/*
+ * TCSBRKP: emulated by issuing the native TCSBRK with a duration of zero.
+ * The caller's argument is not consulted.
+ */
+static int
+ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	int duration = 0;
+	int err, rval;
+
+	err = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&duration,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/*
+ * TIOCGPGRP: report the terminal's foreground process group.  If the native
+ * answer is the pid of the zone's init process it is reported as 1.
+ */
+static int
+ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	pid_t	pgrp;
+	int	err, rval;
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&pgrp, FLFAKE(fp),
+	    fp->f_cred, &rval, NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	if (pgrp == curproc->p_zone->zone_proc_initpid)
+		pgrp = 1;
+
+	if (copyout(&pgrp, (caddr_t)arg, sizeof (pgrp)))
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TIOCSPTLCK: implemented by sending UNLKPT to the device as an I_STR
+ * request with no payload.  The caller's argument is not consulted.
+ */
+static int
+ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct strioctl req;
+	int err, rval;
+
+	req.ic_dp = NULL;
+	req.ic_len = 0;
+	req.ic_timout = 0;
+	req.ic_cmd = UNLKPT;
+
+	err = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&req,
+	    fp->f_flag | FKIOCTL, fp->f_cred, &rval, NULL);
+
+	/*
+	 * Linux and illumos disagree on what a successful return looks like:
+	 * Linux expects 0 or -1, while illumos may hand back a positive
+	 * number on success.  Only the error indication is passed on.
+	 */
+	if (err != 0)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/*
+ * TIOCGPTN: report the pty number of an lx_ptm device.
+ *
+ * Before the number is returned, OWNERPT is sent to the device with the
+ * caller's credential uid and gid.  The number reported is the device minor
+ * minus one.  Fails with ENOTTY when the file is not an lx_ptm device or the
+ * device rejects the request with ENOTTY, with EACCES for any other failure
+ * of the ownership request, and with EFAULT if the copyout fails.
+ */
+static int
+ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp = fp->f_vnode;
+	struct strioctl req;
+	pt_own_t owner;
+	cred_t *cr;
+	int err, rval;
+	int ptyno;
+
+	/* This operation is only valid for the lx_ptm device. */
+	if (getmajor(vp->v_rdev) != ddi_name_to_major(LX_PTM_DRV))
+		return (set_errno(ENOTTY));
+
+	cr = CRED();
+	owner.pto_ruid = cr->cr_uid;
+	owner.pto_rgid = cr->cr_gid;
+
+	req.ic_cmd = OWNERPT;
+	req.ic_timout = 0;
+	req.ic_len = sizeof (owner);
+	req.ic_dp = (char *)&owner;
+
+	err = VOP_IOCTL(vp, I_STR, (intptr_t)&req, FLFAKE(fp), fp->f_cred,
+	    &rval, NULL);
+	if (err == ENOTTY)
+		return (set_errno(err));
+	if (err)
+		return (set_errno(EACCES));
+
+	ptyno = getminor(vp->v_rdev) - 1;
+	if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno)))
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TIOCGWINSZ: pass the request to the vnode, with one accommodation for
+ * callers that use this ioctl as a tty test.
+ *
+ * Some Linux libcs (musl, for one) implement isatty() with TIOCGWINSZ, and
+ * some applications do the same thing themselves.  Linux answers with a 0x0
+ * size for a dumb terminal, whereas illumos fails the request when the
+ * window size is all zeros (see the TIOCGWINSZ handling in ptem's ptioc()).
+ * So when the native ioctl fails with EINVAL we fall back to the check used
+ * for isatty natively, TCGETA, and if that succeeds we report a 0x0 size.
+ */
+static int
+ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	struct termio probe;
+	struct winsize zero_ws;
+	int err, rval;
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rval,
+	    NULL);
+	if (err == 0)
+		return (0);
+
+	if (err != EINVAL)
+		return (set_errno(err));
+
+	/* Is this a terminal at all? */
+	if (VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&probe, FLFAKE(fp),
+	    fp->f_cred, &rval, NULL) != 0)
+		return (set_errno(err));
+
+	bzero(&zero_ws, sizeof (zero_ws));
+	if (copyout(&zero_ws, (struct winsize *)arg, sizeof (zero_ws)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * TIOCSCTTY: make the terminal the controlling tty of the caller's session.
+ *
+ * If the terminal already belongs to the caller's session this succeeds
+ * immediately.  Otherwise a new session is created first when required, and
+ * the native ioctl is issued with a zero argument.
+ */
+static int
+ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	proc_t *p = curproc;
+	pid_t	tty_sid, my_sid;
+	int	err, rval;
+	boolean_t need_session;
+
+	/* Look up our own session id. */
+	mutex_enter(&p->p_splock);
+	my_sid = p->p_sessp->s_sid;
+	mutex_exit(&p->p_splock);
+
+	/*
+	 * Nothing to do if the tty is already ours.  If it has no controlling
+	 * session at all, the TIOCSCTTY below takes care of that.
+	 */
+	err = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&tty_sid,
+	    FLFAKE(fp), fp->f_cred, &rval, NULL);
+	if (err == 0 && tty_sid == my_sid)
+		return (0);
+
+	/*
+	 * TIOCSCTTY fails unless the caller is a session leader, so become
+	 * one if we are not already.
+	 */
+	mutex_enter(&pidlock);
+	need_session = (p->p_sessp->s_sidp != p->p_pidp &&
+	    !pgmembers(p->p_pid)) ? B_TRUE : B_FALSE;
+	mutex_exit(&pidlock);
+	if (need_session)
+		sess_create();
+
+	err = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp),
+	    fp->f_cred, &rval, NULL);
+	if (err != 0)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/* Socket-related translators */
+
+/*
+ * SIOCATMARK: only passed through for SOCK_STREAM sockets.
+ *
+ * Linux answers ENOTTY for SIOCATMARK on a UDP socket (EINVAL before 2.6.39)
+ * whereas illumos permits it, so anything that is not a stream socket is
+ * rejected here with ENOTTY.
+ */
+static int
+ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	vnode_t *vp = fp->f_vnode;
+	int err, rval;
+
+	if (vp->v_type != VSOCK)
+		return (set_errno(ENOTTY));
+	if (VTOSO(vp)->so_type != SOCK_STREAM)
+		return (set_errno(ENOTTY));
+
+	err = VOP_IOCTL(vp, cmd, arg, FLUSER(fp), fp->f_cred, &rval, NULL);
+	if (err)
+		return (set_errno(err));
+
+	return (0);
+}
+
+/*
+ * Issue an interface-related ioctl on behalf of the translators.
+ *
+ * illumos is strict about the address family of the socket used for these
+ * ioctls while Linux is lenient.  To sidestep that strictness the request is
+ * sent to a per-zone AF_INET ksocket, created on first use, for every socket
+ * family except AF_PACKET; AF_PACKET sockets get the ioctl directly.  If the
+ * ksocket is unavailable the ioctl is issued against the caller's vnode.
+ *
+ * Returns the raw error value; callers apply set_errno() themselves.
+ */
+static int
+ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred)
+{
+	lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone);
+	ksocket_t sock;
+	int rval;
+
+	ASSERT(lxzd != NULL);
+
+	if (vn->v_type == VSOCK && VTOSO(vn)->so_family == AF_PACKET)
+		return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rval, NULL));
+
+	mutex_enter(&lxzd->lxzd_lock);
+	if (lxzd->lxzd_ioctl_sock == NULL) {
+		/*
+		 * Linux does not care which address family a socket has when
+		 * servicing interface ioctls.  We imitate that by directing
+		 * them at a ksocket set aside for the purpose.
+		 */
+		(void) ksocket_socket(&lxzd->lxzd_ioctl_sock, AF_INET,
+		    SOCK_DGRAM, 0, 0, curproc->p_zone->zone_kcred);
+	}
+	sock = lxzd->lxzd_ioctl_sock;
+	mutex_exit(&lxzd->lxzd_lock);
+
+	if (sock == NULL)
+		return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rval, NULL));
+
+	return (ksocket_ioctl(sock, cmd, arg, &rval, cred));
+}
+
+/*
+ * Fetch an interface's hardware address with SIOCGLIFHWADDR and rewrite the
+ * address held in the lifreq into the form produced by lx_stol_hwaddr().
+ *
+ * An EADDRNOTAVAIL failure for an interface whose name starts with "lo" is
+ * treated as success with an all-zero address of ETHERADDRL bytes and type
+ * DL_LOOP.  Returns the raw error value; the caller applies set_errno().
+ */
+static int
+ict_sioghwaddr(file_t *fp, struct lifreq *lreq)
+{
+	struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr;
+	struct sockaddr lx_hw;
+	int err, hwlen;
+
+	err = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq,
+	    FLFAKE(fp), fp->f_cred);
+
+	if (err == EADDRNOTAVAIL && strncmp(lreq->lifr_name, "lo", 2) == 0) {
+		/* Pretend the lookup worked for what looks like loopback. */
+		sdl->sdl_type = DL_LOOP;
+		sdl->sdl_alen = ETHERADDRL;
+		bzero(LLADDR(sdl), sdl->sdl_alen);
+		err = 0;
+	}
+
+	if (err != 0)
+		return (err);
+
+	bzero(&lx_hw, sizeof (lx_hw));
+	lx_stol_hwaddr(sdl, &lx_hw, &hwlen);
+	bcopy(&lx_hw, &lreq->lifr_addr, hwlen + sizeof (sdl->sdl_family));
+
+	return (0);
+}
+
+/*
+ * SIOCGIFNAME: map the interface index in the caller's ifreq to an interface
+ * name, converted to its Linux form.
+ *
+ * Returns 0 on success.  Fails with EINVAL if the file is not a socket, no
+ * netstack is available, or no interface has that index, and with EFAULT if
+ * the request cannot be copied in or out.
+ */
+static int
+ict_siocgifname(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	/*
+	 * 'len' bytes (the Linux ifreq size for the caller's data model) are
+	 * copied in and out, so the buffer has to be at least as large as the
+	 * biggest Linux layout rather than just sizeof (struct ifreq).
+	 */
+	union {
+		struct ifreq	native;
+		lx_ifreq32_t	lx32;
+		lx_ifreq64_t	lx64;
+	} ubuf;
+	struct ifreq	*req = &ubuf.native;
+	int		len;
+	char		name[LIFNAMSIZ];
+	netstack_t *ns;
+	ip_stack_t *ipst;
+	phyint_t *phyi;
+
+	if (fp->f_vnode->v_type != VSOCK) {
+		return (set_errno(EINVAL));
+	}
+
+	len = (curproc->p_model == DATAMODEL_LP64) ? sizeof (lx_ifreq64_t) :
+	    sizeof (lx_ifreq32_t);
+	if (copyin((struct ifreq *)arg, req, len) != 0) {
+		return (set_errno(EFAULT));
+	}
+
+	/*
+	 * Since Linux calls this ioctl on all sorts of sockets, perform the
+	 * interface name lookup manually.
+	 */
+	if ((ns = netstack_get_current()) == NULL) {
+		return (set_errno(EINVAL));
+	}
+	ipst = ns->netstack_ip;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+	    (void *) &req->ifr_index, NULL);
+	if (phyi != NULL) {
+		/* strncpy does not terminate a source that fills the buffer */
+		(void) strncpy(name, phyi->phyint_name, LIFNAMSIZ);
+		name[LIFNAMSIZ - 1] = '\0';
+		lx_ifname_convert(name, LX_IF_FROMNATIVE);
+	} else {
+		name[0] = '\0';
+	}
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	netstack_rele(ns);
+
+	if (strlen(name) != 0) {
+		/* Truncate for ifreq, keeping it terminated, and copyout */
+		(void) strncpy(req->ifr_name, name, IFNAMSIZ);
+		req->ifr_name[IFNAMSIZ - 1] = '\0';
+		if (copyout(req, (struct ifreq *)arg, len) != 0) {
+			return (set_errno(EFAULT));
+		}
+		return (0);
+	}
+
+	return (set_errno(EINVAL));
+}
+
+/*
+ * Handle the Linux ifreq-based interface ioctls: convert the request to a
+ * native lifreq, issue the matching lifreq ioctl, and convert the result back
+ * to a Linux ifreq.
+ *
+ * Returns 0 on success.  Fails with EFAULT if the request cannot be copied in
+ * or out, with EINVAL for commands that are not handled here, and otherwise
+ * with the error from the underlying ioctl.
+ */
+static int
+ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	/*
+	 * 'len' bytes (the Linux ifreq size for the caller's data model) are
+	 * moved through this buffer, so it has to be at least as large as the
+	 * biggest Linux layout rather than just sizeof (struct ifreq).
+	 */
+	union {
+		struct ifreq	native;
+		lx_ifreq32_t	lx32;
+		lx_ifreq64_t	lx64;
+	} ubuf;
+	struct ifreq	*req = &ubuf.native;
+	struct lifreq	lreq;
+	int		error, len;
+
+	/* Convert from Linux ifreq to illumos lifreq */
+	if (curproc->p_model == DATAMODEL_LP64)
+		len = sizeof (lx_ifreq64_t);
+	else
+		len = sizeof (lx_ifreq32_t);
+	if (copyin((struct ifreq *)arg, req, len) != 0)
+		return (set_errno(EFAULT));
+	bzero(&lreq, sizeof (lreq));
+	(void) strncpy(lreq.lifr_name, req->ifr_name, IFNAMSIZ);
+	/* everything after the name is carried over as opaque bytes */
+	bcopy(&req->ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ);
+	lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE);
+
+	switch (cmd) {
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+	case SIOCGIFMTU:
+	case SIOCSIFMTU:
+		/*
+		 * Convert cmd from SIO*IF* to SIO*LIF*.
+		 * This is needed since Linux allows ifreq operations on ipv6
+		 * sockets where illumos does not.
+		 */
+		cmd = ((cmd & IOC_INOUT) |
+		    _IOW('i', ((cmd & 0xff) + 100), struct lifreq));
+		error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+		    FLFAKE(fp), fp->f_cred);
+		break;
+	case SIOCGIFINDEX:
+		cmd = SIOCGLIFINDEX;
+		error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+		    FLFAKE(fp), fp->f_cred);
+		break;
+	case SIOCGIFFLAGS:
+		cmd = SIOCGLIFFLAGS;
+		error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+		    FLFAKE(fp), fp->f_cred);
+		if (error == 0)
+			lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE);
+		break;
+	case SIOCSIFFLAGS:
+		cmd = SIOCSLIFFLAGS;
+		lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE);
+		error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+		    FLFAKE(fp), fp->f_cred);
+		break;
+	case SIOCGIFHWADDR:
+		error = ict_sioghwaddr(fp, &lreq);
+		break;
+	case LX_SIOCGIFTXQLEN:
+		/*
+		 * Illumos lacks the notion of txqlen.  Confirm the provided
+		 * interface is valid with SIOCGLIFINDEX and return a fake
+		 * txqlen of 1.  Loopback devices will report txqlen of 0.
+		 */
+		if (strncmp(lreq.lifr_name, "lo", 2) == 0) {
+			lreq.lifr_index = 0;
+			error = 0;
+			break;
+		}
+		cmd = SIOCGLIFINDEX;
+		error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq,
+		    FLFAKE(fp), fp->f_cred);
+		if (error == 0) {
+			/* lifr_index aliases to the qlen field */
+			lreq.lifr_index = 1;
+		}
+		break;
+	case LX_SIOCSIFHWADDR:
+		/*
+		 * We're not going to support SIOCSIFHWADDR, but we need to be
+		 * able to check the result of the copyin first to see if the
+		 * command should have returned EFAULT.
+		 */
+		/* FALLTHROUGH */
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	if (error != 0)
+		return (set_errno(error));
+
+	/* Convert back to a Linux ifreq */
+	lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE);
+	bzero(&ubuf, sizeof (ubuf));
+	/* a converted name that fills the field must still be terminated */
+	(void) strncpy(req->ifr_name, lreq.lifr_name, IFNAMSIZ);
+	req->ifr_name[IFNAMSIZ - 1] = '\0';
+	bcopy(&lreq.lifr_lifru, &req->ifr_ifru, len - IFNAMSIZ);
+
+	if (copyout(req, (struct ifreq *)arg, len) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * SIOCGIFCONF for 32-bit callers.
+ *
+ * With a non-positive length or a null buffer the caller is asking how much
+ * space is needed: the interface count from SIOCGIFNUM is turned into a byte
+ * length and copied back.  Otherwise the native interface list is fetched,
+ * each entry is copied into the Linux 32-bit ifreq layout with its name
+ * converted, and both the entries and the updated length are copied out.
+ */
+static int
+ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	lx_ifconf32_t	conf;
+	lx_ifreq32_t	*lx_reqs;
+	struct ifconf	native;
+	int		nifs, err, idx, lx_len;
+
+	if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0)
+		return (set_errno(EFAULT));
+
+	/* A size query rather than a request for the list itself. */
+	if (conf.if_len <= 0 || conf.if_buf == NULL) {
+		err = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM,
+		    (intptr_t)&nifs, FLFAKE(fp), fp->f_cred);
+		if (err != 0)
+			return (set_errno(err));
+
+		conf.if_len = nifs * sizeof (lx_ifreq32_t);
+
+		if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0)
+			return (set_errno(EFAULT));
+		return (0);
+	}
+
+	/* How many Linux entries fit in the caller's buffer. */
+	nifs = conf.if_len / sizeof (lx_ifreq32_t);
+
+	/* Fetch the native list into a kernel buffer of matching count. */
+	native.ifc_len = nifs * sizeof (struct ifreq);
+	native.ifc_req = (struct ifreq *)kmem_alloc(native.ifc_len, KM_SLEEP);
+
+	err = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&native, FLFAKE(fp),
+	    fp->f_cred);
+	if (err != 0) {
+		kmem_free(native.ifc_req, nifs * sizeof (struct ifreq));
+		return (set_errno(err));
+	}
+
+	/* Build the Linux-format list, renaming interfaces as we go. */
+	lx_len = nifs * sizeof (lx_ifreq32_t);
+	lx_reqs = (lx_ifreq32_t *)kmem_alloc(lx_len, KM_SLEEP);
+	idx = 0;
+	while (idx < native.ifc_len / sizeof (struct ifreq)) {
+		bcopy(&native.ifc_req[idx], lx_reqs + idx,
+		    sizeof (lx_ifreq32_t));
+		lx_ifname_convert(lx_reqs[idx].ifr_name, LX_IF_FROMNATIVE);
+		idx++;
+	}
+	conf.if_len = idx * sizeof (*lx_reqs);
+	kmem_free(native.ifc_req, nifs * sizeof (struct ifreq));
+
+	err = 0;
+	if (copyout(lx_reqs, (caddr_t)(uintptr_t)conf.if_buf,
+	    conf.if_len) != 0 ||
+	    copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0)
+		err = set_errno(EFAULT);
+
+	kmem_free(lx_reqs, lx_len);
+	return (err);
+}
+
+/*
+ * SIOCGIFCONF for 64-bit callers.
+ *
+ * With a non-positive length or a null buffer the caller is asking how much
+ * space is needed: the interface count from SIOCGIFNUM is turned into a byte
+ * length and copied back.  Otherwise the native interface list is fetched,
+ * each entry is copied into the Linux 64-bit ifreq layout with its name
+ * converted, and both the entries and the updated length are copied out.
+ *
+ * Returns 0 on success.  Fails with EFAULT if the request or results cannot
+ * be copied, and otherwise with the error from the underlying ioctl.
+ *
+ * NOTE(review): if_len comes from the caller and directly sizes two KM_SLEEP
+ * allocations; consider bounding it by the SIOCGIFNUM count.
+ */
+static int
+ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	lx_ifconf64_t	conf;
+	lx_ifreq64_t	*oreq;
+	struct ifconf	sconf;
+	int		ifcount, error, i, buf_len;
+
+	if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0)
+		return (set_errno(EFAULT));
+
+	/* They want to know how many interfaces there are. */
+	if (conf.if_len <= 0 || conf.if_buf == NULL) {
+		error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM,
+		    (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred);
+		if (error != 0)
+			return (set_errno(error));
+
+		conf.if_len = ifcount * sizeof (lx_ifreq64_t);
+
+		if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0)
+			return (set_errno(EFAULT));
+		return (0);
+	} else {
+		ifcount = conf.if_len / sizeof (lx_ifreq64_t);
+	}
+
+	/* Get interface configuration list. */
+	sconf.ifc_len = ifcount * sizeof (struct ifreq);
+	sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP);
+
+	error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp),
+	    fp->f_cred);
+	if (error != 0) {
+		kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+		return (set_errno(error));
+	}
+
+	/* Convert data to Linux format & rename interfaces */
+	buf_len = ifcount * sizeof (lx_ifreq64_t);
+	oreq = (lx_ifreq64_t *)kmem_alloc(buf_len, KM_SLEEP);
+	for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) {
+		/*
+		 * The Linux entry is padded and may be larger than the native
+		 * one.  Copy no more than the native entry holds, so that we
+		 * neither pick up the start of the next entry nor read past
+		 * the end of the native buffer, and leave the rest zeroed.
+		 */
+		bzero(oreq + i, sizeof (lx_ifreq64_t));
+		bcopy(&sconf.ifc_req[i], oreq + i,
+		    MIN(sizeof (struct ifreq), sizeof (lx_ifreq64_t)));
+		lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE);
+	}
+	conf.if_len = i * sizeof (*oreq);
+	kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq));
+
+	error = 0;
+	if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 ||
+	    copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0)
+		error = set_errno(EFAULT);
+
+	kmem_free(oreq, buf_len);
+	return (error);
+}
+
+/*
+ * SIOCGIFCONF entry point: choose the 64-bit or 32-bit implementation based
+ * on the calling process's data model.
+ */
+static int
+ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	if (curproc->p_model != DATAMODEL_LP64)
+		return (ict_siocgifconf32(fp, cmd, arg, lxcmd));
+
+	return (ict_siocgifconf64(fp, cmd, arg, lxcmd));
+}
+
+/*
+ * Translator for the autofs ioctls.
+ *
+ * Some of these ioctls need to hand back a positive integer that is a result
+ * rather than an error.  To keep the rest of the code undisturbed, the value
+ * from the vnode is interpreted as follows: positive means an errno, zero
+ * means plain success, and negative means a successful result whose value is
+ * the negation of what was returned.
+ */
+static int
+ict_autofs(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	int ret;
+	int rval;
+
+	ret = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rval,
+	    NULL);
+
+	if (ret == 0)
+		return (0);
+	if (ret < 0)
+		return (-ret);
+
+	return (set_errno(ret));
+}
+
+/*
+ * Structure used to define an ioctl translator.
+ *
+ * lict_lxcmd is the Linux command number, lict_cmd the command number given
+ * to the handler as 'cmd', and lict_func the handler itself.  The handler
+ * also receives the file, the caller's argument and the Linux command.
+ */
+typedef struct lx_ioc_cmd_translator {
+	int	lict_lxcmd;
+	int	lict_cmd;
+	int	(*lict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd);
+} lx_ioc_cmd_translator_t;
+
+/*
+ * Table entry builders.  Each expands to one initializer, including its
+ * trailing comma, so entries are listed in a table without separators.
+ */
+
+/* Map LX_<sym> to the native <sym>, handled by ict_pass(). */
+#define	LX_IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym)				\
+	{ (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass },
+
+/* Map LX_<sym> to the native <sym>, handled by the given function. */
+#define	LX_IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler)		\
+	{ (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler },
+
+/* Use <sym> as both command numbers, handled by the given function. */
+#define	LX_IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler)		\
+	{ (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler },
+
+/* Use <sym> as both command numbers, handled by ict_pass(). */
+#define	LX_IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym)			\
+	{ (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass },
+
+/* All-zero sentinel that ends a table; note it has no trailing comma. */
+#define	LX_IOC_CMD_TRANSLATOR_END					\
+	{0, 0, NULL}
+
+/*
+ * Translators for the generic file descriptor, streams and terminal ioctls.
+ * The table ends with the LX_IOC_CMD_TRANSLATOR_END sentinel; presumably the
+ * ioctl dispatcher (not shown here) scans it for a matching Linux command.
+ */
+static lx_ioc_cmd_translator_t lx_ioc_xlate_fd[] = {
+	LX_IOC_CMD_TRANSLATOR_FILTER(FIONBIO,	ict_fionbio)
+	LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD,	ict_fionread)
+	LX_IOC_CMD_TRANSLATOR_PASS(FIOASYNC)
+
+	/* streams related */
+	LX_IOC_CMD_TRANSLATOR_PASS(TCXONC)
+	LX_IOC_CMD_TRANSLATOR_PASS(TCFLSH)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCEXCL)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCNXCL)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCSTI)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIS)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIC)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCMSET)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCSETD)
+	LX_IOC_CMD_TRANSLATOR_PASS(TCSBRK)
+
+	/* terminal related */
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCGETD)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCGSID)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY)
+	LX_IOC_CMD_TRANSLATOR_PASS(TIOCPKT)
+
+	/* terminal ioctls that need argument or result conversion */
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETS,		ict_tcsets)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSW,		ict_tcsets)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSF,		ict_tcsets)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETA,		ict_tcseta)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAW,		ict_tcseta)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAF,		ict_tcseta)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCGETS,		ict_tcgets)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TCGETA,		ict_tcgeta)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ,	ict_tiocgwinsz)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP,	ict_tcsbrkp)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP,		ict_tiocspgrp)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP,		ict_tiocgpgrp)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK,	ict_sptlock)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN,	ict_gptn)
+	LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY,		ict_tiocsctty)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_socket[] = {
+	LX_IOC_CMD_TRANSLATOR_PASS(FIOGETOWN)
+
+	LX_IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP)
+	LX_IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP)
+	LX_IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK,	ict_siocatmark)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN,	ict_siolifreq)
+	LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF,	ict_siocgifconf)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFNAME,	ict_siocgifname)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_dtrace[] = {
+	LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_autofs[] = {
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_READY)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_FAIL)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_CATATONIC)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOVER)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_SETTIMEOUT)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE_MULTI)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOSUBVER)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_ASKUMOUNT)
+
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_VERSION_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOVER_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_READY_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_FAIL_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CATATONIC_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_TIMEOUT_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_REQUESTER_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_EXPIRE_CMD)
+	LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD,
+	    ict_autofs)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_hd[] = {
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_HDIO_GETGEO, ict_hdgetgeo)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static lx_ioc_cmd_translator_t lx_ioc_xlate_blk[] = {
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE, ict_blkgetsize)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKSSZGET, ict_blkgetssize)
+	LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE64, ict_blkgetsize64)
+
+	LX_IOC_CMD_TRANSLATOR_END
+};
+
+static void
+lx_ioctl_vsd_free(void *data)
+{
+	kmem_free(data, sizeof (struct lx_cc));
+}
+
+void
+lx_ioctl_init(void)
+{
+	vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free);	/* set up the key */
+}
+
+void
+lx_ioctl_fini(void)
+{
+	vsd_destroy(&lx_ioctl_vsd);	/* undo lx_ioctl_init() */
+}
+
+long
+lx_ioctl(int fdes, int cmd, intptr_t arg)
+{
+	lx_ioc_cmd_translator_t *xl = NULL;
+	int nomatch = ENOTTY;	/* errno when the table has no entry for cmd */
+	file_t *fp;
+	int rv;
+
+	/* Close-on-exec toggles act on the fd itself and need no file_t. */
+	if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) {
+		int fdflag = (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0;
+
+		rv = f_setfd_error(fdes, fdflag);
+		return ((rv != 0) ? set_errno(rv) : 0);
+	}
+
+	if ((fp = getf(fdes)) == NULL)
+		return (set_errno(EBADF));
+
+	/* Bits 8-15 of the command select the translation table. */
+	switch ((cmd & 0xff00) >> 8) {
+	case LX_IOC_TYPE_FD:
+		xl = lx_ioc_xlate_fd;
+		break;
+
+	case LX_IOC_TYPE_DTRACE:
+		xl = lx_ioc_xlate_dtrace;
+		break;
+
+	case LX_IOC_TYPE_SOCK:
+		xl = lx_ioc_xlate_socket;
+		nomatch = EOPNOTSUPP;
+		break;
+
+	case LX_IOC_TYPE_AUTOFS:
+		xl = lx_ioc_xlate_autofs;
+		break;
+
+	case LX_IOC_TYPE_BLK:
+		xl = lx_ioc_xlate_blk;
+		break;
+
+	case LX_IOC_TYPE_HD:
+		xl = lx_ioc_xlate_hd;
+		break;
+
+	default:
+		releasef(fdes);
+		return (set_errno(ENOTTY));
+	}
+	VERIFY(xl != NULL);
+
+	/*
+	 * Supported ioctls never share a cmd value, so the fd type is not
+	 * examined; the handler's vnode code is assumed to reject misuse.
+	 */
+	for (; xl->lict_func != NULL; xl++) {
+		if (xl->lict_lxcmd == cmd) {
+			rv = xl->lict_func(fp, xl->lict_cmd, arg,
+			    xl->lict_lxcmd);
+			releasef(fdes);
+			return (rv);
+		}
+	}
+
+	releasef(fdes);
+	return (set_errno(nomatch));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c
new file mode 100644
index 0000000000..13397e199e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c
@@ -0,0 +1,66 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/lx_brand.h>
+
+/* 'which' values. */
+#define	LX_IOPRIO_WHO_PROCESS	1
+#define	LX_IOPRIO_WHO_PGRP	2
+#define	LX_IOPRIO_WHO_USER	3
+
+/*
+ * The possible values for the class. We report best effort (BE) as the class
+ * in use.
+ */
+#define	LX_IOPRIO_CLASS_RT	1
+#define	LX_IOPRIO_CLASS_BE	2
+#define	LX_IOPRIO_CLASS_IDLE	3
+
+/* Mask layout: class in bits 13 and up, priority level in the bits below. */
+#define	LX_IOPRIO_CLASS_SHIFT	13
+#define	LX_IOPRIO_PRIO_CLASS(m)	((m) >> LX_IOPRIO_CLASS_SHIFT)
+
+/* ARGSUSED */
+long
+lx_ioprio_get(int which, int who)
+{
+	if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER)
+		return (set_errno(EINVAL));
+	return (LX_IOPRIO_CLASS_BE << LX_IOPRIO_CLASS_SHIFT);
+}
+
+/*
+ * We allow setting any valid class, even though it's ignored.
+ * We ignore the 'who' parameter which means that we're not searching for
+ * the specified target in order to return a specific errno in the case that
+ * the target does not exist.
+ */
+/* ARGSUSED */
+long
+lx_ioprio_set(int which, int who, int mask)
+{
+	int prio_class = LX_IOPRIO_PRIO_CLASS(mask);
+	int which_ok, class_ok;
+
+	which_ok = (which >= LX_IOPRIO_WHO_PROCESS &&
+	    which <= LX_IOPRIO_WHO_USER);
+	class_ok = (prio_class >= LX_IOPRIO_CLASS_RT &&
+	    prio_class <= LX_IOPRIO_CLASS_IDLE);
+
+	/* Nothing is stored: a well-formed request simply succeeds. */
+	return ((which_ok && class_ok) ? 0 : set_errno(EINVAL));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
new file mode 100644
index 0000000000..eeed914566
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c
@@ -0,0 +1,402 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/thread.h>
+#include <sys/signal.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <lx_signum.h>
+#include <sys/contract/process_impl.h>
+
+extern int kill(pid_t, int);
+
+/*
+ * Check if it is legal to send this signal to the init process.  Linux
+ * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid
+ * 1.
+ */
+static int
+init_sig_check(int sig, pid_t pid)
+{
+	proc_t *initp;
+	int err = 0;
+
+	mutex_enter(&pidlock);
+	initp = prfind(pid);
+	if (initp == NULL || initp->p_stat == SIDL) {
+		err = ESRCH;
+	} else if (sig != 0 && (sigismember(&cantmask, sig) ||
+	    PTOU(initp)->u_signal[sig - 1] == SIG_DFL ||
+	    PTOU(initp)->u_signal[sig - 1] == SIG_IGN)) {
+		/* In cantmask, defaulted or ignored: not a handled signal. */
+		err = EPERM;
+	}
+	mutex_exit(&pidlock);
+	return (err);
+}
+
+static long
+lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill)
+{
+	kthread_t *t;
+	proc_t *pp, *cp = curproc;
+	pid_t initpid;
+	sigqueue_t *sqp;
+	int tid = 1;	/* default tid */
+	int sig, rv;
+
+	/*
+	 * Common code for tkill(2) and tgkill(2).  Unlike kill(2), a pid <= 0
+	 * carries no special meaning here, so it is rejected along with any
+	 * signal number that has no native translation.
+	 */
+	if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) ||
+	    ((sig = ltos_signo[lx_sig]) < 0))
+		return (set_errno(EINVAL));
+
+	/*
+	 * If the Linux pid is 1, translate the pid to the actual init
+	 * pid for the zone.  Note that Linux dictates that no unhandled
+	 * signals may be sent to init, so check for that, too.
+	 *
+	 * Otherwise, extract the tid and real pid from the Linux pid.
+	 */
+	initpid = cp->p_zone->zone_proc_initpid;
+	if (pid == 1)
+		pid = initpid;
+	if ((pid == initpid) && ((rv = init_sig_check(sig, pid)) != 0))
+		return (set_errno(rv));
+	else if (lx_lpid_to_spair(pid, &pid, &tid) < 0)
+		return (set_errno(ESRCH));
+
+	if (tgkill && tgid != pid)
+		return (set_errno(ESRCH));	/* thread not in that group */
+
+	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+
+	/*
+	 * Find the target; its p_lock is taken before pidlock is dropped.
+	 */
+	mutex_enter(&pidlock);
+	if (((pp = prfind(pid)) == NULL) || (pp->p_stat == SIDL)) {
+		mutex_exit(&pidlock);
+		rv = set_errno(ESRCH);
+		goto free_and_exit;
+	}
+	mutex_enter(&pp->p_lock);
+	mutex_exit(&pidlock);
+
+	/*
+	 * Deny permission to send the signal if either of the following
+	 * is true:
+	 *
+	 *	+ The signal is SIGCONT and the target pid is not in the same
+	 *	  session as the sender
+	 *
+	 *	+ prochasprocperm() shows the user lacks sufficient permission
+	 *	  to send the signal to the target pid
+	 */
+	if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) ||
+	    (!prochasprocperm(pp, cp, CRED()))) {
+		mutex_exit(&pp->p_lock);
+		rv = set_errno(EPERM);
+		goto free_and_exit;
+	}
+
+	/* the requested thread must exist in the target process */
+	if ((t = idtot(pp, tid)) == NULL) {
+		mutex_exit(&pp->p_lock);
+		rv = set_errno(ESRCH);
+		goto free_and_exit;
+	}
+
+	/* a signal of 0 means just check for the existence of the thread */
+	if (lx_sig == 0) {
+		mutex_exit(&pp->p_lock);
+		rv = 0;
+		goto free_and_exit;
+	}
+
+	sqp->sq_info.si_signo = sig;
+	sqp->sq_info.si_code = SI_LWP;
+	sqp->sq_info.si_pid = cp->p_pid;
+	sqp->sq_info.si_zoneid = getzoneid();
+	sqp->sq_info.si_uid = crgetruid(CRED());
+	sigaddqa(pp, t, sqp);	/* sqp is handed off, not freed */
+
+	mutex_exit(&pp->p_lock);
+
+	return (0);
+
+free_and_exit:
+	kmem_free(sqp, sizeof (sigqueue_t));
+	return (rv);
+}
+
+long
+lx_tgkill(pid_t tgid, pid_t pid, int lx_sig)
+{
+	return (lx_thrkill(tgid, pid, lx_sig, B_TRUE));	/* tgid enforced */
+}
+
+long
+lx_tkill(pid_t pid, int lx_sig)
+{
+	return (lx_thrkill(0, pid, lx_sig, B_FALSE));	/* tgid ignored */
+}
+
+long
+lx_kill(pid_t lx_pid, int lx_sig)
+{
+	pid_t s_pid, initpid;
+	sigsend_t v;
+	zone_t *zone = curproc->p_zone;
+	struct proc *p;
+	int err, sig, nfound;
+
+	if ((lx_sig < 0) || (lx_sig > LX_NSIG) ||
+	    ((sig = ltos_signo[lx_sig]) < 0))
+		return (set_errno(EINVAL));
+
+	/*
+	 * Since some Linux apps rely on init(1M) having PID 1, we
+	 * transparently translate 1 to the real init(1M)'s pid.  We then
+	 * check to be sure that it is legal for this process to send this
+	 * signal to init(1M).
+	 */
+	initpid = zone->zone_proc_initpid;
+	if (lx_pid == 1) {
+		s_pid = initpid;
+	} else if (lx_pid == 0 || lx_pid == -1) {
+		s_pid = 0;	/* 0: passed to kill(); -1: walks practive */
+	} else if (lx_pid > 0) {
+		if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) {
+			/*
+			 * If we didn't find this pid that means it doesn't
+			 * exist in this zone.
+			 */
+			return (set_errno(ESRCH));
+		}
+	} else {
+		ASSERT(lx_pid < 0);
+		if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) {
+			/*
+			 * If we didn't find this pid it means that the
+			 * process group leader doesn't exist in this zone.
+			 * In this case assuming that the Linux pid is
+			 * the same as the Solaris pid will get us the
+			 * correct behavior.
+			 */
+			s_pid = -lx_pid;
+		}
+	}
+
+	if ((s_pid == initpid) && ((err = init_sig_check(sig, s_pid)) != 0))
+		return (set_errno(err));
+
+	/*
+	 * For individual processes, kill() semantics are the same between
+	 * Solaris and Linux.
+	 */
+	if (lx_pid >= 0)
+		return (kill(s_pid, sig));
+
+	/*
+	 * In Solaris, sending a signal to -pid means "send a signal to
+	 * everyone in process group pid."  In Linux it means "send a
+	 * signal to everyone in the group other than init."  Sending a
+	 * signal to -1 means "send a signal to every process except init
+	 * and myself."
+	 */
+
+	bzero(&v, sizeof (v));
+	v.sig = sig;
+	v.checkperm = 1;
+	v.sicode = SI_USER;
+	err = 0;
+
+	mutex_enter(&pidlock);
+
+	p = (lx_pid == -1) ? practive : pgfind(s_pid);
+	nfound = 0;
+	while (err == 0 && p != NULL) {
+		if ((p->p_zone == zone) && (p->p_stat != SIDL) &&
+		    (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) {
+			nfound++;
+			err = sigsendproc(p, &v);
+		}
+
+		p = (lx_pid == -1) ? p->p_next : p->p_pglink;
+	}
+	mutex_exit(&pidlock);
+
+	/*
+	 * If we found no processes, we'll return ESRCH -- but unlike our
+	 * native kill(2), we do not return EPERM if processes are found but
+	 * we did not have permission to send any of them a signal.
+	 */
+	if (nfound == 0)
+		err = ESRCH;
+
+	return (err ? set_errno(err) : 0);
+}
+
+/*
+ * Kernel half of rt_sigqueueinfo(2): it delivers signals whose si_code is
+ * not queueable.  Queueable codes are sent by the user-level
+ * lx_rt_sigqueueinfo() through the sigqueue syscall and are refused here.
+ * Returns 0, or sets errno to EFAULT, ESRCH, EINVAL or EPERM.
+ */
+int
+lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
+{
+	proc_t *target_proc;
+	pid_t s_pid;
+	zone_t *zone = curproc->p_zone;
+	sigsend_t send;
+	int err;
+	siginfo_t kinfo;
+
+	if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0)
+		return (set_errno(EFAULT));
+	/* The pid must be exact (no negatives) and the signal in range. */
+	if (tgid == 0)
+		return (set_errno(ESRCH));
+	if (tgid < 0 || sig < 0 || sig >= NSIG)
+		return (set_errno(EINVAL));
+	/*
+	 * Translate init directly, otherwise use the convenient utility
+	 * function to translate. Since we're sending to the whole group, we
+	 * only need the solaris pid, and not the lwp id.
+	 */
+	if (tgid == 1) {
+		s_pid = zone->zone_proc_initpid;
+	} else {
+		if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) {
+			/*
+			 * If we didn't find this pid that means it doesn't
+			 * exist in this zone.
+			 */
+			return (set_errno(ESRCH));
+		}
+	}
+	/*
+	 * We shouldn't have queueable signals here, those are sent elsewhere
+	 * by the usermode handler for this emulated call.
+	 */
+	if (SI_CANQUEUE(kinfo.si_code)) {
+		return (set_errno(EINVAL));
+	}
+	/* Since our signal shouldn't queue, we just call sigsendproc(). */
+	bzero(&send, sizeof (send));
+	send.sig = sig;
+	send.checkperm = 1;
+	send.sicode = kinfo.si_code;
+	send.value = kinfo.si_value;
+
+	mutex_enter(&pidlock);
+	target_proc = prfind(s_pid);
+	err = 0;
+	if (target_proc != NULL) {
+		err = sigsendproc(target_proc, &send);
+		if (err == 0 && send.perm == 0)
+			err = EPERM;
+	} else {
+		err = ESRCH;
+	}
+	mutex_exit(&pidlock);
+
+	return (err ? set_errno(err) : 0);
+}
+
+/*
+ * Handles every rt_tgsigqueueinfo call regardless of si_code: queues 'sig'
+ * on thread 'tid' of group 'tgid'.  Returns 0 or sets EFAULT/ESRCH/EINVAL.
+ */
+int
+lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo)
+{
+	id_t s_tid;
+	pid_t s_pid;
+	proc_t *target_proc;
+	sigqueue_t *sqp;
+	kthread_t *t;
+	siginfo_t kinfo;
+	int err = 0;
+
+	if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0)
+		return (set_errno(EFAULT));
+	if (lx_lpid_to_spair(tid, &s_pid, &s_tid) != 0)
+		return (set_errno(ESRCH));
+	/* tgid is caller-supplied and must match the translated pid. */
+	if (s_pid != tgid)
+		return (set_errno(EINVAL));
+
+	/* KM_SLEEP may block, so allocate before any lock is taken. */
+	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+
+	mutex_enter(&pidlock);
+	if ((target_proc = prfind(s_pid)) == NULL) {
+		mutex_exit(&pidlock);
+		kmem_free(sqp, sizeof (sigqueue_t));
+		return (set_errno(ESRCH));
+	}
+	mutex_enter(&target_proc->p_lock);
+	mutex_exit(&pidlock);
+
+	/*
+	 * Adapted from lwp_kill, duplicated because sq_info is customized.
+	 * p_lock is held from here on, so every outcome falls through to
+	 * the mutex_exit() below instead of returning directly.
+	 */
+	if (sig < 0 || sig >= NSIG) {
+		err = EINVAL;
+	} else if ((t = idtot(target_proc, s_tid)) == NULL) {
+		err = ESRCH;
+	} else if (sig != 0) {
+		/* sig == 0 only probes for the thread; nothing is queued. */
+		sqp->sq_info.si_signo = sig;
+		sqp->sq_info.si_code = kinfo.si_code;
+		/* NOTE(review): target's pid, lx_thrkill() uses sender's; confirm. */
+		sqp->sq_info.si_pid = target_proc->p_pid;
+		sqp->sq_info.si_ctid = PRCTID(target_proc);
+		sqp->sq_info.si_zoneid = getzoneid();
+		sqp->sq_info.si_uid = crgetruid(CRED());
+		sigaddqa(target_proc, t, sqp);
+		sqp = NULL;	/* handed to sigaddqa(); must not be freed */
+	}
+	mutex_exit(&target_proc->p_lock);
+
+	if (sqp != NULL)
+		kmem_free(sqp, sizeof (sigqueue_t));
+	return ((err != 0) ? set_errno(err) : 0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_link.c b/usr/src/uts/common/brand/lx/syscall/lx_link.c
new file mode 100644
index 0000000000..23e0768581
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_link.c
@@ -0,0 +1,97 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/systm.h>
+#include <sys/lx_fcntl.h>
+
+#define	LX_LINK_ALLOWED	(LX_AT_SYMLINK_FOLLOW | LX_AT_EMPTY_PATH)
+
+static long
+lx_link_common(int ffd, char *from, int tfd, char *to, int flags)
+{
+	vnode_t *src_start = NULL, *dst_start = NULL;
+	enum symfollow follow = NO_FOLLOW;
+	int err;
+
+	if ((flags & ~LX_LINK_ALLOWED) != 0) {
+		return (set_errno(EINVAL));
+	}
+
+	if ((flags & LX_AT_EMPTY_PATH) == 0) {
+		char *names[2];
+		char first;
+		int i;
+
+		/*
+		 * Without AT_EMPTY_PATH neither name may be empty.  One byte
+		 * of each is fetched to find out, 'from' before 'to'.
+		 */
+		names[0] = from;
+		names[1] = to;
+		for (i = 0; i < 2; i++) {
+			if (copyin(names[i], &first, sizeof (first)) != 0)
+				return (set_errno(EFAULT));
+			if (first == '\0')
+				return (set_errno(ENOENT));
+		}
+
+		/*
+		 * XXX: When our support for LX capabilities improves, ENOENT
+		 * should be thrown when a process lacking CAP_DAC_READ_SEARCH
+		 * attempts to use the AT_EMPTY_PATH flag.
+		 */
+	}
+
+	if ((flags & LX_AT_SYMLINK_FOLLOW) != 0)
+		follow = FOLLOW;
+
+	/* Each step runs only if everything before it succeeded. */
+	err = fgetstartvp(ffd, from, &src_start);
+	if (err == 0)
+		err = fgetstartvp(tfd, to, &dst_start);
+	if (err == 0) {
+		err = vn_linkat(src_start, from, follow, dst_start, to,
+		    UIO_USERSPACE);
+	}
+
+	/* Release whichever start vnodes were actually obtained. */
+	if (src_start != NULL) {
+		VN_RELE(src_start);
+	}
+	if (dst_start != NULL) {
+		VN_RELE(dst_start);
+	}
+
+	return ((err != 0) ? set_errno(err) : 0);
+}
+
+long
+lx_link(char *from, char *to)
+{
+	return (lx_link_common(AT_FDCWD, from, AT_FDCWD, to, 0));
+}
+
+long
+lx_linkat(int ffd, char *from, int tfd, char *to, int flags)
+{
+	/* A Linux "cwd" sentinel on either side becomes native AT_FDCWD. */
+	return (lx_link_common(
+	    (ffd == LX_AT_FDCWD) ? AT_FDCWD : ffd, from,
+	    (tfd == LX_AT_FDCWD) ? AT_FDCWD : tfd, to, flags));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c
new file mode 100644
index 0000000000..2f29f56d5f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c
@@ -0,0 +1,38 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * From "uts/common/syscall/mkdir.c":
+ */
+extern int mkdirat(int, char *, int);
+
+long
+lx_mkdirat(int fd, char *dname, int dmode)
+{
+	/*
+	 * Only the "relative to cwd" sentinel differs between Linux and
+	 * the native mkdirat(); any other descriptor is passed unchanged.
+	 */
+	return (mkdirat((fd == LX_AT_FDCWD) ? AT_FDCWD : fd, dname, dmode));
+}
+
+long
+lx_mkdir(char *dname, int dmode)
+{
+	return (mkdirat(AT_FDCWD, dname, dmode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c
new file mode 100644
index 0000000000..aa6e12a7d8
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/segments.h>
+#include <sys/archsystm.h>
+#include <sys/proc.h>
+#include <sys/sysi86.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_ldt.h>
+
+/*
+ * Read the ldt_info structure in from the Linux app, convert it to an ssd
+ * structure, and then call setdscr() to do all the heavy lifting.
+ */
+static int
+write_ldt(void *data, ulong_t count)
+{
+	proc_t *p = curthread->t_procp;
+	struct ldt_info info;
+	user_desc_t desc;
+	struct ssd seg;
+	int rc;
+
+	/* The caller must hand over exactly one ldt_info record. */
+	if (count != sizeof (info))
+		return (set_errno(EINVAL));
+	if (copyin(data, &info, sizeof (info)) != 0)
+		return (set_errno(EFAULT));
+	if (info.entry_number >= MAXNLDT)
+		return (set_errno(EINVAL));
+
+	/* ldt_info -> user descriptor -> ssd, the form setdscr() takes. */
+	LDT_INFO_TO_DESC(&info, &desc);
+	usd_to_ssd(&desc, &seg, SEL_LDT(info.entry_number));
+
+	/*
+	 * Get everyone into a safe state before changing the LDT.
+	 */
+	if (!holdlwps(SHOLDFORK1))
+		return (set_errno(EINTR));
+
+	rc = setdscr(&seg);
+
+	/*
+	 * Let the held lwps run again, whether or not setdscr() worked.
+	 */
+	mutex_enter(&p->p_lock);
+	continuelwps(p);
+	mutex_exit(&p->p_lock);
+
+	return ((rc != 0) ? set_errno(rc) : 0);
+}
+
+static int
+read_ldt(void *uptr, ulong_t count)
+{
+	proc_t *p = curproc;
+	int nbytes;
+
+	/* A process without an LDT has nothing to report. */
+	if (p->p_ldt == NULL)
+		return (0);
+
+	/* Copy out the entries, truncated to the caller's buffer size. */
+	nbytes = (p->p_ldtlimit + 1) * sizeof (user_desc_t);
+	if (nbytes > count)
+		nbytes = count;
+	if (copyout(p->p_ldt, uptr, nbytes) != 0)
+		return (set_errno(EFAULT));
+	return (nbytes);
+}
+
+long
+lx_modify_ldt(int op, void *data, ulong_t count)
+{
+	long result;
+
+	/*
+	 * Only two operations are implemented:
+	 *	0 - copy the LDT out to 'data', at most 'count' bytes
+	 *	1 - install the single ldt_info entry found at 'data'
+	 * Every other value of 'op' fails with ENOSYS.
+	 */
+	if (op == 0) {
+		result = read_ldt(data, count);
+	} else if (op == 1) {
+		result = write_ldt(data, count);
+	} else {
+		result = set_errno(ENOSYS);
+	}
+
+	return (result);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c
new file mode 100644
index 0000000000..431c2ed1ba
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c
@@ -0,0 +1,260 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/inttypes.h>
+#include <sys/mutex.h>
+
+#include <sys/lx_types.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+
+extern int fcntl(int, int, intptr_t);
+extern int openat(int, char *, int, int);
+extern int open(char *, int, int);
+extern int close(int);
+extern int cioctl(file_t *, int, intptr_t, int *);
+extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **,
+    vnode_t *);
+
+
+static int
+ltos_open_flags(int input)
+{
+	/* Linux open flags that map one-for-one onto a native flag. */
+	static const struct {
+		int lx;
+		int native;
+	} simple[] = {
+		{ LX_O_CREAT,		O_CREAT },
+		{ LX_O_EXCL,		O_EXCL },
+		{ LX_O_NOCTTY,		O_NOCTTY },
+		{ LX_O_TRUNC,		O_TRUNC },
+		{ LX_O_APPEND,		O_APPEND },
+		{ LX_O_NONBLOCK,	O_NONBLOCK },
+		{ LX_O_SYNC,		O_SYNC },
+		{ LX_O_LARGEFILE,	O_LARGEFILE },
+		{ LX_O_NOFOLLOW,	O_NOFOLLOW },
+		{ LX_O_CLOEXEC,		O_CLOEXEC },
+	};
+	int flags;
+	uint_t i;
+
+	/* With O_PATH, every flag outside this short list is discarded. */
+	if (input & LX_O_PATH) {
+		input &= (LX_O_DIRECTORY | LX_O_NOFOLLOW | LX_O_CLOEXEC);
+	}
+
+	/* This depends on the Linux ACCMODE flags being the same as SunOS. */
+	flags = (input & LX_O_ACCMODE);
+
+	for (i = 0; i < sizeof (simple) / sizeof (simple[0]); i++) {
+		if (input & simple[i].lx)
+			flags |= simple[i].native;
+	}
+
+	/*
+	 * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the
+	 * device backing the fd in question.  Illumos doesn't have similar
+	 * functionality, but we can attempt to simulate it using the flags
+	 * (O_RSYNC|O_SYNC) and directio(3C).
+	 *
+	 * The LX_O_DIRECT flag also requires that the transfer size and
+	 * alignment of I/O buffers be a multiple of the logical block size for
+	 * the underlying file system, but frankly there isn't an easy way to
+	 * support that functionality without doing something like adding an
+	 * fcntl(2) flag to denote LX_O_DIRECT mode.
+	 *
+	 * Since LX_O_DIRECT is merely a performance advisory, we'll just
+	 * emulate what we can and trust that the only applications expecting
+	 * an error when performing I/O from a misaligned buffer, or when
+	 * passing a transfer size that is not a multiple of the underlying
+	 * file system block size, will be test suites.
+	 */
+	if (input & LX_O_DIRECT)
+		flags |= (O_RSYNC|O_SYNC);
+
+	return (flags);
+}
+
+#define	LX_POSTPROCESS_OPTS	(LX_O_DIRECT | LX_O_ASYNC | LX_O_PATH)
+
+static int
+lx_open_postprocess(int fd, int fmode)
+{
+	file_t *fp;
+	int ignored, err = 0;
+
+	/* Most opens carry none of these flags and need no fixing up. */
+	if ((fmode & LX_POSTPROCESS_OPTS) == 0)
+		return (0);
+
+	/*
+	 * A racing close() may already have taken the descriptor away
+	 * between the open and this lookup.
+	 */
+	if ((fp = getf(fd)) == NULL)
+		return (EIO);
+
+	/* The outcome of the direct I/O request is deliberately ignored. */
+	if ((fmode & LX_O_DIRECT) != 0) {
+		(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON,
+		    fp->f_flag, fp->f_cred, &ignored, NULL);
+	}
+
+	if ((fmode & LX_O_ASYNC) != 0) {
+		err = VOP_SETFL(fp->f_vnode, fp->f_flag, FASYNC, fp->f_cred,
+		    NULL);
+		if (err == 0) {
+			mutex_enter(&fp->f_tlock);
+			fp->f_flag |= FASYNC;
+			mutex_exit(&fp->f_tlock);
+		}
+	}
+
+	if ((fmode & LX_O_PATH) != 0 && err == 0) {
+		/*
+		 * O_PATH has no direct SunOS analog; it is emulated by
+		 * clearing both FREAD and FWRITE in f_flag.  That makes
+		 * read(2) and write(2) fail with EBADF, and other syscalls
+		 * can test for it to trigger the correct behavior.
+		 */
+		mutex_enter(&fp->f_tlock);
+		fp->f_flag &= ~(FREAD|FWRITE);
+		mutex_exit(&fp->f_tlock);
+	}
+
+	releasef(fd);
+	/* On failure the freshly opened descriptor is closed again. */
+	if (err != 0)
+		(void) closeandsetf(fd, NULL);
+
+	return (err);
+}
+
+long
+lx_openat(int atfd, char *path, int fmode, int cmode)
+{
+	int flags, fd, error;
+	mode_t mode = 0;
+
+	if (atfd == LX_AT_FDCWD)
+		atfd = AT_FDCWD;
+
+	flags = ltos_open_flags(fmode);
+
+	/*
+	 * We use the FSEARCH flag to make sure this is a directory. We have to
+	 * explicitly add 1 to emulate the FREAD/FWRITE mapping of the OPENMODE
+	 * macro since it won't get set via OPENMODE when FSEARCH is used.
+	 */
+	if (fmode & LX_O_DIRECTORY) {
+		flags |= FSEARCH;
+		flags++;
+	}
+
+	if (flags & O_CREAT)
+		mode = (mode_t)cmode;
+
+	ttolwp(curthread)->lwp_errno = 0;	/* so a failure is visible */
+	fd = openat(atfd, path, flags, mode);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		if ((fmode & LX_O_DIRECTORY) &&
+		    ttolwp(curthread)->lwp_errno != ENOTDIR) {
+			/*
+			 * We got an error trying to open a file as a directory.
+			 * Look the path up again to decide between the original
+			 * error and ENOTDIR.
+			 */
+			vnode_t *startvp;
+			vnode_t *vp;
+			int oerror, error = 0;
+
+			oerror = ttolwp(curthread)->lwp_errno;
+
+			if (atfd == AT_FDCWD) {
+				/* regular open */
+				startvp = NULL;
+			} else {
+				char startchar;
+
+				if (copyin(path, &startchar, sizeof (char)))
+					return (set_errno(oerror));
+
+				/* if startchar is / then startfd is ignored */
+				if (startchar == '/') {
+					startvp = NULL;
+				} else {
+					file_t *startfp;
+
+					if ((startfp = getf(atfd)) == NULL)
+						return (set_errno(oerror));
+					startvp = startfp->f_vnode;
+					VN_HOLD(startvp);
+					releasef(atfd);
+				}
+			}
+
+			if (lookupnameat(path, UIO_USERSPACE,
+			    (fmode & LX_O_NOFOLLOW) ?  NO_FOLLOW : FOLLOW,
+			    NULLVPP, &vp, startvp) != 0) {
+				if (startvp != NULL)
+					VN_RELE(startvp);
+				return (set_errno(oerror));
+			}
+
+			if (startvp != NULL)
+				VN_RELE(startvp);
+
+			if (vp->v_type != VDIR)
+				error = ENOTDIR;
+
+			VN_RELE(vp);
+			if (error != 0)
+				return (set_errno(ENOTDIR));
+
+			set_errno(oerror);	/* it is a directory: keep oerror */
+		}
+		return (ttolwp(curthread)->lwp_errno);
+	}
+
+	if ((error = lx_open_postprocess(fd, fmode)) != 0) {
+		return (set_errno(error));
+	}
+	return (fd);
+}
+
+long
+lx_open(char *path, int fmode, int cmode)
+{
+	return (lx_openat(LX_AT_FDCWD, path, fmode, cmode));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_personality.c b/usr/src/uts/common/brand/lx/syscall/lx_personality.c
new file mode 100644
index 0000000000..e7aa945b50
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_personality.c
@@ -0,0 +1,112 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/brand.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+
+
+/*
+ * These flags are for what Linux calls "bug emulation".
+ * (Descriptions from the personality(2) Linux man page.)
+ *
+ * Flags which are currently actionable in LX:
+ * - READ_IMPLIES_EXEC (since Linux 2.6.8)
+ *   With this flag set, PROT_READ implies PROT_EXEC for mmap(2).
+ *
+ * Flags which are currently accepted but ignored:
+ * - UNAME26 (since Linux 3.1)
+ *   Have uname(2) report a 2.6.40+ version number rather than a 3.x version
+ *   number.  Added as a stopgap measure to support broken applications that
+ *   could not handle the kernel version-numbering switch from 2.6.x to 3.x.
+ *
+ * - ADDR_NO_RANDOMIZE (since Linux 2.6.12)
+ *   With this flag set, disable address-space-layout randomization.
+ *
+ * - FDPIC_FUNCPTRS (since Linux 2.6.11)
+ *   User-space function pointers to signal handlers point (on certain
+ *   architectures) to descriptors.
+ *
+ * - MMAP_PAGE_ZERO (since Linux 2.4.0)
+ *   Map page 0 as read-only (to support binaries that depend on this SVr4
+ *   behavior).
+ *
+ * - ADDR_COMPAT_LAYOUT (since Linux 2.6.9)
+ *   With this flag set, provide legacy virtual address space layout.
+ *
+ * - ADDR_LIMIT_32BIT (since Linux 2.2)
+ *   Limit the address space to 32 bits.
+ *
+ * - SHORT_INODE (since Linux 2.4.0)
+ *   No effects(?).
+ *
+ * - WHOLE_SECONDS (since Linux 1.2.0)
+ *   No effects(?).
+ *
+ * - STICKY_TIMEOUTS (since Linux 1.2.0)
+ *   With this flag set, select(2), pselect(2), and ppoll(2) do not modify the
+ *   returned timeout argument when interrupted by a signal handler.
+ *
+ * - ADDR_LIMIT_3GB (since Linux 2.4.0)
+ *   With this flag set, use 0xc0000000 as the offset at which to search a
+ *   virtual memory chunk on mmap(2); otherwise use 0xffffe000.
+ */
+
+#define	LX_PER_GET	0xffffffff	/* query current value; set nothing */
+
+long
+lx_personality(unsigned int arg)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	unsigned int result = 0;
+
+	mutex_enter(&curproc->p_lock);
+	result = lxpd->l_personality;	/* value in effect before this call */
+
+	if (arg == LX_PER_GET) {
+		mutex_exit(&curproc->p_lock);
+		return (result);
+	}
+
+	/*
+	 * Prevent changes to the personality if the process is undergoing an
+	 * exec.  This will allow elfexec and friends to manipulate the
+	 * personality without hindrance.
+	 */
+	if ((curproc->p_flag & P_PR_EXEC) != 0) {
+		mutex_exit(&curproc->p_lock);
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Keep tabs when a non-Linux personality is set.  This is silently
+	 * allowed to succeed, even though the emulation required is almost
+	 * certainly missing.
+	 */
+	if ((arg & LX_PER_MASK) != LX_PER_LINUX) {
+		char buf[64];
+
+		(void) snprintf(buf, sizeof (buf), "invalid personality: %02X",
+		    arg & LX_PER_MASK);
+		lx_unsupported(buf);
+	}
+
+	lxpd->l_personality = arg;
+	mutex_exit(&curproc->p_lock);
+	return (result);	/* the old personality, not the new one */
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
new file mode 100644
index 0000000000..519c742abc
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c
@@ -0,0 +1,200 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved.
+ *
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <sys/fs/fifonode.h>
+#include <sys/fcntl.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * Half-duplex pipe modeled on native pipe(2); fills int[2] at 'arg' {rd, wr}.
+ */
+static int
+lx_hd_pipe(intptr_t arg, int flags)
+{
+	vnode_t *vp1, *vp2;
+	struct file *fp1, *fp2;
+	int error = 0;
+	int flag1, flag2, iflags;
+	int fd1, fd2;
+
+	/*
+	 * Only FCLOEXEC and FNONBLOCK are accepted; anything else is EINVAL.
+	 */
+	if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) {
+		return (set_errno(EINVAL));
+	}
+	/*
+	 * Allocate and initialize two vnodes.
+	 */
+	makepipe(&vp1, &vp2);
+
+	/*
+	 * Allocate and initialize two file table entries and two
+	 * file pointers. The first file pointer is open for read and the
+	 * second is open for write.
+	 */
+	if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) {
+		VN_RELE(vp1);
+		VN_RELE(vp2);
+		return (set_errno(error));
+	}
+
+	if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0)
+		goto out2;
+
+	/*
+	 * Create two stream heads and attach to each vnode.
+	 */
+	if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0)
+		goto out;
+
+	if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) {
+		(void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0,
+		    fp1->f_cred, NULL);
+		goto out;
+	}
+
+	strmate(vp1, vp2);
+
+	VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid();
+
+	/*
+	 * Set the O_NONBLOCK flag on both ends if requested.
+	 */
+	if (flags & FNONBLOCK) {
+		flag1 = fp1->f_flag;
+		flag2 = fp2->f_flag;
+		iflags = flags & FNONBLOCK;
+
+		if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred,
+		    NULL)) != 0) {
+			goto out_vop_close;
+		}
+		fp1->f_flag |= iflags;
+
+		if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred,
+		    NULL)) != 0) {
+			goto out_vop_close;
+		}
+		fp2->f_flag |= iflags;
+	}
+
+	/*
+	 * Return the file descriptors to the user. They now
+	 * point to two different vnodes which have different
+	 * stream heads.
+	 */
+	if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) ||
+	    copyout(&fd2, &((int *)arg)[1], sizeof (int))) {
+		error = EFAULT;
+		goto out_vop_close;
+	}
+
+	/*
+	 * Now fill in the entries that falloc reserved
+	 */
+	mutex_exit(&fp1->f_tlock);
+	mutex_exit(&fp2->f_tlock);
+	setf(fd1, fp1);
+	setf(fd2, fp2);
+
+	/*
+	 * Optionally set the FCLOEXEC flag
+	 */
+	if ((flags & FCLOEXEC) != 0) {
+		f_setfd(fd1, FD_CLOEXEC);
+		f_setfd(fd2, FD_CLOEXEC);
+	}
+
+	return (0);
+out_vop_close:	/* both stream heads are open: close them first */
+	(void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL);
+	(void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL);
+out:		/* undo the second falloc(), then fall into out2 */
+	setf(fd2, NULL);
+	unfalloc(fp2);
+out2:		/* undo the first falloc() and drop both vnodes */
+	setf(fd1, NULL);
+	unfalloc(fp1);
+	VN_RELE(vp1);
+	VN_RELE(vp2);
+	return (set_errno(error));
+}
+
+/*
+ * pipe(2) system call: a half-duplex pipe created with no flags.
+ */
+long
+lx_pipe(intptr_t arg)
+{
+	return (lx_hd_pipe(arg, 0));
+}
+
+/*
+ * pipe2(2) system call: lx_pipe plus Linux O_NONBLOCK/O_CLOEXEC flags.
+ */
+long
+lx_pipe2(intptr_t arg, int lxflags)
+{
+	int flags = 0;
+
+	/*
+	 * Any Linux flag other than the two handled below yields EINVAL.
+	 */
+	if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) {
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Convert from Linux flags to illumos flags.
+	 */
+	if (lxflags & LX_O_NONBLOCK) {
+		flags |= FNONBLOCK;
+	}
+	if (lxflags & LX_O_CLOEXEC) {
+		flags |= FCLOEXEC;
+	}
+
+	return (lx_hd_pipe(arg, flags));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_poll.c b/usr/src/uts/common/brand/lx/syscall/lx_poll.c
new file mode 100644
index 0000000000..1d92a55ddf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_poll.c
@@ -0,0 +1,762 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/sunddi.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/poll_impl.h>
+#include <sys/schedctl.h>
+#include <sys/lx_signal.h>
+
+
+/* From uts/common/syscall/poll.c */
+extern int poll_copyin(pollstate_t *, pollfd_t *, nfds_t);
+extern int poll_common(pollstate_t *, pollfd_t *, nfds_t, timespec_t *, int *);
+
+/*
+ * These events are identical between Linux and SunOS
+ */
+#define	LX_POLLIN	0x001
+#define	LX_POLLPRI	0x002
+#define	LX_POLLOUT	0x004
+#define	LX_POLLERR	0x008
+#define	LX_POLLHUP	0x010
+#define	LX_POLLNVAL	0x020
+#define	LX_POLLRDNORM	0x040
+#define	LX_POLLRDBAND	0x080
+
+#define	LX_POLL_COMMON_EVENTS (LX_POLLIN | LX_POLLPRI | LX_POLLOUT |	\
+	LX_POLLERR | LX_POLLHUP | LX_POLLNVAL | LX_POLLRDNORM | LX_POLLRDBAND)
+
+/*
+ * These events differ between Linux and SunOS
+ */
+#define	LX_POLLWRNORM	0x0100
+#define	LX_POLLWRBAND	0x0200
+#define	LX_POLLRDHUP	0x2000
+
+/* Polling for any event bit outside this mask fails with ENOTSUP. */
+#define	LX_POLL_SUPPORTED_EVENTS	\
+	(LX_POLL_COMMON_EVENTS | LX_POLLWRNORM | LX_POLLWRBAND | LX_POLLRDHUP)
+
+/* Copy in 'fds', convert Linux events to native; save originals in oldevt. */
+static int
+lx_poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, short *oldevt)
+{
+	int i, error = 0;
+	pollfd_t *pollfdp;
+
+	if ((error = poll_copyin(ps, fds, nfds)) != 0) {
+		return (error);
+	}
+	pollfdp = ps->ps_pollfd;
+
+	/* Convert the Linux events bitmask into SunOS equivalent. */
+	for (i = 0; i < nfds; i++) {
+		short lx_events = pollfdp[i].events;
+		short events;
+
+		/*
+		 * If the caller is polling for an unsupported event, we
+		 * have to bail out.
+		 */
+		if (lx_events & ~LX_POLL_SUPPORTED_EVENTS) {
+			return (ENOTSUP);
+		}
+
+		events = lx_events & LX_POLL_COMMON_EVENTS;
+		if (lx_events & LX_POLLWRNORM)
+			events |= POLLWRNORM;
+		if (lx_events & LX_POLLWRBAND)
+			events |= POLLWRBAND;
+		if (lx_events & LX_POLLRDHUP)
+			events |= POLLRDHUP;
+		pollfdp[i].events = events;
+		oldevt[i] = lx_events;	/* kept for lx_poll_copyout */
+	}
+	return (0);
+}
+
+static int
+lx_poll_copyout(pollfd_t *pollfdp, pollfd_t *fds, nfds_t nfds, short *oldevt)
+{
+	int i;
+
+	/*
+	 * Convert SunOS revents bitmask into Linux equivalent and restore
+	 * cached events field which was swizzled by lx_poll_copyin.
+	 */
+	for (i = 0; i < nfds; i++) {
+		short revents = pollfdp[i].revents;
+		short lx_revents = revents & LX_POLL_COMMON_EVENTS;
+		short orig_events = oldevt[i];
+
+		if (revents & POLLWRBAND)
+			lx_revents |= LX_POLLWRBAND;
+		if (revents & POLLRDHUP)
+			lx_revents |= LX_POLLRDHUP;
+		/*
+		 * Because POLLOUT and POLLWRNORM are natively defined as the
+		 * same value, care must be taken when translating them to
+		 * Linux where they differ.
+		 */
+		if (revents & POLLOUT) {
+			if ((orig_events & LX_POLLOUT) == 0)
+				lx_revents &= ~LX_POLLOUT;
+			if (orig_events & LX_POLLWRNORM)
+				lx_revents |= LX_POLLWRNORM;
+		}
+
+		pollfdp[i].revents = lx_revents;
+		pollfdp[i].events = orig_events;
+	}
+
+	if (copyout(pollfdp, fds, sizeof (pollfd_t) * nfds) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+static long
+lx_poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = ttoproc(t);
+	pollstate_t *ps = NULL;
+	pollfd_t *pollfdp = NULL;
+	short *oldevt = NULL;
+	int error = 0, fdcnt = 0;
+
+	/*
+	 * Install the caller's signal mask, if one was supplied.
+	 */
+	if (ksetp != NULL) {
+		mutex_enter(&p->p_lock);
+		schedctl_finish_sigblock(t);
+		lwp->lwp_sigoldmask = t->t_hold;
+		t->t_hold = *ksetp;
+		t->t_flag |= T_TOMASK;
+		/*
+		 * Call cv_reltimedwait_sig() just to check for signals.
+		 * We will return immediately with either 0 or -1.
+		 */
+		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+		    TR_CLOCK_TICK)) {
+			mutex_exit(&p->p_lock);
+			error = EINTR;
+			goto pollout;
+		}
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Initialize pollstate and copy in pollfd data if present.
+	 */
+	if (nfds != 0) {
+		if (nfds > p->p_fno_ctl) {
+			mutex_enter(&p->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+			    p->p_rctls, p, RCA_SAFE);
+			mutex_exit(&p->p_lock);
+			error = EINVAL;
+			goto pollout;
+		}
+
+		/*
+		 * Need to allocate memory for pollstate before anything
+		 * because the mutex and cv are created in this space
+		 */
+		ps = pollstate_create();
+		if (ps->ps_pcache == NULL)
+			ps->ps_pcache = pcache_alloc();
+
+		/*
+		 * Certain event types which are distinct on Linux are aliased
+		 * against each other on illumos.  To translate back into the
+		 * Linux format properly, the original events of interest are
+		 * stored in 'oldevt' for use during lx_poll_copyout.
+		 */
+		oldevt = kmem_alloc(nfds * sizeof (short), KM_SLEEP);
+		if ((error = lx_poll_copyin(ps, fds, nfds, oldevt)) != 0)
+			goto pollout;
+		pollfdp = ps->ps_pollfd;
+	}
+
+	/*
+	 * Perform the actual poll.
+	 */
+	error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+
+pollout:
+	/*
+	 * If we changed the signal mask but we received no signal then restore
+	 * the signal mask.  Otherwise psig() will deal with the signal mask.
+	 */
+	if (ksetp != NULL) {
+		mutex_enter(&p->p_lock);
+		if (lwp->lwp_cursig == 0) {
+			t->t_hold = lwp->lwp_sigoldmask;
+			t->t_flag &= ~T_TOMASK;
+		}
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Copy out the events and return the fdcnt to the user.
+	 */
+	if (nfds != 0 && error == 0) {
+		error = lx_poll_copyout(pollfdp, fds, nfds, oldevt);
+	}
+	if (oldevt != NULL) {
+		kmem_free(oldevt, nfds * sizeof (short));
+	}
+	if (error) {
+		return (set_errno(error));
+	}
+	return (fdcnt);
+}
+
+long
+lx_poll(pollfd_t *fds, nfds_t nfds, int timeout)
+{
+	timespec_t ts, *tsp = NULL;
+
+	if (timeout >= 0) {	/* ms; a negative value leaves tsp NULL */
+		ts.tv_sec = timeout / MILLISEC;
+		ts.tv_nsec = (timeout % MILLISEC) * MICROSEC;
+		tsp = &ts;
+	}
+
+	return (lx_poll_common(fds, nfds, tsp, NULL));
+}
+
+long
+lx_ppoll(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, lx_sigset_t *setp)
+{
+	timespec_t ts, *tsp = NULL;
+	k_sigset_t kset, *ksetp = NULL;
+
+	/*
+	 * Copy in timeout and sigmask; a NULL pointer leaves that one unset.
+	 */
+	if (timeoutp != NULL) {
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			if (copyin(timeoutp, &ts, sizeof (ts)))
+				return (set_errno(EFAULT));
+		} else {
+			timespec32_t ts32;
+
+			if (copyin(timeoutp, &ts32, sizeof (ts32)))
+				return (set_errno(EFAULT));
+			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+		}
+
+		if (itimerspecfix(&ts))
+			return (set_errno(EINVAL));
+		tsp = &ts;
+	}
+	if (setp != NULL) {
+		lx_sigset_t lset;
+
+		if (copyin(setp, &lset, sizeof (lset)))
+			return (set_errno(EFAULT));
+		lx_ltos_sigset(&lset, &kset);	/* Linux set -> native set */
+		ksetp = &kset;
+	}
+
+	return (lx_poll_common(fds, nfds, tsp, ksetp));
+}
+
+typedef struct lx_select_buf_s {
+	long		*lsb_rfds;	/* kernel copy of rfds, else NULL */
+	long		*lsb_wfds;	/* kernel copy of wfds, else NULL */
+	long		*lsb_efds;	/* kernel copy of efds, else NULL */
+	unsigned int	lsb_size;	/* bytes in each allocated copy */
+} lx_select_buf_t;
+
+/*
+ * Size (in bytes) of buffer appropriate for fd_set copyin/copyout.
+ * Linux uses buffers of 'long' to accomplish this.
+ */
+#define	LX_FD_SET_BYTES		(sizeof (long))
+#define	LX_FD_SET_BITS		(8 * LX_FD_SET_BYTES)
+#define	LX_FD_SET_SIZE(nfds)	\
+	((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES)
+
+static int
+lx_select_copyin(pollstate_t *ps, lx_select_buf_t *sbuf, int nfds,
+    long *rfds, long *wfds, long *efds)
+{
+	int n;
+	long *in, *out, *ex;
+	long absent = 0;	/* stands in for any fd_set not supplied */
+	pollfd_t *pfd;
+	nfds_t old_nfds;
+
+	/*
+	 * Just like pollsys and lx_poll, attempt to reuse ps_pollfd if it is
+	 * appropriately sized.  See poll_copyin for more detail.
+	 */
+	old_nfds = ps->ps_nfds;
+	if (nfds != old_nfds) {
+		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+		pfd = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+		ps->ps_pollfd = pfd;
+		ps->ps_nfds = nfds;
+	} else {
+		pfd = ps->ps_pollfd;
+	}
+
+	if (rfds != NULL) {
+		if (copyin(rfds, sbuf->lsb_rfds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+	if (wfds != NULL) {
+		if (copyin(wfds, sbuf->lsb_wfds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+	if (efds != NULL) {
+		if (copyin(efds, sbuf->lsb_efds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+
+	/*
+	 * For each fd, if any bits are set convert them into the appropriate
+	 * pollfd struct; an fd with no bits set gets fd -1. (From libc select)
+	 */
+	in = (rfds != NULL) ? sbuf->lsb_rfds : &absent;
+	out = (wfds != NULL) ? sbuf->lsb_wfds : &absent;
+	ex = (efds != NULL) ? sbuf->lsb_efds : &absent;
+	for (n = 0; n < nfds; n += LX_FD_SET_BITS) {
+		unsigned long b, m, j;
+
+		b = (unsigned long)(*in | *out | *ex);
+		m = 1;
+		for (j = 0; j < LX_FD_SET_BITS; j++) {
+			int fd = n + j;
+
+			if (fd >= nfds)
+				return (0);
+			pfd->events = 0;
+			if (b & 1) {
+				pfd->fd = fd;
+				if (*in & m)
+					pfd->events |= POLLRDNORM;
+				if (*out & m)
+					pfd->events |= POLLWRNORM;
+				if (*ex & m)
+					pfd->events |= POLLRDBAND;
+			} else {
+				pfd->fd = -1;
+			}
+			pfd++;
+			b >>= 1;
+			m <<= 1;
+		}
+
+		if (rfds != NULL)
+			in++;
+		if (wfds != NULL)
+			out++;
+		if (efds != NULL)
+			ex++;
+	}
+	return (0);
+}
+
+static int
+lx_select_copyout(pollfd_t *pollfdp, lx_select_buf_t *sbuf, int nfds,
+    long *rfds, long *wfds, long *efds, int *fdcnt)
+{
+	int n;
+	pollfd_t *pfd;
+	long rv = 0;	/* count of bits set across the returned fd_sets */
+
+	/*
+	 * If poll did not find any fds of interest, we can just zero out the
+	 * fd_set fields for copyout.
+	 */
+	if (*fdcnt == 0) {
+		if (rfds != NULL) {
+			bzero(sbuf->lsb_rfds, sbuf->lsb_size);
+		}
+		if (wfds != NULL) {
+			bzero(sbuf->lsb_wfds, sbuf->lsb_size);
+		}
+		if (efds != NULL) {
+			bzero(sbuf->lsb_efds, sbuf->lsb_size);
+		}
+		goto copyout;
+	}
+
+	/*
+	 * For each pollfd with revents set, convert them into the appropriate
+	 * fd_set bits. (Derived from libc's select logic)
+	 */
+	pfd = pollfdp;
+	for (n = 0; n < nfds; n += LX_FD_SET_BITS) {
+		unsigned long m, j;
+		long in = 0, out = 0, ex = 0;
+
+		m = 1;
+		for (j = 0; j < LX_FD_SET_BITS; j++) {
+			if ((n + j) >= nfds)
+				break;
+			if (pfd->revents != 0) {
+				if (pfd->revents & POLLNVAL) {
+					return (EBADF);
+				}
+				if (pfd->revents & POLLRDNORM) {
+					in |= m;
+					rv++;
+				}
+				if (pfd->revents & POLLWRNORM) {
+					out |= m;
+					rv++;
+				}
+				if (pfd->revents & POLLRDBAND) {
+					ex |= m;
+					rv++;
+				}
+				/*
+				 * Only set this bit on return if we asked
+				 * about input conditions.
+				 */
+				if ((pfd->revents & (POLLHUP|POLLERR)) &&
+				    (pfd->events & POLLRDNORM)) {
+					if ((in & m) == 0) {
+						/* wasn't already set */
+						rv++;
+					}
+					in |= m;
+				}
+				/*
+				 * Only set this bit on return if we asked
+				 * about output conditions.
+				 */
+				if ((pfd->revents & (POLLHUP|POLLERR)) &&
+				    (pfd->events & POLLWRNORM)) {
+					if ((out & m) == 0) {
+						/* wasn't already set */
+						rv++;
+					}
+					out |= m;
+				}
+				/*
+				 * Only set this bit on return if we asked
+				 * about exceptional conditions.
+				 */
+				if ((pfd->revents & (POLLHUP|POLLERR)) &&
+				    (pfd->events & POLLRDBAND)) {
+					if ((ex & m) == 0) {
+						/* wasn't already set */
+						rv++;
+					}
+					ex |= m;
+				}
+			}
+			m <<= 1;
+			pfd++;
+		}
+		if (rfds != NULL)
+			sbuf->lsb_rfds[n / LX_FD_SET_BITS] = in;
+		if (wfds != NULL)
+			sbuf->lsb_wfds[n / LX_FD_SET_BITS] = out;
+		if (efds != NULL)
+			sbuf->lsb_efds[n / LX_FD_SET_BITS] = ex;
+	}
+
+copyout:
+	if (rfds != NULL) {
+		if (copyout(sbuf->lsb_rfds, rfds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+	if (wfds != NULL) {
+		if (copyout(sbuf->lsb_wfds, wfds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+	if (efds != NULL) {
+		if (copyout(sbuf->lsb_efds, efds, sbuf->lsb_size) != 0) {
+			return (EFAULT);
+		}
+	}
+	*fdcnt = rv;	/* replace the poll count with the select-style count */
+	return (0);
+}
+
+/* Shared select(2)/pselect(2) path: fd_sets become pollfds for poll_common. */
+static long
+lx_select_common(int nfds, long *rfds, long *wfds, long *efds,
+    timespec_t *tsp, k_sigset_t *ksetp)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = ttoproc(t);
+	pollstate_t *ps = NULL;
+	pollfd_t *pollfdp = NULL, *fake_fds = NULL;
+	lx_select_buf_t sbuf = {0};
+	int error = 0, fdcnt = 0;
+
+	if (nfds < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Install the caller's signal mask, if one was supplied.
+	 */
+	if (ksetp != NULL) {
+		mutex_enter(&p->p_lock);
+		schedctl_finish_sigblock(t);
+		lwp->lwp_sigoldmask = t->t_hold;
+		t->t_hold = *ksetp;
+		t->t_flag |= T_TOMASK;
+		/*
+		 * Call cv_reltimedwait_sig() just to check for signals.
+		 * We will return immediately with either 0 or -1.
+		 */
+		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+		    TR_CLOCK_TICK)) {
+			mutex_exit(&p->p_lock);
+			error = EINTR;
+			goto out;
+		}
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Because poll caching uses the userspace pollfd_t pointer to verify
+	 * cache reuse validity, a simulated value must be supplied when
+	 * emulating Linux select(2).  The first non-NULL pointer from
+	 * rfds/wfds/efds is used for this purpose.
+	 */
+	if (rfds != NULL) {
+		fake_fds = (pollfd_t *)rfds;
+	} else if (wfds != NULL) {
+		fake_fds = (pollfd_t *)wfds;
+	} else if (efds != NULL) {
+		fake_fds = (pollfd_t *)efds;
+	} else {
+		/*
+		 * A non-zero nfds was supplied but all three fd_set pointers
+		 * were null.  Fall back to doing a simple timeout.
+		 */
+		nfds = 0;
+	}
+
+	/*
+	 * Initialize pollstate and copy in pollfd data if present.
+	 */
+	if (nfds != 0) {
+		if (nfds > p->p_fno_ctl) {
+			mutex_enter(&p->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+			    p->p_rctls, p, RCA_SAFE);
+			mutex_exit(&p->p_lock);
+			error = EINVAL;
+			goto out;
+		}
+
+		/*
+		 * Need to allocate memory for pollstate before anything
+		 * because the mutex and cv are created in this space
+		 */
+		ps = pollstate_create();
+		if (ps->ps_pcache == NULL)
+			ps->ps_pcache = pcache_alloc();
+
+		sbuf.lsb_size = LX_FD_SET_SIZE(nfds);
+		if (rfds != NULL)
+			sbuf.lsb_rfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+		if (wfds != NULL)
+			sbuf.lsb_wfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+		if (efds != NULL)
+			sbuf.lsb_efds = kmem_alloc(sbuf.lsb_size, KM_SLEEP);
+
+		error = lx_select_copyin(ps, &sbuf, nfds, rfds, wfds, efds);
+		if (error != 0) {
+			goto out;
+		}
+
+		pollfdp = ps->ps_pollfd;
+	}
+
+	/*
+	 * Perform the actual poll.
+	 */
+	error = poll_common(ps, fake_fds, (nfds_t)nfds, tsp, &fdcnt);
+
+out:
+	/*
+	 * If we changed the signal mask but we received no signal then restore
+	 * the signal mask.  Otherwise psig() will deal with the signal mask.
+	 */
+	if (ksetp != NULL) {
+		mutex_enter(&p->p_lock);
+		if (lwp->lwp_cursig == 0) {
+			t->t_hold = lwp->lwp_sigoldmask;
+			t->t_flag &= ~T_TOMASK;
+		}
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Copy out the fd_sets; lx_select_copyout also recomputes fdcnt.
+	 */
+	if (error == 0 && nfds != 0) {
+		error = lx_select_copyout(pollfdp, &sbuf, nfds, rfds, wfds,
+		    efds, &fdcnt);
+	}
+	if (sbuf.lsb_size != 0) {
+		if (sbuf.lsb_rfds != NULL)
+			kmem_free(sbuf.lsb_rfds, sbuf.lsb_size);
+		if (sbuf.lsb_wfds != NULL)
+			kmem_free(sbuf.lsb_wfds, sbuf.lsb_size);
+		if (sbuf.lsb_efds != NULL)
+			kmem_free(sbuf.lsb_efds, sbuf.lsb_size);
+	}
+	if (error) {
+		return (set_errno(error));
+	}
+	return (fdcnt);
+}
+
+long
+lx_select(int nfds, long *rfds, long *wfds, long *efds,
+    struct timeval *timeoutp)
+{
+	timespec_t ts, *tsp = NULL;	/* tsp stays NULL without a timeout */
+
+	if (timeoutp != NULL) {
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			struct timeval tv;
+
+			if (copyin(timeoutp, &tv, sizeof (tv)))
+				return (set_errno(EFAULT));
+			ts.tv_sec = tv.tv_sec;
+			ts.tv_nsec = tv.tv_usec * (NANOSEC / MICROSEC);
+		} else {
+			struct timeval32 tv32;
+
+			if (copyin(timeoutp, &tv32, sizeof (tv32)))
+				return (set_errno(EFAULT));
+			ts.tv_sec = tv32.tv_sec;
+			ts.tv_nsec = tv32.tv_usec * (NANOSEC / MICROSEC);
+		}
+
+		if (itimerspecfix(&ts))
+			return (set_errno(EINVAL));
+		tsp = &ts;
+	}
+
+	return (lx_select_common(nfds, rfds, wfds, efds, tsp, NULL));
+}
+
+
+typedef struct {
+	uintptr_t lpsa_addr;		/* user address of the lx_sigset_t */
+	unsigned long lpsa_len;		/* must be sizeof (lx_sigset_t) */
+} lx_pselect_sig_arg_t;
+
+#if defined(_LP64)
+typedef struct {
+	caddr32_t lpsa_addr;		/* as above, for non-native callers */
+	uint32_t lpsa_len;
+} lx_pselect_sig_arg32_t;
+#endif /* defined(_LP64) */
+
+long
+lx_pselect(int nfds, long *rfds, long *wfds, long *efds,
+    timespec_t *timeoutp, void *setp)
+{
+	timespec_t ts, *tsp = NULL;
+	k_sigset_t kset, *ksetp = NULL;
+
+	/*
+	 * Copy in timeout and sigmask.  'setp' is an {address, length} pair.
+	 */
+	if (timeoutp != NULL) {
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			if (copyin(timeoutp, &ts, sizeof (ts)))
+				return (set_errno(EFAULT));
+		} else {
+			timespec32_t ts32;
+
+			if (copyin(timeoutp, &ts32, sizeof (ts32)))
+				return (set_errno(EFAULT));
+			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+		}
+
+		if (itimerspecfix(&ts))
+			return (set_errno(EINVAL));
+		tsp = &ts;
+	}
+	if (setp != NULL) {
+		lx_sigset_t lset, *sigaddr = NULL;
+
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			lx_pselect_sig_arg_t lpsa;
+
+			if (copyin(setp, &lpsa, sizeof (lpsa)) != 0)
+				return (set_errno(EFAULT));
+			/*
+			 * Linux forces a size to be passed only so it can
+			 * check that it's the size of a sigset_t.
+			 */
+			if (lpsa.lpsa_len != sizeof (lx_sigset_t))
+				return (set_errno(EINVAL));
+
+			sigaddr = (lx_sigset_t *)lpsa.lpsa_addr;
+		}
+#if defined(_LP64)
+		else {
+			lx_pselect_sig_arg32_t lpsa32;
+
+			if (copyin(setp, &lpsa32, sizeof (lpsa32)) != 0)
+				return (set_errno(EFAULT));
+			/*
+			 * Linux forces a size to be passed only so it can
+			 * check that it's the size of a sigset_t.
+			 */
+			if (lpsa32.lpsa_len != sizeof (lx_sigset_t))
+				return (set_errno(EINVAL));
+
+			sigaddr = (lx_sigset_t *)(uint64_t)lpsa32.lpsa_addr;
+		}
+#endif /* defined(_LP64) */
+
+		/* A NULL address inside the pair means "no signal mask". */
+		if (sigaddr != NULL) {
+			if (copyin(sigaddr, &lset, sizeof (lset)) != 0)
+				return (set_errno(EFAULT));
+
+			lx_ltos_sigset(&lset, &kset);
+			ksetp = &kset;
+		}
+	}
+
+	return (lx_select_common(nfds, rfds, wfds, efds, tsp, ksetp));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_prctl.c b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c
new file mode 100644
index 0000000000..091a6f547b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c
@@ -0,0 +1,210 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/priv.h>
+#include <sys/brand.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <lx_signum.h>
+
+#define	LX_PR_SET_PDEATHSIG		1
+#define	LX_PR_GET_PDEATHSIG		2
+#define	LX_PR_GET_DUMPABLE		3
+#define	LX_PR_SET_DUMPABLE		4
+#define	LX_PR_GET_UNALIGN		5
+#define	LX_PR_SET_UNALIGN		6
+#define	LX_PR_GET_KEEPCAPS		7
+#define	LX_PR_SET_KEEPCAPS		8
+#define	LX_PR_GET_FPEMU			9
+#define	LX_PR_SET_FPEMU			10
+#define	LX_PR_GET_FPEXC			11
+#define	LX_PR_SET_FPEXC			12
+#define	LX_PR_GET_TIMING		13
+#define	LX_PR_SET_TIMING		14
+#define	LX_PR_SET_NAME			15
+#define	LX_PR_GET_NAME			16
+#define	LX_PR_GET_ENDIAN		19
+#define	LX_PR_SET_ENDIAN		20
+#define	LX_PR_GET_SECCOMP		21
+#define	LX_PR_SET_SECCOMP		22
+#define	LX_PR_CAPBSET_READ		23
+#define	LX_PR_CAPBSET_DROP		24
+#define	LX_PR_GET_TSC			25
+#define	LX_PR_SET_TSC			26
+#define	LX_PR_GET_SECUREBITS		27
+#define	LX_PR_SET_SECUREBITS		28
+#define	LX_PR_SET_TIMERSLACK		29
+#define	LX_PR_GET_TIMERSLACK		30
+#define	LX_PR_TASK_PERF_EVENTS_DISABLE	31
+#define	LX_PR_TASK_PERF_EVENTS_ENABLE	32
+#define	LX_PR_MCE_KILL			33
+#define	LX_PR_MCE_KILL_GET		34
+#define	LX_PR_SET_MM			35
+#define	LX_PR_SET_CHILD_SUBREAPER	36
+#define	LX_PR_GET_CHILD_SUBREAPER	37
+#define	LX_PR_SET_NO_NEW_PRIVS		38
+#define	LX_PR_GET_NO_NEW_PRIVS		39
+#define	LX_PR_GET_TID_ADDRESS		40
+#define	LX_PR_SET_THP_DISABLE		41
+#define	LX_PR_GET_THP_DISABLE		42
+
+#define	LX_PR_SET_NAME_NAMELEN	16	/* bytes copied in for PR_SET_NAME */
+
+long
+lx_prctl(int opt, uintptr_t data)
+{
+	long err;
+	char ebuf[64];	/* message for lx_unsupported() on unknown options */
+
+	switch (opt) {
+	case LX_PR_GET_DUMPABLE: {
+		/* Indicate that process is always dumpable */
+		return (1);
+	}
+
+	case LX_PR_SET_DUMPABLE: {
+		if (data != 0 && data != 1) {
+			return (set_errno(EINVAL));
+		}
+		/* Lie about altering process dumpability */
+		return (0);
+	}
+
+	case LX_PR_GET_SECUREBITS: {
+		/* Our bits are always 0 */
+		return (0);
+	}
+
+	case LX_PR_SET_SECUREBITS: {
+		/* Ignore setting any bits from arg2 */
+		return (0);
+	}
+
+	case LX_PR_SET_KEEPCAPS: {
+		/*
+		 * The closest illumos analog to SET_KEEPCAPS is the PRIV_AWARE
+		 * flag.  There are probably some cases where it's not exactly
+		 * the same, but this will do for a first try.
+		 */
+		if (data == 0) {
+			err = setpflags(PRIV_AWARE_RESET, 1, NULL);
+		} else {
+			err = setpflags(PRIV_AWARE, 1, NULL);
+		}
+
+		if (err != 0) {
+			return (set_errno(err));
+		}
+		return (0);
+	}
+
+	case LX_PR_SET_NAME: {
+		char name[LX_PR_SET_NAME_NAMELEN + 1];
+		proc_t *p = curproc;
+		/*
+		 * In Linux, PR_SET_NAME sets the name of the thread, not the
+		 * process.  Due to the historical quirks of Linux's asinine
+		 * thread model, this name is effectively the name of the
+		 * process (as visible via ps(1)) if the thread is the first of
+		 * its task group.  The first thread is therefore special, and
+		 * to best mimic Linux semantics (and absent a notion of
+		 * per-LWP names), we do nothing (but return success) on LWPs
+		 * other than LWP 1.
+		 */
+		if (curthread->t_tid != 1) {
+			return (0);
+		}
+		if (copyin((void *)data, name, LX_PR_SET_NAME_NAMELEN) != 0) {
+			return (set_errno(EFAULT));
+		}
+		name[LX_PR_SET_NAME_NAMELEN] = '\0';
+		mutex_enter(&p->p_lock);
+		(void) strncpy(p->p_user.u_comm, name, MAXCOMLEN + 1);
+		(void) strncpy(p->p_user.u_psargs, name, PSARGSZ);
+		mutex_exit(&p->p_lock);
+		return (0);
+	}
+
+	case LX_PR_GET_PDEATHSIG: {
+		int sig;
+		lx_proc_data_t *lxpd;
+
+		mutex_enter(&curproc->p_lock);
+		VERIFY(lxpd = ptolxproc(curproc));
+		sig = lxpd->l_parent_deathsig;
+		mutex_exit(&curproc->p_lock);
+		/* NOTE(review): Linux copies this out via arg2; confirm */
+		return (sig);
+	}
+
+	case LX_PR_SET_PDEATHSIG: {
+		int sig = lx_ltos_signo((int)data, 0);
+		proc_t *pp = NULL;
+		lx_proc_data_t *lxpd;
+
+		if (sig == 0 && data != 0) {
+			return (set_errno(EINVAL));
+		}
+
+		mutex_enter(&pidlock);
+		/* Set signal on our self */
+		mutex_enter(&curproc->p_lock);
+		VERIFY(lxpd = ptolxproc(curproc));
+		lxpd->l_parent_deathsig = sig;
+		pp = curproc->p_parent;
+		mutex_exit(&curproc->p_lock);
+
+		/* Configure parent to potentially signal children on death */
+		mutex_enter(&pp->p_lock);
+		if (PROC_IS_BRANDED(pp)) {
+			VERIFY(lxpd = ptolxproc(pp));
+			/*
+			 * Mark the parent as having children which wish to be
+			 * signaled on death of parent.
+			 */
+			lxpd->l_flags |= LX_PROC_CHILD_DEATHSIG;
+		} else {
+			/*
+			 * If the parent is not a branded process, the needed
+			 * hooks to facilitate this mechanism will not fire
+			 * when it dies. We lie about success in this case.
+			 */
+		}
+		mutex_exit(&pp->p_lock);
+		mutex_exit(&pidlock);
+		return (0);
+	}
+
+	case LX_PR_CAPBSET_DROP: {
+		/*
+		 * On recent versions of Linux the login svc drops capabilities
+		 * and if that fails the svc dies and is restarted by systemd.
+		 * For now we pretend dropping capabilities succeeded.
+		 */
+		return (0);
+	}
+
+	default:
+		break;
+	}
+
+	(void) snprintf(ebuf, sizeof (ebuf), "prctl option %d", opt);
+	lx_unsupported(ebuf);	/* report the option, then fail the call */
+	return (set_errno(EINVAL));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
new file mode 100644
index 0000000000..6581ead25b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
@@ -0,0 +1,575 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/zone.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+#define	LX_RLIMIT_CPU		0
+#define	LX_RLIMIT_FSIZE		1
+#define	LX_RLIMIT_DATA		2
+#define	LX_RLIMIT_STACK		3
+#define	LX_RLIMIT_CORE		4
+#define	LX_RLIMIT_RSS		5	/* from zone.max-physical-memory */
+#define	LX_RLIMIT_NPROC		6	/* from zone.max-lwps */
+#define	LX_RLIMIT_NOFILE	7
+#define	LX_RLIMIT_MEMLOCK	8	/* from zone.max-locked-memory */
+#define	LX_RLIMIT_AS		9
+#define	LX_RLIMIT_LOCKS		10	/* NA limit on locks, early 2.4 only */
+#define	LX_RLIMIT_SIGPENDING	11	/* process.max-sigqueue-size rctl */
+#define	LX_RLIMIT_MSGQUEUE	12	/* process.max-msg-messages rctl */
+#define	LX_RLIMIT_NICE		13	/* NA ceiling for nice */
+#define	LX_RLIMIT_RTPRIO	14	/* NA ceiling on the RT priority */
+#define	LX_RLIMIT_RTTIME	15	/* NA cpu limit for RT proc. */
+
+#define	LX_RLIMIT_NLIMITS	16	/* number of resources above */
+
+/* True if rctl value x is both locally maximal and globally infinite. */
+#define	RCTL_INFINITE(x) (((x)->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+	((x)->rcv_flagaction & RCTL_GLOBAL_INFINITE))
+
+typedef struct {
+	ulong_t	rlim_cur;
+	ulong_t	rlim_max;
+} lx_rlimit_t;		/* layout copied to/from native-model callers */
+
+typedef struct {
+	uint32_t	rlim_cur;
+	uint32_t	rlim_max;
+} lx_rlimit32_t;	/* layout copied to/from 32-bit callers */
+
+/*
+ * Linux supports many of the same resources that we do, but on Illumos these
+ * are rctls. Instead of using rlimit, we use rctls for all of the limits.
+ * This table is used to translate Linux rlimit keys into the Illumos legacy
+ * rlimit. We then primarily use the rctl/rlimit compatibility code to
+ * manage these.
+ */
+static int l_to_r[LX_RLIMIT_NLIMITS] = {
+	RLIMIT_CPU,		/* 0 CPU */
+	RLIMIT_FSIZE,		/* 1 FSIZE */
+	RLIMIT_DATA,		/* 2 DATA */
+	RLIMIT_STACK,		/* 3 STACK */
+	RLIMIT_CORE,		/* 4 CORE */
+	-1,			/* 5 RSS (handled explicitly) */
+	-1,			/* 6 NPROC (handled explicitly) */
+	RLIMIT_NOFILE,		/* 7 NOFILE */
+	-1,			/* 8 MEMLOCK (handled explicitly) */
+	RLIMIT_AS,		/* 9 AS */
+	-1,			/* 10 LOCKS (handled explicitly) */
+	-1, 			/* 11 SIGPENDING (handled explicitly) */
+	-1, 			/* 12 MSGQUEUE (handled explicitly) */
+	-1,			/* 13 NICE (handled explicitly) */
+	-1,			/* 14 RTPRIO (handled explicitly) */
+	-1			/* 15 RTTIME (handled explicitly) */
+};
+
+/*
+ * Magic value Linux uses to indicate infinity in a native-width rlimit
+ */
+#define	LX_RLIM_INFINITY_N	ULONG_MAX
+
+/*
+ * Read the local values of the rctl named 'nm' for the current process and
+ * report them as an rlimit: RCPRIV_BASIC gives rlim_cur, RCPRIV_PRIVILEGED
+ * gives rlim_max, and a limit with no finite value stays RLIM_INFINITY.
+ */
+static void
+lx_get_rctl(char *nm, struct rlimit64 *rlp64)
+{
+	rctl_val_t *prev = NULL;
+	rctl_val_t *val = kmem_alloc(sizeof (*val), KM_SLEEP);
+	rctl_hndl_t hndl;
+
+	rlp64->rlim_cur = rlp64->rlim_max = RLIM_INFINITY;
+
+	mutex_enter(&curproc->p_lock);
+	hndl = rctl_hndl_lookup(nm);
+	if (hndl != -1) {
+		while (rctl_local_get(hndl, prev, val, curproc) == 0) {
+			prev = val;
+			if (RCTL_INFINITE(val))
+				continue;
+			if (val->rcv_privilege == RCPRIV_BASIC)
+				rlp64->rlim_cur = val->rcv_value;
+			else if (val->rcv_privilege == RCPRIV_PRIVILEGED)
+				rlp64->rlim_max = val->rcv_value;
+		}
+	}
+	mutex_exit(&curproc->p_lock);
+	kmem_free(val, sizeof (*val));
+
+	/* If only rlim_max is finite, use it for rlim_cur as well. */
+	if (rlp64->rlim_cur == RLIM_INFINITY &&
+	    rlp64->rlim_max != RLIM_INFINITY)
+		rlp64->rlim_cur = rlp64->rlim_max;
+}
+
+/*
+ * Look up the current process's limits for the Linux resource 'lx_resource'
+ * and store them through 'rlim_curp' and 'rlim_maxp', with the native value
+ * for infinity replaced by the Linux one.  A very large RLIMIT_STACK is
+ * reported as infinite; the reason is explained in the body.
+ *
+ * Returns 0 on success.  If 'lx_resource' is out of range EINVAL is returned
+ * as a plain value; calling set_errno() is left to the caller.
+ */
+static int
+lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp)
+{
+	lx_proc_data_t *pd = ptolxproc(curproc);
+	struct rlimit64 rlim64;
+	int64_t cur, max;
+
+	if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+		return (EINVAL);
+
+	switch (lx_resource) {
+	/*
+	 * LOCKS, NICE, RTPRIO and RTTIME are answered from the values kept
+	 * in the lx process data.
+	 */
+	case LX_RLIMIT_LOCKS:
+		rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur;
+		rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max;
+		break;
+
+	case LX_RLIMIT_NICE:
+		rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur;
+		rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max;
+		break;
+
+	case LX_RLIMIT_RTPRIO:
+		rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur;
+		rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max;
+		break;
+
+	case LX_RLIMIT_RTTIME:
+		rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur;
+		rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max;
+		break;
+
+	case LX_RLIMIT_RSS:
+		/* zone.max-physical-memory */
+		rlim64.rlim_max = curzone->zone_phys_mem_ctl;
+		rlim64.rlim_cur = rlim64.rlim_max;
+		break;
+
+	case LX_RLIMIT_NPROC:
+		/* zone.max-lwps */
+		rlim64.rlim_max = curzone->zone_nlwps_ctl;
+		rlim64.rlim_cur = rlim64.rlim_max;
+		break;
+
+	case LX_RLIMIT_MEMLOCK:
+		/* zone.max-locked-memory */
+		rlim64.rlim_max = curzone->zone_locked_mem_ctl;
+		rlim64.rlim_cur = rlim64.rlim_max;
+		break;
+
+	case LX_RLIMIT_SIGPENDING:
+		lx_get_rctl("process.max-sigqueue-size", &rlim64);
+		break;
+
+	case LX_RLIMIT_MSGQUEUE:
+		lx_get_rctl("process.max-msg-messages", &rlim64);
+		break;
+
+	default:
+		/* The rest map onto a legacy rlimit through l_to_r[]. */
+		mutex_enter(&curproc->p_lock);
+		(void) rctl_rlimit_get(rctlproc_legacy[l_to_r[lx_resource]],
+		    curproc, &rlim64);
+		mutex_exit(&curproc->p_lock);
+		break;
+	}
+
+	/*
+	 * Replace the native infinity with the value Linux uses.  Note that
+	 * cur and max are signed, which makes the INT_MAX comparison below
+	 * a signed one.
+	 */
+	if (rlim64.rlim_cur == RLIM64_INFINITY)
+		cur = LX_RLIM_INFINITY_N;
+	else
+		cur = rlim64.rlim_cur;
+	if (rlim64.rlim_max == RLIM64_INFINITY)
+		max = LX_RLIM_INFINITY_N;
+	else
+		max = rlim64.rlim_max;
+
+	if (lx_resource != LX_RLIMIT_STACK || cur <= INT_MAX) {
+		*rlim_curp = cur;
+		*rlim_maxp = max;
+		return (0);
+	}
+
+	/*
+	 * Linux treats RLIMIT_STACK as a default as well as a limit: NPTL
+	 * uses its value as the stack size of a new thread when none is
+	 * specified.  The pthread_create() man page says so itself: "if
+	 * the RLIMIT_STACK soft resource limit at the time the program
+	 * started has any value other than 'unlimited', then it determines
+	 * the default stack size of new threads."
+	 *
+	 * A typical stack limit for us is 32TB, and a 32TB default thread
+	 * stack doesn't work so well.  Since what glibc does with the value
+	 * becomes our problem, any soft stack limit above the old (32-bit)
+	 * value for infinity is simply reported to Linux as infinite, and
+	 * the hard limit along with it.
+	 */
+	*rlim_curp = LX_RLIM64_INFINITY;
+	*rlim_maxp = LX_RLIM64_INFINITY;
+
+	return (0);
+}
+
+/*
+ * The "new" getrlimit, which Linux headers and code call either getrlimit or
+ * ugetrlimit.  It differs from the old flavour (getrlimit or old_getrlimit)
+ * only in the value used for RLIM_INFINITY, which is smaller in the old
+ * version.
+ * Modern code uses this one by default.
+ *
+ * 'resource' is a Linux resource number and 'rlp' the user address that
+ * receives the limits, laid out as an lx_rlimit_t for native callers and as
+ * an lx_rlimit32_t otherwise.
+ *
+ * Returns 0 on success.  On failure errno is set through set_errno(), either
+ * to the error from lx_getrlimit_common() or to EFAULT if the copyout fails.
+ */
+long
+lx_getrlimit(int resource, lx_rlimit_t *rlp)
+{
+	uint64_t rlim_cur, rlim_max;
+	int rv;
+
+	rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		lx_rlimit_t rl;
+
+		/* Infinity, and anything too wide to fit, becomes infinity. */
+		if (rlim_cur == LX_RLIM64_INFINITY ||
+		    rlim_cur > LX_RLIM_INFINITY_N)
+			rl.rlim_cur = LX_RLIM_INFINITY_N;
+		else
+			rl.rlim_cur = (ulong_t)rlim_cur;
+
+		if (rlim_max == LX_RLIM64_INFINITY ||
+		    rlim_max > LX_RLIM_INFINITY_N)
+			rl.rlim_max = LX_RLIM_INFINITY_N;
+		else
+			rl.rlim_max = (ulong_t)rlim_max;
+
+		if (copyout(&rl, rlp, sizeof (rl)) != 0)
+			return (set_errno(EFAULT));
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		lx_rlimit32_t rl32;
+
+		/* Clamp each limit to the largest 32-bit value. */
+		rl32.rlim_cur = (rlim_cur > UINT_MAX) ?
+		    UINT_MAX : (uint32_t)rlim_cur;
+		rl32.rlim_max = (rlim_max > UINT_MAX) ?
+		    UINT_MAX : (uint32_t)rlim_max;
+
+		if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+			return (set_errno(EFAULT));
+	}
+#endif
+
+	return (0);
+}
+
+/*
+ * The "old" getrlimit, which Linux headers and code call either getrlimit or
+ * old_getrlimit.  It differs from the new flavour (getrlimit or ugetrlimit)
+ * only in the value of RLIM_INFINITY, which is smaller here: both limits are
+ * capped at INT_MAX before being copied out to 'rlp' as an lx_rlimit32_t.
+ *
+ * This is only used for 32-bit code.
+ */
+long
+lx_oldgetrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit32_t rl32;
+	uint64_t rlim_cur, rlim_max;
+
+	rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+
+	if (rlim_cur > INT_MAX)
+		rl32.rlim_cur = INT_MAX;
+	else
+		rl32.rlim_cur = (ulong_t)rlim_cur;
+
+	if (rlim_max > INT_MAX)
+		rl32.rlim_max = INT_MAX;
+	else
+		rl32.rlim_max = (ulong_t)rlim_max;
+
+	if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * Set the rctl named 'nm' on the current process from the limits in 'rlp64'
+ * by way of rctl_rlimit_set(), with RCTL_LOCAL_DENY as the flags and 0 as
+ * the signal.
+ *
+ * Returns the value returned by rctl_rlimit_set().
+ */
+static int
+lx_set_rctl(char *nm, struct rlimit64 *rlp64)
+{
+	rctl_alloc_gp_t *prealloc = rctl_rlimit_set_prealloc(1);
+	proc_t *p = curproc;
+	int rv;
+
+	mutex_enter(&p->p_lock);
+	/*
+	 * We're not supposed to use rctl_rlimit_set() like this, but it is a
+	 * convenient way to make all of our rctls behave like rlimits.
+	 */
+	rv = rctl_rlimit_set(rctl_hndl_lookup(nm), p, rlp64, prealloc,
+	    RCTL_LOCAL_DENY, 0, CRED());
+	mutex_exit(&p->p_lock);
+
+	rctl_prealloc_destroy(prealloc);
+
+	return (rv);
+}
+
+/*
+ * Apply new soft ('rlim_cur') and hard ('rlim_max') limits for the Linux
+ * resource 'lx_resource' to the current process.
+ *
+ * Returns 0 on success or a plain errno value on failure.  Both callers hand
+ * a non-zero return to set_errno() themselves, so nothing in here may return
+ * the result of set_errno(); that would run set_errno() a second time on the
+ * first call's return value.
+ */
+static int
+lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max)
+{
+	lx_proc_data_t *pd = ptolxproc(curproc);
+	int err;
+	int resource;
+	rctl_alloc_gp_t *gp;
+	struct rlimit64 rl64;
+
+	if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+		return (EINVAL);
+
+	/*
+	 * LOCKS, NICE, RTPRIO and RTTIME are only recorded in the lx process
+	 * data; lx_getrlimit_common() reports them back from there.
+	 */
+	switch (lx_resource) {
+	case LX_RLIMIT_LOCKS:
+		pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_NICE:
+		pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RTPRIO:
+		pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RTTIME:
+		pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RSS:
+	case LX_RLIMIT_NPROC:
+	case LX_RLIMIT_MEMLOCK:
+		/*
+		 * zone.max-physical-memory, zone.max-lwps and
+		 * zone.max-locked-memory respectively.
+		 * Since we're emulating the value via a zone rctl, we can't
+		 * set that from within the zone. Lie and say we set the value.
+		 */
+		break;
+
+	case LX_RLIMIT_SIGPENDING:
+		/*
+		 * On Ubuntu at least, the login and sshd processes expect to
+		 * set this limit to 16k and login will fail if this fails. On
+		 * Illumos we have a system limit of 8k and normally the
+		 * privileged limit is 512. We simply pretend this works to
+		 * allow login to work.
+		 */
+		if (rlim_max > 8192)
+			return (0);
+
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64))
+		    != 0)
+			return (err);
+		break;
+
+	case LX_RLIMIT_MSGQUEUE:
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0)
+			return (err);
+		break;
+
+	default:
+		resource = l_to_r[lx_resource];
+
+		/*
+		 * Linux limits the max number of open files to 1m and there is
+		 * a test for this.
+		 */
+		if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024))
+			return (EPERM);
+
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		gp = rctl_rlimit_set_prealloc(1);
+
+		mutex_enter(&curproc->p_lock);
+		err = rctl_rlimit_set(rctlproc_legacy[resource], curproc,
+		    &rl64, gp, rctlproc_flags[resource],
+		    rctlproc_signals[resource], CRED());
+		mutex_exit(&curproc->p_lock);
+
+		rctl_prealloc_destroy(gp);
+		if (err != 0)
+			return (err);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * setrlimit for Linux callers: copy in the limits using the caller's data
+ * model, reject cur > max, and apply them with lx_setrlimit_common().
+ */
+long
+lx_setrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit_t rl;
+	uint64_t rlim_cur, rlim_max;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(rlp, &rl, sizeof (rl)) != 0)
+			return (set_errno(EFAULT));
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		lx_rlimit32_t rl32;
+
+		if (copyin(rlp, &rl32, sizeof (rl32)) != 0)
+			return (set_errno(EFAULT));
+
+		/* 32-bit infinity is UINT_MAX (cf. lx_getrlimit) */
+		rl.rlim_cur = (rl32.rlim_cur == UINT_MAX) ?
+		    LX_RLIM_INFINITY_N : rl32.rlim_cur;
+		rl.rlim_max = (rl32.rlim_max == UINT_MAX) ?
+		    LX_RLIM_INFINITY_N : rl32.rlim_max;
+	}
+#endif
+
+	if ((rl.rlim_max != LX_RLIM_INFINITY_N &&
+	    rl.rlim_cur == LX_RLIM_INFINITY_N) ||
+	    rl.rlim_cur > rl.rlim_max)
+		return (set_errno(EINVAL));
+
+	rlim_cur = (rl.rlim_cur == LX_RLIM_INFINITY_N) ?
+	    LX_RLIM64_INFINITY : rl.rlim_cur;
+	rlim_max = (rl.rlim_max == LX_RLIM_INFINITY_N) ?
+	    LX_RLIM64_INFINITY : rl.rlim_max;
+
+	rv = lx_setrlimit_common(resource, rlim_cur, rlim_max);
+	return ((rv != 0) ? set_errno(rv) : 0);
+}
+
+/*
+ * prlimit64(2): a Linux-specific call combining getrlimit() and setrlimit().
+ * If 'orlp' is not NULL the limits in force before the call are copied out to
+ * it; if 'nrlp' is not NULL the limits it points to are installed.
+ *
+ * Only the calling process (pid 0) is supported; any other pid is reported
+ * through lx_unsupported() and fails with ENOTSUP.
+ */
+long
+lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp)
+{
+	int rv;
+	lx_rlimit64_t nrl, orl;
+
+	if (pid != 0) {
+		/* XXX TBD if needed */
+		char buf[80];
+
+		(void) snprintf(buf, sizeof (buf),
+		    "setting prlimit %d for another process\n", resource);
+		lx_unsupported(buf);
+		return (set_errno(ENOTSUP));
+	}
+
+	if (orlp != NULL) {
+		/* we first get the current limits */
+		rv = lx_getrlimit_common(resource, &orl.rlim_cur,
+		    &orl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (nrlp != NULL) {
+		if (copyin(nrlp, &nrl, sizeof (nrl)) != 0)
+			return (set_errno(EFAULT));
+
+		if ((nrl.rlim_max != LX_RLIM64_INFINITY &&
+		    nrl.rlim_cur == LX_RLIM64_INFINITY) ||
+		    nrl.rlim_cur > nrl.rlim_max)
+			return (set_errno(EINVAL));
+
+		rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (orlp != NULL) {
+		/* now return the original limits, if necessary */
+		if (copyout(&orl, orlp, sizeof (orl)) != 0)
+			return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
new file mode 100644
index 0000000000..50d532ff51
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
@@ -0,0 +1,949 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/nbmlock.h>
+#include <sys/limits.h>
+
+/* uts/common/syscall/rw.c; reads up to this size use UIO_COPY_CACHED */
+extern size_t copyout_max_cached;
+
+
+/* Common routines */
+
+/*
+ * Copy 'iovcnt' iovecs from user address 'uiovp' into 'kiovp', widening them
+ * for 32-bit callers, and store the sum of their lengths in '*count'.
+ * Returns 0, EFAULT (failed copyin, or a base address not below the user
+ * limit) or EINVAL (a negative length or a negative running total).
+ */
+static int
+lx_iovec_copyin(void *uiovp, int iovcnt, iovec_t *kiovp, ssize_t *count)
+{
+#ifdef _SYSCALL32_IMPL
+	/*
+	 * 32-bit callers need to have their iovec expanded, while ensuring
+	 * that they can't move more than 2Gbytes of data in a single call.
+	 */
+	if (get_udatamodel() == DATAMODEL_ILP32) {
+		struct iovec32 stack32[IOV_MAX_STACK], *iov32 = stack32;
+		int heaplen = 0;
+		ssize32_t sum32 = 0;
+		int err = 0;
+		int i;
+
+		if (iovcnt > IOV_MAX_STACK) {
+			heaplen = iovcnt * sizeof (iovec32_t);
+			iov32 = kmem_alloc(heaplen, KM_SLEEP);
+		}
+
+		if (copyin(uiovp, iov32, iovcnt * sizeof (iovec32_t))) {
+			err = EFAULT;
+			goto done32;
+		}
+
+		for (i = 0; i < iovcnt; i++) {
+			ssize32_t len32 = iov32[i].iov_len;
+			sum32 += len32;
+			if (len32 < 0 || sum32 < 0) {
+				err = EINVAL;
+				goto done32;
+			}
+			kiovp[i].iov_len = len32;
+			kiovp[i].iov_base =
+			    (caddr_t)(uintptr_t)iov32[i].iov_base;
+			/* Linux does a basic sanity test on the address */
+			if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT32) {
+				err = EFAULT;
+				goto done32;
+			}
+		}
+		*count = sum32;
+done32:
+		if (heaplen != 0)
+			kmem_free(iov32, heaplen);
+		return (err);
+	}
+#endif
+	{
+		ssize_t sum = 0;
+		int i;
+
+		if (copyin(uiovp, kiovp, iovcnt * sizeof (iovec_t)))
+			return (EFAULT);
+		for (i = 0; i < iovcnt; i++) {
+			ssize_t len = kiovp[i].iov_len;
+			sum += len;
+			if (len < 0 || sum < 0)
+				return (EINVAL);
+			/* Linux does a basic sanity test on the address */
+			if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT)
+				return (EFAULT);
+		}
+		*count = sum;
+	}
+	return (0);
+}
+
+static int
+lx_read_common(file_t *fp, uio_t *uiop, size_t *nread, boolean_t positioned)
+{
+	vnode_t *vp = fp->f_vnode;
+	int error = 0, rwflag = 0, ioflag;
+	ssize_t count = uiop->uio_resid;
+	size_t rcount = 0;
+	struct cpu *cp;
+	boolean_t in_crit = B_FALSE;
+
+	/*
+	 * We have to enter the critical region before calling VOP_RWLOCK
+	 * to avoid a deadlock with ufs.
+	 */
+	if (nbl_need_check(vp)) {
+		int svmand;
+
+		nbl_start_crit(vp, RW_READER);
+		in_crit = B_TRUE;
+		error = nbl_svmand(vp, fp->f_cred, &svmand);
+		if (error != 0)
+			goto out;
+		if (nbl_conflict(vp, NBL_READ, uiop->uio_offset, count, svmand,
+		    NULL) != 0) {
+			error = EACCES;
+			goto out;
+		}
+	}
+
+	(void) VOP_RWLOCK(vp, rwflag, NULL);
+	/*
+	 * For non-positioned reads, recheck offset/count validity inside
+	 * VOP_RWLOCK to prevent filesize from changing during validation.
+	 */
+	if (!positioned) {
+		u_offset_t uoffset = (u_offset_t)(ulong_t)fp->f_offset;
+
+		if ((vp->v_type == VREG) && (uoffset >= OFFSET_MAX(fp))) {
+			struct vattr va;
+
+			va.va_mask = AT_SIZE;
+			error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
+			VOP_RWUNLOCK(vp, rwflag, NULL);
+			if (error != 0)
+				goto out;
+			/* We have to return EOF if uoffset is >= file size. */
+			if (uoffset >= va.va_size)
+				goto out;
+			/*
+			 * File is greater than or equal to maxoff and
+			 * therefore we return EOVERFLOW.
+			 */
+			error = EOVERFLOW;
+			goto out;
+		}
+		if ((vp->v_type == VREG) &&
+		    (uoffset + count > OFFSET_MAX(fp))) {
+			count = (ssize_t)(OFFSET_MAX(fp) - uoffset);
+			uiop->uio_resid = count;
+		}
+		uiop->uio_offset = uoffset;
+	}
+	ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+	/* If read sync is not asked for, filter sync flags */
+	if ((ioflag & FRSYNC) == 0)
+		ioflag &= ~(FSYNC|FDSYNC);
+	error = VOP_READ(vp, uiop, ioflag, fp->f_cred, NULL);
+	rcount = count - uiop->uio_resid;
+	CPU_STATS_ENTER_K();
+	cp = CPU;
+	CPU_STATS_ADDQ(cp, sys, sysread, 1);
+	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)rcount);
+	CPU_STATS_EXIT_K();
+	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)rcount;
+	/* Store offset for non-positioned reads */
+	if (!positioned) {
+		if (vp->v_type == VFIFO) {
+			/* Backward compatibility */
+			fp->f_offset = rcount;
+		} else if (((fp->f_flag & FAPPEND) == 0) ||
+		    (vp->v_type != VREG) || (count != 0)) {
+			/* POSIX */
+			fp->f_offset = uiop->uio_loffset;
+		}
+	}
+	VOP_RWUNLOCK(vp, rwflag, NULL);
+
+out:
+	if (in_crit)
+		nbl_end_crit(vp);
+	*nread = rcount;
+	return (error);
+}
+
+static int
+lx_write_common(file_t *fp, uio_t *uiop, size_t *nwrite, boolean_t positioned)
+{
+	vnode_t *vp = fp->f_vnode;
+	int error = 0, rwflag = 1, ioflag;
+	ssize_t count = uiop->uio_resid;
+	size_t wcount = 0;
+	struct cpu *cp;
+	boolean_t in_crit = B_FALSE;
+
+	/*
+	 * We have to enter the critical region before calling VOP_RWLOCK
+	 * to avoid a deadlock with ufs.
+	 */
+	if (nbl_need_check(vp)) {
+		int svmand;
+
+		nbl_start_crit(vp, RW_READER);
+		in_crit = B_TRUE;
+		error = nbl_svmand(vp, fp->f_cred, &svmand);
+		if (error != 0)
+			goto out;
+		if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset, count,
+		    svmand, NULL) != 0) {
+			error = EACCES;
+			goto out;
+		}
+	}
+
+	(void) VOP_RWLOCK(vp, rwflag, NULL);
+
+	if (!positioned) {
+		/*
+		 * For non-positioned writes, the value of fp->f_offset is
+		 * re-queried while inside VOP_RWLOCK.  This ensures that other
+		 * writes which alter the filesize will be taken into account.
+		 */
+		uiop->uio_loffset = fp->f_offset;
+		ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+	} else {
+		/*
+		 * In a senseless departure from POSIX, positioned write calls
+		 * on Linux do _not_ ignore the O_APPEND flag, so keep FAPPEND.
+		 */
+		ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+	}
+	if (vp->v_type == VREG) {
+		u_offset_t fileoff = (u_offset_t)(ulong_t)uiop->uio_loffset;
+
+		if (fileoff >= curproc->p_fsz_ctl) {
+			VOP_RWUNLOCK(vp, rwflag, NULL);
+			mutex_enter(&curproc->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+			mutex_exit(&curproc->p_lock);
+			error = EFBIG;
+			goto out;
+		}
+		if (fileoff >= OFFSET_MAX(fp)) {
+			VOP_RWUNLOCK(vp, rwflag, NULL);
+			error = EFBIG;
+			goto out;
+		}
+		if (fileoff + count > OFFSET_MAX(fp)) {
+			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+			uiop->uio_resid = count;
+		}
+	}
+
+	error = VOP_WRITE(vp, uiop, ioflag, fp->f_cred, NULL);
+	wcount = count - uiop->uio_resid;
+	CPU_STATS_ENTER_K();
+	cp = CPU;
+	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)wcount);
+	CPU_STATS_EXIT_K();
+	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)wcount;
+
+	/* Store offset for non-positioned writes */
+	if (!positioned) {
+		if (vp->v_type == VFIFO) {
+			/* Backward compatibility */
+			fp->f_offset = wcount;
+		} else if (((fp->f_flag & FAPPEND) == 0) ||
+		    (vp->v_type != VREG) || (count != 0)) {
+			/* POSIX */
+			fp->f_offset = uiop->uio_loffset;
+		}
+	}
+	VOP_RWUNLOCK(vp, rwflag, NULL);
+
+out:
+	if (in_crit)
+		nbl_end_crit(vp);
+	*nwrite = wcount;
+	return (error);
+}
+
+/*
+ * The Linux routines for reading and writing data from file descriptors behave
+ * differently from their SunOS counterparts in a few key ways:
+ *
+ * - Passing an iovcnt of 0 to the vectored functions results in an error on
+ *   SunOS, but on Linux it yields return value of 0.
+ *
+ * - If any data is successfully read or written, Linux will return a success.
+ *   This is unlike SunOS which would return an error code for the entire
+ *   operation in cases where vectors had gone unprocessed.
+ *
+ * - Breaking from POSIX, positioned writes (pwrite/pwritev) on Linux will obey
+ *   the O_APPEND flag if it is set on the descriptor.
+ */
+
+/*
+ * read for Linux callers: read up to 'ccount' bytes from descriptor 'fdes'
+ * into the user buffer 'cbuf', starting at the file's current offset.
+ * Returns the number of bytes read, or sets errno through set_errno().
+ */
+ssize_t
+lx_read(int fdes, void *cbuf, size_t ccount)
+{
+	ssize_t len = (ssize_t)ccount;
+	size_t done = 0;
+	int fflag, err = 0;
+	struct iovec vec;
+	struct uio io;
+	file_t *fp;
+
+	if (len < 0)
+		return (set_errno(EINVAL));
+	fp = getf(fdes);
+	if (fp == NULL)
+		return (set_errno(EBADF));
+
+	fflag = fp->f_flag;
+	if ((fflag & FREAD) == 0)
+		err = EBADF;
+	else if (fp->f_vnode->v_type == VDIR)
+		err = EISDIR;
+	if (err != 0 || (fp->f_vnode->v_type == VREG && len == 0))
+		goto out;
+
+	vec.iov_base = cbuf;
+	vec.iov_len = len;
+	io.uio_iov = &vec;
+	io.uio_iovcnt = 1;
+	io.uio_loffset = fp->f_offset;
+	io.uio_resid = len;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = MAXOFFSET_T;
+	io.uio_fmode = fflag;
+	io.uio_extflg = (len <= copyout_max_cached) ?
+	    UIO_COPY_CACHED : UIO_COPY_DEFAULT;
+
+	err = lx_read_common(fp, &io, &done, B_FALSE);
+	if (err == EINTR) {
+		/* Data already read wins over EINTR; else ask for a restart. */
+		if (done != 0)
+			err = 0;
+		else
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+	if (err != 0)
+		return (set_errno(err));
+	return ((ssize_t)done);
+}
+
+/*
+ * write for Linux callers: write 'ccount' bytes from the user buffer 'cbuf'
+ * to descriptor 'fdes' at the file's current offset.
+ */
+ssize_t
+lx_write(int fdes, void *cbuf, size_t ccount)
+{
+	ssize_t len = (ssize_t)ccount;
+	size_t done = 0;
+	int fflag, err = 0;
+	struct iovec vec;
+	struct uio io;
+	file_t *fp;
+
+	if (len < 0)
+		return (set_errno(EINVAL));
+	fp = getf(fdes);
+	if (fp == NULL)
+		return (set_errno(EBADF));
+
+	fflag = fp->f_flag;
+	if ((fflag & FWRITE) == 0)
+		err = EBADF;
+	if (err != 0 || (fp->f_vnode->v_type == VREG && len == 0))
+		goto out;
+
+	vec.iov_base = cbuf;
+	vec.iov_len = len;
+	io.uio_iov = &vec;
+	io.uio_iovcnt = 1;
+	io.uio_loffset = fp->f_offset;
+	io.uio_resid = len;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = curproc->p_fsz_ctl;
+	io.uio_fmode = fflag;
+	io.uio_extflg = UIO_COPY_DEFAULT;
+
+	err = lx_write_common(fp, &io, &done, B_FALSE);
+	if (err == EINTR) {
+		if (done != 0)
+			err = 0;
+		else
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+	return ((err != 0) ? set_errno(err) : (ssize_t)done);
+}
+
+/*
+ * readv for Linux callers: read from 'fdes', at the file's current offset,
+ * into the 'iovcnt' user buffers described by the iovec array at 'iovp'.
+ *
+ * An 'iovcnt' of zero simply returns 0, and any error is dropped once some
+ * data has been read.
+ *
+ * Returns the number of bytes read, or sets errno through set_errno().
+ */
+ssize_t
+lx_readv(int fdes, struct iovec *iovp, int iovcnt)
+{
+	struct iovec stackvec[IOV_MAX_STACK], *vec = stackvec;
+	struct uio io;
+	int heaplen = 0;
+	int fflag, err = 0;
+	ssize_t total;
+	size_t done = 0;
+	file_t *fp;
+
+	if (iovcnt < 0 || iovcnt > IOV_MAX)
+		return (set_errno(EINVAL));
+	if (iovcnt == 0)
+		return (0);
+
+	/* Larger vectors do not fit in the on-stack array. */
+	if (iovcnt > IOV_MAX_STACK) {
+		heaplen = iovcnt * sizeof (iovec_t);
+		vec = kmem_alloc(heaplen, KM_SLEEP);
+	}
+	err = lx_iovec_copyin(iovp, iovcnt, vec, &total);
+	if (err != 0)
+		goto freevec;
+
+	fp = getf(fdes);
+	if (fp == NULL) {
+		err = EBADF;
+		goto freevec;
+	}
+
+	fflag = fp->f_flag;
+	if ((fflag & FREAD) == 0)
+		err = EBADF;
+	else if (fp->f_vnode->v_type == VDIR)
+		err = EISDIR;
+	if (err != 0 || (fp->f_vnode->v_type == VREG && total == 0))
+		goto out;
+
+	io.uio_iov = vec;
+	io.uio_iovcnt = iovcnt;
+	io.uio_loffset = fp->f_offset;
+	io.uio_resid = total;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = MAXOFFSET_T;
+	io.uio_fmode = fflag;
+	io.uio_extflg = (total <= copyout_max_cached) ?
+	    UIO_COPY_CACHED : UIO_COPY_DEFAULT;
+
+	err = lx_read_common(fp, &io, &done, B_FALSE);
+	/* Any error is dropped once some data has been read. */
+	if (err != 0 && done != 0) {
+		err = 0;
+	} else if (err == EINTR) {
+		ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+freevec:
+	if (heaplen != 0)
+		kmem_free(vec, heaplen);
+	if (err != 0)
+		return (set_errno(err));
+	return (done);
+}
+
+/*
+ * writev for Linux callers: write the 'iovcnt' user buffers described by the
+ * iovec array at 'iovp' to 'fdes' at the file's current offset.  An 'iovcnt'
+ * of zero returns 0, and any error is dropped once data has been written.
+ * Returns the number of bytes written, or sets errno through set_errno().
+ */
+ssize_t
+lx_writev(int fdes, struct iovec *iovp, int iovcnt)
+{
+	struct iovec stackvec[IOV_MAX_STACK], *vec = stackvec;
+	struct uio io;
+	int heaplen = 0;
+	int fflag, err = 0;
+	ssize_t total;
+	size_t done = 0;
+	file_t *fp;
+
+	if (iovcnt < 0 || iovcnt > IOV_MAX)
+		return (set_errno(EINVAL));
+	if (iovcnt == 0)
+		return (0);
+
+	if (iovcnt > IOV_MAX_STACK) {
+		heaplen = iovcnt * sizeof (iovec_t);
+		vec = kmem_alloc(heaplen, KM_SLEEP);
+	}
+	err = lx_iovec_copyin(iovp, iovcnt, vec, &total);
+	if (err != 0)
+		goto freevec;
+
+	fp = getf(fdes);
+	if (fp == NULL) {
+		err = EBADF;
+		goto freevec;
+	}
+
+	fflag = fp->f_flag;
+	if ((fflag & FWRITE) == 0)
+		err = EBADF;
+	if (err != 0 || (fp->f_vnode->v_type == VREG && total == 0))
+		goto out;
+
+	io.uio_iov = vec;
+	io.uio_iovcnt = iovcnt;
+	io.uio_loffset = fp->f_offset;
+	io.uio_resid = total;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = curproc->p_fsz_ctl;
+	io.uio_fmode = fflag;
+	io.uio_extflg = UIO_COPY_DEFAULT;
+
+	err = lx_write_common(fp, &io, &done, B_FALSE);
+	/* Any error is dropped once some data has been written. */
+	if (err != 0 && done != 0) {
+		err = 0;
+	} else if (err == EINTR) {
+		ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+freevec:
+	if (heaplen != 0)
+		kmem_free(vec, heaplen);
+	if (err != 0)
+		return (set_errno(err));
+	return (done);
+}
+
+/*
+ * pread for Linux callers: read up to 'ccount' bytes from 'fdes' into the user
+ * buffer 'cbuf', starting at 'offset' rather than at the file's own offset.
+ * FIFOs are refused with ESPIPE and directories with EISDIR.  Returns the
+ * number of bytes read, or sets errno through set_errno().
+ */
+ssize_t
+lx_pread(int fdes, void *cbuf, size_t ccount, off64_t offset)
+{
+	ssize_t len = (ssize_t)ccount;
+	size_t done = 0;
+	int fflag, err = 0;
+	struct iovec vec;
+	struct uio io;
+	file_t *fp;
+
+	if (len < 0)
+		return (set_errno(EINVAL));
+	fp = getf(fdes);
+	if (fp == NULL)
+		return (set_errno(EBADF));
+
+	fflag = fp->f_flag;
+	if ((fflag & FREAD) == 0)
+		err = EBADF;
+	else if (fp->f_vnode->v_type == VFIFO)
+		err = ESPIPE;
+	else if (fp->f_vnode->v_type == VDIR)
+		err = EISDIR;
+	if (err != 0)
+		goto out;
+
+	if (fp->f_vnode->v_type == VREG) {
+		u_offset_t start = (u_offset_t)offset;
+
+		if (len == 0)
+			goto out;
+		/* A negative offset from the user ends up here as EINVAL. */
+		if (start > MAXOFFSET_T) {
+			err = EINVAL;
+			goto out;
+		}
+		/*
+		 * Trim the request so that it never reaches past the maximum
+		 * offset representable in an off_t structure.
+		 */
+		if (start + len > MAXOFFSET_T)
+			len = (ssize_t)((offset_t)MAXOFFSET_T - start);
+	}
+
+	vec.iov_base = cbuf;
+	vec.iov_len = len;
+	io.uio_iov = &vec;
+	io.uio_iovcnt = 1;
+	io.uio_loffset = offset;
+	io.uio_resid = len;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = MAXOFFSET_T;
+	io.uio_fmode = fflag;
+	io.uio_extflg = UIO_COPY_CACHED;
+
+	err = lx_read_common(fp, &io, &done, B_TRUE);
+	if (err == EINTR) {
+		if (done != 0)
+			err = 0;
+		else
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+	if (err != 0)
+		return (set_errno(err));
+	return ((ssize_t)done);
+}
+
+/*
+ * pwrite for Linux callers: write 'ccount' bytes from the user buffer 'cbuf'
+ * to 'fdes' starting at 'offset'.  FIFOs are refused with ESPIPE.  The append
+ * flag is passed through to lx_write_common() along with the other flags.
+ * Returns the number of bytes written, or sets errno through set_errno().
+ */
+ssize_t
+lx_pwrite(int fdes, void *cbuf, size_t ccount, off64_t offset)
+{
+	ssize_t len = (ssize_t)ccount;
+	size_t done = 0;
+	int fflag, err = 0;
+	struct iovec vec;
+	struct uio io;
+	file_t *fp;
+
+	if (len < 0)
+		return (set_errno(EINVAL));
+	fp = getf(fdes);
+	if (fp == NULL)
+		return (set_errno(EBADF));
+
+	fflag = fp->f_flag;
+	if ((fflag & FWRITE) == 0)
+		err = EBADF;
+	else if (fp->f_vnode->v_type == VFIFO)
+		err = ESPIPE;
+	if (err != 0)
+		goto out;
+
+	if (fp->f_vnode->v_type == VREG) {
+		u_offset_t start = (u_offset_t)offset;
+
+		if (len == 0)
+			goto out;
+		/* Offsets an off_t cannot represent are invalid. */
+		if (start > MAXOFFSET_T) {
+			err = EINVAL;
+			goto out;
+		}
+		/*
+		 * Take appropriate action if we are trying to write at or
+		 * above the file size resource limit.
+		 */
+		if (start >= curproc->p_fsz_ctl) {
+			mutex_enter(&curproc->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+			mutex_exit(&curproc->p_lock);
+			err = EFBIG;
+			goto out;
+		}
+		/* Don't let pwrite push the file size past maxoffset. */
+		if (start == MAXOFFSET_T) {
+			err = EFBIG;
+			goto out;
+		}
+		if (start + len > MAXOFFSET_T)
+			len = (ssize_t)((u_offset_t)MAXOFFSET_T - start);
+	}
+
+	vec.iov_base = cbuf;
+	vec.iov_len = len;
+	io.uio_iov = &vec;
+	io.uio_iovcnt = 1;
+	io.uio_loffset = offset;
+	io.uio_resid = len;
+	io.uio_segflg = UIO_USERSPACE;
+	io.uio_llimit = curproc->p_fsz_ctl;
+	io.uio_fmode = fflag;
+	io.uio_extflg = UIO_COPY_CACHED;
+
+	err = lx_write_common(fp, &io, &done, B_TRUE);
+	if (err == EINTR) {
+		if (done != 0)
+			err = 0;
+		else
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+	}
+out:
+	releasef(fdes);
+	if (err != 0)
+		return (set_errno(err));
+	return (done);
+}
+
+/*
+ * Variant of lx_pread() that receives the file offset as two 32-bit halves
+ * (off_lo, off_hi), which are combined with LX_32TO64() before the call.
+ */
+ssize_t
+lx_pread32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo,
+    uint32_t off_hi)
+{
+	return (lx_pread(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi)));
+}
+
+/*
+ * Variant of lx_pwrite() that receives the file offset as two 32-bit halves
+ * (off_lo, off_hi), which are combined with LX_32TO64() before the call.
+ */
+ssize_t
+lx_pwrite32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo,
+    uint32_t off_hi)
+{
+	off64_t offset = LX_32TO64(off_lo, off_hi);
+
+	return (lx_pwrite(fdes, cbuf, ccount, offset));
+}
+
+/*
+ * Emulation of Linux preadv(2): read from descriptor fdes at the given offset
+ * into the iovcnt user buffers described by the iovec array at iovp.
+ *
+ * Up to IOV_MAX_STACK iovecs are held in an on-stack array; larger vectors
+ * use a temporary kmem_alloc() buffer that is freed on every exit path.
+ *
+ * Returns the number of bytes read, or the result of set_errno() on failure.
+ * Any error from lx_read_common() is discarded when some data was read; an
+ * EINTR with nothing read also sets br_syscall_restart in the lwp brand data.
+ */
+ssize_t
+lx_preadv(int fdes, void *iovp, int iovcnt, off64_t offset)
+{
+	struct uio auio;
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
+	file_t *fp;
+	ssize_t count;
+	size_t nread = 0;
+	int fflag, error = 0;
+
+	if (iovcnt < 0 || iovcnt > IOV_MAX) {
+		return (set_errno(EINVAL));
+	} else if (iovcnt == 0) {
+		return (0);
+	}
+
+	/* aiovlen stays 0 when the on-stack array is used. */
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+	if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
+		return (set_errno(error));
+	}
+
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
+		return (set_errno(EBADF));
+	}
+	/* From here on every exit must go through 'out'. */
+	if (((fflag = fp->f_flag) & FREAD) == 0) {
+		error = EBADF;
+		goto out;
+	}
+	if (fp->f_vnode->v_type == VREG) {
+		u_offset_t fileoff = (u_offset_t)offset;
+
+		if (count == 0)
+			goto out;
+		/*
+		 * Return EINVAL if an invalid offset comes to preadv.
+		 * Negative offset from user will cause this error.
+		 */
+		if (fileoff > MAXOFFSET_T) {
+			error = EINVAL;
+			goto out;
+		}
+		/*
+		 * Limit offset such that we don't read or write a file beyond
+		 * the maximum offset representable in an off_t structure.
+		 */
+		if (fileoff + count > MAXOFFSET_T)
+			count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff);
+	} else if (fp->f_vnode->v_type == VDIR) {
+		error = EISDIR;
+		goto out;
+	} else if (fp->f_vnode->v_type == VFIFO) {
+		error = ESPIPE;
+		goto out;
+	}
+
+	auio.uio_iov = aiov;
+	auio.uio_iovcnt = iovcnt;
+	auio.uio_loffset = offset;
+	auio.uio_resid = count;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_llimit = MAXOFFSET_T;
+	auio.uio_fmode = fflag;
+	/* Choose the copy mode from the total transfer size. */
+	if (count <= copyout_max_cached)
+		auio.uio_extflg = UIO_COPY_CACHED;
+	else
+		auio.uio_extflg = UIO_COPY_DEFAULT;
+
+	error = lx_read_common(fp, &auio, &nread, B_TRUE);
+
+	if (error != 0) {
+		if (nread != 0) {
+			/* Partial progress: report a short read. */
+			error = 0;
+		} else if (error == EINTR) {
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+		}
+	}
+out:
+	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (nread);
+}
+
+/*
+ * Emulation of Linux pwritev(2): write to descriptor fdes at the given offset
+ * from the iovcnt user buffers described by the iovec array at iovp.
+ *
+ * Up to IOV_MAX_STACK iovecs are held in an on-stack array; larger vectors
+ * use a temporary kmem_alloc() buffer that is freed on every exit path.
+ *
+ * Returns the number of bytes written, or the result of set_errno() on
+ * failure.  Any error from lx_write_common() is discarded when some data was
+ * written; an EINTR with nothing written also sets br_syscall_restart in the
+ * lwp brand data.
+ */
+ssize_t
+lx_pwritev(int fdes, void *iovp, int iovcnt, off64_t offset)
+{
+	struct uio auio;
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
+	file_t *fp;
+	ssize_t count;
+	size_t nwrite = 0;
+	int fflag, error = 0;
+
+	if (iovcnt < 0 || iovcnt > IOV_MAX) {
+		return (set_errno(EINVAL));
+	} else if (iovcnt == 0) {
+		return (0);
+	}
+
+	/* aiovlen stays 0 when the on-stack array is used. */
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+	if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
+		return (set_errno(error));
+	}
+
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
+		return (set_errno(EBADF));
+	}
+	/* From here on every exit must go through 'out'. */
+	if (((fflag = fp->f_flag) & FWRITE) == 0) {
+		error = EBADF;
+		goto out;
+	}
+	if (fp->f_vnode->v_type == VREG) {
+		u_offset_t fileoff = (u_offset_t)offset;
+
+		if (count == 0)
+			goto out;
+		/*
+		 * Return EINVAL if an invalid offset comes to pwritev.
+		 * Negative offset from user will cause this error.
+		 */
+		if (fileoff > MAXOFFSET_T) {
+			error = EINVAL;
+			goto out;
+		}
+		/*
+		 * Take appropriate action if we are trying to write above the
+		 * resource limit.
+		 */
+		if (fileoff >= curproc->p_fsz_ctl) {
+			mutex_enter(&curproc->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+			mutex_exit(&curproc->p_lock);
+
+			error = EFBIG;
+			goto out;
+		}
+		/*
+		 * Don't allow pwritev to cause file sizes to exceed maxoffset.
+		 */
+		if (fileoff == MAXOFFSET_T) {
+			error = EFBIG;
+			goto out;
+		}
+		/*
+		 * Limit offset such that we don't read or write a file beyond
+		 * the maximum offset representable in an off_t structure.
+		 */
+		if (fileoff + count > MAXOFFSET_T)
+			count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
+	} else if (fp->f_vnode->v_type == VFIFO) {
+		error = ESPIPE;
+		goto out;
+	}
+
+	auio.uio_iov = aiov;
+	auio.uio_iovcnt = iovcnt;
+	auio.uio_loffset = offset;
+	auio.uio_resid = count;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_llimit = curproc->p_fsz_ctl;
+	auio.uio_fmode = fflag;
+	auio.uio_extflg = UIO_COPY_DEFAULT;
+
+	error = lx_write_common(fp, &auio, &nwrite, B_TRUE);
+
+	if (error != 0) {
+		if (nwrite != 0) {
+			/* Partial progress: report a short write. */
+			error = 0;
+		} else if (error == EINTR) {
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+		}
+	}
+out:
+	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (nwrite);
+}
+
+/*
+ * Variant of lx_preadv() that receives the file offset as two 32-bit halves
+ * (off_lo, off_hi), which are combined with LX_32TO64() before the call.
+ */
+ssize_t
+lx_preadv32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, uint32_t off_hi)
+{
+	off64_t offset = LX_32TO64(off_lo, off_hi);
+
+	return (lx_preadv(fdes, iovp, iovcnt, offset));
+}
+
+/*
+ * Variant of lx_pwritev() that receives the file offset as two 32-bit halves
+ * (off_lo, off_hi), which are combined with LX_32TO64() before the call.
+ */
+ssize_t
+lx_pwritev32(int fdes, void *iovp, int iovcnt, uint32_t off_lo,
+    uint32_t off_hi)
+{
+	off64_t offset = LX_32TO64(off_lo, off_hi);
+
+	return (lx_pwritev(fdes, iovp, iovcnt, offset));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
new file mode 100644
index 0000000000..0def559e29
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
@@ -0,0 +1,524 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/cpu.h>
+#include <sys/rtpriocntl.h>
+#include <sys/tspriocntl.h>
+#include <sys/processor.h>
+#include <sys/brand.h>
+#include <sys/lx_sched.h>
+#include <sys/lx_brand.h>
+
+extern int yield();
+extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t);
+
+/*
+ * Linux sched_yield(): call the native yield() and always report success;
+ * yield()'s own return value is deliberately discarded.
+ */
+long
+lx_sched_yield(void)
+{
+	(void) yield();
+	return (0);
+}
+
+/*
+ * Get or set the Linux CPU affinity mask stored in a thread's brand data.
+ *
+ * cmd   - B_GET_AFFINITY_MASK or B_SET_AFFINITY_MASK; anything else is EINVAL.
+ * pid   - Linux pid of the target thread, or 0 for the calling thread.  Only
+ *         threads in the calling process are supported (EPERM otherwise).
+ * len   - size of the user buffer at maskp.  A get with len == 0 only reports
+ *         the required size through rval.
+ * maskp - user address of the lx_affmask_t to copy in (set) or out (get).
+ * rval  - receives sizeof (lx_affmask_t) for a get and 0 for a set.
+ *
+ * Returns 0 on success, or the result of set_errno() on failure.
+ */
+int
+lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp,
+    int64_t *rval)
+{
+	pid_t		s_pid;
+	id_t		s_tid;
+	kthread_t	*t = curthread;
+	lx_lwp_data_t	*lx_lwp;
+
+	if (cmd != B_GET_AFFINITY_MASK && cmd != B_SET_AFFINITY_MASK)
+		return (set_errno(EINVAL));
+
+	/*
+	 * The caller wants to know how large the mask should be.
+	 */
+	if (cmd == B_GET_AFFINITY_MASK && len == 0) {
+		*rval = sizeof (lx_affmask_t);
+		return (0);
+	}
+
+	/*
+	 * Otherwise, ensure they have a large enough mask.  len is signed, so
+	 * a negative value must be rejected explicitly: compared directly
+	 * against a size_t it would be converted to a huge unsigned value and
+	 * slip past the size check.
+	 */
+	if (cmd == B_GET_AFFINITY_MASK &&
+	    (len < 0 || (size_t)len < sizeof (lx_affmask_t))) {
+		*rval = -1;
+		return (set_errno(EINVAL));
+	}
+
+	if (pid == 0) {
+		s_pid = curproc->p_pid;
+		s_tid = curthread->t_tid;
+	} else if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) == -1) {
+		return (set_errno(ESRCH));
+	}
+
+	/*
+	 * For now, we only support manipulating threads in the
+	 * same process.
+	 */
+	if (curproc->p_pid != s_pid)
+		return (set_errno(EPERM));
+
+	/*
+	 * We must hold the process lock so that the thread list
+	 * doesn't change while we're looking at it. We'll hold
+	 * the lock until we no longer reference the
+	 * corresponding lwp.
+	 */
+	mutex_enter(&curproc->p_lock);
+
+	do {
+		if (t->t_tid == s_tid)
+			break;
+		t = t->t_forw;
+	} while (t != curthread);
+
+	/*
+	 * The pid lookup above ran before p_lock was taken, so the thread
+	 * list may have changed since.  If the walk wrapped around without a
+	 * match, fail with ESRCH instead of silently operating on curthread.
+	 */
+	if (t->t_tid != s_tid) {
+		mutex_exit(&curproc->p_lock);
+		return (set_errno(ESRCH));
+	}
+
+	lx_lwp = t->t_lwp->lwp_brand;
+
+	if (cmd == B_SET_AFFINITY_MASK) {
+		if (copyin_nowatch((void *)maskp, &lx_lwp->br_affinitymask,
+		    sizeof (lx_affmask_t)) != 0) {
+			mutex_exit(&curproc->p_lock);
+			return (set_errno(EFAULT));
+		}
+
+		*rval = 0;
+	} else {
+		if (copyout_nowatch(&lx_lwp->br_affinitymask, (void *)maskp,
+		    sizeof (lx_affmask_t)) != 0) {
+			mutex_exit(&curproc->p_lock);
+			return (set_errno(EFAULT));
+		}
+
+		*rval = sizeof (lx_affmask_t);
+	}
+
+	mutex_exit(&curproc->p_lock);
+	return (0);
+}
+
+/*
+ * Emulation of Linux sched_setscheduler(2) on top of priocntl.
+ *
+ * pid    - Linux pid of the target (negative is ESRCH); translated into a
+ *          procset by sched_setprocset().
+ * policy - LX_SCHED_FIFO, LX_SCHED_RR or LX_SCHED_OTHER.  A negative value
+ *          means the policy is derived from the target's current scheduling
+ *          class ("TS" or "RT") instead.
+ * param  - user pointer to a struct lx_sched_param holding the priority.
+ *
+ * LX_SCHED_FIFO and LX_SCHED_RR select the "RT" class with an RT_TQINF or
+ * RT_TQDEF time quantum respectively; LX_SCHED_OTHER selects the "TS" class.
+ * After each intermediate priocntl call, a non-zero lwp_errno is returned
+ * as-is.
+ */
+long
+lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	procset_t procset;
+	procset_t procset_cid;
+	pcparms_t pcparm;
+	pcinfo_t pcinfo;
+	struct lx_sched_param sched_param;
+	tsparms_t *tsp;
+	int prio, maxupri;
+	int rv;
+
+	if (pid < 0)
+		return (set_errno(ESRCH));
+
+	if ((rv = sched_setprocset(&procset, pid)))
+		return (rv);
+
+	if (copyin(param, &sched_param, sizeof (sched_param)))
+		return (set_errno(EFAULT));
+
+	prio = sched_param.lx_sched_prio;
+
+	if (policy < 0) {
+		/*
+		 * get the class id
+		 */
+		pcparm.pc_cid = PC_CLNULL;
+		(void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		/*
+		 * get the current policy
+		 */
+		bzero(&pcinfo, sizeof (pcinfo));
+		pcinfo.pc_cid = pcparm.pc_cid;
+		(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		if (strcmp(pcinfo.pc_clname, "TS") == 0) {
+			policy = LX_SCHED_OTHER;
+		} else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+			policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+			    RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+		} else {
+			return (set_errno(EINVAL));
+		}
+	}
+
+	/*
+	 * Build the new class parameters from scratch.  procset_cid is only
+	 * used for the PC_GETCID class-name lookups below.
+	 */
+	bzero(&pcinfo, sizeof (pcinfo));
+	bzero(&pcparm, sizeof (pcparm));
+	setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0);
+	switch (policy) {
+	case LX_SCHED_FIFO:
+	case LX_SCHED_RR:
+		(void) strcpy(pcinfo.pc_clname, "RT");
+		(void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		/* RT priorities must lie in [0, rt_maxpri]. */
+		if (prio < 0 ||
+		    prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri)
+			return (set_errno(EINVAL));
+		pcparm.pc_cid = pcinfo.pc_cid;
+		((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+		((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
+		    policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+		break;
+
+	case LX_SCHED_OTHER:
+		(void) strcpy(pcinfo.pc_clname, "TS");
+		(void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		/* TS user priorities must lie in [-ts_maxupri, ts_maxupri]. */
+		maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri;
+		if (prio > maxupri || prio < -maxupri)
+			return (set_errno(EINVAL));
+
+		pcparm.pc_cid = pcinfo.pc_cid;
+		tsp = (tsparms_t *)pcparm.pc_clparms;
+		tsp->ts_upri = prio;
+		tsp->ts_uprilim = TS_NOCHANGE;
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * finally set scheduling policy and parameters
+	 *
+	 * NOTE(review): unlike the calls above, lwp_errno is not inspected
+	 * after this one and 0 is returned unconditionally.
+	 */
+	(void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm);
+
+	return (0);
+}
+
+/*
+ * Emulation of Linux sched_getscheduler(2): report the Linux policy that
+ * corresponds to the target's native scheduling class.
+ *
+ * "TS" maps to LX_SCHED_OTHER; "RT" maps to LX_SCHED_FIFO when the time
+ * quantum is RT_TQINF and to LX_SCHED_RR otherwise.  Any other class yields
+ * EINVAL.  A negative pid yields ESRCH.
+ */
+long
+lx_sched_getscheduler(l_pid_t pid)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	procset_t procset;
+	pcparms_t pcparm;
+	pcinfo_t pcinfo;
+	rtparms_t *rtp;
+	int rv;
+
+	if (pid < 0)
+		return (set_errno(ESRCH));
+
+	rv = sched_setprocset(&procset, pid);
+	if (rv != 0)
+		return (rv);
+
+	/* Fetch the target's class id together with its class parameters. */
+	pcparm.pc_cid = PC_CLNULL;
+	(void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	/* Look up the class info (including its name) for that class id. */
+	bzero(&pcinfo, sizeof (pcinfo));
+	pcinfo.pc_cid = pcparm.pc_cid;
+	(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	if (strcmp(pcinfo.pc_clname, "TS") == 0)
+		return (LX_SCHED_OTHER);
+
+	if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+		rtp = (rtparms_t *)pcparm.pc_clparms;
+		if (rtp->rt_tqnsecs == RT_TQINF)
+			return (LX_SCHED_FIFO);
+		return (LX_SCHED_RR);
+	}
+
+	return (set_errno(EINVAL));
+}
+
+/*
+ * Emulation of Linux sched_setparam(2) on top of priocntl: change the
+ * priority of the target while keeping its current policy.
+ *
+ * pid   - Linux pid of the target (negative is ESRCH); translated into a
+ *         procset by sched_setprocset().
+ * param - user pointer to a struct lx_sched_param holding the new priority.
+ *
+ * The current policy is derived from the target's scheduling class ("TS" or
+ * "RT"; anything else is EINVAL) and the priority is then range-checked and
+ * applied exactly as in lx_sched_setscheduler().
+ */
+long
+lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	procset_t procset;
+	procset_t procset_cid;
+	pcparms_t pcparm;
+	pcinfo_t pcinfo;
+	struct lx_sched_param sched_param;
+	tsparms_t *tsp;
+	int policy;
+	int prio, maxupri;
+	int rv;
+
+	if (pid < 0)
+		return (set_errno(ESRCH));
+
+	if ((rv = sched_setprocset(&procset, pid)))
+		return (rv);
+
+	if (copyin(param, &sched_param, sizeof (sched_param)))
+		return (set_errno(EFAULT));
+
+	prio = sched_param.lx_sched_prio;
+
+	/*
+	 * get the class id
+	 */
+	pcparm.pc_cid = PC_CLNULL;
+	(void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+	if (lwp->lwp_errno)
+		return (lwp->lwp_errno);
+
+	/*
+	 * get the current policy
+	 */
+	bzero(&pcinfo, sizeof (pcinfo));
+	pcinfo.pc_cid = pcparm.pc_cid;
+	(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+	if (lwp->lwp_errno)
+		return (lwp->lwp_errno);
+
+	if (strcmp(pcinfo.pc_clname, "TS") == 0)
+		policy = LX_SCHED_OTHER;
+	else if (strcmp(pcinfo.pc_clname, "RT") == 0)
+		policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs ==
+		    RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR;
+	else
+		return (set_errno(EINVAL));
+
+	/*
+	 * Build the new class parameters from scratch.  procset_cid is only
+	 * used for the PC_GETCID class-name lookups below.
+	 */
+	bzero(&pcinfo, sizeof (pcinfo));
+	bzero(&pcparm, sizeof (pcparm));
+	setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0);
+	switch (policy) {
+	case LX_SCHED_FIFO:
+	case LX_SCHED_RR:
+		(void) strcpy(pcinfo.pc_clname, "RT");
+		(void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		/* RT priorities must lie in [0, rt_maxpri]. */
+		if (prio < 0 ||
+		    prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri)
+			return (set_errno(EINVAL));
+		pcparm.pc_cid = pcinfo.pc_cid;
+		((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
+		((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs =
+		    policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF;
+		break;
+
+	case LX_SCHED_OTHER:
+		(void) strcpy(pcinfo.pc_clname, "TS");
+		(void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo);
+		if (lwp->lwp_errno)
+			return (lwp->lwp_errno);
+
+		/* TS user priorities must lie in [-ts_maxupri, ts_maxupri]. */
+		maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri;
+		if (prio > maxupri || prio < -maxupri)
+			return (set_errno(EINVAL));
+
+		pcparm.pc_cid = pcinfo.pc_cid;
+		tsp = (tsparms_t *)pcparm.pc_clparms;
+		tsp->ts_upri = prio;
+		tsp->ts_uprilim = TS_NOCHANGE;
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * finally set scheduling policy and parameters
+	 *
+	 * NOTE(review): unlike the calls above, lwp_errno is not inspected
+	 * after this one and 0 is returned unconditionally.
+	 */
+	(void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm);
+
+	return (0);
+}
+
+/*
+ * Emulation of Linux sched_getparam(2): copy the target's priority out to the
+ * user's struct lx_sched_param.
+ *
+ * For the "TS" class the user priority is rescaled as
+ * -(ts_upri * 20) / ts_maxupri (0 when ts_maxupri is 0); for the "RT" class
+ * rt_pri is reported unchanged.  Any other class yields EINVAL, a negative
+ * pid yields ESRCH and a failed copyout yields EFAULT.
+ */
+long
+lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	struct lx_sched_param local_param;
+	procset_t procset;
+	pcparms_t pcparm;
+	pcinfo_t pcinfo;
+	int rv;
+
+	if (pid < 0)
+		return (set_errno(ESRCH));
+
+	rv = sched_setprocset(&procset, pid);
+	if (rv != 0)
+		return (rv);
+
+	/* Fetch the target's class id together with its class parameters. */
+	pcparm.pc_cid = PC_CLNULL;
+	(void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	/* Look up the class info (name and limits) for that class id. */
+	bzero(&pcinfo, sizeof (pcinfo));
+	pcinfo.pc_cid = pcparm.pc_cid;
+	(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	bzero(&local_param, sizeof (local_param));
+	if (strcmp(pcinfo.pc_clname, "TS") == 0) {
+		int upri = ((tsparms_t *)pcparm.pc_clparms)->ts_upri;
+		int maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri;
+
+		/* Guard the division against a zero scale. */
+		local_param.lx_sched_prio =
+		    (maxupri == 0) ? 0 : -(upri * 20) / maxupri;
+	} else if (strcmp(pcinfo.pc_clname, "RT") == 0) {
+		local_param.lx_sched_prio =
+		    ((rtparms_t *)pcparm.pc_clparms)->rt_pri;
+	} else {
+		return (set_errno(EINVAL));
+	}
+
+	if (copyout(&local_param, param, sizeof (local_param)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * Emulation of Linux sched_rr_get_interval(2): copy out the target's time
+ * quantum as a struct timespec.
+ *
+ * Succeeds only when the target is in the "RT" class and its quantum is not
+ * RT_TQINF; every other case yields EINVAL.  A negative pid yields ESRCH and
+ * a failed copyout yields EFAULT.
+ */
+long
+lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	struct timespec interval;
+	procset_t procset;
+	pcparms_t pcparm;
+	pcinfo_t pcinfo;
+	rtparms_t *rtp;
+	int rv;
+
+	if (pid < 0)
+		return (set_errno(ESRCH));
+
+	rv = sched_setprocset(&procset, pid);
+	if (rv != 0)
+		return (rv);
+
+	/* Fetch the target's class id together with its class parameters. */
+	pcparm.pc_cid = PC_CLNULL;
+	(void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	/* Look up the class id of "RT" so it can be compared below. */
+	setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0);
+	bzero(&pcinfo, sizeof (pcinfo));
+	(void) strcpy(pcinfo.pc_clname, "RT");
+	(void) do_priocntlsys(PC_GETCID, &procset, &pcinfo);
+	if (lwp->lwp_errno != 0)
+		return (lwp->lwp_errno);
+
+	rtp = (rtparms_t *)pcparm.pc_clparms;
+	if (pcparm.pc_cid != pcinfo.pc_cid || rtp->rt_tqnsecs == RT_TQINF)
+		return (set_errno(EINVAL));
+
+	interval.tv_sec = rtp->rt_tqsecs;
+	interval.tv_nsec = rtp->rt_tqnsecs;
+
+	if (copyout(&interval, ival, sizeof (interval)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * Fill in *procset so that it names the scheduling target for a Linux pid.
+ *
+ * pid == 0 selects P_PID/P_MYID.  Any other pid is translated with
+ * lx_lpid_to_spair() and must resolve to the calling process, in which case
+ * the resulting lwp id is selected via P_LWPID.
+ *
+ * Returns 0 on success, or set_errno(ESRCH) if the pid cannot be translated
+ * or belongs to another process.
+ */
+int
+sched_setprocset(procset_t *procset, l_pid_t pid)
+{
+	id_t lwpid;
+
+	if (pid == 0) {
+		setprocset(procset, POP_AND, P_PID, P_MYID, P_ALL, 0);
+		return (0);
+	}
+
+	/* The translation overwrites pid with the native process id. */
+	if (lx_lpid_to_spair(pid, &pid, &lwpid) < 0)
+		return (set_errno(ESRCH));
+	if (pid != curproc->p_pid)
+		return (set_errno(ESRCH));
+
+	setprocset(procset, POP_AND, P_LWPID, lwpid, P_ALL, 0);
+	return (0);
+}
+
+/*
+ * Thin wrapper around priocntl_common() for in-kernel callers: passes
+ * PC_VERSION, a zero second argument and UIO_SYSSPACE, so 'arg' is treated
+ * as a kernel address.
+ */
+long
+do_priocntlsys(int cmd, procset_t *procset, void *arg)
+{
+	caddr_t karg = (caddr_t)arg;
+
+	return (priocntl_common(PC_VERSION, procset, cmd, karg, 0,
+	    UIO_SYSSPACE));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_socket.c b/usr/src/uts/common/brand/lx/syscall/lx_socket.c
new file mode 100644
index 0000000000..e8e9714143
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_socket.c
@@ -0,0 +1,3750 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/sockio.h>
+#include <sys/thread.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kmem.h>
+#include <sys/un.h>
+#include <sys/sunddi.h>
+#include <sys/cred.h>
+#include <sys/ucred.h>
+#include <sys/model.h>
+#include <sys/brand.h>
+#include <sys/vmsystm.h>
+#include <sys/limits.h>
+#include <sys/fcntl.h>
+#include <sys/sysmacros.h>
+#include <netpacket/packet.h>
+#include <sockcommon.h>
+#include <socktpi_impl.h>
+#include <netinet/udp.h>
+#include <sys/sdt.h>
+#include <netinet/tcp.h>
+#include <netinet/igmp.h>
+#include <netinet/icmp6.h>
+#include <lx_errno.h>
+
+#include <sys/lx_brand.h>
+#include <sys/lx_socket.h>
+#include <sys/lx_types.h>
+#include <sys/lx_impl.h>
+
+
+/*
+ * Credential triple (pid, uid, gid) using the lx_uid_t/lx_gid_t types.
+ * NOTE(review): presumably the Linux-layout payload for credential passing
+ * over sockets; its users are outside this view -- confirm.
+ */
+typedef struct lx_ucred {
+	pid_t		lxu_pid;
+	lx_uid_t	lxu_uid;
+	lx_gid_t	lxu_gid;
+} lx_ucred_t;
+
+/*
+ * Auxiliary lx-specific state kept for a socket.
+ * NOTE(review): presumably stored as vnode-specific data under lx_socket_vsd
+ * and obtained through lx_sad_acquire(); the code doing so is outside this
+ * view -- confirm.
+ */
+typedef struct lx_socket_aux_data
+{
+	kmutex_t lxsad_lock;	/* NOTE(review): presumably guards the fields */
+	enum lxsad_status_t {
+		LXSS_NONE = 0,
+		LXSS_CONNECTING,
+		LXSS_CONNECTED
+	} lxsad_status;
+	boolean_t lxsad_stream_cred;
+} lx_socket_aux_data_t;
+
+static lx_socket_aux_data_t *lx_sad_acquire(vnode_t *);
+
+/* VSD key for lx-specific socket information */
+static uint_t lx_socket_vsd = 0;
+
+/*
+ * Convenience enum to enforce translation direction.  The name reads
+ * "source_TO_destination": SUNOS_TO_LX converts native values to Linux
+ * values, LX_TO_SUNOS converts Linux values to native values.
+ */
+typedef enum lx_xlate_dir {
+	SUNOS_TO_LX,
+	LX_TO_SUNOS
+} lx_xlate_dir_t;
+
+/*
+ * enum for getpeername/getsockname handling
+ * NOTE(review): presumably selects which of the two names a shared handler
+ * returns; that handler is outside this view.
+ */
+typedef enum lx_getname_type {
+	LX_GETPEERNAME,
+	LX_GETSOCKNAME
+} lx_getname_type_t;
+
+/*
+ * What follows are a series of tables we use to translate Linux constants
+ * into equivalent Illumos constants and back again.  I wish this were
+ * cleaner, more programmatic, and generally nicer.  Sadly, life is messy,
+ * and Unix networking even more so.
+ */
+/*
+ * Linux -> illumos address family, indexed by the Linux LX_AF_* value.
+ * AF_NOTSUPPORTED marks families with no native equivalent.
+ *
+ * NOTE(review): the families named in the trailing comments have no
+ * initializer.  If LX_AF_MAX reaches that far, those slots are implicitly
+ * zero-initialized rather than set to AF_NOTSUPPORTED -- confirm that this
+ * is intended.
+ */
+static const int ltos_family[LX_AF_MAX + 1] =  {
+	AF_UNSPEC,		/* LX_AF_UNSPEC		*/
+	AF_UNIX,		/* LX_AF_UNIX		*/
+	AF_INET,		/* LX_AF_INET		*/
+	AF_NOTSUPPORTED,	/* LX_AF_AX25		*/
+	AF_NOTSUPPORTED,	/* LX_AF_IPX		*/
+	AF_NOTSUPPORTED,	/* LX_AF_APPLETALK	*/
+	AF_NOTSUPPORTED,	/* LX_AF_NETROM		*/
+	AF_NOTSUPPORTED,	/* LX_AF_BRIDGE		*/
+	AF_NOTSUPPORTED,	/* LX_AF_ATMPVC		*/
+	AF_NOTSUPPORTED,	/* LX_AF_X25		*/
+	AF_INET6,		/* LX_AF_INET6		*/
+	AF_NOTSUPPORTED,	/* LX_AF_ROSE		*/
+	AF_NOTSUPPORTED,	/* LX_AF_DECNET		*/
+	AF_NOTSUPPORTED,	/* LX_AF_NETBEUI	*/
+	AF_NOTSUPPORTED,	/* LX_AF_SECURITY	*/
+	AF_NOTSUPPORTED,	/* LX_AF_KEY		*/
+	AF_LX_NETLINK,		/* LX_AF_NETLINK	*/
+	AF_PACKET,		/* LX_AF_PACKET		*/
+	AF_NOTSUPPORTED,	/* LX_AF_ASH		*/
+	AF_NOTSUPPORTED,	/* LX_AF_ECONET		*/
+	AF_NOTSUPPORTED,	/* LX_AF_ATMSVC		*/
+	AF_NOTSUPPORTED,	/* LX_AF_RDS		*/
+	AF_NOTSUPPORTED,	/* LX_AF_SNA		*/
+	AF_NOTSUPPORTED,	/* LX_AF_IRDA		*/
+	AF_NOTSUPPORTED,	/* LX_AF_PPOX		*/
+	AF_NOTSUPPORTED,	/* LX_AF_WANPIPE	*/
+	AF_NOTSUPPORTED,	/* LX_AF_LLC		*/
+	AF_NOTSUPPORTED,	/* EMPTY		*/
+	AF_NOTSUPPORTED,	/* EMPTY		*/
+	AF_NOTSUPPORTED,	/* LX_AF_CAN		*/
+	AF_NOTSUPPORTED,	/* LX_AF_TIPC		*/
+	AF_NOTSUPPORTED,	/* LX_AF_BLUETOOTH	*/
+	AF_NOTSUPPORTED,	/* LX_AF_IUCV		*/
+	AF_NOTSUPPORTED		/* LX_AF_RXRPC		*/
+				/* LX_AF_ISDN		*/
+				/* LX_AF_PHONET		*/
+				/* LX_AF_IEEE802154	*/
+				/* LX_AF_CAIF		*/
+				/* LX_AF_ALG		*/
+				/* LX_AF_NFC		*/
+				/* LX_AF_VSOCK		*/
+};
+
+/*
+ * illumos -> Linux address family, indexed by the native AF_* value named in
+ * each entry's comment.  AF_NOTSUPPORTED marks families with no Linux
+ * equivalent here.
+ *
+ * NOTE(review): the array is sized by LX_AF_MAX although it is indexed by
+ * native family numbers; confirm that LX_AF_MAX is at least AF_LX_NETLINK.
+ */
+static const int stol_family[LX_AF_MAX + 1] =  {
+	AF_UNSPEC,		/* AF_UNSPEC		*/
+	AF_UNIX,		/* AF_UNIX		*/
+	AF_INET,		/* AF_INET		*/
+	AF_NOTSUPPORTED,	/* AF_IMPLINK		*/
+	AF_NOTSUPPORTED,	/* AF_PUP		*/
+	AF_NOTSUPPORTED,	/* AF_CHAOS		*/
+	AF_NOTSUPPORTED,	/* AF_NS		*/
+	AF_NOTSUPPORTED,	/* AF_NBS		*/
+	AF_NOTSUPPORTED,	/* AF_ECMA		*/
+	AF_NOTSUPPORTED,	/* AF_DATAKIT		*/
+	AF_NOTSUPPORTED,	/* AF_CCITT		*/
+	AF_NOTSUPPORTED,	/* AF_SNA		*/
+	AF_NOTSUPPORTED,	/* AF_DECNET		*/
+	AF_NOTSUPPORTED,	/* AF_DLI		*/
+	AF_NOTSUPPORTED,	/* AF_LAT		*/
+	AF_NOTSUPPORTED,	/* AF_HYLINK		*/
+	AF_NOTSUPPORTED,	/* AF_APPLETALK		*/
+	AF_NOTSUPPORTED,	/* AF_NIT		*/
+	AF_NOTSUPPORTED,	/* AF_802		*/
+	AF_NOTSUPPORTED,	/* AF_OSI		*/
+	AF_NOTSUPPORTED,	/* AF_X25		*/
+	AF_NOTSUPPORTED,	/* AF_OSINET		*/
+	AF_NOTSUPPORTED,	/* AF_GOSIP		*/
+	AF_NOTSUPPORTED,	/* AF_IPX		*/
+	AF_NOTSUPPORTED,	/* AF_ROUTE		*/
+	AF_NOTSUPPORTED,	/* AF_LINK		*/
+	LX_AF_INET6,		/* AF_INET6		*/
+	AF_NOTSUPPORTED,	/* AF_KEY		*/
+	AF_NOTSUPPORTED,	/* AF_NCA		*/
+	AF_NOTSUPPORTED,	/* AF_POLICY		*/
+	AF_NOTSUPPORTED,	/* AF_INET_OFFLOAD	*/
+	AF_NOTSUPPORTED,	/* AF_TRILL		*/
+	LX_AF_PACKET,		/* AF_PACKET		*/
+	LX_AF_NETLINK		/* AF_LX_NETLINK	*/
+};
+
+/*
+ * Bounds-checked lookups in ltos_family[] / stol_family[]; values above
+ * LX_AF_MAX yield AF_INVAL.  Only the upper bound is checked, so the argument
+ * must not be negative.  The argument is evaluated more than once.
+ */
+#define	LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL)
+#define	STOL_FAMILY(d) ((d) <= LX_AF_MAX ? stol_family[(d)] : AF_INVAL)
+
+
+/*
+ * Linux -> illumos socket type, indexed by the Linux type value up to and
+ * including LX_SOCK_PACKET.  SOCK_NOTSUPPORTED marks unsupported types.
+ */
+static const int ltos_socktype[LX_SOCK_PACKET + 1] = {
+	SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW,
+	SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED,
+	SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED
+};
+
+/*
+ * illumos -> Linux socket type, indexed by the native type value up to and
+ * including SOCK_SEQPACKET.
+ */
+static const int stol_socktype[SOCK_SEQPACKET + 1] = {
+	SOCK_NOTSUPPORTED, LX_SOCK_DGRAM, LX_SOCK_STREAM, SOCK_NOTSUPPORTED,
+	LX_SOCK_RAW, LX_SOCK_RDM, LX_SOCK_SEQPACKET
+};
+
+/*
+ * Bounds-checked socket type translation.  LTOS_SOCKTYPE maps a Linux type to
+ * the native one through ltos_socktype[]; STOL_SOCKTYPE maps a native type to
+ * the Linux one through stol_socktype[].  Values past the end of the
+ * respective table yield SOCK_INVAL.  Only the upper bound is checked, so the
+ * argument must not be negative; it is evaluated more than once.
+ */
+#define	LTOS_SOCKTYPE(t)	\
+	((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL)
+#define	STOL_SOCKTYPE(t)	\
+	((t) <= SOCK_SEQPACKET ? stol_socktype[(t)] : SOCK_INVAL)
+
+
+/*
+ * This string is used to prefix all abstract namespace Unix sockets, ie all
+ * abstract namespace sockets are converted to regular sockets in the /tmp
+ * directory with .ABSK_ prefixed to their names.
+ */
+#define	ABST_PRFX "/tmp/.ABSK_"
+#define	ABST_PRFX_LEN (sizeof (ABST_PRFX) - 1)
+
+#define	DATAFILT	"datafilt"
+
+/*
+ * Classification of a socket address.
+ * NOTE(review): presumably lxa_abstract denotes an abstract-namespace
+ * AF_UNIX address and lxa_devlog a /dev/log address; the users of this enum
+ * are outside this view -- confirm.
+ */
+typedef enum {
+	lxa_none,
+	lxa_abstract,
+	lxa_devlog
+} lx_addr_type_t;
+
+/*
+ * Translate a Linux packet-socket protocol number, passed through ntohs()
+ * first, into the native ETH_P_* value.  LX_ETH_P_ALL and LX_ETH_P_802_3
+ * both map to ETH_P_ALL.  Returns -1 for any protocol without a mapping.
+ */
+static int
+ltos_pkt_proto(int protocol)
+{
+	int native;
+
+	switch (ntohs(protocol)) {
+	case LX_ETH_P_802_2:
+		native = ETH_P_802_2;
+		break;
+	case LX_ETH_P_IP:
+		native = ETH_P_IP;
+		break;
+	case LX_ETH_P_ARP:
+		native = ETH_P_ARP;
+		break;
+	case LX_ETH_P_IPV6:
+		native = ETH_P_IPV6;
+		break;
+	case LX_ETH_P_ALL:
+	case LX_ETH_P_802_3:
+		native = ETH_P_ALL;
+		break;
+	default:
+		native = -1;
+		break;
+	}
+
+	return (native);
+}
+
+
+/*
+ * One row of the socket message-flag translation table consumed by
+ * lx_xlate_sock_flags().
+ */
+typedef struct lx_flag_map {
+	enum {
+		LXFM_MAP,	/* translate between the two flag values */
+		LXFM_IGNORE,	/* accept the flag but emit nothing for it */
+		LXFM_UNSUP	/* report the flag via lx_unsupported() */
+	} lxfm_action;
+	int lxfm_sunos_flag;	/* native flag bit(s) */
+	int lxfm_linux_flag;	/* Linux flag bit(s) */
+	char *lxfm_name;	/* name printed for LXFM_UNSUP rows */
+} lx_flag_map_t;
+
+/*
+ * Socket message-flag translation table.  Columns are: action, native flag,
+ * Linux flag, name (used only for LXFM_UNSUP diagnostics).
+ *
+ * Flags that exist only on Linux carry 0 in the native column, so they can
+ * only ever match in the LX_TO_SUNOS direction.
+ */
+static lx_flag_map_t lx_flag_map_tbl[] = {
+	{ LXFM_MAP,	MSG_OOB,		LX_MSG_OOB,		NULL },
+	{ LXFM_MAP,	MSG_PEEK,		LX_MSG_PEEK,		NULL },
+	{ LXFM_MAP,	MSG_DONTROUTE,		LX_MSG_DONTROUTE,	NULL },
+	{ LXFM_MAP,	MSG_CTRUNC,		LX_MSG_CTRUNC,		NULL },
+	{ LXFM_MAP,	MSG_TRUNC,		LX_MSG_TRUNC,		NULL },
+	{ LXFM_MAP,	MSG_DONTWAIT,		LX_MSG_DONTWAIT,	NULL },
+	{ LXFM_MAP,	MSG_EOR,		LX_MSG_EOR,		NULL },
+	{ LXFM_MAP,	MSG_WAITALL,		LX_MSG_WAITALL,		NULL },
+	/* MSG_CONFIRM is safe to ignore */
+	{ LXFM_IGNORE,	0,			LX_MSG_CONFIRM,		NULL },
+	/*
+	 * The NOSIGNAL and CMSG_CLOEXEC flags are handled by the emulation
+	 * outside of the flag-conversion routine.
+	 */
+	{ LXFM_IGNORE,	0,			LX_MSG_NOSIGNAL,	NULL },
+	{ LXFM_IGNORE,	0,			LX_MSG_CMSG_CLOEXEC,	NULL },
+	/*
+	 * Linux-only flags with no native counterpart.  The Linux value must
+	 * sit in the Linux column, otherwise it is never matched when
+	 * translating Linux flags.
+	 */
+	{ LXFM_UNSUP,	0,	LX_MSG_PROXY,		"MSG_PROXY" },
+	{ LXFM_UNSUP,	0,	LX_MSG_FIN,		"MSG_FIN" },
+	{ LXFM_UNSUP,	0,	LX_MSG_SYN,		"MSG_SYN" },
+	{ LXFM_UNSUP,	0,	LX_MSG_RST,		"MSG_RST" },
+	{ LXFM_UNSUP,	0,	LX_MSG_ERRQUEUE,	"MSG_ERRQUEUE" },
+	{ LXFM_UNSUP,	0,	LX_MSG_MORE,		"MSG_MORE" },
+	{ LXFM_UNSUP,	0,	LX_MSG_WAITFORONE,	"MSG_WAITFORONE" },
+	{ LXFM_UNSUP,	0,	LX_MSG_FASTOPEN,	"MSG_FASTOPEN" },
+};
+
+#define	LX_FLAG_MAP_MAX	\
+	(sizeof (lx_flag_map_tbl) / sizeof (lx_flag_map_tbl[0]))
+
+#define	LX_UNSUP_BUFSZ	64
+
+/*
+ * Translate a set of socket message flags in the given direction.
+ *
+ * inflags - flags in the source representation.
+ * dir     - SUNOS_TO_LX or LX_TO_SUNOS (anything else trips the VERIFY).
+ *
+ * Returns the flags in the destination representation.  LXFM_IGNORE flags
+ * are dropped silently.  LXFM_UNSUP flags are dropped and reported by name
+ * through lx_unsupported().  Any bits left over that the table does not know
+ * about are reported once, as a hexadecimal mask.
+ */
+static int
+lx_xlate_sock_flags(int inflags, lx_xlate_dir_t dir)
+{
+	int i, outflags = 0;
+	char buf[LX_UNSUP_BUFSZ];
+
+	VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS);
+
+	for (i = 0; i < LX_FLAG_MAP_MAX; i++) {
+		lx_flag_map_t *map = &lx_flag_map_tbl[i];
+		int match, out;
+
+		if (dir == SUNOS_TO_LX) {
+			match = inflags & map->lxfm_sunos_flag;
+			out = map->lxfm_linux_flag;
+		} else {
+			match = inflags & map->lxfm_linux_flag;
+			out = map->lxfm_sunos_flag;
+		}
+		if (match == 0)
+			continue;
+
+		/*
+		 * Every flag the table recognizes is consumed here, so that
+		 * only genuinely unknown bits reach the catch-all report
+		 * after the loop.
+		 */
+		inflags &= ~(match);
+
+		switch (map->lxfm_action) {
+		case LXFM_MAP:
+			outflags |= out;
+			break;
+		case LXFM_IGNORE:
+			break;
+		case LXFM_UNSUP:
+			(void) snprintf(buf, LX_UNSUP_BUFSZ,
+			    "unsupported sock flag %s", map->lxfm_name);
+			lx_unsupported(buf);
+			break;
+		}
+	}
+	if (inflags != 0) {
+		(void) snprintf(buf, LX_UNSUP_BUFSZ,
+		    "unsupported sock flags 0x%08x", inflags);
+		lx_unsupported(buf);
+	}
+
+	return (outflags);
+}
+
+/*
+ * Kind of AF_UNIX address, as reported by ltos_sockaddr_ux().
+ */
+typedef enum lx_sun_type {
+	LX_SUN_NORMAL,		/* path does not start with a NUL byte */
+	LX_SUN_ABSTRACT,	/* path starts with a NUL byte */
+} lx_sun_type_t;
+
+/*
+ * Convert an AF_UNIX socket address that is already in kernel memory from
+ * its Linux form to the form used natively.
+ *
+ * inaddr/inlen   - the Linux address and its total length.  The path portion
+ *                  must be between 1 and sizeof (sun_path) bytes long; this
+ *                  is enforced with VERIFY.
+ * outaddr/outlen - receive a kmem_alloc()ed sockaddr_un and its length, sized
+ *                  to the NUL-terminated path.  This function does not free
+ *                  the allocation.
+ * sun_type       - if non-NULL, set to LX_SUN_ABSTRACT when the path begins
+ *                  with a NUL byte and to LX_SUN_NORMAL otherwise.
+ */
+static void
+ltos_sockaddr_ux(const struct sockaddr *inaddr, const socklen_t inlen,
+    struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type)
+{
+	struct sockaddr_un buf;
+	/* Calculate size of (sun_family + any padding) in sockaddr */
+	int sizediff = (sizeof (buf) - sizeof (buf.sun_path));
+	int len = inlen - sizediff;
+
+	VERIFY(len > 0);
+	VERIFY(len <= sizeof (buf.sun_path));
+	bzero(&buf, sizeof (buf));
+
+	if (inaddr->sa_data[0] == '\0') {
+		/*
+		 * Linux supports abstract Unix sockets, which are simply
+		 * sockets that do not exist on the file system.  These sockets
+		 * are denoted by beginning the path with a NUL character. To
+		 * support these, we strip out the leading NUL character and
+		 * change the path to point to a real place in /tmp directory,
+		 * by prepending ABST_PRFX and replacing all illegal characters
+		 * with '_'.
+		 *
+		 * Since these sockets are supposed to exist outside the
+		 * filesystem, they must be cleaned up after use.  This removal
+		 * is performed during bind().
+		 */
+		int idx, odx;
+
+		/* Add our abstract prefix */
+		(void) strcpy(buf.sun_path, ABST_PRFX);
+		/* idx starts at 1 to skip the leading NUL of the input. */
+		for (idx = 1, odx = ABST_PRFX_LEN;
+		    idx < len && odx < sizeof (buf.sun_path);
+		    idx++, odx++) {
+			char c = inaddr->sa_data[idx];
+			if (c == '\0' || c == '/') {
+				buf.sun_path[odx] = '_';
+			} else {
+				buf.sun_path[odx] = c;
+			}
+		}
+
+		/*
+		 * Since abstract socket addresses might not be NUL terminated,
+		 * we must explicitly NUL terminate the translated path.
+		 * Care is taken not to overflow the buffer.
+		 */
+		if (odx == sizeof (buf.sun_path)) {
+			buf.sun_path[odx - 1] = '\0';
+		} else {
+			buf.sun_path[odx] = '\0';
+		}
+
+		if (sun_type != NULL) {
+			*sun_type = LX_SUN_ABSTRACT;
+		}
+	} else {
+		/* Copy the address directly, minding termination */
+		(void) strncpy(buf.sun_path, inaddr->sa_data, len);
+		len = strnlen(buf.sun_path, len);
+		if (len == sizeof (buf.sun_path)) {
+			/* Path fills the buffer: the last byte is dropped. */
+			buf.sun_path[len - 1] = '\0';
+		} else {
+			VERIFY(len < sizeof (buf.sun_path));
+			buf.sun_path[len] = '\0';
+		}
+
+		if (sun_type != NULL) {
+			*sun_type = LX_SUN_NORMAL;
+		}
+	}
+	buf.sun_family = AF_UNIX;
+	/* Output length: header, path and its terminating NUL. */
+	*outlen = strlen(buf.sun_path) + 1 + sizediff;
+	VERIFY(*outlen <= sizeof (struct sockaddr_un));
+
+	*outaddr = kmem_alloc(*outlen, KM_SLEEP);
+	bcopy(&buf, *outaddr, *outlen);
+}
+
+/*
+ * Copy in a Linux-native socket address from userspace and convert it into
+ * illumos format.  Returns 0 or an errno value; on success *outaddr holds a
+ * kmem allocation of *outlen bytes which the caller must kmem_free().
+ */
+static long
+ltos_sockaddr_copyin(const struct sockaddr *inaddr, const socklen_t inlen,
+    struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type)
+{
+	sa_family_t family;
+	struct sockaddr *laddr;
+	struct sockaddr_ll *sal;
+	int proto, error = 0;
+
+	if (inlen < sizeof (sa_family_t) ||
+	    inlen > sizeof (struct sockaddr_storage)) {
+		return (EINVAL);
+	}
+
+	/* inaddr is a user address (possibly NULL); copyin() validates it. */
+	laddr = kmem_alloc(inlen, KM_SLEEP);
+	if (copyin(inaddr, laddr, inlen) != 0) {
+		kmem_free(laddr, inlen);
+		return (EFAULT);
+	}
+
+	family = LTOS_FAMILY(laddr->sa_family);
+	switch (family) {
+		case (sa_family_t)AF_NOTSUPPORTED:
+			error = EPROTONOSUPPORT;
+			break;
+
+		case (sa_family_t)AF_INVAL:
+			error = EAFNOSUPPORT;
+			break;
+
+		case AF_UNIX:
+			if (inlen < sizeof (sa_family_t) + 2 ||
+			    inlen > sizeof (struct sockaddr_un)) {
+				error = EINVAL;
+				break;
+			}
+			ltos_sockaddr_ux(laddr, inlen, outaddr, outlen,
+			    sun_type);
+
+			/* ltos_sockaddr_ux allocated *outaddr itself */
+			kmem_free(laddr, inlen);
+			return (0);
+
+		case AF_PACKET:
+			if (inlen < sizeof (struct sockaddr_ll)) {
+				error = EINVAL;
+				break;
+			}
+			*outlen = sizeof (struct sockaddr_ll);
+
+			/* sll_protocol must be translated */
+			sal = (struct sockaddr_ll *)laddr;
+			proto = ltos_pkt_proto(sal->sll_protocol);
+			if (proto < 0) {
+				error = EINVAL;
+			}
+			sal->sll_protocol = proto;
+			break;
+
+		case AF_INET:
+			if (inlen < sizeof (struct sockaddr)) {
+				error = EINVAL;
+				break;
+			}
+			*outlen = sizeof (struct sockaddr);
+			break;
+
+		case AF_INET6:
+			/*
+			 * The illumos sockaddr_in6 has one more 32-bit field
+			 * than the Linux version.  We simply zero that field
+			 * via kmem_zalloc.
+			 */
+			if (inlen < sizeof (lx_sockaddr_in6_t)) {
+				error = EINVAL;
+				break;
+			}
+			*outlen = sizeof (struct sockaddr_in6);
+			*outaddr = (struct sockaddr *)kmem_zalloc(*outlen,
+			    KM_SLEEP);
+			bcopy(laddr, *outaddr, sizeof (lx_sockaddr_in6_t));
+			(*outaddr)->sa_family = AF_INET6;
+			/* AF_INET6 bypasses the standard copy logic */
+			kmem_free(laddr, inlen);
+			return (0);
+
+		default:
+			*outlen = inlen;
+	}
+
+	if (error == 0) {
+		/*
+		 * For most address families, just copying into a sockaddr of
+		 * the correct size and updating sa_family is adequate.
+		 */
+		VERIFY(inlen >= *outlen);
+
+		*outaddr = (struct sockaddr *)kmem_zalloc(*outlen, KM_SLEEP);
+		bcopy(laddr, *outaddr, *outlen);
+		(*outaddr)->sa_family = family;
+	}
+	kmem_free(laddr, inlen);
+	return (error);
+}
+
+/*
+ * Convert an illumos-native socket address to Linux format, copy out at most
+ * 'orig' bytes of it to outaddr and report the converted length via outlenp.
+ */
+static long
+stol_sockaddr_copyout(struct sockaddr *inaddr, socklen_t inlen,
+    struct sockaddr *outaddr, void *outlenp, socklen_t orig)
+{
+	socklen_t size = inlen;
+	struct sockaddr_storage buf;
+	struct sockaddr *bufaddr;
+
+	/*
+	 * Either we were passed a valid sockaddr (with length) or the length
+	 * is set to 0.
+	 */
+	VERIFY(inaddr != NULL || inlen == 0);
+
+	if (inlen == 0) {
+		goto finish;
+	}
+
+
+	switch (inaddr->sa_family) {
+	case AF_INET:
+		if (inlen != sizeof (struct sockaddr)) {
+			return (EINVAL);
+		}
+		break;
+
+	case AF_INET6:
+		if (inlen != sizeof (struct sockaddr_in6)) {
+			return (EINVAL);
+		}
+		/*
+		 * The linux sockaddr_in6 is shorter than illumos.
+		 * Truncate the extra field on the way out.
+		 */
+		size = (sizeof (lx_sockaddr_in6_t));
+		inlen = (sizeof (lx_sockaddr_in6_t));
+		break;
+
+	case AF_UNIX:
+		if (inlen > sizeof (struct sockaddr_un)) {
+			return (EINVAL);
+		}
+
+		/*
+		 * On Linux an empty AF_UNIX address is returned as NULL, which
+		 * means setting the returned length to only encompass the
+		 * address family part of the buffer. However, some code also
+		 * references the address portion of the buffer and uses it,
+		 * even though the returned length has been shortened. Thus, we
+		 * clear the buffer to ensure that the address portion is NULL.
+		 */
+		if (inaddr->sa_data[0] == '\0') {
+			bzero(&buf, sizeof (buf));
+			inlen = sizeof (inaddr->sa_family);
+		}
+		break;
+
+	case (sa_family_t)AF_NOTSUPPORTED:
+		return (EPROTONOSUPPORT);
+
+	case (sa_family_t)AF_INVAL:
+		return (EAFNOSUPPORT);
+
+	default:
+		break;
+	}
+
+	/*
+	 * The input should be smaller than sockaddr_storage, the largest
+	 * sockaddr we support.
+	 */
+	VERIFY(inlen <= sizeof (buf));
+
+	bufaddr = (struct sockaddr *)&buf;
+	bcopy(inaddr, bufaddr, inlen);
+	bufaddr->sa_family = STOL_FAMILY(bufaddr->sa_family);
+
+	/*
+	 * It is possible that userspace passed us a smaller buffer than we
+	 * hope to output.  When this is the case, we will truncate our output
+	 * to the max size of their buffer but report the true size of the
+	 * sockaddr when outputting the outlen value.
+	 */
+	size = (orig < size) ? orig : size;
+
+	if (copyout(bufaddr, outaddr, size) != 0) {
+		return (EFAULT);
+	}
+
+finish:
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		int32_t len32 = (int32_t)inlen;
+		if (copyout(&len32, outlenp, sizeof (len32)) != 0) {
+			return (EFAULT);
+		}
+	} else
+#endif /* defined(_LP64) */
+	{
+		if (copyout(&inlen, outlenp, sizeof (inlen)) != 0) {
+			return (EFAULT);
+		}
+	}
+
+	return (0);
+}
+
+typedef struct lx_cmsg_xlate {
+	int lcx_sunos_level;	/* SunOS cmsg_level */
+	int lcx_sunos_type;	/* SunOS cmsg_type */
+	int (*lcx_stol_conv)(struct cmsghdr *, struct cmsghdr *);
+	int lcx_linux_level;	/* Linux cmsg_level */
+	int lcx_linux_type;	/* Linux cmsg_type */
+	int (*lcx_ltos_conv)(struct cmsghdr *, struct cmsghdr *);
+} lx_cmsg_xlate_t;
+
+static int cmsg_conv_generic(struct cmsghdr *, struct cmsghdr *);
+static int stol_conv_ucred(struct cmsghdr *, struct cmsghdr *);
+static int ltos_conv_ucred(struct cmsghdr *, struct cmsghdr *);
+static int stol_conv_recvttl(struct cmsghdr *, struct cmsghdr *);
+
+/*
+ * Table describing SunOS <-> Linux cmsg translation mappings.
+ * Certain types (IP_RECVTTL) are only converted in one direction, which is
+ * indicated by the translation function for the other direction being NULL.
+ */
+static lx_cmsg_xlate_t lx_cmsg_xlate_tbl[] = {
+	{ SOL_SOCKET, SCM_RIGHTS, cmsg_conv_generic,
+	    LX_SOL_SOCKET, LX_SCM_RIGHTS, cmsg_conv_generic },
+	{ SOL_SOCKET, SCM_UCRED, stol_conv_ucred,
+	    LX_SOL_SOCKET, LX_SCM_CRED, ltos_conv_ucred },
+	{ SOL_SOCKET, SCM_TIMESTAMP, cmsg_conv_generic,
+	    LX_SOL_SOCKET, LX_SCM_TIMESTAMP, cmsg_conv_generic },
+	{ IPPROTO_IP, IP_PKTINFO, cmsg_conv_generic,
+	    LX_IPPROTO_IP, LX_IP_PKTINFO, cmsg_conv_generic },
+	{ IPPROTO_IP, IP_RECVTTL, stol_conv_recvttl,
+	    LX_IPPROTO_IP, LX_IP_TTL, NULL },
+	{ IPPROTO_IP, IP_TTL, cmsg_conv_generic,
+	    LX_IPPROTO_IP, LX_IP_TTL, cmsg_conv_generic },
+	{ IPPROTO_IPV6, IPV6_HOPLIMIT, cmsg_conv_generic,
+	    LX_IPPROTO_IPV6, LX_IPV6_HOPLIMIT, cmsg_conv_generic },
+	{ IPPROTO_IPV6, IPV6_PKTINFO, cmsg_conv_generic,
+	    LX_IPPROTO_IPV6, LX_IPV6_PKTINFO, cmsg_conv_generic }
+};
+
+#define	LX_MAX_CMSG_XLATE	\
+	(sizeof (lx_cmsg_xlate_tbl) / sizeof (lx_cmsg_xlate_tbl[0]))
+
+#if defined(_LP64)
+
+typedef struct {
+	int64_t	cmsg_len;
+	int32_t	cmsg_level;
+	int32_t	cmsg_type;
+} lx_cmsghdr64_t;
+
+/* The alignment/padding for 64bit Linux cmsghdr is not the same. */
+#define	LX_CMSG64_ALIGNMENT	8
+#define	ISALIGNED_LX_CMSG64(addr)					\
+	(((uintptr_t)(addr) & (LX_CMSG64_ALIGNMENT - 1)) == 0)
+#define	ROUNDUP_LX_CMSG64_LEN(len)					\
+	(((len) + LX_CMSG64_ALIGNMENT - 1) & ~(LX_CMSG64_ALIGNMENT - 1))
+
+#define	LX_CMSG64_IS_ALIGNED(m)			\
+	(((uintptr_t)(m) & (_CMSG_DATA_ALIGNMENT - 1)) == 0)
+#define	LX_CMSG64_DATA(c)	((unsigned char *)(((lx_cmsghdr64_t *)(c)) + 1))
+/*
+ * LX_CMSG64_VALID is closely derived from CMSG_VALID with one particularly
+ * important addition.  Since cmsg_len is a signed 64-bit value, it is compared
+ * (as unsigned) against the space remaining before 'end' rather than added to
+ * the cmsg address, so negative or huge lengths cannot wrap the address space
+ * and a valid cmsg always advances LX_CMSG64_NEXT.
+ */
+#define	LX_CMSG64_VALID(cmsg, start, end)				\
+	(ISALIGNED_LX_CMSG64(cmsg) &&					\
+	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
+	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
+	((cmsg)->cmsg_len >= sizeof (lx_cmsghdr64_t)) &&		\
+	((uint64_t)(cmsg)->cmsg_len <= (uintptr_t)(end) - (uintptr_t)(cmsg)))
+#define	LX_CMSG64_NEXT(cmsg)				\
+	(lx_cmsghdr64_t *)((uintptr_t)(cmsg) +		\
+	    ROUNDUP_LX_CMSG64_LEN((cmsg)->cmsg_len))
+#define	LX_CMSG64_DIFF	sizeof (uint32_t)
+
+#endif /* defined(_LP64) */
+
+/*
+ * Translate a SunOS ucred_s payload into a Linux lx_ucred_t.  With a NULL
+ * omsg only the size of the converted cmsg is reported.
+ */
+static int
+stol_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+	const int outsz = sizeof (struct cmsghdr) + sizeof (lx_ucred_t);
+	struct ucred_s *native;
+	prcred_t *prc;
+	lx_ucred_t out;
+
+	/* Sizing pass: there is nothing to write. */
+	if (omsg == NULL) {
+		return (outsz);
+	}
+
+	native = (struct ucred_s *)CMSG_CONTENT(inmsg);
+	prc = UCCRED(native);
+
+	/* The uid and gid fall back to 0 when UCCRED() yields NULL. */
+	out.lxu_pid = native->uc_pid;
+	out.lxu_uid = (prc != NULL) ? prc->pr_euid : 0;
+	out.lxu_gid = (prc != NULL) ? prc->pr_egid : 0;
+
+	bcopy(&out, CMSG_CONTENT(omsg), sizeof (out));
+	return (outsz);
+}
+
+static int
+ltos_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+	/*
+	 * Convert a Linux lx_ucred_t payload into a SunOS ucred_s + prcred_t.
+	 * inmsg originates in userspace: reject one too short to hold an
+	 * lx_ucred_t (-EINVAL) instead of reading past the end of its data.
+	 */
+	if (inmsg->cmsg_len < sizeof (struct cmsghdr) + sizeof (lx_ucred_t)) {
+		return (-EINVAL);
+	}
+	if (omsg != NULL) {
+		struct ucred_s *uc = (struct ucred_s *)CMSG_CONTENT(omsg);
+		prcred_t *pc = (prcred_t *)((char *)uc + sizeof (*uc));
+		lx_ucred_t *lcred = (lx_ucred_t *)CMSG_CONTENT(inmsg);
+
+		uc->uc_credoff = sizeof (struct ucred_s);
+		uc->uc_pid = lcred->lxu_pid;
+		pc->pr_euid = lcred->lxu_uid;
+		pc->pr_egid = lcred->lxu_gid;
+	}
+	return (sizeof (struct cmsghdr) + sizeof (struct ucred_s) +
+	    sizeof (prcred_t));
+}
+
+static int
+stol_conv_recvttl(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+	/*
+	 * An incoming packet's TTL is reported by SunOS as a uint8_t under
+	 * IP_RECVTTL, whereas Linux expects an int under IP_TTL.  Only this
+	 * direction needs converting because Linux does not handle IP_RECVTTL
+	 * in the sendmsg path.
+	 */
+	if (omsg == NULL) {
+		/* sizing pass only */
+		return (sizeof (struct cmsghdr) + sizeof (int));
+	}
+
+	/* Widen the single TTL byte into the int that Linux reads. */
+	*(int *)CMSG_CONTENT(omsg) = (int)*(uint8_t *)CMSG_CONTENT(inmsg);
+	return (sizeof (struct cmsghdr) + sizeof (int));
+}
+
+/* Payload needs no translation: copy it verbatim and keep the length. */
+static int
+cmsg_conv_generic(struct cmsghdr *inmsg, struct cmsghdr *omsg)
+{
+	const size_t payload = inmsg->cmsg_len - sizeof (struct cmsghdr);
+
+	/* A NULL omsg is a sizing request; there is nowhere to copy to. */
+	if (omsg != NULL) {
+		bcopy(CMSG_CONTENT(inmsg), CMSG_CONTENT(omsg), payload);
+	}
+	return (inmsg->cmsg_len);
+}
+
+static int
+lx_xlate_cmsg(struct cmsghdr *inmsg, struct cmsghdr *omsg, lx_xlate_dir_t dir)
+{
+	const int ltos = (dir == LX_TO_SUNOS);
+	lx_cmsg_xlate_t *row = lx_cmsg_xlate_tbl;
+	lx_cmsg_xlate_t *const last = row + LX_MAX_CMSG_XLATE;
+
+	VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS);
+
+	/* The first row matching inmsg with a converter for 'dir' wins. */
+	for (; row < last; row++) {
+		int (*conv)(struct cmsghdr *, struct cmsghdr *) =
+		    ltos ? row->lcx_ltos_conv : row->lcx_stol_conv;
+		int sz;
+
+		if (conv == NULL ||
+		    inmsg->cmsg_level !=
+		    (ltos ? row->lcx_linux_level : row->lcx_sunos_level) ||
+		    inmsg->cmsg_type !=
+		    (ltos ? row->lcx_linux_type : row->lcx_sunos_type)) {
+			continue;
+		}
+
+		sz = conv(inmsg, omsg);
+		if (omsg != NULL) {
+			omsg->cmsg_len = sz;
+			omsg->cmsg_level = ltos ?
+			    row->lcx_sunos_level : row->lcx_linux_level;
+			omsg->cmsg_type = ltos ?
+			    row->lcx_sunos_type : row->lcx_linux_type;
+		}
+		return (sz);
+	}
+
+	/*
+	 * No table entry handles this cmsg.  The Linux sendmsg man page
+	 * defines no error for unsupported cmsgs; EOPNOTSUPP, although meant
+	 * to indicate bad flag values, appears to be the closest choice.
+	 */
+	return (-EOPNOTSUPP);
+}
+
+static long
+ltos_cmsgs_copyin(void *addr, socklen_t inlen, void **outmsg,
+    socklen_t *outlenp)
+{
+	void *inbuf, *obuf;
+	struct cmsghdr *inmsg, *omsg;
+	int slen = 0;
+
+	if (inlen < sizeof (struct cmsghdr) || inlen > SO_MAXARGSIZE) {
+		return (EINVAL);
+	}
+
+#if defined(_LP64)
+	if (get_udatamodel() == DATAMODEL_NATIVE &&
+	    inlen < sizeof (lx_cmsghdr64_t)) {
+		/* The size requirements are more strict for 64bit. */
+		return (EINVAL);
+	}
+#endif /* defined(_LP64) */
+
+	inbuf = kmem_alloc(inlen, KM_SLEEP);
+	if (copyin(addr, inbuf, inlen) != 0) {
+		kmem_free(inbuf, inlen);
+		return (EFAULT);
+	}
+
+#if defined(_LP64)
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		/*
+		 * Linux cmsg headers are longer than illumos under x86_64.
+		 * Convert to regular cmsgs first.
+		 */
+		lx_cmsghdr64_t *lmsg;
+		struct cmsghdr *smsg;
+		void *newbuf;
+		int len = 0;
+
+		/* Inventory the new cmsg size */
+		for (lmsg = (lx_cmsghdr64_t *)inbuf;
+		    LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+		    lmsg = LX_CMSG64_NEXT(lmsg)) {
+			len += ROUNDUP_cmsglen(lmsg->cmsg_len - LX_CMSG64_DIFF);
+		}
+
+		VERIFY(len < inlen);
+		if (len == 0) {
+			/* Input was bogus, so we can give up early. */
+			kmem_free(inbuf, inlen);
+			*outmsg = NULL;
+			*outlenp = 0;
+			return (EINVAL);
+		}
+
+		newbuf = kmem_alloc(len, KM_SLEEP);
+
+		for (lmsg = (lx_cmsghdr64_t *)inbuf,
+		    smsg = (struct cmsghdr *)newbuf;
+		    LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+		    lmsg = LX_CMSG64_NEXT(lmsg), smsg = CMSG_NEXT(smsg)) {
+			smsg->cmsg_level = lmsg->cmsg_level;
+			smsg->cmsg_type = lmsg->cmsg_type;
+			smsg->cmsg_len = lmsg->cmsg_len - LX_CMSG64_DIFF;
+
+			/* The above length measurement should ensure this */
+			ASSERT(CMSG_VALID(smsg, newbuf,
+			    (uintptr_t)newbuf + len));
+
+			bcopy(LX_CMSG64_DATA(lmsg), CMSG_CONTENT(smsg),
+			    smsg->cmsg_len - sizeof (*smsg));
+		}
+
+		kmem_free(inbuf, inlen);
+		inbuf = newbuf;
+		inlen = len;
+	}
+#endif /* defined(_LP64) */
+
+	/*
+	 * Sizing pass: with a NULL omsg, lx_xlate_cmsg only reports lengths.
+	 */
+	for (inmsg = (struct cmsghdr *)inbuf;
+	    CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+	    inmsg = CMSG_NEXT(inmsg)) {
+		int sz;
+
+		if ((sz = lx_xlate_cmsg(inmsg, NULL, LX_TO_SUNOS)) < 0) {
+			/* unsupported msg */
+			kmem_free(inbuf, inlen);
+			return (-sz);
+		}
+
+		slen += ROUNDUP_cmsglen(sz);
+	}
+
+	obuf = kmem_zalloc(slen, KM_SLEEP);
+
+	/*
+	 * Conversion pass; the sizing pass already rejected unsupported cmsgs.
+	 */
+	for (inmsg = (struct cmsghdr *)inbuf, omsg = (struct cmsghdr *)obuf;
+	    CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0;
+	    inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) {
+		VERIFY(lx_xlate_cmsg(inmsg, omsg, LX_TO_SUNOS) >= 0);
+	}
+
+	kmem_free(inbuf, inlen);
+	*outmsg = obuf;
+	*outlenp = slen;
+	return (0);
+}
+
+/*
+ * Convert SunOS cmsgs to Linux format and copy them and their length out to
+ * userspace.  Returns 0 or an errno; EMSGSIZE means 'addr' cannot hold them.
+ */
+static long
+stol_cmsgs_copyout(void *input, socklen_t inlen, void *addr,
+    void *outlenp, socklen_t orig_outlen)
+{
+	void *obuf;
+	struct cmsghdr *inmsg, *omsg;
+	int error = 0;
+	socklen_t lx_len = 0;
+#if defined(_LP64)
+	model_t model = get_udatamodel();
+#endif
+
+	if (inlen == 0) {
+		/* Simply output the zero controllen */
+		goto finish;
+	}
+
+	VERIFY(inlen > sizeof (struct cmsghdr));
+
+	/*
+	 * Sizing pass: total the Linux-format length of every cmsg so it can
+	 * be checked against the space the caller provided.
+	 */
+	for (inmsg = (struct cmsghdr *)input;
+	    CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+	    inmsg = CMSG_NEXT(inmsg)) {
+		int sz;
+
+		if ((sz = lx_xlate_cmsg(inmsg, NULL, SUNOS_TO_LX)) < 0) {
+			/* unsupported msg */
+			return (-sz);
+		}
+
+#if defined(_LP64)
+		if (model == DATAMODEL_NATIVE) {
+			/*
+			 * The converted 64-bit cmsgs require an additional 4
+			 * bytes of header space and must be aligned to 8 bytes
+			 * (instead of the typical 4 for x86)
+			 */
+			sz = ROUNDUP_LX_CMSG64_LEN(sz + LX_CMSG64_DIFF);
+		} else
+#endif /* defined(_LP64) */
+		{
+			/* No extra header space or padding for 32-bit. */
+			sz = ROUNDUP_cmsglen(sz);
+		}
+
+		/*
+		 * Unlike SunOS, Linux requires that the last cmsg be
+		 * adequately padded for alignment.
+		 */
+		lx_len += sz;
+	}
+
+	if (lx_len > orig_outlen || addr == NULL) {
+		/* This will be interpreted by the caller */
+		error = EMSGSIZE;
+		lx_len = 0;
+		goto finish;
+	}
+
+	/*
+	 * Since cmsgs are often padded to an aligned size, kmem_zalloc is
+	 * necessary to prevent leaking the contents of uninitialized memory.
+	 */
+	obuf = kmem_zalloc(lx_len, KM_SLEEP);
+
+	/* Conversion pass; sizing above proved every cmsg is supported. */
+	for (inmsg = (struct cmsghdr *)input, omsg = (struct cmsghdr *)obuf;
+	    CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+	    inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) {
+		VERIFY(lx_xlate_cmsg(inmsg, omsg, SUNOS_TO_LX) >= 0);
+	}
+
+#if defined(_LP64)
+	if (model == DATAMODEL_NATIVE) {
+		/* Linux cmsg headers are longer than illumos under x86_64. */
+		struct cmsghdr *smsg;
+		lx_cmsghdr64_t *lmsg;
+		void *newbuf;
+
+		/*
+		 * As above, kmem_zalloc avoids leaking uninitialized memory.
+		 * The walk over obuf is bounded by lx_len, the size obuf was
+		 * allocated with; inlen describes 'input' and can be smaller
+		 * than the converted cmsgs that obuf now holds.
+		 */
+		newbuf = kmem_zalloc(lx_len, KM_SLEEP);
+		for (smsg = (struct cmsghdr *)obuf,
+		    lmsg = (lx_cmsghdr64_t *)newbuf;
+		    CMSG_VALID(smsg, obuf, (uintptr_t)obuf + lx_len) != 0;
+		    smsg = CMSG_NEXT(smsg), lmsg = LX_CMSG64_NEXT(lmsg)) {
+			lmsg->cmsg_level = smsg->cmsg_level;
+			lmsg->cmsg_type = smsg->cmsg_type;
+			lmsg->cmsg_len = smsg->cmsg_len + LX_CMSG64_DIFF;
+
+			ASSERT(LX_CMSG64_VALID(lmsg, newbuf,
+			    (uintptr_t)newbuf + lx_len) != 0);
+
+			bcopy(CMSG_CONTENT(smsg), LX_CMSG64_DATA(lmsg),
+			    smsg->cmsg_len - sizeof (*smsg));
+		}
+
+		kmem_free(obuf, lx_len);
+		obuf = newbuf;
+	}
+#endif /* defined(_LP64) */
+
+	if (copyout(obuf, addr, lx_len) != 0) {
+		kmem_free(obuf, lx_len);
+		return (EFAULT);
+	}
+	kmem_free(obuf, lx_len);
+
+finish:
+	if (outlenp != NULL) {
+#if defined(_LP64)
+		if (model != DATAMODEL_NATIVE) {
+			int32_t len32 = (int32_t)lx_len;
+			if (copyout(&len32, outlenp, sizeof (len32)) != 0) {
+				return (EFAULT);
+			}
+		} else
+#endif /* defined(_LP64) */
+		{
+			if (copyout(&lx_len, outlenp, sizeof (lx_len)) != 0) {
+				return (EFAULT);
+			}
+		}
+	}
+	return (error);
+}
+
+/* Set FD_CLOEXEC on each fd received via SCM_RIGHTS, skipping closed fds. */
+static void
+lx_cmsg_set_cloexec(void *input, socklen_t inlen)
+{
+	struct cmsghdr *inmsg;
+
+	if (inlen == 0) {
+		return;
+	}
+
+	for (inmsg = (struct cmsghdr *)input;
+	    CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0;
+	    inmsg = CMSG_NEXT(inmsg)) {
+		if (inmsg->cmsg_level == SOL_SOCKET &&
+		    inmsg->cmsg_type == SCM_RIGHTS) {
+			int *fds = (int *)CMSG_CONTENT(inmsg);
+			int i, num = (int)CMSG_CONTENTLEN(inmsg) / sizeof (int);
+			for (i = 0; i < num; i++) {
+				char flags;
+				/* another thread may have closed it */
+				if (getf(fds[i]) == NULL) {
+					continue;
+				}
+				flags = f_getfd(fds[i]) | FD_CLOEXEC;
+				f_setfd(fds[i], flags);
+				releasef(fds[i]);
+			}
+		}
+	}
+}
+
+static int
+lx_cmsg_try_ucred(sonode_t *so, struct nmsghdr *msg, socklen_t origlen)
+{
+	lx_socket_aux_data_t *sad;
+	struct cmsghdr *cmsg = NULL;
+	int msgsize;
+	cred_t *cred;
+
+	if (origlen == 0) {
+		return (0);
+	}
+	sad = lx_sad_acquire(SOTOV(so));	/* lxsad_lock now held */
+	if (!sad->lxsad_stream_cred) {
+		mutex_exit(&sad->lxsad_lock);
+		return (0);
+	}
+	mutex_exit(&sad->lxsad_lock);
+
+	mutex_enter(&so->so_lock);
+	if (so->so_peercred == NULL) {
+		mutex_exit(&so->so_lock);
+		return (0);
+	}
+	crhold(cred = so->so_peercred);
+	mutex_exit(&so->so_lock);
+
+	msgsize = ucredminsize(cred) + sizeof (struct cmsghdr);
+	if (msg->msg_control == NULL) {
+		msg->msg_controllen = msgsize;
+		msg->msg_control = cmsg = kmem_zalloc(msgsize, KM_SLEEP);
+	} else {
+		/*
+		 * The so_recvmsg operation may have allocated a msg_control
+		 * buffer which precisely fits all returned cmsgs.  We must
+		 * manually verify the length of that cmsg data and reallocate
+		 * the buffer if it lacks the necessary space.
+		 */
+		uintptr_t start = (uintptr_t)msg->msg_control;
+		uintptr_t end = start + msg->msg_controllen;
+
+		ASSERT(msg->msg_controllen > 0);
+		cmsg = (struct cmsghdr *)msg->msg_control;
+		while (CMSG_VALID(cmsg, start, end) != 0) {
+			if (cmsg->cmsg_level == SOL_SOCKET &&
+			    cmsg->cmsg_type == SCM_UCRED) {
+				/*
+				 * If some later code change results in a ucred
+				 * being attached anyway, there is no need for
+				 * us to do it manually
+				 */
+				crfree(cred);
+				return (0);
+			}
+			cmsg = CMSG_NEXT(cmsg);
+		}
+		if (((uintptr_t)cmsg + msgsize) > end) {
+			socklen_t offset = (uintptr_t)cmsg - start;
+			socklen_t newsize = offset + msgsize;
+			void *newbuf;
+
+			if (newsize < msg->msg_controllen) {
+				/* size overflow, bail */
+				crfree(cred);
+				return (-1);
+			}
+			newbuf = kmem_alloc(newsize, KM_SLEEP);
+			bcopy(msg->msg_control, newbuf, msg->msg_controllen);
+			kmem_free(msg->msg_control, msg->msg_controllen);
+
+			msg->msg_control = newbuf;
+			msg->msg_controllen = newsize;
+			cmsg = (struct cmsghdr *)((uintptr_t)newbuf + offset);
+		}
+	}
+
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_UCRED;
+	cmsg->cmsg_len = msgsize;
+	(void) cred2ucred(cred, so->so_cpid, CMSG_CONTENT(cmsg), CRED());
+	crfree(cred);
+	return (0);
+}
+
+/* Look up (creating if needed) the vnode's aux data; returns it locked. */
+static lx_socket_aux_data_t *
+lx_sad_acquire(vnode_t *vp)
+{
+	lx_socket_aux_data_t *sad, *fresh;
+
+	mutex_enter(&vp->v_vsd_lock);
+	sad = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd);
+	if (sad != NULL) {
+		goto out;
+	}
+
+	/* Allocate with v_vsd_lock dropped, then look again before install. */
+	mutex_exit(&vp->v_vsd_lock);
+	fresh = (lx_socket_aux_data_t *)kmem_zalloc(sizeof (*fresh), KM_SLEEP);
+	mutex_enter(&vp->v_vsd_lock);
+	sad = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd);
+	if (sad != NULL) {
+		kmem_free(fresh, sizeof (*fresh));
+	} else {
+		mutex_init(&fresh->lxsad_lock, NULL, MUTEX_DEFAULT, NULL);
+		(void) vsd_set(vp, lx_socket_vsd, fresh);
+		sad = fresh;
+	}
+out:
+	mutex_exit(&vp->v_vsd_lock);
+	mutex_enter(&sad->lxsad_lock);
+	return (sad);
+}
+
+/* Map a Linux packet protocol (network order) to its ETH_P_* value or -1. */
+static int
+lx_convert_pkt_proto(int protocol)
+{
+	const int lx_proto = ntohs(protocol);
+
+	if (lx_proto == LX_ETH_P_802_2) {
+		return (ETH_P_802_2);
+	} else if (lx_proto == LX_ETH_P_IP) {
+		return (ETH_P_IP);
+	} else if (lx_proto == LX_ETH_P_ARP) {
+		return (ETH_P_ARP);
+	} else if (lx_proto == LX_ETH_P_IPV6) {
+		return (ETH_P_IPV6);
+	} else if (lx_proto == LX_ETH_P_ALL || lx_proto == LX_ETH_P_802_3) {
+		return (ETH_P_ALL);
+	}
+	return (-1);
+}
+
+/* Validate Linux socket(2) arguments and translate them to native values. */
+static int
+lx_convert_sock_args(int in_dom, int in_type, int in_proto, int *out_dom,
+    int *out_type, int *out_options, int *out_proto)
+{
+	int domain, type, residue;
+	int options = 0;
+
+	if (in_dom < 0 || in_type < 0 || in_proto < 0)
+		return (EINVAL);
+
+	domain = LTOS_FAMILY(in_dom);
+	if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC)
+		return (EAFNOSUPPORT);
+	if (domain == AF_INVAL)
+		return (EINVAL);
+
+	type = LTOS_SOCKTYPE(in_type & LX_SOCK_TYPE_MASK);
+	if (type == SOCK_NOTSUPPORTED)
+		return (ESOCKTNOSUPPORT);
+	if (type == SOCK_INVAL)
+		return (EINVAL);
+
+	/*
+	 * Linux does not allow the app to specify IP Protocol for raw sockets.
+	 * SunOS does, so bail out here.
+	 */
+	if (domain == AF_INET && type == SOCK_RAW && in_proto == IPPROTO_IP)
+		return (ESOCKTNOSUPPORT);
+
+	residue = in_type & ~(LX_SOCK_TYPE_MASK);
+	if ((residue & LX_SOCK_NONBLOCK) != 0) {
+		options |= SOCK_NONBLOCK;
+		residue ^= LX_SOCK_NONBLOCK;
+	}
+	if ((residue & LX_SOCK_CLOEXEC) != 0) {
+		options |= SOCK_CLOEXEC;
+		residue ^= LX_SOCK_CLOEXEC;
+	}
+	if (residue != 0)
+		return (EINVAL);
+
+	/* Protocol definitions for PF_PACKET differ between Linux and SunOS */
+	if (domain == PF_PACKET &&
+	    (in_proto = lx_convert_pkt_proto(in_proto)) < 0)
+		return (EINVAL);
+
+	*out_dom = domain;
+	*out_type = type;
+	*out_options = options;
+	*out_proto = in_proto;
+	return (0);
+}
+
+long
+lx_socket(int domain, int type, int protocol)
+{
+	int fd, error, options;
+	sonode_t *so;
+	vnode_t *vp;
+	struct file *fp;
+
+	if ((error = lx_convert_sock_args(domain, type, protocol, &domain,
+	    &type, &options, &protocol)) != 0) {
+		return (set_errno(error));
+	}
+
+	/* Create the sonode (logic cloned from so_socket) */
+	so = socket_create(domain, type, protocol, NULL, NULL, SOCKET_SLEEP,
+	    SOV_DEFAULT, CRED(), &error);
+
+	if (so == NULL) {
+		if (error == EPROTOTYPE || error == EPROTONOSUPPORT) {
+			error = ESOCKTNOSUPPORT;
+		}
+		return (set_errno(error));
+	}
+
+	/* Allocate a file descriptor; tear the socket down on failure */
+	vp = SOTOV(so);
+	if ((error = falloc(vp, FWRITE|FREAD, &fp, &fd)) != 0) {
+		(void) socket_close(so, 0, CRED());
+		socket_destroy(so);
+		return (set_errno(error));
+	}
+
+	/*
+	 * Linux programs do not tolerate errors appearing from asynchronous
+	 * events (such as ICMP messages arriving).  Setting SM_DEFERERR will
+	 * prevent checking/delivery of such errors.
+	 */
+	so->so_mode |= SM_DEFERERR;
+
+	/* Now fill in the entries that falloc reserved */
+	if (options & SOCK_NONBLOCK) {
+		so->so_state |= SS_NONBLOCK;
+		fp->f_flag |= FNONBLOCK;
+	}
+	mutex_exit(&fp->f_tlock);
+	setf(fd, fp);
+	if ((options & SOCK_CLOEXEC) != 0) {
+		f_setfd(fd, FD_CLOEXEC);
+	}
+	return (fd);
+}
+
+long
+lx_bind(long sock, uintptr_t name, socklen_t namelen)
+{
+	struct sonode *so;
+	struct sockaddr *addr = NULL;
+	socklen_t len = 0;
+	file_t *fp;
+	int error;
+	lx_sun_type_t sun_type;
+	boolean_t not_sock = B_FALSE;
+
+	if ((so = getsonode(sock, &error, &fp)) == NULL) {
+		return (set_errno(error));
+	}
+
+	if (namelen != 0) {
+		error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen,
+		    &addr, &len, &sun_type);
+		if (error != 0) {
+			releasef(sock);
+			return (set_errno(error));
+		}
+	}
+
+	if (addr != NULL && addr->sa_family == AF_UNIX) {
+		vnode_t *vp;
+
+		error = so_ux_lookup(so, (struct sockaddr_un *)addr, B_TRUE,
+		    &vp);
+		if (error == 0) {
+			/* A valid socket exists and is open at this address. */
+			VN_RELE(vp);
+		} else {
+			/* Keep track of paths which are not valid sockets. */
+			if (error == ENOTSOCK) {
+				not_sock = B_TRUE;
+			}
+
+			/*
+			 * When binding to an abstract namespace address or
+			 * /dev/log, implicit clean-up must occur if there is
+			 * not a valid socket at the specified address.  See
+			 * ltos_sockaddr_copyin for details about why these
+			 * socket types act differently.
+			 */
+			if (sun_type == LX_SUN_ABSTRACT) {
+				(void) vn_removeat(NULL, addr->sa_data,
+				    UIO_SYSSPACE, RMFILE);
+			}
+		}
+	}
+
+	error = socket_bind(so, addr, len, _SOBIND_XPG4_2, CRED());
+
+	/*
+	 * Linux returns EADDRINUSE for attempts to bind to Unix domain
+	 * sockets that aren't sockets.
+	 */
+	if (error == EINVAL && addr != NULL && addr->sa_family == AF_UNIX &&
+	    not_sock == B_TRUE) {
+		error = EADDRINUSE;
+	}
+
+	releasef(sock);
+
+	if (addr != NULL) {
+		kmem_free(addr, len);
+	}
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_connect(long sock, uintptr_t name, socklen_t namelen)
+{
+	struct sonode *so;
+	struct sockaddr *addr = NULL;
+	lx_socket_aux_data_t *sad = NULL;
+	socklen_t len = 0;
+	file_t *fp;
+	int error;
+
+	if ((so = getsonode(sock, &error, &fp)) == NULL) {
+		return (set_errno(error));
+	}
+
+	/*
+	 * ltos_sockaddr_copyin checks that the name is sized appropriately
+	 * before it allocates memory and copies it in from userspace.  With a
+	 * zero namelen, addr stays NULL and len stays 0.
+	 */
+	if (namelen != 0) {
+		error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen,
+		    &addr, &len, NULL);
+		if (error != 0) {
+			releasef(sock);
+			return (set_errno(error));
+		}
+	}
+
+	error = socket_connect(so, addr, len, fp->f_flag,
+	    _SOCONNECT_XPG4_2, CRED());
+
+	/*
+	 * Linux connect(2) behavior is rather strange when using the
+	 * O_NONBLOCK flag.  The first call will return EINPROGRESS, as
+	 * expected.  Provided that is successful, a second call to connect
+	 * will return 0 instead of EISCONN.  Subsequent connect calls will
+	 * return EISCONN.
+	 */
+	if ((fp->f_flag & FNONBLOCK) != 0 && error != 0) {
+		sad = lx_sad_acquire(SOTOV(so));
+		if (error == EISCONN &&
+		    sad->lxsad_status == LXSS_CONNECTING) {
+			/* Report the one success */
+			sad->lxsad_status = LXSS_CONNECTED;
+			error = 0;
+		} else if (error == EINPROGRESS) {
+			sad->lxsad_status = LXSS_CONNECTING;
+		}
+		mutex_exit(&sad->lxsad_lock);
+	}
+
+	/*
+	 * When a UDP socket is connected, configure it so that later
+	 * sendto/sendmsg operations may still specify a destination address
+	 * (see the POSIX specification for sendto).  Linux allows this, while
+	 * illumos would return EISCONN if the option is not set.
+	 */
+	if (error == 0 && so->so_protocol == IPPROTO_UDP &&
+	    (so->so_family == AF_INET || so->so_family == AF_INET6)) {
+		int val = 1;
+
+		DTRACE_PROBE(lx__connect__udp);
+		(void) socket_setsockopt(so, IPPROTO_UDP, UDP_SND_TO_CONNECTED,
+		    &val, sizeof (val), CRED());
+	}
+
+	releasef(sock);
+
+	if (addr != NULL) {
+		kmem_free(addr, len);
+	}
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+/*
+ * Custom version of socket_recvmsg: performs SOP_RECVMSG and then overrides
+ * some of its error results to match what Linux callers expect.
+ */
+static int
+lx_socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+    cred_t *cr)
+{
+	const ssize_t resid_before = uiop->uio_resid;
+	int err;
+
+	/*
+	 * Do not bypass the cache when copying out the data; the application
+	 * is likely to access it shortly.
+	 */
+	uiop->uio_extflg |= UIO_COPY_CACHED;
+
+	err = SOP_RECVMSG(so, msg, uiop, cr);
+
+	if (err == EINTR || err == EWOULDBLOCK) {
+		/*
+		 * EAGAIN is the same value as EWOULDBLOCK.  If any data was
+		 * transferred before the interruption or would-block
+		 * condition, report the partial read as a success.
+		 */
+		if (uiop->uio_resid != resid_before) {
+			return (0);
+		}
+	} else if (err == ENOTCONN) {
+		/*
+		 * A non-blocking socket which is still in the process of
+		 * making a connection reports EAGAIN instead.
+		 */
+		if ((msg->msg_flags & MSG_DONTWAIT) != 0 ||
+		    (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) {
+			return (EAGAIN);
+		}
+	}
+
+	return (err);
+}
+
+static long
+lx_recv_common(int sock, struct nmsghdr *msg, xuio_t *xuiop, int flags,
+    void *namelenp, void *controllenp, void *flagsp)
+{
+	struct sonode *so;
+	file_t *fp;
+	void *name;
+	socklen_t namelen;
+	void *control;
+	socklen_t controllen;
+	ssize_t len;
+	int error;
+	boolean_t fd_cloexec;
+	boolean_t is_peek_trunc;
+
+	if ((so = getsonode(sock, &error, &fp)) == NULL) {
+		return (set_errno(error));
+	}
+
+	fd_cloexec = ((flags & LX_MSG_CMSG_CLOEXEC) != 0);
+	flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS);
+	is_peek_trunc = (flags & (MSG_PEEK|MSG_TRUNC)) == (MSG_PEEK|MSG_TRUNC);
+	len = xuiop->xu_uio.uio_resid;
+	xuiop->xu_uio.uio_fmode = fp->f_flag;
+	xuiop->xu_uio.uio_extflg = UIO_COPY_CACHED;
+
+	/*
+	 * Linux accepts MSG_TRUNC as an input flag, unlike SunOS and many
+	 * other UNIX distributions.  When combined with MSG_PEEK, it causes
+	 * recvmsg to return the size of the waiting message, regardless of
+	 * buffer size.  This behavior is commonly used with a 0-length buffer
+	 * to interrogate the size of a queued message prior to allocating a
+	 * buffer for it.
+	 *
+	 * In order to support this functionality, a custom XUIO type is used
+	 * to communicate the total message size out from the depths of sockfs.
+	 */
+	if (is_peek_trunc) {
+		xuiop->xu_uio.uio_extflg |= UIO_XUIO;
+		xuiop->xu_type = UIOTYPE_PEEKSIZE;
+		xuiop->xu_ext.xu_ps.xu_ps_set = B_FALSE;
+		xuiop->xu_ext.xu_ps.xu_ps_size = 0;
+	}
+
+	name = msg->msg_name;
+	namelen = msg->msg_namelen;
+	control = msg->msg_control;
+	controllen = msg->msg_controllen;
+
+	/*
+	 * socket_recvmsg will allocate these if needed.
+	 * NULL them out to prevent any confusion.
+	 */
+	msg->msg_name = NULL;
+	msg->msg_control = NULL;
+
+	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
+	    MSG_DONTWAIT);
+	/* Default to XPG4.2 operation */
+	msg->msg_flags |= MSG_XPG4_2;
+
+	error = lx_socket_recvmsg(so, msg, (struct uio *)xuiop, CRED());
+	if (error) {
+		releasef(sock);
+		return (set_errno(error));
+	}
+	lwp_stat_update(LWP_STAT_MSGRCV, 1);
+	releasef(sock);
+
+	if (namelen != 0) {
+		error = stol_sockaddr_copyout(msg->msg_name, msg->msg_namelen,
+		    name, namelenp, namelen);
+
+		if (msg->msg_namelen != 0) {
+			kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
+			msg->msg_namelen = 0;
+		}
+
+		/*
+		 * Errors during copyout of the name are not a concern to Linux
+		 * callers at this point in the syscall
+		 */
+		if (error != 0 && error != EFAULT) {
+			goto err;
+		}
+	}
+
+	if (controllen != 0) {
+		if (fd_cloexec) {
+			/*
+			 * If CLOEXEC needs to set on file descriptors passed
+			 * via SCM_RIGHTS, do so before formatting the cmsgs
+			 * for Linux.
+			 */
+			lx_cmsg_set_cloexec(msg->msg_control,
+			    msg->msg_controllen);
+		}
+		if (so->so_family == AF_UNIX &&
+		    (so->so_mode & SM_CONNREQUIRED) != 0) {
+			/*
+			 * It may be necessary to append a SCM_UCRED cmsg to
+			 * the controls if SO_PASSCRED is set on a
+			 * connection-oriented AF_UNIX socket.
+			 *
+			 * See lx_setsockopt_socket for more details.
+			 */
+			if (lx_cmsg_try_ucred(so, msg, controllen) != 0) {
+				msg->msg_flags |= MSG_CTRUNC;
+			}
+		}
+
+		error = stol_cmsgs_copyout(msg->msg_control,
+		    msg->msg_controllen, control, controllenp, controllen);
+
+		if (error != 0) {
+			/*
+			 * If there was an error during cmsg translation or
+			 * copyout, we need to clean up any FDs that are being
+			 * passed back via SCM_RIGHTS.  This prevents us from
+			 * leaking those open files.
+			 */
+			so_closefds(msg->msg_control, msg->msg_controllen, 0,
+			    0);
+
+			/*
+			 * An error during cmsg_copyout means we had
+			 * _something_ to process.
+			 */
+			VERIFY(msg->msg_controllen != 0);
+
+			kmem_free(msg->msg_control,
+			    (size_t)msg->msg_controllen);
+			msg->msg_controllen = 0;
+
+			if (error == EMSGSIZE) {
+				/* Communicate that messages were truncated */
+				msg->msg_flags |= MSG_CTRUNC;
+				error = 0;
+			} else {
+				goto err;
+			}
+		} else if (msg->msg_controllen != 0) {
+			kmem_free(msg->msg_control,
+			    (size_t)msg->msg_controllen);
+			msg->msg_controllen = 0;
+		}
+	}
+
+	if (flagsp != NULL) {
+		int flags;
+
+		/* Clear internal flag. */
+		flags = msg->msg_flags & ~MSG_XPG4_2;
+		flags = lx_xlate_sock_flags(flags, SUNOS_TO_LX);
+
+		if (copyout(&flags, flagsp, sizeof (flags) != 0)) {
+			error = EFAULT;
+			goto err;
+		}
+	}
+
+	/*
+	 * If both MSG_PEEK|MSG_TRUNC were set on the input flags and the
+	 * socket layer was able to calculate the total message size for us,
+	 * return that instead of the copied size.
+	 */
+	if (is_peek_trunc && xuiop->xu_ext.xu_ps.xu_ps_set == B_TRUE) {
+		return (xuiop->xu_ext.xu_ps.xu_ps_size);
+	}
+
+	return (len - xuiop->xu_uio.uio_resid);
+
+err:
+	if (msg->msg_controllen != 0) {
+		/* Prevent FD leakage (see above) */
+		so_closefds(msg->msg_control, msg->msg_controllen, 0, 0);
+		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
+	}
+	if (msg->msg_namelen != 0) {
+		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
+	}
+	return (set_errno(error));
+}
+
+/*
+ * Emulate Linux recv(2): receive from 'sock' into the single user buffer
+ * 'buffer' of 'len' bytes.  No source address, control data or output flags
+ * are requested, so the corresponding lx_recv_common() pointers are NULL.
+ */
+long
+lx_recv(int sock, void *buffer, size_t len, int flags)
+{
+	struct nmsghdr hdr;
+	struct iovec vec;
+	xuio_t xu;
+
+	/*
+	 * len is unsigned but the result is a signed long, so a length that
+	 * turns negative when viewed as ssize_t is rejected up front.
+	 */
+	if ((ssize_t)len < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	/* A plain recv asks for neither an address nor ancillary data. */
+	hdr.msg_flags = 0;
+	hdr.msg_controllen = 0;
+	hdr.msg_namelen = 0;
+
+	/* Describe the caller's buffer as a one-element user-space uio. */
+	vec.iov_len = len;
+	vec.iov_base = buffer;
+	xu.xu_uio.uio_iov = &vec;
+	xu.xu_uio.uio_iovcnt = 1;
+	xu.xu_uio.uio_resid = len;
+	xu.xu_uio.uio_loffset = 0;
+	xu.xu_uio.uio_limit = 0;
+	xu.xu_uio.uio_segflg = UIO_USERSPACE;
+
+	return (lx_recv_common(sock, &hdr, &xu, flags, NULL, NULL, NULL));
+}
+
+/*
+ * Emulate Linux recvfrom(2): like lx_recv(), but optionally reports the
+ * sender's address through 'srcaddr' / 'addrlenp'.  The address is only
+ * requested when both pointers are non-NULL.
+ */
+long
+lx_recvfrom(int sock, void *buffer, size_t len, int flags,
+    struct sockaddr *srcaddr, socklen_t *addrlenp)
+{
+	struct nmsghdr hdr;
+	struct iovec vec;
+	xuio_t xu;
+	int alen = 0;
+
+	/* Keep len reasonably limited (see lx_recv) */
+	if ((ssize_t)len < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	if (srcaddr != NULL && addrlenp != NULL) {
+		/*
+		 * Although addrlenp is declared as a socklen_t *, Linux
+		 * handles the value as an int internally and some LTP tests
+		 * rely on that, so it is read and range-checked as an int.
+		 */
+		if (copyin(addrlenp, &alen, sizeof (alen)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		if (alen < 0) {
+			return (set_errno(EINVAL));
+		}
+	}
+
+	hdr.msg_name = (char *)srcaddr;
+	hdr.msg_namelen = alen;
+	hdr.msg_controllen = 0;
+	hdr.msg_flags = 0;
+
+	/* Describe the caller's buffer as a one-element user-space uio. */
+	vec.iov_len = len;
+	vec.iov_base = buffer;
+	xu.xu_uio.uio_iov = &vec;
+	xu.xu_uio.uio_iovcnt = 1;
+	xu.xu_uio.uio_resid = len;
+	xu.xu_uio.uio_loffset = 0;
+	xu.xu_uio.uio_limit = 0;
+	xu.xu_uio.uio_segflg = UIO_USERSPACE;
+
+	return (lx_recv_common(sock, &hdr, &xu, flags, addrlenp, NULL,
+	    NULL));
+}
+
+/*
+ * Emulate Linux recvmsg(2).
+ *
+ * The caller's msghdr at 'msg' is copied in using either the 32-bit or the
+ * native layout (chosen by get_udatamodel()), its iovec array is copied in
+ * and validated, and the request is handed to lx_recv_common().
+ *
+ * Failures detected here, each returned via set_errno():
+ *   EFAULT    the msghdr or the iovec array could not be copied in
+ *   EMSGSIZE  msg_iovlen is negative or greater than IOV_MAX
+ *   EINVAL    an iov_len is negative or the summed length overflows
+ * Otherwise the result of lx_recv_common() is returned unchanged.
+ */
+long
+lx_recvmsg(int sock, void *msg, int flags)
+{
+	struct nmsghdr smsg;
+	xuio_t xuio;
+	struct iovec luiov[IOV_MAX_STACK], *uiov;
+	int i, iovcnt, iovsize;
+	long res;
+	ssize_t len = 0;
+	/* User-space addresses of fields inside the caller's msghdr. */
+	void *namelenp, *controllenp, *flagsp;
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		lx_msghdr32_t lmsg32;
+		if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name;
+		smsg.msg_namelen = lmsg32.msg_namelen;
+		smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov;
+		smsg.msg_iovlen = lmsg32.msg_iovlen;
+		smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control;
+		smsg.msg_controllen = lmsg32.msg_controllen;
+		smsg.msg_flags = lmsg32.msg_flags;
+
+		namelenp = &((lx_msghdr32_t *)msg)->msg_namelen;
+		controllenp = &((lx_msghdr32_t *)msg)->msg_controllen;
+		flagsp = &((lx_msghdr32_t *)msg)->msg_flags;
+	} else
+#endif /* defined(_LP64) */
+	{
+		lx_msghdr_t lmsg;
+		if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		smsg.msg_name = lmsg.msg_name;
+		smsg.msg_namelen = lmsg.msg_namelen;
+		smsg.msg_iov = lmsg.msg_iov;
+		smsg.msg_iovlen = lmsg.msg_iovlen;
+		smsg.msg_control = lmsg.msg_control;
+		smsg.msg_controllen = lmsg.msg_controllen;
+		smsg.msg_flags = lmsg.msg_flags;
+
+		namelenp = &((lx_msghdr_t *)msg)->msg_namelen;
+		controllenp = &((lx_msghdr_t *)msg)->msg_controllen;
+		flagsp = &((lx_msghdr_t *)msg)->msg_flags;
+	}
+
+	/*
+	 * Pick storage for the kernel copy of the iovec array.  iovsize is
+	 * non-zero only when that storage came from kmem_alloc, so it also
+	 * serves as the "must free" indicator on every exit path below.
+	 */
+	iovcnt = smsg.msg_iovlen;
+	if (iovcnt < 0 || iovcnt > IOV_MAX) {
+		return (set_errno(EMSGSIZE));
+	}
+	if (iovcnt > IOV_MAX_STACK) {
+		iovsize = iovcnt * sizeof (struct iovec);
+		uiov = kmem_alloc(iovsize, KM_SLEEP);
+	} else if (iovcnt > 0) {
+		iovsize = 0;
+		uiov = luiov;
+	} else {
+		/* No iovecs at all: nothing to copy in, len stays 0. */
+		iovsize = 0;
+		uiov = NULL;
+		goto noiov;
+	}
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		/* convert from 32bit iovec structs */
+		struct iovec32 luiov32[IOV_MAX_STACK], *uiov32;
+		ssize_t iov32size;
+		ssize32_t count32;
+
+		/* The 32-bit staging array is heap-backed iff uiov is. */
+		iov32size = iovcnt * sizeof (struct iovec32);
+		if (iovsize != 0) {
+			uiov32 = kmem_alloc(iov32size, KM_SLEEP);
+		} else {
+			uiov32 = luiov32;
+		}
+
+		if (copyin((struct iovec32 *)smsg.msg_iov, uiov32, iov32size)) {
+			if (iovsize != 0) {
+				kmem_free(uiov32, iov32size);
+				kmem_free(uiov, iovsize);
+			}
+
+			return (set_errno(EFAULT));
+		}
+
+		/* Widen each entry while totalling and validating lengths. */
+		count32 = 0;
+		for (i = 0; i < iovcnt; i++) {
+			ssize32_t iovlen32;
+
+			iovlen32 = uiov32[i].iov_len;
+			count32 += iovlen32;
+			if (iovlen32 < 0 || count32 < 0) {
+				if (iovsize != 0) {
+					kmem_free(uiov32, iov32size);
+					kmem_free(uiov, iovsize);
+				}
+
+				return (set_errno(EINVAL));
+			}
+
+			uiov[i].iov_len = iovlen32;
+			uiov[i].iov_base =
+			    (caddr_t)(uintptr_t)uiov32[i].iov_base;
+		}
+		len = count32;
+
+		if (iovsize != 0) {
+			kmem_free(uiov32, iov32size);
+		}
+	} else
+#endif /* defined(_LP64) */
+	{
+		if (copyin(smsg.msg_iov, uiov,
+		    iovcnt * sizeof (struct iovec)) != 0) {
+			if (iovsize != 0) {
+				kmem_free(uiov, iovsize);
+			}
+			return (set_errno(EFAULT));
+		}
+
+		/* Total the lengths, rejecting negatives and overflow. */
+		len = 0;
+		for (i = 0; i < iovcnt; i++) {
+			ssize_t iovlen = uiov[i].iov_len;
+			len += iovlen;
+			if (iovlen < 0 || len < 0) {
+				if (iovsize != 0) {
+					kmem_free(uiov, iovsize);
+				}
+				return (set_errno(EINVAL));
+			}
+		}
+	}
+
+noiov:
+	/* Since the iovec is passed via the uio, NULL it out in the msg */
+	smsg.msg_iov = NULL;
+
+	xuio.xu_uio.uio_loffset = 0;
+	xuio.xu_uio.uio_iov = uiov;
+	xuio.xu_uio.uio_iovcnt = iovcnt;
+	xuio.xu_uio.uio_resid = len;
+	xuio.xu_uio.uio_segflg = UIO_USERSPACE;
+	xuio.xu_uio.uio_limit = 0;
+
+	res = lx_recv_common(sock, &smsg, &xuio, flags, namelenp, controllenp,
+	    flagsp);
+
+	/* Release the heap-backed iovec copy, if one was needed. */
+	if (iovsize != 0) {
+		kmem_free(uiov, iovsize);
+	}
+
+	return (res);
+}
+
+/*
+ * Variant of socket_sendmsg with Linux-flavored error handling layered on
+ * top of SOP_SENDMSG:
+ *  - EINTR, ENOMEM and EWOULDBLOCK are reported as success when some data
+ *    was already consumed from the uio (a partial send).
+ *  - ENOTCONN becomes EAGAIN for non-blocking sends, otherwise EPIPE.
+ *  - EPIPE (native or converted) raises SIGPIPE unless 'nosig' is B_TRUE.
+ * Returns 0 or an errno value.
+ */
+static int
+lx_socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+    cred_t *cr, boolean_t nosig)
+{
+	const ssize_t resid_before = uiop->uio_resid;
+	int rv;
+
+	/* Only a local (AF_UNIX) write keeps using the cache. */
+	if (so->so_family != AF_UNIX) {
+		uiop->uio_extflg &= ~UIO_COPY_CACHED;
+	} else {
+		uiop->uio_extflg |= UIO_COPY_CACHED;
+	}
+
+	rv = SOP_SENDMSG(so, msg, uiop, cr);
+
+	/* EAGAIN is EWOULDBLOCK */
+	if (rv == EINTR || rv == ENOMEM || rv == EWOULDBLOCK) {
+		/* If the residual moved, we did a partial send. */
+		return ((uiop->uio_resid != resid_before) ? 0 : rv);
+	}
+
+	if (rv == ENOTCONN) {
+		/*
+		 * A non-blocking socket that is still connecting gets
+		 * EAGAIN rather than a broken-pipe style failure.
+		 */
+		if ((msg->msg_flags & MSG_DONTWAIT) != 0 ||
+		    (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) {
+			return (EAGAIN);
+		}
+
+		/* Appease LTP and match behavior detailed in the man page */
+		rv = EPIPE;
+	}
+
+	if (rv == EPIPE && nosig == B_FALSE) {
+		tsignal(curthread, SIGPIPE);
+	}
+
+	return (rv);
+}
+
+/*
+ * Shared back end for lx_send(), lx_sendto() and lx_sendmsg().
+ *
+ * msg->msg_name and msg->msg_control arrive as user addresses; kernel
+ * copies are made here, substituted into 'msg' for the duration of the
+ * send, and freed before returning.  Returns the number of bytes consumed
+ * from 'uiop', or the result of set_errno() on failure.
+ */
+static long
+lx_send_common(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
+{
+	struct sonode *so;
+	file_t *fp;
+	struct sockaddr *kname = NULL;
+	void *kctl = NULL;
+	socklen_t knamelen, kctllen;
+	ssize_t total = 0;
+	boolean_t nosig;
+	int error;
+
+	so = getsonode(sock, &error, &fp);
+	if (so == NULL) {
+		return (set_errno(error));
+	}
+
+	uiop->uio_fmode = fp->f_flag;
+
+	/* Bring the destination address, if any, into the kernel. */
+	if (msg->msg_name == NULL || msg->msg_namelen == 0) {
+		knamelen = 0;
+		msg->msg_name = NULL;
+		msg->msg_namelen = 0;
+	} else {
+		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+
+		error = ltos_sockaddr_copyin((struct sockaddr *)msg->msg_name,
+		    msg->msg_namelen, &kname, &knamelen, NULL);
+		if (error != 0) {
+			goto done;
+		}
+		/* From here on the message refers to the kernel copy. */
+		msg->msg_name = kname;
+		msg->msg_namelen = knamelen;
+	}
+
+	/* Likewise for the control (ancillary) data. */
+	if (msg->msg_control == NULL || msg->msg_controllen == 0) {
+		kctllen = 0;
+		msg->msg_control = NULL;
+		msg->msg_controllen = 0;
+	} else {
+		/*
+		 * Bound the length first so an application cannot make the
+		 * kernel allocate an arbitrary amount of memory.
+		 */
+		if (msg->msg_controllen > SO_MAXARGSIZE) {
+			error = EINVAL;
+			goto done;
+		}
+		error = ltos_cmsgs_copyin(msg->msg_control,
+		    msg->msg_controllen, &kctl, &kctllen);
+		if (error != 0) {
+			goto done;
+		}
+		msg->msg_control = kctl;
+		msg->msg_controllen = kctllen;
+	}
+
+	total = uiop->uio_resid;
+	nosig = ((flags & LX_MSG_NOSIGNAL) != 0);
+	/* Translate the flags and default to XPG4.2 operation. */
+	msg->msg_flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS);
+	msg->msg_flags |= MSG_XPG4_2;
+
+	error = lx_socket_sendmsg(so, msg, uiop, CRED(), nosig);
+
+done:
+	/* Drop the kernel copies regardless of the outcome. */
+	if (kctl != NULL) {
+		kmem_free(kctl, kctllen);
+	}
+	if (kname != NULL) {
+		kmem_free(kname, knamelen);
+	}
+
+	if (error == 0) {
+		lwp_stat_update(LWP_STAT_MSGSND, 1);
+		releasef(sock);
+		return (total - uiop->uio_resid);
+	}
+
+	releasef(sock);
+	return (set_errno(error));
+}
+
+/*
+ * Emulate Linux send(2): transmit 'len' bytes from the user buffer
+ * 'buffer' on 'sock' with no destination address and no control data.
+ */
+long
+lx_send(int sock, void *buffer, size_t len, int flags)
+{
+	struct nmsghdr hdr;
+	struct iovec vec;
+	struct uio suio;
+
+	/* Keep len reasonably limited (see lx_recv) */
+	if ((ssize_t)len < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	/* Neither an address nor ancillary data accompanies a send. */
+	hdr.msg_control = NULL;
+	hdr.msg_name = NULL;
+
+	/* Describe the caller's buffer as a one-element user-space uio. */
+	vec.iov_len = len;
+	vec.iov_base = buffer;
+	suio.uio_iov = &vec;
+	suio.uio_iovcnt = 1;
+	suio.uio_resid = len;
+	suio.uio_loffset = 0;
+	suio.uio_limit = 0;
+	suio.uio_segflg = UIO_USERSPACE;
+
+	return (lx_send_common(sock, &hdr, &suio, flags));
+}
+
+/*
+ * Emulate Linux sendto(2): like lx_send(), but with an optional destination
+ * address 'dstaddr' of 'addrlen' bytes, handed to lx_send_common() as a
+ * user address for it to copy in.
+ */
+long
+lx_sendto(int sock, void *buffer, size_t len, int flags,
+    struct sockaddr *dstaddr, socklen_t addrlen)
+{
+	struct nmsghdr hdr;
+	struct iovec vec;
+	struct uio suio;
+
+	/* Keep len reasonably limited (see lx_recv) */
+	if ((ssize_t)len < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	/* Destination address as supplied; no ancillary data. */
+	hdr.msg_control = NULL;
+	hdr.msg_namelen = addrlen;
+	hdr.msg_name = (char *)dstaddr;
+
+	/* Describe the caller's buffer as a one-element user-space uio. */
+	vec.iov_len = len;
+	vec.iov_base = buffer;
+	suio.uio_iov = &vec;
+	suio.uio_iovcnt = 1;
+	suio.uio_resid = len;
+	suio.uio_loffset = 0;
+	suio.uio_limit = 0;
+	suio.uio_segflg = UIO_USERSPACE;
+
+	return (lx_send_common(sock, &hdr, &suio, flags));
+}
+
+/*
+ * Emulate Linux sendmsg(2).
+ *
+ * The caller's msghdr at 'msg' is copied in using either the 32-bit or the
+ * native layout (chosen by get_udatamodel()), its iovec array is copied in
+ * and validated, and the request is handed to lx_send_common().
+ *
+ * Failures detected here, each returned via set_errno():
+ *   EFAULT    the msghdr or the iovec array could not be copied in
+ *   EMSGSIZE  msg_iovlen is zero, negative, or greater than IOV_MAX
+ *   EINVAL    an iov_len is negative or the summed length overflows
+ * Otherwise the result of lx_send_common() is returned unchanged.
+ */
+long
+lx_sendmsg(int sock, void *msg, int flags)
+{
+	struct nmsghdr smsg;
+	struct uio auio;
+	struct iovec buf[IOV_MAX_STACK], *aiov;
+	int i, iovcnt, iovsize;
+	long res;
+	ssize_t len = 0;
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		lx_msghdr32_t lmsg32;
+		if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name;
+		smsg.msg_namelen = lmsg32.msg_namelen;
+		smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov;
+		smsg.msg_iovlen = lmsg32.msg_iovlen;
+		smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control;
+		smsg.msg_controllen = lmsg32.msg_controllen;
+		smsg.msg_flags = lmsg32.msg_flags;
+	} else
+#endif /* defined(_LP64) */
+	{
+		lx_msghdr_t lmsg;
+		if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		smsg.msg_name = lmsg.msg_name;
+		smsg.msg_namelen = lmsg.msg_namelen;
+		smsg.msg_iov = lmsg.msg_iov;
+		smsg.msg_iovlen = lmsg.msg_iovlen;
+		smsg.msg_control = lmsg.msg_control;
+		smsg.msg_controllen = lmsg.msg_controllen;
+		smsg.msg_flags = lmsg.msg_flags;
+	}
+
+	/*
+	 * Pick storage for the kernel copy of the iovec array.  iovsize is
+	 * non-zero only when that storage came from kmem_alloc, so it also
+	 * serves as the "must free" indicator on every exit path below.
+	 * Unlike lx_recvmsg(), an iovec count of zero is rejected here.
+	 */
+	iovcnt = smsg.msg_iovlen;
+	if (iovcnt <= 0 || iovcnt > IOV_MAX) {
+		return (set_errno(EMSGSIZE));
+	}
+	if (iovcnt > IOV_MAX_STACK) {
+		iovsize = iovcnt * sizeof (struct iovec);
+		aiov = kmem_alloc(iovsize, KM_SLEEP);
+	} else {
+		iovsize = 0;
+		aiov = buf;
+	}
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		/* convert from 32bit iovec structs */
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		ssize_t iov32size;
+		ssize32_t count32;
+
+		/* The 32-bit staging array is heap-backed iff aiov is. */
+		iov32size = iovcnt * sizeof (struct iovec32);
+		if (iovsize != 0) {
+			aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+		}
+
+		if (copyin((struct iovec32 *)smsg.msg_iov, aiov32, iov32size)) {
+			if (iovsize != 0) {
+				kmem_free(aiov32, iov32size);
+				kmem_free(aiov, iovsize);
+			}
+
+			return (set_errno(EFAULT));
+		}
+
+		/* Widen each entry while totalling and validating lengths. */
+		count32 = 0;
+		for (i = 0; i < iovcnt; i++) {
+			ssize32_t iovlen32;
+
+			iovlen32 = aiov32[i].iov_len;
+			count32 += iovlen32;
+			if (iovlen32 < 0 || count32 < 0) {
+				if (iovsize != 0) {
+					kmem_free(aiov32, iov32size);
+					kmem_free(aiov, iovsize);
+				}
+
+				return (set_errno(EINVAL));
+			}
+
+			aiov[i].iov_len = iovlen32;
+			aiov[i].iov_base =
+			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
+		}
+		len = count32;
+
+		if (iovsize != 0) {
+			kmem_free(aiov32, iov32size);
+		}
+	} else
+#endif /* defined(_LP64) */
+	{
+		if (copyin(smsg.msg_iov, aiov,
+		    iovcnt * sizeof (struct iovec)) != 0) {
+			if (iovsize != 0) {
+				kmem_free(aiov, iovsize);
+			}
+			return (set_errno(EFAULT));
+		}
+
+		/* Total the lengths, rejecting negatives and overflow. */
+		len = 0;
+		for (i = 0; i < iovcnt; i++) {
+			ssize_t iovlen = aiov[i].iov_len;
+
+			len += iovlen;
+			if (iovlen < 0 || len < 0) {
+				if (iovsize != 0) {
+					kmem_free(aiov, iovsize);
+				}
+				return (set_errno(EINVAL));
+			}
+		}
+	}
+	/* Since the iovec is passed via the uio, NULL it out in the msg */
+	smsg.msg_iov = NULL;
+
+	auio.uio_loffset = 0;
+	auio.uio_iov = aiov;
+	auio.uio_iovcnt = iovcnt;
+	auio.uio_resid = len;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_limit = 0;
+
+	res = lx_send_common(sock, &smsg, &auio, flags);
+
+	/* Release the heap-backed iovec copy, if one was needed. */
+	if (iovsize != 0) {
+		kmem_free(aiov, iovsize);
+	}
+
+	return (res);
+}
+
+/*
+ * Linux socket option type definitions
+ *
+ * The protocol `levels` are well defined (see in.h).  The option values are
+ * not so well defined: Linux often uses different values than illumos even
+ * though they mean the same thing.  For example, IP_TOS is defined as value
+ * 1 in Linux but as value 3 in illumos.  The tables below cover each
+ * protocol level and map its options between Linux and illumos.  Hence the
+ * reason for the complexity.
+ *
+ * For a certain subset of sockopts, Linux will implicitly truncate optval
+ * input, so long as optlen meets a minimum size.  Because illumos is strict
+ * about optlen, we must cap optlen for those options.
+ */
+
+/*
+ * One row of a Linux -> illumos option translation table.
+ * lx_sockopt_lookup() replaces the caller's option number with lsm_opt
+ * (refusing rows whose lsm_opt is OPTNOTSUP) and, when lsm_lcap is non-zero,
+ * reduces an optlen larger than lsm_lcap down to lsm_lcap.
+ */
+typedef struct lx_sockopt_map {
+	const int lsm_opt;	/* Illumos-native equivalent */
+	const int lsm_lcap;	/* Cap optlen to this size. (Ignored if 0) */
+} lx_sockopt_map_t;
+
+/*
+ * Handle on one protocol level's translation table, as built by
+ * PROTO_SOCKOPTS().  That macro stores the array's element count in
+ * lpo_max, so the valid indices into lpo_entries are 0 .. lpo_max - 1.
+ */
+typedef struct lx_proto_opts {
+	const lx_sockopt_map_t	*lpo_entries;	/* Linux->SunOS map entries */
+	unsigned int		lpo_max;	/* number of entries in table */
+} lx_proto_opts_t;
+
+/*
+ * Sentinel lsm_opt value marking a Linux option we don't support.
+ * Parenthesized so the expansion stays a single operand in any expression.
+ */
+#define	OPTNOTSUP	(-1)
+
+/*
+ * Build an lx_proto_opts_t initializer from a translation array: the array
+ * itself followed by its element count.  'opts' must be a true array (not a
+ * pointer) for the sizeof arithmetic to yield that count.
+ */
+#define	PROTO_SOCKOPTS(opts)    \
+	{ (opts), sizeof ((opts)) / sizeof ((opts)[0]) }
+
+/* Shorten name so the columns can line up */
+#define	IP_MREQ_SZ	(sizeof (struct ip_mreq))
+
+/*
+ * IP-level options (used by lx_setsockopt_ip with IPPROTO_IP).
+ *
+ * The array index is the Linux option number, since lx_sockopt_lookup()
+ * indexes the table directly with it.  Each row gives the native option and
+ * the optlen cap; the trailing comment names the Linux option for that slot.
+ * Rows without a trailing comment are unassigned slots.
+ */
+static const lx_sockopt_map_t ltos_ip_sockopts[LX_IP_UNICAST_IF + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ IP_TOS, sizeof (int) },		/* IP_TOS		*/
+	{ IP_TTL, sizeof (int) },		/* IP_TTL		*/
+	{ IP_HDRINCL, sizeof (int) },		/* IP_HDRINCL		*/
+	{ IP_OPTIONS, 0 },			/* IP_OPTIONS		*/
+	{ OPTNOTSUP, 0 },			/* IP_ROUTER_ALERT	*/
+	{ IP_RECVOPTS, sizeof (int) },		/* IP_RECVOPTS		*/
+	{ IP_RETOPTS, sizeof (int) },		/* IP_RETOPTS		*/
+	{ IP_PKTINFO, sizeof (int) },		/* IP_PKTINFO		*/
+	{ OPTNOTSUP, 0 },			/* IP_PKTOPTIONS	*/
+	{ OPTNOTSUP, 0 },			/* IP_MTUDISCOVER	*/
+	{ OPTNOTSUP, 0 },			/* IP_RECVERR		*/
+	{ IP_RECVTTL, sizeof (int) },		/* IP_RECVTTL		*/
+	{ OPTNOTSUP, 0 },			/* IP_RECVTOS		*/
+	{ OPTNOTSUP, 0 },			/* IP_MTU		*/
+	{ OPTNOTSUP, 0 },			/* IP_FREEBIND		*/
+	{ OPTNOTSUP, 0 },			/* IP_IPSEC_POLICY	*/
+	{ OPTNOTSUP, 0 },			/* IP_XFRM_POLICY	*/
+	{ OPTNOTSUP, 0 },			/* IP_PASSSEC		*/
+	{ OPTNOTSUP, 0 },			/* IP_TRANSPARENT	*/
+	{ OPTNOTSUP, 0 },			/* IP_ORIGDSTADDR	*/
+	{ OPTNOTSUP, 0 },			/* IP_MINTTL		*/
+	{ OPTNOTSUP, 0 },			/* IP_NODEFRAG		*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IP_MULTICAST_IF, sizeof (int) },	/* IP_MULTICAST_IF	*/
+	{ IP_MULTICAST_TTL, sizeof (int) },	/* IP_MULTICAST_TTL	*/
+	{ IP_MULTICAST_LOOP, sizeof (int) },	/* IP_MULTICAST_LOOP	*/
+	{ IP_ADD_MEMBERSHIP, IP_MREQ_SZ },	/* IP_ADD_MEMBERSHIP	*/
+	{ IP_DROP_MEMBERSHIP, IP_MREQ_SZ },	/* IP_DROP_MEMBERSHIP	*/
+	{ IP_UNBLOCK_SOURCE, 0 },		/* IP_UNBLOCK_SOURCE	*/
+	{ IP_BLOCK_SOURCE, 0 },			/* IP_BLOCK_SOURCE	*/
+	{ IP_ADD_SOURCE_MEMBERSHIP, 0 },	/* IP_ADD_SOURCE_MEMBERSHIP */
+	{ OPTNOTSUP, 0 },			/* IP_DROP_SOURCE_MEMBERSHIP */
+	{ OPTNOTSUP, 0 },			/* IP_MSFILTER		*/
+	{ OPTNOTSUP, 0 },			/* MCAST_JOIN_GROUP	*/
+	{ OPTNOTSUP, 0 },			/* MCAST_BLOCK_SOURCE	*/
+	{ OPTNOTSUP, 0 },			/* MCAST_UNBLOCK_SOURCE	*/
+	{ OPTNOTSUP, 0 },			/* MCAST_LEAVE_GROUP	*/
+	{ OPTNOTSUP, 0 },			/* MCAST_JOIN_SOURCE_GROUP */
+	{ OPTNOTSUP, 0 },			/* MCAST_LEAVE_SOURCE_GROUP */
+	{ OPTNOTSUP, 0 },			/* MCAST_MSFILTER	*/
+	{ OPTNOTSUP, 0 },			/* IP_MULTICAST_ALL	*/
+	{ OPTNOTSUP, 0 }			/* IP_UNICAST_IF	*/
+};
+
+static const lx_sockopt_map_t ltos_ipv6_sockopts[LX_IPV6_TCLASS + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },			/* IPV6_ADDRFORM	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292PKTINFO	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292HOPOPTS	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292DSTOPTS	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292RTHDR	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292PKTOPTIONS	*/
+	{ IPV6_CHECKSUM, sizeof (int) },	/* IPV6_CHECKSUM	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_2292HOPLIMIT	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_NEXTHOP		*/
+	{ OPTNOTSUP, 0 },			/* IPV6_AUTHHDR		*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IPV6_UNICAST_HOPS, sizeof (int) },	/* IPV6_UNICAST_HOPS	*/
+	{ IPV6_MULTICAST_IF, sizeof (int) },	/* IPV6_MULTICAST_IF	*/
+	{ IPV6_MULTICAST_HOPS, sizeof (int) },	/* IPV6_MULTICAST_HOPS	*/
+	{ IPV6_MULTICAST_LOOP, sizeof (int) },	/* IPV6_MULTICAST_LOOP	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_JOIN_GROUP	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_LEAVE_GROUP	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_ROUTER_ALERT	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_MTU_DISCOVER	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_MTU		*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RECVERR		*/
+	{ IPV6_V6ONLY, sizeof (int) },		/* IPV6_V6ONLY		*/
+	{ OPTNOTSUP, 0 },			/* IPV6_JOIN_ANYCAST	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_LEAVE_ANYCAST	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_IPSEC_POLICY	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_XFRM_POLICY	*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IPV6_RECVPKTINFO, sizeof (int) },	/* IPV6_RECVPKTINFO	*/
+	{ IPV6_PKTINFO, 0 },			/* IPV6_PKTINFO		*/
+	{ IPV6_RECVHOPLIMIT, sizeof (int) },	/* IPV6_RECVHOPLIMIT	*/
+	{ IPV6_HOPLIMIT, 0 },			/* IPV6_HOPLIMIT	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RECVHOPOPTS	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_HOPOPTS		*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RTHDRDSTOPTS	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RECVRTHDR	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RTHDR		*/
+	{ OPTNOTSUP, 0 },			/* IPV6_RECVDSTOPTS	*/
+	{ OPTNOTSUP, 0 },			/* IPV6_DSTOPTS		*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },			/* IPV6_RECVTCLASS	*/
+	{ IPV6_TCLASS, sizeof (int) }		/* IPV6_TCLASS		*/
+};
+
+/*
+ * ICMPv6-level options (used by lx_setsockopt_icmpv6 with IPPROTO_ICMPV6).
+ * Indexed by Linux option number; ICMP6_FILTER is the only supported entry.
+ */
+static const lx_sockopt_map_t ltos_icmpv6_sockopts[LX_ICMP6_FILTER + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ ICMP6_FILTER, 0 }	/* ICMP6_FILTER	*/
+};
+
+/*
+ * TCP-level options (used by lx_setsockopt_tcp with IPPROTO_TCP).
+ *
+ * Indexed by Linux option number.  TCP_DEFER_ACCEPT is OPTNOTSUP here
+ * because lx_setsockopt_tcp handles it itself before consulting the table.
+ */
+static const lx_sockopt_map_t ltos_tcp_sockopts[LX_TCP_NOTSENT_LOWAT + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ TCP_NODELAY, sizeof (int) },		/* TCP_NODELAY		*/
+	{ TCP_MAXSEG, sizeof (int) },		/* TCP_MAXSEG		*/
+	{ TCP_CORK, sizeof (int) },		/* TCP_CORK		*/
+	{ TCP_KEEPIDLE, sizeof (int) },		/* TCP_KEEPIDLE		*/
+	{ TCP_KEEPINTVL, sizeof (int) },	/* TCP_KEEPINTVL	*/
+	{ TCP_KEEPCNT, sizeof (int) },		/* TCP_KEEPCNT		*/
+	{ OPTNOTSUP, 0 },			/* TCP_SYNCNT		*/
+	{ TCP_LINGER2, sizeof (int) },		/* TCP_LINGER2		*/
+	{ OPTNOTSUP, 0 },			/* TCP_DEFER_ACCEPT	*/
+	{ OPTNOTSUP, 0 },			/* TCP_WINDOW_CLAMP	*/
+	{ OPTNOTSUP, 0 },			/* TCP_INFO		*/
+	{ OPTNOTSUP, 0 },			/* TCP_QUICKACK		*/
+	{ OPTNOTSUP, 0 },			/* TCP_CONGESTION	*/
+	{ OPTNOTSUP, 0 },			/* TCP_MD5SIG		*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },			/* TCP_THIN_LINEAR_TIMEOUTS */
+	{ OPTNOTSUP, 0 },			/* TCP_THIN_DUPACK	*/
+	{ OPTNOTSUP, 0 },			/* TCP_USER_TIMEOUT	*/
+	{ OPTNOTSUP, 0 },			/* TCP_REPAIR		*/
+	{ OPTNOTSUP, 0 },			/* TCP_REPAIR_QUEUE	*/
+	{ OPTNOTSUP, 0 },			/* TCP_QUEUE_SEQ	*/
+	{ OPTNOTSUP, 0 },			/* TCP_REPAIR_OPTIONS	*/
+	{ OPTNOTSUP, 0 },			/* TCP_FASTOPEN		*/
+	{ OPTNOTSUP, 0 },			/* TCP_TIMESTAMP	*/
+	{ OPTNOTSUP, 0 }			/* TCP_NOTSENT_LOWAT	*/
+};
+
+/*
+ * IGMP-level translation table, in the same row format as the other
+ * ltos_*_sockopts tables.  Note that, unlike its siblings, it is sized by
+ * the native IGMP_MTRACE constant rather than an LX_-prefixed one.
+ * NOTE(review): its consumer is outside this view -- presumably the
+ * IGMP-level setsockopt/getsockopt handlers; confirm.
+ */
+static const lx_sockopt_map_t ltos_igmp_sockopts[IGMP_MTRACE + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IGMP_MINLEN, 0 },		/* IGMP_MINLEN			*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IGMP_MEMBERSHIP_QUERY, 0 },	/* IGMP_HOST_MEMBERSHIP_QUERY	*/
+	{ IGMP_V1_MEMBERSHIP_REPORT, 0 }, /* IGMP_HOST_MEMBERSHIP_REPORT */
+	{ IGMP_DVMRP, 0 },		/* IGMP_DVMRP			*/
+	{ IGMP_PIM, 0 },		/* IGMP_PIM			*/
+	{ OPTNOTSUP, 0 },		/* IGMP_TRACE			*/
+	{ IGMP_V2_MEMBERSHIP_REPORT, 0 }, /* IGMPV2_HOST_MEMBERSHIP_REPORT */
+	{ IGMP_V2_LEAVE_GROUP, 0 },	/* IGMP_HOST_LEAVE_MESSAGE	*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },
+	{ IGMP_MTRACE_RESP, 0 },	/* IGMP_MTRACE_RESP		*/
+	{ IGMP_MTRACE, 0 }		/* IGMP_MTRACE			*/
+};
+
+/*
+ * Socket-level options (table used by lx_setsockopt_socket).
+ *
+ * Indexed by Linux option number; each row gives the native option and the
+ * optlen cap.  Several Linux options share a native target: SO_SNDBUFFORCE
+ * and SO_RCVBUFFORCE map onto plain SO_SNDBUF / SO_RCVBUF, and SO_PASSCRED
+ * maps onto SO_RECVUCRED.
+ */
+static const lx_sockopt_map_t ltos_socket_sockopts[LX_SO_BPF_EXTENSIONS + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ SO_DEBUG, sizeof (int) },	/* SO_DEBUG			*/
+	{ SO_REUSEADDR, sizeof (int) },	/* SO_REUSEADDR			*/
+	{ SO_TYPE, 0 },			/* SO_TYPE			*/
+	{ SO_ERROR, 0 },		/* SO_ERROR			*/
+	{ SO_DONTROUTE, sizeof (int) },	/* SO_DONTROUTE			*/
+	{ SO_BROADCAST, sizeof (int) },	/* SO_BROADCAST			*/
+	{ SO_SNDBUF, sizeof (int) },	/* SO_SNDBUF			*/
+	{ SO_RCVBUF, sizeof (int) },	/* SO_RCVBUF			*/
+	{ SO_KEEPALIVE, sizeof (int) },	/* SO_KEEPALIVE			*/
+	{ SO_OOBINLINE, sizeof (int) },	/* SO_OOBINLINE			*/
+	{ OPTNOTSUP, 0 },		/* SO_NO_CHECK			*/
+	{ OPTNOTSUP, 0 },		/* SO_PRIORITY			*/
+	{ SO_LINGER, 0 },		/* SO_LINGER			*/
+	{ OPTNOTSUP, 0 },		/* SO_BSDCOMPAT			*/
+	{ SO_REUSEPORT, sizeof (int) },	/* SO_REUSEPORT			*/
+	{ SO_RECVUCRED, sizeof (int) },	/* SO_PASSCRED			*/
+	{ OPTNOTSUP, 0 },		/* SO_PEERCRED			*/
+	{ SO_RCVLOWAT, sizeof (int) },	/* SO_RCVLOWAT			*/
+	{ SO_SNDLOWAT, sizeof (int) },	/* SO_SNDLOWAT			*/
+	{ SO_RCVTIMEO, 0 },		/* SO_RCVTIMEO			*/
+	{ SO_SNDTIMEO, 0 },		/* SO_SNDTIMEO			*/
+	{ OPTNOTSUP, 0 },		/* SO_SECURITY_AUTHENTICATION	*/
+	{ OPTNOTSUP, 0 },		/* SO_SECURITY_ENCRYPTION_TRANSPORT */
+	{ OPTNOTSUP, 0 },		/* SO_SECURITY_ENCRYPTION_NETWORK */
+	{ OPTNOTSUP, 0 },		/* SO_BINDTODEVICE		*/
+	{ SO_ATTACH_FILTER, 0 },	/* SO_ATTACH_FILTER		*/
+	{ SO_DETACH_FILTER, 0 },	/* SO_DETACH_FILTER		*/
+	{ OPTNOTSUP, 0 },		/* SO_PEERNAME			*/
+	{ SO_TIMESTAMP, sizeof (int) },	/* SO_TIMESTAMP			*/
+	{ SO_ACCEPTCONN, 0 },		/* SO_ACCEPTCONN		*/
+	{ OPTNOTSUP, 0 },		/* SO_PEERSEC			*/
+	{ SO_SNDBUF, sizeof (int) },	/* SO_SNDBUFFORCE		*/
+	{ SO_RCVBUF, sizeof (int) },	/* SO_RCVBUFFORCE		*/
+	{ OPTNOTSUP, 0 },		/* SO_PASSSEC			*/
+	{ OPTNOTSUP, 0 },		/* SO_TIMESTAMPNS		*/
+	{ OPTNOTSUP, 0 },		/* SO_MARK			*/
+	{ OPTNOTSUP, 0 },		/* SO_TIMESTAMPING		*/
+	{ SO_PROTOTYPE, 0 },		/* SO_PROTOCOL			*/
+	{ SO_DOMAIN, 0 },		/* SO_DOMAIN			*/
+	{ OPTNOTSUP, 0 },		/* SO_RXQ_OVFL			*/
+	{ OPTNOTSUP, 0 },		/* SO_WIFI_STATUS		*/
+	{ OPTNOTSUP, 0 },		/* SO_PEEK_OFF			*/
+	{ OPTNOTSUP, 0 },		/* SO_NOFCS			*/
+	{ OPTNOTSUP, 0 },		/* SO_LOCK_FILTER		*/
+	{ OPTNOTSUP, 0 },		/* SO_SELECT_ERR_QUEUE		*/
+	{ OPTNOTSUP, 0 },		/* SO_BUSY_POLL			*/
+	{ OPTNOTSUP, 0 },		/* SO_MAX_PACING_RATE		*/
+	{ OPTNOTSUP, 0 }		/* SO_BPF_EXTENSIONS		*/
+};
+
+/*
+ * Raw-socket-level translation table.  Every row is OPTNOTSUP, so any
+ * lookup through it fails; ICMP_FILTER is listed but not translated.
+ * NOTE(review): its consumer is outside this view -- confirm which handler
+ * uses it.
+ */
+static const lx_sockopt_map_t ltos_raw_sockopts[LX_ICMP_FILTER + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 }		/* ICMP_FILTER		*/
+};
+
+/*
+ * Packet-socket-level translation table, indexed by Linux option number.
+ * No row caps optlen (every lsm_lcap is 0).
+ * NOTE(review): its consumer is outside this view -- confirm which handler
+ * uses it.
+ */
+static const lx_sockopt_map_t ltos_packet_sockopts[LX_PACKET_STATISTICS + 1] = {
+	{ OPTNOTSUP, 0 },
+	{ PACKET_ADD_MEMBERSHIP, 0 },	/* PACKET_ADD_MEMBERSHIP	*/
+	{ PACKET_DROP_MEMBERSHIP, 0 },	/* PACKET_DROP_MEMBERSHIP	*/
+	{ OPTNOTSUP, 0 },		/* PACKET_RECV_OUTPUT		*/
+	{ OPTNOTSUP, 0 },
+	{ OPTNOTSUP, 0 },		/* PACKET_RX_RING		*/
+	{ PACKET_STATISTICS, 0 }	/* PACKET_STATISTICS		*/
+};
+
+/*
+ * Needed for SO_ATTACH_FILTER: a filter program described by a length and
+ * a pointer to its instructions.
+ * NOTE(review): presumably mirrors the layout a Linux caller passes as
+ * optval, with bf_len counting instructions -- confirm against the
+ * SO_ATTACH_FILTER handler.
+ */
+struct lx_bpf_program {
+    unsigned short bf_len;
+    caddr_t bf_insns;
+};
+
+/*
+ * Invert filter fields as Linux expects.
+ *
+ * XORs each of the eight __icmp6_filt[] words of *filterp with 0xFFFFFFFF,
+ * in place.  The expansion is a single comma expression that evaluates
+ * 'filterp' eight times, so the argument must be free of side effects.
+ */
+#define	LX_ICMP6_FILTER_INVERT(filterp) ( \
+	((filterp)->__icmp6_filt[0] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[1] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[2] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[3] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[4] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[5] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[6] ^= 0xFFFFFFFFU), \
+	((filterp)->__icmp6_filt[7] ^= 0xFFFFFFFFU))
+
+/*
+ * Translate a Linux socket option number into its illumos equivalent using
+ * the per-protocol table 'tbl'.
+ *
+ * On success B_TRUE is returned, *optname is overwritten with the native
+ * option and, when the matching entry carries a non-zero cap, an *optlen
+ * larger than that cap is reduced to it.  B_FALSE is returned, with both
+ * outputs untouched, when the option number falls outside the table or its
+ * entry is marked OPTNOTSUP.
+ */
+static boolean_t
+lx_sockopt_lookup(lx_proto_opts_t tbl, int *optname, socklen_t *optlen)
+{
+	const lx_sockopt_map_t *entry;
+
+	/*
+	 * lpo_max holds the number of entries in the table (that is what
+	 * PROTO_SOCKOPTS stores), so the last valid index is lpo_max - 1 and
+	 * an option equal to lpo_max must be refused as well.  Negative
+	 * option numbers are rejected explicitly instead of relying on the
+	 * implicit conversion to unsigned in the comparison.
+	 */
+	if (*optname < 0 || (unsigned int)*optname >= tbl.lpo_max) {
+		return (B_FALSE);
+	}
+	entry = &tbl.lpo_entries[*optname];
+	if (entry->lsm_opt == OPTNOTSUP) {
+		return (B_FALSE);
+	}
+	*optname = entry->lsm_opt;
+	/* Truncate the optlen if needed/allowed */
+	if (entry->lsm_lcap != 0 && *optlen > entry->lsm_lcap) {
+		*optlen = entry->lsm_lcap;
+	}
+	return (B_TRUE);
+}
+
+/*
+ * setsockopt handler for Linux IP-level options.
+ *
+ * A few options are handled specially (IP_RECVERR is accepted and ignored,
+ * IP_MTU_DISCOVER is mapped onto IP_DONTFRAG, and the multicast TTL/loop
+ * options are narrowed to a single byte); everything else is translated via
+ * ltos_ip_sockopts and passed to socket_setsockopt().
+ *
+ * Returns 0 or an errno value (ENOPROTOOPT when the option has no
+ * translation).
+ */
+static int
+lx_setsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	int *intval = (int *)optval;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts);
+
+	switch (optname) {
+	case LX_IP_RECVERR:
+		/*
+		 * Ping sets this option to receive errors on raw sockets.
+		 * Currently we just ignore it to make ping happy. From the
+		 * Linux ip.7 man page:
+		 *
+		 *   For raw sockets, IP_RECVERR enables passing of all
+		 *   received ICMP errors to the application.
+		 *
+		 * Programs known to depend upon this:
+		 * - ping
+		 * - traceroute
+		 * - mount.nfs
+		 */
+		return (0);
+
+	case LX_IP_MTU_DISCOVER: {
+		int val;
+
+		/*
+		 * We translate Linux's IP_MTU_DISCOVER into our IP_DONTFRAG,
+		 * allowing this be a byte or an integer and observing the
+		 * inverted sense of the two relative to one another (and
+		 * translating accordingly).
+		 *
+		 * NOTE(review): with optlen == 0 this still reads one byte
+		 * of optval -- confirm the caller never hands us an empty
+		 * buffer here.
+		 */
+		if (optlen < sizeof (int)) {
+			val = *((uint8_t *)optval);
+		} else {
+			val = *((int *)optval);
+		}
+
+		switch (val) {
+		case LX_IP_PMTUDISC_DONT:
+			val = 1;
+			break;
+
+		case LX_IP_PMTUDISC_DO:
+		case LX_IP_PMTUDISC_WANT:
+			val = 0;
+			break;
+
+		default:
+			return (EOPNOTSUPP);
+		}
+
+		error = socket_setsockopt(so, IPPROTO_IP, IP_DONTFRAG,
+		    &val, sizeof (val), CRED());
+		return (error);
+	}
+
+	case LX_IP_MULTICAST_TTL:
+	case LX_IP_MULTICAST_LOOP:
+		/*
+		 * For IP_MULTICAST_TTL and IP_MULTICAST_LOOP, Linux defines
+		 * the option value to be an integer while we define it to be
+		 * an unsigned character.  To prevent the kernel from spitting
+		 * back an error on an illegal length, verify that the option
+		 * value is less than UCHAR_MAX before truncating optlen.
+		 */
+		if (optlen <= 0 || optlen > sizeof (int)) {
+			return (EINVAL);
+		}
+		/*
+		 * Only look at the value as an int when the caller really
+		 * supplied a whole int.  A shorter optval is consumed as its
+		 * leading byte below, which always fits; reading it as an int
+		 * would run past the data that was provided.
+		 */
+		if (optlen == sizeof (int) && *intval > UINT8_MAX) {
+			return (EINVAL);
+		}
+		optlen = sizeof (uchar_t);
+		break;
+
+	default:
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, IPPROTO_IP, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+/*
+ * setsockopt handler for Linux IPv6-level options.  IPV6_MTU is accepted
+ * without doing anything; all other options are translated through
+ * ltos_ipv6_sockopts and applied at IPPROTO_IPV6.  Returns 0 or an errno
+ * value (ENOPROTOOPT when the option has no translation).
+ */
+static int
+lx_setsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	lx_proto_opts_t v6tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts);
+
+	switch (optname) {
+	case LX_IPV6_MTU:
+		/*
+		 * IPV6_MTU has no good native counterpart, yet programs
+		 * such as bind9 give up if setting it fails, so for now
+		 * success is simply reported.
+		 */
+		return (0);
+
+	default:
+		break;
+	}
+
+	if (lx_sockopt_lookup(v6tbl, &optname, &optlen)) {
+		return (socket_setsockopt(so, IPPROTO_IPV6, optname, optval,
+		    optlen, CRED()));
+	}
+
+	return (ENOPROTOOPT);
+}
+
+/*
+ * setsockopt handler for Linux ICMPv6-level options.  An ICMP6_FILTER
+ * value is bit-inverted in place before being passed down; the option is
+ * then translated through ltos_icmpv6_sockopts and applied at
+ * IPPROTO_ICMPV6.  Returns 0 or an errno value.
+ */
+static int
+lx_setsockopt_icmpv6(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	lx_proto_opts_t icmp6tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts);
+
+	if (optval != NULL && optname == LX_ICMP6_FILTER) {
+		icmp6_filter_t *filt = (icmp6_filter_t *)optval;
+
+		/* The filter must be exactly one icmp6_filter_t. */
+		if (optlen != sizeof (icmp6_filter_t)) {
+			return (EINVAL);
+		}
+		/*
+		 * Surprise! Linux and illumos use opposite senses for the
+		 * ICMP6_FILTER input, so flip every bit.
+		 */
+		LX_ICMP6_FILTER_INVERT(filt);
+	}
+
+	if (lx_sockopt_lookup(icmp6tbl, &optname, &optlen)) {
+		return (socket_setsockopt(so, IPPROTO_ICMPV6, optname, optval,
+		    optlen, CRED()));
+	}
+
+	return (ENOPROTOOPT);
+}
+
+static int
+lx_setsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts);
+
+	if (optname == LX_TCP_DEFER_ACCEPT) {
+		int *intval;
+		char *dfp;
+
+		/*
+		 * Emulate TCP_DEFER_ACCEPT using the datafilt(7M) socket
+		 * filter but we can't emulate the timeout aspect so treat any
+		 * positive value as enabling and anything else as disabling.
+		 */
+		if (optlen != sizeof (int)) {
+			return (EINVAL);
+		}
+		intval = (int *)optval;
+
+		/*
+		 * socket_setsockopt asserts that the optval is aligned, so
+		 * we use kmem_alloc to ensure this.
+		 */
+		dfp = (char *)kmem_alloc(sizeof (DATAFILT), KM_SLEEP);
+		(void) strcpy(dfp, DATAFILT);
+
+		if (*intval > 0) {
+			error = socket_setsockopt(so, SOL_FILTER, FIL_ATTACH,
+			    dfp, sizeof (DATAFILT), CRED());
+			if (error == EEXIST) {
+				error = 0;
+			}
+		} else {
+			error = socket_setsockopt(so, SOL_FILTER, FIL_DETACH,
+			    dfp, sizeof (DATAFILT), CRED());
+			if (error == ENXIO) {
+				error = 0;
+			}
+		}
+		kmem_free(dfp, sizeof (DATAFILT));
+		return (error);
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_setsockopt_socket(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts);
+	struct lx_bpf_program *lbp;
+	int *intval;
+	struct bpf_program bp;
+
+	switch (optname) {
+	case LX_SO_BSDCOMPAT:
+		/* Linux ignores this option. */
+		return (0);
+
+	case LX_SO_TIMESTAMP:
+		/*
+		 * SO_TIMESTAMP is not supported on AF_UNIX sockets but we have
+		 * some of those which apps use for logging, etc., so pretend
+		 * this worked.
+		 */
+		if (so->so_family == AF_UNIX) {
+			return (0);
+		}
+		break;
+
+	case LX_SO_ATTACH_FILTER:
+		/*
+		 * Convert bpf program struct
+		 */
+		if (optlen != sizeof (struct lx_bpf_program)) {
+			return (EINVAL);
+		}
+		lbp = (struct lx_bpf_program *)optval;
+		bp.bf_len = lbp->bf_len;
+		bp.bf_insns = (struct bpf_insn *)lbp->bf_insns;
+		optval = &bp;
+		break;
+
+	case LX_SO_PASSSEC:
+		/*
+		 * SO_PASSSEC is very similar to SO_PASSCRED (emulated by
+		 * SO_RECVUCRED) in that it requests that cmsgs containing
+		 * identity information be attached to received messages.
+		 * Instead of ucred information, security-module-specific
+		 * information such as selinux label is expected
+		 *
+		 * Since LX does not at all support selinux today, the
+		 * option is silently accepted.
+		 */
+		return (0);
+
+	case LX_SO_PASSCRED:
+		/*
+		 * In many cases, the Linux SO_PASSCRED is mapped to the SunOS
+		 * SO_RECVUCRED to enable the passing of peer credential
+		 * information via received cmsgs.  One exception is for
+		 * connection-oriented AF_UNIX sockets which do not yet support
+		 * that option.  Instead, we track the setting internally and,
+		 * when there is appropriate cmsg space, emulate the credential
+		 * passing by querying the STREAMS ioctl.
+		 */
+		if (so->so_family == AF_UNIX &&
+		    (so->so_mode & SM_CONNREQUIRED) != 0) {
+			lx_socket_aux_data_t *sad;
+
+			if (optlen != sizeof (int)) {
+				return (EINVAL);
+			}
+			intval = (int *)optval;
+			sad = lx_sad_acquire(SOTOV(so));
+			sad->lxsad_stream_cred = !(*intval == 0);
+			mutex_exit(&sad->lxsad_lock);
+			return (0);
+		}
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, SOL_SOCKET, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_setsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts);
+
+	switch (optname) {
+	case LX_ICMP_FILTER:
+		/*
+		 * This option is currently ignored to appease ping.
+		 */
+		return (0);
+
+	case LX_IPV6_CHECKSUM:
+		/*
+		 * Ping6 tries to set the IPV6_CHECKSUM offset in a way that
+		 * illumos won't allow.  Quietly ignore this to prevent it from
+		 * complaining.
+		 */
+		return (0);
+
+	default:
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, IPPROTO_RAW, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_setsockopt_packet(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts);
+	struct packet_mreq *mr;
+
+	switch (optname) {
+	case LX_PACKET_ADD_MEMBERSHIP:
+	case LX_PACKET_DROP_MEMBERSHIP:
+		/* Convert Linux mr_type to illumos */
+		if (optlen != sizeof (struct packet_mreq)) {
+			return (EINVAL);
+		}
+		mr = (struct packet_mreq *)optval;
+		if (--mr->mr_type > PACKET_MR_ALLMULTI)
+			return (EINVAL);
+		optval = mr;
+		break;
+
+	default:
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, SOL_PACKET, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_setsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t optlen)
+{
+	int error;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_setsockopt(so, IPPROTO_IGMP, optname, optval, optlen,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_IP, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_IPV6, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_icmpv6(sonode_t *so, int optname, void *optval,
+    socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts);
+
+	if (optname == LX_ICMP6_FILTER) {
+		error = socket_getsockopt(so, IPPROTO_ICMPV6, ICMP6_FILTER,
+		    optval, optlen, 0, CRED());
+
+		/*
+		 * ICMP6_FILTER is inverted on Linux. Make it so before copying
+		 * back to caller's buffer.
+		 */
+		if (error == 0) {
+			LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval);
+		}
+		return (error);
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen,
+	    0, CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+	int error = 0;
+	int *intval = (int *)optval;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts);
+
+	switch (optname) {
+	case LX_TCP_CORK:
+		/*
+		 * We do not support TCP_CORK but some apps rely on it.  Rather
+		 * than return an error we just return 0.  This isn't exactly a
+		 * lie, since this option really isn't set, but it's not the
+		 * whole truth either. Fortunately, we aren't under oath.
+		 */
+		if (*optlen < sizeof (int)) {
+			error = EINVAL;
+		} else {
+			*intval = 0;
+		}
+		*optlen = sizeof (int);
+		return (error);
+
+	case LX_TCP_DEFER_ACCEPT:
+		/*
+		 * We do support TCP_DEFER_ACCEPT using the datafilt(7M) socket
+		 * filter but we don't emulate the timeout aspect so treat the
+		 * existence as 1 and absence as 0.
+		 */
+		if (*optlen < sizeof (int)) {
+			error = EINVAL;
+		} else {
+			struct fil_info fi[10];
+			int i;
+			socklen_t len = sizeof (fi);
+
+			if ((error = socket_getsockopt(so, SOL_FILTER,
+			    FIL_LIST, fi, &len, 0, CRED())) != 0) {
+				*optlen = sizeof (int);
+				return (error);
+			}
+
+			*intval = 0;
+			len = len / sizeof (struct fil_info);
+			for (i = 0; i < len; i++) {
+				if (fi[i].fi_flags == FILF_PROG &&
+				    strcmp(fi[i].fi_name, "datafilt") == 0) {
+					*intval = 1;
+					break;
+				}
+			}
+		}
+		*optlen = sizeof (int);
+		return (error);
+	default:
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_TCP, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_socket(sonode_t *so, int optname, void *optval,
+    socklen_t *optlen)
+{
+	int error = 0;
+	int *intval = (int *)optval;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts);
+
+	switch (optname) {
+	case LX_SO_PASSSEC:
+		/*
+		 * Communicate value of 0 since selinux-related functionality
+		 * is not supported.
+		 */
+		if (*optlen < sizeof (int)) {
+			error = EINVAL;
+		} else {
+			*intval = 0;
+		}
+		*optlen = sizeof (int);
+		return (error);
+
+	case LX_SO_PASSCRED:
+		/*
+		 * Special handling for connection-oriented AF_UNIX sockets.
+		 * See lx_setsockopt_socket for more details.
+		 */
+		if (so->so_family == AF_UNIX &&
+		    (so->so_mode & SM_CONNREQUIRED) != 0) {
+			lx_socket_aux_data_t *sad;
+
+			if (*optlen < sizeof (int)) {
+				return (EINVAL);
+			}
+			sad = lx_sad_acquire(SOTOV(so));
+			*intval = sad->lxsad_stream_cred;
+			*optlen = sizeof (int);
+			mutex_exit(&sad->lxsad_lock);
+			return (0);
+		}
+		break;
+
+	case LX_SO_PEERCRED:
+		if (*optlen < sizeof (struct lx_ucred)) {
+			error = EINVAL;
+		} else {
+			struct lx_ucred *lcred = (struct lx_ucred *)optval;
+
+			mutex_enter(&so->so_lock);
+			if ((so->so_mode & SM_CONNREQUIRED) == 0) {
+				error = ENOTSUP;
+			} else if (so->so_peercred == NULL) {
+				error = EINVAL;
+			} else {
+				lcred->lxu_uid = crgetuid(so->so_peercred);
+				lcred->lxu_gid = crgetgid(so->so_peercred);
+				lcred->lxu_pid = so->so_cpid;
+			}
+			mutex_exit(&so->so_lock);
+		}
+		*optlen = sizeof (struct lx_ucred);
+		return (error);
+
+	default:
+		break;
+	}
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, SOL_SOCKET, optname, optval, optlen, 0,
+	    CRED());
+
+	if (error == 0) {
+		switch (optname) {
+		case SO_TYPE:
+			/* translate our type back to Linux */
+			*intval = STOL_SOCKTYPE(*intval);
+			break;
+
+		case SO_ERROR:
+			*intval = lx_errno(*intval, EINVAL);
+			break;
+		default:
+			break;
+		}
+	}
+	return (error);
+}
+
+static int
+lx_getsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_RAW, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_packet(sonode_t *so, int optname, void *optval,
+    socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, SOL_PACKET, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+static int
+lx_getsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t *optlen)
+{
+	int error = 0;
+	lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts);
+
+	if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) {
+		return (ENOPROTOOPT);
+	}
+
+	error = socket_getsockopt(so, IPPROTO_IGMP, optname, optval, optlen, 0,
+	    CRED());
+	return (error);
+}
+
+long
+lx_setsockopt(int sock, int level, int optname, void *optval, socklen_t optlen)
+{
+	struct sonode *so;
+	file_t *fp;
+	int buflen = 0;
+	intptr_t stkbuf[2];
+	void *optbuf = stkbuf;
+	int error = 0;
+
+	if (optlen != 0) {
+		if (optlen > SO_MAXARGSIZE) {
+			return (set_errno(EINVAL));
+		}
+		if (optlen > sizeof (stkbuf)) {
+			buflen = optlen;
+			optbuf = kmem_alloc(optlen, KM_SLEEP);
+		} else {
+			/*
+			 * Zero the on-stack buffer to avoid poisoning smaller
+			 * optvals with stack garbage.
+			 */
+			stkbuf[0] = 0;
+			stkbuf[1] = 0;
+		}
+		if (copyin(optval, optbuf, optlen) != 0) {
+			if (buflen != 0) {
+				kmem_free(optbuf, buflen);
+			}
+			return (set_errno(EFAULT));
+		}
+	} else {
+		optbuf = NULL;
+	}
+	if ((so = getsonode(sock, &error, &fp)) == NULL) {
+		if (buflen != 0) {
+			kmem_free(optbuf, buflen);
+		}
+		return (set_errno(error));
+	}
+
+	switch (level) {
+	case LX_IPPROTO_IP:
+		error = lx_setsockopt_ip(so, optname, optbuf, optlen);
+		break;
+	case LX_IPPROTO_IPV6:
+		error = lx_setsockopt_ipv6(so, optname, optbuf, optlen);
+		break;
+	case LX_IPPROTO_ICMPV6:
+		error = lx_setsockopt_icmpv6(so, optname, optbuf, optlen);
+		break;
+	case LX_IPPROTO_TCP:
+		error = lx_setsockopt_tcp(so, optname, optbuf, optlen);
+		break;
+	case LX_SOL_SOCKET:
+		error = lx_setsockopt_socket(so, optname, optbuf, optlen);
+		break;
+	case LX_IPPROTO_RAW:
+		error = lx_setsockopt_raw(so, optname, optbuf, optlen);
+		break;
+	case LX_SOL_PACKET:
+		error = lx_setsockopt_packet(so, optname, optbuf, optlen);
+		break;
+	case LX_IPPROTO_IGMP:
+		error = lx_setsockopt_igmp(so, optname, optbuf, optlen);
+		break;
+	case LX_SOL_NETLINK:
+		/*
+		 * Since our netlink implementation is modeled after Linux,
+		 * the copied-in option value is passed through untranslated.
+		 */
+		error = socket_setsockopt(so, LX_SOL_NETLINK, optname, optbuf,
+		    optlen, CRED());
+		break;
+	default:
+		error = ENOPROTOOPT;
+		break;
+	}
+
+	if (error == ENOPROTOOPT) {
+		char buf[LX_UNSUP_BUFSZ];
+
+		snprintf(buf, LX_UNSUP_BUFSZ, "setsockopt(%d, %d)", level,
+		    optname);
+		lx_unsupported(buf);
+	}
+	if (buflen != 0) {
+		kmem_free(optbuf, buflen);
+	}
+	releasef(sock);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_getsockopt(int sock, int level, int optname, void *optval,
+    socklen_t *optlenp)
+{
+	struct sonode *so;
+	file_t *fp;
+	int error = 0, buflen = 0;
+	socklen_t optlen;
+	intptr_t stkbuf[2];
+	void *optbuf = stkbuf;
+
+	if (copyin(optlenp, &optlen, sizeof (optlen)) != 0) {
+		return (set_errno(EFAULT));
+	}
+	if (optlen != 0) {
+		if (optlen > SO_MAXARGSIZE) {
+			return (set_errno(EINVAL));
+		}
+		if (optlen > sizeof (stkbuf)) {
+			buflen = optlen;
+			optbuf = kmem_zalloc(optlen, KM_SLEEP);
+		} else {
+			/* zero the on-stack buffer, just in case */
+			stkbuf[0] = 0;
+			stkbuf[1] = 0;
+		}
+	} else {
+		optbuf = NULL;
+	}
+	if ((so = getsonode(sock, &error, &fp)) == NULL) {
+		if (buflen != 0) {
+			kmem_free(optbuf, buflen);
+		}
+		return (set_errno(error));
+	}
+
+	switch (level) {
+	case LX_IPPROTO_IP:
+		error = lx_getsockopt_ip(so, optname, optbuf, &optlen);
+		break;
+	case LX_IPPROTO_IPV6:
+		error = lx_getsockopt_ipv6(so, optname, optbuf, &optlen);
+		break;
+	case LX_IPPROTO_ICMPV6:
+		error = lx_getsockopt_icmpv6(so, optname, optbuf, &optlen);
+		break;
+	case LX_IPPROTO_TCP:
+		error = lx_getsockopt_tcp(so, optname, optbuf, &optlen);
+		break;
+	case LX_SOL_SOCKET:
+		error = lx_getsockopt_socket(so, optname, optbuf, &optlen);
+		break;
+	case LX_IPPROTO_RAW:
+		error = lx_getsockopt_raw(so, optname, optbuf, &optlen);
+		break;
+	case LX_SOL_PACKET:
+		error = lx_getsockopt_packet(so, optname, optbuf, &optlen);
+		break;
+	case LX_IPPROTO_IGMP:
+		error = lx_getsockopt_igmp(so, optname, optbuf, &optlen);
+		break;
+	case LX_SOL_NETLINK:
+		/*
+		 * Since our netlink implementation is modeled after Linux,
+		 * sockopts pass through untranslated into the kernel buffer.
+		 */
+		error = socket_getsockopt(so, LX_SOL_NETLINK, optname, optbuf,
+		    &optlen, 0, CRED());
+		break;
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	if (error == ENOPROTOOPT) {
+		char buf[LX_UNSUP_BUFSZ];
+
+		snprintf(buf, LX_UNSUP_BUFSZ, "getsockopt(%d, %d)", level,
+		    optname);
+		lx_unsupported(buf);
+	}
+	if (copyout(&optlen, optlenp, sizeof (optlen)) != 0) {
+		error = EFAULT;
+	}
+	if (error == 0 && optlen > 0) {
+		VERIFY(optlen <= sizeof (stkbuf) || optlen <= buflen);
+		if (copyout(optbuf, optval, optlen) != 0) {
+			error = EFAULT;
+		}
+	}
+	if (buflen != 0) {
+		kmem_free(optbuf, buflen);
+	}
+	releasef(sock);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_getname_common(lx_getname_type_t type, int sockfd, void *np, int *nlp)
+{
+	struct sockaddr_storage buf;
+	struct sockaddr *name = (struct sockaddr *)&buf;
+	socklen_t namelen, namelen_orig;
+	int err, tmp;
+	struct sonode *so;
+
+	/* We need to validate the name address up front to pass LTP. */
+	if (copyin(np, &tmp, sizeof (tmp)) != 0)
+		return (set_errno(EFAULT));
+
+	if (copyin(nlp, &namelen, sizeof (socklen_t)) != 0)
+		return (set_errno(EFAULT));
+	namelen_orig = namelen;
+
+	/* LTP can pass -1 */
+	if ((int)namelen < 0)
+		return (set_errno(EINVAL));
+
+	if ((so = getsonode(sockfd, &err, NULL)) == NULL)
+		return (set_errno(err));
+
+	bzero(&buf, sizeof (buf));
+	namelen = sizeof (struct sockaddr_storage);
+	if (type == LX_GETPEERNAME) {
+		err = socket_getpeername(so, name, &namelen, B_FALSE, CRED());
+	} else {
+		err = socket_getsockname(so, name, &namelen, CRED());
+	}
+
+	if (err == 0) {
+		ASSERT(namelen <= so->so_max_addr_len);
+		err = stol_sockaddr_copyout(name, namelen,
+		    (struct sockaddr *)np, (socklen_t *)nlp, namelen_orig);
+	}
+
+	releasef(sockfd);
+	return (err != 0 ? set_errno(err) : 0);
+}
+
+long
+lx_getpeername(int sockfd, void *np, int *nlp)
+{
+	return (lx_getname_common(LX_GETPEERNAME, sockfd, np, nlp));
+}
+
+long
+lx_getsockname(int sockfd, void *np, int *nlp)
+{
+	return (lx_getname_common(LX_GETSOCKNAME, sockfd, np, nlp));
+}
+
+static int
+lx_accept_common(int sock, struct sockaddr *name, socklen_t *nlp, int flags)
+{
+	struct sonode *so;
+	file_t *fp;
+	int error;
+	socklen_t namelen;
+	struct sonode *nso;
+	struct vnode *nvp;
+	struct file *nfp;
+	int nfd;
+	int arg;
+
+	if (flags & ~(LX_SOCK_CLOEXEC | LX_SOCK_NONBLOCK)) {
+		return (set_errno(EINVAL));
+	}
+
+	if ((so = getsonode(sock, &error, &fp)) == NULL)
+		return (set_errno(error));
+
+	if (name != NULL) {
+		/*
+		 * The Linux man page says that -1 is returned and errno is set
+		 * to EFAULT if the "name" address is bad, but it is silent on
+		 * what to set errno to if the "namelen" address is bad.
+		 * LTP expects EINVAL.
+		 *
+		 * Note that we must first check the name pointer, as the Linux
+		 * docs state nothing is copied out if the "name" pointer is
+		 * NULL. If it is NULL, we don't care about the namelen
+		 * pointer's value or about dereferencing it.
+		 */
+		if (copyin(nlp, &namelen, sizeof (namelen))) {
+			releasef(sock);
+			return (set_errno(EINVAL));
+		}
+		if (namelen == 0) {
+			name = NULL;
+		}
+	} else {
+		namelen = 0;
+	}
+
+	/*
+	 * Allocate the user fd before socket_accept() in order to
+	 * catch EMFILE errors before calling socket_accept().
+	 */
+	if ((error = falloc(NULL, FWRITE|FREAD, &nfp, &nfd)) != 0) {
+		eprintsoline(so, EMFILE);
+		releasef(sock);
+		return (set_errno(error));
+	}
+	if ((error = socket_accept(so, fp->f_flag, CRED(), &nso)) != 0) {
+		setf(nfd, NULL);
+		unfalloc(nfp);
+		releasef(sock);
+		return (set_errno(error));
+	}
+
+	nvp = SOTOV(nso);
+
+	if (namelen != 0) {
+		socklen_t addrlen = sizeof (struct sockaddr_storage);
+		struct sockaddr_storage buf;
+		struct sockaddr *addrp = (struct sockaddr *)&buf;
+
+		if ((error = socket_getpeername(nso, addrp, &addrlen, B_TRUE,
+		    CRED())) == 0) {
+			error = stol_sockaddr_copyout(addrp, addrlen,
+			    name, nlp, namelen);
+			/*
+			 * Logic might dictate that we should check if we can
+			 * write to the namelen pointer earlier so we don't
+			 * accept a pending connection only to fail the call
+			 * because we can't write the namelen value back out.
+			 * However, testing shows Linux does indeed fail the
+			 * call after accepting the connection so we must
+			 * behave in a compatible manner.
+			 */
+		} else {
+			ASSERT(error == EINVAL || error == ENOTCONN);
+			error = ECONNABORTED;
+		}
+	}
+
+	if (error != 0) {
+		setf(nfd, NULL);
+		unfalloc(nfp);
+		(void) socket_close(nso, 0, CRED());
+		socket_destroy(nso);
+		releasef(sock);
+		return (set_errno(error));
+	}
+
+	/* Fill in the entries that falloc reserved */
+	nfp->f_vnode = nvp;
+	mutex_exit(&nfp->f_tlock);
+	setf(nfd, nfp);
+
+	/* Act on LX_SOCK_CLOEXEC from flags */
+	if (flags & LX_SOCK_CLOEXEC) {
+		f_setfd(nfd, FD_CLOEXEC);
+	}
+
+	/*
+	 * In Linux, accept()ed sockets do not inherit anything set by fcntl(),
+	 * so either explicitly set the flags or filter those out.
+	 *
+	 * The VOP_SETFL code is a simplification of the F_SETFL code in
+	 * fcntl(). Ignore any errors from VOP_SETFL.
+	 */
+	arg = 0;
+	if (flags & LX_SOCK_NONBLOCK)
+		arg |= FNONBLOCK;
+
+	error = VOP_SETFL(nvp, nfp->f_flag, arg, nfp->f_cred, NULL);
+	if (error != 0) {
+		eprintsoline(so, error);
+		error = 0;
+	} else {
+		mutex_enter(&nfp->f_tlock);
+		nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
+		nfp->f_flag |= arg;
+		mutex_exit(&nfp->f_tlock);
+	}
+
+	releasef(sock);
+	return (nfd);
+}
+
+long
+lx_accept(int sockfd, void *np, int *nlp)
+{
+	return (lx_accept_common(sockfd, (struct sockaddr *)np,
+	    (socklen_t *)nlp, 0));
+}
+
+long
+lx_accept4(int sockfd, void *np, int *nlp, int flags)
+{
+	return (lx_accept_common(sockfd, (struct sockaddr *)np,
+	    (socklen_t *)nlp, flags));
+}
+
+#if defined(_SYSCALL32_IMPL)
+
+#define	LX_SYS_SOCKETCALL		102
+#define	LX_SOCKETCALL_MAX		20
+
+typedef long (*lx_sockfn_t)();
+
+static struct {
+	lx_sockfn_t s_fn;	/* Function implementing the subcommand */
+	int s_nargs;		/* Number of arguments the function takes */
+} lx_socketcall_fns[] = {
+	lx_socket,	3,	/* socket */
+	lx_bind,	3,	/* bind */
+	lx_connect,	3,	/* connect */
+	NULL,		2,	/* listen */
+	lx_accept,	3,	/* accept */
+	lx_getsockname,	3,	/* getsockname */
+	lx_getpeername,	3,	/* getpeername */
+	NULL,		4,	/* socketpair */
+	lx_send,	4,	/* send */
+	lx_recv,	4,	/* recv */
+	lx_sendto,	6,	/* sendto */
+	lx_recvfrom,	6,	/* recvfrom */
+	NULL,		2,	/* shutdown */
+	lx_setsockopt,	5,	/* setsockopt */
+	lx_getsockopt,	5,	/* getsockopt */
+	lx_sendmsg,	3,	/* sendmsg */
+	lx_recvmsg,	3,	/* recvmsg */
+	lx_accept4,	4,	/* accept4 */
+	NULL,		5,	/* recvmmsg */
+	NULL,		4	/* sendmmsg */
+};
+
+long
+lx_socketcall(long p1, uint32_t *p2)
+{
+	int subcmd, i;
+	unsigned long args[6] = { 0, 0, 0, 0, 0, 0 };
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+
+	/* incoming subcmds are 1-indexed */
+	subcmd = (int)p1 - 1;
+
+	if (subcmd < 0 || subcmd >= LX_SOCKETCALL_MAX) {
+		return (set_errno(EINVAL));
+	}
+
+	/* Vector back out to userland emulation if we lack IKE */
+	if (lx_socketcall_fns[subcmd].s_fn == NULL) {
+		uintptr_t uargs[2] = {p1, (uintptr_t)p2};
+		/* The userspace emulation will handle the syscall return */
+		lwpd->br_eosys = JUSTRETURN;
+		lx_emulate_user32(ttolwp(curthread), LX_SYS_SOCKETCALL, uargs);
+		return (0);
+	}
+
+	/*
+	 * Copy the arguments to the subcommand in from the app's address
+	 * space, returning EFAULT if we get a bogus pointer.
+	 */
+	for (i = 0; i < lx_socketcall_fns[subcmd].s_nargs; i++) {
+		uint32_t arg;
+
+		if (copyin(&p2[i], &arg, sizeof (uint32_t)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		args[i] = (unsigned long)arg;
+	}
+
+	return ((lx_socketcall_fns[subcmd].s_fn)(args[0], args[1], args[2],
+	    args[3], args[4], args[5]));
+}
+
+#endif /* defined(_SYSCALL32_IMPL) */
+
+static void
+lx_socket_vsd_free(void *data)
+{
+	lx_socket_aux_data_t *entry;
+
+	entry = (lx_socket_aux_data_t *)data;
+	mutex_destroy(&entry->lxsad_lock);
+	kmem_free(entry, sizeof (*entry));
+}
+
+void
+lx_socket_init()
+{
+	vsd_create(&lx_socket_vsd, lx_socket_vsd_free);
+}
+
+void
+lx_socket_fini()
+{
+	vsd_destroy(&lx_socket_vsd);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_stat.c b/usr/src/uts/common/brand/lx/syscall/lx_stat.c
new file mode 100644
index 0000000000..2ec8a4542d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_stat.c
@@ -0,0 +1,439 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/model.h>
+#include <sys/mode.h>
+#include <sys/stat.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_types.h>
+#include <sys/lx_impl.h>
+#include <sys/brand.h>
+#include <sys/ddi.h>
+
+/* From "uts/common/syscall/stat.c" */
+extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **);
+
+typedef struct lx_timespec32 {
+	int32_t	ts_sec;
+	int32_t	ts_nsec;
+} lx_timespec32_t;
+
+typedef struct lx_timespec64 {
+	int64_t	ts_sec;
+	int64_t	ts_nsec;
+}lx_timespec64_t;
+
+struct lx_stat32 {
+	uint16_t	st_dev;
+	uint16_t	st_pad1;
+	uint32_t	st_ino;
+	uint16_t	st_mode;
+	uint16_t	st_nlink;
+	uint16_t	st_uid;
+	uint16_t	st_gid;
+	uint16_t	st_rdev;
+	uint16_t 	st_pad2;
+	uint32_t	st_size;
+	uint32_t	st_blksize;
+	uint32_t	st_blocks;
+	lx_timespec32_t	st_atime;
+	lx_timespec32_t	st_mtime;
+	lx_timespec32_t	st_ctime;
+	uint32_t	st_pad3;
+	uint32_t	st_pad4;
+};
+
+#pragma pack(4)
+struct lx_stat64_32 {
+	uint64_t	st_dev;
+	uint32_t	st_pad1;
+	uint32_t	st_small_ino;
+	uint32_t	st_mode;
+	uint32_t	st_nlink;
+	uint32_t	st_uid;
+	uint32_t	st_gid;
+	uint64_t	st_rdev;
+	uint32_t	st_pad2;
+	uint64_t	st_size;
+	uint32_t	st_blksize;
+	uint64_t	st_blocks;
+	lx_timespec32_t	st_atime;
+	lx_timespec32_t	st_mtime;
+	lx_timespec32_t	st_ctime;
+	uint64_t	st_ino;
+};
+#pragma pack()
+
+#if defined(_LP64)
+struct lx_stat64_64 {
+	uint64_t	st_dev;
+	uint64_t	st_ino;
+	uint64_t	st_nlink;	/* yes, the order really is */
+	uint32_t	st_mode;	/* different for these two */
+	uint32_t	st_uid;
+	uint32_t	st_gid;
+	uint32_t	st_pad0;
+	uint64_t	st_rdev;
+	int64_t		st_size;
+	int64_t		st_blksize;
+	int64_t		st_blocks;
+	lx_timespec64_t	st_atime;
+	lx_timespec64_t	st_mtime;
+	lx_timespec64_t	st_ctime;
+	int64_t		st_unused[3];
+};
+#endif /* defined(_LP64) */
+
+typedef enum lx_stat_fmt {
+	LXF_STAT32,
+	LXF_STAT64_32,
+	LXF_STAT64_64
+} lx_stat_fmt_t;
+
+static void
+lx_stat_xlate_dev(vattr_t *vattr)
+{
+	lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone);
+	dev_t dev = vattr->va_fsid;
+	lx_virt_disk_t *vd;
+
+	/* Substitute emulated major/minor on mounted datasets */
+	vd = list_head(lxzd->lxzd_vdisks);
+	while (vd != NULL) {
+		if (vd->lxvd_real_dev == dev) {
+			dev = vd->lxvd_emul_dev;
+			break;
+		}
+		vd = list_next(lxzd->lxzd_vdisks, vd);
+	}
+
+	/* Mangle st_dev into expected format */
+	vattr->va_fsid = LX_MAKEDEVICE(getmajor(dev), getminor(dev));
+}
+
+static long
+lx_stat_common(vnode_t *vp, cred_t *cr, void *outp, lx_stat_fmt_t fmt)
+{
+	vattr_t vattr;
+	mode_t mode;
+	int error;
+
+	vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+	if ((error = VOP_GETATTR(vp, &vattr, 0, cr, NULL)) != 0) {
+		return (error);
+	}
+
+	mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+	if ((mode & S_IFMT) == S_IFBLK) {
+		/* Linux seems to report a 0 st_size for all block devices */
+		vattr.va_size = 0;
+	}
+	if (vattr.va_rdev == NODEV) {
+		/* Linux leaves st_rdev zeroed when it is absent */
+		vattr.va_rdev = 0;
+	}
+
+	lx_stat_xlate_dev(&vattr);
+
+	if (fmt == LXF_STAT32) {
+		struct lx_stat32 sb;
+
+		if (vattr.va_fsid > USHRT_MAX || vattr.va_rdev > USHRT_MAX ||
+		    vattr.va_nlink > USHRT_MAX || vattr.va_size > INT_MAX) {
+			return (EOVERFLOW);
+		}
+
+		bzero(&sb, sizeof (sb));
+		sb.st_dev = vattr.va_fsid;
+		sb.st_ino = vattr.va_nodeid;
+		sb.st_mode = mode;
+		sb.st_nlink = vattr.va_nlink;
+		sb.st_uid = LX_UID32_TO_UID16(vattr.va_uid);
+		sb.st_gid = LX_GID32_TO_GID16(vattr.va_gid);
+		sb.st_rdev = vattr.va_rdev;
+		sb.st_size = vattr.va_size;
+		sb.st_blksize = vattr.va_blksize;
+		sb.st_blocks = vattr.va_nblocks;
+		sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+		sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+		sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+		sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+		sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+		sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+		if (copyout(&sb, outp, sizeof (sb)) != 0) {
+			return (EFAULT);
+		}
+		return (0);
+	} else if (fmt == LXF_STAT64_32) {
+		struct lx_stat64_32 sb;
+
+		bzero(&sb, sizeof (sb));
+		sb.st_dev = vattr.va_fsid;
+		sb.st_ino = vattr.va_nodeid;
+		sb.st_small_ino = (vattr.va_nodeid & UINT_MAX);
+		sb.st_mode = mode;
+		sb.st_nlink = vattr.va_nlink;
+		sb.st_uid = vattr.va_uid;
+		sb.st_gid = vattr.va_gid;
+		sb.st_rdev = vattr.va_rdev;
+		sb.st_size = vattr.va_size;
+		sb.st_blksize = vattr.va_blksize;
+		sb.st_blocks = vattr.va_nblocks;
+		sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+		sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+		sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+		sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+		sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+		sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+		if (copyout(&sb, outp, sizeof (sb)) != 0) {
+			return (EFAULT);
+		}
+		return (0);
+	} else if (fmt == LXF_STAT64_64) {
+#if defined(_LP64)
+		struct lx_stat64_64 sb;
+
+		bzero(&sb, sizeof (sb));
+		sb.st_dev = vattr.va_fsid;
+		sb.st_ino = vattr.va_nodeid;
+		sb.st_mode = mode;
+		sb.st_nlink = vattr.va_nlink;
+		sb.st_uid = vattr.va_uid;
+		sb.st_gid = vattr.va_gid;
+		sb.st_rdev = vattr.va_rdev;
+		sb.st_size = vattr.va_size;
+		sb.st_blksize = vattr.va_blksize;
+		sb.st_blocks = vattr.va_nblocks;
+		sb.st_atime.ts_sec = vattr.va_atime.tv_sec;
+		sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec;
+		sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec;
+		sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec;
+		sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec;
+		sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec;
+		if (copyout(&sb, outp, sizeof (sb)) != 0) {
+			return (EFAULT);
+		}
+		return (0);
+#else
+		/* Invalid output format on 32-bit */
+		VERIFY(0);
+#endif
+	}
+
+	/* Invalid output format */
+	VERIFY(0);
+	return (0);
+}
+
+long
+lx_stat32(char *name, void *outp)
+{
+	vnode_t *vp = NULL;
+	cred_t *cr = NULL;
+	int error;
+
+	if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) {
+		return (set_errno(error));
+	}
+	error = lx_stat_common(vp, cr, outp, LXF_STAT32);
+	VN_RELE(vp);
+	crfree(cr);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fstat32(int fd, void *outp)
+{
+	file_t *fp;
+	int error;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, LXF_STAT32);
+	releasef(fd);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_lstat32(char *name, void *outp)
+{
+	vnode_t *vp = NULL;
+	cred_t *cr = NULL;
+	int error;
+
+	if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) {
+		return (set_errno(error));
+	}
+	error = lx_stat_common(vp, cr, outp, LXF_STAT32);
+	VN_RELE(vp);
+	crfree(cr);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_stat64(char *name, void *outp)
+{
+	vnode_t *vp = NULL;
+	cred_t *cr = NULL;
+	model_t model = get_udatamodel();
+	int error;
+
+	if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) {
+		return (set_errno(error));
+	}
+	error = lx_stat_common(vp, cr, outp,
+	    (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32);
+	VN_RELE(vp);
+	crfree(cr);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fstat64(int fd, void *outp)
+{
+	file_t *fp;
+	model_t model = get_udatamodel();
+	int error;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	error = lx_stat_common(fp->f_vnode, fp->f_cred, outp,
+	    (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32);
+	releasef(fd);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+#define	LX_FSTATAT_ALLOWED	(LX_AT_SYMLINK_NOFOLLOW | LX_AT_EMPTY_PATH | \
+    LX_AT_NO_AUTOMOUNT)
+
+long
+lx_fstatat64(int fd, char *name, void *outp, int flag)
+{
+	vnode_t *vp = NULL;
+	cred_t *cr = NULL;
+	model_t model = get_udatamodel();
+	enum symfollow follow = FOLLOW;
+	int error;
+	char c;
+
+	if (fd == LX_AT_FDCWD) {
+		fd = AT_FDCWD;
+	}
+	if ((flag & ~LX_FSTATAT_ALLOWED) != 0) {
+		return (set_errno(EINVAL));
+	}
+	if ((flag & LX_AT_NO_AUTOMOUNT) != 0) {
+		/*
+		 * While AT_NO_AUTOMOUNT is a legal flag for fstatat64, it is
+		 * not yet supported by lx_autofs.
+		 */
+		lx_unsupported("fstatat(AT_NO_AUTOMOUNT)");
+		return (set_errno(EINVAL));
+	}
+	if ((flag & LX_AT_SYMLINK_NOFOLLOW) != 0) {
+		follow = NO_FOLLOW;
+	}
+
+	if (copyin(name, &c, sizeof (c)) != 0) {
+		return (set_errno(EFAULT));
+	}
+	if (c == '\0') {
+		if ((flag & LX_AT_EMPTY_PATH) == 0) {
+			return (set_errno(ENOENT));
+		}
+
+		/*
+		 * When AT_EMPTY_PATH is set and an empty string has been
+		 * passed for the name parameter, direct the lookup against the
+		 * vnode for that fd.
+		 */
+		if (fd == AT_FDCWD) {
+			vp = PTOU(curproc)->u_cdir;
+			VN_HOLD(vp);
+			cr = CRED();
+			crhold(cr);
+		} else {
+			file_t *fp;
+
+			if ((fp = getf(fd)) == NULL) {
+				return (set_errno(EBADF));
+			}
+			vp = fp->f_vnode;
+			VN_HOLD(vp);
+			cr = fp->f_cred;
+			crhold(cr);
+			releasef(fd);
+		}
+	} else {
+		if ((error = cstatat_getvp(fd, name, follow, &vp, &cr)) != 0) {
+			return (set_errno(error));
+		}
+	}
+
+	error = lx_stat_common(vp, cr, outp,
+	    (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32);
+	VN_RELE(vp);
+	crfree(cr);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_lstat64(char *name, void *outp)
+{
+	vnode_t *vp = NULL;
+	cred_t *cr = NULL;
+	model_t model = get_udatamodel();
+	int error;
+
+	if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) {
+		return (set_errno(error));
+	}
+	error = lx_stat_common(vp, cr, outp,
+	    (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32);
+	VN_RELE(vp);
+	crfree(cr);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sync.c b/usr/src/uts/common/brand/lx/syscall/lx_sync.c
new file mode 100644
index 0000000000..614afca0b0
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sync.c
@@ -0,0 +1,86 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+
+long
+lx_syncfs(int fd)
+{
+	file_t *fp;
+	vfs_t *vfsp;
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	/* Sync while the file is held so its vnode and vfs stay referenced. */
+	vfsp = fp->f_vnode->v_vfsp;
+	(void) (vfsp->vfs_op->vfs_sync)(vfsp, 0, CRED());
+	releasef(fd);
+
+	return (0);
+}
+
+#define	LX_SYNC_FILE_RANGE_WAIT_BEFORE	0x1
+#define	LX_SYNC_FILE_RANGE_WRITE	0x2
+#define	LX_SYNC_FILE_RANGE_WAIT_AFTER	0x4
+
+#define	LX_SYNC_FILE_RANGE_VALID	(LX_SYNC_FILE_RANGE_WAIT_BEFORE | \
+	LX_SYNC_FILE_RANGE_WRITE | LX_SYNC_FILE_RANGE_WAIT_AFTER)
+
+
+long
+lx_sync_file_range(int fd, off_t offset, off_t nbytes, int flags)
+{
+	const int waitmask =
+	    LX_SYNC_FILE_RANGE_WAIT_BEFORE | LX_SYNC_FILE_RANGE_WAIT_AFTER;
+	file_t *filep;
+	int err, pflags;
+
+	/* Reject unknown flag bits and negative ranges before touching fd. */
+	if ((flags & ~LX_SYNC_FILE_RANGE_VALID) != 0 ||
+	    offset < 0 || nbytes < 0) {
+		return (set_errno(EINVAL));
+	}
+
+	filep = getf(fd);
+	if (filep == NULL)
+		return (set_errno(EBADF));
+
+	/*
+	 * The work is done by VOP_PUTPAGE, so either WAIT flag selects a
+	 * synchronous flush and only their complete absence yields B_ASYNC.
+	 * Linux treats BEFORE and AFTER as distinct, but this is adequately
+	 * safe: the requested data is synced out by the end of the call.
+	 */
+	pflags = ((flags & waitmask) != 0) ? 0 : B_ASYNC;
+
+	err = VOP_PUTPAGE(filep->f_vnode, offset, nbytes, pflags, CRED(),
+	    NULL);
+	/* NOTE(review): ENOSYS presumably means no putpage support. */
+	if (err == ENOSYS)
+		err = ESPIPE;
+
+	releasef(fd);
+	if (err != 0)
+		return (set_errno(err));
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
new file mode 100644
index 0000000000..449d5882d4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c
@@ -0,0 +1,218 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <vm/anon.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/zone.h>
+#include <sys/time.h>
+
+typedef struct lx_sysinfo {
+	int64_t si_uptime;	/* Seconds since boot */
+	uint64_t si_loads[3];	/* 1, 5, and 15 minute avg runq length */
+	uint64_t si_totalram;	/* Total memory size */
+	uint64_t si_freeram;	/* Available memory */
+	uint64_t si_sharedram;	/* Shared memory */
+	uint64_t si_bufferram;	/* Buffer memory */
+	uint64_t si_totalswap;	/* Total swap space */
+	uint64_t si_freeswap;	/* Avail swap space */
+	uint16_t si_procs;	/* Process count */
+	uint16_t si_pad;	/* Padding */
+	uint64_t si_totalhigh;	/* High memory size */
+	uint64_t si_freehigh;	/* Avail high memory */
+	uint32_t si_mem_unit;	/* Unit size of memory fields */
+} lx_sysinfo_t;
+
+#if defined(_SYSCALL32_IMPL)
+/*
+ * 64-bit kernel view of the 32-bit usermode struct.
+ */
+#pragma pack(4)
+typedef struct lx_sysinfo32 {
+	int32_t si_uptime;	/* Seconds since boot */
+	uint32_t si_loads[3];	/* 1, 5, and 15 minute avg runq length */
+	uint32_t si_totalram;	/* Total memory size */
+	uint32_t si_freeram;	/* Available memory */
+	uint32_t si_sharedram;	/* Shared memory */
+	uint32_t si_bufferram;	/* Buffer memory */
+	uint32_t si_totalswap;	/* Total swap space */
+	uint32_t si_freeswap;	/* Avail swap space */
+	uint16_t si_procs;	/* Process count */
+	uint16_t si_pad;	/* Padding */
+	uint32_t si_totalhigh;	/* High memory size */
+	uint32_t si_freehigh;	/* Avail high memory */
+	uint32_t si_mem_unit;	/* Unit size of memory fields */
+	char __si_pad[8];
+} lx_sysinfo32_t;
+#pragma pack()
+#endif
+
+extern pgcnt_t swapfs_minfree;
+
+/* Fill in *si with zone-relative uptime, load, memory and swap data. */
+static void
+lx_sysinfo_common(lx_sysinfo_t *si)
+{
+	zone_t *zone = curthread->t_procp->p_zone;
+	uint64_t zphysmem, zfreemem, ztotswap, zfreeswap;
+
+	si->si_uptime = gethrestime_sec() - zone->zone_boot_time;
+
+	si->si_loads[0] = zone->zone_hp_avenrun[0];
+	si->si_loads[1] = zone->zone_hp_avenrun[1];
+	si->si_loads[2] = zone->zone_hp_avenrun[2];
+
+	/* Linux treats each thread as a process, so report the LWP count. */
+	si->si_procs = (int32_t)zone->zone_nlwps;
+
+	/*
+	 * If memory or swap limits are set on the zone, use those, otherwise
+	 * use the system values. physmem and freemem are in pages, but the
+	 * zone values are in bytes. Likewise, ani_max and ani_free are in
+	 * pages.
+	 */
+	if (zone->zone_phys_mem_ctl == UINT64_MAX) {
+		zphysmem = physmem;
+		zfreemem = freemem;
+	} else {
+		uint64_t cap = zone->zone_phys_mem_ctl;
+		uint64_t rss = zone->zone_phys_mem;
+
+		/* Clamp at zero: usage above the cap must not wrap around. */
+		zphysmem = btop(cap);
+		zfreemem = (rss < cap) ? btop(cap - rss) : 0;
+	}
+
+	if (zone->zone_max_swap_ctl == UINT64_MAX) {
+		ztotswap = k_anoninfo.ani_max;
+		zfreeswap = k_anoninfo.ani_free;
+	} else {
+		/*
+		 * See the comment in swapctl for a description of how free is
+		 * calculated within a zone.
+		 */
+		rctl_qty_t used;
+		spgcnt_t avail;
+		uint64_t max;
+
+		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
+		max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail;
+
+		mutex_enter(&zone->zone_mem_lock);
+		ztotswap = btop(zone->zone_max_swap_ctl);
+		used = btop(zone->zone_max_swap);
+		mutex_exit(&zone->zone_mem_lock);
+
+		/* Likewise, never let the unsigned free count underflow. */
+		max = MIN(ztotswap, max);
+		zfreeswap = (used < max) ? max - used : 0;
+	}
+
+	/*
+	 * If the maximum memory stat is less than 2^20 pages (i.e. 4GB),
+	 * then we report the result in bytes.  Otherwise we use pages.
+	 * Once we start supporting >1TB systems/zones, we'll need a third
+	 * option.
+	 */
+	if (MAX(zphysmem, ztotswap) < 1024 * 1024) {
+		si->si_totalram = ptob(zphysmem);
+		si->si_freeram = ptob(zfreemem);
+		si->si_totalswap = ptob(ztotswap);
+		si->si_freeswap = ptob(zfreeswap);
+		si->si_mem_unit = 1;
+	} else {
+		si->si_totalram = zphysmem;
+		si->si_freeram = zfreemem;
+		si->si_totalswap = ztotswap;
+		si->si_freeswap = zfreeswap;
+		si->si_mem_unit = PAGESIZE;
+	}
+	si->si_bufferram = 0;
+	si->si_sharedram = 0;
+
+	/* High memory is meaningless inside a Linux zone; report none. */
+	si->si_totalhigh = 0;
+	si->si_freehigh = 0;
+}
+
+long
+lx_sysinfo64(caddr_t sip)
+{
+	lx_sysinfo_t info;
+
+	/* Zero first so struct padding never leaks stack bytes. */
+	bzero(&info, sizeof (info));
+	lx_sysinfo_common(&info);
+
+	if (copyout(&info, sip, sizeof (info)) == 0)
+		return (0);
+
+	return (set_errno(EFAULT));
+}
+
+#if defined(_SYSCALL32_IMPL)
+long
+lx_sysinfo32(caddr_t sip)
+{
+	lx_sysinfo_t wide;
+	lx_sysinfo32_t narrow;
+	int idx;
+
+	lx_sysinfo_common(&wide);
+
+	/*
+	 * Narrow the native result into the 32-bit layout, starting from a
+	 * zeroed struct so that padding is never copied out uninitialized.
+	 */
+	bzero(&narrow, sizeof (narrow));
+	narrow.si_uptime = wide.si_uptime;
+	narrow.si_procs = wide.si_procs;
+	narrow.si_mem_unit = wide.si_mem_unit;
+
+	/* Load averages saturate at INT32_MAX rather than truncating. */
+	for (idx = 0; idx < 3; idx++) {
+		narrow.si_loads[idx] = (wide.si_loads[idx] > 0x7fffffff) ?
+		    0x7fffffff : wide.si_loads[idx];
+	}
+
+	narrow.si_totalram = wide.si_totalram;
+	narrow.si_freeram = wide.si_freeram;
+	narrow.si_sharedram = wide.si_sharedram;
+	narrow.si_bufferram = wide.si_bufferram;
+
+	narrow.si_totalswap = wide.si_totalswap;
+	narrow.si_freeswap = wide.si_freeswap;
+
+	narrow.si_totalhigh = wide.si_totalhigh;
+	narrow.si_freehigh = wide.si_freehigh;
+
+	if (copyout(&narrow, sip, sizeof (narrow)) != 0) {
+		return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
+#endif
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
new file mode 100644
index 0000000000..48d91b09cc
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c
@@ -0,0 +1,196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cpuvar.h>
+#include <sys/archsystm.h>
+#include <sys/proc.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ldt.h>
+#include <sys/lx_misc.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+#include <lx_syscall.h>
+
+long
+lx_arch_prctl(int code, ulong_t addr)
+{
+#if defined(__amd64)
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *llwp = lwptolxlwp(lwp);
+	pcb_t *pcb = &lwp->lwp_pcb;
+
+	switch (code) {
+	case LX_ARCH_GET_FS:
+		if (copyout(&llwp->br_lx_fsbase, (void *)addr,
+		    sizeof (llwp->br_lx_fsbase)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		break;
+
+	case LX_ARCH_SET_FS:
+		/*
+		 * NOTE(review): addr is stored without a range check (the same
+		 * holds for LX_ARCH_SET_GS); Linux fails bad bases with EPERM.
+		 */
+		llwp->br_lx_fsbase = addr;
+
+		kpreempt_disable();
+		if (pcb->pcb_fsbase != llwp->br_lx_fsbase) {
+			pcb->pcb_fsbase = llwp->br_lx_fsbase;
+
+			/* Ensure we go out via update_sregs. */
+			pcb->pcb_rupdate = 1;
+		}
+		kpreempt_enable();
+		break;
+
+	case LX_ARCH_GET_GS:
+		if (copyout(&llwp->br_lx_gsbase, (void *)addr,
+		    sizeof (llwp->br_lx_gsbase)) != 0) {
+			return (set_errno(EFAULT));
+		}
+		break;
+
+	case LX_ARCH_SET_GS:
+		llwp->br_lx_gsbase = addr;
+
+		kpreempt_disable();
+		if (pcb->pcb_gsbase != llwp->br_lx_gsbase) {
+			pcb->pcb_gsbase = llwp->br_lx_gsbase;
+
+			/* Ensure we go out via update_sregs. */
+			pcb->pcb_rupdate = 1;
+		}
+		kpreempt_enable();
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+#endif
+
+	return (0);
+}
+
+/* Copy the calling LWP's TLS descriptor for inf->entry_number out to inf. */
+long
+lx_get_thread_area(struct ldt_info *inf)
+{
+	struct lx_lwp_data *jlwp = ttolxlwp(curthread);
+	struct ldt_info ldt_inf;
+	user_desc_t *dscrp;
+	int entry;
+
+	if (fuword32(&inf->entry_number, (uint32_t *)&entry))
+		return (set_errno(EFAULT));
+
+	/* Only slots within the TLS range of the GDT may be queried. */
+	if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+		return (set_errno(EINVAL));
+
+	dscrp = jlwp->br_tls + entry - GDT_TLSMIN;
+
+	/* Zero first so no unset bits of the reply reach userland. */
+	bzero(&ldt_inf, sizeof (ldt_inf));
+	DESC_TO_LDT_INFO(dscrp, &ldt_inf);
+	ldt_inf.entry_number = entry;
+
+	if (copyout(&ldt_inf, inf, sizeof (struct ldt_info)))
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+long
+lx_set_thread_area(struct ldt_info *inf)
+{
+	struct lx_lwp_data *jlwp = ttolxlwp(curthread);
+	struct ldt_info ldt_inf;
+	user_desc_t *dscrp;
+	int entry;
+	int i;
+
+	/* The word-wise casts below rely on user_desc_t being 8 bytes. */
+	ASSERT(sizeof (user_desc_t) == 8);
+
+	if (copyin(inf, &ldt_inf, sizeof (ldt_inf)))
+		return (set_errno(EFAULT));
+
+	entry = ldt_inf.entry_number;
+	if (entry == -1) {
+		/*
+		 * An entry of -1 asks us to choose: scan this thread's TLS
+		 * array for a descriptor whose two 32-bit words are both 0.
+		 */
+		for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) {
+			if (((uint_t *)dscrp)[0] == 0 &&
+			    ((uint_t *)dscrp)[1] == 0)
+				break;
+		}
+
+		if (i < LX_TLSNUM) {
+			/*
+			 * Found one: report the chosen slot back through inf.
+			 */
+			entry = i + GDT_TLSMIN;
+			if (suword32(&inf->entry_number, entry))
+				return (set_errno(EFAULT));
+		} else {
+			return (set_errno(ESRCH));
+		}
+	}
+
+	if (entry < GDT_TLSMIN || entry > GDT_TLSMAX)
+		return (set_errno(EINVAL));
+
+	/*
+	 * An empty ldt_info clears the slot; otherwise build the descriptor.
+	 */
+	dscrp = jlwp->br_tls + entry - GDT_TLSMIN;
+
+	if (LDT_INFO_EMPTY(&ldt_inf)) {
+		((uint_t *)dscrp)[0] = 0;
+		((uint_t *)dscrp)[1] = 0;
+	} else {
+		LDT_INFO_TO_DESC(&ldt_inf, dscrp);
+	}
+
+	/*
+	 * With preemption disabled, load every TLS slot into the GDT.
+	 */
+	kpreempt_disable();
+
+	for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++)
+		lx_set_gdt(GDT_TLSMIN + i, dscrp);
+
+	kpreempt_enable();
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
new file mode 100644
index 0000000000..c2fb4a4c7d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
@@ -0,0 +1,379 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * The illumos kernel provides two clock backends: CLOCK_REALTIME, the
+ * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically
+ * increasing time source that is not subject to drift or adjustment.  By
+ * contrast, the Linux kernel is furnished with an overabundance of narrowly
+ * differentiated clock types.
+ *
+ * Fortunately, most of the commonly used Linux clock types are either similar
+ * enough to the native clock backends that they can be directly mapped, or
+ * represent queries to the per-process and per-LWP microstate counters.
+ *
+ * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into
+ * account time that the system is suspended. Since that is uninteresting to
+ * us, we treat it the same.
+ */
+
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+
+/*
+ * From "uts/common/os/timer.c":
+ */
+extern int clock_settime(clockid_t, timespec_t *);
+extern int clock_gettime(clockid_t, timespec_t *);
+extern int clock_getres(clockid_t, timespec_t *);
+extern int nanosleep(timespec_t *, timespec_t *);
+
+
+static int lx_emul_clock_getres(clockid_t, timespec_t *);
+static int lx_emul_clock_gettime(clockid_t, timespec_t *);
+static int lx_emul_clock_settime(clockid_t, timespec_t *);
+
+typedef struct lx_clock_backend {
+	clockid_t lclk_ntv_id;
+	int (*lclk_clock_getres)(clockid_t, timespec_t *);
+	int (*lclk_clock_gettime)(clockid_t, timespec_t *);
+	int (*lclk_clock_settime)(clockid_t, timespec_t *);
+} lx_clock_backend_t;
+
+/*
+ * NOTE: The Linux man pages state this structure is obsolete and is
+ * unsupported, so it is declared here for sizing purposes only.
+ */
+struct lx_timezone {
+	int tz_minuteswest;	/* minutes W of Greenwich */
+	int tz_dsttime;		/* type of dst correction */
+};
+
+/*
+ * Use the native clock_* system call implementation, but with a translated
+ * clock identifier:
+ */
+#define	NATIVE(ntv_id)							\
+	{ ntv_id, clock_getres, clock_gettime, clock_settime }
+
+/*
+ * This backend is not supported, so we provide an emulation handler:
+ */
+#define	EMUL(ntv_id)							\
+	{ ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime,		\
+	    lx_emul_clock_settime }
+
+static lx_clock_backend_t lx_clock_backends[] = {
+	NATIVE(CLOCK_REALTIME),		/* LX_CLOCK_REALTIME */
+	NATIVE(CLOCK_HIGHRES),		/* LX_CLOCK_MONOTONIC */
+	EMUL(CLOCK_PROCESS_CPUTIME_ID),	/* LX_CLOCK_PROCESS_CPUTIME_ID */
+	EMUL(CLOCK_THREAD_CPUTIME_ID),	/* LX_CLOCK_THREAD_CPUTIME_ID */
+	NATIVE(CLOCK_HIGHRES),		/* LX_CLOCK_MONOTONIC_RAW */
+	NATIVE(CLOCK_REALTIME),		/* LX_CLOCK_REALTIME_COARSE */
+	NATIVE(CLOCK_HIGHRES),		/* LX_CLOCK_MONOTONIC_COARSE */
+	NATIVE(CLOCK_HIGHRES)		/* LX_CLOCK_BOOTTIME */
+};
+
+#define	LX_CLOCK_MAX \
+	(sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0]))
+#define	LX_CLOCK_BACKEND(clk) \
+	((clk) < LX_CLOCK_MAX && (clk) >= 0 ? &lx_clock_backends[(clk)] : NULL)
+
+static int
+lx_emul_clock_settime(clockid_t clock, timespec_t *tp)
+{
+	return (set_errno(EINVAL));	/* emulated clocks are read-only */
+}
+
+static int
+lx_emul_clock_gettime(clockid_t clock, timespec_t *tp)
+{
+	hrtime_t ucpu, scpu;
+	timespec_t ts;
+
+	switch (clock) {
+	case CLOCK_PROCESS_CPUTIME_ID: {
+		proc_t *proc = ttoproc(curthread);
+
+		/*
+		 * Same aggregation as getrusage() in "rusagesys.c": sum the
+		 * user and system microstates over the process, under p_lock.
+		 */
+		mutex_enter(&proc->p_lock);
+		ucpu = mstate_aggr_state(proc, LMS_USER);
+		scpu = mstate_aggr_state(proc, LMS_SYSTEM);
+		mutex_exit(&proc->p_lock);
+		break;
+	}
+
+	case CLOCK_THREAD_CPUTIME_ID: {
+		struct mstate *acct = &ttolwp(curthread)->lwp_mstate;
+
+		/*
+		 * Same accounting as getrusage_lwp() in "rusagesys.c": trap
+		 * time is charged as system time, and each raw counter is
+		 * scaled on its own before the two are summed.
+		 */
+		ucpu = acct->ms_acct[LMS_USER];
+		scpu = acct->ms_acct[LMS_SYSTEM] + acct->ms_acct[LMS_TRAP];
+
+		scalehrtime(&ucpu);
+		scalehrtime(&scpu);
+		break;
+	}
+
+	default:
+		/* Only the two CPU-time clocks are emulated here. */
+		return (set_errno(EINVAL));
+	}
+
+	hrt2ts(ucpu + scpu, &ts);
+
+#if defined(_SYSCALL32_IMPL)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		timespec32_t ts32;
+
+		/* A 32-bit caller gets the narrow layout, if the value fits. */
+		if (TIMESPEC_OVERFLOW(&ts)) {
+			return (set_errno(EOVERFLOW));
+		}
+		TIMESPEC_TO_TIMESPEC32(&ts32, &ts);
+
+		if (copyout(&ts32, tp, sizeof (ts32)) == 0) {
+			return (0);
+		}
+		return (set_errno(EFAULT));
+	}
+#endif
+
+	if (copyout(&ts, tp, sizeof (ts)) == 0) {
+		return (0);
+	}
+
+	return (set_errno(EFAULT));
+}
+
+static int
+lx_emul_clock_getres(clockid_t clock, timespec_t *tp)
+{
+	timespec_t res;
+
+	/* With nowhere to store the answer there is nothing to do. */
+	if (tp == NULL) {
+		return (0);
+	}
+
+	/* Only the two CPU-time clocks are handled by this backend. */
+	if (clock != CLOCK_PROCESS_CPUTIME_ID &&
+	    clock != CLOCK_THREAD_CPUTIME_ID) {
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Both emulated clocks are fed by microstate accounting, for the
+	 * whole process or for one LWP.  Linux advertises nanosecond
+	 * resolution for them, and we report the same.
+	 */
+	res.tv_sec = 0;
+	res.tv_nsec = 1;
+
+#if defined(_SYSCALL32_IMPL)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		timespec32_t res32;
+
+		/* 32-bit callers receive the narrow timespec layout. */
+		if (TIMESPEC_OVERFLOW(&res)) {
+			return (set_errno(EOVERFLOW));
+		}
+		TIMESPEC_TO_TIMESPEC32(&res32, &res);
+
+		if (copyout(&res32, tp, sizeof (res32)) == 0) {
+			return (0);
+		}
+
+		return (set_errno(EFAULT));
+	}
+#endif
+
+	if (copyout(&res, tp, sizeof (res)) == 0) {
+		return (0);
+	}
+
+	return (set_errno(EFAULT));
+}
+
+static void
+lx_clock_unsupported(int clock)
+{
+	char msg[100];	/* ample for the fixed text plus any int value */
+
+	(void) snprintf(msg, sizeof (msg), "unsupported clock: %d", clock);
+	lx_unsupported(msg);
+}
+
+long
+lx_clock_settime(int clock, timespec_t *tp)
+{
+	lx_clock_backend_t *be = LX_CLOCK_BACKEND(clock);
+
+	/* Dispatch to the backend for this Linux clock, native id applied. */
+	if (be != NULL)
+		return (be->lclk_clock_settime(be->lclk_ntv_id, tp));
+
+	lx_clock_unsupported(clock);
+	return (set_errno(EINVAL));
+}
+
+long
+lx_clock_gettime(int clock, timespec_t *tp)
+{
+	lx_clock_backend_t *be = LX_CLOCK_BACKEND(clock);
+
+	/* Dispatch to the backend for this Linux clock, native id applied. */
+	if (be != NULL)
+		return (be->lclk_clock_gettime(be->lclk_ntv_id, tp));
+
+	lx_clock_unsupported(clock);
+	return (set_errno(EINVAL));
+}
+
+long
+lx_clock_getres(int clock, timespec_t *tp)
+{
+	lx_clock_backend_t *backend;
+
+	/* Validate the clock first: a NULL tp must not mask a bad clock id. */
+	if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
+		lx_clock_unsupported(clock);
+		return (set_errno(EINVAL));
+	}
+	if (tp == NULL) {
+		return (0);
+	}
+
+	return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp));
+}
+
+
+long
+lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp)
+{
+	struct lx_timezone tz;
+
+	bzero(&tz, sizeof (tz));
+
+	/*
+	 * Mirror libc, which fetches gethrestime via a fasttrap and converts
+	 * the result.  The shift-and-add sequence below is the nsec-to-usec
+	 * conversion used by uniqtime (presumably a division-free nsec / 1000
+	 * -- verify); uniqtime itself is avoided because its extra work can
+	 * make the result bounce around depending on the CPU we run on.
+	 */
+	if (tvp != NULL) {
+		struct timeval tv;
+		timestruc_t ts;
+		int usec, nsec;
+
+		gethrestime(&ts);
+		nsec = ts.tv_nsec;
+		usec = nsec + (nsec >> 2);
+		usec = nsec + (usec >> 1);
+		usec = nsec + (usec >> 2);
+		usec = nsec + (usec >> 4);
+		usec = nsec - (usec >> 3);
+		usec = nsec + (usec >> 2);
+		usec = nsec + (usec >> 3);
+		usec = nsec + (usec >> 4);
+		usec = nsec + (usec >> 1);
+		usec = nsec + (usec >> 6);
+		usec = usec >> 10;
+
+		tv.tv_sec = ts.tv_sec;
+		tv.tv_usec = usec;
+
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			if (copyout(&tv, tvp, sizeof (tv)) != 0)
+				return (set_errno(EFAULT));
+		}
+#ifdef _SYSCALL32_IMPL
+		else {
+			struct timeval32 tv32;
+
+			if (TIMEVAL_OVERFLOW(&tv))
+				return (set_errno(EOVERFLOW));
+			TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
+
+			if (copyout(&tv32, tvp, sizeof (tv32)))
+				return (set_errno(EFAULT));
+		}
+#endif
+	}
+
+	/*
+	 * Linux documents the timezone argument as obsolete, but a bad
+	 * non-NULL pointer must still fail with EFAULT (sigh...), so copy
+	 * out the zero-filled structure whenever one is supplied.
+	 */
+	if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure
+ * mode is documented as "undefined."
+ */
+long
+lx_time(time_t *tp)
+{
+	timestruc_t now;
+	struct timeval tv;
+
+	/* time(2) reports whole seconds, so the microseconds stay zero. */
+	gethrestime(&now);
+	tv.tv_usec = 0;
+	tv.tv_sec = now.tv_sec;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (tp == NULL)
+			return (tv.tv_sec);
+		if (copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0)
+			return (set_errno(EFAULT));
+		return (tv.tv_sec);
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		struct timeval32 tv32;
+
+		if (TIMEVAL_OVERFLOW(&tv))
+			return (set_errno(EOVERFLOW));
+		TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
+		if (tp == NULL)
+			return (tv32.tv_sec);
+		if (copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec)))
+			return (set_errno(EFAULT));
+		return (tv32.tv_sec);
+	}
+#endif
+}
+
+long
+lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp)
+{
+	return (nanosleep(rqtp, rmtp));	/* defer to the native syscall */
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_uname.c b/usr/src/uts/common/brand/lx/syscall/lx_uname.c
new file mode 100644
index 0000000000..2d18408eaa
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_uname.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+
+struct lx_utsname {
+	char lxu_sysname[LX_SYS_UTS_LN];
+	char lxu_nodename[LX_SYS_UTS_LN];
+	char lxu_release[LX_SYS_UTS_LN];
+	char lxu_version[LX_SYS_UTS_LN];
+	char lxu_machine[LX_SYS_UTS_LN];
+	char lxu_domainname[LX_SYS_UTS_LN];
+};
+
+long
+lx_uname(void *uptr)
+{
+	proc_t *proc = curproc;
+	zone_t *zone = proc->p_zone;
+	lx_proc_data_t *pd = ptolxproc(proc);
+	lx_zone_data_t *zd = ztolxzd(zone);
+	const char *rel, *ver, *machine;
+	struct lx_utsname uts;
+
+	/* Start zeroed so the unused tail of every field is well defined. */
+	bzero(&uts, sizeof (uts));
+
+	/* Fixed system name, plus node and domain names from the zone. */
+	(void) strlcpy(uts.lxu_sysname, LX_UNAME_SYSNAME, LX_SYS_UTS_LN);
+	(void) strlcpy(uts.lxu_nodename, zone->zone_nodename, LX_SYS_UTS_LN);
+	(void) strlcpy(uts.lxu_domainname, zone->zone_domain, LX_SYS_UTS_LN);
+
+	/*
+	 * A per-process release or version override, when set, takes
+	 * precedence over the zone-wide value.  Both are read under the
+	 * zone's lxzd_lock.
+	 */
+	mutex_enter(&zd->lxzd_lock);
+
+	rel = pd->l_uname_release;
+	if (rel[0] == '\0') {
+		rel = zd->lxzd_kernel_release;
+	}
+	(void) strlcpy(uts.lxu_release, rel, LX_SYS_UTS_LN);
+
+	ver = pd->l_uname_version;
+	if (ver[0] == '\0') {
+		ver = zd->lxzd_kernel_version;
+	}
+	(void) strlcpy(uts.lxu_version, ver, LX_SYS_UTS_LN);
+
+	mutex_exit(&zd->lxzd_lock);
+
+	/* The machine string follows the data model of the caller. */
+	machine = (get_udatamodel() == DATAMODEL_LP64) ?
+	    LX_UNAME_MACHINE64 : LX_UNAME_MACHINE32;
+	(void) strlcpy(uts.lxu_machine, machine, LX_SYS_UTS_LN);
+
+	if (copyout(&uts, uptr, sizeof (uts)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
new file mode 100644
index 0000000000..e8358f9f69
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c
@@ -0,0 +1,377 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * wait() family of functions.
+ *
+ * The first minor difference between the Linux and Solaris family of wait()
+ * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully,
+ * the exit status values are identical between the two implementations.
+ *
+ * Things get very different and very complicated when we introduce the Linux
+ * threading model.  Under linux, both threads and child processes are
+ * represented as processes.  However, the behavior of wait() with respect to
+ * each child varies according to the flags given to clone()
+ *
+ *	SIGCHLD 	The SIGCHLD signal should be sent on termination
+ *	CLONE_THREAD	The child shares the same thread group as the parent
+ *	CLONE_DETACHED	The parent receives no notification when the child exits
+ *
+ * The following flags control the Linux behavior w.r.t. the above attributes:
+ *
+ * 	__WALL		Wait on all children, regardless of type
+ * 	__WCLONE	Wait only on non-SIGCHLD children
+ * 	__WNOTHREAD	Don't wait on children of other threads in this group
+ *
+ * The following chart shows whether wait() returns when the child exits:
+ *
+ *                           default    __WCLONE    __WALL
+ *           no SIGCHLD		-	    X	      X
+ *              SIGCHLD		X	    -	      X
+ *
+ * The following chart shows whether wait() returns when the grandchild exits:
+ *
+ *                           default   __WNOTHREAD
+ * 	no CLONE_THREAD		-	    -
+ *         CLONE_THREAD		X	    -
+ *
+ * The CLONE_DETACHED flag is universal - when the child exits, no state is
+ * stored and wait() has no effect.
+ *
+ * XXX Support the above combination of options, or some reasonable subset that
+ *     covers at least fork() and pthread_create().
+ */
+
+#include <sys/wait.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+#include <lx_errno.h>
+#include <lx_syscall.h>
+
+/*
+ * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c":
+ */
+extern int waitid(idtype_t, id_t, k_siginfo_t *, int);
+extern int rusagesys(int, void *, void *, void *, void *);
+
+/*
+ * Convert between Linux options and Solaris options, returning -1 if any
+ * invalid flags are found.
+ */
+#define	LX_WNOHANG	0x00000001
+#define	LX_WUNTRACED	0x00000002
+#define	LX_WSTOPPED	LX_WUNTRACED
+#define	LX_WEXITED	0x00000004
+#define	LX_WCONTINUED	0x00000008
+#define	LX_WNOWAIT	0x01000000
+
+#define	LX_WNOTHREAD	0x20000000
+#define	LX_WALL		0x40000000
+#define	LX_WCLONE	0x80000000
+
+#define	LX_P_ALL	0x0
+#define	LX_P_PID	0x1
+#define	LX_P_GID	0x2
+
+/*
+ * Split the passed waitpid/waitid options into two separate variables:
+ * those for the native illumos waitid(2), and the extra Linux-specific
+ * options we will handle in our brand-specific code.
+ */
+static int
+ltos_options(uintptr_t options, int *native_options, int *extra_options)
+{
+	/* Linux-only bits, handed on to the brand-specific wait code. */
+	const unsigned int lxonly = LX_WNOTHREAD | LX_WALL | LX_WCLONE;
+	/* Bits that have a direct native waitid(2) counterpart. */
+	const unsigned int mapped = LX_WNOHANG | LX_WUNTRACED | LX_WEXITED |
+	    LX_WCONTINUED | LX_WNOWAIT;
+	int nopts;
+
+	/*
+	 * Any bit outside the two known sets makes the request invalid.
+	 * The mask is an unsigned int, so only the low 32 bits of options
+	 * are examined.
+	 */
+	if ((options & ~(mapped | lxonly)) != 0) {
+		return (-1);
+	}
+
+	*extra_options = options & lxonly;
+
+	/* The trapped option is implicit on Linux, so it is always set. */
+	nopts = WTRAPPED;
+	nopts |= (options & LX_WNOHANG) ? WNOHANG : 0;
+	nopts |= (options & LX_WUNTRACED) ? WUNTRACED : 0;
+	nopts |= (options & LX_WEXITED) ? WEXITED : 0;
+	nopts |= (options & LX_WCONTINUED) ? WCONTINUED : 0;
+	nopts |= (options & LX_WNOWAIT) ? WNOWAIT : 0;
+
+	*native_options = nopts;
+	return (0);
+}
+
+static int
+lx_wstat(int code, int status)
+{
+	/* Encode a native (code, status) pair as a Linux wait status word. */
+	switch (code) {
+	case CLD_EXITED:
+		/* The exit status is carried in the second byte. */
+		return (status << 8);
+	case CLD_KILLED:
+		/* Death by signal: report the lx_stol_signo() result. */
+		return (lx_stol_signo(status, SIGKILL));
+	case CLD_DUMPED:
+		/* As for CLD_KILLED, with the core-dump flag added. */
+		return (lx_stol_signo(status, SIGKILL) | WCOREFLG);
+	case CLD_TRAPPED:
+	case CLD_STOPPED:
+		/* Stops carry the converted value in the second byte. */
+		return ((lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG);
+	case CLD_CONTINUED:
+		/* Continued children get the fixed continue marker. */
+		return (WCONTFLG);
+	default:
+		/* Unrecognized codes yield an empty status. */
+		return (0);
+	}
+}
+
+static int
+lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options,
+    int extra_options)
+{
+	lx_lwp_data_t *lxlwp = ttolxlwp(curthread);
+	int rv;
+
+	/* The brand's waitid helper handles only these id types. */
+	switch (idtype) {
+	case P_ALL:
+	case P_PID:
+	case P_PGID:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * For the duration of the native call, let lx_waitid_helper report
+	 * emulated ptrace(2) stops and see the Linux-only wait flags.
+	 */
+	lxlwp->br_waitid_emulate = B_TRUE;
+	lxlwp->br_waitid_flags = extra_options;
+
+	rv = waitid(idtype, id, sip, native_options);
+
+	lxlwp->br_waitid_emulate = B_FALSE;
+	lxlwp->br_waitid_flags = 0;
+
+	/* signal(7) lists the wait family as restartable system calls. */
+	if (rv == EINTR) {
+		lxlwp->br_syscall_restart = B_TRUE;
+	}
+
+	return (rv);
+}
+
+long
+lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4)
+{
+	k_siginfo_t info = { 0 };
+	idtype_t idtype;
+	id_t id;
+	int status = 0;
+	pid_t pid = (pid_t)p1;
+	int error;
+	int native_options, extra_options;
+	int *statusp = (int *)p2;
+	void *rup = (void *)p4;
+
+	if (ltos_options(p3, &native_options, &extra_options) == -1) {
+		return (set_errno(EINVAL));
+	}
+
+	if (pid > maxpid) {
+		return (set_errno(ECHILD));
+	}
+
+	/*
+	 * While not listed as a valid return code, Linux's wait4(2) does,
+	 * in fact, get an EFAULT if either the status pointer or rusage
+	 * pointer is invalid. Since a failed waitpid should leave the child
+	 * process in a state where a future wait4(2) will succeed, we
+	 * check them by copying out the values their buffers originally
+	 * contained.  (We need to do this as a failed system call should
+	 * never affect the contents of a passed buffer.)
+	 *
+	 * This will fail if the buffers in question are write-only.
+	 */
+	if (statusp != NULL) {
+		if (copyin(statusp, &status, sizeof (status)) != 0 ||
+		    copyout(&status, statusp, sizeof (status)) != 0) {
+			return (set_errno(EFAULT));
+		}
+	}
+
+	/*
+	 * Do the same check for the "struct rusage" pointer, which differs
+	 * in size for 32- and 64-bit processes.
+	 */
+	if (rup != NULL) {
+		struct rusage ru;
+		void *krup = &ru;
+		size_t rusz = sizeof (ru);
+#if defined(_SYSCALL32_IMPL)
+		struct rusage32 ru32;
+
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			krup = &ru32;
+			rusz = sizeof (ru32);
+		}
+#endif
+
+		if (copyin(rup, krup, rusz) != 0 ||
+		    copyout(krup, rup, rusz) != 0) {
+			return (set_errno(EFAULT));
+		}
+	}
+
+	if (pid < -1) {
+		idtype = P_PGID;
+		id = -pid;
+	} else if (pid == -1) {
+		idtype = P_ALL;
+		id = 0;
+	} else if (pid == 0) {
+		idtype = P_PGID;
+		mutex_enter(&pidlock);
+		id = curproc->p_pgrp;
+		mutex_exit(&pidlock);
+	} else {
+		idtype = P_PID;
+		id = pid;
+	}
+
+	native_options |= (WEXITED | WTRAPPED);
+
+	if ((error = lx_call_waitid(idtype, id, &info, native_options,
+	    extra_options)) != 0) {
+		return (set_errno(error));
+	}
+
+	/*
+	 * If the WNOHANG flag was specified and no child was found return 0.
+	 */
+	if ((native_options & WNOHANG) && info.si_pid == 0) {
+		return (0);
+	}
+
+	status = lx_wstat(info.si_code, info.si_status);
+
+	/*
+	 * Unfortunately if this attempt to copy out either the status or the
+	 * rusage fails, the process will be in an inconsistent state as
+	 * subsequent calls to wait for the same child will fail where they
+	 * should succeed on a Linux system. This, however, is rather
+	 * unlikely since we tested the validity of both above.
+	 */
+	if (statusp != NULL) {
+		if (copyout(&status, statusp, sizeof (status)) != 0) {
+			return (set_errno(EFAULT));
+		}
+	}
+
+	if (rup != NULL) {
+		if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL,
+		    NULL, NULL)) != 0) {
+			return (set_errno(error));
+		}
+	}
+
+	return (info.si_pid);
+}
+
+long
+lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3)
+{
+	return (lx_wait4(p1, p2, p3, (uintptr_t)NULL));	/* no rusage */
+}
+
+long
+lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt)
+{
+	int error;
+	int native_options, extra_options;
+	k_siginfo_t info = { 0 };
+
+	if (ltos_options(opt, &native_options, &extra_options) == -1) {
+		return (set_errno(EINVAL));
+	}
+
+	if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) {
+		return (set_errno(EINVAL));
+	}
+
+	switch (idtype) {
+	case LX_P_ALL:
+		idtype = P_ALL;
+		break;
+	case LX_P_PID:
+		idtype = P_PID;
+		break;
+	case LX_P_GID:
+		idtype = P_PGID;
+		break;
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	if ((error = lx_call_waitid(idtype, id, &info, native_options,
+	    extra_options)) != 0) {
+		return (set_errno(error));
+	}
+
+	/*
+	 * If the WNOHANG flag was specified and no child was found return 0.
+	 */
+	if ((native_options & WNOHANG) && info.si_pid == 0) {
+		return (0);
+	}
+
+#if defined(_SYSCALL32_IMPL)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		return (stol_ksiginfo32_copyout(&info, (void *)infop));
+	} else
+#endif
+	{
+		return (stol_ksiginfo_copyout(&info, (void *)infop));
+	}
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
new file mode 100644
index 0000000000..bd7667226f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c
@@ -0,0 +1,371 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/pathname.h>
+
+
+#define	LX_XATTR_NAME_MAX	255
+#define	LX_XATTR_SIZE_MAX	65536
+#define	LX_XATTR_LIST_MAX	65536
+
+#define	LX_XATTR_FLAG_CREATE	0x1
+#define	LX_XATTR_FLAG_REPLACE	0x2
+#define	LX_XATTR_FLAGS_VALID	(LX_XATTR_FLAG_CREATE | LX_XATTR_FLAG_REPLACE)
+
+#define	LX_CAP_XATTR_NAME	"security.capability"
+
+/*
+ * *xattr() family of functions.
+ *
+ * These are largely unimplemented.  In most cases we return EOPNOTSUPP, rather
+ * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1).
+ *
+ * Note that CRED() is used instead of f_cred in the f*xattr functions.  This
+ * is intentional as Linux does not have the same notion of per-fd credentials.
+ */
+
+/* ARGSUSED */
+static int
+lx_setxattr_common(vnode_t *vp, char *name, void *value, size_t size,
+    int flags)
+{
+	int error;
+	char name_buf[LX_XATTR_NAME_MAX + 1];
+	size_t name_len;
+
+	if ((flags & ~LX_XATTR_FLAGS_VALID) != 0) {
+		return (EINVAL);
+	}
+	error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+	if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+		return (ERANGE);
+	} else if (error != 0) {
+		return (EFAULT);
+	}
+	if (size > LX_XATTR_SIZE_MAX) {
+		return (E2BIG);
+	}
+
+	/*
+	 * In order to keep package management software happy, despite lacking
+	 * support for file-based Linux capabilities via xattrs, we fake
+	 * success when root attempts a setxattr on that attribute.
+	 */
+	if (crgetuid(CRED()) == 0 &&
+	    strcmp(name_buf, LX_CAP_XATTR_NAME) == 0) {
+		return (0);
+	}
+
+
+	return (EOPNOTSUPP);
+}
+
+/* ARGSUSED */
+static int
+lx_getxattr_common(vnode_t *vp, char *name, char *value, size_t size,
+    ssize_t *osize)
+{
+	int error;
+	char name_buf[LX_XATTR_NAME_MAX + 1];
+	size_t name_len;
+
+	error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+	if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+		return (ERANGE);
+	} else if (error != 0) {
+		return (EFAULT);
+	}
+
+	/*
+	 * Only parameter validation is attempted for now.
+	 */
+	return (EOPNOTSUPP);
+}
+
+/* ARGSUSED */
+static int
+lx_listxattr_common(vnode_t *vp, char *list, size_t size, ssize_t *osize)
+{
+	return (EOPNOTSUPP);
+}
+
+/* ARGSUSED */
+static int
+lx_removexattr_common(vnode_t *vp, char *name)
+{
+	int error;
+	char name_buf[LX_XATTR_NAME_MAX + 1];
+	size_t name_len;
+
+	error = copyinstr(name, name_buf, sizeof (name_buf), &name_len);
+	if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) {
+		return (ERANGE);
+	} else if (error != 0) {
+		return (EFAULT);
+	}
+
+	/*
+	 * Only parameter validation is attempted for now.
+	 */
+	return (EOPNOTSUPP);
+}
+
+
+long
+lx_setxattr(char *path, char *name, void *value, size_t size, int flags)
+{
+	int error;
+	vnode_t *vp = NULL;
+
+	error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_setxattr_common(vp, name, value, size, flags);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_lsetxattr(char *path, char *name, void *value, size_t size, int flags)
+{
+	int error;
+	vnode_t *vp = NULL;
+
+	error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_setxattr_common(vp, name, value, size, flags);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+long
+lx_fsetxattr(int fd, char *name, void *value, size_t size, int flags)
+{
+	int error;
+	file_t *fp;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+
+	error = lx_setxattr_common(fp->f_vnode, name, value, size, flags);
+	releasef(fd);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+ssize_t
+lx_getxattr(char *path, char *name, void *value, size_t size)
+{
+	int error;
+	vnode_t *vp = NULL;
+	ssize_t osize;
+
+	error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_getxattr_common(vp, name, value, size, &osize);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+ssize_t
+lx_lgetxattr(char *path, char *name, void *value, size_t size)
+{
+
+	int error;
+	vnode_t *vp = NULL;
+	ssize_t osize;
+
+	error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_getxattr_common(vp, name, value, size, &osize);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+ssize_t
+lx_fgetxattr(int fd, char *name, void *value, size_t size)
+{
+	int error;
+	file_t *fp;
+	ssize_t osize;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+
+	error = lx_getxattr_common(fp->f_vnode, name, value, size, &osize);
+	releasef(fd);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+ssize_t
+lx_listxattr(char *path, char *list, size_t size)
+{
+	int error;
+	vnode_t *vp = NULL;
+	ssize_t osize;
+
+	error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_listxattr_common(vp, list, size, &osize);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+ssize_t
+lx_llistxattr(char *path, char *list, size_t size)
+{
+	int error;
+	vnode_t *vp = NULL;
+	ssize_t osize;
+
+	error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_listxattr_common(vp, list, size, &osize);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+ssize_t
+lx_flistxattr(int fd, char *list, size_t size)
+{
+	int error;
+	file_t *fp;
+	ssize_t osize;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+
+	error = lx_listxattr_common(fp->f_vnode, list, size, &osize);
+	releasef(fd);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (osize);
+}
+
+int
+lx_removexattr(char *path, char *name)
+{
+	int error;
+	vnode_t *vp = NULL;
+
+	error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_removexattr_common(vp, name);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+int
+lx_lremovexattr(char *path, char *name)
+{
+	int error;
+	vnode_t *vp = NULL;
+
+	error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+
+	error = lx_removexattr_common(vp, name);
+	VN_RELE(vp);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
+
+int
+lx_fremovexattr(int fd, char *name)
+{
+	int error;
+	file_t *fp;
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+
+	error = lx_removexattr_common(fp->f_vnode, name);
+	releasef(fd);
+
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h
new file mode 100644
index 0000000000..93dc316c1e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h
@@ -0,0 +1,196 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_LXSYSFS_H
+#define	_LXSYSFS_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * lx_sysfs.h: declarations, data structures and macros for lx_sysfs
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+#include <sys/netstack.h>
+#include <inet/ip.h>
+#include <inet/ip_if.h>
+
+/*
+ * Convert a vnode into an lxsys_mnt_t
+ */
+#define	VTOLXSM(vp)	((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxsys_node
+ */
+#define	VTOLXS(vp)	((lxsys_node_t *)(vp)->v_data)
+
+/*
+ * convert a lxsys_node into a vnode
+ */
+#define	LXSTOV(lxsnp)	((lxsnp)->lxsys_vnode)
+
+/*
+ * convert a lxsys_node into zone for fs
+ */
+#define	LXSTOZ(lxsnp) \
+	(((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone)
+
+#define	LXSNSIZ		256	/* max size of lx /sys file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define	LXSYS_SDSIZE	16
+
+/* Root sysfs lxsys_instance */
+#define	LXSYS_INST_ROOT	0
+
+/*
+ * Node/file types for lx /sys files
+ * (directories and files contained therein).
+ */
+typedef enum lxsys_nodetype {
+	LXSYS_NONE,		/* None-type to keep inodes non-zero	*/
+	LXSYS_STATIC,		/* Statically defined entries		*/
+	LXSYS_CLASS_NET,	/* /sys/class/net/<iface>		*/
+	LXSYS_DEV_NET,		/* /sys/devices/virtual/net/<iface>	*/
+	LXSYS_BLOCK,		/* /sys/block/<dev>			*/
+	LXSYS_DEV_ZFS,		/* /sys/devices/zfs/<dev>		*/
+	LXSYS_DEV_SYS_CPU,	/* /sys/devices/system/cpu/<cpu>	*/
+	LXSYS_DEV_SYS_CPUINFO,	/* /sys/devices/system/cpu/cpuN/<info>	*/
+	LXSYS_DEV_SYS_NODE,	/* /sys/devices/system/node/node0/<info> */
+	LXSYS_MAXTYPE,		/* type limit				*/
+} lxsys_nodetype_t;
+
+/*
+ * external dirent characteristics
+ */
+typedef struct {
+	unsigned int	d_idnum;
+	char		*d_name;
+} lxsys_dirent_t;
+
+typedef struct {
+	unsigned int	dl_instance;
+	lxsys_dirent_t	*dl_list;
+	int		dl_length;
+} lxsys_dirlookup_t;
+
+/*
+ * This is the lx sysfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+struct lxsys_node;
+typedef struct lxsys_node lxsys_node_t;
+struct lxsys_node {
+	lxsys_nodetype_t	lxsys_type;	/* type ID of node 	*/
+	unsigned int		lxsys_instance;	/* instance ID node	*/
+	unsigned int		lxsys_endpoint;	/* endpoint ID node	*/
+	vnode_t			*lxsys_vnode;	/* vnode for the node	*/
+	vnode_t			*lxsys_parentvp; /* parent directory	*/
+	lxsys_node_t		*lxsys_next;	/* next list entry	*/
+	timestruc_t		lxsys_time;	/* creation time	*/
+	mode_t			lxsys_mode;	/* file mode bits	*/
+	uid_t			lxsys_uid;	/* file owner		*/
+	gid_t			lxsys_gid;	/* file group owner	*/
+	ino_t			lxsys_ino;	/* node id		*/
+};
+
+/*
+ * This is the lxsysfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxsys_mnt {
+	kmutex_t	lxsysm_lock;	/* protects fields		*/
+	lxsys_node_t	*lxsysm_node;	/* node at root of sys mount	*/
+	zone_t		*lxsysm_zone;	/* zone for this mount		*/
+} lxsys_mnt_t;
+
+extern vnodeops_t	*lxsys_vnodeops;
+
+typedef struct mounta	mounta_t;
+
+extern void lxsys_initnodecache();
+extern void lxsys_fininodecache();
+extern ino_t lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int);
+extern ino_t lxsys_parentinode(lxsys_node_t *);
+extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int,
+    unsigned int);
+extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int);
+extern void lxsys_freenode(lxsys_node_t *);
+
+extern netstack_t *lxsys_netstack(lxsys_node_t *);
+extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t);
+
+typedef struct lxpr_uiobuf {
+	uio_t *uiop;
+	char *buffer;
+	uint32_t bufsize;
+	char *pos;
+	size_t beg;
+	int error;
+} lxsys_uiobuf_t;
+
+extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *);
+extern void lxsys_uiobuf_free(lxsys_uiobuf_t *);
+extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int);
+extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *);
+extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t);
+extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#ifndef islower
+#define	islower(x)	(((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z'))
+#endif
+#ifndef toupper
+#define	toupper(x)	(islower(x) ? (x) - 'a' + 'A' : (x))
+#endif
+
+#endif /* _LXSYSFS_H */
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c
new file mode 100644
index 0000000000..3184b34d08
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c
@@ -0,0 +1,457 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * lx_syssubr.c: Various functions for the /sys vnodeops.
+ */
+
+#include <sys/varargs.h>
+
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lx_sysfs.h"
+
+#define	LXSYSCACHE_NAME "lxsys_cache"
+
+static int lxsys_node_constructor(void *, void *, int);
+static void lxsys_node_destructor(void *, void *);
+
+static kmem_cache_t *lxsys_node_cache;
+
+void
+lxsys_initnodecache()
+{
+	lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME,
+	    sizeof (lxsys_node_t), 0,
+	    lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxsys_fininodecache()
+{
+	kmem_cache_destroy(lxsys_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxsys_node_constructor(void *buf, void *un, int kmflags)
+{
+	lxsys_node_t	*lxsnp = buf;
+	vnode_t		*vp;
+
+	vp = lxsnp->lxsys_vnode = vn_alloc(kmflags);
+	if (vp == NULL)
+		return (-1);
+
+	(void) vn_setops(vp, lxsys_vnodeops);
+	vp->v_data = lxsnp;
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+lxsys_node_destructor(void *buf, void *un)
+{
+	lxsys_node_t	*lxsnp = buf;
+
+	vn_free(LXSTOV(lxsnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxsys node
+ */
+ino_t
+lxsys_inode(lxsys_nodetype_t type, unsigned int instance,
+    unsigned int endpoint)
+{
+	/*
+	 * Sysfs Inode format:
+	 * 0000AABBBBCC
+	 *
+	 * AA - TYPE
+	 * BBBB - INSTANCE
+	 * CC - ENDPOINT
+	 */
+	ASSERT(instance <= 0xffff);
+	ASSERT(endpoint <= 0xff);
+
+	return ((ino_t)(type << 24)|(instance << 8)|endpoint);
+}
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxsys_parentinode(lxsys_node_t *lxsnp)
+{
+	/*
+	 * If the input node is the root then the parent inode
+	 * is the mounted on inode so just return our inode number
+	 */
+	if (lxsnp->lxsys_type == LXSYS_STATIC &&
+	    lxsnp->lxsys_instance == LXSYS_INST_ROOT) {
+		return (lxsnp->lxsys_ino);
+	} else {
+		return (VTOLXS(lxsnp->lxsys_parentvp)->lxsys_ino);
+	}
+}
+
+/*
+ * Allocate a new lxsys node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxsys_node_t *
+lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance,
+    unsigned int endpoint)
+{
+	lxsys_node_t *lxsnp;
+	vnode_t *vp;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below)
+	 */
+	gethrestime(&now);
+	lxsnp->lxsys_type = type;
+	lxsnp->lxsys_instance = instance;
+	lxsnp->lxsys_endpoint = endpoint;
+	lxsnp->lxsys_next = NULL;
+	lxsnp->lxsys_parentvp = dp;
+	VN_HOLD(dp);
+
+	lxsnp->lxsys_time = now;
+	lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0;
+	lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint);
+
+	/* initialize the vnode data */
+	vp = lxsnp->lxsys_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Default to a directory with open permissions.
+	 * Specific components will override this
+	 */
+	if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) {
+		vp->v_flag |= VROOT;
+	}
+	vp->v_type = VDIR;
+	lxsnp->lxsys_mode = 0555;
+
+	return (lxsnp);
+}
+
+lxsys_node_t *
+lxsys_getnode_static(vnode_t *dp, unsigned int instance)
+{
+	lxsys_mnt_t *lxsm = VTOLXSM(dp);
+	lxsys_node_t *lnp;
+
+	mutex_enter(&lxsm->lxsysm_lock);
+	lnp = lxsm->lxsysm_node;
+	while (1) {
+		if (lnp->lxsys_instance == instance) {
+			VERIFY(lnp->lxsys_parentvp == dp);
+
+			VN_HOLD(lnp->lxsys_vnode);
+			mutex_exit(&lxsm->lxsysm_lock);
+			return (lnp);
+		} else if (lnp->lxsys_next == NULL) {
+			break;
+		}
+		lnp = lnp->lxsys_next;
+	}
+
+	/*
+	 * No persistent node found.
+	 * Create one and add it to the end of the list.
+	 */
+	lnp->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0);
+	lnp = lnp->lxsys_next;
+	/* Allow mounts on static entries */
+	LXSTOV(lnp)->v_flag &= (~VNOMOUNT);
+
+	mutex_exit(&lxsm->lxsysm_lock);
+	return (lnp);
+}
+
+/* Clean up persistence for static lxsys_node */
+int
+lxsys_freenode_static(lxsys_node_t *lnp)
+{
+	lxsys_node_t *plnp;
+	vnode_t *vp = LXSTOV(lnp);
+	lxsys_mnt_t *lxsm = VTOLXSM(vp);
+
+	if (lnp->lxsys_instance == LXSYS_INST_ROOT) {
+		/*
+		 * The root vnode does not need special cleanup since it
+		 * anchors the list and is freed by lxsys_unmount.
+		 */
+		return (0);
+	}
+
+	mutex_enter(&lxsm->lxsysm_lock);
+
+	/*
+	 * It is possible that a different process acquired a fresh reference
+	 * to this vnode via lookup while we were waiting on the lxsysm_lock.
+	 * To avoid freeing the vnode out from under them, we will double-check
+	 * v_count and bail from the fop_inactive if it was grabbed.
+	 */
+	mutex_enter(&vp->v_lock);
+	if (vp->v_count != 1) {
+		VERIFY(vp->v_count > 0);
+
+		/* Release our hold before bailing out of lxsys_inactive */
+		vp->v_count--;
+
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&lxsm->lxsysm_lock);
+		return (-1);
+	}
+	mutex_exit(&vp->v_lock);
+
+	/* search for the record pointing to lnp */
+	plnp = lxsm->lxsysm_node;
+	while (plnp != NULL && plnp->lxsys_next != lnp) {
+		plnp = plnp->lxsys_next;
+	}
+	/* entry should always be found */
+	VERIFY(plnp != NULL);
+	plnp->lxsys_next = lnp->lxsys_next;
+
+	mutex_exit(&lxsm->lxsysm_lock);
+	return (0);
+}
+
+/*
+ * Free the storage obtained from lxsys_getnode().
+ */
+void
+lxsys_freenode(lxsys_node_t *lxsnp)
+{
+	vnode_t *vp = LXSTOV(lxsnp);
+
+	VERIFY(vp != NULL);
+
+	if (lxsnp->lxsys_type == LXSYS_STATIC) {
+		if (lxsys_freenode_static(lxsnp) != 0) {
+			return;
+		}
+	}
+
+	/*
+	 * delete any association with parent vp
+	 */
+	if (lxsnp->lxsys_parentvp != NULL)
+		VN_RELE(lxsnp->lxsys_parentvp);
+
+	/*
+	 * Release the lxsysnode.
+	 */
+	kmem_cache_free(lxsys_node_cache, lxsnp);
+}
+
+/*
+ * Get the netstack associated with this lxsys mount
+ */
+netstack_t *
+lxsys_netstack(lxsys_node_t *lnp)
+{
+	zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone;
+	netstack_t *ns = zone->zone_netstack;
+
+	VERIFY(ns != NULL);
+
+	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) {
+		ns = NULL;
+	} else {
+		netstack_hold(ns);
+	}
+
+	return (ns);
+}
+
+/* Find a usable (non-condemned) ill for ifindex; returns it held, or NULL. */
+ill_t *
+lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex)
+{
+	ill_t *ill;
+	phyint_t *phyi;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
+	    (void *) &ifindex, NULL);
+	if (phyi != NULL) {
+		/*
+		 * Interface information presented via /sys is not specific to
+		 * IPv4 or IPv6, so a reference from either protocol will do.
+		 * Examine the IPv4 ill, then the IPv6 ill, and then stop.
+		 */
+		for (ill = phyi->phyint_illv4; ; ill = phyi->phyint_illv6) {
+			if (ill != NULL) {
+				mutex_enter(&ill->ill_lock);
+				if (!ILL_IS_CONDEMNED(ill)) {
+					ill_refhold_locked(ill);
+					mutex_exit(&ill->ill_lock);
+					rw_exit(&ipst->ips_ill_g_lock);
+					return (ill);
+				}
+				mutex_exit(&ill->ill_lock);
+			}
+			if (ill == phyi->phyint_illv6)
+				break;
+		}
+	}
+	rw_exit(&ipst->ips_ill_g_lock);
+	return (NULL);
+}
+
+#define	LXSYSUIOBUFSZ	4096
+
+lxsys_uiobuf_t *
+lxsys_uiobuf_new(uio_t *uiop)
+{
+	/* Allocate memory for both lxsys_uiobuf and output buffer */
+	int bufsize = LXSYSUIOBUFSZ;
+	lxsys_uiobuf_t *uiobuf =
+	    kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP);
+
+	uiobuf->uiop = uiop;
+	uiobuf->buffer = (char *)&uiobuf[1];
+	uiobuf->bufsize = bufsize;
+	uiobuf->pos = uiobuf->buffer;
+	uiobuf->beg = 0;
+	uiobuf->error = 0;
+
+	return (uiobuf);
+}
+
+void
+lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf)
+{
+	ASSERT(uiobuf != NULL);
+	ASSERT(uiobuf->pos == uiobuf->buffer);
+
+	kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize);
+}
+
+void
+lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err)
+{
+	ASSERT(uiobuf->error == 0);
+
+	uiobuf->error = err;
+}
+
+int
+lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf)
+{
+	off_t off = uiobuf->uiop->uio_offset;
+	caddr_t uaddr = uiobuf->buffer;
+	size_t beg = uiobuf->beg;
+	size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+	if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		ASSERT(off >= beg);
+
+		if (beg + size > off && off >= 0)
+			uiobuf->error =
+			    uiomove(uaddr + (off - beg), size - (off - beg),
+			    UIO_READ, uiobuf->uiop);
+
+		uiobuf->beg += size;
+	}
+
+	uiobuf->pos = uaddr;
+
+	return (uiobuf->error);
+}
+
+void
+lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size)
+{
+	/* While we can still carry on */
+	while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		uintptr_t remain = (uintptr_t)uiobuf->bufsize -
+		    ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+		/* Enough space in buffer? */
+		if (remain >= size) {
+			bcopy(buf, uiobuf->pos, size);
+			uiobuf->pos += size;
+			return;
+		}
+
+		/* Not enough space, so copy all we can and try again */
+		bcopy(buf, uiobuf->pos, remain);
+		uiobuf->pos += remain;
+		(void) lxsys_uiobuf_flush(uiobuf);
+		buf += remain;
+		size -= remain;
+	}
+}
+
+#define	TYPBUFFSIZE 256
+
+/* Append printf-style formatted output to uiobuf via lxsys_uiobuf_write(). */
+void
+lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...)
+{
+	va_list args;
+	char buff[TYPBUFFSIZE];
+	int len;
+	char *buffer;
+
+	/* Can we still do any output */
+	if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+		return;
+
+	/* Try using stack allocated buffer */
+	va_start(args, fmt);
+	len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+	va_end(args);
+	if (len < TYPBUFFSIZE) {
+		lxsys_uiobuf_write(uiobuf, buff, len);
+		return;
+	}
+
+	/*
+	 * Not enough space in the stack buffer.  The first pass consumed
+	 * args, so restart the list before formatting into a heap buffer
+	 * sized from the length the first pass reported.
+	 */
+	buffer = kmem_alloc(len + 1, KM_SLEEP);
+	va_start(args, fmt);
+	(void) vsnprintf(buffer, len + 1, fmt, args);
+	va_end(args);
+	lxsys_uiobuf_write(uiobuf, buffer, len);
+	kmem_free(buffer, len + 1);
+}
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c
new file mode 100644
index 0000000000..9bb1d70527
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c
@@ -0,0 +1,348 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_sysvfsops.c: vfs operations for lx sysfs.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/lx_impl.h>
+
+#include "lx_sysfs.h"
+
+/* Module level parameters */
+/* File system type index; recorded from the fstype handed to lxsys_init() */
+static int	lxsysfstype;
+/* Pseudo device number built by lxsys_init() and stamped on every mount */
+static dev_t	lxsysdev;
+/* Taken by both lxsys_mount() and lxsys_unmount() */
+static kmutex_t	lxsys_mount_lock;
+
+static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxsys_unmount(vfs_t *, int, cred_t *);
+static int lxsys_root(vfs_t *, vnode_t **);
+static int lxsys_statvfs(vfs_t *, statvfs64_t *);
+static int lxsys_init(int, char *);
+
+/*
+ * File system definition: names the file system "lx_sysfs" and nominates
+ * lxsys_init() as its initialisation routine.  It is reached by the module
+ * framework through modlfs below.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_sysfs",
+	lxsys_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand sysfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlfs, NULL
+};
+
+/*
+ * Loadable module entry point: hands the module linkage to mod_install() and
+ * returns its result unchanged.
+ */
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/*
+ * Loadable module entry point: reports module information by passing the
+ * module linkage and the caller's modinfo buffer to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Loadable module entry point for unload.  If mod_remove() refuses, its error
+ * is returned and nothing is torn down.  Otherwise the node cache, the vfs
+ * and vnode operation vectors, and the mount lock are released.
+ */
+int
+_fini(void)
+{
+	int retval = mod_remove(&modlinkage);
+
+	/* The module could not be removed; leave all state intact */
+	if (retval != 0)
+		return (retval);
+
+	/* Tear down the lxsys_node cache */
+	lxsys_fininodecache();
+
+	/* Release the vfsops and vnodeops vectors */
+	(void) vfs_freevfsops_by_type(lxsysfstype);
+	vn_freevnodeops(lxsys_vnodeops);
+
+	mutex_destroy(&lxsys_mount_lock);
+
+	return (0);
+}
+
+/*
+ * lxsys_init(): initialisation routine named in the vfsdef for this fs.
+ *
+ * Records the file system type, initialises the mount lock, registers the vfs
+ * and vnode operation vectors, builds the pseudo device number reported by
+ * this file system, and sets up the lxsys_node cache.
+ *
+ * Returns 0 on success, or the error from vfs_setfsops()/vn_make_ops().  On
+ * failure everything set up so far, including the mount lock, is undone.
+ * Failing to obtain a unique device number is not fatal: a warning is logged
+ * and major number 0 is used instead.
+ */
+static int
+lxsys_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxsys_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = lxsys_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = lxsys_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = lxsys_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = lxsys_statvfs },
+		NULL,			NULL
+	};
+	extern const fs_operation_def_t lxsys_vnodeops_template[];
+	int error;
+	major_t dev;
+
+	lxsysfstype = fstype;
+	ASSERT(lxsysfstype != 0);
+
+	mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * Associate VFS ops vector with this fstype.
+	 */
+	error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "lxsys_init: bad vfs ops template");
+		/* _fini() will not run for a failed init; clean up here */
+		mutex_destroy(&lxsys_mount_lock);
+		return (error);
+	}
+
+	/*
+	 * Set up vnode ops vector too.
+	 */
+	error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxsys_init: bad vnode ops template");
+		mutex_destroy(&lxsys_mount_lock);
+		return (error);
+	}
+
+	/*
+	 * Assign a unique "device" number (used by stat(2)).
+	 */
+	if ((dev = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN, "lxsys_init: can't get unique device number");
+		dev = 0;
+	}
+
+	/*
+	 * Make the pseudo device
+	 */
+	lxsysdev = makedevice(dev, 0);
+
+	/*
+	 * Initialise cache for lxsys_nodes
+	 */
+	lxsys_initnodecache();
+
+	return (0);
+}
+
+/*
+ * lxsys_mount(): VFS mount operation.
+ *
+ * Allocates the per-mount state, creates the root node parented on the
+ * mounted-over directory, and fills in the vfs fields (fsid, block size,
+ * fstype, private data, device).
+ *
+ * Returns 0 on success, EPERM if secpolicy_fs_mount() refuses the caller,
+ * ENOTDIR if the mount point is not a directory, and EBUSY if either a
+ * global-zone caller targets a mount point outside the global zone or the
+ * mount would overlay an in-use/root vnode without MS_OVERLAY.
+ */
+static int
+lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+	lxsys_mnt_t *lxsys_mnt;
+	zone_t *zone = curproc->p_zone;
+
+	/*
+	 * must be root to mount
+	 */
+	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+		return (EPERM);
+
+	/*
+	 * mount point must be a directory
+	 */
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/*
+	 * A global-zone caller may only mount on a path that itself belongs
+	 * to the global zone.  The zone pointer is only compared, never
+	 * dereferenced, after the hold is dropped.
+	 */
+	if (zone == global_zone) {
+		zone_t *mntzone;
+
+		mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+		zone_rele(mntzone);
+		if (zone != mntzone)
+			return (EBUSY);
+	}
+
+	/*
+	 * Having the resource be anything but "lxsys" doesn't make sense
+	 */
+	vfs_setresource(vfsp, "lxsys", 0);
+
+	/* Allocated before taking the lock; freed again on the EBUSY path */
+	lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP);
+
+	mutex_enter(&lxsys_mount_lock);
+
+	/*
+	 * Ensure we don't allow overlaying mounts
+	 */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		mutex_exit(&lxsys_mount_lock);
+		kmem_free(lxsys_mnt, sizeof ((*lxsys_mnt)));
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+
+	mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL);
+	/* Record the mounting zone and hold it; released in lxsys_unmount() */
+	zone_hold(lxsys_mnt->lxsysm_zone = zone);
+
+	/* Arbitrarily set the parent vnode to the mounted over directory */
+	lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC,
+	    LXSYS_INST_ROOT, 0);
+	lxsys_mnt->lxsysm_node->lxsys_next = NULL;
+
+	/* Correctly set the fs for the root node */
+	lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp;
+
+	vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype);
+	vfsp->vfs_bsize = DEV_BSIZE;
+	vfsp->vfs_fstype = lxsysfstype;
+	vfsp->vfs_data = (caddr_t)lxsys_mnt;
+	vfsp->vfs_dev = lxsysdev;
+
+	mutex_exit(&lxsys_mount_lock);
+
+	return (0);
+}
+
+/*
+ * lxsys_unmount(): VFS unmount operation.
+ *
+ * Returns EPERM if secpolicy_fs_unmount() refuses the caller, ENOTSUP for a
+ * forced unmount (MS_FORCE), and EBUSY while the root vnode has more than one
+ * reference.  On success the root node is freed, the zone hold taken at mount
+ * time is dropped, and the per-mount state is released.
+ */
+static int
+lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+	lxsys_mnt_t *mnt = (lxsys_mnt_t *)vfsp->vfs_data;
+	lxsys_node_t *rootnode;
+	vnode_t *rootvp;
+	int refs;
+	int err = 0;
+
+	VERIFY(mnt != NULL);
+
+	mutex_enter(&lxsys_mount_lock);
+
+	/* Only a suitably privileged caller may unmount */
+	if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+		err = EPERM;
+		goto out;
+	}
+
+	/* This fs has no support for forced unmount */
+	if (flag & MS_FORCE) {
+		err = ENOTSUP;
+		goto out;
+	}
+
+	/* Sample the root vnode's reference count under its own lock */
+	rootnode = mnt->lxsysm_node;
+	rootvp = LXSTOV(rootnode);
+	mutex_enter(&rootvp->v_lock);
+	refs = rootvp->v_count;
+	mutex_exit(&rootvp->v_lock);
+
+	/* Anything beyond the mount's own reference means we are in use */
+	if (refs > 1) {
+		err = EBUSY;
+		goto out;
+	}
+
+	/*
+	 * With no outside references to the root vnode, the list of
+	 * persistent static vnodes hanging off it must already be empty.
+	 */
+	VERIFY(rootnode->lxsys_next == NULL);
+
+	(void) dnlc_purge_vfsp(vfsp, 0);
+
+	mnt->lxsysm_node = NULL;
+	lxsys_freenode(rootnode);
+	zone_rele(mnt->lxsysm_zone);
+	vfsp->vfs_data = NULL;
+	kmem_free(mnt, sizeof (*mnt));
+
+out:
+	mutex_exit(&lxsys_mount_lock);
+	return (err);
+}
+
+/*
+ * lxsys_root(): VFS root operation.  Hands back the vnode of the mount's root
+ * node with an additional hold on it.  Always returns 0.
+ */
+static int
+lxsys_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	lxsys_mnt_t *mnt = (lxsys_mnt_t *)vfsp->vfs_data;
+	vnode_t *rootvp;
+
+	/* A mounted lx sysfs always has per-mount data and a root node */
+	VERIFY(mnt != NULL);
+	VERIFY(mnt->lxsysm_node != NULL);
+
+	rootvp = LXSTOV(mnt->lxsysm_node);
+	VN_HOLD(rootvp);
+	*vpp = rootvp;
+
+	return (0);
+}
+
+/*
+ * lxsys_statvfs(): VFS statvfs operation.  Reports fixed values: no blocks,
+ * three files, no free files, and a 64-character name limit.  Always
+ * returns 0.
+ */
+static int
+lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+	dev32_t fsid32;
+
+	/* Start from all zeroes; this also clears f_fstr */
+	bzero((caddr_t)sp, sizeof (*sp));
+
+	/* Block accounting: there is no backing store */
+	sp->f_bsize = DEV_BSIZE;
+	sp->f_frsize = DEV_BSIZE;
+	sp->f_blocks = 0;
+	sp->f_bfree = 0;
+	sp->f_bavail = 0;
+
+	/* File accounting */
+	sp->f_files = 3;
+	sp->f_ffree = 0;
+	sp->f_favail = 0;
+
+	/* Identity */
+	(void) cmpldev(&fsid32, vfsp->vfs_dev);
+	sp->f_fsid = fsid32;
+	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	sp->f_namemax = 64;		/* quite arbitrary */
+
+	/* It is guaranteed that vsw_name will fit in f_basetype */
+	(void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name);
+
+	/*
+	 * f_fstr is known to be 32 chars.  "/sys" is stored twice, at offsets
+	 * 0 and 6.
+	 * NOTE(review): offset 6 presumably mirrors a legacy pair of 6-byte
+	 * name fields -- confirm against the statvfs consumers.
+	 */
+	(void) strcpy(&sp->f_fstr[0], "/sys");
+	(void) strcpy(&sp->f_fstr[6], "/sys");
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c
new file mode 100644
index 0000000000..f3df77428c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c
@@ -0,0 +1,1796 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lx_sysfs -- a Linux-compatible /sys for the LX brand
+ */
+
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+#include <sys/param.h>
+#include <sys/utsname.h>
+#include <sys/lx_misc.h>
+#include <sys/brand.h>
+#include <sys/cred_impl.h>
+#include <sys/tihdr.h>
+#include <sys/sunddi.h>
+#include <sys/vnode.h>
+#include <sys/netstack.h>
+#include <sys/ethernet.h>
+#include <inet/ip_arp.h>
+
+#include "lx_sysfs.h"
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * It is filled in by vn_make_ops() in lxsys_init() and released with
+ * vn_freevnodeops() in _fini(), both in lx_sysvfsops.c.
+ */
+vnodeops_t *lxsys_vnodeops;
+
+static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *,
+    caller_context_t *);
+static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *,
+    caller_context_t *);
+static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxsys_lookup(vnode_t *, char *, vnode_t **,
+    pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+    pathname_t *);
+static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *,
+    caller_context_t *, int);
+static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxsys_sync(void);
+static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_blockdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_zfsdir(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_syscpu(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_syscpuinfo(lxsys_node_t *, char *);
+static vnode_t *lxsys_lookup_devices_sysnode(lxsys_node_t *, char *);
+
+static int lxsys_read_static(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_zfs_block(lxsys_node_t *, lxsys_uiobuf_t *);
+static int lxsys_read_devices_sysnode(lxsys_node_t *, lxsys_uiobuf_t *);
+
+static int lxsys_readdir_devices_syscpu(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_syscpuinfo(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_sysnode(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_blockdir(lxsys_node_t *, uio_t *, int *);
+static int lxsys_readdir_devices_zfsdir(lxsys_node_t *, uio_t *, int *);
+
+static int lxsys_readlink_class_net(lxsys_node_t *, char *, size_t);
+static int lxsys_readlink_block(lxsys_node_t *, char *, size_t);
+
+/*
+ * The lx /sys vnode operations vector
+ *
+ * This NULL-terminated template is handed to vn_make_ops() by lxsys_init().
+ * Note that FSYNC and SEEK are not given dedicated handlers: both are routed
+ * through the generic '.error' slot to lxsys_sync().
+ */
+const fs_operation_def_t lxsys_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = lxsys_open },
+	VOPNAME_CLOSE,		{ .vop_close = lxsys_close },
+	VOPNAME_READ,		{ .vop_read = lxsys_read },
+	VOPNAME_GETATTR,	{ .vop_getattr = lxsys_getattr },
+	VOPNAME_ACCESS,		{ .vop_access = lxsys_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = lxsys_lookup },
+	VOPNAME_READDIR,	{ .vop_readdir = lxsys_readdir },
+	VOPNAME_READLINK,	{ .vop_readlink = lxsys_readlink },
+	VOPNAME_FSYNC,		{ .error = lxsys_sync },
+	VOPNAME_SEEK,		{ .error = lxsys_sync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = lxsys_inactive },
+	VOPNAME_CMP,		{ .vop_cmp = lxsys_cmp },
+	NULL,			NULL
+};
+
+/*
+ * Sysfs Inode format:
+ * 0000AABBBBCC
+ *
+ * AA - TYPE
+ * BBBB - INSTANCE
+ * CC - ENDPOINT
+ *
+ * Where TYPE is one of:
+ * 1 - SYS_STATIC
+ * 2 - SYS_CLASS_NET
+ * 3 - SYS_DEV_NET
+ * 4 - SYS_BLOCK
+ * 5 - SYS_DEV_ZFS
+ * 6 - SYS_DEV_SYS_CPU
+ * 7 - SYS_DEV_SYS_CPUINFO
+ * 8 - SYS_DEV_SYS_NODE
+ *
+ * Static entries will have assigned INSTANCE identifiers:
+ * - 0x00: /sys
+ * - 0x01: /sys/class
+ * - 0x02: /sys/devices
+ * - 0x03: /sys/fs
+ * - 0x04: /sys/class/net
+ * - 0x05: /sys/devices/virtual
+ * - 0x06: /sys/devices/system
+ * - 0x07: /sys/fs/cgroup
+ * - 0x08: /sys/devices/virtual/net
+ * - 0x09: /sys/block
+ * - 0x0a: /sys/devices/zfs
+ * - 0x0b: /sys/devices/system/cpu
+ * - 0x0c: /sys/devices/system/cpu/kernel_max
+ * - 0x0d: /sys/devices/system/node
+ *
+ * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived
+ * from the corresponding ifindex.
+ *
+ * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use
+ * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs
+ * for the contained entries.
+ *
+ * Dynamic /sys/block/<dev> symlinks will use an INSTANCE derived from the
+ * device major and instance from records listed in kstat or zvols.
+ *
+ * Dynamic /sys/devices/zfs/<dev> directories will use an INSTANCE derived from
+ * the emulated minor number.
+ *
+ * Static/Dynamic /sys/devices/system/cpu contains a static kernel_max file
+ * and a dynamic set of cpuN subdirectories.
+ *
+ * Static/Dynamic /sys/devices/system/node/node0 currently only contains a
+ * static cpulist file, but will likely need future dynamic entries for cpuN
+ * symlinks, and perhaps other static files. By only providing 'node0' we
+ * pretend that there is only a single NUMA node available to a zone (trying to
+ * be NUMA-aware inside a zone is generally not going to work anyway).
+ */
+
+/*
+ * INSTANCE identifiers of the static entries.  The identifier for /sys itself
+ * (LXSYS_INST_ROOT) is used in this file but is not defined in this list.
+ */
+#define	LXSYS_INST_CLASSDIR			0x1
+#define	LXSYS_INST_DEVICESDIR			0x2
+#define	LXSYS_INST_FSDIR			0x3
+#define	LXSYS_INST_CLASS_NETDIR			0x4
+#define	LXSYS_INST_DEVICES_VIRTUALDIR		0x5
+#define	LXSYS_INST_DEVICES_SYSTEMDIR		0x6
+#define	LXSYS_INST_FS_CGROUPDIR			0x7
+#define	LXSYS_INST_DEVICES_VIRTUAL_NETDIR	0x8
+#define	LXSYS_INST_BLOCKDIR			0x9
+#define	LXSYS_INST_DEVICES_ZFSDIR		0xa
+#define	LXSYS_INST_DEVICES_SYSCPU		0xb
+#define	LXSYS_INST_DEV_SYSCPU_KMAX		0xc
+#define	LXSYS_INST_DEVICES_SYSNODE		0xd
+
+/*
+ * Contents of the static lx /sys directories.  Each entry pairs the instance
+ * identifier of a child with the name it appears under.  The lists are bound
+ * to their owning directory instance by lxsys_dirlookup[].
+ */
+static lxsys_dirent_t dirlist_root[] = {
+	{ LXSYS_INST_BLOCKDIR,		"block" },
+	{ LXSYS_INST_CLASSDIR,		"class" },
+	{ LXSYS_INST_DEVICESDIR,	"devices" },
+	{ LXSYS_INST_FSDIR,		"fs" }
+};
+/* Listing for a static directory that has no children */
+static lxsys_dirent_t dirlist_empty[] = {};
+static lxsys_dirent_t dirlist_class[] = {
+	{ LXSYS_INST_CLASS_NETDIR,	"net" }
+};
+static lxsys_dirent_t dirlist_fs[] = {
+	{ LXSYS_INST_FS_CGROUPDIR,	"cgroup" }
+};
+static lxsys_dirent_t dirlist_devices[] = {
+	{ LXSYS_INST_DEVICES_SYSTEMDIR,		"system" },
+	{ LXSYS_INST_DEVICES_VIRTUALDIR,	"virtual" },
+	{ LXSYS_INST_DEVICES_ZFSDIR,		"zfs" }
+};
+static lxsys_dirent_t dirlist_devices_virtual[] = {
+	{ LXSYS_INST_DEVICES_VIRTUAL_NETDIR,	"net" }
+};
+
+/*
+ * XXX: The presence of the cpu tree in sysfs triggers new behavior in various
+ * applications. The glibc code which accesses this part of the tree expects
+ * dirents to have the d_type field populated. We cannot implement the 'cpu'
+ * hierarchy until that is addressed. One such application is java, which
+ * becomes unstable due to the incorrect data from glibc.
+ *
+ * As a consequence the "cpu" entry below is deliberately left commented out
+ * and /sys/devices/system lists only "node".
+ */
+static lxsys_dirent_t dirlist_devices_system[] = {
+	/* { LXSYS_INST_DEVICES_SYSCPU,	"cpu" }, */
+	{ LXSYS_INST_DEVICES_SYSNODE,	"node" }
+};
+
+/*
+ * ENDPOINT identifiers for the files found beneath dynamic directories, and
+ * the directory listings built from them.  Endpoint 0 is not assigned to any
+ * file; the lookup routines treat it as "the directory itself".
+ */
+
+/* Entries of /sys/devices/virtual/net/<interface> */
+#define	LXSYS_ENDP_NET_ADDRESS	1
+#define	LXSYS_ENDP_NET_ADDRLEN	2
+#define	LXSYS_ENDP_NET_FLAGS	3
+#define	LXSYS_ENDP_NET_IFINDEX	4
+#define	LXSYS_ENDP_NET_MTU	5
+#define	LXSYS_ENDP_NET_TXQLEN	6
+#define	LXSYS_ENDP_NET_TYPE	7
+
+/* Entries of /sys/devices/zfs/<dev> */
+#define	LXSYS_ENDP_BLOCK_DEVICE	1
+
+/* Entries of /sys/devices/system/node/node0 */
+#define	LXSYS_ENDP_NODE_CPULIST	1
+
+static lxsys_dirent_t dirlist_devices_virtual_net[] = {
+	{ LXSYS_ENDP_NET_ADDRESS,	"address" },
+	{ LXSYS_ENDP_NET_ADDRLEN,	"addr_len" },
+	{ LXSYS_ENDP_NET_FLAGS,		"flags" },
+	{ LXSYS_ENDP_NET_IFINDEX,	"ifindex" },
+	{ LXSYS_ENDP_NET_MTU,		"mtu" },
+	{ LXSYS_ENDP_NET_TXQLEN,	"tx_queue_len" },
+	{ LXSYS_ENDP_NET_TYPE,		"type" }
+};
+
+static lxsys_dirent_t dirlist_devices_zfs_block[] = {
+	{ LXSYS_ENDP_BLOCK_DEVICE,	"device" }
+};
+
+static lxsys_dirent_t dirlist_devices_sysnode[] = {
+	{ LXSYS_ENDP_NODE_CPULIST,	"cpulist" }
+};
+
+/* Number of elements in the array 'l' (must be a true array, not a pointer) */
+#define	SYSDIRLISTSZ(l)	(sizeof (l) / sizeof ((l)[0]))
+
+/* Build one lxsys_dirlookup entry: instance, listing, listing length */
+#define	SYSDLENT(i, l)	{ i, l, SYSDIRLISTSZ(l) }
+/*
+ * Maps the instance identifier of a static directory to its listing.
+ * lxsys_lookup_static() scans this table for the directory being searched.
+ */
+static lxsys_dirlookup_t lxsys_dirlookup[] = {
+	SYSDLENT(LXSYS_INST_ROOT, dirlist_root),
+	SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class),
+	SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs),
+	SYSDLENT(LXSYS_INST_FS_CGROUPDIR, dirlist_empty),
+	SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices),
+	SYSDLENT(LXSYS_INST_DEVICES_SYSTEMDIR, dirlist_devices_system),
+	SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual),
+	SYSDLENT(LXSYS_INST_DEVICES_SYSNODE, dirlist_devices_sysnode)
+};
+
+
+/*
+ * Array of lookup functions, indexed by lx /sys file type.
+ * lxsys_lookup() calls the selected entry unconditionally, so every type
+ * other than LXSYS_NONE must have a handler here.
+ */
+static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = {
+	NULL,					/* LXSYS_NONE		*/
+	lxsys_lookup_static,			/* LXSYS_STATIC		*/
+	lxsys_lookup_class_netdir,		/* LXSYS_CLASS_NET	*/
+	lxsys_lookup_devices_virtual_netdir,	/* LXSYS_DEV_NET	*/
+	lxsys_lookup_blockdir,			/* LXSYS_BLOCK		*/
+	lxsys_lookup_devices_zfsdir,		/* LXSYS_DEV_ZFS	*/
+	lxsys_lookup_devices_syscpu,		/* LXSYS_DEV_SYS_CPU	*/
+	lxsys_lookup_devices_syscpuinfo,	/* LXSYS_DEV_SYS_CPUINFO */
+	lxsys_lookup_devices_sysnode,		/* LXSYS_DEV_SYS_NODE	*/
+};
+
+/*
+ * Array of readdir functions, indexed by /sys file type.
+ */
+static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = {
+	NULL,					/* LXSYS_NONE		*/
+	lxsys_readdir_static,			/* LXSYS_STATIC		*/
+	lxsys_readdir_class_netdir,		/* LXSYS_CLASS_NET	*/
+	lxsys_readdir_devices_virtual_netdir,	/* LXSYS_DEV_NET	*/
+	lxsys_readdir_blockdir,			/* LXSYS_BLOCK		*/
+	lxsys_readdir_devices_zfsdir,		/* LXSYS_DEV_ZFS	*/
+	lxsys_readdir_devices_syscpu,		/* LXSYS_DEV_SYS_CPU	*/
+	lxsys_readdir_devices_syscpuinfo,	/* LXSYS_DEV_SYS_CPUINFO */
+	lxsys_readdir_devices_sysnode,		/* LXSYS_DEV_SYS_NODE	*/
+};
+
+/*
+ * Array of read functions, indexed by /sys file type.
+ * A NULL entry means the type has no readable files; lxsys_read() answers
+ * such reads with EIO.
+ */
+static int (*lxsys_read_function[LXSYS_MAXTYPE])() = {
+	NULL,					/* LXSYS_NONE		*/
+	lxsys_read_static,			/* LXSYS_STATIC		*/
+	NULL,					/* LXSYS_CLASS_NET	*/
+	lxsys_read_devices_virtual_net,		/* LXSYS_DEV_NET	*/
+	NULL,					/* LXSYS_BLOCK		*/
+	lxsys_read_devices_zfs_block,		/* LXSYS_DEV_ZFS	*/
+	NULL,					/* LXSYS_DEV_SYS_CPU	*/
+	NULL,					/* LXSYS_DEV_SYS_CPUINFO */
+	lxsys_read_devices_sysnode,		/* LXSYS_DEV_SYS_NODE	*/
+};
+
+/*
+ * Array of readlink functions, indexed by /sys file type.
+ * Only the two types whose lookups produce VLNK vnodes (class/net and block)
+ * have a handler.
+ */
+static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = {
+	NULL,					/* LXSYS_NONE		*/
+	NULL,					/* LXSYS_STATIC		*/
+	lxsys_readlink_class_net,		/* LXSYS_CLASS_NET	*/
+	NULL,					/* LXSYS_DEV_NET	*/
+	lxsys_readlink_block,			/* LXSYS_BLOCK		*/
+	NULL,					/* LXSYS_DEV_ZFS	*/
+	NULL,					/* LXSYS_DEV_SYS_CPU	*/
+	NULL,					/* LXSYS_DEV_SYS_CPUINFO */
+	NULL,					/* LXSYS_DEV_SYS_NODE	*/
+};
+
+/*
+ * Pairing of two processor identifiers for one CPU.
+ * NOTE(review): presumably these snapshot cpu_t's cpu_id and cpu_seqid for
+ * use outside cpu_lock -- confirm against the users later in this file.
+ */
+typedef struct lxsys_cpu_info {
+	processorid_t	cpu_id;
+	processorid_t	cpu_seqid;
+} lxsys_cpu_info_t;
+
+/*
+ * lxsys_open(): Vnode operation for VOP_OPEN()
+ *
+ * This file system only allows reading, so any open that asks for FWRITE is
+ * refused with EROFS; every other open succeeds.
+ */
+/* ARGSUSED */
+static int
+lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	return ((flag & FWRITE) != 0 ? EROFS : 0);
+}
+
+
+/*
+ * lxsys_close(): Vnode operation for VOP_CLOSE()
+ *
+ * There is nothing to do on close; all arguments are ignored and the call
+ * always succeeds.
+ */
+/* ARGSUSED */
+static int
+lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ct)
+{
+	return (0);
+}
+
+
+/*
+ * lxsys_read(): Vnode operation for VOP_READ()
+ *
+ * Dispatches to the read handler registered for the node's type in
+ * lxsys_read_function[].  The handler emits its output through a temporary
+ * lxsys_uiobuf, which is flushed to the caller's uio if the handler succeeds.
+ *
+ * Returns EISDIR for a directory vnode, EIO when the node's type has no read
+ * handler, and otherwise the result of the handler or of the final flush.
+ */
+/* ARGSUSED */
+static int
+lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxsys_node_t *lnp = VTOLXS(vp);
+	lxsys_nodetype_t type = lnp->lxsys_type;
+	int (*readfn)();
+	lxsys_uiobuf_t *luio;
+	int error;
+
+	VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+	if (vp->v_type == VDIR)
+		return (EISDIR);
+
+	/* No handler means this type has nothing that can be read */
+	readfn = lxsys_read_function[type];
+	if (readfn == NULL)
+		return (EIO);
+
+	luio = lxsys_uiobuf_new(uiop);
+	error = readfn(lnp, luio);
+	if (error == 0)
+		error = lxsys_uiobuf_flush(luio);
+	lxsys_uiobuf_free(luio);
+
+	return (error);
+}
+
+/*
+ * lxsys_getattr(): Vnode operation for VOP_GETATTR()
+ *
+ * Fills '*vap' from the values cached in the lxsys node.  Fields that are not
+ * set explicitly, the size among them, are left zero.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxsys_node_t *node = VTOLXS(vp);
+
+	bzero(vap, sizeof (*vap));
+
+	/* What and where the object is */
+	vap->va_type = vp->v_type;
+	vap->va_nodeid = node->lxsys_ino;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	vap->va_nlink = 1;
+
+	/* Who owns it and what they may do */
+	vap->va_mode = node->lxsys_mode;
+	vap->va_uid = node->lxsys_uid;
+	vap->va_gid = node->lxsys_gid;
+
+	/* One timestamp serves for access, modification and change */
+	vap->va_ctime = node->lxsys_time;
+	vap->va_mtime = node->lxsys_time;
+	vap->va_atime = node->lxsys_time;
+
+	/* Block accounting, derived from the (zero) size */
+	vap->va_blksize = DEV_BSIZE;
+	vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+
+	return (0);
+}
+
+/*
+ * lxsys_access(): Vnode operation for VOP_ACCESS()
+ *
+ * Returns 0 when access is granted and EACCES otherwise.  Although lx sysfs
+ * is basically a read only file system, Linux expects it to be writable, so
+ * a request containing VWRITE is not rejected out of hand; it goes through
+ * the same permission-bit check as everything else.
+ */
+/* ARGSUSED */
+static int
+lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+	lxsys_node_t *lxsnp = VTOLXS(vp);
+	int shift;
+
+	/* A caller passing the privilege check bypasses the permission bits */
+	if (secpolicy_proc_access(cr) == 0)
+		return (0);
+
+	/*
+	 * Exactly one of the owner, group or other permission sets applies.
+	 * The node's mode is shifted so that the applicable set lines up with
+	 * the owner bits used in 'mode'.
+	 */
+	if (crgetuid(cr) == lxsnp->lxsys_uid)
+		shift = 0;
+	else if (groupmember((uid_t)lxsnp->lxsys_gid, cr))
+		shift = 3;
+	else
+		shift = 6;
+
+	/* Strip every requested bit that is granted; leftovers mean denial */
+	mode &= ~(lxsnp->lxsys_mode << shift);
+
+	return (mode == 0 ? 0 : EACCES);
+}
+
+/*
+ * lxsys_lookup(): Vnode operation for VOP_LOOKUP()
+ *
+ * Resolves 'comp' within directory 'dp'.  "..", "." and the empty component
+ * are answered here with a held vnode; anything else is delegated to the
+ * lookup handler registered for the directory's node type.
+ *
+ * Returns 0 with '*vpp' set on success, the error from lxsys_access() when
+ * the caller lacks VEXEC on 'dp', or ENOENT when the handler finds nothing.
+ */
+/* ARGSUSED */
+static int
+lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+	lxsys_node_t *lxsnp = VTOLXS(dp);
+	lxsys_nodetype_t type = lxsnp->lxsys_type;
+	vnode_t *found;
+	int error;
+
+	VERIFY(dp->v_type == VDIR);
+	VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+	/* The caller needs search permission on the directory */
+	error = lxsys_access(dp, VEXEC, 0, cr, ct);
+	if (error != 0)
+		return (error);
+
+	/* ".." is simply the recorded parent vnode */
+	if (strcmp(comp, "..") == 0) {
+		found = lxsnp->lxsys_parentvp;
+		VN_HOLD(found);
+		*vpp = found;
+		return (0);
+	}
+
+	/*
+	 * "." and the null component both denote the directory itself.  'dp'
+	 * is already known to be a directory from the VERIFY above.
+	 */
+	if (*comp == '\0' || strcmp(comp, ".") == 0) {
+		VN_HOLD(dp);
+		*vpp = dp;
+		return (0);
+	}
+
+	found = lxsys_lookup_function[type](lxsnp, comp);
+	*vpp = found;
+
+	return (found == NULL ? ENOENT : 0);
+}
+
+/*
+ * Search the current zone's list of virtual disks for one named 'comp' and
+ * return a node of the given 'type' for it, parented on 'ldp'.  The node's
+ * instance is the low 16 bits of the disk's emulated minor number; disks for
+ * which that value is 0 are never matched.
+ *
+ * Returns NULL if the zone has no lx zone data or no disk matches.
+ */
+static lxsys_node_t *
+lxsys_lookup_disk(lxsys_node_t *ldp, char *comp, lxsys_nodetype_t type)
+{
+	lx_zone_data_t *lxzdata = ztolxzd(curproc->p_zone);
+	lx_virt_disk_t *vd;
+
+	if (lxzdata == NULL)
+		return (NULL);
+	ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+	for (vd = list_head(lxzdata->lxzd_vdisks); vd != NULL;
+	    vd = list_next(lxzdata->lxzd_vdisks, vd)) {
+		int inst = getminor(vd->lxvd_emul_dev) & 0xffff;
+
+		if (inst == 0 || strcmp(vd->lxvd_name, comp) != 0)
+			continue;
+
+		return (lxsys_getnode(ldp->lxsys_vnode, type, inst, 0));
+	}
+
+	return (NULL);
+}
+
+/*
+ * Lookup handler for static directories.
+ *
+ * The listing for the directory is located in lxsys_dirlookup[] by instance,
+ * then searched for 'comp'.  A matching entry that is the top of one of the
+ * dynamic hierarchies yields a node of that hierarchy's type with instance 0;
+ * any other entry yields another static node whose instance is the entry's
+ * identifier.
+ *
+ * Returns the vnode of the node found, or NULL if the directory has no
+ * listing or the listing has no such entry.
+ */
+static vnode_t *
+lxsys_lookup_static(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_dirent_t *list = NULL;
+	lxsys_dirent_t *match = NULL;
+	lxsys_nodetype_t ntype = ldp->lxsys_type;
+	unsigned int ninst = 0;
+	lxsys_node_t *lnp;
+	int idx, nents = 0;
+
+	/* Step 1: find the listing that belongs to this directory */
+	for (idx = 0; idx < SYSDIRLISTSZ(lxsys_dirlookup); idx++) {
+		if (lxsys_dirlookup[idx].dl_instance != ldp->lxsys_instance)
+			continue;
+		list = lxsys_dirlookup[idx].dl_list;
+		nents = lxsys_dirlookup[idx].dl_length;
+		break;
+	}
+	if (list == NULL)
+		return (NULL);
+
+	/* Step 2: find the named entry within that listing */
+	for (idx = 0; idx < nents; idx++) {
+		if (strncmp(comp, list[idx].d_name, MAXPATHLEN) == 0) {
+			match = &list[idx];
+			break;
+		}
+	}
+	if (match == NULL)
+		return (NULL);
+
+	/* Step 3: work out what kind of node the entry stands for */
+	switch (match->d_idnum) {
+	case LXSYS_INST_BLOCKDIR:
+		ntype = LXSYS_BLOCK;
+		break;
+	case LXSYS_INST_CLASS_NETDIR:
+		ntype = LXSYS_CLASS_NET;
+		break;
+	case LXSYS_INST_DEVICES_VIRTUAL_NETDIR:
+		ntype = LXSYS_DEV_NET;
+		break;
+	case LXSYS_INST_DEVICES_ZFSDIR:
+		ntype = LXSYS_DEV_ZFS;
+		break;
+	case LXSYS_INST_DEVICES_SYSCPU:
+		ntype = LXSYS_DEV_SYS_CPU;
+		break;
+	case LXSYS_INST_DEVICES_SYSNODE:
+		ntype = LXSYS_DEV_SYS_NODE;
+		break;
+	default:
+		/* Another static node; it keeps the parent's type */
+		ninst = match->d_idnum;
+		break;
+	}
+
+	if (ntype == LXSYS_STATIC)
+		lnp = lxsys_getnode_static(ldp->lxsys_vnode, ninst);
+	else
+		lnp = lxsys_getnode(ldp->lxsys_vnode, ntype, ninst, 0);
+
+	return (lnp->lxsys_vnode);
+}
+
+/*
+ * Lookup handler for /sys/class/net.
+ *
+ * 'comp' is taken as a Linux interface name, converted to its native form,
+ * and searched for in the by-name phyint tree of the netstack belonging to
+ * this mount.  A hit yields a symlink (VLNK) node whose instance is the
+ * interface's ifindex.
+ *
+ * Returns the vnode found, or NULL when the lookup is not at the directory
+ * level, the name is too long to be an interface name, no netstack is
+ * available, or no such interface exists.
+ */
+static vnode_t *
+lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp)
+{
+	vnode_t *result = NULL;
+	lxsys_node_t *lnp;
+	netstack_t *ns;
+	ip_stack_t *ipst;
+	avl_tree_t *phytree;
+	phyint_t *phyi;
+	char ifname[LIFNAMSIZ];
+
+	if (ldp->lxsys_type != LXSYS_CLASS_NET ||
+	    ldp->lxsys_instance != 0) {
+		/* Lookups only allowed at directory level */
+		return (NULL);
+	}
+
+	/*
+	 * A component that does not fit in LIFNAMSIZ (terminator included)
+	 * cannot name an interface.  Reject it rather than truncate: a
+	 * truncated copy would be unterminated, and could also alias a
+	 * different, real interface.
+	 */
+	if (strlcpy(ifname, comp, sizeof (ifname)) >= sizeof (ifname)) {
+		return (NULL);
+	}
+	lx_ifname_convert(ifname, LX_IF_TONATIVE);
+
+	if ((ns = lxsys_netstack(ldp)) == NULL) {
+		return (NULL);
+	}
+	ipst = ns->netstack_ip;
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+	phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name;
+	phyi = avl_find(phytree, ifname, NULL);
+	if (phyi != NULL) {
+		lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+		    phyi->phyint_ifindex, 0);
+		result = lnp->lxsys_vnode;
+		result->v_type = VLNK;
+	}
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	netstack_rele(ns);
+
+	return (result);
+}
+
+/*
+ * Lookup handler for /sys/devices/virtual/net and the per-interface
+ * directories beneath it.
+ *
+ * At the top level (instance 0) 'comp' is taken as a Linux interface name,
+ * converted to its native form, and searched for in the by-name phyint tree;
+ * a hit yields a node whose instance is the interface's ifindex.  Inside an
+ * interface directory (instance != 0, endpoint 0) 'comp' is matched against
+ * dirlist_devices_virtual_net and a hit yields a read-only (0444) regular
+ * file node for that endpoint.
+ *
+ * Returns the vnode found, or NULL if nothing matches, the name is too long
+ * to be an interface name, or no netstack is available.
+ */
+static vnode_t *
+lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_node_t *lnp;
+
+	if (ldp->lxsys_instance == 0) {
+		/* top-level interface listing */
+		vnode_t *result = NULL;
+		netstack_t *ns;
+		ip_stack_t *ipst;
+		avl_tree_t *phytree;
+		phyint_t *phyi;
+		char ifname[LIFNAMSIZ];
+
+		/*
+		 * A component that does not fit in LIFNAMSIZ (terminator
+		 * included) cannot name an interface.  Reject it rather than
+		 * truncate: a truncated copy would be unterminated, and could
+		 * also alias a different, real interface.
+		 */
+		if (strlcpy(ifname, comp, sizeof (ifname)) >=
+		    sizeof (ifname)) {
+			return (NULL);
+		}
+		lx_ifname_convert(ifname, LX_IF_TONATIVE);
+
+		if ((ns = lxsys_netstack(ldp)) == NULL) {
+			return (NULL);
+		}
+		ipst = ns->netstack_ip;
+		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+		phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name;
+		phyi = avl_find(phytree, ifname, NULL);
+		if (phyi != NULL) {
+			lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+			    phyi->phyint_ifindex, 0);
+			result = lnp->lxsys_vnode;
+		}
+
+		rw_exit(&ipst->ips_ill_g_lock);
+		netstack_rele(ns);
+
+		return (result);
+	} else if (ldp->lxsys_endpoint == 0) {
+		/* interface-level sub-item listing */
+		int i, size;
+		lxsys_dirent_t *dirent;
+
+		size = SYSDIRLISTSZ(dirlist_devices_virtual_net);
+		for (i = 0; i < size; i++) {
+			dirent = &dirlist_devices_virtual_net[i];
+			if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) {
+				lnp = lxsys_getnode(ldp->lxsys_vnode,
+				    ldp->lxsys_type, ldp->lxsys_instance,
+				    dirent->d_idnum);
+				lnp->lxsys_vnode->v_type = VREG;
+				lnp->lxsys_mode = 0444;
+				return (lnp->lxsys_vnode);
+			}
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Lookup handler for /sys/block.  Only the top-level directory (instance 0)
+ * can be searched; a virtual disk named 'comp' yields a symlink (VLNK) node.
+ * Returns NULL in every other case.
+ */
+static vnode_t *
+lxsys_lookup_blockdir(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_node_t *lnp;
+
+	/* Nothing lives below the top-level dev listing */
+	if (ldp->lxsys_instance != 0)
+		return (NULL);
+
+	lnp = lxsys_lookup_disk(ldp, comp, LXSYS_BLOCK);
+	if (lnp == NULL)
+		return (NULL);
+
+	lnp->lxsys_vnode->v_type = VLNK;
+	return (lnp->lxsys_vnode);
+}
+
+/*
+ * Lookup handler for /sys/devices/zfs and the per-disk directories beneath
+ * it.  At the top level (instance 0) 'comp' names a virtual disk.  Inside a
+ * disk directory (instance != 0, endpoint 0) 'comp' is matched against
+ * dirlist_devices_zfs_block and a hit yields a read-only (0444) regular file
+ * node.  Returns NULL when nothing matches.
+ */
+static vnode_t *
+lxsys_lookup_devices_zfsdir(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_node_t *lnp;
+	lxsys_dirent_t *dirent;
+	int i, size;
+
+	if (ldp->lxsys_instance == 0) {
+		/* top-level dev listing */
+		lnp = lxsys_lookup_disk(ldp, comp, LXSYS_DEV_ZFS);
+		return (lnp != NULL ? lnp->lxsys_vnode : NULL);
+	}
+
+	/* Endpoints are files; there is nothing to look up beneath them */
+	if (ldp->lxsys_endpoint != 0)
+		return (NULL);
+
+	/*
+	 * Disk-level sub-item listing.  All of these entries currently look
+	 * like regular files but on a real Linux system some will be subdirs.
+	 * This should be fixed when we populate the directory for real.
+	 */
+	size = SYSDIRLISTSZ(dirlist_devices_zfs_block);
+	for (i = 0; i < size; i++) {
+		dirent = &dirlist_devices_zfs_block[i];
+		if (strncmp(comp, dirent->d_name, LXSNSIZ) != 0)
+			continue;
+
+		lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+		    ldp->lxsys_instance, dirent->d_idnum);
+		lnp->lxsys_vnode->v_type = VREG;
+		lnp->lxsys_mode = 0444;
+		return (lnp->lxsys_vnode);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Lookup handler for /sys/devices/system/cpu.
+ *
+ * At the top level (instance 0) two kinds of names are recognised: the fixed
+ * "kernel_max" file, returned as a read-only (0444) regular file, and dynamic
+ * "cpuN" entries, where N is compared against the cpu_seqid of each CPU in
+ * the walked list.  A matching CPU yields an LXSYS_DEV_SYS_CPUINFO node whose
+ * instance is cpu_id + 1.  Per-cpu directories currently have no entries.
+ *
+ * Returns the vnode found, or NULL.
+ */
+static vnode_t *
+lxsys_lookup_devices_syscpu(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_node_t *lnp = NULL;
+
+	if (ldp->lxsys_instance == 0) {
+		/* top-level cpu listing */
+
+		/* If fixed entry */
+		if (strcmp(comp, "kernel_max") == 0) {
+			lnp = lxsys_getnode_static(ldp->lxsys_vnode,
+			    LXSYS_INST_DEV_SYSCPU_KMAX);
+			lnp->lxsys_vnode->v_type = VREG;
+			lnp->lxsys_mode = 0444;
+		} else {
+			/* Else dynamic cpuN entry */
+			cpu_t *cp, *cpstart;
+			int pools_enabled;
+
+			/* The CPU list is walked with cpu_lock held */
+			mutex_enter(&cpu_lock);
+			pools_enabled = pool_pset_enabled();
+
+			/*
+			 * Start from the first CPU of the current CPU's
+			 * partition and walk the circular list until we are
+			 * back at the start.
+			 */
+			cp = cpstart = CPU->cpu_part->cp_cpulist;
+			do {
+				char cpunm[16];
+
+				(void) snprintf(cpunm, sizeof (cpunm), "cpu%d",
+				    cp->cpu_seqid);
+
+				if (strcmp(comp, cpunm) == 0) {
+					lnp = lxsys_getnode(ldp->lxsys_vnode,
+					    LXSYS_DEV_SYS_CPUINFO,
+					    cp->cpu_id + 1, 0);
+					break;
+				}
+				/*
+				 * With pools enabled stay within the
+				 * partition; otherwise follow the cpu_next
+				 * list.
+				 */
+				if (pools_enabled) {
+					cp = cp->cpu_next_part;
+				} else {
+					cp = cp->cpu_next;
+				}
+			} while (cp != cpstart);
+
+			mutex_exit(&cpu_lock);
+		}
+
+		if (lnp != NULL) {
+			return (lnp->lxsys_vnode);
+		}
+	} else if (ldp->lxsys_endpoint == 0) {
+		/* cpu-level sub-item listing, currently empty */
+	}
+
+	return (NULL);
+}
+
+/*
+ * Lookup handler for LXSYS_DEV_SYS_CPUINFO nodes.  These directories have no
+ * entries, so every lookup fails.
+ */
+/* ARGSUSED */
+static vnode_t *
+lxsys_lookup_devices_syscpuinfo(lxsys_node_t *ldp, char *comp)
+{
+	return (NULL);
+}
+
+/*
+ * Lookup handler for /sys/devices/system/node and the node directory beneath
+ * it.  At the top level (instance 0) only "node0" exists.  Inside a node
+ * directory 'comp' is matched against dirlist_devices_sysnode and a hit
+ * yields a read-only (0444) regular file node.  Returns NULL when nothing
+ * matches.
+ */
+static vnode_t *
+lxsys_lookup_devices_sysnode(lxsys_node_t *ldp, char *comp)
+{
+	lxsys_node_t *lnp;
+	lxsys_dirent_t *dirent;
+	int i, size;
+
+	if (ldp->lxsys_instance == 0) {
+		/*
+		 * The system is presently represented as a single node,
+		 * regardless of any NUMA topology which exists.
+		 * The instances are offset by 1 to account for the top level
+		 * directory occupying instance 0.
+		 */
+		if (strcmp(comp, "node0") != 0)
+			return (NULL);
+
+		lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, 1, 0);
+		return (lnp->lxsys_vnode);
+	}
+
+	/* node-level sub-item listing */
+	size = SYSDIRLISTSZ(dirlist_devices_sysnode);
+	for (i = 0; i < size; i++) {
+		dirent = &dirlist_devices_sysnode[i];
+		if (strncmp(comp, dirent->d_name, LXSNSIZ) != 0)
+			continue;
+
+		lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type,
+		    ldp->lxsys_instance, dirent->d_idnum);
+		lnp->lxsys_vnode->v_type = VREG;
+		lnp->lxsys_mode = 0444;
+		return (lnp->lxsys_vnode);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Produce the contents of one per-interface attribute file.
+ *
+ * The node's instance is the interface index and its endpoint selects the
+ * attribute (address, addr_len, flags, ifindex, mtu, tx_queue_len, type).
+ * Output is appended to luio via lxsys_uiobuf_printf().
+ *
+ * Returns 0 on success, EISDIR if the node is a directory rather than an
+ * attribute file, or EIO if the netstack or interface cannot be found or
+ * the endpoint is not recognized.
+ */
+static int
+lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+	netstack_t *ns;
+	ill_t *ill;
+	uint_t ifindex = lnp->lxsys_instance;
+	uint8_t *addr;
+	uint64_t flags;
+	int error = 0;
+
+	if (ifindex == 0 || lnp->lxsys_endpoint == 0) {
+		return (EISDIR);
+	}
+
+	if ((ns = lxsys_netstack(lnp)) == NULL) {
+		return (EIO);
+	}
+
+	ill = lxsys_find_ill(ns->netstack_ip, ifindex);
+	if (ill == NULL) {
+		netstack_rele(ns);
+		return (EIO);
+	}
+
+	switch (lnp->lxsys_endpoint) {
+	case LXSYS_ENDP_NET_ADDRESS:
+		/* Anything without an Ethernet-sized address reads as zeros */
+		if (ill->ill_phys_addr_length != ETHERADDRL) {
+			lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n");
+			break;
+		}
+		addr = ill->ill_phys_addr;
+		lxsys_uiobuf_printf(luio,
+		    "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n",
+		    addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+		break;
+	case LXSYS_ENDP_NET_ADDRLEN:
+		lxsys_uiobuf_printf(luio, "%u\n",
+		    IS_LOOPBACK(ill) ? ETHERADDRL : ill->ill_phys_addr_length);
+		break;
+	case LXSYS_ENDP_NET_FLAGS:
+		flags = (ill->ill_flags | ill->ill_ipif->ipif_flags |
+		    ill->ill_phyint->phyint_flags) & 0xffff;
+		lx_ifflags_convert(&flags, LX_IF_FROMNATIVE);
+		/*
+		 * flags is 64 bits wide but "%x" consumes an unsigned int,
+		 * so narrow it explicitly rather than passing a mismatched
+		 * variadic argument.
+		 */
+		lxsys_uiobuf_printf(luio, "0x%x\n", (uint_t)flags);
+		break;
+	case LXSYS_ENDP_NET_IFINDEX:
+		lxsys_uiobuf_printf(luio, "%u\n", ifindex);
+		break;
+	case LXSYS_ENDP_NET_MTU:
+		lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu);
+		break;
+	case LXSYS_ENDP_NET_TXQLEN:
+		/* perpetuate the txqlen lie */
+		if (IS_LOOPBACK(ill)) {
+			lxsys_uiobuf_printf(luio, "0\n");
+		} else {
+			lxsys_uiobuf_printf(luio, "1\n");
+		}
+		break;
+	case LXSYS_ENDP_NET_TYPE:
+		lxsys_uiobuf_printf(luio, "%u\n",
+		    IS_LOOPBACK(ill) ? LX_ARPHRD_LOOPBACK :
+		    arp_hw_type(ill->ill_mactype));
+		break;
+	default:
+		error = EIO;
+		break;
+	}
+
+	/* Drop the holds taken above on every path out of the switch. */
+	ill_refrele(ill);
+	netstack_rele(ns);
+	return (error);
+}
+
+/*
+ * Read handler for the zfs block-device nodes.  Directory nodes (no
+ * instance or no endpoint) report EISDIR; every other node reports EIO,
+ * as no endpoint produces any content.
+ */
+/* ARGSUSED */
+static int
+lxsys_read_devices_zfs_block(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+	int is_dir;
+
+	is_dir = (lnp->lxsys_instance == 0 || lnp->lxsys_endpoint == 0);
+
+	return (is_dir ? EISDIR : EIO);
+}
+
+/*
+ * Read handler for the node directory.  The only readable file is the
+ * cpulist endpoint of instance 1, which reports the CPU range as
+ * "0-<max>", where <max> is the largest cpu_seqid found on the CPU list.
+ * Everything else reports EISDIR.
+ */
+static int
+lxsys_read_devices_sysnode(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+	cpu_t *cpu, *first;
+	int use_part;
+	int highest = -1;
+
+	if (lnp->lxsys_instance != 1 ||
+	    lnp->lxsys_endpoint != LXSYS_ENDP_NODE_CPULIST) {
+		return (EISDIR);
+	}
+
+	/* Walk the CPU list under cpu_lock to find the highest seqid. */
+	mutex_enter(&cpu_lock);
+	use_part = pool_pset_enabled();
+
+	first = CPU->cpu_part->cp_cpulist;
+	cpu = first;
+	do {
+		if (cpu->cpu_seqid > highest) {
+			highest = cpu->cpu_seqid;
+		}
+		cpu = use_part ? cpu->cpu_next_part : cpu->cpu_next;
+	} while (cpu != first);
+
+	mutex_exit(&cpu_lock);
+
+	/* Show the range of CPUs */
+	lxsys_uiobuf_printf(luio, "0-%d\n", highest);
+	return (0);
+}
+
+/*
+ * Read handler for static nodes.  The kernel_max file reports NCPU; all
+ * other static nodes are directories and report EISDIR.
+ */
+static int
+lxsys_read_static(lxsys_node_t *lnp, lxsys_uiobuf_t *luio)
+{
+	if (lnp->lxsys_instance != LXSYS_INST_DEV_SYSCPU_KMAX) {
+		/* All other static nodes are directories */
+		return (EISDIR);
+	}
+
+	lxsys_uiobuf_printf(luio, "%d\n", NCPU);
+	return (0);
+}
+
+/*
+ * lxsys_readdir(): Vnode operation for VOP_READDIR()
+ *
+ * Validates the request and then hands off to the per-node-type handler
+ * in lxsys_readdir_function[].  Handlers are always given a valid eofp.
+ */
+/* ARGSUSED */
+static int
+lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	lxsys_node_t *node = VTOLXS(dp);
+	lxsys_nodetype_t type = node->lxsys_type;
+	off_t off;
+	ssize_t resid;
+	int local_eof;
+	int error;
+
+	ASSERT(dp->v_type == VDIR);
+	VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+	/* The caller must pass the VREAD access check on the directory. */
+	error = lxsys_access(dp, VREAD, 0, cr, ct);
+	if (error != 0) {
+		return (error);
+	}
+
+	off = uiop->uio_offset;
+	resid = uiop->uio_resid;
+
+	/* Negative offsets and empty/negative requests are invalid. */
+	if (off < 0 || resid <= 0) {
+		return (EINVAL);
+	}
+
+	/* Offsets must land on an entry boundary. */
+	if ((off % LXSYS_SDSIZE) != 0) {
+		return (ENOENT);
+	}
+
+	/* Spare the handlers from checking for a NULL eofp. */
+	if (eofp == NULL) {
+		eofp = &local_eof;
+	}
+
+	return (lxsys_readdir_function[type](node, uiop, eofp));
+}
+
+/*
+ * Copy a single directory entry of n bytes out to the caller and advance
+ * the directory offset by exactly one slot (LXSYS_SDSIZE).
+ *
+ * uiomove() moves uio_offset by the number of bytes transferred, which is
+ * not what directory offsets are measured in here, so on success the
+ * offset is overwritten with the slot-based value.  On failure the error
+ * from uiomove() is returned and the offset is not corrected.
+ */
+static int
+lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio)
+{
+	off_t start = uio->uio_offset;
+	off_t next = start + LXSYS_SDSIZE;
+	int error;
+
+	d->d_off = (off64_t)next;
+	d->d_reclen = n;
+
+	error = uiomove(d, n, UIO_READ, uio);
+	if (error == 0) {
+		uio->uio_offset = next;
+	}
+
+	return (error);
+}
+
+/*
+ * Shared worker for lxsys_readdir_common() and lxsys_readdir_subdir().
+ *
+ * Emits ".", "..", and then the entries of dirtab[0 .. dirtablen - 1],
+ * starting from the slot indicated by uiop->uio_offset (one slot is
+ * LXSYS_SDSIZE bytes of offset).  dirtab may be NULL when dirtablen is 0.
+ *
+ * The two callers differ only in how table entries are numbered:
+ *   subdir == B_FALSE: lxsys_inode(LXSYS_STATIC, d_idnum, 0)
+ *   subdir == B_TRUE:  lxsys_inode(node type, node instance, d_idnum)
+ *
+ * On return *eofp is 1 if the offset has reached the end of the table and
+ * 0 otherwise.  Returns 0 on success, EINVAL if the buffer cannot hold
+ * even one entry, or the error from lxsys_dirent_out().
+ */
+static int
+lxsys_readdir_dirtab(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp,
+    lxsys_dirent_t *dirtab, int dirtablen, boolean_t subdir)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+
+	oresid = uiop->uio_resid;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/* Satisfy user request */
+	while ((uresid = uiop->uio_resid) > 0) {
+		int dirindex;
+		off_t uoffset;
+		int reclen;
+		int error;
+
+		uoffset = uiop->uio_offset;
+		/* Slots 0 and 1 are "." and ".."; the table starts at 2. */
+		dirindex  = (uoffset / LXSYS_SDSIZE) - 2;
+
+		if (uoffset == 0) {
+
+			dirent->d_ino = lxsnp->lxsys_ino;
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '\0';
+			reclen = DIRENT64_RECLEN(1);
+
+		} else if (uoffset == LXSYS_SDSIZE) {
+
+			dirent->d_ino = lxsys_parentinode(lxsnp);
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '.';
+			dirent->d_name[2] = '\0';
+			reclen = DIRENT64_RECLEN(2);
+
+		} else if (dirindex >= 0 && dirindex < dirtablen) {
+
+			lxsys_dirent_t *ent = &dirtab[dirindex];
+			int slen = strlen(ent->d_name);
+
+			if (subdir) {
+				dirent->d_ino = lxsys_inode(lxsnp->lxsys_type,
+				    lxsnp->lxsys_instance, ent->d_idnum);
+			} else {
+				dirent->d_ino = lxsys_inode(LXSYS_STATIC,
+				    ent->d_idnum, 0);
+			}
+			(void) strcpy(dirent->d_name, ent->d_name);
+			reclen = DIRENT64_RECLEN(slen);
+
+		} else {
+			/* Run out of table entries */
+			*eofp = 1;
+			return (0);
+		}
+
+		/*
+		 * If the size of the data to transfer is greater than the
+		 * user-provided buffer, we cannot continue.
+		 */
+		if (reclen > uresid) {
+			/* Error if no entries have been returned yet. */
+			if (uresid == oresid) {
+				return (EINVAL);
+			}
+			break;
+		}
+
+		if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+			return (error);
+		}
+	}
+
+	/* Have run out of space, but could have just done last table entry */
+	*eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ?  1 : 0;
+	return (0);
+}
+
+/*
+ * This has the common logic for returning directory entries: ".", "..",
+ * and the entries of dirtab, each numbered as a static node.
+ */
+static int
+lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp,
+    lxsys_dirent_t *dirtab, int dirtablen)
+{
+	return (lxsys_readdir_dirtab(lxsnp, uiop, eofp, dirtab, dirtablen,
+	    B_FALSE));
+}
+
+/*
+ * As lxsys_readdir_common(), but table entries are numbered as endpoints
+ * of the directory node itself (same type and instance as lxsnp).
+ */
+static int
+lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp,
+    lxsys_dirent_t *dirtab, int dirtablen)
+{
+	return (lxsys_readdir_dirtab(lxsnp, uiop, eofp, dirtab, dirtablen,
+	    B_TRUE));
+}
+
+/*
+ * List a directory that has one entry per network interface: ".", "..",
+ * and then one entry for each phyint in the by-index AVL tree of the
+ * netstack returned by lxsys_netstack().  Entry names are the interface
+ * names after lx_ifname_convert(); inode numbers are
+ * lxsys_inode(type, ifindex, 0).
+ *
+ * *eofp is set to 1 only once the walk has moved past the last interface
+ * (or there is no netstack); a listing cut short by a full buffer reports
+ * 0 so the caller knows to come back for more.
+ *
+ * Returns 0 on success, EINVAL if the buffer cannot hold even one entry,
+ * or the error from emitting an entry.
+ */
+static int
+lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp,
+    lxsys_nodetype_t type)
+{
+	longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid, uresid;
+	netstack_t *ns;
+	ip_stack_t *ipst;
+	avl_tree_t *phytree;
+	phyint_t *phyi;
+	int error, i;
+
+	/* Emit "." and ".." entries */
+	oresid = uiop->uio_resid;
+	error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+	if (error != 0 || *eofp == 0) {
+		return (error);
+	}
+
+	if ((ns = lxsys_netstack(ldp)) == NULL) {
+		*eofp = 1;
+		return (0);
+	}
+	ipst = ns->netstack_ip;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index;
+	phyi = avl_first(phytree);
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Skip records we have already passed with the offset.
+	 * This accounts for the two "." and ".." records already seen.
+	 * Stop as soon as the tree is exhausted so that avl_walk() is only
+	 * ever handed a real node.
+	 */
+	for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+	    i > 0 && phyi != NULL; i--) {
+		phyi = avl_walk(phytree, phyi, AVL_AFTER);
+	}
+
+	while (phyi != NULL && (uresid = uiop->uio_resid) > 0) {
+		uint_t ifindex;
+		int reclen;
+
+		ifindex = phyi->phyint_ifindex;
+		(void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ);
+		lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE);
+		dirent->d_ino = lxsys_inode(type, ifindex, 0);
+		reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+		if (reclen > uresid) {
+			if (uresid == oresid) {
+				/* Not enough space for one record */
+				error = EINVAL;
+			}
+			break;
+		}
+		if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+			break;
+		}
+
+		phyi = avl_walk(phytree, phyi, AVL_AFTER);
+	}
+
+	/*
+	 * lxsys_readdir_common() left *eofp at 1 because its own (empty)
+	 * table was exhausted.  Report EOF for this directory only if the
+	 * interface walk itself ran off the end of the tree.
+	 */
+	*eofp = (phyi == NULL) ? 1 : 0;
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	netstack_rele(ns);
+	return (error);
+}
+
+/*
+ * List a directory that has one entry per virtual disk: ".", "..", and
+ * then one entry for each usable disk on the lxzd_vdisks list of the
+ * calling process's zone.  A disk is skipped if its name is longer than
+ * LXSNSIZ or if its instance (low 16 bits of the emulated device minor)
+ * is 0.  Inode numbers are lxsys_inode(type, instance, 0).
+ *
+ * *eofp is set to 1 only once the whole list has been walked; a listing
+ * cut short by a full buffer reports 0.
+ *
+ * Returns 0 on success, EINVAL if there is no lx zone data or the buffer
+ * cannot hold even one entry, or the error from emitting an entry.
+ */
+static int
+lxsys_readdir_disks(lxsys_node_t *ldp, struct uio *uiop, int *eofp,
+    lxsys_nodetype_t type)
+{
+	longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid, uresid;
+	int skip, error;
+	int reclen;
+	uint_t instance;
+	lx_zone_data_t *lxzdata;
+	lx_virt_disk_t *vd;
+
+	/* Emit "." and ".." entries */
+	oresid = uiop->uio_resid;
+	error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+	if (error != 0 || *eofp == 0) {
+		return (error);
+	}
+
+	/* Number of disk entries already returned by earlier calls. */
+	skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+	lxzdata = ztolxzd(curproc->p_zone);
+	if (lxzdata == NULL)
+		return (EINVAL);
+	ASSERT(lxzdata->lxzd_vdisks != NULL);
+
+	for (vd = list_head(lxzdata->lxzd_vdisks); vd != NULL;
+	    vd = list_next(lxzdata->lxzd_vdisks, vd)) {
+		/*
+		 * Apply the filters before the skip accounting.  Filtered
+		 * disks never consume a directory offset, so counting them
+		 * against 'skip' would make a resumed read start too early
+		 * and hand back entries that were already returned.
+		 */
+		if (strnlen(vd->lxvd_name, sizeof (vd->lxvd_name)) > LXSNSIZ)
+			continue;
+
+		instance = getminor(vd->lxvd_emul_dev) & 0xffff;
+		if (instance == 0)
+			continue;
+
+		if (skip > 0) {
+			skip--;
+			continue;
+		}
+
+		/*
+		 * bp has room for LXSNSIZ name bytes plus a terminator;
+		 * strncpy() does not add one when the name is exactly
+		 * LXSNSIZ bytes long, so terminate explicitly.
+		 */
+		(void) strncpy(dirent->d_name, vd->lxvd_name, LXSNSIZ);
+		dirent->d_name[LXSNSIZ] = '\0';
+
+		dirent->d_ino = lxsys_inode(type, instance, 0);
+		reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+		uresid = uiop->uio_resid;
+		if (reclen > uresid) {
+			if (uresid == oresid) {
+				/* Not enough space for one record */
+				error = EINVAL;
+			}
+			break;
+		}
+		if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+			break;
+		}
+	}
+
+	/*
+	 * lxsys_readdir_common() left *eofp at 1 because its own (empty)
+	 * table was exhausted.  Indicate EOF for this directory only if we
+	 * reached the end of the virtual disks.
+	 */
+	*eofp = (vd == NULL) ? 1 : 0;
+
+	return (error);
+}
+
+
+/*
+ * Readdir handler for static directories.  The node's instance selects a
+ * directory table from lxsys_dirlookup[]; the first matching slot is used.
+ * Returns ENOTDIR if no slot matches or the matching slot has no table.
+ */
+static int
+lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	int idx;
+
+	for (idx = 0; idx < SYSDIRLISTSZ(lxsys_dirlookup); idx++) {
+		if (lxsys_dirlookup[idx].dl_instance != lnp->lxsys_instance) {
+			continue;
+		}
+
+		/* Only the first match is considered. */
+		if (lxsys_dirlookup[idx].dl_list == NULL) {
+			break;
+		}
+
+		return (lxsys_readdir_common(lnp, uiop, eofp,
+		    lxsys_dirlookup[idx].dl_list,
+		    lxsys_dirlookup[idx].dl_length));
+	}
+
+	return (ENOTDIR);
+}
+
+/*
+ * Readdir handler for /sys/class/net.  That directory contains only
+ * symlinks, so the only thing that can be listed is the top level
+ * (instance == 0); anything else is ENOTDIR.
+ */
+static int
+lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	if (lnp->lxsys_type == LXSYS_CLASS_NET &&
+	    lnp->lxsys_instance == 0) {
+		return (lxsys_readdir_ifaces(lnp, uiop, eofp,
+		    LXSYS_CLASS_NET));
+	}
+
+	return (ENOTDIR);
+}
+
+/*
+ * Readdir handler for the virtual net device tree.  Instance 0 lists the
+ * interfaces; an interface directory (endpoint 0) lists its attribute
+ * files from dirlist_devices_virtual_net; nothing deeper is a directory.
+ */
+static int
+lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	/* top-level interface listing */
+	if (lnp->lxsys_instance == 0) {
+		return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_DEV_NET));
+	}
+
+	/* interface-level sub-item listing */
+	if (lnp->lxsys_endpoint == 0) {
+		return (lxsys_readdir_subdir(lnp, uiop, eofp,
+		    dirlist_devices_virtual_net,
+		    SYSDIRLISTSZ(dirlist_devices_virtual_net)));
+	}
+
+	/* there shouldn't be subdirs below this */
+	return (ENOTDIR);
+}
+
+/*
+ * Readdir handler for /sys/block.  That directory contains only symlinks,
+ * so the only thing that can be listed is the top level (instance == 0);
+ * anything else is ENOTDIR.
+ */
+static int
+lxsys_readdir_blockdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	if (lnp->lxsys_type == LXSYS_BLOCK && lnp->lxsys_instance == 0) {
+		return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_BLOCK));
+	}
+
+	return (ENOTDIR);
+}
+
+/*
+ * Readdir handler for the zfs device tree.  Instance 0 lists the disks;
+ * a disk directory (endpoint 0) lists the entries of
+ * dirlist_devices_zfs_block; nothing deeper is a directory.
+ */
+static int
+lxsys_readdir_devices_zfsdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	/* top-level dev listing */
+	if (lnp->lxsys_instance == 0) {
+		return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_DEV_ZFS));
+	}
+
+	/* disk-level sub-item listing */
+	if (lnp->lxsys_endpoint == 0) {
+		return (lxsys_readdir_subdir(lnp, uiop, eofp,
+		    dirlist_devices_zfs_block,
+		    SYSDIRLISTSZ(dirlist_devices_zfs_block)));
+	}
+
+	/*
+	 * Currently there shouldn't be subdirs below this but
+	 * on a real Linux system some will be subdirs. This should
+	 * be fixed when we populate the directory for real.
+	 */
+	return (ENOTDIR);
+}
+
+/*
+ * List the top-level cpu directory: ".", "..", the fixed "kernel_max"
+ * entry, and then one "cpuN" entry per CPU on the CPU list, where N is
+ * the CPU's cpu_seqid.
+ *
+ * The CPU list is snapshotted into a local array while holding cpu_lock
+ * so that the entries can be copied out after the lock is dropped.
+ *
+ * *eofp is set to 1 only once every CPU entry has been passed; a listing
+ * cut short by a full buffer reports 0.
+ *
+ * Returns 0 on success, EINVAL if the buffer cannot hold even one entry,
+ * or the error from emitting an entry.
+ */
+static int
+lxsys_readdir_cpu(lxsys_node_t *ldp, struct uio *uiop, int *eofp)
+{
+	longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid, uresid;
+	int skip, error;
+	int reclen;
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	int i, cpucnt;
+	lxsys_cpu_info_t cpu_info[NCPU];
+
+	/* Emit "." and ".." entries */
+	oresid = uiop->uio_resid;
+	error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0);
+	if (error != 0 || *eofp == 0) {
+		return (error);
+	}
+
+	/*
+	 * lxsys_readdir_common() left *eofp at 1 because its own (empty)
+	 * table was exhausted.  This directory has more entries, so clear
+	 * it until the end of the CPU list is actually reached.
+	 */
+	*eofp = 0;
+
+	skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+	/* Fixed entries */
+	if (skip > 0) {
+		skip--;
+	} else {
+		(void) strncpy(dirent->d_name, "kernel_max", LXSNSIZ);
+
+		dirent->d_ino = lxsys_inode(LXSYS_STATIC,
+		    LXSYS_INST_DEV_SYSCPU_KMAX, 0);
+		reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+		uresid = uiop->uio_resid;
+		if (reclen > uresid) {
+			if (uresid == oresid) {
+				/* Not enough space for one record */
+				error = EINVAL;
+			}
+			goto done;
+		}
+		if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+			goto done;
+		}
+	}
+
+	/* Collect a list of CPU info */
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	cpucnt = 0;
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/*
+		 * Check the slot before it is written.  Checking after the
+		 * increment would trip when the array is exactly full,
+		 * which is a legitimate state.
+		 */
+		ASSERT(cpucnt < NCPU);
+		cpu_info[cpucnt].cpu_id = cp->cpu_id;
+		cpu_info[cpucnt++].cpu_seqid = cp->cpu_seqid;
+		if (pools_enabled) {
+			cp = cp->cpu_next_part;
+		} else {
+			cp = cp->cpu_next;
+		}
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+
+	/* Output dynamic CPU info */
+	for (i = 0; i < cpucnt; i++) {
+		char cpunm[16];
+
+		if (skip > 0) {
+			skip--;
+			continue;
+		}
+
+		(void) snprintf(cpunm, sizeof (cpunm), "cpu%d",
+		    cpu_info[i].cpu_seqid);
+		(void) strncpy(dirent->d_name, cpunm, LXSNSIZ);
+
+		/* Instances are offset by 1; 0 is the top-level directory. */
+		dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPU,
+		    cpu_info[i].cpu_id + 1, 0);
+		reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+		uresid = uiop->uio_resid;
+		if (reclen > uresid) {
+			if (uresid == oresid) {
+				/* Not enough space for one record */
+				error = EINVAL;
+			}
+			break;
+		}
+		if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) {
+			break;
+		}
+	}
+
+	/* Indicate EOF if we reached the end of the CPU list. */
+	if (i == cpucnt) {
+		*eofp = 1;
+	}
+
+done:
+	return (error);
+}
+
+/*
+ * Readdir handler for the cpu device tree.  Instance 0 lists the CPUs; a
+ * cpu directory (endpoint 0) lists the entries of dirlist_empty; nothing
+ * deeper is a directory.
+ */
+static int
+lxsys_readdir_devices_syscpu(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	/* top-level cpu listing */
+	if (lnp->lxsys_instance == 0) {
+		return (lxsys_readdir_cpu(lnp, uiop, eofp));
+	}
+
+	/* cpu-level sub-item listing */
+	if (lnp->lxsys_endpoint == 0) {
+		return (lxsys_readdir_subdir(lnp, uiop, eofp,
+		    dirlist_empty, SYSDIRLISTSZ(dirlist_empty)));
+	}
+
+	/*
+	 * Currently there shouldn't be subdirs below this but
+	 * on a real Linux system some will be subdirs. This should
+	 * be fixed when we populate the directory for real.
+	 */
+	return (ENOTDIR);
+}
+
+/*
+ * Readdir handler for a /sys/devices/system/cpu/cpuN directory.  These
+ * directories are currently empty, so the listing is just "." and "..".
+ * Nodes of any other type are rejected with ENOTDIR.
+ */
+static int
+lxsys_readdir_devices_syscpuinfo(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	int error;
+
+	if (lnp->lxsys_type != LXSYS_DEV_SYS_CPUINFO) {
+		return (ENOTDIR);
+	}
+
+	/* Emit "." and ".." entries; there is nothing else to list. */
+	error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0);
+
+	/* Once both entries have gone out successfully, indicate EOF. */
+	if (error == 0 && *eofp != 0) {
+		*eofp = 1;
+	}
+
+	return (error);
+}
+
+/*
+ * Readdir handler for the node device tree.
+ *
+ * Instance 0 lists ".", "..", and the single fixed entry "node0".  A node
+ * directory (endpoint 0) lists the entries of dirlist_devices_sysnode.
+ * Nothing deeper is a directory.
+ *
+ * For the top-level listing *eofp is set to 1 only once "node0" has been
+ * emitted or passed; if it did not fit in the caller's buffer *eofp is 0.
+ *
+ * Returns 0 on success, EINVAL if the buffer cannot hold even one entry,
+ * ENOTDIR below the node level, or the error from emitting an entry.
+ */
+static int
+lxsys_readdir_devices_sysnode(lxsys_node_t *lnp, uio_t *uiop, int *eofp)
+{
+	int error;
+
+	if (lnp->lxsys_instance == 0) {
+		/* top-level node listing */
+		longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)];
+		dirent64_t *dirent = (dirent64_t *)bp;
+		ssize_t oresid, uresid;
+		int reclen, skip;
+
+		/* Emit "." and ".." entries */
+		oresid = uiop->uio_resid;
+		error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0);
+		if (error != 0 || *eofp == 0) {
+			return (error);
+		}
+
+		/*
+		 * lxsys_readdir_common() left *eofp at 1 because its own
+		 * (empty) table was exhausted.  Clear it until "node0" has
+		 * been dealt with.
+		 */
+		*eofp = 0;
+
+		skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2;
+
+		/* Fixed entries: emit "node0" unless it was already passed */
+		if (skip <= 0) {
+			(void) strncpy(dirent->d_name, "node0", LXSNSIZ);
+
+			dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_NODE,
+			    1, 0);
+			reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
+
+			uresid = uiop->uio_resid;
+			if (reclen > uresid) {
+				if (uresid == oresid) {
+					/* Not enough space for one record */
+					return (EINVAL);
+				}
+				/* More to come; *eofp stays 0. */
+				return (0);
+			}
+			error = lxsys_dirent_out(dirent, reclen, uiop);
+		}
+		/* Indicate EOF */
+		if (error == 0) {
+			*eofp = 1;
+		}
+	} else if (lnp->lxsys_endpoint == 0) {
+		/* node-level sub-item listing */
+		error = lxsys_readdir_subdir(lnp, uiop, eofp,
+		    dirlist_devices_sysnode,
+		    SYSDIRLISTSZ(dirlist_devices_sysnode));
+	} else {
+		/* there shouldn't be subdirs below this */
+		error = ENOTDIR;
+	}
+
+	return (error);
+}
+
+/*
+ * lxsys_readlink(): Vnode operation for VOP_READLINK()
+ *
+ * Asks the per-node-type handler in lxsys_readlink_function[] to format
+ * the link target into a local buffer, then copies the resulting string
+ * (without its terminator) out to the caller.  Returns EINVAL if the
+ * vnode is not a symlink or its node type has no handler.
+ */
+/* ARGSUSED */
+static int
+lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+	char target[MAXPATHLEN + 1];
+	lxsys_node_t *node = VTOLXS(vp);
+	lxsys_nodetype_t type = node->lxsys_type;
+	int (*handler)();
+	int error;
+
+	VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE);
+
+	if (vp->v_type != VLNK) {
+		return (EINVAL);
+	}
+
+	handler = lxsys_readlink_function[type];
+	if (handler == NULL) {
+		return (EINVAL);
+	}
+
+	error = handler(node, target, sizeof (target));
+	if (error != 0) {
+		return (error);
+	}
+
+	return (uiomove(target, strlen(target), UIO_READ, uiop));
+}
+
+
+/*
+ * Format the target of a /sys/class/net symlink.
+ *
+ * The node's instance is the interface index.  The matching phyint is
+ * looked up in the by-index AVL tree of the netstack returned by
+ * lxsys_netstack(), its name is run through lx_ifname_convert(), and the
+ * path "/sys/devices/virtual/net/<name>" is written to buf (at most len
+ * bytes, always terminated).
+ *
+ * Returns 0 on success, or EINVAL if the instance is 0, no netstack is
+ * available, or no interface has that index.
+ */
+static int
+lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len)
+{
+	netstack_t *ns;
+	ip_stack_t *ipst;
+	avl_tree_t *phytree;
+	phyint_t *phyi;
+	uint_t ifindex;
+	char ifname[LIFNAMSIZ];
+	int error = EINVAL;
+
+	if ((ifindex = lnp->lxsys_instance) == 0) {
+		return (error);
+	}
+
+	if ((ns = lxsys_netstack(lnp)) == NULL) {
+		return (error);
+	}
+	ipst = ns->netstack_ip;
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+	phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index;
+	phyi = avl_find(phytree, &ifindex, NULL);
+	if (phyi != NULL) {
+		/*
+		 * strncpy() leaves the destination unterminated when the
+		 * source fills it, and ifname is used as a string below,
+		 * so reserve the last byte for the terminator.
+		 */
+		(void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ - 1);
+		ifname[LIFNAMSIZ - 1] = '\0';
+		lx_ifname_convert(ifname, LX_IF_FROMNATIVE);
+		(void) snprintf(buf, len, "/sys/devices/virtual/net/%s",
+		    ifname);
+		error = 0;
+	}
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	netstack_rele(ns);
+	return (error);
+}
+
+/*
+ * Format the target of a /sys/block symlink.
+ *
+ * The node's instance is matched against the low 16 bits of each virtual
+ * disk's emulated device minor, using the lxzd_vdisks list of the calling
+ * process's zone.  On a match "../devices/zfs/<disk name>" is written to
+ * buf (at most len bytes).
+ *
+ * Returns 0 on success, or EINVAL if the instance is 0, there is no lx
+ * zone data, or no disk matches.
+ */
+static int
+lxsys_readlink_block(lxsys_node_t *lnp, char *buf, size_t len)
+{
+	lx_zone_data_t *zdata;
+	lx_virt_disk_t *disk;
+	int want;
+
+	want = lnp->lxsys_instance;
+	if (want == 0) {
+		return (EINVAL);
+	}
+
+	zdata = ztolxzd(curproc->p_zone);
+	if (zdata == NULL) {
+		return (EINVAL);
+	}
+	ASSERT(zdata->lxzd_vdisks != NULL);
+
+	for (disk = list_head(zdata->lxzd_vdisks); disk != NULL;
+	    disk = list_next(zdata->lxzd_vdisks, disk)) {
+		int have = getminor(disk->lxvd_emul_dev) & 0xffff;
+
+		if (have != want) {
+			continue;
+		}
+
+		(void) snprintf(buf, len,
+		    "../devices/zfs/%s", disk->lxvd_name);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * lxsys_inactive(): Vnode operation for VOP_INACTIVE()
+ * Called once the vnode is no longer referenced; hands the backing
+ * lxsys node to lxsys_freenode() for release.
+ */
+/* ARGSUSED */
+static void
+lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	lxsys_node_t *lnp = VTOLXS(vp);
+
+	lxsys_freenode(lnp);
+}
+
+/*
+ * lxsys_sync(): Vnode operation for VOP_SYNC()
+ * There is never anything to write back, and this operation must not
+ * fail, so it unconditionally reports success.
+ */
+static int
+lxsys_sync()
+{
+	return (0);
+}
+
+/*
+ * lxsys_cmp(): Vnode operation for VOP_CMP()
+ * If either vnode uses the lxsys vnode operations, the two are equal only
+ * when they are the same vnode.  Otherwise the comparison is passed on to
+ * VOP_CMP().
+ */
+static int
+lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+	int ours;
+
+	ours = (vn_matchops(vp1, lxsys_vnodeops) ||
+	    vn_matchops(vp2, lxsys_vnodeops));
+
+	if (!ours) {
+		return (VOP_CMP(vp1, vp2, ct));
+	}
+
+	return (vp1 == vp2);
+}
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c
index d61928d578..32fb7d9127 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.c
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/errno.h>
@@ -42,43 +43,66 @@
 
 char *sn1_emulation_table = NULL;
 
-void	sn1_init_brand_data(zone_t *);
+void	sn1_init_brand_data(zone_t *, kmutex_t *);
 void	sn1_free_brand_data(zone_t *);
 void	sn1_setbrand(proc_t *);
 int	sn1_getattr(zone_t *, int, void *, size_t *);
 int	sn1_setattr(zone_t *, int, void *, size_t);
 int	sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
-		uintptr_t, uintptr_t, uintptr_t);
+		uintptr_t, uintptr_t);
 void	sn1_copy_procdata(proc_t *, proc_t *);
-void	sn1_proc_exit(struct proc *, klwp_t *);
+void	sn1_proc_exit(struct proc *);
 void	sn1_exec();
-int	sn1_initlwp(klwp_t *);
+void	sn1_initlwp(klwp_t *, void *);
 void	sn1_forklwp(klwp_t *, klwp_t *);
 void	sn1_freelwp(klwp_t *);
 void	sn1_lwpexit(klwp_t *);
 int	sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
-	long *, int, caddr_t, cred_t *, int);
+	long *, int, caddr_t, cred_t *, int *);
 
 /* sn1 brand */
 struct brand_ops sn1_brops = {
-	sn1_init_brand_data,
-	sn1_free_brand_data,
-	sn1_brandsys,
-	sn1_setbrand,
-	sn1_getattr,
-	sn1_setattr,
-	sn1_copy_procdata,
-	sn1_proc_exit,
-	sn1_exec,
-	lwp_setrval,
-	sn1_initlwp,
-	sn1_forklwp,
-	sn1_freelwp,
-	sn1_lwpexit,
-	sn1_elfexec,
-	NULL,
-	NULL,
-	NSIG,
+	sn1_init_brand_data,		/* b_init_brand_data */
+	sn1_free_brand_data,		/* b_free_brand_data */
+	sn1_brandsys,			/* b_brandsys */
+	sn1_setbrand,			/* b_setbrand */
+	sn1_getattr,			/* b_getattr */
+	sn1_setattr,			/* b_setattr */
+	sn1_copy_procdata,		/* b_copy_procdata */
+	sn1_proc_exit,			/* b_proc_exit */
+	sn1_exec,			/* b_exec */
+	lwp_setrval,			/* b_lwp_setrval */
+	NULL,				/* b_lwpdata_alloc */
+	NULL,				/* b_lwpdata_free */
+	sn1_initlwp,			/* b_initlwp */
+	NULL,				/* b_initlwp_post */
+	sn1_forklwp,			/* b_forklwp */
+	sn1_freelwp,			/* b_freelwp */
+	sn1_lwpexit,			/* b_lwpexit */
+	sn1_elfexec,			/* b_elfexec */
+	NULL,				/* b_sigset_native_to_brand */
+	NULL,				/* b_sigset_brand_to_native */
+	NULL,				/* b_sigfd_translate */
+	NSIG,				/* b_nsig */
+	NULL,				/* b_exit_with_sig */
+	NULL,				/* b_wait_filter */
+	NULL,				/* b_native_exec */
+	NULL,				/* b_map32limit */
+	NULL,				/* b_stop_notify */
+	NULL,				/* b_waitid_helper */
+	NULL,				/* b_sigcld_repost */
+	NULL,				/* b_issig_stop */
+	NULL,				/* b_sig_ignorable */
+	NULL,				/* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+	NULL,				/* b_savecontext32 */
+#endif
+	NULL,				/* b_restorecontext */
+	NULL,				/* b_sendsig_stack */
+	NULL,				/* b_sendsig */
+	NULL,				/* b_setid_clear */
+	NULL,				/* b_pagefault */
+	B_TRUE				/* b_intp_parse_arg */
 };
 
 #ifdef	sparc
@@ -94,9 +118,12 @@ struct brand_mach_ops sn1_mops = {
 
 struct brand_mach_ops sn1_mops = {
 	sn1_brand_sysenter_callback,
+	NULL,
 	sn1_brand_int91_callback,
 	sn1_brand_syscall_callback,
-	sn1_brand_syscall32_callback
+	sn1_brand_syscall32_callback,
+	NULL,
+	NULL
 };
 
 #else	/* ! __amd64 */
@@ -104,7 +131,10 @@ struct brand_mach_ops sn1_mops = {
 struct brand_mach_ops sn1_mops = {
 	sn1_brand_sysenter_callback,
 	NULL,
+	NULL,
 	sn1_brand_syscall_callback,
+	NULL,
+	NULL,
 	NULL
 };
 #endif	/* __amd64 */
@@ -115,7 +145,8 @@ struct brand	sn1_brand = {
 	BRAND_VER_1,
 	"sn1",
 	&sn1_brops,
-	&sn1_mops
+	&sn1_mops,
+	sizeof (brand_proc_data_t),
 };
 
 static struct modlbrand modlbrand = {
@@ -151,7 +182,7 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
 /*ARGSUSED*/
 int
 sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
-    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
 {
 	int	res;
 
@@ -171,9 +202,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent)
 }
 
 void
-sn1_proc_exit(struct proc *p, klwp_t *l)
+sn1_proc_exit(struct proc *p)
 {
-	brand_solaris_proc_exit(p, l, &sn1_brand);
+	brand_solaris_proc_exit(p, &sn1_brand);
 }
 
 void
@@ -182,10 +213,10 @@ sn1_exec()
 	brand_solaris_exec(&sn1_brand);
 }
 
-int
-sn1_initlwp(klwp_t *l)
+void
+sn1_initlwp(klwp_t *l, void *bd)
 {
-	return (brand_solaris_initlwp(l, &sn1_brand));
+	brand_solaris_initlwp(l, &sn1_brand);
 }
 
 void
@@ -214,18 +245,18 @@ sn1_free_brand_data(zone_t *zone)
 
 /*ARGSUSED*/
 void
-sn1_init_brand_data(zone_t *zone)
+sn1_init_brand_data(zone_t *zone, kmutex_t *zsl)
 {
 }
 
 int
 sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
-	int brand_action)
+	int *brand_action)
 {
 	return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
 	    setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME,
-	    SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32));
+	    SN1_LIB, SN1_LIB32));
 }
 
 int
diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h
index b487745e21..fef9dc128b 100644
--- a/usr/src/uts/common/brand/sn1/sn1_brand.h
+++ b/usr/src/uts/common/brand/sn1/sn1_brand.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _SN1_BRAND_H
@@ -37,20 +38,14 @@ extern "C" {
 #define	SN1_VERSION		SN1_VERSION_1
 
 #define	SN1_LIB_NAME		"sn1_brand.so.1"
-#define	SN1_LINKER_NAME		"ld.so.1"
 
 #define	SN1_LIB32		BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME
-#define	SN1_LINKER32		"/lib/" SN1_LINKER_NAME
-
 #define	SN1_LIB64		BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME
-#define	SN1_LINKER64		"/lib/64/" SN1_LINKER_NAME
 
 #if defined(_LP64)
 #define	SN1_LIB		SN1_LIB64
-#define	SN1_LINKER	SN1_LINKER64
 #else /* !_LP64 */
 #define	SN1_LIB		SN1_LIB32
-#define	SN1_LINKER	SN1_LINKER32
 #endif /* !_LP64 */
 
 #if defined(_KERNEL)
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c
index f24b864eef..a02ee7de3d 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.c
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #include <sys/errno.h>
@@ -45,45 +46,68 @@
 
 char *s10_emulation_table = NULL;
 
-void	s10_init_brand_data(zone_t *);
+void	s10_init_brand_data(zone_t *, kmutex_t *);
 void	s10_free_brand_data(zone_t *);
 void	s10_setbrand(proc_t *);
 int	s10_getattr(zone_t *, int, void *, size_t *);
 int	s10_setattr(zone_t *, int, void *, size_t);
 int	s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
-		uintptr_t, uintptr_t, uintptr_t);
+		uintptr_t, uintptr_t);
 void	s10_copy_procdata(proc_t *, proc_t *);
-void	s10_proc_exit(struct proc *, klwp_t *);
+void	s10_proc_exit(struct proc *);
 void	s10_exec();
-int	s10_initlwp(klwp_t *);
+void	s10_initlwp(klwp_t *, void *);
 void	s10_forklwp(klwp_t *, klwp_t *);
 void	s10_freelwp(klwp_t *);
 void	s10_lwpexit(klwp_t *);
 int	s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
-	long *, int, caddr_t, cred_t *, int);
+	long *, int, caddr_t, cred_t *, int *);
 void	s10_sigset_native_to_s10(sigset_t *);
 void	s10_sigset_s10_to_native(sigset_t *);
 
 /* s10 brand */
 struct brand_ops s10_brops = {
-	s10_init_brand_data,
-	s10_free_brand_data,
-	s10_brandsys,
-	s10_setbrand,
-	s10_getattr,
-	s10_setattr,
-	s10_copy_procdata,
-	s10_proc_exit,
-	s10_exec,
-	lwp_setrval,
-	s10_initlwp,
-	s10_forklwp,
-	s10_freelwp,
-	s10_lwpexit,
-	s10_elfexec,
-	s10_sigset_native_to_s10,
-	s10_sigset_s10_to_native,
-	S10_NSIG,
+	s10_init_brand_data,		/* b_init_brand_data */
+	s10_free_brand_data,		/* b_free_brand_data */
+	s10_brandsys,			/* b_brandsys */
+	s10_setbrand,			/* b_setbrand */
+	s10_getattr,			/* b_getattr */
+	s10_setattr,			/* b_setattr */
+	s10_copy_procdata,		/* b_copy_procdata */
+	s10_proc_exit,			/* b_proc_exit */
+	s10_exec,			/* b_exec */
+	lwp_setrval,			/* b_lwp_setrval */
+	NULL,				/* b_lwpdata_alloc */
+	NULL,				/* b_lwpdata_free */
+	s10_initlwp,			/* b_initlwp */
+	NULL,				/* b_initlwp_post */
+	s10_forklwp,			/* b_forklwp */
+	s10_freelwp,			/* b_freelwp */
+	s10_lwpexit,			/* b_lwpexit */
+	s10_elfexec,			/* b_elfexec */
+	s10_sigset_native_to_s10,	/* b_sigset_native_to_brand */
+	s10_sigset_s10_to_native,	/* b_sigset_brand_to_native */
+	NULL,				/* b_sigfd_translate */
+	S10_NSIG,			/* b_nsig */
+	NULL,				/* b_exit_with_sig */
+	NULL,				/* b_wait_filter */
+	NULL,				/* b_native_exec */
+	NULL,				/* b_map32limit */
+	NULL,				/* b_stop_notify */
+	NULL,				/* b_waitid_helper */
+	NULL,				/* b_sigcld_repost */
+	NULL,				/* b_issig_stop */
+	NULL,				/* b_sig_ignorable */
+	NULL,				/* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+	NULL,				/* b_savecontext32 */
+#endif
+	NULL,				/* b_restorecontext */
+	NULL,				/* b_sendsig_stack */
+	NULL,				/* b_sendsig */
+	NULL,				/* b_setid_clear */
+	NULL,				/* b_pagefault */
+	B_TRUE				/* b_intp_parse_arg */
 };
 
 #ifdef	sparc
@@ -99,9 +123,12 @@ struct brand_mach_ops s10_mops = {
 
 struct brand_mach_ops s10_mops = {
 	s10_brand_sysenter_callback,
+	NULL,
 	s10_brand_int91_callback,
 	s10_brand_syscall_callback,
-	s10_brand_syscall32_callback
+	s10_brand_syscall32_callback,
+	NULL,
+	NULL
 };
 
 #else	/* ! __amd64 */
@@ -109,7 +136,10 @@ struct brand_mach_ops s10_mops = {
 struct brand_mach_ops s10_mops = {
 	s10_brand_sysenter_callback,
 	NULL,
+	NULL,
 	s10_brand_syscall_callback,
+	NULL,
+	NULL,
 	NULL
 };
 #endif	/* __amd64 */
@@ -120,7 +150,8 @@ struct brand	s10_brand = {
 	BRAND_VER_1,
 	"solaris10",
 	&s10_brops,
-	&s10_mops
+	&s10_mops,
+	sizeof (brand_proc_data_t),
 };
 
 static struct modlbrand modlbrand = {
@@ -252,7 +283,7 @@ s10_native(void *cmd, void *args)
 /*ARGSUSED*/
 int
 s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
-    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
 {
 	proc_t	*p = curproc;
 	int	res;
@@ -326,9 +357,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent)
 }
 
 void
-s10_proc_exit(struct proc *p, klwp_t *l)
+s10_proc_exit(struct proc *p)
 {
-	brand_solaris_proc_exit(p, l, &s10_brand);
+	brand_solaris_proc_exit(p, &s10_brand);
 }
 
 void
@@ -337,10 +368,10 @@ s10_exec()
 	brand_solaris_exec(&s10_brand);
 }
 
-int
-s10_initlwp(klwp_t *l)
+void
+s10_initlwp(klwp_t *l, void *bd)
 {
-	return (brand_solaris_initlwp(l, &s10_brand));
+	brand_solaris_initlwp(l, &s10_brand);
 }
 
 void
@@ -380,7 +411,7 @@ s10_free_brand_data(zone_t *zone)
 }
 
 void
-s10_init_brand_data(zone_t *zone)
+s10_init_brand_data(zone_t *zone, kmutex_t *zsl)
 {
 	ASSERT(zone->zone_brand == &s10_brand);
 	ASSERT(zone->zone_brand_data == NULL);
@@ -390,11 +421,11 @@ s10_init_brand_data(zone_t *zone)
 int
 s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
-	int brand_action)
+	int *brand_action)
 {
 	return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
 	    setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME,
-	    S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32));
+	    S10_LIB, S10_LIB32));
 }
 
 void
diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h
index 11f9853f48..ffef485e12 100644
--- a/usr/src/uts/common/brand/solaris10/s10_brand.h
+++ b/usr/src/uts/common/brand/solaris10/s10_brand.h
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _S10_BRAND_H
@@ -42,17 +43,12 @@ extern "C" {
 #define	S10_LINKER_NAME		"ld.so.1"
 
 #define	S10_LIB32		BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME
-#define	S10_LINKER32		"/lib/" S10_LINKER_NAME
-
 #define	S10_LIB64		BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME
-#define	S10_LINKER64		"/lib/64/" S10_LINKER_NAME
 
 #if defined(_LP64)
 #define	S10_LIB		S10_LIB64
-#define	S10_LINKER	S10_LINKER64
 #else /* !_LP64 */
 #define	S10_LIB		S10_LIB32
-#define	S10_LINKER	S10_LINKER32
 #endif /* !_LP64 */
 
 /*
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index 6b3ba51f31..a71be771fd 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2012 Milan Jurik. All rights reserved.
  */
 
@@ -559,8 +560,8 @@ char *isa_list = architecture;
 static pgcnt_t original_physmem = 0;
 
 #define	MIN_DEFAULT_MAXUSERS	8u
-#define	MAX_DEFAULT_MAXUSERS	2048u
-#define	MAX_MAXUSERS		4096u
+#define	MAX_DEFAULT_MAXUSERS	10000u
+#define	MAX_MAXUSERS		20000u
 
 void
 param_preset(void)
@@ -572,7 +573,7 @@ void
 param_calc(int platform_max_nprocs)
 {
 	/*
-	 * Default to about one "user" per megabyte, taking into
+	 * Default to about one "user" per 8MB, taking into
 	 * account both physical and virtual constraints.
 	 * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
 	 * converts pages to megs without integer overflow.
@@ -586,8 +587,9 @@ param_calc(int platform_max_nprocs)
 	if (maxusers == 0) {
 		pgcnt_t physmegs = physmem >> (20 - PAGESHIFT);
 		pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20;
-		maxusers = MIN(MAX(MIN(physmegs, virtmegs),
-		    MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS);
+		maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */
+		maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS);
+		maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS);
 	}
 	if (maxusers > MAX_MAXUSERS) {
 		maxusers = MAX_MAXUSERS;
diff --git a/usr/src/uts/common/contract/process.c b/usr/src/uts/common/contract/process.c
index 9fd23fdb61..cad5d7f955 100644
--- a/usr/src/uts/common/contract/process.c
+++ b/usr/src/uts/common/contract/process.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/mutex.h>
@@ -955,6 +956,18 @@ contract_process_exit(cont_process_t *ctp, proc_t *p, int exitstatus)
 		(void) cte_publish_all(ct, event, nvl, NULL);
 		mutex_enter(&ct->ct_lock);
 	}
+
+	/*
+	 * CT_PR_EV_EXIT is not part of the CT_PR_ALLFATAL definition since
+	 * we never allow including this in the fatal set via a user-land
+	 * application, but we do allow CT_PR_EV_EXIT in the contract's fatal
+	 * set for a process set up for zone init. See zone_start_init().
+	 */
+	if (EVFATALP(ctp, CT_PR_EV_EXIT)) {
+		ASSERT(MUTEX_HELD(&ct->ct_lock));
+		contract_process_kill(ct, p, B_TRUE);
+	}
+
 	if (empty) {
 		/*
 		 * Send EMPTY message.
diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c
index bc72fa984a..75072fb686 100644
--- a/usr/src/uts/common/crypto/api/kcf_random.c
+++ b/usr/src/uts/common/crypto/api/kcf_random.c
@@ -70,6 +70,7 @@
 #include <sys/cpuvar.h>
 #include <sys/taskq.h>
 #include <rng/fips_random.h>
+#include <sys/strlog.h>
 
 #define	RNDPOOLSIZE		1024	/* Pool size in bytes */
 #define	MINEXTRACTBYTES		20
@@ -933,7 +934,8 @@ rnd_handler(void *arg)
 	int len = 0;
 
 	if (!rng_prov_found && rng_ok_to_log) {
-		cmn_err(CE_WARN, "No randomness provider enabled for "
+		(void) strlog(0, 0, 0, SL_NOTE,
+		    "No randomness provider enabled for "
 		    "/dev/random. Use cryptoadm(1M) to enable a provider.");
 		rng_ok_to_log = B_FALSE;
 	}
diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c
index f461fe048c..8b2760b237 100644
--- a/usr/src/uts/common/crypto/core/kcf_sched.c
+++ b/usr/src/uts/common/crypto/core/kcf_sched.c
@@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg)
 			case 0:
 			case -1:
 				/*
-				 * Woke up with no work to do. Check
-				 * if this thread should exit. We keep
-				 * at least kcf_minthreads.
+				 * Woke up with no work to do. Check if we
+				 * should lwp_exit() (which won't return). We
+				 * keep at least kcf_minthreads.
 				 */
 				if (kcfpool->kp_threads > kcf_minthreads) {
 					KCF_ATOMIC_DECR(kcfpool->kp_threads);
diff --git a/usr/src/uts/common/ctf/ctf_mod.c b/usr/src/uts/common/ctf/ctf_mod.c
index b34cf400cd..421b922c96 100644
--- a/usr/src/uts/common/ctf/ctf_mod.c
+++ b/usr/src/uts/common/ctf/ctf_mod.c
@@ -24,8 +24,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/sysmacros.h>
 #include <sys/modctl.h>
 #include <sys/debug.h>
@@ -117,6 +115,15 @@ ctf_version(int version)
 
 /*ARGSUSED*/
 ctf_file_t *
+ctf_fdcreate_int(int fd, int *errp, ctf_sect_t *ctfp)
+{
+	if (errp != NULL)
+		*errp = ENOTSUP;
+	return (NULL);
+}
+
+/*ARGSUSED*/
+ctf_file_t *
 ctf_modopen(struct module *mp, int *error)
 {
 	ctf_sect_t ctfsect, symsect, strsect;
diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c
index 1c5e1f79a9..3ecbf39393 100644
--- a/usr/src/uts/common/disp/cmt.c
+++ b/usr/src/uts/common/disp/cmt.c
@@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp)
 
 /*
  * Return non-zero if thread can migrate between "from" and "to"
- * without a performance penalty
+ * without a performance penalty.  On virtually any CPU this is true only if
+ * the two CPUs share a core; sharing the last-level cache is insufficient to
+ * make migration possible without penalty.
  */
 int
 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
 {
-	if (from->cpu_physid->cpu_cacheid ==
-	    to->cpu_physid->cpu_cacheid)
+	if (from->cpu_physid->cpu_coreid ==
+	    to->cpu_physid->cpu_coreid)
 		return (1);
 	return (0);
 }
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..2a4365ff73 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/disp.h>
@@ -74,6 +75,32 @@
  * Putting threads on wait queues in random places while running in the
  * kernel might lead to all kinds of locking problems.
  *
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap.  The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization.  We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline.  Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value.  This will allow the zone to burst again.
+ * We can watch this behavior using the kstats.  The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value.  The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting.  When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
  * Accounting
  * ==========
  *
@@ -203,18 +230,28 @@ static void caps_update();
  */
 struct cap_kstat {
 	kstat_named_t	cap_value;
+	kstat_named_t	cap_baseline;
+	kstat_named_t	cap_effective;
+	kstat_named_t	cap_burst_limit;
+	kstat_named_t	cap_bursting;
 	kstat_named_t	cap_usage;
 	kstat_named_t	cap_nwait;
 	kstat_named_t	cap_below;
 	kstat_named_t	cap_above;
+	kstat_named_t	cap_above_base;
 	kstat_named_t	cap_maxusage;
 	kstat_named_t	cap_zonename;
 } cap_kstat = {
 	{ "value",	KSTAT_DATA_UINT64 },
+	{ "baseline",	KSTAT_DATA_UINT64 },
+	{ "effective",	KSTAT_DATA_UINT64 },
+	{ "burst_limit_sec", KSTAT_DATA_UINT64 },
+	{ "bursting_sec", KSTAT_DATA_UINT64 },
 	{ "usage",	KSTAT_DATA_UINT64 },
 	{ "nwait",	KSTAT_DATA_UINT64 },
 	{ "below_sec",	KSTAT_DATA_UINT64 },
 	{ "above_sec",	KSTAT_DATA_UINT64 },
+	{ "above_base_sec", KSTAT_DATA_UINT64 },
 	{ "maxusage",	KSTAT_DATA_UINT64 },
 	{ "zonename",	KSTAT_DATA_STRING },
 };
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
 	cap->cap_below = cap->cap_above = 0;
 	cap->cap_maxusage = 0;
 	cap->cap_usage = 0;
-	cap->cap_value = value;
+	cap->cap_value = cap->cap_chk_value = value;
 	waitq_unblock(&cap->cap_waitq);
 	if (CPUCAPS_OFF()) {
 		cpucaps_enabled = B_TRUE;
@@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap)
 	ASSERT(CAP_ENABLED(cap));
 
 	waitq_block(&cap->cap_waitq);
+
+	/* do this first to avoid race with cap_kstat_update */
+	if (cap->cap_kstat != NULL) {
+		kstat_delete(cap->cap_kstat);
+		cap->cap_kstat = NULL;
+	}
+
 	list_remove(l, cap);
 	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
 		cpucaps_enabled = B_FALSE;
 		cpucaps_clock_callout = NULL;
 	}
-	cap->cap_value = 0;
+	cap->cap_value = cap->cap_chk_value = 0;
 	cap->cap_project = NULL;
 	cap->cap_zone = NULL;
-	if (cap->cap_kstat != NULL) {
-		kstat_delete(cap->cap_kstat);
-		cap->cap_kstat = NULL;
-	}
-
 }
 
 /*
@@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
  * The waitq_isempty check is performed without the waitq lock. If a new thread
  * is placed on the waitq right after the check, it will be picked up during the
  * next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
  */
 /* ARGSUSED */
 static void
@@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
 {
 	ASSERT(MUTEX_HELD(&caps_lock));
 
-	if (cap->cap_usage >= cap->cap_value) {
+	if (cap->cap_base != 0) {
+		/*
+		 * Because of the way usage is calculated and decayed, it's
+		 * possible for the zone to be slightly over its cap, but we
+		 * don't want to count that after we have reduced the effective
+		 * cap to the baseline.  That way the zone will be able to
+		 * burst again after the burst_limit has expired.
+		 */
+		if (cap->cap_usage > cap->cap_base &&
+		    cap->cap_chk_value == cap->cap_value) {
+			cap->cap_above_base++;
+
+			/*
+			 * If bursting is limited and we've been bursting
+			 * longer than we're supposed to, then set the
+			 * effective cap to the baseline.
+			 */
+			if (cap->cap_burst_limit != 0) {
+				cap->cap_bursting++;
+				if (cap->cap_bursting >= cap->cap_burst_limit)
+					cap->cap_chk_value = cap->cap_base;
+			}
+		} else if (cap->cap_bursting > 0) {
+			/*
+			 * We're not bursting now, but we were, decay the
+			 * bursting timer.
+			 */
+			cap->cap_bursting--;
+			/*
+			 * Reset the effective cap once we decay to 0 so we
+			 * can burst again.
+			 */
+			if (cap->cap_bursting == 0 &&
+			    cap->cap_chk_value != cap->cap_value)
+				cap->cap_chk_value = cap->cap_value;
+		}
+	}
+
+	if (cap->cap_usage >= cap->cap_chk_value) {
 		cap->cap_above++;
 	} else {
 		waitq_t *wq = &cap->cap_waitq;
 
 		cap->cap_below++;
 
-		if (!waitq_isempty(wq))
-			waitq_runone(wq);
+		if (!waitq_isempty(wq)) {
+			int i, ndequeue, p;
+
+			/*
+			 * Since this function is only called once per tick,
+			 * we can hit a situation where we have artificially
+			 * limited the project/zone below its cap.  This would
+			 * happen if we have multiple threads queued up but
+			 * only dequeued one thread/tick. To avoid this we
+			 * dequeue multiple threads, calculated based on the
+			 * usage percentage of the cap. It is possible that we
+			 * could dequeue too many threads and some of them
+			 * might be put back on the wait queue quickly, but
+			 * since we know that threads are on the wait queue
+			 * because we're capping, we know that there are unused
+			 * CPU cycles anyway, so this extra work would not
+			 * hurt. Also, the ndequeue number is only an upper
+			 * bound and we might dequeue less, depending on how
+			 * many threads are actually in the wait queue. The
+			 * ndequeue values are empirically derived and could be
+			 * adjusted or calculated in another way if necessary.
+			 */
+			p = (int)((100 * cap->cap_usage) / cap->cap_chk_value);
+			if (p >= 98)
+				ndequeue = 10;
+			else if (p >= 95)
+				ndequeue = 20;
+			else if (p >= 90)
+				ndequeue = 40;
+			else if (p >= 85)
+				ndequeue = 80;
+			else
+				ndequeue = 160;
+
+			for (i = 0; i < ndequeue; i++) {
+				waitq_runone(wq);
+				if (waitq_isempty(wq))
+					break;
+			}
+			DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i);
+		}
 	}
 }
 
@@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
 		 * Remove all projects in this zone without caps
 		 * from the capped_projects list.
 		 */
-		if (project_cap->cap_value == MAX_USAGE) {
+		if (project_cap->cap_chk_value == MAX_USAGE) {
 			cap_project_disable(kpj);
 		}
 	} else if (CAP_DISABLED(project_cap)) {
 		/*
 		 * Add the project to capped_projects list.
 		 */
-		ASSERT(project_cap->cap_value == 0);
+		ASSERT(project_cap->cap_chk_value == 0);
 		cap_project_enable(kpj, MAX_USAGE);
 	}
 	mutex_exit(&caps_lock);
@@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
 		/*
 		 * No state transitions, just change the value
 		 */
-		cap->cap_value = value;
+		cap->cap_value = cap->cap_chk_value = value;
 	}
 
 	ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
 }
 
 /*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+	cpucap_t *cap = NULL;
+	hrtime_t value;
+
+	ASSERT(base_val <= MAXCAP);
+	if (base_val > MAXCAP)
+		base_val = MAXCAP;
+
+	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+		return (0);
+
+	if (zone->zone_cpucap == NULL)
+		cap = cap_alloc();
+
+	mutex_enter(&caps_lock);
+
+	if (cpucaps_busy) {
+		mutex_exit(&caps_lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+	 * held. If it is still NULL, assign a newly allocated cpucap to it.
+	 */
+	if (zone->zone_cpucap == NULL) {
+		zone->zone_cpucap = cap;
+	} else if (cap != NULL) {
+		cap_free(cap);
+	}
+
+	cap = zone->zone_cpucap;
+
+	value = base_val * cap_tick_cost;
+	if (value < 0 || value > cap->cap_value)
+		value = 0;
+
+	cap->cap_base = value;
+
+	mutex_exit(&caps_lock);
+
+	return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds.  A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+	cpucap_t *cap = NULL;
+	hrtime_t value;
+
+	ASSERT(base_val <= INT_MAX);
+	/* Treat the default as 0 - no limit */
+	if (base_val == INT_MAX)
+		base_val = 0;
+	if (base_val > INT_MAX)
+		base_val = INT_MAX;
+
+	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+		return (0);
+
+	if (zone->zone_cpucap == NULL)
+		cap = cap_alloc();
+
+	mutex_enter(&caps_lock);
+
+	if (cpucaps_busy) {
+		mutex_exit(&caps_lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+	 * held. If it is still NULL, assign a newly allocated cpucap to it.
+	 */
+	if (zone->zone_cpucap == NULL) {
+		zone->zone_cpucap = cap;
+	} else if (cap != NULL) {
+		cap_free(cap);
+	}
+
+	cap = zone->zone_cpucap;
+
+	value = SEC_TO_TICK(base_val);
+	if (value < 0)
+		value = 0;
+
+	cap->cap_burst_limit = value;
+
+	mutex_exit(&caps_lock);
+
+	return (0);
+}
+
+/*
  * The project is going away so disable its cap.
  */
 void
@@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
 		if (CAP_DISABLED(cap))
 			cap_project_enable(kpj, value);
 		else
-			cap->cap_value = value;
+			cap->cap_value = cap->cap_chk_value = value;
 	} else if (CAP_ENABLED(cap)) {
 		/*
 		 * User requested to drop a cap on the project. If it is part of
@@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
 		 * otherwise disable the cap.
 		 */
 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
-			cap->cap_value = MAX_USAGE;
+			cap->cap_value = cap->cap_chk_value = MAX_USAGE;
 		} else {
 			cap_project_disable(kpj);
 		}
@@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone)
 }
 
 /*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+	return (zone->zone_cpucap != NULL ?
+	    (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+	return (zone->zone_cpucap != NULL ?
+	    (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
  * Charge project of thread t the time thread t spent on CPU since previously
  * adjusted.
  *
@@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
 
 	project_cap = kpj->kpj_cpucap;
 
-	if (project_cap->cap_usage >= project_cap->cap_value) {
+	if (project_cap->cap_usage >= project_cap->cap_chk_value) {
 		t->t_schedflag |= TS_PROJWAITQ;
 		rc = B_TRUE;
 	} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
 	} else {
 		cpucap_t *zone_cap = zone->zone_cpucap;
 
-		if (zone_cap->cap_usage >= zone_cap->cap_value) {
+		if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
 			t->t_schedflag |= TS_ZONEWAITQ;
 			rc = B_TRUE;
 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t)
 
 /*
  * Convert internal cap statistics into values exported by cap kstat.
+ * Note that the kstat is held throughout this function but caps_lock is not.
  */
 static int
 cap_kstat_update(kstat_t *ksp, int rw)
@@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
 
 	capsp->cap_value.value.ui64 =
 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
+	capsp->cap_baseline.value.ui64 =
+	    ROUND_SCALE(cap->cap_base, cap_tick_cost);
+	capsp->cap_effective.value.ui64 =
+	    ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+	capsp->cap_burst_limit.value.ui64 =
+	    ROUND_SCALE(cap->cap_burst_limit, tick_sec);
 	capsp->cap_usage.value.ui64 =
 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
 	capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+	capsp->cap_above_base.value.ui64 =
+	    ROUND_SCALE(cap->cap_above_base, tick_sec);
+	capsp->cap_bursting.value.ui64 =
+	    ROUND_SCALE(cap->cap_bursting, tick_sec);
 	kstat_named_setstr(&capsp->cap_zonename, zonename);
 
 	return (0);
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index 0c2c0b4993..5f9c2c68a2 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
@@ -105,7 +109,7 @@ static void	cpu_resched(cpu_t *cp, pri_t tpri);
 /*
  * If this is set, only interrupt threads will cause kernel preemptions.
  * This is done by changing the value of kpreemptpri.  kpreemptpri
- * will either be the max sysclass pri + 1 or the min interrupt pri.
+ * will either be the max sysclass pri or the min interrupt pri.
  */
 int	only_intr_kpreempt;
 
@@ -252,7 +256,23 @@ dispinit(void)
 				maxglobpri = cl_maxglobpri;
 		}
 	}
-	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
+
+	/*
+	 * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
+	 * to say, maxclsyspri + 1.  However, over time, the system has used
+	 * more and more asynchronous kernel threads, with an increasing number
+	 * of these doing work on direct behalf of higher-level software (e.g.,
+	 * network processing).  This has led to potential priority inversions:
+	 * threads doing low-priority lengthy kernel work can effectively
+	 * delay kernel-level processing of higher-priority data. To minimize
+	 * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
+	 * the kernel that runs at maxclsyspri will therefore induce kernel
+	 * preemption, and this priority should be used if/when an asynchronous
+	 * thread (or, as is often the case, task queue) is performing a task
+	 * on behalf of higher-level software (or any task that is otherwise
+	 * latency-sensitive).
+	 */
+	kpreemptpri = (pri_t)v.v_maxsyspri;
 	if (kpqpri == KPQPRI)
 		kpqpri = kpreemptpri;
 
@@ -2258,7 +2278,7 @@ disp_getbest(disp_t *dp)
 		 * placed earlier.
 		 */
 		if (tcp == NULL ||
-		    pri >= minclsyspri ||
+		    (pri >= minclsyspri && tp->t_procp == &p0) ||
 		    tp->t_cpu != tcp)
 			break;
 
diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c
index ab5ba278a0..8260680a07 100644
--- a/usr/src/uts/common/disp/fx.c
+++ b/usr/src/uts/common/disp/fx.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -71,16 +71,6 @@ static struct modlinkage modlinkage = {
 };
 
 
-/*
- * control flags (kparms->fx_cflags).
- */
-#define	FX_DOUPRILIM	0x01    /* change user priority limit */
-#define	FX_DOUPRI	0x02    /* change user priority */
-#define	FX_DOTQ		0x04    /* change FX time quantum */
-
-
-#define	FXMAXUPRI 60		/* maximum user priority setting */
-
 #define	FX_MAX_UNPRIV_PRI	0	/* maximum unpriviledge priority */
 
 /*
diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c
index f87f8c56ce..115e42ccb8 100644
--- a/usr/src/uts/common/disp/rt.c
+++ b/usr/src/uts/common/disp/rt.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2013 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -103,13 +103,6 @@ _info(struct modinfo *modinfop)
 pri_t rt_maxpri = RTMAXPRI;	/* maximum real-time priority */
 rtdpent_t *rt_dptbl;	  /* real-time dispatcher parameter table */
 
-/*
- * control flags (kparms->rt_cflags).
- */
-#define	RT_DOPRI	0x01	/* change priority */
-#define	RT_DOTQ		0x02	/* change RT time quantum */
-#define	RT_DOSIG	0x04	/* change RT time quantum signal */
-
 static int	rt_admin(caddr_t, cred_t *);
 static int	rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
 static int	rt_fork(kthread_t *, kthread_t *, void *);
diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c
index 768b499ef2..cc88ed72fc 100644
--- a/usr/src/uts/common/disp/rt_dptbl.c
+++ b/usr/src/uts/common/disp/rt_dptbl.c
@@ -28,8 +28,6 @@
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/proc.h>
 #include <sys/priocntl.h>
 #include <sys/class.h>
@@ -70,8 +68,6 @@ _info(struct modinfo *modinfop)
 	return (mod_info(&modlinkage, modinfop));
 }
 
-#define	RTGPPRIO0	100	/* Global priority for RT priority 0 */
-
 rtdpent_t	config_rt_dptbl[] = {
 
 /*   	prilevel    Time quantum */
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index f2685af534..ae6c5eef16 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/types.h>
@@ -75,6 +75,10 @@
 #include <sys/cpucaps.h>
 #include <sys/kiconv.h>
 
+#ifndef	STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
+
 struct kmem_cache *thread_cache;	/* cache of free threads */
 struct kmem_cache *lwp_cache;		/* cache of free lwps */
 struct kmem_cache *turnstile_cache;	/* cache of free turnstiles */
@@ -372,7 +376,7 @@ thread_create(
 		if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
 			cmn_err(CE_PANIC, "thread_create: proposed stack size"
 			    " too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
 		stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
 		stksize &= -PTR24_ALIGN;	/* make thread aligned */
 		t = (kthread_t *)(stk + stksize);
@@ -381,13 +385,6 @@ thread_create(
 			audit_thread_create(t);
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else	/* stack grows to larger addresses */
-		stksize -= SA(sizeof (kthread_t));
-		t = (kthread_t *)(stk);
-		bzero(t, sizeof (kthread_t));
-		t->t_stk = stk + sizeof (kthread_t);
-		t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif	/* STACK_GROWTH_DOWN */
 		t->t_flag |= T_TALLOCSTK;
 		t->t_swap = stk;
 	} else {
@@ -400,13 +397,8 @@ thread_create(
 		 * Initialize t_stk to the kernel stack pointer to use
 		 * upon entry to the kernel
 		 */
-#ifdef STACK_GROWTH_DOWN
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else
-		t->t_stk = stk;			/* 3b2-like */
-		t->t_stkbase = stk + stksize;
-#endif /* STACK_GROWTH_DOWN */
 	}
 
 	if (kmem_stackinfo != 0) {
@@ -589,6 +581,9 @@ thread_exit(void)
 	if ((t->t_proc_flag & TP_ZTHREAD) != 0)
 		cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
 
+	if ((t->t_flag & T_SPLITSTK) != 0)
+		cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
 	tsd_exit();		/* Clean up this thread's TSD */
 
 	kcpc_passivate();	/* clean up performance counter state */
@@ -1050,6 +1045,8 @@ installctx(
 	ctx->free_op = free;
 	ctx->arg = arg;
 	ctx->next = t->t_ctx;
+	ctx->save_ts = 0;
+	ctx->restore_ts = 0;
 	t->t_ctx = ctx;
 }
 
@@ -1124,9 +1121,12 @@ savectx(kthread_t *t)
 	struct ctxop *ctx;
 
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->save_op != NULL)
+	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) {
+		if (ctx->save_op != NULL) {
+			ctx->save_ts = gethrtime_unscaled();
 			(ctx->save_op)(ctx->arg);
+		}
+	}
 }
 
 void
@@ -1135,9 +1135,12 @@ restorectx(kthread_t *t)
 	struct ctxop *ctx;
 
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->restore_op != NULL)
+	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) {
+		if (ctx->restore_op != NULL) {
+			ctx->restore_ts = gethrtime_unscaled();
 			(ctx->restore_op)(ctx->arg);
+		}
+	}
 }
 
 void
@@ -1883,6 +1886,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
 	return (on_rq);
 }
 
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread.  To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function.  After the specified function returns,
+ * the stack is deallocated and control is returned to the caller.  This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context
+ * - The called function must not call thread_exit() while on the split stack
+ *
+ * The latter two constraints are enforced by explicit panics (here and in
+ * thread_exit()).  Notably, however, thread_splitstack() _can_ be called on
+ * a split stack -- there is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+	kthread_t *t = curthread;
+	caddr_t ostk, ostkbase, stk;
+	ushort_t otflag;
+
+	if (t->t_onfault != NULL)
+		panic("thread_splitstack: called with non-NULL t_onfault");
+
+	ostk = t->t_stk;
+	ostkbase = t->t_stkbase;
+	otflag = t->t_flag;
+
+	/* Page-align the request, but never use less than the default size. */
+	stksize = roundup(stksize, PAGESIZE);
+	if (stksize < default_stksize)
+		stksize = default_stksize;
+
+	/* A non-default stksize is already page-rounded (see above). */
+	if (stksize == default_stksize) {
+		stk = (caddr_t)segkp_cache_get(segkp_thread);
+	} else {
+		stk = (caddr_t)segkp_get(segkp, stksize,
+		    (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+	}
+
+	/*
+	 * We're going to lock ourselves before we set T_SPLITSTK to assure
+	 * that we're not swapped out in the meantime.  (Note that we don't
+	 * bother to set t_swap, as we're not going to be swapped out.)
+	 */
+	thread_lock(t);
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag |= T_SPLITSTK;
+
+	t->t_stk = stk + stksize;
+	t->t_stkbase = stk;
+
+	thread_unlock(t);
+
+	/* Now actually run on the new (split) stack... */
+	thread_splitstack_run(t->t_stk, func, arg);
+
+	/*
+	 * We're back onto our own stack; lock ourselves and restore our
+	 * pre-split state.  T_SPLITSTK is cleared only if this call was the
+	 * one that set it, so that a nested split leaves the flag alone
+	 * for the enclosing one.
+	 */
+	thread_lock(t);
+
+	t->t_stk = ostk;
+	t->t_stkbase = ostkbase;
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag &= ~T_SPLITSTK;
+
+	thread_unlock(t);
+
+	/*
+	 * Now that we are entirely back on our own stack, call back into
+	 * the platform layer to perform any platform-specific cleanup.
+	 */
+	thread_splitstack_cleanup();
+
+	segkp_release(segkp, stk);
+}
+
 /*
  * Tunable kmem_stackinfo is set, fill the kernel thread stack with a
  * specific pattern.
diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c
index 67ccc6922f..c840bdf31a 100644
--- a/usr/src/uts/common/disp/thread_intr.c
+++ b/usr/src/uts/common/disp/thread_intr.c
@@ -23,19 +23,10 @@
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
 /*
- * FILE NOTICE BEGIN
- *
- * This file should not be modified.  If you wish to modify it or have it
- * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com->
- * (without anti-spam dashes)
- *
- * FILE NOTICE END
+ * Copyright 2015, Joyent, Inc.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/cpuvar.h>
 #include <sys/stack.h>
 #include <vm/seg_kp.h>
@@ -44,6 +35,17 @@
 #include <sys/sysmacros.h>
 
 /*
+ * Use a slightly larger stack for interrupt threads than the default.  The
+ * networking stack may perform both an rx and a tx in the context of a single
+ * interrupt; when combined with various promisc hooks that need memory, this
+ * can bring us dangerously close to the edge of the traditional stack sizes.
+ * This is only a few pages more than a traditional stack, and given that we
+ * don't have that many interrupt threads, the memory costs end up being more
+ * than worthwhile.
+ */
+#define	LL_INTR_STKSZ	(32 * 1024)
+
+/*
  * Create and initialize an interrupt thread.
  */
 static void
@@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp)
 {
 	kthread_t *tp;
 
-	tp = thread_create(NULL, 0,
+	tp = thread_create(NULL, LL_INTR_STKSZ,
 	    (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
 
 	/*
@@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp)
 }
 
 /*
- * Allocate a given number of interrupt threads for a given CPU.
- * These threads will get freed by cpu_destroy_bound_threads()
- * when CPU gets unconfigured.
+ * Allocate a given number of interrupt threads for a given CPU.  These threads
+ * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured.
+ *
+ * Note, high level interrupts are always serviced using cpu_intr_stack and are
+ * not allowed to block. Low level interrupts or soft-interrupts use the
+ * kthread_t's that we create through the calls to thread_create_intr().
  */
 void
 cpu_intr_alloc(cpu_t *cp, int n)
@@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n)
 		thread_create_intr(cp);
 
 	cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
-		KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
-		INTR_STACK_SIZE - SA(MINFRAME);
+	    KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
+	    INTR_STACK_SIZE - SA(MINFRAME);
 }
diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c
index c775224d86..fc0206da29 100644
--- a/usr/src/uts/common/dtrace/dtrace.c
+++ b/usr/src/uts/common/dtrace/dtrace.c
@@ -7710,7 +7710,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
 		priv = DTRACE_PRIV_ALL;
 	} else {
 		*uidp = crgetuid(cr);
-		*zoneidp = crgetzoneid(cr);
+		*zoneidp = crgetzonedid(cr);
 
 		priv = 0;
 		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
@@ -8206,7 +8206,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
 	provider->dtpv_priv.dtpp_flags = priv;
 	if (cr != NULL) {
 		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
-		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
+		provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr);
 	}
 	provider->dtpv_pops = *pops;
 
@@ -8817,6 +8817,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
 	uint32_t priv;
 	uid_t uid;
 	zoneid_t zoneid;
+	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	dtrace_ecb_create_cache = NULL;
@@ -8831,8 +8832,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
 	}
 
 	dtrace_probekey(desc, &pkey);
-	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
-	    &priv, &uid, &zoneid);
+	dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid);
+
+	if ((priv & DTRACE_PRIV_ZONEOWNER) &&
+	    state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) {
+		/*
+		 * If we have the privilege of instrumenting all zones but we
+		 * have been told to instrument but one, we will spoof this up
+		 * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes
+		 * of dtrace_match().  (Note that DTRACEOPT_ZONE is not for
+		 * security but rather for performance: it allows the global
+		 * zone to instrument USDT probes in a local zone without
+		 * requiring all zones to be instrumented.)
+		 */
+		priv &= ~DTRACE_PRIV_ZONEOWNER;
+		zoneid = state->dts_options[DTRACEOPT_ZONE];
+	}
 
 	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
 	    enab));
diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c
index 157acc25fc..3d350ff278 100644
--- a/usr/src/uts/common/dtrace/sdt_subr.c
+++ b/usr/src/uts/common/dtrace/sdt_subr.c
@@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 };
 
+/*
+ * When adding a new provider you must add it before sdt as sdt is a catch all
+ * for remaining probes.
+ */
 sdt_provider_t sdt_providers[] = {
 	{ "vtrace", "__vtrace_", &vtrace_attr },
 	{ "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER },
@@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = {
 	{ "fc", "__fc_", &fc_attr },
 	{ "srp", "__srp_", &fc_attr },
 	{ "sysevent", "__sysevent_", &stab_attr },
+	{ "vnd", "__vnd_", &stab_attr },
 	{ "sdt", NULL, &sdt_attr },
 	{ NULL }
 };
@@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = {
 	{ "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *",
 	    "fc_port_info_t *" },
 
+	{ "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" },
+	{ "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" },
+	{ "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" },
+	{ "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" },
+	{ "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" },
+	{ "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" },
+	{ "vnd", "drop-in", 3, 3, "const char *", "const char *" },
+	{ "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" },
+	{ "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" },
+	{ "vnd", "drop-out", 3, 3, "const char *", "const char *" },
+	{ "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" },
+	{ "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" },
+	{ "vnd", "drop-ctl", 3, 3, "const char *", "const char *" },
+	{ "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+	{ "vnd", "send", 1, 1, "void *", "csinfo_t *" },
+	{ "vnd", "send", 2, 2, "void *", "ipinfo_t *" },
+	{ "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" },
+	{ "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" },
+	{ "vnd", "recv", 1, 1, "void *", "csinfo_t *" },
+	{ "vnd", "recv", 2, 2, "void *", "ipinfo_t *" },
+	{ "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+	{ "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" },
 
 	{ NULL }
 };
diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c
index fc45bd9544..5dbb2ed28c 100644
--- a/usr/src/uts/common/exec/aout/aout.c
+++ b/usr/src/uts/common/exec/aout/aout.c
@@ -22,6 +22,7 @@
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -54,7 +55,7 @@
 
 static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args,
     intpdata_t *idatap, int level, long *execsz, int setid,
-    caddr_t exec_file, cred_t *cred, int brand_action);
+    caddr_t exec_file, cred_t *cred, int *brand_action);
 static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz,
     int *isdyn);
 static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp,
@@ -130,7 +131,7 @@ _info(struct modinfo *modinfop)
 static int
 aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args,
     struct intpdata *idatap, int level, long *execsz, int setid,
-    caddr_t exec_file, cred_t *cred, int brand_action)
+    caddr_t exec_file, cred_t *cred, int *brand_action)
 {
 	auxv32_t auxflags_auxv32;
 	int error;
diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c
index dc04b292b0..d74737dead 100644
--- a/usr/src/uts/common/exec/elf/elf.c
+++ b/usr/src/uts/common/exec/elf/elf.c
@@ -26,7 +26,7 @@
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -66,6 +66,11 @@
 #include <sys/sdt.h>
 #include <sys/siginfo.h>
 
+#if defined(__x86) && !defined(__xpv)
+#include <sys/comm_page.h>
+#endif /* defined(__x86) && !defined(__xpv) */
+
+
 extern int at_flags;
 
 #define	ORIGIN_STR	"ORIGIN"
@@ -163,12 +168,16 @@ dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
 }
 
 /*
- * Map in the executable pointed to by vp. Returns 0 on success.
+ * Map in the executable pointed to by vp. Returns 0 on success.  Note that
+ * this function currently has the maximum number of arguments allowed by
+ * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
+ * adding to MAXNARG.  (Better yet, do not add to this monster of a function
+ * signature!)
  */
 int
 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
-    intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
-    caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
+    intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
+    caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 {
 	size_t		len;
 	struct vattr	vat;
@@ -180,6 +189,7 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 	Phdr		*junk = NULL;
 	Phdr		*dynphdr = NULL;
 	Phdr		*dtrphdr = NULL;
+	char		*interp = NULL;
 	uintptr_t	lddata;
 	long		execsz;
 	intptr_t	minaddr;
@@ -187,6 +197,9 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 	if (lddatap != NULL)
 		*lddatap = NULL;
 
+	if (minaddrp != NULL)
+		*minaddrp = NULL;
+
 	if (error = execpermissions(vp, &vat, args)) {
 		uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
 		return (error);
@@ -212,25 +225,89 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 	    &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
 	    len, &execsz, brksize)) {
 		uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
+		if (uphdr != NULL && uphdr->p_flags == 0)
+			kmem_free(uphdr, sizeof (Phdr));
 		kmem_free(phdrbase, phdrsize);
 		return (error);
 	}
 
+	if (minaddrp != NULL)
+		*minaddrp = minaddr;
+
 	/*
-	 * Inform our caller if the executable needs an interpreter.
+	 * If the executable requires an interpreter, determine its name.
 	 */
-	*interp = (dynphdr == NULL) ? 0 : 1;
+	if (dynphdr != NULL) {
+		ssize_t	resid;
+
+		if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
+			uprintf("%s: Invalid interpreter\n", exec_file);
+			kmem_free(phdrbase, phdrsize);
+			return (ENOEXEC);
+		}
+
+		interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+		if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz,
+		    (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
+		    (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
+		    interp[dynphdr->p_filesz - 1] != '\0') {
+			uprintf("%s: Cannot obtain interpreter pathname\n",
+			    exec_file);
+			kmem_free(interp, MAXPATHLEN);
+			kmem_free(phdrbase, phdrsize);
+			return (error != 0 ? error : ENOEXEC);
+		}
+	}
 
 	/*
 	 * If this is a statically linked executable, voffset should indicate
 	 * the address of the executable itself (it normally holds the address
 	 * of the interpreter).
 	 */
-	if (ehdr->e_type == ET_EXEC && *interp == 0)
+	if (ehdr->e_type == ET_EXEC && interp == NULL)
 		*voffset = minaddr;
 
+	/*
+	 * If the caller has asked for the interpreter name, return it (it's
+	 * up to the caller to free it); if the caller hasn't asked for it,
+	 * free it ourselves.
+	 */
+	if (interpp != NULL) {
+		*interpp = interp;
+	} else if (interp != NULL) {
+		kmem_free(interp, MAXPATHLEN);
+	}
+
 	if (uphdr != NULL) {
 		*uphdr_vaddr = uphdr->p_vaddr;
+
+		if (uphdr->p_flags == 0)
+			kmem_free(uphdr, sizeof (Phdr));
+	} else if (ehdr->e_type == ET_DYN) {
+		/*
+		 * If we don't have a uphdr, we'll apply the logic found
+		 * in mapelfexec() and use the p_vaddr of the first PT_LOAD
+		 * section as the base address of the object.
+		 */
+		Phdr *phdr = (Phdr *)phdrbase;
+		int i, hsize = ehdr->e_phentsize;
+
+		for (i = nphdrs; i > 0; i--) {
+			if (phdr->p_type == PT_LOAD) {
+				*uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
+				    ehdr->e_phoff;
+				break;
+			}
+
+			phdr = (Phdr *)((caddr_t)phdr + hsize);
+		}
+
+		/*
+		 * If we don't have a PT_LOAD segment, we should have returned
+		 * ENOEXEC when elfsize() returned 0, above.
+		 */
+		VERIFY(i > 0);
 	} else {
 		*uphdr_vaddr = (Addr)-1;
 	}
@@ -243,13 +320,13 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
 int
 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
-    int brand_action)
+    int *brand_action)
 {
 	caddr_t		phdrbase = NULL;
 	caddr_t 	bssbase = 0;
 	caddr_t 	brkbase = 0;
 	size_t		brksize = 0;
-	ssize_t		dlnsize;
+	ssize_t		dlnsize, nsize = 0;
 	aux_entry_t	*aux;
 	int		error;
 	ssize_t		resid;
@@ -273,6 +350,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	int		hasauxv = 0;
 	int		hasdy = 0;
 	int		branded = 0;
+	int		dynuphdr = 0;
 
 	struct proc *p = ttoproc(curthread);
 	struct user *up = PTOU(p);
@@ -327,7 +405,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 		*execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
 	} else {
 		args->to_model = DATAMODEL_LP64;
-		args->stk_prot &= ~PROT_EXEC;
+		if (!args->stk_prot_override) {
+			args->stk_prot &= ~PROT_EXEC;
+		}
 #if defined(__i386) || defined(__amd64)
 		args->dat_prot &= ~PROT_EXEC;
 #endif
@@ -339,11 +419,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 #endif	/* _LP64 */
 
 	/*
-	 * We delay invoking the brand callback until we've figured out
-	 * what kind of elf binary we're trying to run, 32-bit or 64-bit.
-	 * We do this because now the brand library can just check
-	 * args->to_model to see if the target is 32-bit or 64-bit without
-	 * having do duplicate all the code above.
+	 * We delay invoking the brand callback until we've figured out what
+	 * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
+	 * because now the brand library can just check args->to_model to see if
+	 * the target is 32-bit or 64-bit without having to duplicate all the
+	 * code above.
+	 *
+	 * We also give the brand a chance to indicate that based on the ELF
+	 * OSABI of the target binary it should become unbranded and optionally
+	 * indicate that it should be treated as existing in a specific prefix.
+	 *
+	 * Note that if a brand opts to go down this route it does not actually
+	 * end up being debranded. In other words, future programs that exec
+	 * will still be considered for branding unless this escape hatch is
+	 * used. Consider the case of lx brand for example. If a user runs
+	 * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
+	 * of DTrace that's in /native will take this escape hatch and be run
+	 * and interpreted using the normal system call table; however, the
+	 * execution of a non-illumos binary in the form of /bin/ls will still
+	 * be branded and be subject to all of the normal actions of the brand.
 	 *
 	 * The level checks associated with brand handling below are used to
 	 * prevent a loop since the brand elfexec function typically comes back
@@ -351,8 +445,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	 * handling in the #! interpreter code will increment the level before
 	 * calling gexec to run the final elfexec interpreter.
 	 */
+	if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
+	    (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
+		if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
+		    &args->brand_nroot) == B_TRUE) {
+			ASSERT(ehdrp->e_ident[EI_OSABI]);
+			*brand_action = EBA_NATIVE;
+			/* Add one for the trailing '/' in the path */
+			if (args->brand_nroot != NULL)
+				nsize = strlen(args->brand_nroot) + 1;
+		}
+	}
+
 	if ((level <= INTP_MAXDEPTH) &&
-	    (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+	    (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 		error = BROP(p)->b_elfexec(vp, uap, args,
 		    idatap, level + 1, execsz, setid, exec_file, cred,
 		    brand_action);
@@ -423,14 +529,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 		 *	AT_BASE
 		 *	AT_FLAGS
 		 *	AT_PAGESZ
+		 *	AT_RANDOM	(added in stk_copyout)
 		 *	AT_SUN_AUXFLAGS
 		 *	AT_SUN_HWCAP
 		 *	AT_SUN_HWCAP2
-		 *	AT_SUN_PLATFORM (added in stk_copyout)
-		 *	AT_SUN_EXECNAME (added in stk_copyout)
+		 *	AT_SUN_PLATFORM	(added in stk_copyout)
+		 *	AT_SUN_EXECNAME	(added in stk_copyout)
 		 *	AT_NULL
 		 *
-		 * total == 9
+		 * total == 10
 		 */
 		if (hasdy && hasu) {
 			/*
@@ -445,7 +552,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 			 *
 			 * total = 5
 			 */
-			args->auxsize = (9 + 5) * sizeof (aux_entry_t);
+			args->auxsize = (10 + 5) * sizeof (aux_entry_t);
 		} else if (hasdy) {
 			/*
 			 * Has PT_INTERP but no PT_PHDR
@@ -455,9 +562,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 			 *
 			 * total = 2
 			 */
-			args->auxsize = (9 + 2) * sizeof (aux_entry_t);
+			args->auxsize = (10 + 2) * sizeof (aux_entry_t);
 		} else {
-			args->auxsize = 9 * sizeof (aux_entry_t);
+			args->auxsize = 10 * sizeof (aux_entry_t);
 		}
 	} else {
 		args->auxsize = 0;
@@ -470,13 +577,41 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	if (args->emulator != NULL)
 		args->auxsize += sizeof (aux_entry_t);
 
-	if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+	/*
+	 * If this is a native binary that's been given a modified interpreter
+	 * root, inform it that the native system exists at that root.
+	 */
+	if (args->brand_nroot != NULL) {
+		args->auxsize += sizeof (aux_entry_t);
+	}
+
+
+	/*
+	 * On supported kernels (64-bit, non-xpv) make room in the auxv for the
+	 * AT_SUN_COMMPAGE entry.
+	 */
+#if defined(__amd64) && !defined(__xpv)
+	args->auxsize += sizeof (aux_entry_t);
+#endif /* defined(__amd64) && !defined(__xpv) */
+
+	/*
+	 * If we have user credentials, we'll supply the following entries:
+	 *	AT_SUN_UID
+	 *	AT_SUN_RUID
+	 *	AT_SUN_GID
+	 *	AT_SUN_RGID
+	 */
+	if (cred != NULL) {
+		args->auxsize += 4 * sizeof (aux_entry_t);
+	}
+
+	if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
 		branded = 1;
 		/*
-		 * We will be adding 4 entries to the aux vectors.  One for
-		 * the the brandname and 3 for the brand specific aux vectors.
+		 * We will be adding 5 entries to the aux vectors.  One for
+		 * the brandname and 4 for the brand specific aux vectors.
 		 */
-		args->auxsize += 4 * sizeof (aux_entry_t);
+		args->auxsize += 5 * sizeof (aux_entry_t);
 	}
 
 	/* Hardware/Software capabilities */
@@ -507,7 +642,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	aux = bigwad->elfargs;
 	/*
 	 * Move args to the user's stack.
-	 * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
+	 * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM
+	 * aux entries.
 	 */
 	if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
 		if (error == -1) {
@@ -534,6 +670,14 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	    len, execsz, &brksize)) != 0)
 		goto bad;
 
+	if (uphdr != NULL) {
+		/*
+		 * Our uphdr has been dynamically allocated if (and only if)
+		 * its program header flags are clear.
+		 */
+		dynuphdr = (uphdr->p_flags == 0);
+	}
+
 	if (uphdr != NULL && dyphdr == NULL)
 		goto bad;
 
@@ -548,17 +692,22 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 		char		*p;
 		struct vnode	*nvp;
 
-		dlnsize = dyphdr->p_filesz;
+		dlnsize = dyphdr->p_filesz + nsize;
 
 		if (dlnsize > MAXPATHLEN || dlnsize <= 0)
 			goto bad;
 
+		if (nsize != 0) {
+			bcopy(args->brand_nroot, dlnp, nsize - 1);
+			dlnp[nsize - 1] = '/';
+		}
+
 		/*
 		 * Read in "interpreter" pathname.
 		 */
-		if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz,
-		    (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
-		    CRED(), &resid)) != 0) {
+		if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
+		    dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE,
+		    0, (rlim64_t)0, CRED(), &resid)) != 0) {
 			uprintf("%s: Cannot obtain interpreter pathname\n",
 			    exec_file);
 			goto bad;
@@ -703,9 +852,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 
 		dtrphdr = NULL;
 
-		error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
+		error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
 		    &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
 		    execsz, NULL);
+
 		if (error || junk != NULL) {
 			VN_RELE(nvp);
 			uprintf("%s: Cannot map %s\n", exec_file, dlnp);
@@ -732,9 +882,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 
 	if (hasauxv) {
 		int auxf = AF_SUN_HWCAPVERIFY;
+
 		/*
-		 * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
-		 * exec_args()
+		 * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were
+		 * filled in via exec_args()
 		 */
 		ADDAUX(aux, AT_BASE, voffset)
 		ADDAUX(aux, AT_FLAGS, at_flags)
@@ -762,7 +913,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 		 * malicious user within the zone from crafting a wrapper to
 		 * run native suid commands with unsecure libraries interposed.
 		 */
-		if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
+		if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
 		    (setid &= ~EXECSETID_SETID) != 0))
 			auxf &= ~AF_SUN_SETUGID;
 
@@ -775,6 +926,18 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 		    ((char *)&aux->a_type -
 		    (char *)bigwad->elfargs));
 		ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
+
+		/*
+		 * Record information about the real and effective user and
+		 * group IDs.
+		 */
+		if (cred != NULL) {
+			ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
+			ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
+			ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
+			ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
+		}
+
 		/*
 		 * Hardware capability flag word (performance hints)
 		 * Used for choosing faster library routines.
@@ -804,8 +967,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 			ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
 			ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
 			ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
+			ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
 		}
 
+		/*
+		 * Add the comm page auxv entry, mapping it in if needed.
+		 */
+#if defined(__amd64) && !defined(__xpv)
+		if (args->commpage != NULL ||
+		    (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
+			ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
+		}
+#endif /* defined(__amd64) && !defined(__xpv) */
+
 		ADDAUX(aux, AT_NULL, 0)
 		postfixsize = (char *)aux - (char *)bigwad->elfargs;
 
@@ -845,6 +1019,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
 	}
 
 	bzero(up->u_auxv, sizeof (up->u_auxv));
+	up->u_commpagep = args->commpage;
 	if (postfixsize) {
 		int num_auxv;
 
@@ -911,6 +1086,8 @@ bad:
 	if (error == 0)
 		error = ENOEXEC;
 out:
+	if (dynuphdr)
+		kmem_free(uphdr, sizeof (Phdr));
 	if (phdrbase != NULL)
 		kmem_free(phdrbase, phdrsize);
 	if (cap != NULL)
@@ -1177,6 +1354,29 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr,
 	return (0);
 }
 
+
+#ifdef _ELF32_COMPAT
+int
+elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+    caddr_t *phbasep, ssize_t *phsizep)
+#else
+int
+elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+    caddr_t *phbasep, ssize_t *phsizep)
+#endif
+{
+	int err, nshdrs, shstrndx;
+
+	/* Read the ELF header first; *nphdrs is consumed by the next call. */
+	err = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx, nphdrs);
+	if (err != 0)
+		return (err);
+
+	/* Then read the program headers: 0 on success, else its error. */
+	return (getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep, phsizep));
+}
+
+
 static int
 mapelfexec(
 	vnode_t *vp,
@@ -1197,7 +1397,7 @@ mapelfexec(
 	size_t *brksize)
 {
 	Phdr *phdr;
-	int i, prot, error;
+	int i, prot, error, lastprot = 0;
 	caddr_t addr = NULL;
 	size_t zfodsz;
 	int ptload = 0;
@@ -1205,43 +1405,78 @@ mapelfexec(
 	off_t offset;
 	int hsize = ehdr->e_phentsize;
 	caddr_t mintmp = (caddr_t)-1;
+	uintptr_t lastaddr = NULL;
 	extern int use_brk_lpg;
 
 	if (ehdr->e_type == ET_DYN) {
-		/*
-		 * Obtain the virtual address of a hole in the
-		 * address space to map the "interpreter".
-		 */
-		map_addr(&addr, len, (offset_t)0, 1, 0);
-		if (addr == NULL)
-			return (ENOMEM);
-		*voffset = (intptr_t)addr;
+		caddr_t vaddr;
 
 		/*
-		 * Calculate the minimum vaddr so it can be subtracted out.
-		 * According to the ELF specification, since PT_LOAD sections
-		 * must be sorted by increasing p_vaddr values, this is
-		 * guaranteed to be the first PT_LOAD section.
+		 * Despite the fact that mmapobj(2) refuses to load them, we
+		 * need to support executing ET_DYN objects that have a
+		 * non-NULL p_vaddr.  When found in the wild, these objects
+		 * are likely to be due to an old (and largely obviated) Linux
+		 * facility, prelink(8), that rewrites shared objects to
+		 * prefer specific (disjoint) virtual address ranges.  (Yes,
+		 * this is putatively for performance -- and yes, it has
+		 * limited applicability, many edge conditions and grisly
+		 * failure modes; even for Linux, it's insane.)  As ELF
+		 * mandates that the PT_LOAD segments be in p_vaddr order, we
+		 * find the lowest p_vaddr by finding the first PT_LOAD
+		 * segment.
 		 */
 		phdr = (Phdr *)phdrbase;
 		for (i = nphdrs; i > 0; i--) {
 			if (phdr->p_type == PT_LOAD) {
-				*voffset -= (uintptr_t)phdr->p_vaddr;
+				addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
 				break;
 			}
 			phdr = (Phdr *)((caddr_t)phdr + hsize);
 		}
 
+		/*
+		 * We have a non-zero p_vaddr in the first PT_LOAD segment --
+		 * presumably because we're directly executing a prelink(8)'d
+		 * ld-linux.so.  While we could correctly execute such an
+		 * object without locating it at its desired p_vaddr (it is,
+		 * after all, still relocatable), our inner antiquarian
+		 * derives a perverse pleasure in accommodating the steampunk
+		 * prelink(8) contraption -- goggles on!
+		 */
+		if ((vaddr = addr) != NULL) {
+			if (as_gap(curproc->p_as, len,
+			    &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) {
+				addr = NULL;
+			}
+		}
+
+		if (addr == NULL) {
+			/*
+			 * We either have a NULL p_vaddr (the common case, by
+			 * many orders of magnitude) or we have a non-NULL
+			 * p_vaddr and we were unable to obtain the specified
+			 * VA range (presumably because it's an illegal
+			 * address).  Either way, obtain an address in which
+			 * to map the interpreter.
+			 */
+			map_addr(&addr, len, (offset_t)0, 1, 0);
+			if (addr == NULL)
+				return (ENOMEM);
+		}
+
+		/*
+		 * Our voffset is the difference between where we landed and
+		 * where we wanted to be.
+		 */
+		*voffset = (uintptr_t)addr - (uintptr_t)vaddr;
 	} else {
 		*voffset = 0;
 	}
+
 	phdr = (Phdr *)phdrbase;
 	for (i = nphdrs; i > 0; i--) {
 		switch (phdr->p_type) {
 		case PT_LOAD:
-			if ((*dyphdr != NULL) && (*uphdr == NULL))
-				return (0);
-
 			ptload = 1;
 			prot = PROT_USER;
 			if (phdr->p_flags & PF_R)
@@ -1253,6 +1488,34 @@ mapelfexec(
 
 			addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
 
+			if ((*dyphdr != NULL) && uphdr != NULL &&
+			    (*uphdr == NULL)) {
+				/*
+				 * The PT_PHDR program header is, strictly
+				 * speaking, optional.  If we find that this
+				 * is missing, we will determine the location
+				 * of the program headers based on the address
+				 * of the lowest PT_LOAD segment (namely, this
+				 * one):  we subtract the p_offset to get to
+				 * the ELF header and then add back the program
+				 * header offset to get to the program headers.
+				 * We then cons up a Phdr that corresponds to
+				 * the (missing) PT_PHDR, setting the flags
+				 * to 0 to denote that this is artificial and
+				 * should (must) be freed by the caller.
+				 */
+				Phdr *cons;
+
+				cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
+
+				cons->p_flags = 0;
+				cons->p_type = PT_PHDR;
+				cons->p_vaddr = ((uintptr_t)addr -
+				    phdr->p_offset) + ehdr->e_phoff;
+
+				*uphdr = cons;
+			}
+
 			/*
 			 * Keep track of the segment with the lowest starting
 			 * address.
@@ -1260,6 +1523,41 @@ mapelfexec(
 			if (addr < mintmp)
 				mintmp = addr;
 
+			/*
+			 * Segments need not correspond to page boundaries:
+			 * they are permitted to share a page.  If two PT_LOAD
+			 * segments share the same page, and the permissions
+			 * of the segments differ, the behavior is historically
+			 * that the permissions of the latter segment are used
+			 * for the page that the two segments share.  This is
+			 * also historically a non-issue:  binaries generated
+			 * by most anything will make sure that two PT_LOAD
+			 * segments with differing permissions don't actually
+			 * share any pages.  However, there exist some crazy
+			 * things out there (including at least an obscure
+			 * Portuguese teaching language called G-Portugol) that
+			 * actually do the wrong thing and expect it to work:
+			 * they have a segment with execute permission share
+			 * a page with a subsequent segment that does not
+			 * have execute permissions and expect the resulting
+			 * shared page to in fact be executable.  To accommodate
+			 * such broken link editors, we take advantage of a
+			 * latitude explicitly granted to the loader:  it is
+			 * permitted to make _any_ PT_LOAD segment executable
+			 * (provided that it is readable or writable).  If we
+			 * see that we're sharing a page and that the previous
+			 * page was executable, we will add execute permissions
+			 * to our segment.
+			 */
+			if (btop(lastaddr) == btop((uintptr_t)addr) &&
+			    (phdr->p_flags & (PF_R | PF_W)) &&
+			    (lastprot & PROT_EXEC)) {
+				prot |= PROT_EXEC;
+			}
+
+			lastaddr = (uintptr_t)addr + phdr->p_filesz;
+			lastprot = prot;
+
 			zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
 
 			offset = phdr->p_offset;
@@ -1324,8 +1622,22 @@ mapelfexec(
 			break;
 
 		case PT_INTERP:
-			if (ptload)
-				goto bad;
+			/*
+			 * The ELF specification is unequivocal about the
+			 * PT_INTERP program header with respect to any PT_LOAD
+			 * program header:  "If it is present, it must precede
+			 * any loadable segment entry." Linux, however, makes
+			 * no attempt to enforce this -- which has allowed some
+			 * binary editing tools to get away with generating
+			 * invalid ELF binaries in the respect that PT_INTERP
+			 * occurs after the first PT_LOAD program header.  This
+			 * is unfortunate (and of course, disappointing) but
+			 * it's no worse than that: there is no reason that we
+			 * can't process the PT_INTERP entry (if present) after
+			 * one or more PT_LOAD entries.  We therefore
+			 * deliberately do not check ptload here and always
+			 * store dyphdr to be the PT_INTERP program header.
+			 */
 			*dyphdr = phdr;
 			break;
 
@@ -1334,9 +1646,12 @@ mapelfexec(
 			break;
 
 		case PT_PHDR:
-			if (ptload)
+			if (ptload || phdr->p_flags == 0)
 				goto bad;
-			*uphdr = phdr;
+
+			if (uphdr != NULL)
+				*uphdr = phdr;
+
 			break;
 
 		case PT_NULL:
@@ -2185,7 +2500,7 @@ static struct modlexec modlexec = {
 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
 			intpdata_t *idatap, int level, long *execsz,
 			int setid, caddr_t exec_file, cred_t *cred,
-			int brand_action);
+			int *brand_action);
 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
 			rlim64_t rlimit, int sig, core_content_t content);
 
diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c
index 269ba86b1b..512cab2b66 100644
--- a/usr/src/uts/common/exec/intp/intp.c
+++ b/usr/src/uts/common/exec/intp/intp.c
@@ -22,6 +22,7 @@
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  * Copyright 2012 Milan Jurik. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*	Copyright (c) 1988 AT&T	*/
@@ -47,6 +48,7 @@
 #include <sys/kmem.h>
 #include <sys/note.h>
 #include <sys/sdt.h>
+#include <sys/brand.h>
 
 /*
  * This is the loadable module wrapper.
@@ -54,7 +56,7 @@
 #include <sys/modctl.h>
 
 extern int intpexec(struct vnode *, struct execa *, struct uarg *,
-    struct intpdata *, int, long *, int, caddr_t, struct cred *, int);
+    struct intpdata *, int, long *, int, caddr_t, struct cred *, int *);
 
 static struct execsw esw = {
 	intpmagicstr,
@@ -126,13 +128,20 @@ getintphead(struct vnode *vp, struct intpdata *idatap)
 	*cp = '\0';
 
 	/*
-	 * Locate the beginning and end of the interpreter name.
-	 * In addition to the name, one additional argument may
-	 * optionally be included here, to be prepended to the
-	 * arguments provided on the command line.  Thus, for
-	 * example, you can say
+	 * Locate the beginning and end of the interpreter name. Historically,
+	 * for illumos and its predecessors, in addition to the name, one
+	 * additional argument may optionally be included here, to be prepended
+	 * to the arguments provided on the command line. Thus, for example,
+	 * you can say
 	 *
 	 * 	#! /usr/bin/awk -f
+	 *
+	 * However, handling of interpreter arguments varies across operating
+	 * systems and other systems allow more than one argument. In
+	 * particular, Linux allows more than one and delivers all arguments
+	 * as a single string (argv[1] is "-arg1 -arg2 ..."). We support this
+	 * style of argument handling as a brand-specific option (setting
+	 * b_intp_parse_arg to B_FALSE).
 	 */
 	for (cp = &linep[2]; *cp == ' '; cp++)
 		;
@@ -151,9 +160,12 @@ getintphead(struct vnode *vp, struct intpdata *idatap)
 			idatap->intp_arg[0] = NULL;
 		else {
 			idatap->intp_arg[0] = cp;
-			while (*cp && *cp != ' ')
-				cp++;
-			*cp = '\0';
+			if (!PROC_IS_BRANDED(curproc) ||
+			    BROP(curproc)->b_intp_parse_arg) {
+				while (*cp && *cp != ' ')
+					cp++;
+				*cp = '\0';
+			}
 		}
 	}
 	return (0);
@@ -188,9 +200,8 @@ intpexec(
 	int setid,
 	caddr_t exec_file,
 	struct cred *cred,
-	int brand_action)
+	int *brand_action)
 {
-	_NOTE(ARGUNUSED(brand_action))
 	vnode_t *nvp;
 	int error = 0;
 	struct intpdata idata;
@@ -281,7 +292,7 @@ intpexec(
 	}
 
 	error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred,
-	    EBA_NONE);
+	    brand_action);
 
 	if (!error) {
 		/*
diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c
index fdc327dcbb..5170fda5cb 100644
--- a/usr/src/uts/common/exec/java/java.c
+++ b/usr/src/uts/common/exec/java/java.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*
@@ -85,7 +86,7 @@ char *jexec_arg = "-jar";
 static int
 javaexec(vnode_t *vp, struct execa *uap, struct uarg *args,
     struct intpdata *idatap, int level, long *execsz, int setid,
-    caddr_t execfile, cred_t *cred, int brand_action)
+    caddr_t execfile, cred_t *cred, int *brand_action)
 {
 	struct intpdata idata;
 	int error;
diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c
index ee5060a07e..016d87b9ef 100644
--- a/usr/src/uts/common/exec/shbin/shbin.c
+++ b/usr/src/uts/common/exec/shbin/shbin.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -58,7 +59,7 @@ shbinexec(
 	int setid,
 	caddr_t exec_file,
 	struct cred *cred,
-	int brand_action);
+	int *brand_action);
 
 #define	SHBIN_CNTL(x)	((x)&037)
 #define	SHBINMAGIC_LEN	4
@@ -162,7 +163,7 @@ shbinexec(
 	int setid,
 	caddr_t exec_file,
 	struct cred *cred,
-	int brand_action)
+	int *brand_action)
 {
 	_NOTE(ARGUNUSED(brand_action))
 	vnode_t *nvp;
diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c
index 4eaf38f484..41441ec52d 100644
--- a/usr/src/uts/common/fs/dev/sdev_netops.c
+++ b/usr/src/uts/common/fs/dev/sdev_netops.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -41,8 +42,102 @@
 #include <sys/zone.h>
 #include <sys/dls.h>
 
+static const char *devnet_zpath = "/dev/net/zone/";
 struct vnodeops		*devnet_vnodeops;
 
+static zoneid_t
+devnet_nodetozone(sdev_node_t *dv)
+{
+	char *zname = NULL, *dup;
+	zone_t *zone;
+	int duplen;
+	zoneid_t zid;
+
+	/*
+	 * If in a non-global zone, always return its zid no matter what the
+	 * node is.
+	 */
+	zid = getzoneid();
+	if (zid != GLOBAL_ZONEID)
+		return (zid);
+
+	/*
+	 * If it doesn't have /dev/net/zone/ then it can't be a specific zone
+	 * we're targeting.
+	 */
+	if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0)
+		return (GLOBAL_ZONEID);
+
+	if (dv->sdev_vnode->v_type == VDIR) {
+		zone = zone_find_by_name(dv->sdev_name);
+	} else {
+		/* Non directories have the form /dev/net/zone/%z/%s */
+		dup = strdup(dv->sdev_path);
+		duplen = strlen(dup);
+		/* Drop the trailing /%s so %z is the last component */
+		zname = strrchr(dup, '/');
+		*zname = '\0';
+		zname = strrchr(dup, '/');
+		zname++;
+		zone = zone_find_by_name(zname);
+		kmem_free(dup, duplen + 1);
+	}
+	if (zone == NULL)
+		return (GLOBAL_ZONEID);
+	zid = zone->zone_id;
+	zone_rele(zone);
+	return (zid);
+}
+
+static int
+devnet_mkdir(struct sdev_node *ddv, char *name)
+{
+	sdev_node_t *dv;
+	struct vattr va;
+	int ret;
+
+	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+	dv = sdev_cache_lookup(ddv, name);
+	if (dv != NULL) {
+		SDEV_SIMPLE_RELE(dv);
+		return (EEXIST);
+	}
+
+	va = *sdev_getdefault_attr(VDIR);
+	gethrestime(&va.va_atime);
+	va.va_mtime = va.va_atime;
+	va.va_ctime = va.va_atime;
+
+	ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY);
+	if (ret != 0)
+		return (ret);
+	SDEV_SIMPLE_RELE(dv);
+	return (0);
+}
+
+/*
+ * We basically need to walk down the directory path to determine what we should
+ * do. At the top level of /dev/net, only the directory /dev/net/zone is valid,
+ * and it is always valid. Following on that, /dev/net/zone/%zonename is valid
+ * if and only if we can look up that zone name. If it's not, or it's some other
+ * name, then it's SDEV_VTOR_INVALID.
+ */
+static int
+devnet_dirvalidate(struct sdev_node *dv)
+{
+	zone_t *zonep;
+	char *path = "/dev/net/zone";
+
+	if (strcmp(path, dv->sdev_path) == 0)
+		return (SDEV_VTOR_VALID);
+
+	zonep = zone_find_by_name(dv->sdev_name);
+	if (zonep == NULL)
+		return (SDEV_VTOR_INVALID);
+	zone_rele(zonep);
+	return (SDEV_VTOR_VALID);
+}
+
 /*
  * Check if a net sdev_node is still valid - i.e. it represents a current
  * network link.
@@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv)
 
 	ASSERT(dv->sdev_state == SDEV_READY);
 
-	if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0)
+	if (dv->sdev_vnode->v_type == VDIR)
+		return (devnet_dirvalidate(dv));
+
+	if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) {
+		ASSERT(SDEV_IS_GLOBAL(dv));
+		zoneid = devnet_nodetozone(dv);
+	} else {
+		zoneid = getzoneid();
+	}
+
+	if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0)
 		return (SDEV_VTOR_INVALID);
-	if (SDEV_IS_GLOBAL(dv))
+	if (zoneid == GLOBAL_ZONEID)
 		return (SDEV_VTOR_VALID);
-	zoneid = getzoneid();
 	return (zone_check_datalink(&zoneid, linkid) == 0 ?
 	    SDEV_VTOR_VALID : SDEV_VTOR_INVALID);
 }
@@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv)
  * a net entry when the node is not found in the cache.
  */
 static int
-devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp)
+devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp,
+    zoneid_t zid)
 {
 	timestruc_t now;
 	dev_t dev;
 	int error;
 
-	if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) {
+	if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) {
 		sdcmn_err12(("devnet_create_rvp: not a valid vanity name "
 		    "network node: %s\n", nm));
 		return (error);
@@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 	struct sdev_node *ddv = VTOSDEV(dvp);
 	struct sdev_node *dv = NULL;
 	dls_dl_handle_t ddh = NULL;
+	zone_t *zone;
 	struct vattr vattr;
 	int nmlen;
 	int error = ENOENT;
@@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 	if (SDEVTOV(ddv)->v_type != VDIR)
 		return (ENOTDIR);
 
+	if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID)
+		return (EPERM);
+
 	/*
 	 * Empty name or ., return node itself.
 	 */
@@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 	rw_enter(&ddv->sdev_contents, RW_WRITER);
 
 	/*
+	 * ZOMBIED parent does not allow new node creation, bail out early.
+	 */
+	if (ddv->sdev_state == SDEV_ZOMBIE)
+		goto failed;
+
+	/*
 	 * directory cache lookup:
 	 */
 	if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) {
@@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 			goto found;
 	}
 
+	if (SDEV_IS_GLOBAL(ddv)) {
+		/*
+		 * Check for /dev/net/zone
+		 */
+		if (strcmp("zone", nm) == 0 && strcmp("/dev/net",
+		    ddv->sdev_path) == 0) {
+			(void) devnet_mkdir(ddv, nm);
+			dv = sdev_cache_lookup(ddv, nm);
+			ASSERT(dv != NULL);
+			goto found;
+		}
+
+		/*
+		 * Check for /dev/net/zone/%z. We can't use devnet_zpath due to
+		 * its trailing slash.
+		 */
+		if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) {
+			zone = zone_find_by_name(nm);
+			if (zone == NULL)
+				goto failed;
+			(void) devnet_mkdir(ddv, nm);
+			zone_rele(zone);
+			dv = sdev_cache_lookup(ddv, nm);
+			ASSERT(dv != NULL);
+			goto found;
+		}
+	} else if (strcmp("/dev/net", ddv->sdev_path) != 0) {
+		goto failed;
+	}
+
 	/*
-	 * ZOMBIED parent does not allow new node creation, bail out early.
+	 * We didn't find what we were looking for. What that is depends a lot
+	 * on what directory we're in.
 	 */
-	if (ddv->sdev_state == SDEV_ZOMBIE)
-		goto failed;
 
-	error = devnet_create_rvp(nm, &vattr, &ddh);
+	error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv));
 	if (error != 0)
 		goto failed;
 
@@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg)
 	if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL)
 		goto found;
 
-	if (devnet_create_rvp(link, &vattr, &ddh) != 0)
+	if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0)
 		return (0);
 
 	ASSERT(ddh != NULL);
@@ -244,16 +388,77 @@ found:
 	return (0);
 }
 
+/*
+ * Fill in all the entries for the specified zone.
+ */
 static void
-devnet_filldir(struct sdev_node *ddv)
+devnet_fillzone(struct sdev_node *ddv, zoneid_t zid)
 {
-	sdev_node_t	*dv, *next;
 	datalink_id_t	linkid;
 
+	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+	if (zid == GLOBAL_ZONEID) {
+		ASSERT(SDEV_IS_GLOBAL(ddv));
+		linkid = DATALINK_INVALID_LINKID;
+		do {
+			linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
+			    DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
+			if (linkid != DATALINK_INVALID_LINKID)
+				(void) devnet_filldir_datalink(linkid, ddv);
+		} while (linkid != DATALINK_INVALID_LINKID);
+	} else {
+		(void) zone_datalink_walk(zid,  devnet_filldir_datalink, ddv);
+	}
+}
+
+/*
+ * Callback for zone_walk when filling up /dev/net/zone/...
+ */
+static int
+devnet_fillzdir_cb(zone_t *zonep, void *arg)
+{
+	sdev_node_t *ddv = arg;
+
+	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+	(void) devnet_mkdir(ddv, zonep->zone_name);
+	return (0);
+}
+
+/*
+ * Fill in a directory that isn't the top level /dev/net.
+ */
+static void
+devnet_fillzdir(struct sdev_node *ddv)
+{
+	zone_t *zonep;
+	char *path = "/dev/net/zone";
+
+	if (strcmp(path, ddv->sdev_path) == 0) {
+		(void) zone_walk(devnet_fillzdir_cb, ddv);
+		return;
+	}
+
+	zonep = zone_find_by_name(ddv->sdev_name);
+	if (zonep == NULL)
+		return;
+	devnet_fillzone(ddv, zonep->zone_id);
+	zone_rele(zonep);
+}
+
+static void
+devnet_filldir(struct sdev_node *ddv)
+{
+	int ret;
+	sdev_node_t *dv, *next;
+
 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
 	if (rw_tryupgrade(&ddv->sdev_contents) == NULL) {
 		rw_exit(&ddv->sdev_contents);
 		rw_enter(&ddv->sdev_contents, RW_WRITER);
+		if (ddv->sdev_state == SDEV_ZOMBIE) {
+			/* Callers expect to still hold the read lock. */
+			goto done;
+		}
 	}
 
 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
@@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv)
 
 		if (SDEVTOV(dv)->v_count > 0)
 			continue;
+
 		SDEV_HOLD(dv);
+
+		/*
+		 * If the stale node is a directory, empty it before removing it.
+		 */
+		if (SDEVTOV(dv)->v_type == VDIR) {
+			ret = sdev_cleandir(dv, NULL, 0);
+			ASSERT(ret == 0);
+		}
 		/* remove the cache node */
 		(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
 		    SDEV_CACHE_DELETE);
 		SDEV_RELE(dv);
 	}
 
+	if (strcmp(ddv->sdev_path, "/dev/net") != 0) {
+		devnet_fillzdir(ddv);
+		goto done;
+	}
+
 	if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild())
 		goto done;
 
 	if (SDEV_IS_GLOBAL(ddv)) {
-		linkid = DATALINK_INVALID_LINKID;
-		do {
-			linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
-			    DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
-			if (linkid != DATALINK_INVALID_LINKID)
-				(void) devnet_filldir_datalink(linkid, ddv);
-		} while (linkid != DATALINK_INVALID_LINKID);
+		devnet_fillzone(ddv, GLOBAL_ZONEID);
+		(void) devnet_mkdir(ddv, "zone");
 	} else {
-		(void) zone_datalink_walk(getzoneid(),
-		    devnet_filldir_datalink, ddv);
+		devnet_fillzone(ddv, getzoneid());
 	}
 
 	ddv->sdev_flags &= ~SDEV_BUILD;
-
 done:
 	rw_downgrade(&ddv->sdev_contents);
 }
@@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
 
 	ASSERT(sdvp);
 
+	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+		return (EPERM);
+
 	if (uiop->uio_offset == 0)
 		devnet_filldir(sdvp);
 
diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c
new file mode 100644
index 0000000000..885191175f
--- /dev/null
+++ b/usr/src/uts/common/fs/dev/sdev_plugin.c
@@ -0,0 +1,913 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Dynamic directory plugin interface for sdev.
+ *
+ * The sdev plugin interface provides a means for a dynamic directory based on
+ * in-kernel state to be simply created. Traditionally, dynamic directories were
+ * built into sdev itself. While these legacy plugins are useful, it makes more
+ * sense for these pieces of functionality to live with the individual drivers.
+ *
+ * The plugin interface requires folks to implement three interfaces and
+ * provides a series of callbacks that can be made in the context of those
+ * interfaces to interrogate the sdev_node_t without having to leak
+ * implementation details of the sdev_node_t. These interfaces are:
+ *
+ *   o spo_validate
+ *
+ *   Given a particular node, answer the question as to whether or not this
+ *   entry is still valid. Here, plugins should use the name and the dev_t
+ *   associated with the node to verify that it matches something that still
+ *   exists.
+ *
+ *   o spo_filldir
+ *
+ *   Fill all the entries inside of a directory. Note that some of these entries
+ *   may already exist.
+ *
+ *   o spo_inactive
+ *
+ *   The given node is no longer being used. This allows the consumer to
+ *   potentially tear down anything that was being held open related to this.
+ *   Note that this only fires when the given sdev_node_t becomes a zombie.
+ *
+ * During these callbacks a consumer is not allowed to register or unregister a
+ * plugin, especially their own. They may call the sdev_ctx style functions. All
+ * callbacks fire in a context where blocking is allowed (eg. the spl is below
+ * LOCK_LEVEL).
+ *
+ * When a plugin is added, we create its directory in the global zone. By doing
+ * that, we ensure that something isn't already there and that nothing else can
+ * come along and try and create something without our knowledge. We only have
+ * to create it in the GZ and not for all other instances of sdev because an
+ * instance of sdev that isn't at /dev does not have dynamic directories, and
+ * second, any instance of sdev present in a non-global zone cannot create
+ * anything, therefore we know that by it not being in the global zone's
+ * instance of sdev that we're good to go.
+ *
+ * Lock Ordering
+ * -------------
+ *
+ * The global sdev_plugin_lock must be held before any of the individual
+ * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
+ * it is not legal to take any holds on any sdev_node_t or to grab the
+ * sdev_node_t`contents_lock in any way.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fs/sdev_impl.h>
+#include <sys/fs/sdev_plugin.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+
+kmutex_t sdev_plugin_lock;
+list_t sdev_plugin_list;
+kmem_cache_t *sdev_plugin_cache;
+struct vnodeops *sdev_plugin_vnops;
+
+#define	SDEV_PLUGIN_NAMELEN	64
+
+typedef struct sdev_plugin {
+	list_node_t sp_link;
+	char sp_name[SDEV_PLUGIN_NAMELEN];	/* E */
+	int sp_nflags;				/* E */
+	struct vnodeops *sp_vnops;		/* E */
+	sdev_plugin_ops_t *sp_pops;		/* E */
+	boolean_t sp_islegacy;			/* E */
+	int (*sp_lvtor)(sdev_node_t *);		/* E */
+	kmutex_t sp_lock;			/* Protects everything below */
+	kcondvar_t sp_nodecv;
+	size_t sp_nnodes;
+} sdev_plugin_t;
+
+/* ARGSUSED */
+static int
+sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
+{
+	sdev_plugin_t *spp = buf;
+	mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0);
+	cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL);
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+sdev_plugin_cache_destructor(void *buf, void *arg)
+{
+	sdev_plugin_t *spp = buf;
+	cv_destroy(&spp->sp_nodecv);
+	mutex_destroy(&spp->sp_lock);
+}
+
+enum vtype
+sdev_ctx_vtype(sdev_ctx_t ctx)
+{
+	sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+	return (sdp->sdev_vnode->v_type);
+}
+
+const char *
+sdev_ctx_path(sdev_ctx_t ctx)
+{
+	sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+	return (sdp->sdev_path);
+}
+
+const char *
+sdev_ctx_name(sdev_ctx_t ctx)
+{
+	sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+	return (sdp->sdev_name);
+}
+
+/*
+ * Currently we only support passing through a single flag -- SDEV_IS_GLOBAL.
+ */
+sdev_ctx_flags_t
+sdev_ctx_flags(sdev_ctx_t ctx)
+{
+	sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+	return (sdp->sdev_flags & SDEV_GLOBAL);
+}
+
+/*
+ * Return some amount of private data specific to the vtype. In the case of a
+ * character or block device this is the device number.
+ */
+const void *
+sdev_ctx_vtype_data(sdev_ctx_t ctx)
+{
+	sdev_node_t *sdp = (sdev_node_t *)ctx;
+	void *ret;
+
+	ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+	switch (sdp->sdev_vnode->v_type) {
+	case VCHR:
+	case VBLK:
+		ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev);
+		break;
+	default:
+		ret = NULL;
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * Use the same rules as zones for a name: non-empty, NUL-terminated within
+ * buflen, isalnum + '-', '_', and '.'. Returns 1 if valid, 0 otherwise.
+ */
+static int
+sdev_plugin_name_isvalid(const char *c, int buflen)
+{
+	int i;
+
+	for (i = 0; i < buflen; i++, c++) {
+		if (*c == '\0')
+			return (i != 0);	/* reject the empty name */
+		if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.')
+			return (0);
+	}
+	/* Never found a null terminator */
+	return (0);
+}
+
+static int
+sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
+    vattr_t *vap)
+{
+	int ret;
+	sdev_node_t *svp;
+
+	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+	ASSERT(spp != NULL);
+	svp = sdev_cache_lookup(sdvp, name);
+	if (svp != NULL) {
+		SDEV_SIMPLE_RELE(svp);
+		return (EEXIST);
+	}
+
+	ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred,
+	    SDEV_READY);
+	if (ret != 0)
+		return (ret);
+	SDEV_SIMPLE_RELE(svp);
+
+	return (0);
+}
+
+/*
+ * Plugin node creation callbacks
+ */
+int
+sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
+{
+	sdev_node_t *sdvp;
+	timestruc_t now;
+	struct vattr vap;
+
+	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+		return (EINVAL);
+
+	sdvp = (sdev_node_t *)ctx;
+	ASSERT(sdvp->sdev_private != NULL);
+	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+	vap = *sdev_getdefault_attr(VDIR);
+	gethrestime(&now);
+	vap.va_atime = now;
+	vap.va_mtime = now;
+	vap.va_ctime = now;
+
+	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+}
+
+int
+sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
+{
+	sdev_node_t *sdvp;
+	timestruc_t now;
+	struct vattr vap;
+
+	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+		return (EINVAL);
+
+	sdvp = (sdev_node_t *)ctx;
+	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+	if (mode != S_IFCHR && mode != S_IFBLK)
+		return (EINVAL);
+
+	ASSERT(sdvp->sdev_private != NULL);
+
+	vap = *sdev_getdefault_attr(mode == S_IFCHR ? VCHR : VBLK);
+	gethrestime(&now);
+	vap.va_atime = now;
+	vap.va_mtime = now;
+	vap.va_ctime = now;
+	vap.va_rdev = dev;
+	vap.va_mode = mode | 0666;
+
+	/* Despite the similar name, this is in fact a different function */
+	return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+
+}
+
+static int
+sdev_plugin_validate(sdev_node_t *sdp)
+{
+	int ret;
+	sdev_plugin_t *spp;
+
+	ASSERT(sdp->sdev_private != NULL);
+	spp = sdp->sdev_private;
+	ASSERT(spp->sp_islegacy == B_FALSE);
+	ASSERT(spp->sp_pops != NULL);
+	rw_enter(&sdp->sdev_contents, RW_READER);
+	ret = spp->sp_pops->spo_validate((uintptr_t)sdp);
+	rw_exit(&sdp->sdev_contents);
+	return (ret);
+}
+
+static void
+sdev_plugin_validate_dir(sdev_node_t *sdvp)
+{
+	int ret;
+	sdev_node_t *svp, *next;
+
+	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+	for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {
+
+		next = SDEV_NEXT_ENTRY(sdvp, svp);
+		ASSERT(svp->sdev_state != SDEV_ZOMBIE);
+		/* skip nodes that aren't ready */
+		if (svp->sdev_state == SDEV_INIT)
+			continue;
+
+		switch (sdev_plugin_validate(svp)) {
+		case SDEV_VTOR_VALID:
+		case SDEV_VTOR_SKIP:
+			continue;
+		case SDEV_VTOR_INVALID:
+		case SDEV_VTOR_STALE:
+			break;
+		}
+
+		SDEV_HOLD(svp);
+
+		/*
+		 * Clean out everything underneath this node before we
+		 * remove it.
+		 */
+		if (svp->sdev_vnode->v_type == VDIR) {
+			ret = sdev_cleandir(svp, NULL, 0);
+			ASSERT(ret == 0);
+		}
+		/* remove the cache node */
+		(void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
+		    SDEV_CACHE_DELETE);
+		SDEV_RELE(svp);
+	}
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
+    int *eofp, caller_context_t *ct_unused, int flags_unused)
+{
+	int ret;
+	sdev_node_t *sdvp = VTOSDEV(dvp);
+	sdev_plugin_t *spp;
+
+	ASSERT(RW_READ_HELD(&sdvp->sdev_contents));
+
+	/* Sanity check we're not a zombie before we do anything else */
+	if (sdvp->sdev_state == SDEV_ZOMBIE)
+		return (ENOENT);
+
+	spp = sdvp->sdev_private;
+	ASSERT(spp != NULL);
+	ASSERT(spp->sp_islegacy == B_FALSE);
+	ASSERT(spp->sp_pops != NULL);
+
+	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+		return (EPERM);
+
+	if (uiop->uio_offset == 0) {
+		/*
+		 * We upgrade to a write lock and grab the plugin's lock along
+		 * the way. We're almost certainly going to get creation
+		 * callbacks, so this is the only safe way to go.
+		 */
+		if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
+			rw_exit(&sdvp->sdev_contents);
+			rw_enter(&sdvp->sdev_contents, RW_WRITER);
+			if (sdvp->sdev_state == SDEV_ZOMBIE) {
+				rw_downgrade(&sdvp->sdev_contents);
+				return (ENOENT);
+			}
+		}
+
+		sdev_plugin_validate_dir(sdvp);
+		ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+		rw_downgrade(&sdvp->sdev_contents);
+		if (ret != 0)
+			return (ret);
+	}
+
+	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+}
+
+/*
+ * If we don't have a callback function that returns a failure, then sdev will
+ * try to create a node for us which violates all of our basic assertions. To
+ * work around that we create our own callback for devname_lookup_func which
+ * always returns ENOENT as at this point either it was created with the filldir
+ * callback or it was not.
+ */
+/*ARGSUSED*/
+static int
+sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
+    void *unused, char *unused2)
+{
+	return (ENOENT);
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+	int ret;
+	sdev_node_t *sdvp;
+	sdev_plugin_t *spp;
+
+	/* execute access is required to search the directory */
+	if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+		return (ret);
+
+	sdvp = VTOSDEV(dvp);
+	spp = sdvp->sdev_private;
+	ASSERT(spp != NULL);
+	ASSERT(spp->sp_islegacy == B_FALSE);
+	ASSERT(spp->sp_pops != NULL);
+
+	if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+		return (EPERM);
+
+	/*
+	 * Go straight for the write lock.
+	 */
+	rw_enter(&sdvp->sdev_contents, RW_WRITER);
+	if (sdvp->sdev_state == SDEV_ZOMBIE) {
+		rw_exit(&sdvp->sdev_contents);
+		return (ENOENT);
+	}
+	sdev_plugin_validate_dir(sdvp);
+	ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+	rw_exit(&sdvp->sdev_contents);
+	if (ret != 0)
+		return (ret);
+
+	return (devname_lookup_func(sdvp, nm, vpp, cred,
+	    sdev_plugin_vop_lookup_cb, SDEV_VATTR));
+}
+
+/*
+ * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes
+ * to zero, but isn't necessarily a zombie yet. As such, to make things easier
+ * for users, we only fire the inactive callback when the node becomes a zombie
+ * and thus will be torn down here.
+ */
+static void
+sdev_plugin_vop_inactive_cb(struct vnode *dvp)
+{
+	sdev_node_t *sdp = VTOSDEV(dvp);
+	sdev_plugin_t *spp = sdp->sdev_private;
+
+	rw_enter(&sdp->sdev_contents, RW_READER);
+	if (sdp->sdev_state != SDEV_ZOMBIE) {
+		rw_exit(&sdp->sdev_contents);
+		return;
+	}
+	spp->sp_pops->spo_inactive((uintptr_t)sdp);
+	mutex_enter(&spp->sp_lock);
+	VERIFY(spp->sp_nnodes > 0);
+	spp->sp_nnodes--;
+	cv_signal(&spp->sp_nodecv);
+	mutex_exit(&spp->sp_lock);
+	rw_exit(&sdp->sdev_contents);
+}
+
+/*ARGSUSED*/
+static void
+sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
+    caller_context_t *ct)
+{
+	sdev_node_t *sdp = VTOSDEV(dvp);
+	sdev_plugin_t *spp = sdp->sdev_private;
+	ASSERT(sdp->sdev_private != NULL);
+	ASSERT(spp->sp_islegacy == B_FALSE);
+	devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
+}
+
+const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
+	VOPNAME_READDIR,	{ .vop_readdir = sdev_plugin_vop_readdir },
+	VOPNAME_LOOKUP,		{ .vop_lookup = sdev_plugin_vop_lookup },
+	VOPNAME_INACTIVE,	{ .vop_inactive = sdev_plugin_vop_inactive },
+	VOPNAME_CREATE,		{ .error = fs_nosys },
+	VOPNAME_REMOVE,		{ .error = fs_nosys },
+	VOPNAME_MKDIR,		{ .error = fs_nosys },
+	VOPNAME_RMDIR,		{ .error = fs_nosys },
+	VOPNAME_SYMLINK,	{ .error = fs_nosys },
+	VOPNAME_SETSECATTR,	{ .error = fs_nosys },
+	NULL,			NULL
+};
+
+/*
+ * construct a new template with overrides from vtab
+ */
+static fs_operation_def_t *
+sdev_merge_vtab(const fs_operation_def_t tab[])
+{
+	fs_operation_def_t *new;
+	const fs_operation_def_t *tab_entry;
+
+	/* make a copy of standard vnode ops table */
+	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
+	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
+
+	/* replace the overrides from tab */
+	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
+		fs_operation_def_t *std_entry = new;
+		while (std_entry->name) {
+			if (strcmp(tab_entry->name, std_entry->name) == 0) {
+				std_entry->func = tab_entry->func;
+				break;
+			}
+			std_entry++;
+		}
+	}
+
+	return (new);
+}
+
+/* free memory allocated by sdev_merge_vtab */
+static void
+sdev_free_vtab(fs_operation_def_t *new)
+{
+	kmem_free(new, sdev_vnodeops_tbl_size);
+}
+
+/*
+ * Register a new plugin.
+ */
+sdev_plugin_hdl_t
+sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
+{
+	int ret, err;
+	sdev_plugin_t *spp, *iter;
+	vnode_t *vp, *nvp;
+	sdev_node_t *sdp, *slp;
+	timestruc_t now;
+	struct vattr vap;
+
+	/*
+	 * Some consumers don't care about why they failed. To keep the code
+	 * simple, we'll just pretend they gave us something.
+	 */
+	if (errp == NULL)
+		errp = &err;
+
+	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	if (ops->spo_version != 1) {
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
+	    ops->spo_inactive == NULL) {
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+	(void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);
+
+	spp->sp_pops = ops;
+	spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
+	if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
+		spp->sp_nflags |= SDEV_NO_NCACHE;
+	if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
+		spp->sp_nflags |= SDEV_SUBDIR;
+	spp->sp_vnops = sdev_plugin_vnops;
+	spp->sp_islegacy = B_FALSE;
+	spp->sp_lvtor = NULL;
+	spp->sp_nnodes = 0;
+
+	/*
+	 * Make sure it's unique, nothing exists with this name already, and add
+	 * it to the list. We also need to go through and grab the sdev
+	 * root node as we cannot grab any sdev node locks once we've grabbed
+	 * the sdev_plugin_lock. We effectively assert that if a directory is
+	 * not present in the GZ's /dev, then it doesn't exist in any of the
+	 * local zones.
+	 */
+	ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1);
+	if (ret != 0) {
+		*errp = ret;
+		kmem_cache_free(sdev_plugin_cache, spp);
+		return (NULL);
+	}
+	/* Make sure we have the real vnode */
+	if (VOP_REALVP(vp, &nvp, NULL) == 0) {
+		VN_HOLD(nvp);
+		VN_RELE(vp);
+		vp = nvp;
+		nvp = NULL;
+	}
+	VERIFY(vp->v_op == sdev_vnodeops);
+	sdp = VTOSDEV(vp);
+	rw_enter(&sdp->sdev_contents, RW_WRITER);
+	slp = sdev_cache_lookup(sdp, spp->sp_name);
+	if (slp != NULL) {
+		SDEV_RELE(slp);
+		rw_exit(&sdp->sdev_contents);
+		VN_RELE(vp);
+		*errp = EEXIST;
+		kmem_cache_free(sdev_plugin_cache, spp);
+		return (NULL);
+	}
+
+	mutex_enter(&sdev_plugin_lock);
+	for (iter = list_head(&sdev_plugin_list); iter != NULL;
+	    iter = list_next(&sdev_plugin_list, iter)) {
+		if (strcmp(spp->sp_name, iter->sp_name) == 0) {
+			mutex_exit(&sdev_plugin_lock);
+			rw_exit(&sdp->sdev_contents);
+			VN_RELE(vp);
+			*errp = EEXIST;
+			kmem_cache_free(sdev_plugin_cache, spp);
+			return (NULL);
+		}
+	}
+
+	list_insert_tail(&sdev_plugin_list, spp);
+	mutex_exit(&sdev_plugin_lock);
+
+	/*
+	 * Now go ahead and create the top level directory for the global zone.
+	 */
+	vap = *sdev_getdefault_attr(VDIR);
+	gethrestime(&now);
+	vap.va_atime = now;
+	vap.va_mtime = now;
+	vap.va_ctime = now;
+
+	(void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);
+
+	rw_exit(&sdp->sdev_contents);
+	VN_RELE(vp);
+
+	return ((sdev_plugin_hdl_t)spp);
+}
+
+static void
+sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
+{
+	sdev_plugin_t *spp = arg;
+	sdev_node_t *sdp;
+
+	rw_enter(&rdp->sdev_contents, RW_WRITER);
+	sdp = sdev_cache_lookup(rdp, spp->sp_name);
+	/* If it doesn't exist, we're done here */
+	if (sdp == NULL) {
+		rw_exit(&rdp->sdev_contents);
+		return;
+	}
+
+	/*
+	 * We first delete the directory before recursively marking everything
+	 * else stale. This ordering should ensure that we don't accidentally
+	 * miss anything.
+	 */
+	sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
+	sdev_stale(sdp);
+	SDEV_RELE(sdp);
+	rw_exit(&rdp->sdev_contents);
+}
+
+/*
+ * Remove a plugin. This will block until every node has become a zombie, thus
+ * guaranteeing the caller that nothing will call into the plugin again once
+ * this call returns. While the call is in progress, the plugin may still be
+ * called into. Note that while this is in progress, it blocks other mounts.
+ */
+int
+sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
+{
+	sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
+	if (spp->sp_islegacy)
+		return (EINVAL);
+
+	mutex_enter(&sdev_plugin_lock);
+	list_remove(&sdev_plugin_list, spp);
+	mutex_exit(&sdev_plugin_lock);
+
+	sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
+	mutex_enter(&spp->sp_lock);
+	while (spp->sp_nnodes > 0)
+		cv_wait(&spp->sp_nodecv, &spp->sp_lock);
+	mutex_exit(&spp->sp_lock);
+	kmem_cache_free(sdev_plugin_cache, spp);
+	return (0);
+}
+
+/*
+ * Register an old sdev style plugin to deal with what used to be in the vtab.
+ */
+static int
+sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
+{
+	sdev_plugin_t *spp;
+
+	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+	(void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
+	spp->sp_islegacy = B_TRUE;
+	spp->sp_pops = NULL;
+	spp->sp_nflags = vtp->vt_flags;
+	spp->sp_lvtor = vtp->vt_vtor;
+	spp->sp_nnodes = 0;
+
+	if (vtp->vt_service != NULL) {
+		fs_operation_def_t *templ;
+		templ = sdev_merge_vtab(vtp->vt_service);
+		if (vn_make_ops(vtp->vt_name,
+		    (const fs_operation_def_t *)templ,
+		    &spp->sp_vnops) != 0) {
+			cmn_err(CE_WARN, "%s: malformed vnode ops\n",
+			    vtp->vt_name);
+			sdev_free_vtab(templ);
+			kmem_cache_free(sdev_plugin_cache, spp);
+			return (1);
+		}
+
+		if (vtp->vt_global_vops) {
+			*(vtp->vt_global_vops) = spp->sp_vnops;
+		}
+
+		sdev_free_vtab(templ);
+	} else {
+		spp->sp_vnops = sdev_vnodeops;
+	}
+
+	/*
+	 * No need to check for EEXIST here. These are loaded as a part of the
+	 * sdev's initialization function. Further, we don't have to create them
+	 * as that's taken care of in sdev's mount for the GZ.
+	 */
+	mutex_enter(&sdev_plugin_lock);
+	list_insert_tail(&sdev_plugin_list, spp);
+	mutex_exit(&sdev_plugin_lock);
+
+	return (0);
+}
+
+/*
+ * Find the plugin owning dv by sdev_path (not sdev_name): either /dev/<name>
+ * itself or, for SDEV_SUBDIR plugins, anything below it. NULL if none match.
+ */
+static sdev_plugin_t *
+sdev_match(sdev_node_t *dv)
+{
+	int vlen;
+	const char *path;
+	sdev_plugin_t *spp;
+
+	if (strlen(dv->sdev_path) <= 5)
+		return (NULL);
+
+	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+		return (NULL);
+	path = dv->sdev_path + 5;
+
+	mutex_enter(&sdev_plugin_lock);
+
+	for (spp = list_head(&sdev_plugin_list); spp != NULL;
+	    spp = list_next(&sdev_plugin_list, spp)) {
+		if (strcmp(spp->sp_name, path) == 0) {
+			mutex_exit(&sdev_plugin_lock);
+			return (spp);
+		}
+
+		if (spp->sp_nflags & SDEV_SUBDIR) {
+			/* the full name must match and be followed by '/' */
+			vlen = strlen(spp->sp_name);
+			if ((strncmp(spp->sp_name, path,
+			    vlen) == 0) && path[vlen] == '/') {
+				mutex_exit(&sdev_plugin_lock);
+				return (spp);
+			}
+		}
+	}
+
+	mutex_exit(&sdev_plugin_lock);
+	return (NULL);
+}
+
+void
+sdev_set_no_negcache(sdev_node_t *dv)
+{
+	char *path;
+	sdev_plugin_t *spp;
+
+	ASSERT(dv->sdev_path);
+	path = dv->sdev_path + strlen("/dev/");
+
+	mutex_enter(&sdev_plugin_lock);
+	for (spp = list_head(&sdev_plugin_list); spp != NULL;
+	    spp = list_next(&sdev_plugin_list, spp)) {
+		if (strcmp(spp->sp_name, path) == 0) {
+			if (spp->sp_nflags & SDEV_NO_NCACHE)
+				dv->sdev_flags |= SDEV_NO_NCACHE;
+			break;
+		}
+	}
+	mutex_exit(&sdev_plugin_lock);
+}
+
+/*
+ * Select the vnode ops for dv. A node matched by a plugin takes that plugin's
+ * node flags and vnode ops; any other node gets the default sdev_vnodeops.
+ */
+struct vnodeops *
+sdev_get_vop(sdev_node_t *dv)
+{
+	sdev_plugin_t *spp;
+
+	ASSERT(dv->sdev_path != NULL);
+
+	/* a plugin owns this path: use its flags and its vnode ops */
+	if ((spp = sdev_match(dv)) != NULL) {
+		dv->sdev_flags |= spp->sp_nflags;
+		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+			dv->sdev_flags |= SDEV_PERSIST;
+		return (spp->sp_vnops);
+	}
+
+	/* child inherits the persistence of the parent */
+	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
+		dv->sdev_flags |= SDEV_PERSIST;
+	return (sdev_vnodeops);
+}
+
+void *
+sdev_get_vtor(sdev_node_t *dv)
+{
+	sdev_plugin_t *spp;
+
+	if (dv->sdev_private == NULL) {
+		spp = sdev_match(dv);
+		if (spp == NULL)
+			return (NULL);
+	} else {
+		spp = dv->sdev_private;
+	}
+
+	if (spp->sp_islegacy)
+		return ((void *)spp->sp_lvtor);
+	else
+		return ((void *)sdev_plugin_validate);
+}
+
+void
+sdev_plugin_nodeready(sdev_node_t *sdp)
+{
+	sdev_plugin_t *spp;
+
+	ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
+	ASSERT(sdp->sdev_private == NULL);
+
+	spp = sdev_match(sdp);
+	if (spp == NULL)
+		return;
+	if (spp->sp_islegacy)
+		return;
+	sdp->sdev_private = spp;
+	mutex_enter(&spp->sp_lock);
+	spp->sp_nnodes++;
+	mutex_exit(&spp->sp_lock);
+}
+
+int
+sdev_plugin_init(void)
+{
+	sdev_vop_table_t *vtp;
+	fs_operation_def_t *templ;
+
+	sdev_plugin_cache = kmem_cache_create("sdev_plugin",
+	    sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
+	    sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
+	if (sdev_plugin_cache == NULL)
+		return (1);
+	mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
+	    offsetof(sdev_plugin_t, sp_link));
+
+	/*
+	 * Register all of the legacy vnops
+	 */
+	for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
+		if (sdev_plugin_register_legacy(vtp) != 0)
+			return (1);
+
+	templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
+	if (vn_make_ops("sdev_plugin",
+	    (const fs_operation_def_t *)templ,
+	    &sdev_plugin_vnops) != 0) {
+		sdev_free_vtab(templ);
+		return (1);
+	}
+
+	sdev_free_vtab(templ);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c
index 9234cc4a0c..511432453f 100644
--- a/usr/src/uts/common/fs/dev/sdev_subr.c
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c
@@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = {
 kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
 int		devtype;		/* fstype */
 
-/* static */
-static struct vnodeops *sdev_get_vop(struct sdev_node *);
-static void sdev_set_no_negcache(struct sdev_node *);
-static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
-static void sdev_free_vtab(fs_operation_def_t *);
-
 static void
 sdev_prof_free(struct sdev_node *dv)
 {
@@ -313,6 +307,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
 	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
 	/* overwritten for VLNK nodes */
 	dv->sdev_symlink = NULL;
+	list_link_init(&dv->sdev_plist);
 
 	vp = SDEVTOV(dv);
 	vn_reinit(vp);
@@ -401,6 +396,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
 	} else {
 		dv->sdev_nlink = 1;
 	}
+	sdev_plugin_nodeready(dv);
 
 	if (!(SDEV_IS_GLOBAL(dv))) {
 		dv->sdev_origin = (struct sdev_node *)args;
@@ -497,37 +493,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
 	return (dv);
 }
 
-/* directory dependent vop table */
-struct sdev_vop_table {
-	char *vt_name;				/* subdirectory name */
-	const fs_operation_def_t *vt_service;	/* vnodeops table */
-	struct vnodeops *vt_vops;		/* constructed vop */
-	struct vnodeops **vt_global_vops;	/* global container for vop */
-	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
-	int vt_flags;
-};
-
-/*
- * A nice improvement would be to provide a plug-in mechanism
- * for this table instead of a const table.
- */
-static struct sdev_vop_table vtab[] =
-{
-	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
+struct sdev_vop_table vtab[] = {
+	{ "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate,
 	SDEV_DYNAMIC | SDEV_VTOR },
 
-	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
+	{ "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate,
 	SDEV_DYNAMIC | SDEV_VTOR },
 
-	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
+	{ "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops,
 	devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
 
-	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
+	{ "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE },
 
-	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
-	SDEV_DYNAMIC | SDEV_VTOR },
+	{ "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate,
+	SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
 
-	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
+	{ "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops,
 	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
 
 	/*
@@ -542,132 +523,14 @@ static struct sdev_vop_table vtab[] =
 	 * preventing a mkdir.
 	 */
 
-	{ "lofi", NULL, NULL, NULL, NULL,
+	{ "lofi", NULL, NULL, NULL,
 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
-	{ "rlofi", NULL, NULL, NULL, NULL,
+	{ "rlofi", NULL, NULL, NULL,
 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
 
-	{ NULL, NULL, NULL, NULL, NULL, 0}
+	{ NULL, NULL, NULL, NULL, 0}
 };
 
-/*
- * We need to match off of the sdev_path, not the sdev_name. We are only allowed
- * to exist directly under /dev.
- */
-struct sdev_vop_table *
-sdev_match(struct sdev_node *dv)
-{
-	int vlen;
-	int i;
-	const char *path;
-
-	if (strlen(dv->sdev_path) <= 5)
-		return (NULL);
-
-	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
-		return (NULL);
-	path = dv->sdev_path + 5;
-
-	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, path) == 0)
-			return (&vtab[i]);
-		if (vtab[i].vt_flags & SDEV_SUBDIR) {
-			vlen = strlen(vtab[i].vt_name);
-			if ((strncmp(vtab[i].vt_name, path,
-			    vlen - 1) == 0) && path[vlen] == '/')
-				return (&vtab[i]);
-		}
-
-	}
-	return (NULL);
-}
-
-/*
- *  sets a directory's vnodeops if the directory is in the vtab;
- */
-static struct vnodeops *
-sdev_get_vop(struct sdev_node *dv)
-{
-	struct sdev_vop_table *vtp;
-	char *path;
-
-	path = dv->sdev_path;
-	ASSERT(path);
-
-	/* gets the relative path to /dev/ */
-	path += 5;
-
-	/* gets the vtab entry it matches */
-	if ((vtp = sdev_match(dv)) != NULL) {
-		dv->sdev_flags |= vtp->vt_flags;
-		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
-		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
-			dv->sdev_flags |= SDEV_PERSIST;
-
-		if (vtp->vt_vops) {
-			if (vtp->vt_global_vops)
-				*(vtp->vt_global_vops) = vtp->vt_vops;
-
-			return (vtp->vt_vops);
-		}
-
-		if (vtp->vt_service) {
-			fs_operation_def_t *templ;
-			templ = sdev_merge_vtab(vtp->vt_service);
-			if (vn_make_ops(vtp->vt_name,
-			    (const fs_operation_def_t *)templ,
-			    &vtp->vt_vops) != 0) {
-				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
-				    vtp->vt_name);
-				/*NOTREACHED*/
-			}
-			if (vtp->vt_global_vops) {
-				*(vtp->vt_global_vops) = vtp->vt_vops;
-			}
-			sdev_free_vtab(templ);
-
-			return (vtp->vt_vops);
-		}
-
-		return (sdev_vnodeops);
-	}
-
-	/* child inherits the persistence of the parent */
-	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
-		dv->sdev_flags |= SDEV_PERSIST;
-
-	return (sdev_vnodeops);
-}
-
-static void
-sdev_set_no_negcache(struct sdev_node *dv)
-{
-	int i;
-	char *path;
-
-	ASSERT(dv->sdev_path);
-	path = dv->sdev_path + strlen("/dev/");
-
-	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, path) == 0) {
-			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
-				dv->sdev_flags |= SDEV_NO_NCACHE;
-			break;
-		}
-	}
-}
-
-void *
-sdev_get_vtor(struct sdev_node *dv)
-{
-	struct sdev_vop_table *vtp;
-
-	vtp = sdev_match(dv);
-	if (vtp)
-		return ((void *)vtp->vt_vtor);
-	else
-		return (NULL);
-}
 
 /*
  * Build the base root inode
@@ -947,8 +810,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
 		dv->sdev_path = NULL;
 	}
 
-	if (!SDEV_IS_GLOBAL(dv))
+	if (!SDEV_IS_GLOBAL(dv)) {
 		sdev_prof_free(dv);
+		if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL)
+			SDEV_RELE(dv->sdev_origin);
+	}
 
 	if (SDEVTOV(dv)->v_type == VDIR) {
 		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
@@ -962,6 +828,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
 	(void) memset((void *)&dv->sdev_instance_data, 0,
 	    sizeof (dv->sdev_instance_data));
 	vn_invalid(SDEVTOV(dv));
+	dv->sdev_private = NULL;
 	kmem_cache_free(sdev_node_cache, dv);
 }
 
@@ -2944,46 +2811,6 @@ sdev_modctl_devexists(const char *path)
 	return (error);
 }
 
-extern int sdev_vnodeops_tbl_size;
-
-/*
- * construct a new template with overrides from vtab
- */
-static fs_operation_def_t *
-sdev_merge_vtab(const fs_operation_def_t tab[])
-{
-	fs_operation_def_t *new;
-	const fs_operation_def_t *tab_entry;
-
-	/* make a copy of standard vnode ops table */
-	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
-	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
-
-	/* replace the overrides from tab */
-	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
-		fs_operation_def_t *std_entry = new;
-		while (std_entry->name) {
-			if (strcmp(tab_entry->name, std_entry->name) == 0) {
-				std_entry->func = tab_entry->func;
-				break;
-			}
-			std_entry++;
-		}
-		if (std_entry->name == NULL)
-			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
-			    tab_entry->name);
-	}
-
-	return (new);
-}
-
-/* free memory allocated by sdev_merge_vtab */
-static void
-sdev_free_vtab(fs_operation_def_t *new)
-{
-	kmem_free(new, sdev_vnodeops_tbl_size);
-}
-
 /*
  * a generic setattr() function
  *
diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c
index 00e981ce9c..8de16926cd 100644
--- a/usr/src/uts/common/fs/dev/sdev_vfsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c
@@ -172,7 +172,13 @@ devinit(int fstype, char *name)
 
 	if ((devmajor = getudev()) == (major_t)-1) {
 		cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name);
-		return (1);
+		return (ENXIO);
+	}
+
+	if (sdev_plugin_init() != 0) {
+		cmn_err(CE_WARN, "%s: failed to set init plugin subsystem",
+		    sdev_vfssw.name);
+		return (EIO);
 	}
 
 	/* initialize negative cache */
@@ -349,6 +355,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
 		ASSERT(sdev_origins);
 		dv->sdev_flags &= ~SDEV_GLOBAL;
 		dv->sdev_origin = sdev_origins->sdev_root;
+		SDEV_HOLD(dv->sdev_origin);
 	} else {
 		sdev_ncache_setup();
 		rw_enter(&dv->sdev_contents, RW_WRITER);
@@ -521,3 +528,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo)
 	SDEVTOV(mntinfo->sdev_root)->v_count--;
 	mutex_exit(&sdev_lock);
 }
+
+void
+sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg)
+{
+	struct sdev_data *mntinfo;
+
+	mutex_enter(&sdev_lock);
+	mntinfo = sdev_mntinfo;
+	while (mntinfo != NULL) {
+		func(mntinfo->sdev_root, arg);
+		mntinfo = mntinfo->sdev_next;
+	}
+	mutex_exit(&sdev_lock);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c
index 59a3c9f17a..6ce4b0b174 100644
--- a/usr/src/uts/common/fs/dev/sdev_vnops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vnops.c
@@ -22,7 +22,7 @@
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*
@@ -864,6 +864,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred,
 		}
 	}
 
+	if (error == 0)
+		i_ddi_di_cache_invalidate();
+
 	return (error);
 }
 
@@ -1188,6 +1191,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva,
 	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
 	if (SDEV_IS_GLOBAL(parent))
 		atomic_inc_ulong(&parent->sdev_gdir_gen);
+	i_ddi_di_cache_invalidate();
 
 	/* wake up other threads blocked on looking up this node */
 	mutex_enter(&self->sdev_lookup_lock);
@@ -1260,6 +1264,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
 	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
 	if (SDEV_IS_GLOBAL(parent))
 		atomic_inc_ulong(&parent->sdev_gdir_gen);
+	i_ddi_di_cache_invalidate();
 
 	/* wake up other threads blocked on looking up this node */
 	mutex_enter(&self->sdev_lookup_lock);
@@ -1375,6 +1380,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
 
 	}
 
+	if (error == 0)
+		i_ddi_di_cache_invalidate();
+
 	return (error);
 }
 
diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c
index 11384e33d3..407ad1d55b 100644
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c
@@ -459,8 +459,10 @@ devzvol_create_pool_dirs(struct vnode *dvp)
 		ASSERT(dvp->v_count > 0);
 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
 		    NULL, kcred, NULL, 0, NULL);
-		/* should either work, or not be visible from a zone */
-		ASSERT(rc == 0 || rc == ENOENT);
+		/*
+		 * should either work or we should get an error if this should
+		 * not be visible from the zone, or disallowed in the zone
+		 */
 		if (rc == 0)
 			VN_RELE(vp);
 		pools++;
diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c
index 25327d2852..c949117da6 100644
--- a/usr/src/uts/common/fs/dnlc.c
+++ b/usr/src/uts/common/fs/dnlc.c
@@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop)
 }
 
 /*
- * Perform a reverse lookup in the DNLC.  This will find the first occurrence of
- * the vnode.  If successful, it will return the vnode of the parent, and the
- * name of the entry in the given buffer.  If it cannot be found, or the buffer
- * is too small, then it will return NULL.  Note that this is a highly
- * inefficient function, since the DNLC is constructed solely for forward
- * lookups.
- */
-vnode_t *
-dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
-{
-	nc_hash_t *nch;
-	ncache_t *ncp;
-	vnode_t *pvp;
-
-	if (!doingcache)
-		return (NULL);
-
-	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
-		mutex_enter(&nch->hash_lock);
-		ncp = nch->hash_next;
-		while (ncp != (ncache_t *)nch) {
-			/*
-			 * We ignore '..' entries since it can create
-			 * confusion and infinite loops.
-			 */
-			if (ncp->vp == vp && !(ncp->namlen == 2 &&
-			    0 == bcmp(ncp->name, "..", 2)) &&
-			    ncp->namlen < buflen) {
-				bcopy(ncp->name, buf, ncp->namlen);
-				buf[ncp->namlen] = '\0';
-				pvp = ncp->dp;
-				/* VN_HOLD 2 of 2 in this file */
-				VN_HOLD_CALLER(pvp);
-				mutex_exit(&nch->hash_lock);
-				return (pvp);
-			}
-			ncp = ncp->hash_next;
-		}
-		mutex_exit(&nch->hash_lock);
-	}
-
-	return (NULL);
-}
-/*
  * Utility routine to search for a cache entry. Return the
  * ncache entry if found, NULL otherwise.
  */
diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c
index b4e28cc860..5f524def30 100644
--- a/usr/src/uts/common/fs/fem.c
+++ b/usr/src/uts/common/fs/fem.c
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
+ */
+
 #include <sys/types.h>
 #include <sys/atomic.h>
 #include <sys/kmem.h>
@@ -33,11 +37,12 @@
 #include <sys/systm.h>
 #include <sys/cmn_err.h>
 #include <sys/debug.h>
-
 #include <sys/fem.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/vfs_opreg.h>
+#include <sys/stack.h>
+#include <sys/archsystm.h>
 
 #define	NNODES_DEFAULT	8	/* Default number of nodes in a fem_list */
 /*
@@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1)
 }
 #endif
 
+/*
+ * File event monitoring handoffs
+ *
+ * File event monitoring relies on being able to inject stack frames between
+ * vnode consumers and the underlying file systems.  This becomes problematic
+ * when there exist many monitors, as kernel stack depth is finite.  The model
+ * very much encodes this injected frame:  the flow of control deliberately
+ * lies with the monitor, not with the monitoring system.  While we could
+ * conceivably address this by allowing each subsystem to install at most
+ * one monitor per vnode (and impose on subsystems that they handle any
+ * of their own consumer multiplexing internally), this in fact exports a
+ * substantial amount of run-time complexity to deal with an uncommon case
+ * (and, it must be said, assumes a small number of consuming subsystems).
+ * To allow our abstraction to remain clean, we instead check our remaining
+ * stack in every vnext_*() call; if the amount of stack remaining is lower
+ * than a threshold (fem_stack_needed), we call thread_splitstack() to carry
+ * on the execution of the monitors and the underlying vnode operation on a
+ * split stack.  Because we can only pass a single argument to our split stack
+ * function, we must marshal our arguments, the mechanics of which are somewhat
+ * ornate in terms of the code: to marshal in a type-safe manner, we define a
+ * baton that is a union of payload structures for each kind of operation,
+ * loading the per-operation payload explicitly and calling into common handoff
+ * code that itself calls thread_splitstack().  The function passed to
+ * thread_splitstack() is a per-entry point function that continues monitor
+ * processing given the specified (marshalled) arguments.  While this method
+ * is a little verbose to implement, it has the advantage of being relatively
+ * robust (that is, broadly type-safe) while imposing minimal burden on each
+ * vnext_*() entry point.
+ *
+ * In terms of the implementation:
+ *
+ * - The FEM_BATON_n macros define the per-entry point baton structures
+ * - The fem_baton_payload_t contains the union of these structures
+ * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point
+ * - The FEM_VNEXTn macros constitute the pre-handoff entry point
+ *
+ * Note that we don't use variadic macros -- we define a variant of these
+ * macros for each of our relevant argument counts.  This may seem overly
+ * explicit, but it is deliberate:  the object here is to minimize the
+ * future maintenance burden by minimizing the likelihood of introduced
+ * error --  not to minimize the number of characters in this source file.
+ */
+
+#ifndef STACK_GROWTH_DOWN
+#error Downward stack growth assumed.
+#endif
+
+int fem_stack_toodeep;
+uintptr_t fem_stack_needed = 8 * 1024;
+size_t fem_handoff_stacksize = 128 * 1024;
+
+#define	FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
+	(uintptr_t)curthread->t_stkbase < fem_stack_needed)
+
+#define	FEM_BATON_1(what, t0, l0)					\
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+	} fb_##what
+
+#define	FEM_BATON_2(what, t0, l0, t1, l1)				\
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+	} fb_##what
+
+#define	FEM_BATON_3(what, t0, l0, t1, l1, t2, l2)			\
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+	} fb_##what
+
+#define	FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3)		\
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+		t3 fb_##what##_##l3;					\
+	} fb_##what
+
+#define	FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4)	\
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+		t3 fb_##what##_##l3;					\
+		t4 fb_##what##_##l4;					\
+	} fb_##what
+
+#define	FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+		t3 fb_##what##_##l3;					\
+		t4 fb_##what##_##l4;					\
+		t5 fb_##what##_##l5;					\
+	} fb_##what
+
+#define	FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+    t6, l6, t7, l7) \
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+		t3 fb_##what##_##l3;					\
+		t4 fb_##what##_##l4;					\
+		t5 fb_##what##_##l5;					\
+		t6 fb_##what##_##l6;					\
+		t7 fb_##what##_##l7;					\
+	} fb_##what
+
+#define	FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+    t6, l6, t7, l7, t8, l8) \
+	struct {							\
+		void *fb_##what##_arg0;					\
+		caller_context_t *fb_##what##_ct;			\
+		t0 fb_##what##_##l0;					\
+		t1 fb_##what##_##l1;					\
+		t2 fb_##what##_##l2;					\
+		t3 fb_##what##_##l3;					\
+		t4 fb_##what##_##l4;					\
+		t5 fb_##what##_##l5;					\
+		t6 fb_##what##_##l6;					\
+		t7 fb_##what##_##l7;					\
+		t8 fb_##what##_##l8;					\
+	} fb_##what
+
+typedef union {
+	FEM_BATON_2(open, int, mode, cred_t *, cr);
+	FEM_BATON_4(close, int, flag, int, count,
+	    offset_t, offset, cred_t *, cr);
+	FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr);
+	FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr);
+	FEM_BATON_5(ioctl, int, cmd, intptr_t, arg,
+	    int, flag, cred_t *, cr, int *, rvalp);
+	FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr);
+	FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr);
+	FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr);
+	FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr);
+	FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp,
+	    pathname_t *, pnp, int, flags, vnode_t *, rdir,
+	    cred_t *, cr, int *, direntflags, pathname_t *, realpnp);
+	FEM_BATON_8(create, char *, name, vattr_t *, vap,
+	    vcexcl_t, excl, int, mode, vnode_t **, vpp,
+	    cred_t *, cr, int, flag, vsecattr_t *, vsecp);
+	FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags);
+	FEM_BATON_4(link, vnode_t *, svp, char *, tnm,
+	    cred_t *, cr, int, flags);
+	FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp,
+	    char *, tnm, cred_t *, cr, int, flags);
+	FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap,
+	    vnode_t **, vpp, cred_t *, cr, int, flags,
+	    vsecattr_t *, vsecp);
+	FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir,
+	    cred_t *, cr, int, flags);
+	FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr,
+	    int *, eofp, int, flags);
+	FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap,
+	    char *, target, cred_t *, cr, int, flags);
+	FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr);
+	FEM_BATON_2(fsync, int, syncflag, cred_t *, cr);
+	FEM_BATON_1(inactive, cred_t *, cr);
+	FEM_BATON_1(fid, fid_t *, fidp);
+	FEM_BATON_1(rwlock, int, write_lock);
+	FEM_BATON_1(rwunlock, int, write_lock);
+	FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp);
+	FEM_BATON_1(cmp, vnode_t *, vp2);
+	FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp,
+	    int, flag, offset_t, offset, struct flk_callback *, flk_cbp,
+	    cred_t *, cr);
+	FEM_BATON_5(space, int, cmd, struct flock64 *, bfp,
+	    int, flag, offset_t, offset, cred_t *, cr);
+	FEM_BATON_1(realvp, vnode_t **, vpp);
+	FEM_BATON_9(getpage, offset_t, off, size_t, len,
+	    uint_t *, protp, struct page **, plarr, size_t, plsz,
+	    struct seg *, seg, caddr_t, addr, enum seg_rw, rw,
+	    cred_t *, cr);
+	FEM_BATON_4(putpage, offset_t, off, size_t, len,
+	    int, flags, cred_t *, cr);
+	FEM_BATON_8(map, offset_t, off, struct as *, as,
+	    caddr_t *, addrp, size_t, len, uchar_t, prot,
+	    uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+	FEM_BATON_8(addmap, offset_t, off, struct as *, as,
+	    caddr_t, addr, size_t, len, uchar_t, prot,
+	    uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+	FEM_BATON_8(delmap, offset_t, off, struct as *, as,
+	    caddr_t, addr, size_t, len, uint_t, prot,
+	    uint_t, maxprot, uint_t, flags, cred_t *, cr);
+	FEM_BATON_4(poll, short, events, int, anyyet,
+	    short *, reventsp, struct pollhead **, phpp);
+	FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks);
+	FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr);
+	FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off,
+	    size_t, io_len, int, flags, cred_t *, cr);
+	FEM_BATON_2(dumpctl, int, action, offset_t *, blkp);
+	FEM_BATON_4(dispose, struct page *, pp, int, flag,
+	    int, dn, cred_t *, cr);
+	FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+	FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+	FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr,
+	    int, flag, cred_t *, cr);
+	FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname);
+	FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag,
+	    xuio_t *, xuiop, cred_t *, cr);
+	FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr);
+} fem_baton_payload_t;
+
+typedef struct {
+	fem_baton_payload_t fb_payload;
+	int (*fb_func)();
+	void (*fb_handoff)();
+	int fb_rval;
+} fem_baton_t;
+
+static int
+fem_handoff(fem_baton_t *bp)
+{
+	fem_stack_toodeep++;
+	thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize);
+
+	return (bp->fb_rval);
+}
+
+#define	FEM_VNEXT3_DECL(what, a0, a1, a2)				\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2);			\
+}
+
+#define	FEM_VNEXT4_DECL(what, a0, a1, a2, a3)				\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3);			\
+}
+
+#define	FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4)			\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4);			\
+}
+
+#define	FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5)			\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a5);			\
+}
+
+#define	FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6)		\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a5,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a6);			\
+}
+
+#define	FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7)		\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a5,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a6,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a7);			\
+}
+
+#define	FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)	\
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a5,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a6,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a7,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a8,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a9);			\
+}
+
+#define	FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+void									\
+fem_handoff_##what(fem_baton_t *bp)					\
+{									\
+	bp->fb_rval = bp->fb_func(					\
+	    bp->fb_payload.fb_##what.fb_##what##_##a0,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a1,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a2,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a3,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a4,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a5,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a6,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a7,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a8,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a9,			\
+	    bp->fb_payload.fb_##what.fb_##what##_##a10);		\
+}
+
+#define	FEM_VNEXT3(what, func, a0, a1, a2)				\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2))
+
+#define	FEM_VNEXT4(what, func, a0, a1, a2, a3)				\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3))
+
+#define	FEM_VNEXT5(what, func, a0, a1, a2, a3, a4)			\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4))
+
+#define	FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5)			\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a5 = a5;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4, a5))
+
+#define	FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6)		\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a5 = a5;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a6 = a6;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4, a5, a6))
+
+#define	FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7)		\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a5 = a5;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a6 = a6;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a7 = a7;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4, a5, a6, a7))
+
+#define	FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)	\
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a5 = a5;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a6 = a6;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a7 = a7;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a8 = a8;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a9 = a9;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9))
+
+#define	FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+	if (FEM_TOODEEP()) {						\
+		fem_baton_t *baton;					\
+		int rval;						\
+									\
+		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);	\
+		baton->fb_payload.fb_##what.fb_##what##_##a0 = a0;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a1 = a1;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a2 = a2;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a3 = a3;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a4 = a4;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a5 = a5;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a6 = a6;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a7 = a7;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a8 = a8;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a9 = a9;	\
+		baton->fb_payload.fb_##what.fb_##what##_##a10 = a10;	\
+		baton->fb_handoff = fem_handoff_##what;			\
+		baton->fb_func = func;					\
+									\
+		rval = fem_handoff(baton);				\
+		kmem_free(baton, sizeof (fem_baton_t));			\
+									\
+		return (rval);						\
+	}								\
+	return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10))
+
 static fem_t *
 fem_alloc()
 {
@@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[]  = {
  * 5.  Return by invoking the base operation with the base object.
  *
  * for each classification, there needs to be at least one "next" operation
- * for each "head"operation.
- *
+ * for each "head" operation.  Note that we also use the FEM_VNEXTn_DECL macros
+ * to define the function to run when the stack is split; see the discussion
+ * on "File event monitoring handoffs", above.
  */
 
+FEM_VNEXT4_DECL(open, arg0, mode, cr, ct)
+FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct)
+FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct)
+FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct)
+FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct)
+FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir,
+    cr, ct, direntflags, realpnp)
+FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)
+FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags)
+FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags)
+FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags)
+FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp)
+FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags)
+FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags)
+FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags)
+FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct)
+FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct)
+FEM_VNEXT3_DECL(fid, arg0, fidp, ct)
+FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct)
+FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct)
+FEM_VNEXT3_DECL(cmp, arg0, vp2, ct)
+FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)
+FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct)
+FEM_VNEXT3_DECL(realvp, arg0, vpp, ct)
+FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz,
+    seg, addr, rw, cr, ct)
+FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct)
+FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot,
+    flags, cr, ct)
+FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot,
+    flags, cr, ct)
+FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot,
+    flags, cr, ct)
+FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct)
+FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct)
+FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct)
+FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct)
+FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct)
+FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct)
+FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct)
+FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct)
+FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct)
+
 int
 vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
 {
@@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_open, femop_open);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, mode, cr, ct));
+	FEM_VNEXT4(open, func, arg0, mode, cr, ct);
 }
 
 int
@@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_close, femop_close);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, flag, count, offset, cr, ct));
+	FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct);
 }
 
 int
@@ -2081,7 +2666,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_read, femop_read);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, uiop, ioflag, cr, ct));
+	FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct);
 }
 
 int
@@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_write, femop_write);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, uiop, ioflag, cr, ct));
+	FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct);
 }
 
 int
@@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct));
+	FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct);
 }
 
 int
@@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, oflags, nflags, cr, ct));
+	FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct);
 }
 
 int
@@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vap, flags, cr, ct));
+	FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct);
 }
 
 int
@@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vap, flags, cr, ct));
+	FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct);
 }
 
 int
@@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_access, femop_access);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, mode, flags, cr, ct));
+	FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct);
 }
 
 int
@@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp,
 	vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct,
-	    direntflags, realpnp));
+	FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct,
+	    direntflags, realpnp);
 }
 
 int
@@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
 	vsop_find(vf, &func, int, &arg0, vop_create, femop_create);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp));
+	FEM_VNEXT10(create, func, arg0, name, vap, excl,
+	    mode, vpp, cr, flag, ct, vsecp);
 }
 
 int
@@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
 	vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, nm, cr, ct, flags));
+	FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags);
 }
 
 int
@@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_link, femop_link);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, svp, tnm, cr, ct, flags));
+	FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags);
 }
 
 int
@@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags));
+	FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags);
 }
 
 int
@@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp,
 	vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp));
+	FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp);
 }
 
 int
@@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, nm, cdir, cr, ct, flags));
+	FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags);
 }
 
 int
@@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
 	vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, uiop, cr, eofp, ct, flags));
+	FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags);
 }
 
 int
@@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target,
 	vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, linkname, vap, target, cr, ct, flags));
+	FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags);
 }
 
 int
@@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, uiop, cr, ct));
+	FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct);
 }
 
 int
@@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, syncflag, cr, ct));
+	FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct);
 }
 
 void
@@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, fidp, ct));
+	FEM_VNEXT3(fid, func, arg0, fidp, ct);
 }
 
 int
@@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, write_lock, ct));
+	FEM_VNEXT3(rwlock, func, arg0, write_lock, ct);
 }
 
 void
@@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, ooff, noffp, ct));
+	FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct);
 }
 
 int
@@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vp2, ct));
+	FEM_VNEXT3(cmp, func, arg0, vp2, ct);
 }
 
 int
@@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
 	vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+	FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct);
 }
 
 int
@@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
 	vsop_find(vf, &func, int, &arg0, vop_space, femop_space);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct));
+	FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct);
 }
 
 int
@@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vpp, ct));
+	FEM_VNEXT3(realvp, func, arg0, vpp, ct);
 }
 
 int
@@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp,
 	vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw,
-	    cr, ct));
+	FEM_VNEXT11(getpage, func, arg0, off, len, protp,
+	    plarr, plsz, seg, addr, rw, cr, ct);
 }
 
 int
@@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags,
 	vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, off, len, flags, cr, ct));
+	FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct);
 }
 
 int
@@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp,
 	vsop_find(vf, &func, int, &arg0, vop_map, femop_map);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags,
-	    cr, ct));
+	FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags,
+	    cr, ct);
 }
 
 int
@@ -2531,8 +3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
 	vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
-	    cr, ct));
+	FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot,
+	    flags, cr, ct);
 }
 
 int
@@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
 	vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
-	    cr, ct));
+	FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot,
+	    flags, cr, ct);
 }
 
 int
@@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp,
 	vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, events, anyyet, reventsp, phpp, ct));
+	FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct);
 }
 
 int
@@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks,
 	vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, addr, lbdn, dblks, ct));
+	FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct);
 }
 
 int
@@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, cmd, valp, cr, ct));
+	FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct);
 }
 
 int
@@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off,
 	vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct));
+	FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct);
 }
 
 int
@@ -2623,7 +3209,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, action, blkp, ct));
+	FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct);
 }
 
 void
@@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vsap, flag, cr, ct));
+	FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct);
 }
 
 int
@@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vsap, flag, cr, ct));
+	FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct);
 }
 
 int
@@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag,
 	vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, cmd, shr, flag, cr, ct));
+	FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct);
 }
 
 int
@@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname,
 	vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, vnevent, dvp, cname, ct));
+	FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct);
 }
 
 int
@@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
 	vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, ioflag, xuiop, cr, ct));
+	FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct);
 }
 
 int
@@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
 	vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
 	ASSERT(func != NULL);
 	ASSERT(arg0 != NULL);
-	return ((*func)(arg0, xuiop, cr, ct));
+	FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct);
 }
 
 int
diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c
index 6e56000ffe..56204c6741 100644
--- a/usr/src/uts/common/fs/fifofs/fifosubr.c
+++ b/usr/src/uts/common/fs/fifofs/fifosubr.c
@@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld)
 	/*
 	 * The other end of the pipe is almost closed so
 	 * reject any other open on this end of the pipe
-	 * This only happens with a pipe mounted under namefs
+	 * This normally only happens with a pipe mounted under namefs, but
+	 * we can also see an open via proc/fd, which should still succeed.
+	 * To indicate the proc/fd case the FKLYR flag is passed.
 	 */
-	if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) {
+	if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) &&
+	    (flag & FKLYR) == 0) {
 		fifo_cleanup(oldvp, flag);
 		cv_broadcast(&fnp->fn_wait_cv);
 		if (!lockheld)
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
new file mode 100644
index 0000000000..05ee2c6e09
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
@@ -0,0 +1,640 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc.  All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op,
+		vnode_t *, hlnode_t **, cred_t *);
+static int hldiraddentry(hlnode_t *, hlnode_t *, char *);
+
+
+#define	HL_HASH_SIZE	8192		/* must be power of 2 */
+#define	HL_MUTEX_SIZE	64
+
+static hldirent_t	*hl_hashtable[HL_HASH_SIZE];
+static kmutex_t		 hl_hashmutex[HL_MUTEX_SIZE];
+
+#define	HL_HASH_INDEX(a)	((a) & (HL_HASH_SIZE-1))
+#define	HL_MUTEX_INDEX(a)	((a) & (HL_MUTEX_SIZE-1))
+
+#define	HYPRLOFS_HASH(tp, name, hash)				\
+	do {	/* seed from parent pointer, then fold in name */ \
+		char Xc, *Xcp;					\
+		(hash) = (uint_t)(uintptr_t)(tp) >> 8;		\
+		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
+			(hash) = ((hash) << 4) + (hash) + (uint_t)Xc; \
+	} while (0)
+
+void
+hyprlofs_hash_init(void)
+{
+	int	ix;
+
+	for (ix = 0; ix < HL_MUTEX_SIZE; ix++)
+		mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+hyprlofs_hash_in(hldirent_t *h)
+{
+	uint_t		hash;
+	hldirent_t	**prevpp;
+	kmutex_t	*hmtx;
+
+	HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash);
+	h->hld_hash = hash;
+	prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+	hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+	mutex_enter(hmtx);
+	h->hld_link = *prevpp;
+	*prevpp = h;
+	mutex_exit(hmtx);
+}
+
+/* Remove hldirent *h from the hash list. */
+static void
+hyprlofs_hash_out(hldirent_t *h)
+{
+	uint_t		hash;
+	hldirent_t	**prevpp;
+	kmutex_t	*hmtx;
+
+	hash = h->hld_hash;
+	prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+	hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+	mutex_enter(hmtx);
+	while (*prevpp != h)
+		prevpp = &(*prevpp)->hld_link;
+	*prevpp = h->hld_link;
+	mutex_exit(hmtx);
+}
+
+static hldirent_t *
+hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold,
+    hlnode_t **found)
+{
+	hldirent_t	*l;
+	uint_t		hash;
+	kmutex_t	*hmtx;
+	hlnode_t	*hnp;
+
+	HYPRLOFS_HASH(parent, name, hash);
+	hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+	mutex_enter(hmtx);
+	l = hl_hashtable[HL_HASH_INDEX(hash)];
+	while (l) {
+		if (l->hld_hash == hash && l->hld_parent == parent &&
+		    strcmp(l->hld_name, name) == 0) {
+			/*
+			 * Ensure that the hlnode that we put a hold on is the
+			 * same one that we pass back. Thus the temp. var
+			 * hnp is necessary.
+			 */
+			hnp = l->hld_hlnode;
+			if (hold) {
+				ASSERT(hnp);
+				hlnode_hold(hnp);
+			}
+			if (found)
+				*found = hnp;
+			mutex_exit(hmtx);
+			return (l);
+		} else {
+			l = l->hld_link;
+		}
+	}
+	mutex_exit(hmtx);
+	return (NULL);
+}
+
+/*
+ * Search directory 'parent' for entry 'name'; an empty name matches 'parent'.
+ * The calling thread can't hold the write version of the rwlock for the
+ * directory being searched.
+ * Returns 0 with *foundtp pointing to the found hlnode with its vnode held,
+ * ENOTDIR if 'parent' is not a directory, any error from the VEXEC check in
+ * hyprlofs_taccess(), or ENOENT if no entry matches.
+ */
+int
+hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr)
+{
+	int error;
+
+	*foundtp = NULL;	/* remains NULL on any error return */
+	if (parent->hln_type != VDIR)
+		return (ENOTDIR);
+
+	if ((error = hyprlofs_taccess(parent, VEXEC, cr)))
+		return (error);
+
+	if (*name == '\0') {	/* empty name refers to parent itself */
+		hlnode_hold(parent);
+		*foundtp = parent;
+		return (0);
+	}
+
+	/*
+	 * Search the directory for the matching name. We need the lock
+	 * protecting the hln_dir list so that it doesn't change out from
+	 * underneath us. hyprlofs_hash_lookup() will pass back the hlnode
+	 * with a hold on it.
+	 */
+	if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) {
+		ASSERT(*foundtp);
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Enter a directory entry (either a file or subdir, depending on op) for
+ * 'name' in directory 'dir'; its hlnode is passed back via 'hpp' if non-NULL.
+ */
+int
+hyprlofs_direnter(
+	hlfsmount_t	*hm,
+	hlnode_t	*dir,		/* target directory to make entry in */
+	char		*name,		/* name of entry */
+	enum de_op	op,		/* entry operation */
+	vnode_t		*realvp,	/* real vnode */
+	vattr_t		*va,
+	hlnode_t	**hpp,		/* return hlnode */
+	cred_t		*cr)
+{
+	hldirent_t *hdp;
+	hlnode_t *found = NULL;
+	hlnode_t *hp;
+	int error = 0;
+	char *s;
+
+	/* hln_rwlock is held to serialize direnter and dirdeletes */
+	ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+	ASSERT(dir->hln_type == VDIR);
+
+	/* Don't allow '/' characters in pathname component */
+	for (s = name; *s; s++)
+		if (*s == '/')
+			return (EACCES);
+
+	if (name[0] == '\0')
+		panic("hyprlofs_direnter: NULL name");
+
+	/*
+	 * This might be a "dangling detached directory". It could have been
+	 * removed, but a reference to it kept in u_cwd. Don't bother searching
+	 * it, and with any luck the user will get tired of dealing with us and
+	 * cd to some absolute pathway. This is in ufs, too.
+	 */
+	if (dir->hln_nlink == 0) {
+		return (ENOENT);
+	}
+
+	/* Search for the entry.  Return "found" if it exists. */
+	hdp = hyprlofs_hash_lookup(name, dir, 1, &found);
+
+	if (hdp) {
+		ASSERT(found);
+		switch (op) {
+		case DE_CREATE:
+		case DE_MKDIR:
+			if (hpp) {
+				*hpp = found;
+				error = EEXIST;
+			} else {
+				hlnode_rele(found);
+			}
+			break;
+		}
+	} else {
+
+		/*
+		 * The entry does not exist. Check write perms in dir to see if
+		 * entry can be created.
+		 */
+		if ((error = hyprlofs_taccess(dir, VWRITE, cr)))
+			return (error);
+
+		/* Make new hlnode and directory entry as required. */
+		if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp,
+		    cr)))
+			return (error);
+
+		if ((error = hldiraddentry(dir, hp, name))) {
+			/* Unmake the inode we just made. */
+			rw_enter(&hp->hln_rwlock, RW_WRITER);
+			if ((hp->hln_type) == VDIR) {
+				ASSERT(hdp == NULL);
+				/* cleanup allocs made by hyprlofs_dirinit() */
+				hyprlofs_dirtrunc(hp);
+			}
+			mutex_enter(&hp->hln_tlock);
+			hp->hln_nlink = 0;
+			mutex_exit(&hp->hln_tlock);
+			gethrestime(&hp->hln_ctime);
+			rw_exit(&hp->hln_rwlock);
+			hlnode_rele(hp);
+			hp = NULL;
+		} else if (hpp) {
+			*hpp = hp;
+		} else {
+			hlnode_rele(hp);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Delete entry hp of name "nm" from dir. Free dir entry space and decrement
+ * link count on hlnode(s).
+ */
+int
+hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op,
+    cred_t *cr)
+{
+	hldirent_t *hpdp;
+	int error;
+	size_t namelen;
+	hlnode_t *hnp;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+	ASSERT(RW_WRITE_HELD(&hp->hln_rwlock));
+	ASSERT(dir->hln_type == VDIR);
+
+	if (nm[0] == '\0')
+		panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp);
+
+	/* return error if removing . or .. */
+	if (nm[0] == '.') {
+		if (nm[1] == '\0')
+			return (EINVAL);
+		if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* thus in ufs */
+	}
+
+	if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0)
+		return (error);
+
+	if (dir->hln_dir == NULL)
+		return (ENOENT);
+
+	hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp);
+	if (hpdp == NULL) {
+		/*
+		 * If it is gone, some other thread got here first!
+		 * Return error ENOENT.
+		 */
+		return (ENOENT);
+	}
+
+	/*
+	 * If the hlnode in the hldirent changed (shouldn't happen since we
+	 * don't support rename) then original is gone, so return that status
+	 * (same as UFS).
+	 */
+	if (hp != hnp)
+		return (ENOENT);
+
+	hyprlofs_hash_out(hpdp);
+
+	/* Take hpdp out of the directory list. */
+	ASSERT(hpdp->hld_next != hpdp);
+	ASSERT(hpdp->hld_prev != hpdp);
+	if (hpdp->hld_prev) {
+		hpdp->hld_prev->hld_next = hpdp->hld_next;
+	}
+	if (hpdp->hld_next) {
+		hpdp->hld_next->hld_prev = hpdp->hld_prev;
+	}
+
+	/*
+	 * If the roving slot pointer happens to match hpdp, point it at the
+	 * previous dirent.
+	 */
+	if (dir->hln_dir->hld_prev == hpdp) {
+		dir->hln_dir->hld_prev = hpdp->hld_prev;
+	}
+	ASSERT(hpdp->hld_next != hpdp);
+	ASSERT(hpdp->hld_prev != hpdp);
+
+	/* hpdp points to the correct directory entry */
+	namelen = strlen(hpdp->hld_name) + 1;
+
+	kmem_free(hpdp, sizeof (hldirent_t) + namelen);
+	dir->hln_size -= (sizeof (hldirent_t) + namelen);
+	dir->hln_dirents--;
+
+	gethrestime(&now);
+	dir->hln_mtime = now;
+	dir->hln_ctime = now;
+	hp->hln_ctime = now;
+
+	ASSERT(hp->hln_nlink > 0);
+	DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock);
+	if (op == DR_RMDIR && hp->hln_type == VDIR) {
+		hyprlofs_dirtrunc(hp);
+		ASSERT(hp->hln_nlink == 0);
+	}
+	return (0);
+}
+
+/*
+ * hyprlofs_dirinit initializes a dir with '.' and '..' entries without
+ * checking perms and locking; caller must hold parent's hln_rwlock as writer.
+ */
+void
+hyprlofs_dirinit(
+	hlnode_t *parent,	/* parent of directory to initialize */
+	hlnode_t *dir)		/* the new directory */
+{
+	hldirent_t *dot, *dotdot;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&parent->hln_rwlock));
+	ASSERT(dir->hln_type == VDIR);
+
+	dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP);
+	dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP);
+
+	/* Initialize the entries; kmem_zalloc() supplies each name's NUL */
+	dot->hld_hlnode = dir;
+	dot->hld_offset = 0;
+	dot->hld_name = (char *)dot + sizeof (hldirent_t);
+	dot->hld_name[0] = '.';
+	dot->hld_parent = dir;
+	hyprlofs_hash_in(dot);
+
+	dotdot->hld_hlnode = parent;
+	dotdot->hld_offset = 1;
+	dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t);
+	dotdot->hld_name[0] = '.';
+	dotdot->hld_name[1] = '.';
+	dotdot->hld_parent = dir;
+	hyprlofs_hash_in(dotdot);
+
+	/* Init directory entry list; dot->hld_prev is the roving slot ptr. */
+	dot->hld_next = dotdot;
+	dot->hld_prev = dotdot;
+	dotdot->hld_next = NULL;
+	dotdot->hld_prev = dot;
+
+	gethrestime(&now);
+	dir->hln_mtime = now;
+	dir->hln_ctime = now;
+
+	/*
+	 * Since hyprlofs_dirinit is called with both dir and parent being the
+	 * same for the root vnode, we need to increment this before we set
+	 * hln_nlink = 2 below.
+	 */
+	INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock);
+	parent->hln_ctime = now;
+
+	dir->hln_dir = dot;
+	dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */
+	dir->hln_dirents = 2;
+	dir->hln_nlink = 2;
+}
+
+
+/*
+ * hyprlofs_dirtrunc removes all dir entries under this dir.
+ */
+void
+hyprlofs_dirtrunc(hlnode_t *dir)
+{
+	hldirent_t *hdp;
+	hlnode_t *tp;
+	size_t namelen;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+	ASSERT(dir->hln_type == VDIR);
+
+	if (dir->hln_looped)
+		return;
+
+	for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) {
+		ASSERT(hdp->hld_next != hdp);
+		ASSERT(hdp->hld_prev != hdp);
+		ASSERT(hdp->hld_hlnode);
+
+		dir->hln_dir = hdp->hld_next;
+		namelen = strlen(hdp->hld_name) + 1;
+
+		/*
+		 * Adjust the link counts to account for this dir entry removal.
+		 */
+		tp = hdp->hld_hlnode;
+
+		ASSERT(tp->hln_nlink > 0);
+		DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock);
+
+		hyprlofs_hash_out(hdp);
+
+		kmem_free(hdp, sizeof (hldirent_t) + namelen);
+		dir->hln_size -= (sizeof (hldirent_t) + namelen);
+		dir->hln_dirents--;
+	}
+
+	gethrestime(&now);
+	dir->hln_mtime = now;
+	dir->hln_ctime = now;
+
+	ASSERT(dir->hln_dir == NULL);
+	ASSERT(dir->hln_size == 0);
+	ASSERT(dir->hln_dirents == 0);
+}
+
+static int
+hldiraddentry(
+    hlnode_t	*dir,	/* target directory to make entry in */
+    hlnode_t	*hp,	/* new hlnode */
+    char	*name)
+{
+	hldirent_t	*hdp, *hpdp;
+	size_t		namelen, alloc_size;
+	timestruc_t	now;
+
+	/*
+	 * Make sure the parent dir wasn't removed from underneath the caller.
+	 */
+	if (dir->hln_dir == NULL)
+		return (ENOENT);
+
+	/* Check that everything is on the same FS. */
+	if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp)
+		return (EXDEV);
+
+	/* Alloc and init dir entry */
+	namelen = strlen(name) + 1;
+	alloc_size = namelen + sizeof (hldirent_t);
+	hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP);
+	if (hdp == NULL)
+		return (ENOSPC);
+
+	dir->hln_size += alloc_size;
+	dir->hln_dirents++;
+	hdp->hld_hlnode = hp;
+	hdp->hld_parent = dir;
+
+	/* The dir entry and its name were allocated sequentially. */
+	hdp->hld_name = (char *)hdp + sizeof (hldirent_t);
+	(void) strcpy(hdp->hld_name, name);
+
+	hyprlofs_hash_in(hdp);
+
+	/*
+	 * Some utilities expect the size of a directory to remain fairly
+	 * static.  For example, a routine which unlinks files between calls to
+	 * readdir(); the size of the dir changes from underneath it and so the
+	 * real dir offset in bytes is invalid.  To circumvent this problem, we
+	 * initialize a dir entry with a phony offset, and use this offset to
+	 * determine end of file in hyprlofs_readdir.
+	 */
+	hpdp = dir->hln_dir->hld_prev;
+	/*
+	 * Install at first empty "slot" in directory list.
+	 */
+	while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset -
+	    hpdp->hld_offset) <= 1) {
+		ASSERT(hpdp->hld_next != hpdp);
+		ASSERT(hpdp->hld_prev != hpdp);
+		ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset);
+		hpdp = hpdp->hld_next;
+	}
+	hdp->hld_offset = hpdp->hld_offset + 1;
+
+	/*
+	 * If we're at the end of the dirent list and the offset (which is
+	 * necessarily the largest offset in this dir) is more than twice the
+	 * number of dirents, that means the dir is 50% holes.  At this point
+	 * we reset the slot pointer back to the beginning of the dir so we
+	 * start using the holes. The idea is that if there are N dirents,
+	 * there must also be N holes, so we can satisfy the next N creates by
+	 * walking at most 2N entries; thus the average cost of a create is
+	 * constant. Note that we use the first dirent's hld_prev as the roving
+	 * slot pointer. This saves a word in every dirent.
+	 */
+	if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents)
+		dir->hln_dir->hld_prev = dir->hln_dir->hld_next;
+	else
+		dir->hln_dir->hld_prev = hdp;
+
+	ASSERT(hpdp->hld_next != hpdp);
+	ASSERT(hpdp->hld_prev != hpdp);
+
+	hdp->hld_next = hpdp->hld_next;
+	if (hdp->hld_next) {
+		hdp->hld_next->hld_prev = hdp;
+	}
+	hdp->hld_prev = hpdp;
+	hpdp->hld_next = hdp;
+
+	ASSERT(hdp->hld_next != hdp);
+	ASSERT(hdp->hld_prev != hdp);
+	ASSERT(hpdp->hld_next != hpdp);
+	ASSERT(hpdp->hld_prev != hpdp);
+
+	gethrestime(&now);
+	dir->hln_mtime = now;
+	dir->hln_ctime = now;
+
+	return (0);
+}
+
+static int
+hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op,
+    vnode_t *realvp, hlnode_t **newnode, cred_t *cr)
+{
+	hlnode_t	*hp;
+	enum vtype	type;
+
+	ASSERT(va != NULL);
+	ASSERT(op == DE_CREATE || op == DE_MKDIR);
+	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+		return (EOVERFLOW);
+	type = va->va_type;
+	hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+	hyprlofs_node_init(hm, hp, va, cr);
+
+	hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV;
+	hp->hln_vnode->v_type = type;
+	hp->hln_uid = crgetuid(cr);
+
+	/*
+	 * To determine the gid of the created file:
+	 *   If the directory's set-gid bit is set, set the gid to the gid
+	 *   of the parent dir, otherwise, use the process's gid.
+	 */
+	if (dir->hln_mode & VSGID)
+		hp->hln_gid = dir->hln_gid;
+	else
+		hp->hln_gid = crgetgid(cr);
+
+	/*
+	 * If we're creating a dir and the parent dir has the set-GID bit set,
+	 * set it on the new dir. Otherwise, if the user is neither privileged
+	 * nor a member of the file's new group, clear the file's set-GID bit.
+	 */
+	if (dir->hln_mode & VSGID && type == VDIR)
+		hp->hln_mode |= VSGID;
+	else {
+		if ((hp->hln_mode & VSGID) &&
+		    secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0)
+			hp->hln_mode &= ~VSGID;
+	}
+
+	if (va->va_mask & AT_ATIME)
+		hp->hln_atime = va->va_atime;
+	if (va->va_mask & AT_MTIME)
+		hp->hln_mtime = va->va_mtime;
+
+	if (op == DE_MKDIR) {
+		hyprlofs_dirinit(dir, hp);
+		hp->hln_looped = 0;
+	} else {
+		hp->hln_realvp = realvp;
+		hp->hln_size = va->va_size;
+		hp->hln_looped = 1;
+	}
+
+	*newnode = hp;
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
new file mode 100644
index 0000000000..1d857309f3
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+#define	MODESHIFT	3
+
+/* Initialize hlnode 'h' and link it onto the hlnode list of mount 'hm'. */
+void
+hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr)
+{
+	vnode_t *vp;
+	timestruc_t now;
+
+	ASSERT(vap != NULL);
+
+	rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL);
+	h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	h->hln_mask = 0;
+	h->hln_type = vap->va_type;
+	h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3);
+	h->hln_nlink = 1;
+	h->hln_size = 0;
+
+	if (cr == NULL) {	/* no cred: take ownership from vap */
+		h->hln_uid = vap->va_uid;
+		h->hln_gid = vap->va_gid;
+	} else {
+		h->hln_uid = crgetuid(cr);
+		h->hln_gid = crgetgid(cr);
+	}
+
+	h->hln_fsid = hm->hlm_dev;
+	h->hln_rdev = vap->va_rdev;
+	h->hln_blksize = PAGESIZE;
+	h->hln_nblocks = 0;
+	gethrestime(&now);
+	h->hln_atime = now;
+	h->hln_mtime = now;
+	h->hln_ctime = now;
+	h->hln_seq = 0;
+	h->hln_dir = NULL;
+
+	h->hln_vnode = vn_alloc(KM_SLEEP);
+	vp = HLNTOV(h);
+	vn_setops(vp, hyprlofs_vnodeops);
+	vp->v_vfsp = hm->hlm_vfsp;
+	vp->v_type = vap->va_type;
+	vp->v_rdev = vap->va_rdev;
+	vp->v_data = (caddr_t)h;
+	mutex_enter(&hm->hlm_contents);
+	/*
+	 * Increment the pseudo generation number for this hlnode. Since
+	 * hlnodes are allocated and freed, there really is no particular
+	 * generation number for a new hlnode.  Just fake it by using a
+	 * counter in each file system.
+	 */
+	h->hln_gen = hm->hlm_gen++;
+
+	/*
+	 * Add new hlnode to end of linked list of hlnodes for this hyprlofs.
+	 * Root dir is handled specially in hyprlofs_mount.
+	 */
+	if (hm->hlm_rootnode != (hlnode_t *)NULL) {
+		h->hln_forw = NULL;
+		h->hln_back = hm->hlm_rootnode->hln_back;
+		h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h;
+	}
+	mutex_exit(&hm->hlm_contents);
+	vn_exists(vp);
+}
+
+int
+hyprlofs_taccess(void *vtp, int mode, cred_t *cr)
+{
+	hlnode_t *hp = vtp;
+	int shift = 0;
+
+	/* Check access based on owner, group and public perms in hlnode. */
+	if (crgetuid(cr) != hp->hln_uid) {
+		shift += MODESHIFT;
+		if (groupmember(hp->hln_gid, cr) == 0)
+			shift += MODESHIFT;
+	}
+
+	return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid,
+	    hp->hln_mode << shift, mode));
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
new file mode 100644
index 0000000000..c582a8cac2
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
@@ -0,0 +1,614 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and
+ * lofs(7FS) file systems.  It is modeled on code from both of these file
+ * systems.
+ *
+ * The purpose is to create a high performance name space for files on which
+ * applications will compute.  Given a large number of data files with various
+ * owners, we want to construct a view onto those files such that only a subset
+ * is visible to the applications and such that the view can be changed very
+ * quickly as compute progresses.  Entries in the name space are not mounts and
+ * thus do not appear in the mnttab.  Entries in the name space are allowed to
+ * refer to files on different backing file systems.  Intermediate directories
+ * in the name space exist only in-memory, ala tmpfs.  There are no leaf nodes
+ * in the name space except for entries that refer to backing files ala lofs.
+ *
+ * The name space is managed via ioctls issued on the mounted file system and
+ * is mostly read-only for the compute applications.  That is, applications
+ * cannot create new files in the name space. If a file is unlinked by an
+ * application, that only removes the file from the name space, the backing
+ * file remains in place.  It is possible for applications to write-through to
+ * the backing files if the file system is mounted read-write.
+ *
+ * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES,
+ * and HYPRLOFS_RM_ALL ioctls on the top-level mount.
+ *
+ * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and
+ * the name(s) for the file(s) in the name space.  The name(s) may be path(s)
+ * which will be relative to the root of the mount and thus cannot begin with
+ * a /. If the name is a path, it does not have to correspond to any backing
+ * path. The intermediate directories will only exist in the name space. The
+ * entry(ies) will be added to the name space.
+ *
+ * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the
+ * name space which should be removed.  The name(s) may be path(s) which will
+ * be relative to the root of the mount and thus cannot begin with a /.  The
+ * named entry(ies) will be removed.
+ *
+ * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <fs/fs_subr.h>
+#include <vm/page.h>
+#include <vm/anon.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+#include <sys/fs/swapnode.h>
+#include <sys/fs/hyprlofs_info.h>
+
+/* File system type index; recorded by hyprlofsinit() from its fstype arg. */
+static int hyprlofsfstype;
+
+/*
+ * hyprlofs vfs operations.
+ *
+ * hyprlofsinit() is the initialization entry named in the vfsdef_t below; the
+ * remaining routines are installed through the vfs ops template that
+ * hyprlofsinit() registers.
+ */
+static int hyprlofsinit(int, char *);
+static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int hyprlofs_unmount(vfs_t *, int, cred_t *);
+static int hyprlofs_root(vfs_t *, vnode_t **);
+static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *);
+static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+/* Forward declaration; the (empty) option table is defined below. */
+static mntopts_t hyprlofs_mntopts;
+
+/*
+ * File system definition: version, name, init routine, VSW_* flags and the
+ * mount option table.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"hyprlofs",
+	hyprlofsinit,
+	VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
+	&hyprlofs_mntopts
+};
+
+/* hyprlofs defines no mount options of its own: zero entries, no table. */
+static mntopts_t hyprlofs_mntopts = {
+	0, NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+	&mod_fsops, "filesystem for hyprlofs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, &modlfs, NULL
+};
+
+/*
+ * Module load entry point: install the hyprlofs module linkage and return
+ * whatever mod_install() reports.
+ */
+int
+_init()
+{
+	int rc = mod_install(&modlinkage);
+
+	return (rc);
+}
+
+/*
+ * Module unload entry point.  The vfs and vnode operations vectors are only
+ * torn down once mod_remove() has succeeded; if it fails its error is
+ * returned and nothing is freed.
+ */
+int
+_fini()
+{
+	int rc = mod_remove(&modlinkage);
+
+	if (rc == 0) {
+		/* Tear down the operations vectors. */
+		(void) vfs_freevfsops_by_type(hyprlofsfstype);
+		vn_freevnodeops(hyprlofs_vnodeops);
+	}
+
+	return (rc);
+}
+
+/*
+ * Module information entry point: pass the request through to mod_info()
+ * for this module's linkage.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	int rc = mod_info(&modlinkage, modinfop);
+
+	return (rc);
+}
+
+/*
+ * The following are patchable variables limiting the amount of system
+ * resources hyprlofs can use.
+ *
+ * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can
+ * use for its data structures (e.g. hlnodes, directory entries). It is set
+ * as a percentage of physical memory which is determined when hyprlofs is
+ * first used in the system.
+ *
+ * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for
+ * the rest of the system. If the amount of free swap space in the system
+ * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon
+ * allocations will fail.
+ */
+size_t hyprlofs_maxkmem = 0;
+size_t hyprlofs_minfree = 0;	/* 0 means "unpatched"; see hyprlofsinit() */
+size_t hyprlofs_kmemspace;	/* bytes of kernel heap used by all hyprlofs */
+
+/*
+ * Device numbering for mounts: the major is obtained once in hyprlofsinit();
+ * hyprlofs_minor is advanced under hyprlofs_minor_lock by hyprlofs_mount()
+ * until an unused device number is found.
+ */
+static major_t hyprlofs_major;
+static minor_t hyprlofs_minor;
+static kmutex_t	hyprlofs_minor_lock;
+
+/*
+ * One-time initialization, run when the hyprlofs module is loaded: set up
+ * the hash, register the vfs and vnode operations, establish the default
+ * for hyprlofs_minfree and obtain the device major used for mounts.
+ *
+ * Returns 0 on success, or the error from vfs_setfsops()/vn_make_ops().
+ * Failing to obtain a unique major is only warned about; major 0 is used
+ * in that case.
+ */
+static int
+hyprlofsinit(int fstype, char *name)
+{
+	static const fs_operation_def_t hl_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = hyprlofs_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = hyprlofs_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = hyprlofs_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = hyprlofs_statvfs },
+		VFSNAME_VGET,		{ .vfs_vget = hyprlofs_vget },
+		NULL,			NULL
+	};
+	extern  void    hyprlofs_hash_init();
+	int rc;
+
+	hyprlofs_hash_init();
+	hyprlofsfstype = fstype;
+	ASSERT(hyprlofsfstype != 0);
+
+	if ((rc = vfs_setfsops(fstype, hl_vfsops_template, NULL)) != 0) {
+		cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template");
+		return (rc);
+	}
+
+	if ((rc = vn_make_ops(name, hyprlofs_vnodeops_template,
+	    &hyprlofs_vnodeops)) != 0) {
+		/* Undo the vfs ops registration done just above. */
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template");
+		return (rc);
+	}
+
+	/*
+	 * hyprlofs_minfree is an absolute limit of swap space which still
+	 * allows other processes to execute.  Only set it if it has not
+	 * been patched to a non-zero value.
+	 */
+	if (hyprlofs_minfree == 0)
+		hyprlofs_minfree = btopr(HYPRLOFSMINFREE);
+
+	hyprlofs_major = getudev();
+	if (hyprlofs_major == (major_t)-1) {
+		cmn_err(CE_WARN,
+		    "hyprlofsinit: Can't get unique device number.");
+		hyprlofs_major = 0;
+	}
+
+	mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+	return (0);
+}
+
+/*
+ * Mount a hyprlofs file system on the directory mvp.
+ *
+ * The caller must pass both secpolicy_fs_mount() and
+ * secpolicy_hyprlofs_control().  Remounts are rejected with EBUSY, as is
+ * mounting on a busy or root vnode unless MS_OVERLAY is given.  On success
+ * an hlfsmount_t is allocated and attached to vfsp, a device number is
+ * assigned, and the root hlnode is created, taking its mode/uid/gid from
+ * the covered mount point when those attributes can be read.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	hlfsmount_t *hm = NULL;
+	hlnode_t *hp;
+	struct pathname dpn;
+	int error;
+	vattr_t rattr;
+	int got_attrs;	/* VOP_GETATTR() result: 0 means attrs are valid */
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		return (error);
+	if (secpolicy_hyprlofs_control(cr) != 0)
+		return (EPERM);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	if (uap->flags & MS_REMOUNT)
+		return (EBUSY);
+
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/* Having the resource be anything but "swap" doesn't make sense. */
+	vfs_setresource(vfsp, "swap", 0);
+
+	if ((error = pn_get(uap->dir,
+	    (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE,
+	    &dpn)) != 0)
+		goto out;
+
+	/* Non-sleeping allocation: fail the mount rather than block. */
+	if ((hm = kmem_zalloc(sizeof (hlfsmount_t),
+	    KM_NORMALPRI | KM_NOSLEEP)) == NULL) {
+		pn_free(&dpn);
+		error = ENOMEM;
+		goto out;
+	}
+
+	/* Get an available minor device number for this mount */
+	mutex_enter(&hyprlofs_minor_lock);
+	do {
+		hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32;
+		hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor);
+	} while (vfs_devismounted(hm->hlm_dev));
+	mutex_exit(&hyprlofs_minor_lock);
+
+	/*
+	 * Set but don't bother entering the mutex since hlfsmount is not on
+	 * the mount list yet.
+	 */
+	mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL);
+
+	hm->hlm_vfsp = vfsp;
+
+	vfsp->vfs_data = (caddr_t)hm;
+	vfsp->vfs_fstype = hyprlofsfstype;
+	vfsp->vfs_dev = hm->hlm_dev;
+	vfsp->vfs_bsize = PAGESIZE;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype);
+	/* Keep a private copy of the mount point path (freed at unmount). */
+	hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+	(void) strcpy(hm->hlm_mntpath, dpn.pn_path);
+
+	/* allocate and initialize root hlnode structure */
+	bzero(&rattr, sizeof (vattr_t));
+	rattr.va_mode = (mode_t)(S_IFDIR | 0777);
+	rattr.va_type = VDIR;
+	rattr.va_rdev = 0;
+	hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+	/*
+	 * hm->hlm_rootnode is still NULL here (hm was zero-allocated), so
+	 * hyprlofs_node_init() does not link this node into the hlnode list;
+	 * the list is set up by hand below.
+	 */
+	hyprlofs_node_init(hm, hp, &rattr, cr);
+
+	/* Get the mode, uid, and gid from the underlying mount point. */
+	rattr.va_mask = AT_MODE|AT_UID|AT_GID;
+	got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
+
+	rw_enter(&hp->hln_rwlock, RW_WRITER);
+	HLNTOV(hp)->v_flag |= VROOT;
+
+	/*
+	 * If the getattr succeeded, use its results, otherwise allow the
+	 * previously set defaults to prevail.
+	 */
+	if (got_attrs == 0) {
+		hp->hln_mode = rattr.va_mode;
+		hp->hln_uid = rattr.va_uid;
+		hp->hln_gid = rattr.va_gid;
+	}
+
+	/*
+	 * Initialize linked list of hlnodes so that the back pointer of the
+	 * root hlnode always points to the last one on the list and the
+	 * forward pointer of the last node is null
+	 */
+	hp->hln_back = hp;
+	hp->hln_forw = NULL;
+	hp->hln_nlink = 0;
+	hm->hlm_rootnode = hp;
+
+	hyprlofs_dirinit(hp, hp);
+
+	rw_exit(&hp->hln_rwlock);
+
+	pn_free(&dpn);
+	error = 0;
+
+out:
+	return (error);
+}
+
+/*
+ * Unmount a hyprlofs file system.
+ *
+ * The caller must pass both secpolicy_fs_unmount() and
+ * secpolicy_hyprlofs_control().  Forced unmounts are not supported
+ * (ENOTSUP).  The unmount fails with EBUSY if the root vnode has more than
+ * one reference or any other node has a reference.  Otherwise every
+ * directory is truncated, every node is released, and the hlfsmount_t and
+ * its mount path are freed.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+	hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+	hlnode_t *hnp, *cancel;
+	vnode_t	*vp;
+	int error;
+
+	if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+		return (error);
+	if (secpolicy_hyprlofs_control(cr) != 0)
+		return (EPERM);
+
+	/*
+	 * forced unmount is not supported by this file system
+	 * and thus, ENOTSUP, is being returned.
+	 */
+	if (flag & MS_FORCE)
+		return (ENOTSUP);
+
+	mutex_enter(&hm->hlm_contents);
+
+	/*
+	 * If there are no open files, only the root node should have a ref cnt.
+	 * With hlm_contents held, nothing can be added or removed. There may
+	 * be some dirty pages.  To prevent fsflush from disrupting the unmount,
+	 * put a hold on each node while scanning. If we find a previously
+	 * referenced node, undo the holds we have placed and fail EBUSY.
+	 */
+	hnp = hm->hlm_rootnode;
+	if (HLNTOV(hnp)->v_count > 1) {
+		mutex_exit(&hm->hlm_contents);
+		return (EBUSY);
+	}
+
+	for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) {
+		if ((vp = HLNTOV(hnp))->v_count > 0) {
+			/*
+			 * Busy node: drop the holds taken so far on the
+			 * nodes between the root and this one.
+			 */
+			cancel = hm->hlm_rootnode->hln_forw;
+			while (cancel != hnp) {
+				vp = HLNTOV(cancel);
+				ASSERT(vp->v_count > 0);
+				VN_RELE(vp);
+				cancel = cancel->hln_forw;
+			}
+			mutex_exit(&hm->hlm_contents);
+			return (EBUSY);
+		}
+		VN_HOLD(vp);
+	}
+
+	/* We can drop the mutex now because no one can find this mount */
+	mutex_exit(&hm->hlm_contents);
+
+	/*
+	 * Free all alloc'd memory associated with this FS. To do this, we go
+	 * through the file list twice, once to remove all the dir entries, and
+	 * then to remove all the files.
+	 */
+
+	/* Remove all directory entries */
+	for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) {
+		rw_enter(&hnp->hln_rwlock, RW_WRITER);
+		if (hnp->hln_type == VDIR)
+			hyprlofs_dirtrunc(hnp);
+		rw_exit(&hnp->hln_rwlock);
+	}
+
+	ASSERT(hm->hlm_rootnode);
+
+	/*
+	 * All links are gone, v_count is keeping nodes in place. VN_RELE
+	 * should make the node disappear, unless somebody is holding pages
+	 * against it.  Wait and retry until it disappears.
+	 *
+	 * We re-acquire the lock to prevent others who have a HOLD on a hlnode
+	 * from blowing it away (in hyprlofs_inactive) while we're trying to
+	 * get to it here. Once we have a HOLD on it we know it'll stick around.
+	 */
+	mutex_enter(&hm->hlm_contents);
+
+	/*
+	 * Remove all the files (except the rootnode) backwards.  The root's
+	 * hln_back always names the last node on the list, so the loop ends
+	 * once the root is the only node left.
+	 */
+	while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) {
+		mutex_exit(&hm->hlm_contents);
+		/* Note we handled the link count in pass 2 above. */
+		vp = HLNTOV(hnp);
+		VN_RELE(vp);
+		mutex_enter(&hm->hlm_contents);
+		/*
+		 * It's still there after the RELE. Someone else like pageout
+		 * has a hold on it so wait a bit and then try again.
+		 */
+		if (hnp == hm->hlm_rootnode->hln_back) {
+			VN_HOLD(vp);
+			mutex_exit(&hm->hlm_contents);
+			delay(hz / 4);
+			mutex_enter(&hm->hlm_contents);
+		}
+	}
+	mutex_exit(&hm->hlm_contents);
+
+	/* Finally drop the reference on the root node itself. */
+	VN_RELE(HLNTOV(hm->hlm_rootnode));
+
+	ASSERT(hm->hlm_mntpath);
+
+	/* Size matches the strlen()+1 style allocation done at mount time. */
+	kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1);
+
+	mutex_destroy(&hm->hlm_contents);
+	kmem_free(hm, sizeof (hlfsmount_t));
+
+	return (0);
+}
+
+/*
+ * Hand back the root vnode of the mount in *vpp, with a new hold on it.
+ * Always returns 0.
+ */
+static int
+hyprlofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+	hlnode_t *root = hm->hlm_rootnode;
+	vnode_t *rootvp;
+
+	ASSERT(root);
+
+	rootvp = HLNTOV(root);
+	VN_HOLD(rootvp);
+	*vpp = rootvp;
+
+	return (0);
+}
+
+/*
+ * Fill in *sbp with file system statistics for the mount.
+ *
+ * Block counts are expressed in PAGESIZE units and are derived from the
+ * currently available swap, less hyprlofs_minfree, and further clamped to
+ * the zone's swap cap when the effective zone is not the global zone and a
+ * cap is set.  File counts are an estimate based on availrmem.  Always
+ * returns 0.
+ */
+static int
+hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
+{
+	hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+	ulong_t	blocks;
+	dev32_t d32;
+	zoneid_t eff_zid;
+	struct zone *zp;
+
+	/*
+	 * The FS may have been mounted by the GZ on behalf of the NGZ.  In
+	 * that case, the hlfsmount zone_id will be the global zone.  We want
+	 * to show the swap cap inside the zone in this case, even though the
+	 * FS was mounted by the GZ.
+	 *
+	 * NOTE(review): zone_id is compared against GLOBAL_ZONEUNIQID here
+	 * and below rather than against a zone-id constant -- confirm the two
+	 * are interchangeable for the global zone.
+	 */
+	if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
+		zp = curproc->p_zone;
+	else
+		zp = hm->hlm_vfsp->vfs_zone;
+
+	if (zp == NULL)
+		eff_zid = GLOBAL_ZONEUNIQID;
+	else
+		eff_zid = zp->zone_id;
+
+	sbp->f_bsize = PAGESIZE;
+	sbp->f_frsize = PAGESIZE;
+
+	/*
+	 * Find the amount of available physical and memory swap
+	 */
+	mutex_enter(&anoninfo_lock);
+	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+	blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+	mutex_exit(&anoninfo_lock);
+
+	/* Leave hyprlofs_minfree pages for the rest of the system. */
+	if (blocks > hyprlofs_minfree)
+		sbp->f_bfree = blocks - hyprlofs_minfree;
+	else
+		sbp->f_bfree = 0;
+
+	sbp->f_bavail = sbp->f_bfree;
+
+	/*
+	 * No per-mount usage is added in here, so the total number of blocks
+	 * is reported as equal to the free count computed above.
+	 */
+	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+	/* zp is non-NULL here: a NULL zp forces eff_zid to the global value. */
+	if (eff_zid != GLOBAL_ZONEUNIQID &&
+	    zp->zone_max_swap_ctl != UINT64_MAX) {
+		/*
+		 * If the fs is used by a NGZ with a swap cap, then report the
+		 * capped size.
+		 */
+		rctl_qty_t cap, used;
+		pgcnt_t pgcap, pgused;
+
+		mutex_enter(&zp->zone_mem_lock);
+		cap = zp->zone_max_swap_ctl;
+		used = zp->zone_max_swap;
+		mutex_exit(&zp->zone_mem_lock);
+
+		pgcap = btop(cap);
+		pgused = btop(used);
+
+		/*
+		 * NOTE(review): pgcap - pgused presumably wraps to a huge
+		 * value if used ever exceeds cap; the MIN() would then pick
+		 * the uncapped f_bfree.  Confirm used <= cap is guaranteed.
+		 */
+		sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+		sbp->f_bavail = sbp->f_bfree;
+		sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+	}
+
+	/*
+	 * This is fairly inaccurate since it doesn't take into account the
+	 * names stored in the directory entries.
+	 */
+	sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+	    (sizeof (hlnode_t) + sizeof (hldirent_t));
+
+	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	sbp->f_fsid = d32;
+	(void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name);
+	(void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr));
+	/*
+	 * ensure null termination
+	 */
+	sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	sbp->f_namemax = MAXNAMELEN - 1;
+	return (0);
+}
+
+/*
+ * Translate a file identifier into a held vnode.
+ *
+ * The mount's hlnode list is searched for a node whose id matches the fid.
+ * *vpp is set (and held) only if that node's generation also matches and
+ * it still has links; in every other case *vpp is left NULL.  The function
+ * always returns 0.
+ */
+static int
+hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp)
+{
+	hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+	hlfid_t *want = (hlfid_t *)fidp;
+	hlnode_t *node;
+	vnode_t *vp;
+
+	*vpp = NULL;
+
+	mutex_enter(&hm->hlm_contents);
+	for (node = hm->hlm_rootnode; node != NULL; node = node->hln_forw) {
+		mutex_enter(&node->hln_tlock);
+
+		if (node->hln_nodeid != want->hlfid_ino) {
+			mutex_exit(&node->hln_tlock);
+			continue;
+		}
+
+		/*
+		 * Only one hlnode can have this id at a time, so the search
+		 * ends here whether or not the node is usable.  A generation
+		 * mismatch or a zero link count means the file is not found.
+		 */
+		if (node->hln_gen == want->hlfid_gen &&
+		    node->hln_nlink != 0) {
+			vp = (vnode_t *)HLNTOV(node);
+			*vpp = vp;
+
+			VN_HOLD(vp);
+
+			if ((node->hln_mode & S_ISVTX) &&
+			    !(node->hln_mode & (S_IXUSR | S_IFDIR))) {
+				mutex_enter(&vp->v_lock);
+				vp->v_flag |= VISSWAP;
+				mutex_exit(&vp->v_lock);
+			}
+		}
+
+		mutex_exit(&node->hln_tlock);
+		break;
+	}
+	mutex_exit(&hm->hlm_contents);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
new file mode 100644
index 0000000000..a2064dfa1f
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
@@ -0,0 +1,1441 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <sys/fs/hyprlofs.h>
+#include <sys/fs/hyprlofs_info.h>
+#include <sys/mman.h>
+#include <vm/pvn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+/*
+ * Forward declarations for the namespace-management helpers that
+ * hyprlofs_ioctl() dispatches to.
+ */
+static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *,
+		caller_context_t *);
+static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *,
+		int);
+static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int);
+static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
+		int);
+static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *,
+		int);
+
+/*
+ * This is a somewhat arbitrary upper limit on the number of entries we can
+ * pass in on a single add/rm ioctl call.  This is only used to validate that
+ * the input list looks sane.
+ */
+#define	MAX_IOCTL_PARAMS	100000
+
+/*
+ * Open a hyprlofs vnode.  Nodes that are not looped in need no work and
+ * succeed immediately; for looped-in nodes the open is passed through to
+ * the real vnode and its result is returned.
+ */
+static int
+hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t *realvp = REALVP(*vpp);
+	int rc = 0;
+
+	if (VTOHLN(*vpp)->hln_looped != 0) {
+		/*
+		 * Take an extra reference across the call since VOP_OPEN()
+		 * may decide to release the vnode it is given.
+		 */
+		VN_HOLD(realvp);
+		rc = VOP_OPEN(&realvp, flag, cr, ct);
+		ASSERT(realvp->v_count > 1);
+		VN_RELE(realvp);
+	}
+
+	return (rc);
+}
+
+/*
+ * Close a hyprlofs vnode.  Looped-in nodes pass the close through to the
+ * real vnode; for all other nodes the calling process's locks and shares
+ * on the vnode are cleaned up and 0 is returned.
+ */
+static int
+hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ct)
+{
+	pid_t pid;
+
+	if (VTOHLN(vp)->hln_looped != 0)
+		return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct));
+
+	pid = ttoproc(curthread)->p_pid;
+	cleanlocks(vp, pid, 0);
+	cleanshares(vp, pid);
+
+	return (0);
+}
+
+/*
+ * Read from a hyprlofs vnode: directories fail with EISDIR, anything else
+ * is read from the real vnode.
+ */
+static int
+hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	if (vp->v_type != VDIR)
+		return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));
+
+	return (EISDIR);
+}
+
+/*
+ * Write through to the real vnode.  Only regular files may be written
+ * (EINVAL otherwise), and only when the vnode is not read-only (EROFS).
+ */
+static int
+hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* We don't support writing to non-regular files */
+	if (vp->v_type != VREG)
+		return (EINVAL);
+
+	return (vn_is_readonly(vp) ? EROFS :
+	    VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/*
+ * Copy in the strings for a single add/remove request and apply it.
+ *
+ * `ent' holds the user addresses and lengths of the name (and, for
+ * HYPRLOFS_ADD_ENTRIES, the backing path); `nm' and `path' are scratch
+ * buffers of MAXPATHLEN bytes supplied by the caller.  The lengths do not
+ * include a terminating NUL, so each must be strictly less than MAXPATHLEN
+ * to leave room for the NUL appended here.
+ *
+ * Returns 0, EINVAL for a bad length, EFAULT for a failed copyin, or the
+ * error from hyprlofs_add_entry()/hyprlofs_rm_entry().
+ */
+static int
+hyprlofs_ioctl_entry(vnode_t *vp, int cmd, const hyprlofs_entry_t *ent,
+    char *nm, char *path, cred_t *cr, caller_context_t *ct, int flag)
+{
+	if (ent->hle_nlen == 0 || ent->hle_nlen >= MAXPATHLEN)
+		return (EINVAL);
+	if (copyin(ent->hle_name, nm, ent->hle_nlen) != 0)
+		return (EFAULT);
+	nm[ent->hle_nlen] = '\0';
+
+	if (cmd != HYPRLOFS_ADD_ENTRIES)
+		return (hyprlofs_rm_entry(vp, nm, cr, ct, flag));
+
+	if (ent->hle_plen == 0 || ent->hle_plen >= MAXPATHLEN)
+		return (EINVAL);
+	if (copyin(ent->hle_path, path, ent->hle_plen) != 0)
+		return (EFAULT);
+	path[ent->hle_plen] = '\0';
+
+	return (hyprlofs_add_entry(vp, path, nm, cr, ct));
+}
+
+/*
+ * ioctl entry point; manages the hyprlofs name space.
+ *
+ * Only the root vnode of the mount accepts these ioctls (ENOTTY otherwise)
+ * and the caller must pass secpolicy_hyprlofs_control() (EPERM otherwise).
+ *
+ * HYPRLOFS_ADD_ENTRIES / HYPRLOFS_RM_ENTRIES take a user array of at most
+ * MAX_IOCTL_PARAMS entries, in either the native or the 32-bit layout
+ * depending on the caller's data model.  Entries are processed in order and
+ * processing stops at the first failure; entries already applied are not
+ * rolled back.  The kernel copy of the array is freed on every exit path.
+ *
+ * HYPRLOFS_RM_ALL and HYPRLOFS_GET_ENTRIES are handed to their helpers.
+ * Any other command returns ENOTTY.
+ */
+/* ARGSUSED */
+static int
+hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag,
+    cred_t *cr, int *rvalp, caller_context_t *ct)
+{
+	/*
+	 * The count and size are kept unsigned so that a huge user-supplied
+	 * count cannot turn negative and slip past the range check.
+	 */
+	uint_t cnt, i;
+	size_t len;
+	int error = 0;
+	char path[MAXPATHLEN];
+	char nm[MAXPATHLEN];
+
+	/* We only support the hyprlofs ioctls on the root vnode */
+	if (!(vp->v_flag & VROOT))
+		return (ENOTTY);
+
+	/*
+	 * Check if managing hyprlofs is allowed.
+	 */
+	if (secpolicy_hyprlofs_control(cr) != 0)
+		return (EPERM);
+
+	if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) {
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			hyprlofs_entries_t ebuf;
+			hyprlofs_entry_t *e;
+
+			if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+				return (EFAULT);
+			cnt = ebuf.hle_len;
+			if (cnt > MAX_IOCTL_PARAMS)
+				return (EINVAL);
+			/* An empty list is trivially complete. */
+			if (cnt == 0)
+				return (0);
+			len = sizeof (hyprlofs_entry_t) * cnt;
+
+			e = kmem_alloc(len, KM_SLEEP);
+			if (copyin((void *)(ebuf.hle_entries), e, len) != 0)
+				error = EFAULT;
+
+			for (i = 0; error == 0 && i < cnt; i++) {
+				error = hyprlofs_ioctl_entry(vp, cmd, &e[i],
+				    nm, path, cr, ct, flag);
+			}
+
+			kmem_free(e, len);
+			return (error);
+		} else {
+			hyprlofs_entries32_t ebuf32;
+			hyprlofs_entry32_t *e32;
+
+			if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+				return (EFAULT);
+
+			cnt = ebuf32.hle_len;
+			if (cnt > MAX_IOCTL_PARAMS)
+				return (EINVAL);
+			/* An empty list is trivially complete. */
+			if (cnt == 0)
+				return (0);
+			len = sizeof (hyprlofs_entry32_t) * cnt;
+
+			e32 = kmem_alloc(len, KM_SLEEP);
+			if (copyin((void *)(unsigned long)(ebuf32.hle_entries),
+			    e32, len) != 0)
+				error = EFAULT;
+
+			for (i = 0; error == 0 && i < cnt; i++) {
+				hyprlofs_entry_t ent;
+
+				/* Widen the 32-bit entry to native form. */
+				ent.hle_name = (void *)(unsigned long)
+				    e32[i].hle_name;
+				ent.hle_nlen = e32[i].hle_nlen;
+				ent.hle_path = (void *)(unsigned long)
+				    e32[i].hle_path;
+				ent.hle_plen = e32[i].hle_plen;
+
+				error = hyprlofs_ioctl_entry(vp, cmd, &ent,
+				    nm, path, cr, ct, flag);
+			}
+
+			kmem_free(e32, len);
+			return (error);
+		}
+	}
+
+	if (cmd == HYPRLOFS_RM_ALL) {
+		return (hyprlofs_rm_all(vp, cr, ct, flag));
+	}
+
+	if (cmd == HYPRLOFS_GET_ENTRIES) {
+		return (hyprlofs_get_all(vp, data, cr, ct, flag));
+	}
+
+	return (ENOTTY);
+}
+
+/*
+ * Return the attributes of a hyprlofs vnode in *vap.
+ *
+ * Everything except the block count comes from the hlnode.  For a looped-in
+ * node the block count is taken from the real vnode, so its getattr is
+ * called first and any error from it is returned; for other nodes the block
+ * count is derived from the node's size.
+ */
+static int
+hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+	vattr_t tmp_va;
+
+	if (tp->hln_looped == 1) {
+		int error;
+
+		/*
+		 * va_mask is an input to VOP_GETATTR(), so it must not be
+		 * left as uninitialized stack contents.  Only va_nblocks is
+		 * consumed below, so that is all that is requested.
+		 */
+		tmp_va.va_mask = AT_NBLOCKS;
+		if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr,
+		    ct)) != 0)
+			return (error);
+	}
+
+	mutex_enter(&tp->hln_tlock);
+	vap->va_type = vp->v_type;
+	vap->va_mode = tp->hln_mode & MODEMASK;
+	vap->va_uid = tp->hln_uid;
+	vap->va_gid = tp->hln_gid;
+	vap->va_fsid = tp->hln_fsid;
+	vap->va_nodeid = (ino64_t)tp->hln_nodeid;
+	vap->va_nlink = tp->hln_nlink;
+	vap->va_size = (u_offset_t)tp->hln_size;
+	vap->va_atime = tp->hln_atime;
+	vap->va_mtime = tp->hln_mtime;
+	vap->va_ctime = tp->hln_ctime;
+	vap->va_blksize = PAGESIZE;
+	vap->va_rdev = tp->hln_rdev;
+	vap->va_seq = tp->hln_seq;
+
+	if (tp->hln_looped == 1) {
+		vap->va_nblocks = tmp_va.va_nblocks;
+	} else {
+		/* Round the size up to whole pages, then count disk blocks. */
+		vap->va_nblocks =
+		    (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+	}
+	mutex_exit(&tp->hln_tlock);
+	return (0);
+}
+
+/*
+ * Change attributes stored in the hlnode.
+ *
+ * Requests that include any AT_NOSET attribute or AT_XVATTR are rejected
+ * with EINVAL.  The change must be approved by secpolicy_vnode_setattr();
+ * after that the mode (permission bits only), uid, gid, atime and mtime
+ * named in the mask are updated, and ctime is refreshed whenever uid, gid,
+ * mode or mtime changed.
+ */
+/*ARGSUSED4*/
+static int
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags,
+    cred_t *cr, caller_context_t *ct)
+{
+	hlnode_t *node = (hlnode_t *)VTOHLN(vp);
+	vattr_t *attrs;
+	long mask;
+	int error;
+
+	/* Cannot set these attributes */
+	if ((vap->va_mask & (AT_NOSET | AT_XVATTR)) != 0)
+		return (EINVAL);
+
+	mutex_enter(&node->hln_tlock);
+
+	attrs = &node->hln_attr;
+
+	/*
+	 * Change file access modes. Must be owner or have sufficient
+	 * privileges.
+	 */
+	error = secpolicy_vnode_setattr(cr, vp, vap, attrs, flags,
+	    hyprlofs_taccess, node);
+
+	if (error == 0) {
+		/* Read the mask only after the policy check has run. */
+		mask = vap->va_mask;
+
+		if (mask & AT_MODE) {
+			/* Keep the file type bits, replace the rest. */
+			attrs->va_mode = (attrs->va_mode & S_IFMT) |
+			    (vap->va_mode & ~S_IFMT);
+		}
+		if (mask & AT_UID)
+			attrs->va_uid = vap->va_uid;
+		if (mask & AT_GID)
+			attrs->va_gid = vap->va_gid;
+		if (mask & AT_ATIME)
+			attrs->va_atime = vap->va_atime;
+		if (mask & AT_MTIME)
+			attrs->va_mtime = vap->va_mtime;
+
+		if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+			gethrestime(&node->hln_ctime);
+	}
+
+	mutex_exit(&node->hln_tlock);
+	return (error);
+}
+
+/*
+ * Check access to a hyprlofs vnode.  Write access to a regular file on a
+ * read-only vnode fails with EROFS.  Looped-in nodes defer to the real
+ * vnode; other nodes are checked against the hlnode's own permissions.
+ */
+static int
+hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	hlnode_t *node = (hlnode_t *)VTOHLN(vp);
+	int rc;
+
+	if ((mode & VWRITE) && vp->v_type == VREG && vn_is_readonly(vp))
+		return (EROFS);
+
+	if (node->hln_looped == 1)
+		return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct));
+
+	mutex_enter(&node->hln_tlock);
+	rc = hyprlofs_taccess(node, mode, cr);
+	mutex_exit(&node->hln_tlock);
+
+	return (rc);
+}
+
+/*
+ * Look up the name `nm' in directory `dvp'.
+ *
+ * If the directory is looped in, the lookup is passed to the real vnode.
+ * Extended-attribute lookups are rejected with EINVAL.  An empty name
+ * yields the directory itself, with a new hold.  Otherwise the name is
+ * searched for with hyprlofs_dirlookup() and, on success, the vnode of the
+ * node it found is returned in *vpp.
+ */
+/* ARGSUSED3 */
+static int
+hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+	hlnode_t *dir = (hlnode_t *)VTOHLN(dvp);
+	hlnode_t *found = NULL;
+	int error;
+
+	if (dir->hln_looped == 1) {
+		return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir,
+		    cr, ct, direntflags, realpnp));
+	}
+
+	if (flags & LOOKUP_XATTR)
+		return (EINVAL);
+
+	/* Null component name is a synonym for directory being searched. */
+	if (*nm == '\0') {
+		VN_HOLD(dvp);
+		*vpp = dvp;
+		return (0);
+	}
+	ASSERT(dir);
+
+	error = hyprlofs_dirlookup(dir, nm, &found, cr);
+	if (error != 0)
+		return (error);
+
+	ASSERT(found);
+	*vpp = HLNTOV(found);
+	return (0);
+}
+
+/*
+ * Create the loopback from the hyprlofs vnode to the real vnode.
+ *
+ * `nm' is looked up in directory `dvp'.  If it does not exist, a new entry
+ * referring to `rvp' is entered using `vap' as the attribute template.  If
+ * it already exists, nothing is entered: an existing regular file yields
+ * EEXIST, an access failure (or EISDIR for a directory when `mode' includes
+ * VWRITE) is returned as is, and any other existing node yields 0 after a
+ * create event is posted on it.
+ *
+ * The hold taken on an existing node is dropped on every return path; the
+ * node is not handed back to the caller.
+ *
+ * NOTE(review): on the "already exists, return 0" path `rvp' is not linked
+ * into the name space, so the caller still owns its hold on `rvp' -- confirm
+ * callers account for that.
+ */
+static int
+hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap,
+    int mode, cred_t *cr, caller_context_t *ct)
+{
+	hlnode_t *parent;
+	hlfsmount_t *tm;
+	int error;
+	hlnode_t *oldtp;
+	vnode_t *vp;
+
+	parent = (hlnode_t *)VTOHLN(dvp);
+	tm = (hlfsmount_t *)VTOHLM(dvp);
+	error = 0;
+	oldtp = NULL;
+
+	if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
+		/* we don't support the sticky bit */
+		vap->va_mode &= ~VSVTX;
+	} else if (vap->va_type == VNON) {
+		return (EINVAL);
+	}
+
+	/* Null component name is a synonym for directory being searched. */
+	if (*nm == '\0') {
+		VN_HOLD(dvp);
+		oldtp = parent;
+	} else {
+		error = hyprlofs_dirlookup(parent, nm, &oldtp, cr);
+	}
+
+	if (error == 0) {	/* name found */
+		ASSERT(oldtp);
+
+		rw_enter(&oldtp->hln_rwlock, RW_WRITER);
+
+		/*
+		 * if create/read-only an existing directory, allow it
+		 */
+		if ((oldtp->hln_type == VDIR) && (mode & VWRITE))
+			error = EISDIR;
+		else {
+			error = hyprlofs_taccess(oldtp, mode, cr);
+		}
+
+		if (error) {
+			rw_exit(&oldtp->hln_rwlock);
+			hlnode_rele(oldtp);
+			return (error);
+		}
+
+		vp = HLNTOV(oldtp);
+		rw_exit(&oldtp->hln_rwlock);
+
+		if (vp->v_type == VREG) {
+			hlnode_rele(oldtp);
+			return (EEXIST);
+		}
+
+		vnevent_create(vp, ct);
+		/*
+		 * Drop the hold taken above.  The node is not returned to
+		 * the caller, so keeping the hold would leak a reference.
+		 */
+		hlnode_rele(oldtp);
+		return (0);
+	}
+
+	if (error != ENOENT)
+		return (error);
+
+	rw_enter(&parent->hln_rwlock, RW_WRITER);
+	error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL,
+	    cr);
+	rw_exit(&parent->hln_rwlock);
+
+	return (error);
+}
+
+/*
+ * Create an in-memory directory based on the add-entry ioctl name.
+ * If the dir exists, return EEXIST but still also return node in vpp.
+ *
+ * Returns ENOENT if the parent has no links left, EACCES if the existing
+ * entry is a looped-in node, otherwise 0, EEXIST or the error from
+ * hyprlofs_dirlookup()/hyprlofs_direnter().  The vnode stored in *vpp is
+ * returned without a hold: the reference on the node is dropped before
+ * *vpp is set.
+ */
+static int
+hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr)
+{
+	hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+	hlnode_t *self = NULL;
+	hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp);
+	int error;
+
+	/*
+	 * Might be dangling directory.  Catch it here, because a ENOENT return
+	 * from hyprlofs_dirlookup() is a valid return.
+	 */
+	if (parent->hln_nlink == 0)
+		return (ENOENT);
+
+	error = hyprlofs_dirlookup(parent, nm, &self, cr);
+	if (error == 0) {
+		ASSERT(self);
+		/*
+		 * NOTE(review): self is still dereferenced after this
+		 * release; presumably the directory entry keeps the node
+		 * alive -- confirm.
+		 */
+		hlnode_rele(self);
+		/* We can't loop in under a looped in directory */
+		if (self->hln_looped)
+			return (EACCES);
+		*vpp = HLNTOV(self);
+		return (EEXIST);
+	}
+	if (error != ENOENT)
+		return (error);
+
+	rw_enter(&parent->hln_rwlock, RW_WRITER);
+	error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL,
+	    va, &self, cr);
+	rw_exit(&parent->hln_rwlock);
+
+	/*
+	 * NOTE(review): this assumes hyprlofs_direnter() sets self when it
+	 * returns EEXIST; if self can still be NULL here the release below
+	 * would dereference NULL -- verify against hyprlofs_direnter().
+	 */
+	if (error == 0 || error == EEXIST) {
+		hlnode_rele(self);
+		*vpp = HLNTOV(self);
+	}
+
+	return (error);
+}
+
+/*
+ * Loop in a file or directory into the namespace.
+ *
+ * `fspath' names the backing file or directory and `fsname' is the name it
+ * should have relative to the root vnode `vp'.  `fsname' may contain '/'
+ * separators; each intermediate component is created as an in-memory
+ * directory if it does not already exist.  It must not be absolute, must
+ * not contain empty or ".." components, and must not end in '/'.  `fsname'
+ * is modified in place while it is parsed.
+ *
+ * Returns 0 on success or an errno value; on failure the hold on the
+ * backing vnode is released.
+ *
+ * NOTE(review): hyprlofs_loopback() also returns 0 when the final name
+ * already exists as a non-regular node without linking realvp; in that case
+ * the hold on realvp appears to be kept with nothing referring to it --
+ * confirm whether it should be released here.
+ */
+static int
+hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname,
+    cred_t *cr, caller_context_t *ct)
+{
+	int error;
+	char *p, *pnm;
+	vnode_t *realvp, *dvp;
+	vattr_t va;
+
+	/*
+	 * Get vnode for the real file/dir. We'll have a hold on realvp which
+	 * we won't vn_rele until hyprlofs_inactive.
+	 */
+	if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP,
+	    &realvp)) != 0)
+		return (error);
+
+	/* no devices allowed */
+	if (IS_DEVVP(realvp)) {
+		VN_RELE(realvp);
+		return (ENODEV);
+	}
+
+	/*
+	 * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS
+	 * to trigger the mount of the intended filesystem. This causes a
+	 * loopback mount of the intended filesystem instead of the AUTOFS
+	 * filesystem.
+	 */
+	if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) {
+		VN_RELE(realvp);
+		return (error);
+	}
+
+	/*
+	 * We're interested in the top most filesystem. This is specially
+	 * important when fspath is a trigger AUTOFS node, since we're really
+	 * interested in mounting the filesystem AUTOFS mounted as result of
+	 * the VOP_ACCESS() call not the AUTOFS node itself.
+	 */
+	if (vn_mountedvfs(realvp) != NULL) {
+		if ((error = traverse(&realvp)) != 0) {
+			VN_RELE(realvp);
+			return (error);
+		}
+	}
+
+	/* VNON marks the directory template as "not fetched yet". */
+	va.va_type = VNON;
+	/*
+	 * If the target name is a path, make sure we have all of the
+	 * intermediate directories, creating them if necessary.
+	 */
+	dvp = vp;
+	pnm = p = fsname;
+
+	/* path cannot be absolute */
+	if (*p == '/') {
+		VN_RELE(realvp);
+		return (EINVAL);
+	}
+
+	for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+		if (va.va_type == VNON) {
+			/*
+			 * Use the top-level dir as the template va for mkdir.
+			 * va_mask is an input to VOP_GETATTR(), so it must be
+			 * set rather than left as stack garbage.
+			 */
+			va.va_mask = AT_ALL;
+			if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) {
+				VN_RELE(realvp);
+				return (error);
+			}
+		}
+
+		/* Temporarily terminate the current component. */
+		*p = '\0';
+
+		/* Path component cannot be empty or relative */
+		if (pnm[0] == '\0' ||
+		    (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) {
+			VN_RELE(realvp);
+			return (EINVAL);
+		}
+
+		/* An already existing directory (EEXIST) is fine. */
+		if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 &&
+		    error != EEXIST) {
+			VN_RELE(realvp);
+			return (error);
+		}
+
+		*p = '/';
+		pnm = p + 1;
+	}
+
+	/* The file name is required */
+	if (pnm[0] == '\0') {
+		VN_RELE(realvp);
+		return (EINVAL);
+	}
+
+	/*
+	 * Now use the real file's va as the template va.  Request all of the
+	 * standard attributes explicitly; the mask must not be left
+	 * uninitialized for the backing file system to interpret.
+	 */
+	va.va_mask = AT_ALL;
+	if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) {
+		VN_RELE(realvp);
+		return (error);
+	}
+
+	/* Make the vnode */
+	error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct);
+	if (error != 0)
+		VN_RELE(realvp);
+	return (error);
+}
+
+/*
+ * Remove the looped in file named by the dvp-relative path fsname.
+ */
+static int
+hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	int error;
+	char *p, *pnm;
+	hlnode_t *parent, *fndtp;
+	hlnode_t *held = NULL;	/* intermediate dir we hold, if any */
+
+	pnm = p = fsname;
+
+	/* path cannot be absolute */
+	if (*p == '/')
+		return (EINVAL);
+
+	/* If the target name is a path, walk down to the containing dir. */
+	parent = (hlnode_t *)VTOHLN(dvp);
+	for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+		*p = '\0';
+		/* Path component cannot be empty or ".." */
+		error = EINVAL;
+		if (pnm[0] == '\0' ||
+		    (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0'))
+			goto out;
+
+		if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0)
+			goto out;
+		/* dirlookup returned fndtp held; drop the level above it. */
+		if (held != NULL)
+			hlnode_rele(held);
+		held = parent = fndtp;
+		dvp = HLNTOV(fndtp);
+		pnm = p + 1;
+	}
+
+	/* The file name is required; remove it from the parent dir. */
+	error = (pnm[0] == '\0') ? EINVAL :
+	    hyprlofs_remove(dvp, pnm, cr, ct, flags);
+out:
+	if (held != NULL)
+		hlnode_rele(held);
+	return (error);
+}
+
+/*
+ * Remove every entry under dvp, recursing into non-looped subdirectories.
+ */
+static int
+hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	int error = 0;
+	hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+	hldirent_t *hdp;
+
+	hlnode_hold(hp);
+
+	/*
+	 * Someone could have removed all the entries in the directory after
+	 * we put a hold on the vnode.  Just return.  NOTE(review): unlike
+	 * hyprlofs_readdir(), no rwlock is taken in this function.
+	 */
+	if (hp->hln_dir == NULL) {
+		if (hp->hln_nlink) {
+			panic("empty directory 0x%p", (void *)hp);
+			/*NOTREACHED*/
+		}
+		goto done;
+	}
+
+	hdp = hp->hln_dir;
+	while (hdp) {
+		hlnode_t *fndhp;
+
+		if (strcmp(hdp->hld_name, ".") == 0 ||
+		    strcmp(hdp->hld_name, "..") == 0) {
+			hdp = hdp->hld_next;
+			continue;
+		}
+
+		/* This holds the fndhp vnode */
+		error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+		if (error != 0)
+			goto done;
+		hlnode_rele(fndhp);
+
+		if (fndhp->hln_looped == 0) {
+			/* recursively remove contents of this subdir */
+			if (fndhp->hln_type == VDIR) {
+				vnode_t *tvp = HLNTOV(fndhp);
+
+				error = hyprlofs_rm_all(tvp, cr, ct, flags);
+				if (error != 0)
+					goto done;
+			}
+		}
+
+		/* remove the entry */
+		error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags);
+		if (error != 0)
+			goto done;
+
+		hdp = hp->hln_dir;	/* entry is gone; restart at head */
+	}
+
+done:
+	hlnode_rele(hp);
+	return (error);
+}
+
+/*
+ * Copy out (without NULs) the name and real path of each looped in node.
+ */
+static int
+hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp,
+    char *prefix, int *pcnt, int n_max,
+    cred_t *cr, caller_context_t *ct, int flags)
+{
+	int error = 0;
+	int too_big = 0;
+	int cnt;
+	int len;
+	hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+	hldirent_t *hdp;
+	char *path;
+
+	cnt = *pcnt;
+	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	hlnode_hold(hp);
+
+	/*
+	 * Someone could have removed all the entries in the directory after
+	 * we put a hold on the vnode.  Just return.  NOTE(review): unlike
+	 * hyprlofs_readdir(), no rwlock is taken in this function.
+	 */
+	if (hp->hln_dir == NULL) {
+		if (hp->hln_nlink) {
+			panic("empty directory 0x%p", (void *)hp);
+			/*NOTREACHED*/
+		}
+		goto done;
+	}
+
+	hdp = hp->hln_dir;
+	while (hdp) {
+		hlnode_t *fndhp;
+		vnode_t *tvp;
+
+		if (strcmp(hdp->hld_name, ".") == 0 ||
+		    strcmp(hdp->hld_name, "..") == 0) {
+			hdp = hdp->hld_next;
+			continue;
+		}
+
+		/* This holds the fndhp vnode */
+		error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+		if (error != 0)
+			goto done;
+		hlnode_rele(fndhp);
+
+		if (fndhp->hln_looped == 0) {
+			/* recursively get contents of this subdir */
+			VERIFY(fndhp->hln_type == VDIR);
+			tvp = HLNTOV(fndhp);
+
+			if (*prefix == '\0')
+				(void) strlcpy(path, hdp->hld_name, MAXPATHLEN);
+			else
+				(void) snprintf(path, MAXPATHLEN, "%s/%s",
+				    prefix, hdp->hld_name);
+
+			error = hyprlofs_get_all_entries(tvp, hcp, path,
+			    &cnt, n_max, cr, ct, flags);
+
+			if (error == E2BIG) {
+				too_big = 1;
+				error = 0;
+			}
+			if (error != 0)
+				goto done;
+		} else {
+			if (cnt < n_max) {
+				char *p;
+
+				if (*prefix == '\0')
+					(void) strlcpy(path, hdp->hld_name,
+					    MAXPATHLEN);
+				else
+					(void) snprintf(path, MAXPATHLEN,
+					    "%s/%s", prefix, hdp->hld_name);
+
+				len = strlen(path);
+				ASSERT(len <= MAXPATHLEN);
+				if (copyout(path, (void *)(hcp[cnt].hce_name),
+				    len)) {
+					error = EFAULT;
+					goto done;
+				}
+
+				tvp = REALVP(HLNTOV(fndhp));
+				if (tvp->v_path == vn_vpath_empty) {
+					p = "<unknown>";
+				} else {
+					p = tvp->v_path;
+				}
+				len = strlen(p);
+				ASSERT(len <= MAXPATHLEN);
+				if (copyout(p, (void *)(hcp[cnt].hce_path),
+				    len)) {
+					error = EFAULT;
+					goto done;
+				}
+			}
+
+			cnt++;		/* counted even when not copied out */
+			if (cnt > n_max)
+				too_big = 1;
+		}
+
+		hdp = hdp->hld_next;
+	}
+
+done:
+	hlnode_rele(hp);
+	kmem_free(path, MAXPATHLEN);
+
+	*pcnt = cnt;
+	if (error == 0 && too_big == 1)
+		error = E2BIG;
+
+	return (error);
+}
+
+/*
+ * Return a list of all looped in files in the namespace.  data is the user
+ * address of a hyprlofs_curr_entries_t (or its 32-bit form): hce_cnt gives
+ * the capacity of hce_entries on entry and is rewritten with the number of
+ * entries found, both on success and when E2BIG is returned.
+ */
+static int
+hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	int limit, cnt, error;
+	model_t model;
+	hyprlofs_curr_entry_t *e;
+	hyprlofs_curr_entries_t ebuf;
+	hyprlofs_curr_entries32_t ebuf32;
+
+	model = get_udatamodel();
+
+	/*
+	 * The copied-in structure is kept at function scope so that the
+	 * copyout below rewrites only hce_cnt and hands every other field
+	 * back exactly as the caller supplied it, instead of copying out
+	 * uninitialized stack contents over the caller's hce_entries.
+	 */
+	if (model == DATAMODEL_NATIVE) {
+		if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+			return (EFAULT);
+		limit = ebuf.hce_cnt;
+		e = ebuf.hce_entries;
+	} else {
+		if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+			return (EFAULT);
+		limit = ebuf32.hce_cnt;
+		e = (hyprlofs_curr_entry_t *)(unsigned long)
+		    (ebuf32.hce_entries);
+	}
+
+	if (limit > MAX_IOCTL_PARAMS)
+		return (EINVAL);
+
+	cnt = 0;
+	error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct,
+	    flags);
+
+	/* Report the count found even when it did not all fit. */
+	if (error == 0 || error == E2BIG) {
+		if (model == DATAMODEL_NATIVE) {
+			ebuf.hce_cnt = cnt;
+			if (copyout(&ebuf, (void *)data, sizeof (ebuf)))
+				return (EFAULT);
+		} else {
+			ebuf32.hce_cnt = cnt;
+			if (copyout(&ebuf32, (void *)data, sizeof (ebuf32)))
+				return (EFAULT);
+		}
+	}
+
+	return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+	int error;
+	hlnode_t *hp = NULL;
+
+	/* This holds the hp vnode */
+	error = hyprlofs_dirlookup(parent, nm, &hp, cr);
+	if (error)
+		return (error);
+
+	ASSERT(hp);
+	rw_enter(&parent->hln_rwlock, RW_WRITER);	/* parent first */
+	rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+	error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr);
+
+	rw_exit(&hp->hln_rwlock);
+	rw_exit(&parent->hln_rwlock);
+	vnevent_remove(HLNTOV(hp), dvp, nm, ct);	/* even on error */
+
+	/*
+	 * We've now dropped the dir link so by rele-ing our vnode we should
+	 * clean up in hyprlofs_inactive.
+	 */
+	hlnode_rele(hp);
+
+	return (error);
+}
+
+/* ARGSUSED4 */
+static int
+hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+    caller_context_t *ct, int flags)
+{
+	hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+	hlnode_t *self = NULL;
+	vnode_t *vp;
+	int error = 0;
+
+	/* Return error if removing . or .. */
+	if (strcmp(nm, ".") == 0)
+		return (EINVAL);
+	if (strcmp(nm, "..") == 0)
+		return (EEXIST); /* Should be ENOTEMPTY */
+	error = hyprlofs_dirlookup(parent, nm, &self, cr);
+	if (error)
+		return (error);
+
+	rw_enter(&parent->hln_rwlock, RW_WRITER);	/* parent first */
+	rw_enter(&self->hln_rwlock, RW_WRITER);
+
+	vp = HLNTOV(self);
+	if (vp == dvp || vp == cdir) {
+		error = EINVAL;
+		goto done1;
+	}
+	if (self->hln_type != VDIR) {
+		error = ENOTDIR;
+		goto done1;
+	}
+
+	/*
+	 * When a dir is looped in, we only remove the in-memory dir, not the
+	 * backing dir.
+	 */
+	if (self->hln_looped == 0) {
+		mutex_enter(&self->hln_tlock);
+		if (self->hln_nlink > 2) {
+			mutex_exit(&self->hln_tlock);
+			error = EEXIST;
+			goto done1;
+		}
+		mutex_exit(&self->hln_tlock);
+
+		if (vn_vfswlock(vp)) {
+			error = EBUSY;
+			goto done1;
+		}
+		if (vn_mountedvfs(vp) != NULL) {
+			error = EBUSY;
+			goto done;
+		}
+
+		/*
+		 * Check for an empty directory, i.e. only includes entries for
+		 * "." and ".."
+		 */
+		if (self->hln_dirents > 2) {
+			error = EEXIST;		/* SIGH should be ENOTEMPTY */
+			/*
+			 * Update atime because checking hln_dirents is
+			 * equivalent to reading the directory
+			 */
+			gethrestime(&self->hln_atime);
+			goto done;
+		}
+
+		error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr);
+	} else {
+		error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr);
+	}
+
+done:		/* vn_vfswlock() is held here iff self is not looped in */
+	if (self->hln_looped == 0)
+		vn_vfsunlock(vp);
+done1:		/* only the two rwlocks are held here */
+	rw_exit(&self->hln_rwlock);
+	rw_exit(&parent->hln_rwlock);
+	vnevent_rmdir(HLNTOV(self), dvp, nm, ct);	/* even on error */
+
+	/*
+	 * We've now dropped the dir link so by rele-ing our vnode we should
+	 * clean up in hyprlofs_inactive.
+	 */
+	hlnode_rele(self);
+
+	return (error);
+}
+
+static int
+hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+	hldirent_t *hdp;
+	int error = 0;
+	size_t namelen;
+	struct dirent64 *dp;
+	ulong_t offset;
+	ulong_t total_bytes_wanted;
+	long outcount = 0;
+	long bufsize;
+	int reclen;
+	caddr_t outbuf;
+
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags));
+
+	if (uiop->uio_loffset >= MAXOFF_T) {
+		if (eofp)
+			*eofp = 1;
+		return (0);
+	}
+	/* assumes the caller already holds hln_rwlock as reader */
+	ASSERT(RW_READ_HELD(&hp->hln_rwlock));
+
+	if (uiop->uio_iovcnt != 1)
+		return (EINVAL);
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/*
+	 * There's a window here where someone could have removed
+	 * all the entries in the directory after we put a hold on the
+	 * vnode but before we grabbed the rwlock.  Just return.
+	 */
+	if (hp->hln_dir == NULL) {
+		if (hp->hln_nlink) {
+			panic("empty directory 0x%p", (void *)hp);
+			/*NOTREACHED*/
+		}
+		return (0);
+	}
+
+	/* Get space for multiple dir entries, plus one dirent64 of slack */
+	total_bytes_wanted = uiop->uio_iov->iov_len;
+	bufsize = total_bytes_wanted + sizeof (struct dirent64);
+	outbuf = kmem_alloc(bufsize, KM_SLEEP);
+
+	dp = (struct dirent64 *)((uintptr_t)outbuf);
+
+	offset = 0;
+	hdp = hp->hln_dir;
+	while (hdp) {
+		namelen = strlen(hdp->hld_name);	/* no +1 needed */
+		offset = hdp->hld_offset;
+		if (offset >= uiop->uio_offset) {
+			reclen = (int)DIRENT64_RECLEN(namelen);
+			if (outcount + reclen > total_bytes_wanted) {
+				if (!outcount)
+					/* Buffer too small for any entries. */
+					error = EINVAL;
+				break;
+			}
+			ASSERT(hdp->hld_hlnode != NULL);
+
+			/* strncpy() zero-fills the rest of d_name */
+			(void) strncpy(dp->d_name, hdp->hld_name,
+			    DIRENT64_NAMELEN(reclen));
+			dp->d_reclen = (ushort_t)reclen;
+			dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid;
+			dp->d_off = (offset_t)hdp->hld_offset + 1;
+			dp = (struct dirent64 *)
+			    ((uintptr_t)dp + dp->d_reclen);
+			outcount += reclen;
+			ASSERT(outcount <= bufsize);
+		}
+		hdp = hdp->hld_next;
+	}
+
+	if (!error)
+		error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+	if (!error) {
+		/*
+		 * If we reached the end of the list our offset should now be
+		 * just past the end.
+		 */
+		if (!hdp) {
+			offset += 1;
+			if (eofp)
+				*eofp = 1;
+		} else if (eofp)
+			*eofp = 0;
+		uiop->uio_offset = offset;
+	}
+	gethrestime(&hp->hln_atime);
+	kmem_free(outbuf, bufsize);
+	return (error);
+}
+
+static int
+hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+	if (VTOHLN(vp)->hln_looped != 1)	/* not looped in: no-op */
+		return (0);
+	return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct));
+}
+
+/* ARGSUSED */
+static void
+hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+	hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp);
+
+	rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+	mutex_enter(&hp->hln_tlock);
+	mutex_enter(&vp->v_lock);
+	ASSERT(vp->v_count >= 1);
+
+	/*
+	 * If we don't have the last hold or the link count is non-zero,
+	 * there's nothing to do except drop our hold.
+	 */
+	if (vp->v_count > 1 || hp->hln_nlink != 0) {
+		vp->v_count--;
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&hp->hln_tlock);
+		rw_exit(&hp->hln_rwlock);
+		return;
+	}
+
+	mutex_exit(&vp->v_lock);
+	mutex_exit(&hp->hln_tlock);
+
+	/* release hold on the real vnode now */
+	if (hp->hln_looped == 1 && hp->hln_realvp != NULL)
+		VN_RELE(hp->hln_realvp);
+
+	/* Here's our chance to send invalid event while we're between locks */
+	vn_invalid(HLNTOV(hp));
+
+	mutex_enter(&hm->hlm_contents);	/* unlink hp from forw/back list */
+	if (hp->hln_forw == NULL)
+		hm->hlm_rootnode->hln_back = hp->hln_back;
+	else
+		hp->hln_forw->hln_back = hp->hln_back;
+	hp->hln_back->hln_forw = hp->hln_forw;
+	mutex_exit(&hm->hlm_contents);
+	rw_exit(&hp->hln_rwlock);
+	rw_destroy(&hp->hln_rwlock);
+	mutex_destroy(&hp->hln_tlock);
+	vn_free(HLNTOV(hp));
+	kmem_free(hp, sizeof (hlnode_t));	/* hp is gone after this */
+}
+
+static int
+hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+	hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+	hlfid_t *hfid;
+
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_FID(REALVP(vp), fidp, ct));
+
+	if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) {
+		fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t);
+		return (ENOSPC);	/* fid_len now holds the needed size */
+	}
+
+	hfid = (hlfid_t *)fidp;		/* fill the caller's fid in place */
+	bzero(hfid, sizeof (hlfid_t));
+	hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t);
+
+	hfid->hlfid_ino = hp->hln_nodeid;
+	hfid->hlfid_gen = hp->hln_gen;
+
+	return (0);
+}
+
+static int
+hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+    cred_t *cr, caller_context_t *ct)
+{
+	/* Real vnode if looped in; else EACCES, to be consistent with mmap. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg,
+		    addr, rw, cr, ct));
+	return (EACCES);
+}
+
+int
+hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
+    cred_t *cr, caller_context_t *ct)
+{
+	/* Real vnode if looped in; else EACCES, to be consistent with mmap. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct));
+	return (EACCES);
+}
+
+static int
+hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* Real vnode if looped in; else EACCES, to be consistent with mmap. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot,
+		    flags, cr, ct));
+	return (EACCES);
+}
+
+static int
+hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* Real vnode if looped in; else EACCES, to be consistent with mmap. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot,
+		    maxprot, flags, cr, ct));
+	return (EACCES);
+}
+
+static int
+hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* Real vnode if looped in; else EACCES, to be consistent with mmap. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot,
+		    maxprot, flags, cr, ct));
+	return (EACCES);
+}
+
+static int
+hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
+    offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+	/* Defer to the real vnode when looped in; otherwise return EACCES. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct));
+	return (EACCES);
+}
+
+static int
+hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+    caller_context_t *ct)
+{
+	/* Looped-in nodes defer to the real vnode; others range-check only. */
+	if (VTOHLN(vp)->hln_looped != 0)
+		return (VOP_SEEK(REALVP(vp), ooff, noffp, ct));
+	return ((*noffp >= 0 && *noffp <= MAXOFFSET_T) ? 0 : EINVAL);
+}
+
+static int
+hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+	hlnode_t *node = VTOHLN(vp);
+
+	/* Looped-in nodes are locked through the real vnode. */
+	if (node->hln_looped == 1)
+		return (VOP_RWLOCK(REALVP(vp), write_lock, ct));
+
+	/*
+	 * Otherwise take the node's own lock in the requested mode.
+	 */
+	rw_enter(&node->hln_rwlock, write_lock ? RW_WRITER : RW_READER);
+	return (write_lock);
+}
+
+static void
+hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+	hlnode_t *node = VTOHLN(vp);
+
+	/* Undo hyprlofs_rwlock(): real vnode if looped in, else our lock. */
+	if (node->hln_looped != 1) {
+		rw_exit(&node->hln_rwlock);
+	} else {
+		VOP_RWUNLOCK(REALVP(vp), write_lock, ct);
+	}
+}
+
+static int
+hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* Looped-in nodes answer with the limits of the real vnode. */
+	if (VTOHLN(vp)->hln_looped == 1)
+		return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct));
+
+	switch (cmd) {
+	case _PC_XATTR_ENABLED:
+	case _PC_XATTR_EXISTS:
+	case _PC_SATTR_ENABLED:
+	case _PC_SATTR_EXISTS:
+		/* extended and system attribute queries are rejected */
+		return (EINVAL);
+	case _PC_TIMESTAMP_RESOLUTION:
+		/* nanosecond timestamp resolution */
+		*valp = 1L;
+		return (0);
+	default:
+		break;
+	}
+
+	/* Everything else gets the generic answer. */
+	return (fs_pathconf(vp, cmd, valp, cr, ct));
+}
+
+
+struct vnodeops *hyprlofs_vnodeops;
+
+const fs_operation_def_t hyprlofs_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = hyprlofs_open },
+	VOPNAME_CLOSE,		{ .vop_close = hyprlofs_close },
+	VOPNAME_READ,		{ .vop_read = hyprlofs_read },
+	VOPNAME_WRITE,		{ .vop_write = hyprlofs_write },
+	VOPNAME_IOCTL,		{ .vop_ioctl = hyprlofs_ioctl },
+	VOPNAME_GETATTR,	{ .vop_getattr = hyprlofs_getattr },
+	VOPNAME_SETATTR,	{ .vop_setattr = hyprlofs_setattr },
+	VOPNAME_ACCESS,		{ .vop_access = hyprlofs_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = hyprlofs_lookup },
+	VOPNAME_CREATE,		{ .error = fs_error },	/* not implemented */
+	VOPNAME_REMOVE,		{ .vop_remove = hyprlofs_remove },
+	VOPNAME_LINK,		{ .error = fs_error },	/* not implemented */
+	VOPNAME_RENAME,		{ .error = fs_error },	/* not implemented */
+	VOPNAME_MKDIR,		{ .error = fs_error },	/* not implemented */
+	VOPNAME_RMDIR,		{ .vop_rmdir = hyprlofs_rmdir },
+	VOPNAME_READDIR,	{ .vop_readdir = hyprlofs_readdir },
+	VOPNAME_SYMLINK,	{ .error = fs_error },	/* not implemented */
+	VOPNAME_READLINK,	{ .error = fs_error },	/* not implemented */
+	VOPNAME_FSYNC,		{ .vop_fsync = hyprlofs_fsync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = hyprlofs_inactive },
+	VOPNAME_FID,		{ .vop_fid = hyprlofs_fid },
+	VOPNAME_RWLOCK,		{ .vop_rwlock = hyprlofs_rwlock },
+	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = hyprlofs_rwunlock },
+	VOPNAME_SEEK,		{ .vop_seek = hyprlofs_seek },
+	VOPNAME_SPACE,		{ .vop_space = hyprlofs_space },
+	VOPNAME_GETPAGE,	{ .vop_getpage = hyprlofs_getpage },
+	VOPNAME_PUTPAGE,	{ .vop_putpage = hyprlofs_putpage },
+	VOPNAME_MAP,		{ .vop_map = hyprlofs_map },
+	VOPNAME_ADDMAP,		{ .vop_addmap = hyprlofs_addmap },
+	VOPNAME_DELMAP,		{ .vop_delmap = hyprlofs_delmap },
+	VOPNAME_PATHCONF,	{ .vop_pathconf = hyprlofs_pathconf },
+	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
+	NULL,			NULL	/* table terminator */
+};
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 55ffb94805..59ec5d1829 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
@@ -57,6 +58,7 @@
 #include <sys/zone.h>
 #include <sys/dnlc.h>
 #include <sys/fs/snode.h>
+#include <sys/brand.h>
 
 /* Controls whether paths are stored with vnodes. */
 int vfs_vnode_path = 1;
@@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr)
 }
 
 /*
+ * Clean a stale v_path from a vnode.  This is only performed if the v_path has
+ * not been altered since it was found to be stale
+ */
+static void
+vnode_clear_vpath(vnode_t *vp, char *vpath_old)
+{
+	mutex_enter(&vp->v_lock);
+	if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) {
+		vp->v_path = vn_vpath_empty;	/* unhook under v_lock */
+		mutex_exit(&vp->v_lock);
+		kmem_free(vpath_old, strlen(vpath_old) + 1);
+	} else {		/* v_path changed since; leave it alone */
+		mutex_exit(&vp->v_lock);
+	}
+}
+
+/*
+ * Validate that pn resolves to vp: returns 0 if it does, 1 if it does not.
+ */
+static int
+vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn,
+    int flags, cred_t *cr)
+{
+	vnode_t *compvp;
+	/*
+	 * If we are in a zone or a chroot environment, then we have to
+	 * take additional steps, since the path to the root might not
+	 * be readable with the current credentials, even though the
+	 * process can legitimately access the file.  In this case, we
+	 * do the following:
+	 *
+	 * lookuppnvp() with all privileges to get the resolved path.
+	 * call localpath() to get the local portion of the path, and
+	 * continue as normal.
+	 *
+	 * If the conversion to a local path fails, then we continue
+	 * as normal.  This is a heuristic to make process object file
+	 * paths available from within a zone.  Because lofs doesn't
+	 * support page operations, the vnode stored in the seg_t is
+	 * actually the underlying real vnode, not the lofs node itself.
+	 * Most of the time, the lofs path is the same as the underlying
+	 * vnode (for example, /usr/lib/libc.so.1).
+	 */
+	if (vrootp != rootdir) {
+		char *local = NULL;
+
+		VN_HOLD(rootdir);
+		if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir,
+		    rootdir, kcred) == 0) {
+			local = localpath(rpn->pn_path, vrootp, kcred);
+			VN_RELE(compvp);
+		}
+
+		/*
+		 * The original pn was changed through lookuppnvp().
+		 * Set it to local for next validation attempt.
+		 */
+		if (local) {
+			(void) pn_set(pn, local);
+		} else {
+			return (1);
+		}
+	}
+
+	/*
+	 * We should have a local path at this point, so start the search from
+	 * the root of the current process.
+	 */
+	VN_HOLD(vrootp);
+	if (vrootp != rootdir)
+		VN_HOLD(vrootp);
+	if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp,
+	    cr) == 0) {
+		/*
+		 * Check to see if the returned vnode is the same as the one we
+		 * expect.
+		 */
+		if (vn_compare(vp, compvp) ||
+		    vnode_match(vp, compvp, cr)) {
+			VN_RELE(compvp);
+			return (0);
+		} else {
+			VN_RELE(compvp);
+		}
+	}
+
+	return (1);
+}
+
+/*
  * Given a directory, return the full, resolved path.  This looks up "..",
  * searches for the given vnode in the parent, appends the component, etc.  It
  * is used to implement vnodetopath() and getcwd() when the cached path fails.
@@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
 	char		*bufloc;
 	size_t		dlen = DIRENT64_RECLEN(MAXPATHLEN);
 	refstr_t	*mntpt;
+	char *vpath_cached;
+	boolean_t vpath_stale;
 
 	/* Operation only allowed on directories */
 	ASSERT(vp->v_type == VDIR);
@@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
 		 * Shortcut: see if this vnode has correct v_path. If so,
 		 * we have the work done.
 		 */
+		vpath_cached = NULL;
+		vpath_stale = B_FALSE;
 		mutex_enter(&vp->v_lock);
-		if (vp->v_path != NULL) {
-
-			if ((err = pn_set(&pn, vp->v_path)) == 0) {
-				mutex_exit(&vp->v_lock);
-				rpn.pn_path = rpn.pn_buf;
-
-				/*
-				 * Ensure the v_path pointing to correct vnode
-				 */
-				VN_HOLD(vrootp);
-				if (vrootp != rootdir)
-					VN_HOLD(vrootp);
-				if (lookuppnvp(&pn, &rpn, flags, NULL,
-				    &cmpvp, vrootp, vrootp, cr) == 0) {
-
-					if (VN_CMP(vp, cmpvp)) {
-						VN_RELE(cmpvp);
+		if (vp->v_path != vn_vpath_empty &&
+		    pn_set(&pn, vp->v_path) == 0) {
+			vpath_cached = vp->v_path;
+			mutex_exit(&vp->v_lock);
+			rpn.pn_path = rpn.pn_buf;
 
-						complen = strlen(rpn.pn_path);
-						bufloc -= complen;
-						if (bufloc < buf) {
-							err = ERANGE;
-							goto out;
-						}
-						bcopy(rpn.pn_path, bufloc,
-						    complen);
-						break;
-					} else {
-						VN_RELE(cmpvp);
-					}
+			/* Ensure the v_path pointing to correct vnode */
+			if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags,
+			    cr) == 0) {
+				complen = strlen(rpn.pn_path);
+				bufloc -= complen;
+				if (bufloc < buf) {
+					err = ERANGE;
+					goto out;
 				}
+				bcopy(rpn.pn_path, bufloc, complen);
+				break;
 			} else {
-				mutex_exit(&vp->v_lock);
+				vpath_stale = B_TRUE;
 			}
 		} else {
 			mutex_exit(&vp->v_lock);
@@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
 		}
 
 		/*
-		 * Try to obtain the path component from dnlc cache
-		 * before searching through the directory.
-		 */
-		if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) {
-			/*
-			 * If we got parent vnode as a result,
-			 * then the answered path is correct.
-			 */
-			if (VN_CMP(cmpvp, pvp)) {
-				VN_RELE(cmpvp);
-				complen = strlen(dbuf);
-				bufloc -= complen;
-				if (bufloc <= buf) {
-					err = ENAMETOOLONG;
-					goto out;
-				}
-				bcopy(dbuf, bufloc, complen);
-
-				/* Prepend a slash to the current path */
-				*--bufloc = '/';
-
-				/* And continue with the next component */
-				VN_RELE(vp);
-				vp = pvp;
-				pvp = NULL;
-				continue;
-			} else {
-				VN_RELE(cmpvp);
-			}
-		}
-
-		/*
 		 * Search the parent directory for the entry corresponding to
 		 * this vnode.
 		 */
@@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
 		/* Prepend a slash to the current path.  */
 		*--bufloc = '/';
 
+		/* Clear vp->v_path if it was found to be stale. */
+		if (vpath_stale == B_TRUE) {
+			vnode_clear_vpath(vp, vpath_cached);
+		}
+
 		/* And continue with the next component */
 		VN_RELE(vp);
 		vp = pvp;
@@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
 			VN_RELE(vp);
 	}
 
-	pn_alloc(&pn);
 
 	/*
-	 * Check to see if we have a cached path in the vnode.
+	 * Check to see if we have a valid cached path in the vnode.
 	 */
+	pn_alloc(&pn);
 	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
+	if (vp->v_path != vn_vpath_empty) {
 		(void) pn_set(&pn, vp->v_path);
 		mutex_exit(&vp->v_lock);
 
-		pn_alloc(&rpn);
-
 		/* We should only cache absolute paths */
 		ASSERT(pn.pn_buf[0] == '/');
 
-		/*
-		 * If we are in a zone or a chroot environment, then we have to
-		 * take additional steps, since the path to the root might not
-		 * be readable with the current credentials, even though the
-		 * process can legitmately access the file.  In this case, we
-		 * do the following:
-		 *
-		 * lookuppnvp() with all privileges to get the resolved path.
-		 * call localpath() to get the local portion of the path, and
-		 * continue as normal.
-		 *
-		 * If the the conversion to a local path fails, then we continue
-		 * as normal.  This is a heuristic to make process object file
-		 * paths available from within a zone.  Because lofs doesn't
-		 * support page operations, the vnode stored in the seg_t is
-		 * actually the underlying real vnode, not the lofs node itself.
-		 * Most of the time, the lofs path is the same as the underlying
-		 * vnode (for example, /usr/lib/libc.so.1).
-		 */
-		if (vrootp != rootdir) {
-			char *local = NULL;
-			VN_HOLD(rootdir);
-			if (lookuppnvp(&pn, &rpn, FOLLOW,
-			    NULL, &compvp, rootdir, rootdir, kcred) == 0) {
-				local = localpath(rpn.pn_path, vrootp,
-				    kcred);
-				VN_RELE(compvp);
-			}
-
-			/*
-			 * The original pn was changed through lookuppnvp().
-			 * Set it to local for next validation attempt.
-			 */
-			if (local) {
-				(void) pn_set(&pn, local);
-			} else {
-				goto notcached;
+		pn_alloc(&rpn);
+		if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) {
+			/* Return the result, if we're able. */
+			if (buflen > rpn.pn_pathlen) {
+				bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
+				pn_free(&pn);
+				pn_free(&rpn);
+				VN_RELE(vrootp);
+				if (doclose) {
+					(void) VOP_CLOSE(vp, FREAD, 1, 0, cr,
+					    NULL);
+					VN_RELE(vp);
+				}
+				return (0);
 			}
 		}
-
 		/*
-		 * We should have a local path at this point, so start the
-		 * search from the root of the current process.
+		 * A stale v_path will be purged by the later dirtopath lookup.
 		 */
-		VN_HOLD(vrootp);
-		if (vrootp != rootdir)
-			VN_HOLD(vrootp);
-		ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL,
-		    &compvp, vrootp, vrootp, cr);
-		if (ret == 0) {
-			/*
-			 * Check to see if the returned vnode is the same as
-			 * the one we expect.  If not, give up.
-			 */
-			if (!vn_compare(vp, compvp) &&
-			    !vnode_match(vp, compvp, cr)) {
-				VN_RELE(compvp);
-				goto notcached;
-			}
-
-			VN_RELE(compvp);
-
-			/*
-			 * Return the result.
-			 */
-			if (buflen <= rpn.pn_pathlen)
-				goto notcached;
-
-			bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
-			pn_free(&pn);
-			pn_free(&rpn);
-			VN_RELE(vrootp);
-			if (doclose) {
-				(void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
-				VN_RELE(vp);
-			}
-			return (0);
-		}
-
-notcached:
 		pn_free(&rpn);
 	} else {
 		mutex_exit(&vp->v_lock);
 	}
-
 	pn_free(&pn);
 
 	if (vp->v_type != VDIR) {
-		/*
-		 * If we don't have a directory, try to find it in the dnlc via
-		 * reverse lookup.  Once this is found, we can use the regular
-		 * directory search to find the full path.
-		 */
-		if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) {
-			/*
-			 * Check if we have read privilege so, that
-			 * we can lookup the path in the directory
-			 */
-			ret = 0;
-			if ((flags & LOOKUP_CHECKREAD)) {
-				ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL);
-			}
-			if (ret == 0) {
-				ret = dirtopath(vrootp, pvp, buf, buflen,
-				    flags, cr);
-			}
-			if (ret == 0) {
-				len = strlen(buf);
-				if (len + strlen(path) + 1 >= buflen) {
-					ret = ENAMETOOLONG;
-				} else {
-					if (buf[len - 1] != '/')
-						buf[len++] = '/';
-					bcopy(path, buf + len,
-					    strlen(path) + 1);
-				}
-			}
-
-			VN_RELE(pvp);
-		} else
-			ret = ENOENT;
-	} else
+		ret = ENOENT;
+	} else {
 		ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);
+	}
 
 	VN_RELE(vrootp);
 	if (doclose) {
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
new file mode 100644
index 0000000000..3c1405d4af
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
@@ -0,0 +1,524 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/varargs.h>
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lxproc.h"
+
+#define	LXPRCACHE_NAME "lxpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+struct lxpr_uiobuf {
+	uio_t *uiop;		/* uio that flushed output is moved into */
+	char *buffer;		/* staging buffer, placed after the struct */
+	uint32_t buffsize;	/* size of buffer in bytes */
+	char *pos;		/* next free byte in buffer */
+	size_t beg;		/* output offset of the buffer start */
+	int error;		/* sticky error; nonzero stops output */
+};
+
+int lxpr_bufsize = 4000;
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+	/* Allocate memory for both lxpr_uiobuf and output buffer */
+	int bufsize = lxpr_bufsize;
+	struct lxpr_uiobuf *uiobuf =
+	    kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP);
+
+	uiobuf->uiop = uiop;
+	uiobuf->buffer = (char *)&uiobuf[1];
+	uiobuf->buffsize = bufsize;
+	uiobuf->pos = uiobuf->buffer;
+	uiobuf->beg = 0;
+	uiobuf->error = 0;
+
+	return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+	ASSERT(uiobuf != NULL);
+	ASSERT(uiobuf->pos == uiobuf->buffer);
+
+	kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+	uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+	ASSERT(uiobuf->error == 0);
+
+	uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+	off_t off = uiobuf->uiop->uio_offset;
+	caddr_t uaddr = uiobuf->buffer;
+	size_t beg = uiobuf->beg;
+	size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+	if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		ASSERT(off >= beg);
+
+		if (beg + size > off && off >= 0)
+			uiobuf->error =
+			    uiomove(uaddr + (off - beg), size - (off - beg),
+			    UIO_READ, uiobuf->uiop);
+
+		uiobuf->beg += size;
+	}
+
+	uiobuf->pos = uaddr;
+
+	return (uiobuf->error);
+}
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+	/* Stop as soon as an error is recorded or the uio is exhausted */
+	while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		uintptr_t used =
+		    (uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer;
+		uintptr_t room = (uintptr_t)uiobuf->buffsize - used;
+		int fits = (room >= size);
+		size_t chunk = fits ? size : room;
+
+		/* Stage as much as the buffer can take */
+		bcopy(buf, uiobuf->pos, chunk);
+		uiobuf->pos += chunk;
+		if (fits)
+			return;
+
+		/* Buffer is full: push it out and carry on with the rest */
+		(void) lxpr_uiobuf_flush(uiobuf);
+		buf += chunk;
+		size -= chunk;
+	}
+}
+
+#define	TYPBUFFSIZE 256
+
+/* printf-style output; uses a heap buffer when TYPBUFFSIZE is too small */
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+	va_list args;
+	char buff[TYPBUFFSIZE];
+	int len;
+	char *buffer;
+
+	/* Can we still do any output */
+	if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+		return;
+
+	/* Try using stack allocated buffer */
+	va_start(args, fmt);
+	len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+	va_end(args);
+	if (len < TYPBUFFSIZE) {
+		lxpr_uiobuf_write(uiobuf, buff, len);
+		return;
+	}
+
+	/*
+	 * Not enough space on the stack.  The first vsnprintf() pass consumed
+	 * args, so restart the list before formatting again; len + 1 bytes is
+	 * exactly enough, so the second return value is not checked.
+	 */
+	buffer = kmem_alloc(len + 1, KM_SLEEP);
+	va_start(args, fmt);
+	(void) vsnprintf(buffer, len + 1, fmt, args);
+	va_end(args);
+	lxpr_uiobuf_write(uiobuf, buffer, len);
+	kmem_free(buffer, len + 1);
+}
+
+/*
+ * lxpr_lock():
+ *
+ * Lookup process from pid and return with p_lock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+	proc_t *p;
+	kmutex_t *mp;
+
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	for (;;) {
+		mutex_enter(&pidlock);
+
+		/*
+		 * If the pid is 1, we really want the zone's init process
+		 */
+		p = prfind((pid == 1) ?
+		    curproc->p_zone->zone_proc_initpid : pid);
+
+		if (p == NULL || p->p_stat == SIDL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait().  Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+
+		mutex_exit(&pidlock);
+
+		if (p->p_flag & SEXITING) {
+			/*
+			 * This process is exiting -- let it go.
+			 */
+			mutex_exit(mp);
+			return (NULL);
+		}
+
+		if (!(p->p_proc_flag & P_PR_LOCK))
+			break;
+
+		cv_wait(&pr_pid_cv[p->p_slot], mp);
+		mutex_exit(mp);
+	}
+
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+	return (p);
+}
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	cv_signal(&pr_pid_cv[p->p_slot]);
+	p->p_proc_flag &= ~P_PR_LOCK;
+	mutex_exit(&p->p_lock);
+	THREAD_KPRI_RELEASE();
+}
+
+void
+lxpr_initnodecache()
+{
+	lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+	    sizeof (lxpr_node_t), 0,
+	    lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+	kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+	lxpr_node_t	*lxpnp = buf;
+	vnode_t		*vp;
+
+	vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+	if (vp == NULL)
+		return (-1);
+
+	(void) vn_setops(vp, lxpr_vnodeops);
+	vp->v_data = lxpnp;
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+	lxpr_node_t	*lxpnp = buf;
+
+	vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd)
+{
+	if (pid == 1)
+		pid = curproc->p_zone->zone_proc_initpid;
+
+	switch (type) {
+	case LXPR_PIDDIR:
+		return (pid + 1);
+	case LXPR_PROCDIR:
+		return (maxpid + 2);
+	case LXPR_PID_FD_FD:
+		return (maxpid + 2 +
+		    (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+		    LXPR_NFILES + fd);
+	default:
+		return (maxpid + 2 +
+		    (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+		    type);
+	}
+}
+
+/*
+ * Return the inode number of a node's parent directory.
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+	/*
+	 * The root of the file system has no lxproc parent (its parent is
+	 * the mounted-on vnode), so it reports its own inode number.
+	 */
+	if (lxpnp->lxpr_type == LXPR_PROCDIR)
+		return (lxpnp->lxpr_ino);
+
+	return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
+{
+	lxpr_node_t *lxpnp;
+	vnode_t *vp;
+	user_t *up;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below)
+	 */
+	gethrestime(&now);
+	lxpnp->lxpr_type = type;
+	lxpnp->lxpr_realvp = NULL;
+	lxpnp->lxpr_parent = dp;
+	VN_HOLD(dp);
+	if (p != NULL) {
+		lxpnp->lxpr_pid = ((p->p_pid ==
+		    curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid);
+
+		lxpnp->lxpr_time = PTOU(p)->u_start;
+		lxpnp->lxpr_uid = crgetruid(p->p_cred);
+		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
+	} else {
+		/* Pretend files without a proc belong to sched */
+		lxpnp->lxpr_pid = 0;
+		lxpnp->lxpr_time = now;
+		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+	}
+
+	/* initialize the vnode data */
+	vp = lxpnp->lxpr_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Do node specific stuff
+	 */
+	switch (type) {
+	case LXPR_PROCDIR:
+		vp->v_flag |= VROOT;
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
+		break;
+
+	case LXPR_PID_CURDIR:
+		ASSERT(p != NULL);
+
+		/*
+		 * Zombie check.  p_stat is officially protected by pidlock,
+		 * but we can't grab pidlock here because we already hold
+		 * p_lock.  Luckily if we look at the process exit code
+		 * we see that p_stat only transitions from SRUN to SZOMB
+		 * while p_lock is held.  Aside from this, the only other
+		 * p_stat transition that we need to be aware about is
+		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
+		 * ignores nodes in the SIDL state so we'll never get a node
+		 * that isn't already in the SRUN state.
+		 */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp = up->u_cdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_ROOTDIR:
+		ASSERT(p != NULL);
+		/* Zombie check.  see locking comment above */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp =
+			    up->u_rdir != NULL ? up->u_rdir : rootdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_EXE:
+		ASSERT(p != NULL);
+		lxpnp->lxpr_realvp = p->p_exec;
+		if (lxpnp->lxpr_realvp != NULL) {
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;
+		break;
+
+	case LXPR_SELF:
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_FD_FD:
+		ASSERT(p != NULL);
+		/* lxpr_realvp is set after we return */
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0700;	/* read-write-exe owner only */
+		break;
+
+	case LXPR_PID_FDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0500;	/* read-search by owner only */
+		break;
+
+	case LXPR_PIDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0511;
+		break;
+
+	case LXPR_NETDIR:
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by all */
+		break;
+
+	case LXPR_PID_ENV:
+	case LXPR_PID_MEM:
+		ASSERT(p != NULL);
+		/*FALLTHRU*/
+	case LXPR_KCORE:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0400;	/* read-only by owner only */
+		break;
+
+	default:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0444;	/* read-only by all */
+		break;
+	}
+
+	return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+	ASSERT(lxpnp != NULL);
+	ASSERT(LXPTOV(lxpnp) != NULL);
+
+	/*
+	 * delete any association with realvp
+	 */
+	if (lxpnp->lxpr_realvp != NULL)
+		VN_RELE(lxpnp->lxpr_realvp);
+
+	/*
+	 * delete any association with parent vp
+	 */
+	if (lxpnp->lxpr_parent != NULL)
+		VN_RELE(lxpnp->lxpr_parent);
+
+	/*
+	 * Release the lxprnode.
+	 */
+	kmem_cache_free(lxpr_node_cache, lxpnp);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
new file mode 100644
index 0000000000..1bb7bd3823
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+
+#include "lxproc.h"
+
+/* Module level parameters */
+static int	lxprocfstype;
+static dev_t	lxprocdev;
+static kmutex_t	lxpr_mount_lock;
+
+int nproc_highbit;	/* highbit(v.v_proc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lxproc",
+	lxpr_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+	&mod_fsops, "generic linux procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int retval;
+
+	/*
+	 * attempt to unload the module
+	 */
+	if ((retval = mod_remove(&modlinkage)) != 0)
+		goto done;
+
+	/*
+	 * destroy lxpr_node cache
+	 */
+	lxpr_fininodecache();
+
+	/*
+	 * clean out the vfsops and vnodeops
+	 */
+	(void) vfs_freevfsops_by_type(lxprocfstype);
+	vn_freevnodeops(lxpr_vnodeops);
+
+	mutex_destroy(&lxpr_mount_lock);
+done:
+	return (retval);
+}
+
+/*
+ * Initialization routine named in vfw.  Records the file system type,
+ * registers the vfs and vnode operation vectors, picks a device number and
+ * creates the node cache.  Returns 0, or the error from registering either
+ * ops vector, in which case the mount lock is destroyed again first.
+ */
+static int
+lxpr_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxpr_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = lxpr_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = lxpr_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = lxpr_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = lxpr_statvfs },
+		NULL,			NULL
+	};
+	extern const fs_operation_def_t lxpr_vnodeops_template[];
+	int error;
+	major_t dev;
+
+	nproc_highbit = highbit(v.v_proc);
+	lxprocfstype = fstype;
+	ASSERT(lxprocfstype != 0);
+
+	mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Associate VFS ops vector with this fstype. */
+	error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+		mutex_destroy(&lxpr_mount_lock);
+		return (error);
+	}
+
+	/* Set up vnode ops vector too. */
+	error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+		mutex_destroy(&lxpr_mount_lock);
+		return (error);
+	}
+
+	/*
+	 * Assign a unique "device" number (used by stat(2)).
+	 */
+	if ((dev = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+		dev = 0;
+	}
+
+	/* Make the pseudo device */
+	lxprocdev = makedevice(dev, 0);
+
+	/* Initialize cache for lxpr_nodes */
+	lxpr_initnodecache();
+
+	return (0);
+}
+
+/*
+ * Mount an lxproc file system on mvp.  The caller needs mount privilege and
+ * mvp must be a directory that is not busy unless MS_OVERLAY is given.  On
+ * success vfsp->vfs_data points at a new lxpr_mnt_t holding the root node, a
+ * zone hold and an LDI identifier.  Returns 0 or an errno value.
+ */
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+	lxpr_mnt_t *lxpr_mnt;
+	zone_t *zone = curproc->p_zone;
+	ldi_ident_t li;
+	int err;
+
+	/* must be root to mount */
+	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+		return (EPERM);
+
+	/* mount point must be a directory */
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	if (zone == global_zone) {
+		zone_t *mntzone;
+
+		mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+		zone_rele(mntzone);
+		if (zone != mntzone)
+			return (EBUSY);
+	}
+
+	/* Having the resource be anything but "lxproc" doesn't make sense */
+	vfs_setresource(vfsp, "lxproc", 0);
+
+	lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+	if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+		kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+		return (err);
+	}
+
+	lxpr_mnt->lxprm_li = li;
+
+	mutex_enter(&lxpr_mount_lock);
+
+	/* Ensure we don't allow overlaying mounts */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		mutex_exit(&lxpr_mount_lock);
+		/* Give back the LDI identifier along with the structure */
+		ldi_ident_release(li);
+		kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/*
+	 * allocate the first vnode
+	 */
+	zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+	/* Arbitrarily set the parent vnode to the mounted over directory */
+	lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+	/* Correctly set the fs for the root node */
+	lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+	vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+	vfsp->vfs_bsize = DEV_BSIZE;
+	vfsp->vfs_fstype = lxprocfstype;
+	vfsp->vfs_data = (caddr_t)lxpr_mnt;
+	vfsp->vfs_dev = lxprocdev;
+
+	mutex_exit(&lxpr_mount_lock);
+
+	return (0);
+}
+
+/*
+ * Unmount an lxproc file system.  Fails with EPERM without unmount privilege,
+ * ENOTSUP for a forced unmount and EBUSY while the root vnode's hold count
+ * exceeds one; otherwise releases everything lxpr_mount() set up.
+ */
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+	lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+	vnode_t *vp;
+	int count;
+
+	ASSERT(lxpr_mnt != NULL);
+	vp = LXPTOV(lxpr_mnt->lxprm_node);
+
+	mutex_enter(&lxpr_mount_lock);
+
+	/* must be root to unmount */
+	if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+		mutex_exit(&lxpr_mount_lock);
+		return (EPERM);
+	}
+
+	/* forced unmount is not supported by this file system */
+	if (flag & MS_FORCE) {
+		mutex_exit(&lxpr_mount_lock);
+		return (ENOTSUP);
+	}
+
+	/* Ensure that no vnodes are in use on this mount point. */
+	mutex_enter(&vp->v_lock);
+	count = vp->v_count;
+	mutex_exit(&vp->v_lock);
+	if (count > 1) {
+		mutex_exit(&lxpr_mount_lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * purge the dnlc cache for vnode entries
+	 * associated with this file system
+	 */
+	(void) dnlc_purge_vfsp(vfsp, 0);
+
+	/*
+	 * free up the lxprnode and everything else the mount was holding
+	 */
+	lxpr_freenode(lxpr_mnt->lxprm_node);
+	zone_rele(lxpr_mnt->lxprm_zone);
+	ldi_ident_release(lxpr_mnt->lxprm_li);
+	kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+	mutex_exit(&lxpr_mount_lock);
+
+	return (0);
+}
+
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node;
+	vnode_t *vp = LXPTOV(lxpnp);
+
+	VN_HOLD(vp);
+	*vpp = vp;
+	return (0);
+}
+
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+	int n;
+	dev32_t d32;
+	extern uint_t nproc;
+
+	n = v.v_proc - nproc;
+
+	bzero((caddr_t)sp, sizeof (*sp));
+	sp->f_bsize	= DEV_BSIZE;
+	sp->f_frsize	= DEV_BSIZE;
+	sp->f_blocks	= (fsblkcnt64_t)0;
+	sp->f_bfree	= (fsblkcnt64_t)0;
+	sp->f_bavail	= (fsblkcnt64_t)0;
+	sp->f_files	= (fsfilcnt64_t)v.v_proc + 2;
+	sp->f_ffree	= (fsfilcnt64_t)n;
+	sp->f_favail	= (fsfilcnt64_t)n;
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	sp->f_fsid	= d32;
+	/* It is guaranteed that vsw_name will fit in f_basetype */
+	(void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	sp->f_namemax = 64;		/* quite arbitrary */
+
+	(void) strcpy(sp->f_fstr, "lxproc");
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
new file mode 100644
index 0000000000..9c996891f3
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
@@ -0,0 +1,3099 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * lxproc -- a loosely Linux-compatible /proc
+ *
+ * We have -- confusingly -- two implementations of Linux /proc.  One is to
+ * support the LX brand with a Linux /proc entirely compatible with the Linux
+ * world view; the other -- this one -- is to support native (but Linux-borne)
+ * programs that wish to view the native system via the Linux /proc model.  So
+ * the aspiration here is to provide something that sufficiently approximates
+ * the Linux /proc implementation for purposes of offering some compatibility
+ * for simple Linux /proc readers (e.g., ps/top/htop).  However, it is not
+ * intended to exactly mimic Linux semantics; when choosing between offering
+ * compatibility and telling the truth, we emphatically pick the truth.  A
+ * particular glaring example of this is the Linux notion of "tasks" (that is,
+ * threads), which -- due to historical misadventures on Linux -- allocate their
+ * identifiers from the process identifier space.  (That is, each thread has in
+ * effect a pid.)  Some Linux /proc readers have come to depend on this
+ * attribute, and become confused when threads appear with proper identifiers,
+ * so we simply opt for the pre-2.6 behavior, and do not present the tasks
+ * directory at all.  Similarly, when choosing between offering compatibility
+ * and remaining consistent with our broader security model, we (obviously)
+ * choose security over compatibility.  In short, this is meant to be a best
+ * effort -- no more -- and as such, it should not be unified with the much
+ * more complete Linux /proc implementation found in the LX brand.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+
+#include "lxproc.h"
+
+extern pgcnt_t swapfs_minfree;
+extern time_t boot_time;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxpr_init() in lxpr_vfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+    caller_context_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+    caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+    pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+    pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+    caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+
+/*
+ * Simple conversion
+ */
+#define	btok(x)	((x) >> 10)			/* bytes to kbytes */
+#define	ptok(x)	((x) << (PAGESHIFT - 10))	/* pages to kbytes */
+
+/*
+ * The lxproc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = lxpr_open },
+	VOPNAME_CLOSE,		{ .vop_close = lxpr_close },
+	VOPNAME_READ,		{ .vop_read = lxpr_read },
+	VOPNAME_GETATTR,	{ .vop_getattr = lxpr_getattr },
+	VOPNAME_ACCESS,		{ .vop_access = lxpr_access },
+	VOPNAME_LOOKUP,		{ .vop_lookup = lxpr_lookup },
+	VOPNAME_READDIR,	{ .vop_readdir = lxpr_readdir },
+	VOPNAME_READLINK,	{ .vop_readlink = lxpr_readlink },
+	VOPNAME_FSYNC,		{ .error = lxpr_sync },
+	VOPNAME_SEEK,		{ .error = lxpr_sync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = lxpr_inactive },
+	VOPNAME_CMP,		{ .vop_cmp = lxpr_cmp },
+	VOPNAME_REALVP,		{ .vop_realvp = lxpr_realvp },
+	NULL,			NULL
+};
+
+/*
+ * file contents of an lxproc directory.
+ */
+static lxpr_dirent_t lxpr_dir[] = {
+	{ LXPR_CMDLINE,		"cmdline" },
+	{ LXPR_CPUINFO,		"cpuinfo" },
+	{ LXPR_DEVICES,		"devices" },
+	{ LXPR_DMA,		"dma" },
+	{ LXPR_FILESYSTEMS,	"filesystems" },
+	{ LXPR_INTERRUPTS,	"interrupts" },
+	{ LXPR_IOPORTS,		"ioports" },
+	{ LXPR_KCORE,		"kcore" },
+	{ LXPR_KMSG,		"kmsg" },
+	{ LXPR_LOADAVG,		"loadavg" },
+	{ LXPR_MEMINFO,		"meminfo" },
+	{ LXPR_MOUNTS,		"mounts" },
+	{ LXPR_NETDIR,		"net" },
+	{ LXPR_PARTITIONS,	"partitions" },
+	{ LXPR_SELF,		"self" },
+	{ LXPR_STAT,		"stat" },
+	{ LXPR_UPTIME,		"uptime" },
+	{ LXPR_VERSION,		"version" }
+};
+
+#define	PROCDIRFILES	(sizeof (lxpr_dir) / sizeof (lxpr_dir[0]))
+
+/*
+ * Contents of an /lxproc/<pid> directory.
+ */
+static lxpr_dirent_t piddir[] = {
+	{ LXPR_PID_CMDLINE,	"cmdline" },
+	{ LXPR_PID_CPU,		"cpu" },
+	{ LXPR_PID_CURDIR,	"cwd" },
+	{ LXPR_PID_ENV,		"environ" },
+	{ LXPR_PID_EXE,		"exe" },
+	{ LXPR_PID_MAPS,	"maps" },
+	{ LXPR_PID_MEM,		"mem" },
+	{ LXPR_PID_ROOTDIR,	"root" },
+	{ LXPR_PID_STAT,	"stat" },
+	{ LXPR_PID_STATM,	"statm" },
+	{ LXPR_PID_STATUS,	"status" },
+	{ LXPR_PID_FDDIR,	"fd" }
+};
+
+#define	PIDDIRFILES	(sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * contents of /lxproc/net directory
+ */
+static lxpr_dirent_t netdir[] = {
+	{ LXPR_NET_ARP,		"arp" },
+	{ LXPR_NET_DEV,		"dev" },
+	{ LXPR_NET_DEV_MCAST,	"dev_mcast" },
+	{ LXPR_NET_IGMP,	"igmp" },
+	{ LXPR_NET_IP_MR_CACHE,	"ip_mr_cache" },
+	{ LXPR_NET_IP_MR_VIF,	"ip_mr_vif" },
+	{ LXPR_NET_MCFILTER,	"mcfilter" },
+	{ LXPR_NET_NETSTAT,	"netstat" },
+	{ LXPR_NET_RAW,		"raw" },
+	{ LXPR_NET_ROUTE,	"route" },
+	{ LXPR_NET_RPC,		"rpc" },
+	{ LXPR_NET_RT_CACHE,	"rt_cache" },
+	{ LXPR_NET_SOCKSTAT,	"sockstat" },
+	{ LXPR_NET_SNMP,	"snmp" },
+	{ LXPR_NET_STAT,	"stat" },
+	{ LXPR_NET_TCP,		"tcp" },
+	{ LXPR_NET_UDP,		"udp" },
+	{ LXPR_NET_UNIX,	"unix" }
+};
+
+#define	NETDIRFILES	(sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * These are the major signal number differences between Linux and native:
+ *
+ *	====================================
+ *	| Number | Linux      | Native     |
+ *	| ====== | =========  | ========== |
+ *	|    7   | SIGBUS     | SIGEMT     |
+ *	|   10   | SIGUSR1    | SIGBUS     |
+ *	|   12   | SIGUSR2    | SIGSYS     |
+ *	|   16   | SIGSTKFLT  | SIGUSR1    |
+ *	|   17   | SIGCHLD    | SIGUSR2    |
+ *	|   18   | SIGCONT    | SIGCHLD    |
+ *	|   19   | SIGSTOP    | SIGPWR     |
+ *	|   20   | SIGTSTP    | SIGWINCH   |
+ *	|   21   | SIGTTIN    | SIGURG     |
+ *	|   22   | SIGTTOU    | SIGPOLL    |
+ *	|   23   | SIGURG     | SIGSTOP    |
+ *	|   24   | SIGXCPU    | SIGTSTP    |
+ *	|   25   | SIGXFSZ    | SIGCONT    |
+ *	|   26   | SIGVTALRM  | SIGTTIN    |
+ *	|   27   | SIGPROF    | SIGTTOU    |
+ *	|   28   | SIGWINCH   | SIGVTALRM  |
+ *	|   29   | SIGPOLL    | SIGPROF    |
+ *	|   30   | SIGPWR     | SIGXCPU    |
+ *	|   31   | SIGSYS     | SIGXFSZ    |
+ *	====================================
+ *
+ * Not every Linux signal maps to a native signal, nor does every native
+ * signal map to a Linux counterpart. However, when signals do map, the
+ * mapping is unique.
+ *
+ * lxpr_sigmap[] is indexed by native signal number and yields the matching
+ * Linux signal number.  Entry 0 is 0 and native-only signals are -1; the
+ * consumer in lxpr_read_pid_status() skips any value outside 1..LX_NSIG.
+ *
+ * NOTE(review): there are 74 initializers (indices 0-73) and the list steps
+ * from LX_SIGRTMIN + 30 directly to LX_SIGRTMAX.  Confirm that this matches
+ * NSIG and the size of the Linux real-time range; any entries left without
+ * an initializer are implicitly 0.
+ */
+static int
+lxpr_sigmap[NSIG] = {
+	0,
+	LX_SIGHUP,
+	LX_SIGINT,
+	LX_SIGQUIT,
+	LX_SIGILL,
+	LX_SIGTRAP,
+	LX_SIGABRT,
+	LX_SIGSTKFLT,
+	LX_SIGFPE,
+	LX_SIGKILL,
+	LX_SIGBUS,
+	LX_SIGSEGV,
+	LX_SIGSYS,
+	LX_SIGPIPE,
+	LX_SIGALRM,
+	LX_SIGTERM,
+	LX_SIGUSR1,
+	LX_SIGUSR2,
+	LX_SIGCHLD,
+	LX_SIGPWR,
+	LX_SIGWINCH,
+	LX_SIGURG,
+	LX_SIGPOLL,
+	LX_SIGSTOP,
+	LX_SIGTSTP,
+	LX_SIGCONT,
+	LX_SIGTTIN,
+	LX_SIGTTOU,
+	LX_SIGVTALRM,
+	LX_SIGPROF,
+	LX_SIGXCPU,
+	LX_SIGXFSZ,
+	-1,			/* 32:  illumos SIGWAITING */
+	-1,			/* 33:  illumos SIGLWP */
+	-1,			/* 34:  illumos SIGFREEZE */
+	-1,			/* 35:  illumos SIGTHAW */
+	-1,			/* 36:  illumos SIGCANCEL */
+	-1,			/* 37:  illumos SIGLOST */
+	-1,			/* 38:  illumos SIGXRES */
+	-1,			/* 39:  illumos SIGJVM1 */
+	-1,			/* 40:  illumos SIGJVM2 */
+	-1,			/* 41:  illumos SIGINFO */
+	LX_SIGRTMIN,		/* 42:  illumos _SIGRTMIN */
+	LX_SIGRTMIN + 1,
+	LX_SIGRTMIN + 2,
+	LX_SIGRTMIN + 3,
+	LX_SIGRTMIN + 4,
+	LX_SIGRTMIN + 5,
+	LX_SIGRTMIN + 6,
+	LX_SIGRTMIN + 7,
+	LX_SIGRTMIN + 8,
+	LX_SIGRTMIN + 9,
+	LX_SIGRTMIN + 10,
+	LX_SIGRTMIN + 11,
+	LX_SIGRTMIN + 12,
+	LX_SIGRTMIN + 13,
+	LX_SIGRTMIN + 14,
+	LX_SIGRTMIN + 15,
+	LX_SIGRTMIN + 16,
+	LX_SIGRTMIN + 17,
+	LX_SIGRTMIN + 18,
+	LX_SIGRTMIN + 19,
+	LX_SIGRTMIN + 20,
+	LX_SIGRTMIN + 21,
+	LX_SIGRTMIN + 22,
+	LX_SIGRTMIN + 23,
+	LX_SIGRTMIN + 24,
+	LX_SIGRTMIN + 25,
+	LX_SIGRTMIN + 26,
+	LX_SIGRTMIN + 27,
+	LX_SIGRTMIN + 28,
+	LX_SIGRTMIN + 29,
+	LX_SIGRTMIN + 30,
+	LX_SIGRTMAX
+};
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ *
+ * lxproc is read-only, so any open for writing fails with EROFS.  A node
+ * that is not backed by an underlying vnode (lxpr_realvp == NULL) needs no
+ * further work.  Otherwise the open is forwarded to the underlying vnode
+ * and, on success, *vpp is switched over to it.  The one exception is a
+ * /proc/<pid>/fd/<n> node whose underlying vnode is not a regular file: that
+ * open is refused with EACCES.
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t		*lxvp = *vpp;
+	lxpr_node_t	*node = VTOLXP(lxvp);
+	vnode_t		*realvp;
+	int		err;
+
+	/* Nothing in this file system may be opened for writing. */
+	if (flag & FWRITE)
+		return (EROFS);
+
+	/* Nodes with no underlying vnode open trivially. */
+	realvp = node->lxpr_realvp;
+	if (realvp == NULL)
+		return (0);
+
+	/* Through an fd entry, only regular files may be opened. */
+	if (node->lxpr_type == LXPR_PID_FD_FD && realvp->v_type != VREG)
+		return (EACCES);
+
+	/*
+	 * Take our own hold on the underlying vnode first, since VOP_OPEN()
+	 * may release it.
+	 */
+	VN_HOLD(realvp);
+	err = VOP_OPEN(&realvp, flag, cr, ct);
+	if (err != 0) {
+		VN_RELE(realvp);
+		return (err);
+	}
+
+	/* Hand the caller the underlying vnode and drop the lxproc one. */
+	*vpp = realvp;
+	VN_RELE(lxvp);
+
+	return (0);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ *
+ * Closing an lxproc node requires no work, so this always returns 0.
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_nodetype_t	nodetype = VTOLXP(vp)->lxpr_type;
+
+	/*
+	 * The node types checked here are closed through their realvp, so
+	 * none of them should ever reach this routine.
+	 */
+	ASSERT(nodetype != LXPR_PID_FD_FD);
+	ASSERT(nodetype != LXPR_PID_CURDIR);
+	ASSERT(nodetype != LXPR_PID_ROOTDIR);
+	ASSERT(nodetype != LXPR_PID_EXE);
+
+	return (0);
+}
+
+/*
+ * Array of read functions, indexed by /lxproc file type.
+ *
+ * lxpr_read() dispatches through this table using the node's lxpr_type, so
+ * the rows are presumably in lxpr_nodetype_t order (the comment on each row
+ * names the file it serves).  /proc/kmsg is handled directly in lxpr_read()
+ * and never reaches its slot here from that path.
+ */
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+	lxpr_read_isdir,		/* /proc		*/
+	lxpr_read_isdir,		/* /proc/<pid>		*/
+	lxpr_read_pid_cmdline,		/* /proc/<pid>/cmdline	*/
+	lxpr_read_empty,		/* /proc/<pid>/cpu	*/
+	lxpr_read_invalid,		/* /proc/<pid>/cwd	*/
+	lxpr_read_empty,		/* /proc/<pid>/environ	*/
+	lxpr_read_invalid,		/* /proc/<pid>/exe	*/
+	lxpr_read_pid_maps,		/* /proc/<pid>/maps	*/
+	lxpr_read_empty,		/* /proc/<pid>/mem	*/
+	lxpr_read_invalid,		/* /proc/<pid>/root	*/
+	lxpr_read_pid_stat,		/* /proc/<pid>/stat	*/
+	lxpr_read_pid_statm,		/* /proc/<pid>/statm	*/
+	lxpr_read_pid_status,		/* /proc/<pid>/status	*/
+	lxpr_read_isdir,		/* /proc/<pid>/fd	*/
+	lxpr_read_fd,			/* /proc/<pid>/fd/nn	*/
+	lxpr_read_empty,		/* /proc/cmdline	*/
+	lxpr_read_cpuinfo,		/* /proc/cpuinfo	*/
+	lxpr_read_empty,		/* /proc/devices	*/
+	lxpr_read_empty,		/* /proc/dma		*/
+	lxpr_read_empty,		/* /proc/filesystems	*/
+	lxpr_read_empty,		/* /proc/interrupts	*/
+	lxpr_read_empty,		/* /proc/ioports	*/
+	lxpr_read_empty,		/* /proc/kcore		*/
+	lxpr_read_invalid,		/* /proc/kmsg -- see lxpr_read() */
+	lxpr_read_loadavg,		/* /proc/loadavg	*/
+	lxpr_read_meminfo,		/* /proc/meminfo	*/
+	lxpr_read_mounts,		/* /proc/mounts		*/
+	lxpr_read_isdir,		/* /proc/net		*/
+	lxpr_read_net_arp,		/* /proc/net/arp	*/
+	lxpr_read_net_dev,		/* /proc/net/dev	*/
+	lxpr_read_net_dev_mcast,	/* /proc/net/dev_mcast	*/
+	lxpr_read_net_igmp,		/* /proc/net/igmp	*/
+	lxpr_read_net_ip_mr_cache,	/* /proc/net/ip_mr_cache */
+	lxpr_read_net_ip_mr_vif,	/* /proc/net/ip_mr_vif	*/
+	lxpr_read_net_mcfilter,		/* /proc/net/mcfilter	*/
+	lxpr_read_net_netstat,		/* /proc/net/netstat	*/
+	lxpr_read_net_raw,		/* /proc/net/raw	*/
+	lxpr_read_net_route,		/* /proc/net/route	*/
+	lxpr_read_net_rpc,		/* /proc/net/rpc	*/
+	lxpr_read_net_rt_cache,		/* /proc/net/rt_cache	*/
+	lxpr_read_net_sockstat,		/* /proc/net/sockstat	*/
+	lxpr_read_net_snmp,		/* /proc/net/snmp	*/
+	lxpr_read_net_stat,		/* /proc/net/stat	*/
+	lxpr_read_net_tcp,		/* /proc/net/tcp	*/
+	lxpr_read_net_udp,		/* /proc/net/udp	*/
+	lxpr_read_net_unix,		/* /proc/net/unix	*/
+	lxpr_read_partitions,		/* /proc/partitions	*/
+	lxpr_read_invalid,		/* /proc/self		*/
+	lxpr_read_stat,			/* /proc/stat		*/
+	lxpr_read_uptime,		/* /proc/uptime		*/
+	lxpr_read_version,		/* /proc/version	*/
+};
+
+/*
+ * Array of lookup functions, indexed by /lxproc file type.
+ *
+ * Only the four directory rows (/proc, /proc/<pid>, /proc/<pid>/fd and
+ * /proc/net) have a real lookup routine; every other row is
+ * lxpr_lookup_not_a_dir.  The comment on each row names the file it serves.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+	lxpr_lookup_procdir,		/* /proc		*/
+	lxpr_lookup_piddir,		/* /proc/<pid>		*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cmdline	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cpu	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cwd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/environ	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/exe	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/maps	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/mem	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/root	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/stat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/statm	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/status	*/
+	lxpr_lookup_fddir,		/* /proc/<pid>/fd	*/
+	lxpr_lookup_not_a_dir,		/* /proc/<pid>/fd/nn	*/
+	lxpr_lookup_not_a_dir,		/* /proc/cmdline	*/
+	lxpr_lookup_not_a_dir,		/* /proc/cpuinfo	*/
+	lxpr_lookup_not_a_dir,		/* /proc/devices	*/
+	lxpr_lookup_not_a_dir,		/* /proc/dma		*/
+	lxpr_lookup_not_a_dir,		/* /proc/filesystems	*/
+	lxpr_lookup_not_a_dir,		/* /proc/interrupts	*/
+	lxpr_lookup_not_a_dir,		/* /proc/ioports	*/
+	lxpr_lookup_not_a_dir,		/* /proc/kcore		*/
+	lxpr_lookup_not_a_dir,		/* /proc/kmsg		*/
+	lxpr_lookup_not_a_dir,		/* /proc/loadavg	*/
+	lxpr_lookup_not_a_dir,		/* /proc/meminfo	*/
+	lxpr_lookup_not_a_dir,		/* /proc/mounts		*/
+	lxpr_lookup_netdir,		/* /proc/net		*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/arp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/dev	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/dev_mcast	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/igmp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_cache */
+	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_vif	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/mcfilter	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/netstat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/raw	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/route	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/rpc	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/rt_cache	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/sockstat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/snmp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/stat	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/tcp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/udp	*/
+	lxpr_lookup_not_a_dir,		/* /proc/net/unix	*/
+	lxpr_lookup_not_a_dir,		/* /proc/partitions	*/
+	lxpr_lookup_not_a_dir,		/* /proc/self		*/
+	lxpr_lookup_not_a_dir,		/* /proc/stat		*/
+	lxpr_lookup_not_a_dir,		/* /proc/uptime		*/
+	lxpr_lookup_not_a_dir,		/* /proc/version	*/
+};
+
+/*
+ * Array of readdir functions, indexed by /lxproc file type.
+ *
+ * Only the four directory rows (/proc, /proc/<pid>, /proc/<pid>/fd and
+ * /proc/net) have a real readdir routine; every other row is
+ * lxpr_readdir_not_a_dir.  The comment on each row names the file it serves.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+	lxpr_readdir_procdir,		/* /proc		*/
+	lxpr_readdir_piddir,		/* /proc/<pid>		*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cmdline	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cpu	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cwd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/environ	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/exe	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/maps	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/mem	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/root	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/stat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/statm	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/status	*/
+	lxpr_readdir_fddir,		/* /proc/<pid>/fd	*/
+	lxpr_readdir_not_a_dir,		/* /proc/<pid>/fd/nn	*/
+	lxpr_readdir_not_a_dir,		/* /proc/cmdline	*/
+	lxpr_readdir_not_a_dir,		/* /proc/cpuinfo	*/
+	lxpr_readdir_not_a_dir,		/* /proc/devices	*/
+	lxpr_readdir_not_a_dir,		/* /proc/dma		*/
+	lxpr_readdir_not_a_dir,		/* /proc/filesystems	*/
+	lxpr_readdir_not_a_dir,		/* /proc/interrupts	*/
+	lxpr_readdir_not_a_dir,		/* /proc/ioports	*/
+	lxpr_readdir_not_a_dir,		/* /proc/kcore		*/
+	lxpr_readdir_not_a_dir,		/* /proc/kmsg		*/
+	lxpr_readdir_not_a_dir,		/* /proc/loadavg	*/
+	lxpr_readdir_not_a_dir,		/* /proc/meminfo	*/
+	lxpr_readdir_not_a_dir,		/* /proc/mounts		*/
+	lxpr_readdir_netdir,		/* /proc/net		*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/arp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev_mcast	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/igmp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_cache */
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_vif	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/mcfilter	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/netstat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/raw	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/route	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/rpc	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/rt_cache	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/sockstat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/snmp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/stat	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/tcp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/udp	*/
+	lxpr_readdir_not_a_dir,		/* /proc/net/unix	*/
+	lxpr_readdir_not_a_dir,		/* /proc/partitions	*/
+	lxpr_readdir_not_a_dir,		/* /proc/self		*/
+	lxpr_readdir_not_a_dir,		/* /proc/stat		*/
+	lxpr_readdir_not_a_dir,		/* /proc/uptime		*/
+	lxpr_readdir_not_a_dir,		/* /proc/version	*/
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * As the format of all the files that can be read in lxproc is human readable
+ * and not binary structures there do not have to be different read variants
+ * depending on whether the reading process model is 32- or 64-bit.
+ *
+ * A uiobuf is allocated around the caller's uio, filled by the reader for
+ * this node type, flushed and then freed.  /proc/kmsg is special-cased: it
+ * opens /dev/log through the layered driver interface, asks for console
+ * messages with I_CONSLOG and lets lxpr_read_kmsg() pull one message.
+ *
+ * Returns 0 on success, or the first error from the LDI calls or from
+ * lxpr_uiobuf_flush().  Every exit path frees the uiobuf, and the LDI
+ * handle is closed whenever it was successfully opened.
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+	int error;
+
+	ASSERT(type < LXPR_NFILES);
+
+	if (type == LXPR_KMSG) {
+		ldi_ident_t	li = VTOLXPM(vp)->lxprm_li;
+		ldi_handle_t	ldih;
+		struct strioctl	str;
+		int		rv;
+
+		/*
+		 * Open the zone's console device using the layered driver
+		 * interface.
+		 */
+		if ((error =
+		    ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0)
+			goto out;
+
+		/*
+		 * Send an ioctl to the underlying console device, letting it
+		 * know we're interested in getting console messages.
+		 */
+		str.ic_cmd = I_CONSLOG;
+		str.ic_timout = 0;
+		str.ic_len = 0;
+		str.ic_dp = NULL;
+		if ((error = ldi_ioctl(ldih, I_STR,
+		    (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) {
+			/* Don't leak the handle; report the ioctl error. */
+			(void) ldi_close(ldih, FREAD, cr);
+			goto out;
+		}
+
+		lxpr_read_kmsg(lxpnp, uiobuf, ldih);
+
+		if ((error = ldi_close(ldih, FREAD, cr)) != 0)
+			goto out;
+	} else {
+		lxpr_read_function[type](lxpnp, uiobuf);
+	}
+
+	error = lxpr_uiobuf_flush(uiobuf);
+out:
+	/* The uiobuf is ours to release on success and failure alike. */
+	lxpr_uiobuf_free(uiobuf);
+
+	return (error);
+}
+
+/*
+ * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty()
+ *
+ * Various special case reads:
+ * - trying to read a directory: lxpr_read_isdir() sets EISDIR on the uiobuf
+ * - invalid file (used to mean a file that should be implemented,
+ *   but isn't yet): lxpr_read_invalid() sets EINVAL on the uiobuf
+ * - empty file: lxpr_read_empty() writes nothing and sets no error
+ * - wait to be able to read a file that will never have anything to read
+ *   (NOTE(review): none of the three functions that follow does this;
+ *   confirm whether this bullet is stale)
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_cmdline():
+ *
+ * Linux separates the argv entries in cmdline with \0.  Reproducing that
+ * would mean copying the real argv out of the target process's context,
+ * which is too difficult to attempt here.  Instead the whole command line is
+ * presented as if it were argv[0]: u_psargs when the process has an argv,
+ * u_comm otherwise, written with its terminating NUL.  That is enough for ps
+ * and htop to display correctly, but other consumers may be confused by it.
+ *
+ * EINVAL is set on the uiobuf if the process cannot be locked.
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *procp;
+	user_t *up;
+	char *args;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE);
+
+	if ((procp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	up = PTOU(procp);
+	if (up->u_argv != 0)
+		args = up->u_psargs;
+	else
+		args = up->u_comm;
+
+	/* The "+ 1" carries the terminating NUL out with the string. */
+	lxpr_uiobuf_write(uiobuf, args, strlen(args) + 1);
+	lxpr_unlock(procp);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ *
+ * Writes one line per segment of the target's address space:
+ *
+ *	start-end prot offset major:minor inode [path]
+ *
+ * The work is done in two passes.  With the address space read-locked, the
+ * data needed for each segment is copied onto a private list (taking a hold
+ * on any regular-file vnode backing a segvn segment).  After the locks are
+ * dropped, the list is walked to look up attributes and path names, print
+ * each line and free the entry.
+ *
+ * Addresses are printed at full pointer width and the offset at full
+ * u_offset_t width, so mappings above 4GB are not truncated.
+ *
+ * EINVAL is set on the uiobuf if the process cannot be locked; a process
+ * running in kas produces no output.
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	struct as *as;
+	struct seg *seg;
+	char *buf;
+	int buflen = MAXPATHLEN;
+	struct print_data {
+		caddr_t saddr;
+		caddr_t eaddr;
+		int type;
+		char prot[5];
+		u_offset_t offset;
+		vnode_t *vp;
+		struct print_data *next;
+	} *print_head = NULL;
+	struct print_data **print_tail = &print_head;
+	struct print_data *pbuf;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS);
+
+	p = lxpr_lock(lxpnp->lxpr_pid);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	as = p->p_as;
+
+	if (as == &kas) {
+		lxpr_unlock(p);
+		return;
+	}
+
+	/* p_lock is not held while the address space lock is taken. */
+	mutex_exit(&p->p_lock);
+
+	/* Iterate over all segments in the address space */
+	AS_LOCK_ENTER(as, RW_READER);
+	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+		vnode_t *vp;
+		uint_t protbits;
+
+		pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+		pbuf->saddr = seg->s_base;
+		pbuf->eaddr = seg->s_base+seg->s_size;
+		pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+		/*
+		 * Cheat and only use the protection bits of the first page
+		 * in the segment
+		 */
+		(void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+		(void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+		if (protbits & PROT_READ)	   pbuf->prot[0] = 'r';
+		if (protbits & PROT_WRITE)	   pbuf->prot[1] = 'w';
+		if (protbits & PROT_EXEC)	   pbuf->prot[2] = 'x';
+		if (pbuf->type & MAP_SHARED)	   pbuf->prot[3] = 's';
+		else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+		/* Hold the vnode so it outlives the address space lock. */
+		if (seg->s_ops == &segvn_ops &&
+		    SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+		    vp != NULL && vp->v_type == VREG) {
+			VN_HOLD(vp);
+			pbuf->vp = vp;
+		} else {
+			pbuf->vp = NULL;
+		}
+
+		pbuf->offset = SEGOP_GETOFFSET(seg, pbuf->saddr);
+
+		pbuf->next = NULL;
+		*print_tail = pbuf;
+		print_tail = &pbuf->next;
+	}
+	AS_LOCK_EXIT(as);
+	mutex_enter(&p->p_lock);
+	lxpr_unlock(p);
+
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
+	/* print the data we've extracted */
+	pbuf = print_head;
+	while (pbuf != NULL) {
+		struct print_data *pbuf_next;
+		vattr_t vattr;
+
+		int maj = 0;
+		int min = 0;
+		u_longlong_t inode = 0;
+		ulong_t lo = (uintptr_t)pbuf->saddr;
+		ulong_t hi = (uintptr_t)pbuf->eaddr;
+
+		*buf = '\0';
+		if (pbuf->vp != NULL) {
+			vattr.va_mask = AT_FSID | AT_NODEID;
+			if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+			    NULL) == 0) {
+				maj = getmajor(vattr.va_fsid);
+				min = getminor(vattr.va_fsid);
+				inode = vattr.va_nodeid;
+			}
+			(void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+			VN_RELE(pbuf->vp);
+		}
+
+		/* The path column is only printed when one was found. */
+		if (*buf != '\0') {
+			lxpr_uiobuf_printf(uiobuf,
+			    "%08lx-%08lx %s %08llx %02d:%03d %llu %s\n",
+			    lo, hi, pbuf->prot, (u_longlong_t)pbuf->offset,
+			    maj, min, inode, buf);
+		} else {
+			lxpr_uiobuf_printf(uiobuf,
+			    "%08lx-%08lx %s %08llx %02d:%03d %llu\n",
+			    lo, hi, pbuf->prot, (u_longlong_t)pbuf->offset,
+			    maj, min, inode);
+		}
+
+		pbuf_next = pbuf->next;
+		kmem_free(pbuf, sizeof (*pbuf));
+		pbuf = pbuf_next;
+	}
+
+	kmem_free(buf, buflen);
+}
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ *
+ * Writes seven numbers on one line.  The first is the reserved virtual size
+ * converted to pages, the second and fourth are the resident set size as
+ * returned by rm_asrss(), and the remaining fields are always zero.
+ *
+ * EINVAL is set on the uiobuf if the process cannot be locked.
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *procp;
+	struct as *asp;
+	size_t virt_pages, res_pages;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM);
+
+	if ((procp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	asp = procp->p_as;
+
+	/*
+	 * Sample the sizes under the address space lock; p_lock is released
+	 * for the duration and retaken before the process is unlocked.
+	 */
+	mutex_exit(&procp->p_lock);
+	AS_LOCK_ENTER(asp, RW_READER);
+	virt_pages = btopr(asp->a_resvsize);
+	res_pages = rm_asrss(asp);
+	AS_LOCK_EXIT(asp);
+	mutex_enter(&procp->p_lock);
+
+	lxpr_unlock(procp);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu %lu %lu %lu %lu %lu %lu\n",
+	    virt_pages, res_pages, 0l, res_pages, 0l, 0l, 0l);
+}
+
+/*
+ * lxpr_read_pid_status(): status file
+ *
+ * Writes the "Name:" through "CapEff:" report for the process named by
+ * lxpnp->lxpr_pid, or sets EINVAL on the uiobuf if lxpr_lock() cannot find
+ * it.  The zone's init process is reported as pid 1 with ppid 0, and a
+ * process whose parent is zone init is reported with ppid 1.  The Vm* lines
+ * are omitted for zombies, system processes and anything running in kas.
+ * Pending, ignored and caught signals are translated to Linux numbering via
+ * lxpr_sigmap[]; the blocked mask and the capability sets are always
+ * printed as zero.
+ *
+ * NOTE(review): p_fno_ctl is printed with %d; confirm that matches the
+ * width of its declared type.
+ */
+static void
+lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	kthread_t *t;
+	user_t *up;
+	cred_t *cr;
+	const gid_t *groups;
+	int    ngroups;
+	struct as *as;
+	char *status;
+	pid_t pid, ppid;
+	size_t vsize;
+	size_t rss;
+	k_sigset_t current, ignore, handle;
+	int    i, lx_sig;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS);
+
+	p = lxpr_lock(lxpnp->lxpr_pid);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	pid = p->p_pid;
+
+	/*
+	 * Convert pid to the Linux default of 1 if we're the zone's init
+	 * process
+	 */
+	if (pid == curproc->p_zone->zone_proc_initpid) {
+		pid = 1;
+		ppid = 0;	/* parent pid for init is 0 */
+	} else {
+		/*
+		 * Make sure not to reference parent PIDs that reside outside
+		 * the zone
+		 */
+		ppid = ((p->p_flag & SZONETOP)
+		    ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+		/*
+		 * Convert ppid to the Linux default of 1 if our parent is the
+		 * zone's init process
+		 */
+		if (ppid == curproc->p_zone->zone_proc_initpid)
+			ppid = 1;
+	}
+
+	t = prchoose(p);
+	if (t != NULL) {
+		switch (t->t_state) {
+		case TS_SLEEP:
+			status = "S (sleeping)";
+			break;
+		case TS_RUN:
+		case TS_ONPROC:
+			status = "R (running)";
+			break;
+		case TS_ZOMB:
+			status = "Z (zombie)";
+			break;
+		case TS_STOPPED:
+			status = "T (stopped)";
+			break;
+		default:
+			status = "! (unknown)";
+			break;
+		}
+		thread_unlock(t);
+	} else {
+		/*
+		 * there is a hole in the exit code, where a proc can have
+		 * no threads but it is yet to be flagged SZOMB. We will
+		 * assume we are about to become a zombie
+		 */
+		status = "Z (zombie)";
+	}
+
+	up = PTOU(p);
+	mutex_enter(&p->p_crlock);
+	crhold(cr = p->p_cred);	/* held until the crfree() below */
+	mutex_exit(&p->p_crlock);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "Name:\t%s\n"
+	    "State:\t%s\n"
+	    "Tgid:\t%d\n"
+	    "Pid:\t%d\n"
+	    "PPid:\t%d\n"
+	    "TracerPid:\t%d\n"
+	    "Uid:\t%u\t%u\t%u\t%u\n"
+	    "Gid:\t%u\t%u\t%u\t%u\n"
+	    "FDSize:\t%d\n"
+	    "Groups:\t",
+	    up->u_comm,
+	    status,
+	    pid, /* thread group id - same as pid */
+	    pid,
+	    ppid,
+	    0, /* TracerPid is always reported as 0 */
+	    crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+	    crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+	    p->p_fno_ctl);
+
+	ngroups = crgetngroups(cr);
+	groups  = crgetgroups(cr);
+	for (i = 0; i < ngroups; i++) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "%u ",
+		    groups[i]);
+	}
+	crfree(cr);
+
+	as = p->p_as;
+	if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) {
+		mutex_exit(&p->p_lock);	/* not held across the AS lock */
+		AS_LOCK_ENTER(as, RW_READER);
+		vsize = as->a_resvsize;
+		rss = rm_asrss(as);
+		AS_LOCK_EXIT(as);
+		mutex_enter(&p->p_lock);
+
+		lxpr_uiobuf_printf(uiobuf,
+		    "\n"
+		    "VmSize:\t%8lu kB\n"
+		    "VmLck:\t%8lu kB\n"
+		    "VmRSS:\t%8lu kB\n"
+		    "VmData:\t%8lu kB\n"
+		    "VmStk:\t%8lu kB\n"
+		    "VmExe:\t%8lu kB\n"
+		    "VmLib:\t%8lu kB",
+		    btok(vsize),
+		    0l,
+		    ptok(rss),
+		    0l,
+		    btok(p->p_stksize),
+		    ptok(rss),
+		    0l);
+	}
+
+	sigemptyset(&current);
+	sigemptyset(&ignore);
+	sigemptyset(&handle);
+
+	for (i = 1; i < NSIG; i++) {
+		lx_sig = lxpr_sigmap[i];
+
+		if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) {
+			if (sigismember(&p->p_sig, i))
+				sigaddset(&current, lx_sig);
+
+			if (up->u_signal[i - 1] == SIG_IGN)
+				sigaddset(&ignore, lx_sig);
+			else if (up->u_signal[i - 1] != SIG_DFL)
+				sigaddset(&handle, lx_sig);
+		}
+	}
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "\n"
+	    "SigPnd:\t%08x%08x\n"
+	    "SigBlk:\t%08x%08x\n"
+	    "SigIgn:\t%08x%08x\n"
+	    "SigCgt:\t%08x%08x\n"
+	    "CapInh:\t%016x\n"
+	    "CapPrm:\t%016x\n"
+	    "CapEff:\t%016x\n",
+	    current.__sigbits[1], current.__sigbits[0],
+	    0, 0, /* signals blocked on per thread basis */
+	    ignore.__sigbits[1], ignore.__sigbits[0],
+	    handle.__sigbits[1], handle.__sigbits[0],
+	    /* Can't do anything with linux capabilities */
+	    0,
+	    0,
+	    0);
+
+	lxpr_unlock(p);
+}
+
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ *
+ * Writes the single-line, space-separated record for the process named by
+ * lxpnp->lxpr_pid, or sets EINVAL on the uiobuf if lxpr_lock() cannot find
+ * it.  Zone init is presented with fixed Linux-style identifiers (pid 1,
+ * ppid 0, and so on); for every other process the session data is read
+ * under p_splock and the session's s_lock.  Thread state, priority, nice
+ * value, wait channel and CPU come from the thread returned by prchoose();
+ * a process with no such thread is reported as a zombie.  Fields that this
+ * implementation does not supply are written as zero.
+ *
+ * NOTE(review): some arguments are passed with conversions whose width may
+ * not match their declared types (psdev with %d, USRSTACK with %u, wchan
+ * with %lu); confirm against the type definitions.
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *p;
+	kthread_t *t;
+	struct as *as;
+	char stat;
+	pid_t pid, ppid, pgpid, spid;
+	gid_t psgid;
+	dev_t psdev;
+	size_t rss, vsize;
+	int nice, pri;
+	caddr_t wchan;
+	processorid_t cpu;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+	p = lxpr_lock(lxpnp->lxpr_pid);
+	if (p == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, EINVAL);
+		return;
+	}
+
+	pid = p->p_pid;
+
+	/*
+	 * Set Linux defaults if we're the zone's init process
+	 */
+	if (pid == curproc->p_zone->zone_proc_initpid) {
+		pid = 1;		/* PID for init */
+		ppid = 0;		/* parent PID for init is 0 */
+		pgpid = 0;		/* process group for init is 0 */
+		psgid = (gid_t)-1;	/* credential GID for init is -1 */
+		spid = 0;		/* session id for init is 0 */
+		psdev = 0;		/* session device for init is 0 */
+	} else {
+		/*
+		 * Make sure not to reference parent PIDs that reside outside
+		 * the zone
+		 */
+		ppid = ((p->p_flag & SZONETOP) ?
+		    curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+		/*
+		 * Convert ppid to the Linux default of 1 if our parent is the
+		 * zone's init process
+		 */
+		if (ppid == curproc->p_zone->zone_proc_initpid)
+			ppid = 1;
+
+		pgpid = p->p_pgrp;
+
+		mutex_enter(&p->p_splock);
+		mutex_enter(&p->p_sessp->s_lock);
+		spid = p->p_sessp->s_sid;
+		psdev = p->p_sessp->s_dev;
+		if (p->p_sessp->s_cred)
+			psgid = crgetgid(p->p_sessp->s_cred);
+		else
+			psgid = crgetgid(p->p_cred);
+
+		mutex_exit(&p->p_sessp->s_lock);
+		mutex_exit(&p->p_splock);
+	}
+
+	t = prchoose(p);
+	if (t != NULL) {
+		switch (t->t_state) {
+		case TS_SLEEP:
+			stat = 'S'; break;
+		case TS_RUN:
+		case TS_ONPROC:
+			stat = 'R'; break;
+		case TS_ZOMB:
+			stat = 'Z'; break;
+		case TS_STOPPED:
+			stat = 'T'; break;
+		default:
+			stat = '!'; break;
+		}
+
+		if (CL_DONICE(t, NULL, 0, &nice) != 0)
+			nice = 0;
+
+		pri = t->t_pri;
+		wchan = t->t_wchan;
+		cpu = t->t_cpu->cpu_id;
+		thread_unlock(t);
+	} else {
+		/* Only zombies have no threads */
+		stat = 'Z';
+		nice = 0;
+		pri = 0;
+		wchan = 0;
+		cpu = 0;
+	}
+	as = p->p_as;
+	mutex_exit(&p->p_lock);	/* not held across the AS lock */
+	AS_LOCK_ENTER(as, RW_READER);
+	vsize = as->a_resvsize;
+	rss = rm_asrss(as);
+	AS_LOCK_EXIT(as);
+	mutex_enter(&p->p_lock);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%d (%s) %c %d %d %d %d %d "
+	    "%lu %lu %lu %lu %lu "
+	    "%lu %lu %ld %ld "
+	    "%d %d %d "
+	    "%lu "
+	    "%lu "
+	    "%lu %ld %llu "
+	    "%lu %lu %u "
+	    "%lu %lu "
+	    "%lu %lu %lu %lu "
+	    "%lu "
+	    "%lu %lu "
+	    "%d "
+	    "%d"
+	    "\n",
+	    pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid,
+	    0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+	    p->p_utime, p->p_stime, p->p_cutime, p->p_cstime,
+	    pri, nice, p->p_lwpcnt,
+	    0l, /* itrealvalue (time before next SIGALRM) */
+	    PTOU(p)->u_ticks,
+	    vsize, rss, p->p_vmem_ctl,
+	    0l, 0l, USRSTACK, /* startcode, endcode, startstack */
+	    0l, 0l, /* kstkesp, kstkeip */
+	    0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */
+	    wchan,
+	    0l, 0l, /* nswap, cnswap */
+	    0, /* exit_signal */
+	    cpu);
+
+	lxpr_unlock(p);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/* Nothing is written, so /proc/net/arp reads as an empty file. */
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_printf(uiobuf, "Inter-|   Receive                   "
+	    "                             |  Transmit\n");
+	lxpr_uiobuf_printf(uiobuf, " face |bytes    packets errs drop fifo"
+	    " frame compressed multicast|bytes    packets errs drop fifo"
+	    " colls carrier compressed\n");
+
+	/*
+	 * Only the two header lines above are emitted; no interface rows
+	 * follow them.
+	 *
+	 * Data about each interface should go here, but that shouldn't be added
+	 * unless there is an lxproc reader that actually makes use of it (and
+	 * doesn't need anything else that we refuse to provide)...
+	 */
+}
+
+/*
+ * The /proc/net readers from lxpr_read_net_dev_mcast() through
+ * lxpr_read_net_unix() all have empty bodies: they write nothing to the
+ * uiobuf and set no error on it.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced.
+ *
+ * The message text is taken from the block chained behind the first one
+ * (mp->b_cont) and is prefixed with LX_KMSG_PRI.  A message with no such
+ * block, or a failed ldi_getmsg(), produces no output.
+ */
+
+#define	LX_KMSG_PRI	"<0>"
+
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh)
+{
+	mblk_t		*mp;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_KMSG);
+
+	/* A NULL timeout is passed, so this waits for the next message. */
+	if (ldi_getmsg(lh, &mp, NULL) != 0)
+		return;
+
+	/* Guard against a message that carries no continuation block. */
+	if (mp->b_cont != NULL) {
+		/*
+		 * lxproc doesn't like successive reads to the same file
+		 * descriptor unless we do an explicit rewind each time.
+		 */
+		lxpr_uiobuf_seek(uiobuf, 0);
+
+		lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+		    mp->b_cont->b_rptr);
+	}
+
+	freemsg(mp);
+}
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file.  We do just
+ * enough for uptime and other simple lxproc readers to work
+ *
+ * The line written is
+ *
+ *	<1min> <5min> <15min> <runnable>/<lwps> 0
+ *
+ * where each average is split into an integer part and two decimal digits
+ * from its FSHIFT fixed-point value, and the final field is always 0.
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ulong_t avenrun1;
+	ulong_t avenrun5;
+	ulong_t avenrun15;
+	ulong_t avenrun1_cs;
+	ulong_t avenrun5_cs;
+	ulong_t avenrun15_cs;
+	/* Zeroed so a failed cpupart_get_loadavg() reports 0.00, not junk. */
+	int loadavg[3] = { 0, 0, 0 };
+	int *loadbuf;
+	cpupart_t *cp;
+	zone_t *zone = LXPTOZ(lxpnp);
+
+	uint_t nrunnable = 0;
+	rctl_qty_t nlwps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * Need to add up values over all CPU partitions. If pools are active,
+	 * only report the values of the zone's partition, which by definition
+	 * includes the current CPU.
+	 */
+	if (pool_pset_enabled()) {
+		psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+		ASSERT(curproc->p_zone != &zone0);
+		cp = CPU->cpu_part;
+
+		nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+		(void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+		loadbuf = &loadavg[0];
+	} else {
+		cp = cp_list_head;
+		do {
+			nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+		} while ((cp = cp->cp_next) != cp_list_head);
+
+		loadbuf = zone == global_zone ?
+		    &avenrun[0] : zone->zone_avenrun;
+	}
+
+	/*
+	 * If we're in the non-global zone, we'll report the total number of
+	 * LWPs in the zone for the "nproc" parameter of /proc/loadavg,
+	 * otherwise will just use nthread (which will include kernel threads,
+	 * but should be good enough for lxproc).
+	 */
+	nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+	mutex_exit(&cpu_lock);
+
+	avenrun1 = loadbuf[0] >> FSHIFT;
+	avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+	avenrun5 = loadbuf[1] >> FSHIFT;
+	avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+	avenrun15 = loadbuf[2] >> FSHIFT;
+	avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
+
+	/*
+	 * Each conversion matches its argument's type: ulong_t for the
+	 * averages, uint_t for nrunnable, and an explicitly widened nlwps.
+	 */
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu.%02lu %lu.%02lu %lu.%02lu %u/%llu %d\n",
+	    avenrun1, avenrun1_cs,
+	    avenrun5, avenrun5_cs,
+	    avenrun15, avenrun15_cs,
+	    nrunnable, (u_longlong_t)nlwps, 0);
+}
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ *
+ * For the global zone, or a zone whose limit is UINT64_MAX, the
+ * system-wide figures are used (physmem/freemem, k_anoninfo).  Otherwise
+ * the zone's own limit is reported as the total and the zone's usage is
+ * subtracted from it.  Usage can exceed the limit, so the derived "free"
+ * values are clamped at zero rather than being printed as huge unsigned
+ * numbers.  Every field not derived from these four values is written as
+ * zero.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	int global = zone == global_zone;
+	long total_mem, free_mem, total_swap, used_swap, free_swap;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+
+	if (global || zone->zone_phys_mem_ctl == UINT64_MAX) {
+		total_mem = physmem * PAGESIZE;
+		free_mem = freemem * PAGESIZE;
+	} else {
+		total_mem = zone->zone_phys_mem_ctl;
+		free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
+	}
+
+	/* Usage above the limit must not show up as negative free memory. */
+	if (free_mem < 0)
+		free_mem = 0;
+
+	if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
+		total_swap = k_anoninfo.ani_max * PAGESIZE;
+		used_swap = k_anoninfo.ani_phys_resv * PAGESIZE;
+	} else {
+		mutex_enter(&zone->zone_mem_lock);
+		total_swap = zone->zone_max_swap_ctl;
+		used_swap = zone->zone_max_swap;
+		mutex_exit(&zone->zone_mem_lock);
+	}
+
+	/* Likewise for swap. */
+	free_swap = total_swap - used_swap;
+	if (free_swap < 0)
+		free_swap = 0;
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "        total:     used:    free:  shared: buffers:  cached:\n"
+	    "Mem:  %8lu %8lu %8lu %8u %8u %8u\n"
+	    "Swap: %8lu %8lu %8lu\n"
+	    "MemTotal:  %8lu kB\n"
+	    "MemFree:   %8lu kB\n"
+	    "MemShared: %8u kB\n"
+	    "Buffers:   %8u kB\n"
+	    "Cached:    %8u kB\n"
+	    "SwapCached:%8u kB\n"
+	    "Active:    %8u kB\n"
+	    "Inactive:  %8u kB\n"
+	    "HighTotal: %8u kB\n"
+	    "HighFree:  %8u kB\n"
+	    "LowTotal:  %8u kB\n"
+	    "LowFree:   %8u kB\n"
+	    "SwapTotal: %8lu kB\n"
+	    "SwapFree:  %8lu kB\n",
+	    total_mem, total_mem - free_mem, free_mem, 0, 0, 0,
+	    total_swap, used_swap, free_swap,
+	    btok(total_mem),				/* MemTotal */
+	    btok(free_mem),				/* MemFree */
+	    0,						/* MemShared */
+	    0,						/* Buffers */
+	    0,						/* Cached */
+	    0,						/* SwapCached */
+	    0,						/* Active */
+	    0,						/* Inactive */
+	    0,						/* HighTotal */
+	    0,						/* HighFree */
+	    btok(total_mem),				/* LowTotal */
+	    btok(free_mem),				/* LowFree */
+	    btok(total_swap),				/* SwapTotal */
+	    btok(free_swap));				/* SwapFree */
+}
+
+/*
+ * lxpr_read_mounts(): emit the mount table as seen from the zone that owns
+ * this lxproc node.
+ *
+ * Each reported mount is printed as
+ *	"<resource> <mount point> <fstype> <ro|rw> 0 0"
+ * The global zone walks the list that starts at rootvfs (following
+ * vfs_next); any other zone walks zone_vfslist (following vfs_zone_next).
+ * Entries flagged VFS_NOMNTTAB are never reported, and neither are entries
+ * whose translated mount point cannot be looked up or does not resolve to a
+ * vnode with VROOT set.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	struct vfs *vfsp;
+	struct vfs *vfslist;
+	zone_t *zone = LXPTOZ(lxpnp);
+	struct print_data {
+		refstr_t *vfs_mntpt;
+		refstr_t *vfs_resource;
+		uint_t vfs_flag;
+		int vfs_fstype;
+		struct print_data *next;
+	} *print_head = NULL;
+	struct print_data **print_tail = &print_head;	/* append point */
+	struct print_data *printp;
+
+	vfs_list_read_lock();
+
+	if (zone == global_zone) {
+		vfsp = vfslist = rootvfs;
+	} else {
+		vfsp = vfslist = zone->zone_vfslist;
+		/*
+		 * If the zone has a root entry, it will be the first in
+		 * the list.  If it doesn't, we conjure one up.
+		 */
+		if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+		    zone->zone_rootpath) != 0) {
+			struct vfs *tvfsp;
+			/*
+			 * The root of the zone is not a mount point.  The vfs
+			 * we want to report is that of the zone's root vnode.
+			 */
+			tvfsp = zone->zone_rootvp->v_vfsp;
+
+			lxpr_uiobuf_printf(uiobuf,
+			    "/ / %s %s 0 0\n",
+			    vfssw[tvfsp->vfs_fstype].vsw_name,
+			    tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+		}
+		if (vfslist == NULL) {	/* nothing else to report */
+			vfs_list_unlock();
+			return;
+		}
+	}
+
+	/*
+	 * Later on we have to do a lookupname, which can end up causing
+	 * another vfs_list_read_lock() to be called, which can lead to a
+	 * deadlock.  To avoid this, we extract the data we need into a local
+	 * list, then we can run this list without holding
+	 * vfs_list_read_lock().  We keep the list in the same order as the
+	 * vfs_list.  Each entry takes its own holds on the mount point and
+	 * resource refstrs; those holds are released in the print loop
+	 * below, together with the entry itself.
+	 */
+	do {
+		/* Skip mounts we shouldn't show */
+		if (vfsp->vfs_flag & VFS_NOMNTTAB) {
+			goto nextfs;
+		}
+
+		printp = kmem_alloc(sizeof (*printp), KM_SLEEP);
+		refstr_hold(vfsp->vfs_mntpt);
+		printp->vfs_mntpt = vfsp->vfs_mntpt;
+		refstr_hold(vfsp->vfs_resource);
+		printp->vfs_resource = vfsp->vfs_resource;
+		printp->vfs_flag = vfsp->vfs_flag;
+		printp->vfs_fstype = vfsp->vfs_fstype;
+		printp->next = NULL;
+
+		*print_tail = printp;
+		print_tail = &printp->next;
+
+nextfs:
+		vfsp = (zone == global_zone) ?
+		    vfsp->vfs_next : vfsp->vfs_zone_next;
+
+	} while (vfsp != vfslist);	/* the list is circular */
+
+	vfs_list_unlock();
+
+	/*
+	 * now we can run through what we've extracted without holding
+	 * vfs_list_read_lock().  Every entry is released and freed at
+	 * "nextp", whether or not it was printed.
+	 */
+	printp = print_head;
+	while (printp != NULL) {
+		struct print_data *printp_next;
+		const char *resource;
+		char *mntpt;
+		struct vnode *vp;
+		int error;
+
+		mntpt = (char *)refstr_value(printp->vfs_mntpt);
+		resource = refstr_value(printp->vfs_resource);
+
+		if (mntpt != NULL && mntpt[0] != '\0')
+			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+		else
+			mntpt = "-";
+
+		error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+
+		if (error != 0)
+			goto nextp;	/* mount point does not resolve */
+
+		if (!(vp->v_flag & VROOT)) {	/* resolved vnode lacks VROOT */
+			VN_RELE(vp);
+			goto nextp;
+		}
+		VN_RELE(vp);
+
+		if (resource != NULL && resource[0] != '\0') {
+			if (resource[0] == '/') {
+				/* a path not visible in the zone is replaced */
+				resource = ZONE_PATH_VISIBLE(resource, zone) ?
+				    ZONE_PATH_TRANSLATE(resource, zone) :
+				    mntpt;
+			}
+		} else {
+			resource = "-";
+		}
+
+		lxpr_uiobuf_printf(uiobuf,
+		    "%s %s %s %s 0 0\n",
+		    resource, mntpt, vfssw[printp->vfs_fstype].vsw_name,
+		    printp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+nextp:
+		printp_next = printp->next;
+		refstr_rele(printp->vfs_mntpt);
+		refstr_rele(printp->vfs_resource);
+		kmem_free(printp, sizeof (*printp));
+		printp = printp_next;
+
+	}
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * We don't support partitions in a local zone because it requires access to
+ * physical devices.  But we need to fake up enough of the file to show that we
+ * have no partitions.
+ *
+ * The output is therefore always the same: the column-header line followed
+ * by one empty line, and no partition rows.  Neither the node nor the zone
+ * is consulted.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_printf(uiobuf,
+	    "major minor  #blocks  name     rio rmerge rsect ruse "
+	    "wio wmerge wsect wuse running use aveq\n\n");
+}
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file.  Note that
+ * we don't lie here -- we don't pretend that we're Linux.  If lxproc is to
+ * be used in a Linux-branded zone, there will need to be a mount option to
+ * indicate that Linux should be more fully mimicked.
+ *
+ * The single output line has the shape
+ *	"<sysname> version <release> (<compiler> version X.Y.Z) #<version>
+ *	SMP <timestamp>"
+ * where sysname, release and version come from utsname, the compiler name
+ * and version are fixed at build time by the preprocessor branch below, and
+ * the timestamp is a constant placeholder string.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lxpr_uiobuf_printf(uiobuf,
+	    "%s version %s (%s version %d.%d.%d) "
+	    "#%s SMP %s\n",
+	    utsname.sysname, utsname.release,
+#if defined(__GNUC__)
+	    "gcc",
+	    __GNUC__,
+	    __GNUC_MINOR__,
+	    __GNUC_PATCHLEVEL__,
+#else
+	    "Sun C",
+	    __SUNPRO_C / 0x100,			/* bits 8 and up */
+	    (__SUNPRO_C & 0xff) / 0x10,		/* bits 4-7 */
+	    __SUNPRO_C & 0xf,			/* bits 0-3 */
+#endif
+	    utsname.version,
+	    "00:00:00 00/00/00");		/* placeholder, not a real date */
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ * Emits an aggregate "cpu" line, then one "cpuN" line per CPU, then the
+ * "page", "swap", "intr", "ctxt", "btime", "processes", "procs_running" and
+ * "procs_blocked" lines.  Columns that are not tracked here are printed as
+ * a constant 0.
+ *
+ * Both CPU walks run with cpu_lock held and start at the head of the
+ * calling CPU's partition list; they follow cpu_next_part when pool psets
+ * are enabled and cpu_next otherwise, stopping when the walk returns to its
+ * starting CPU.  A CPU without CPU_EXISTS contributes nothing, but the walk
+ * still has to step past it: a bare "continue" inside a do/while jumps
+ * straight to the loop test, so the skip is done with a goto to the
+ * advance step instead.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	ulong_t idle_cum = 0;
+	ulong_t sys_cum  = 0;
+	ulong_t user_cum = 0;
+	ulong_t irq_cum = 0;
+	ulong_t cpu_nrunnable_cum = 0;
+	ulong_t w_io_cum = 0;
+
+	ulong_t pgpgin_cum    = 0;
+	ulong_t pgpgout_cum   = 0;
+	ulong_t pgswapout_cum = 0;
+	ulong_t pgswapin_cum  = 0;
+	ulong_t intr_cum = 0;
+	ulong_t pswitch_cum = 0;
+	ulong_t forks_cum = 0;
+	hrtime_t msnsecs[NCMSTATES];
+
+	/* temporary variable since scalehrtime modifies data in place */
+	hrtime_t tmptime;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	/* Calculate cumulative stats */
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		int i;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0)
+			goto next_cum;
+
+		get_cpu_mstate(cp, msnsecs);
+
+		idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+		sys_cum  += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+		user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+		pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+		pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+		pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+		pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+		cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+		w_io_cum += CPU_STATS(cp, sys.iowait);
+		for (i = 0; i < NCMSTATES; i++) {
+			tmptime = cp->cpu_intracct[i];
+			scalehrtime(&tmptime);
+			irq_cum += NSEC_TO_TICK(tmptime);
+		}
+
+		for (i = 0; i < PIL_MAX; i++)
+			intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+		pswitch_cum += CPU_STATS(cp, sys.pswitch);
+		forks_cum += CPU_STATS(cp, sys.sysfork);
+		forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+next_cum:
+		/* always advance, including past a skipped CPU */
+		cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next;
+	} while (cp != cpstart);
+
+	lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n",
+	    user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L);
+
+	/* Do per processor stats; cp is back at cpstart here */
+	do {
+		int i;
+
+		ulong_t idle_ticks;
+		ulong_t sys_ticks;
+		ulong_t user_ticks;
+		ulong_t irq_ticks = 0;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0)
+			goto next_cpu;
+
+		get_cpu_mstate(cp, msnsecs);
+
+		idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+		sys_ticks  = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+		user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+		for (i = 0; i < NCMSTATES; i++) {
+			tmptime = cp->cpu_intracct[i];
+			scalehrtime(&tmptime);
+			irq_ticks += NSEC_TO_TICK(tmptime);
+		}
+
+		lxpr_uiobuf_printf(uiobuf,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu\n",
+		    cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks,
+		    0L, irq_ticks, 0L);
+
+next_cpu:
+		/* always advance, including past a skipped CPU */
+		cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next;
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "page %lu %lu\n"
+	    "swap %lu %lu\n"
+	    "intr %lu\n"
+	    "ctxt %lu\n"
+	    "btime %lu\n"
+	    "processes %lu\n"
+	    "procs_running %lu\n"
+	    "procs_blocked %lu\n",
+	    pgpgin_cum, pgpgout_cum,
+	    pgswapin_cum, pgswapout_cum,
+	    intr_cum,
+	    pswitch_cum,
+	    boot_time,
+	    forks_cum,
+	    cpu_nrunnable_cum,
+	    w_io_cum);
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * The output is a single line "<uptime> <idle>\n", both values in seconds
+ * with two decimal places.  Fixed point arithmetic is used to get the two
+ * decimal places.
+ *
+ * Uptime is measured from the start time (p_mstart) of the zone's zsched
+ * process.  Idle time is the sum of the idle and wait tick counters of the
+ * CPUs visited, divided by the number of CPUs visited, converted from
+ * ticks using hz.
+ *
+ * A CPU without CPU_EXISTS is skipped with a goto to the advance step: a
+ * bare "continue" inside a do/while jumps straight to the loop test and
+ * would never move cp forward.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	ulong_t idle_cum = 0;
+	ulong_t cpu_count = 0;
+	ulong_t idle_s;
+	ulong_t idle_cs;
+	ulong_t up_s;
+	ulong_t up_cs;
+	hrtime_t birthtime;
+	hrtime_t centi_sec = 10000000;  /* 10^7 */
+
+	ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+	/* Calculate cumulative stats */
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cp->cpu_flags & CPU_EXISTS) == 0)
+			goto next_cpu;
+
+		idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+		idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+		cpu_count += 1;
+
+next_cpu:
+		/* always advance, including past a skipped CPU */
+		cp = pools_enabled ? cp->cpu_next_part : cp->cpu_next;
+	} while (cp != cpstart);
+	mutex_exit(&cpu_lock);
+
+	/* Getting the Zone zsched process startup time */
+	birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+	up_cs = (gethrtime() - birthtime) / centi_sec;
+	up_s = up_cs / 100;
+	up_cs %= 100;
+
+	/*
+	 * Average the idle ticks over the CPUs counted.  The ASSERT is
+	 * compiled out of non-DEBUG kernels, so also guard the division
+	 * itself rather than risk a divide-by-zero.
+	 */
+	ASSERT(cpu_count > 0);
+	if (cpu_count > 0)
+		idle_cum /= cpu_count;
+	idle_s = idle_cum / hz;
+	idle_cs = idle_cum % hz;
+	idle_cs *= 100;
+	idle_cs /= hz;
+
+	/* all four values are ulong_t, so every conversion must be %lu */
+	lxpr_uiobuf_printf(uiobuf,
+	    "%lu.%02lu %lu.%02lu\n", up_s, up_cs, idle_s, idle_cs);
+}
+
+static const char *amd_x_edx[] = {	/* AMD: leaf 0x80000001 %edx */
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"syscall",
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"mp",
+	"nx",	NULL,	"mmxext", NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	"lm",	"3dnowext", "3dnow"
+};
+
+static const char *amd_x_ecx[] = {	/* AMD: leaf 0x80000001 %ecx */
+	"lahf_lm", NULL, "svm", NULL,
+	"altmovcr8"
+};
+
+static const char *tm_x_edx[] = {	/* X86_VENDOR_TM: 0x80000001 %edx */
+	"recovery", "longrun", NULL, "lrti"
+};
+
+/*
+ * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx."
+ *
+ * Like the other tables here, this one is indexed by bit number:
+ * lxpr_read_cpuinfo() prints entry i when bit i of the register is set and
+ * the entry is not NULL.  This table is applied to %edx of cpuid leaf
+ * 0x80000001 when the vendor is Intel.
+ */
+static const char *intc_x_edx[] = {
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	"syscall",
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	NULL,	NULL,	NULL,
+	"nx",	NULL,	NULL,   NULL,
+	NULL,	NULL,	NULL,	NULL,
+	NULL,	"lm",   NULL,   NULL
+};
+
+static const char *intc_edx[] = {	/* all vendors: leaf 1 %edx */
+	"fpu",	"vme",	"de",	"pse",
+	"tsc",	"msr",	"pae",	"mce",
+	"cx8",	"apic",	 NULL,	"sep",
+	"mtrr",	"pge",	"mca",	"cmov",
+	"pat",	"pse36", "pn",	"clflush",
+	NULL,	"dts",	"acpi",	"mmx",
+	"fxsr",	"sse",	"sse2",	"ss",
+	"ht",	"tm",	"ia64",	"pbe"
+};
+
+/*
+ * "sse3" on linux is called "pni" (Prescott New Instructions).
+ *
+ * This table is applied to %ecx of cpuid leaf 1 for every vendor.  It has
+ * fewer than 32 entries, so higher bits are never named.
+ */
+static const char *intc_ecx[] = {
+	"pni",	NULL,	NULL, "monitor",
+	"ds_cpl", NULL,	NULL, "est",
+	"tm2",	NULL,	"cid", NULL,
+	NULL,	"cx16",	"xtpr"
+};
+
+/*
+ * Print " <name>" for every bit that is set in 'bits' and has a non-NULL
+ * entry in 'names'.  'names' holds 'nnames' entries indexed by bit number.
+ */
+static void
+lxpr_cpuinfo_flags(lxpr_uiobuf_t *uiobuf, uint32_t bits, const char **names,
+    uint_t nnames)
+{
+	uint_t i;
+
+	ASSERT(nnames <= 32);
+
+	for (i = 0; i < nnames; i++) {
+		/*
+		 * The shift must be unsigned: the 32-entry tables reach
+		 * bit 31, and 1 << 31 overflows a signed int.
+		 */
+		if ((bits & (1U << i)) != 0 && names[i] != NULL)
+			lxpr_uiobuf_printf(uiobuf, " %s", names[i]);
+	}
+}
+
+/*
+ * lxpr_read_cpuinfo(): read the contents of the "cpuinfo" file.
+ *
+ * With cpu_lock held, walks the CPUs starting at the head of the calling
+ * CPU's partition list (following cpu_next_part when pool psets are enabled
+ * and cpu_next otherwise) and prints one stanza per CPU, each terminated by
+ * an empty line.  The "flags" line lists, in this order, the names for
+ * cpuid leaf 1 %edx, then the vendor-specific names for leaf 0x80000001,
+ * then the names for leaf 1 %ecx.
+ */
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	char brandstr[CPU_IDSTRLEN];
+	struct cpuid_regs cpr;
+	int maxeax;
+	uint32_t std_ecx, std_edx, ext_ecx, ext_edx;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		/*
+		 * This returns the maximum eax value for standard cpuid
+		 * functions in eax.
+		 */
+		cpr.cp_eax = 0;
+		(void) cpuid_insn(cp, &cpr);
+		maxeax = cpr.cp_eax;
+
+		/*
+		 * Get standard x86 feature flags.
+		 */
+		cpr.cp_eax = 1;
+		(void) cpuid_insn(cp, &cpr);
+		std_ecx = cpr.cp_ecx;
+		std_edx = cpr.cp_edx;
+
+		/*
+		 * Now get extended feature flags.
+		 */
+		cpr.cp_eax = 0x80000001;
+		(void) cpuid_insn(cp, &cpr);
+		ext_ecx = cpr.cp_ecx;
+		ext_edx = cpr.cp_edx;
+
+		(void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+		lxpr_uiobuf_printf(uiobuf,
+		    "processor\t: %d\n"
+		    "vendor_id\t: %s\n"
+		    "cpu family\t: %d\n"
+		    "model\t\t: %d\n"
+		    "model name\t: %s\n"
+		    "stepping\t: %d\n"
+		    "cpu MHz\t\t: %u.%03u\n",
+		    cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+		    cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+		    (uint32_t)(cpu_freq_hz / 1000000),
+		    ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+		lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+		    getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+		if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+			/*
+			 * 'siblings' is used for HT-style threads
+			 */
+			lxpr_uiobuf_printf(uiobuf,
+			    "physical id\t: %lu\n"
+			    "siblings\t: %u\n",
+			    pg_plat_hw_instance_id(cp, PGHW_CHIP),
+			    cpuid_get_ncpu_per_chip(cp));
+		}
+
+		/*
+		 * Since we're relatively picky about running on older hardware,
+		 * we can be somewhat cavalier about the answers to these ones.
+		 *
+		 * In fact, given the hardware we support, we just say:
+		 *
+		 *	fdiv_bug	: no	(if we're on a 64-bit kernel)
+		 *	hlt_bug		: no
+		 *	f00f_bug	: no
+		 *	coma_bug	: no
+		 *	wp		: yes	(write protect in supervsr mode)
+		 */
+		lxpr_uiobuf_printf(uiobuf,
+		    "fdiv_bug\t: %s\n"
+		    "hlt_bug \t: no\n"
+		    "f00f_bug\t: no\n"
+		    "coma_bug\t: no\n"
+		    "fpu\t\t: %s\n"
+		    "fpu_exception\t: %s\n"
+		    "cpuid level\t: %d\n"
+		    "flags\t\t:",
+#if defined(__i386)
+		    fpu_pentium_fdivbug ? "yes" : "no",
+#else
+		    "no",
+#endif /* __i386 */
+		    fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+		    maxeax);
+
+		/* standard %edx names apply to every vendor */
+		lxpr_cpuinfo_flags(uiobuf, std_edx, intc_edx,
+		    sizeof (intc_edx) / sizeof (intc_edx[0]));
+
+		/*
+		 * name additional features where appropriate
+		 */
+		switch (x86_vendor) {
+		case X86_VENDOR_Intel:
+			lxpr_cpuinfo_flags(uiobuf, ext_edx, intc_x_edx,
+			    sizeof (intc_x_edx) / sizeof (intc_x_edx[0]));
+			break;
+
+		case X86_VENDOR_AMD:
+			lxpr_cpuinfo_flags(uiobuf, ext_edx, amd_x_edx,
+			    sizeof (amd_x_edx) / sizeof (amd_x_edx[0]));
+			lxpr_cpuinfo_flags(uiobuf, ext_ecx, amd_x_ecx,
+			    sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]));
+			break;
+
+		case X86_VENDOR_TM:
+			lxpr_cpuinfo_flags(uiobuf, ext_edx, tm_x_edx,
+			    sizeof (tm_x_edx) / sizeof (tm_x_edx[0]));
+			break;
+		default:
+			break;
+		}
+
+		/* standard %ecx names come last, as before */
+		lxpr_cpuinfo_flags(uiobuf, std_ecx, intc_ecx,
+		    sizeof (intc_ecx) / sizeof (intc_ecx[0]));
+
+		lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+		if (pools_enabled)
+			cp = cp->cpu_next_part;
+		else
+			cp = cp->cpu_next;
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+	lxpr_uiobuf_seterr(uiobuf, EFAULT);	/* no content; always EFAULT */
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ *
+ * When the node is backed by a real vnode and the caller asked for
+ * ATTR_REAL, the attributes come from that vnode (after it passes
+ * VOP_ACCESS()); an fd node additionally has its mode, type, size and link
+ * count overridden so that it keeps looking like the lxproc node.
+ * Otherwise the attributes are synthesized from the lxproc node itself.
+ * Returns 0, or the error from VOP_ACCESS()/VOP_GETATTR() on the real vnode.
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t *node = VTOLXP(vp);
+	lxpr_nodetype_t type = node->lxpr_type;
+	vnode_t *realvp = node->lxpr_realvp;
+	extern uint_t nproc;
+	int error;
+
+	if (realvp != NULL && (flags & ATTR_REAL)) {
+		/* access check first, then the real vnode's attributes */
+		error = VOP_ACCESS(realvp, 0, 0, cr, ct);
+		if (error == 0)
+			error = VOP_GETATTR(realvp, vap, flags, cr, ct);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * A file under lx /proc/pid/fd keeps the lxproc node's own
+		 * mode and type, so it goes on looking like a symlink.
+		 */
+		if (type == LXPR_PID_FD_FD) {
+			vap->va_mode = node->lxpr_mode;
+			vap->va_type = vp->v_type;
+			vap->va_size = 0;
+			vap->va_nlink = 1;
+		}
+		return (0);
+	}
+
+	/* Start from the node's own values; some types adjust them below. */
+	bzero(vap, sizeof (*vap));
+	vap->va_type = vp->v_type;
+	vap->va_mode = node->lxpr_mode;
+	vap->va_uid = node->lxpr_uid;
+	vap->va_gid = node->lxpr_gid;
+	vap->va_nodeid = node->lxpr_ino;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	vap->va_nlink = 1;
+	vap->va_blksize = DEV_BSIZE;
+	vap->va_atime = vap->va_mtime = vap->va_ctime = node->lxpr_time;
+
+	if (type == LXPR_PROCDIR) {
+		/* one entry per process plus "." and ".." and fixed files */
+		vap->va_nlink = nproc + 2 + PROCDIRFILES;
+		vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+	} else if (type == LXPR_PIDDIR) {
+		vap->va_nlink = PIDDIRFILES;
+		vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+	} else if (type == LXPR_SELF) {
+		/* "self" is reported as owned by the calling process */
+		vap->va_uid = crgetruid(curproc->p_cred);
+		vap->va_gid = crgetrgid(curproc->p_cred);
+	}
+
+	vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+	return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ *
+ * Checks are applied in this order:
+ *  - any request that includes VWRITE fails with EROFS;
+ *  - a LXPR_PIDDIR node is always accessible;
+ *  - for the per-process node types listed in the switch, the process must
+ *    be lockable (ENOENT otherwise), and the caller must be that process,
+ *    or pass secpolicy_proc_access(), or pass priv_proc_cred_perm()
+ *    (EACCES otherwise);
+ *  - a node backed by a real vnode defers to VOP_ACCESS() on that vnode;
+ *  - a caller that passes secpolicy_proc_access() is allowed;
+ *  - otherwise the requested bits are compared with the owner, group or
+ *    other bits of lxpr_mode, returning 0 or EACCES.
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	int shift = 0;
+	proc_t *tp;
+
+	/* lx /proc is a read only file system */
+	if (mode & VWRITE)
+		return (EROFS);
+
+	/*
+	 * If this is a restricted file, check access permissions.
+	 */
+	switch (lxpnp->lxpr_type) {
+	case LXPR_PIDDIR:
+		return (0);
+	case LXPR_PID_CURDIR:
+	case LXPR_PID_ENV:
+	case LXPR_PID_EXE:
+	case LXPR_PID_MAPS:
+	case LXPR_PID_MEM:
+	case LXPR_PID_ROOTDIR:
+	case LXPR_PID_FDDIR:
+	case LXPR_PID_FD_FD:
+		if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL)
+			return (ENOENT);
+		if (tp != curproc && secpolicy_proc_access(cr) != 0 &&
+		    priv_proc_cred_perm(cr, tp, NULL, mode) != 0) {
+			lxpr_unlock(tp);
+			return (EACCES);
+		}
+		lxpr_unlock(tp);	/* FALLTHROUGH to the checks below */
+	default:
+		break;
+	}
+
+	if (lxpnp->lxpr_realvp != NULL) {
+		/*
+		 * For these we use the underlying vnode's accessibility.
+		 */
+		return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+	}
+
+	/* If user is root allow access regardless of permission bits */
+	if (secpolicy_proc_access(cr) == 0)
+		return (0);
+
+	/*
+	 * Access check is based on only one of owner, group, public.  If not
+	 * owner, then check group.  If not a member of the group, then check
+	 * public access.
+	 */
+	if (crgetuid(cr) != lxpnp->lxpr_uid) {
+		shift += 3;
+		if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+			shift += 3;
+	}
+
+	/* clear every requested bit that the selected class grants */
+	mode &= ~(lxpnp->lxpr_mode << shift);
+
+	if (mode == 0)	/* nothing left ungranted */
+		return (0);
+
+	return (EACCES);
+}
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+	return (NULL);	/* never finds anything; lxpr_lookup() returns ENOENT */
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ *
+ * Resolves 'comp' within the lxproc directory 'dp' and stores the result in
+ * *vpp.  ".." yields the node's parent and "." (or an empty name) yields dp
+ * itself, each with a hold taken; anything else is handed to the lookup
+ * routine registered for the directory's node type.  Returns 0 on success,
+ * the error from lxpr_access() if the caller may not search dp, or ENOENT
+ * if the per-type routine finds nothing.
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+	lxpr_node_t *dirnode = VTOLXP(dp);
+	lxpr_nodetype_t type = dirnode->lxpr_type;
+	int error;
+
+	ASSERT(dp->v_type == VDIR);
+	ASSERT(type < LXPR_NFILES);
+
+	/*
+	 * Lookups on these node types are done on the realvp, so they
+	 * should never arrive here.
+	 */
+	ASSERT(type != LXPR_PID_FD_FD &&
+	    type != LXPR_PID_CURDIR &&
+	    type != LXPR_PID_ROOTDIR);
+
+	/* The caller needs search permission on the directory. */
+	error = lxpr_access(dp, VEXEC, 0, cr, ct);
+	if (error != 0)
+		return (error);
+
+	/* ".." is simply the parent vnode. */
+	if (strcmp(comp, "..") == 0) {
+		VN_HOLD(dirnode->lxpr_parent);
+		*vpp = dirnode->lxpr_parent;
+		return (0);
+	}
+
+	/* An empty component or "." names the directory being searched. */
+	if (dp->v_type == VDIR &&
+	    (comp[0] == '\0' || strcmp(comp, ".") == 0)) {
+		VN_HOLD(dp);
+		*vpp = dp;
+		return (0);
+	}
+
+	/* Everything else is specific to the type of directory. */
+	*vpp = lxpr_lookup_function[type](dp, comp);
+	if (*vpp == NULL)
+		return (ENOENT);
+	return (0);
+}
+
+/*
+ * Linear search of a directory table for 'comp'.  On the first exact name
+ * match a node of the matching type is obtained from lxpr_getnode() and its
+ * vnode is returned; NULL is returned when no entry matches.
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+    lxpr_dirent_t *dirtab, int dirtablen)
+{
+	int idx;
+
+	for (idx = 0; idx < dirtablen; idx++) {
+		lxpr_dirent_t *ent = &dirtab[idx];
+		lxpr_node_t *found;
+		vnode_t *vp;
+
+		if (strcmp(ent->d_name, comp) != 0)
+			continue;
+
+		found = lxpr_getnode(dp, ent->d_type, p, 0);
+		vp = LXPTOV(found);
+		ASSERT(vp != NULL);
+		return (vp);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Lookup routine for a /proc/<pid> directory: searches the piddir table
+ * while the process is locked.  Returns NULL when the process cannot be
+ * locked or the name is not in the table.
+ */
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+	vnode_t *result = NULL;
+	proc_t *p;
+
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR);
+
+	if ((p = lxpr_lock(VTOLXP(dp)->lxpr_pid)) != NULL) {
+		result = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES);
+		lxpr_unlock(p);
+	}
+
+	return (result);
+}
+
+/*
+ * Lookup one of the process's open files.
+ *
+ * 'comp' must consist only of decimal digits naming a file descriptor.  On
+ * success the vnode of a new LXPR_PID_FD_FD node is returned, with
+ * lxpr_realvp set to (and holding) the vnode behind that descriptor.  NULL
+ * is returned when the name is not a number or overflows, the process
+ * cannot be locked, the process is a zombie or a system process, or the
+ * descriptor is not open.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+	lxpr_node_t *dlxpnp = VTOLXP(dp);
+	lxpr_node_t *lxpnp;
+	vnode_t *vp = NULL;
+	proc_t *p;
+	file_t *fp;
+	uint_t fd;
+	int c;
+	uf_entry_t *ufp;
+	uf_info_t *fip;
+
+	ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+	/*
+	 * convert the string rendition of the filename
+	 * to a file descriptor
+	 */
+	fd = 0;
+	while ((c = *comp++) != '\0') {
+		int ofd;
+		if (c < '0' || c > '9')
+			return (NULL);
+
+		ofd = fd;
+		fd = 10*fd + c - '0';
+		/* integer overflow */
+		if (fd / 10 != ofd)
+			return (NULL);
+	}
+
+	/*
+	 * get the proc to work with and lock it
+	 */
+	p = lxpr_lock(dlxpnp->lxpr_pid);
+	if ((p == NULL))
+		return (NULL);
+
+	/*
+	 * If the process is a zombie or system process
+	 * it can't have any open files.
+	 */
+	if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) {
+		lxpr_unlock(p);
+		return (NULL);
+	}
+
+	/*
+	 * get us a fresh node/vnode
+	 */
+	lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd);
+
+	/*
+	 * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+	 * going away while we dereference into fi_list.
+	 */
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * get open file info
+	 */
+	fip = (&(p)->p_user.u_finfo);
+	mutex_enter(&fip->fi_lock);
+
+	if (fd < fip->fi_nfiles) {
+		UF_ENTER(ufp, fip, fd);
+		/*
+		 * ensure the fd is still kosher.
+		 * it may have gone between the readdir and
+		 * the lookup
+		 *
+		 * NOTE(review): this path drops fi_lock before UF_EXIT(),
+		 * the reverse of the order used on the success path just
+		 * below -- confirm that is intentional.
+		 */
+		if (fip->fi_list[fd].uf_file == NULL) {
+			mutex_exit(&fip->fi_lock);
+			UF_EXIT(ufp);
+			mutex_enter(&p->p_lock);	/* retaken for unlock */
+			lxpr_unlock(p);
+			lxpr_freenode(lxpnp);
+			return (NULL);
+		}
+
+		if ((fp = ufp->uf_file) != NULL)
+			vp = fp->f_vnode;
+		UF_EXIT(ufp);
+	}
+	mutex_exit(&fip->fi_lock);
+
+	if (vp == NULL) {	/* fd out of range, or no file behind it */
+		mutex_enter(&p->p_lock);
+		lxpr_unlock(p);
+		lxpr_freenode(lxpnp);
+		return (NULL);
+	} else {
+		/*
+		 * Fill in the lxpr_node so future references will be able to
+		 * find the underlying vnode. The vnode is held on the realvp.
+		 */
+		lxpnp->lxpr_realvp = vp;
+		VN_HOLD(lxpnp->lxpr_realvp);
+	}
+
+	mutex_enter(&p->p_lock);	/* retaken for lxpr_unlock() */
+	lxpr_unlock(p);
+	dp = LXPTOV(lxpnp);
+	ASSERT(dp != NULL);
+
+	return (dp);
+}
+
+/*
+ * Lookup routine for the "net" directory: a plain search of the netdir
+ * table, with no process associated with the resulting node.
+ */
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+	return (lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES));
+}
+
+/*
+ * Lookup routine for the top-level lxproc directory.
+ *
+ * A name that starts with a digit is treated as a pid and must consist of
+ * decimal digits only; it resolves to a LXPR_PIDDIR node for that process,
+ * or NULL if the name is malformed, the number does not fit in a pid_t,
+ * the process cannot be locked, or secpolicy_basic_procinfo() denies the
+ * caller.  Any other name is searched for in the fixed lxpr_dir table.
+ */
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+	/*
+	 * We know all the names of files & dirs in our file system structure
+	 * except those that are pid names.  These change as pids are created/
+	 * deleted etc., so we just look for a number as the first char to see
+	 * if we are we doing pid lookups.
+	 *
+	 * Don't need to check for "self" as it is implemented as a symlink
+	 */
+	if (*comp >= '0' && *comp <= '9') {
+		uint_t upid = 0;
+		pid_t pid;
+		lxpr_node_t *lxpnp = NULL;
+		proc_t *p;
+		int c;
+
+		/*
+		 * Convert the name to a pid.  Every character has to be a
+		 * digit: otherwise a name such as "1x" would be folded into
+		 * some unrelated pid.  The value is accumulated unsigned so
+		 * that overflow is well defined and can be detected, in the
+		 * same way lxpr_lookup_fddir() parses descriptor numbers.
+		 */
+		while ((c = *comp++) != '\0') {
+			uint_t opid;
+
+			if (c < '0' || c > '9')
+				return (NULL);
+
+			opid = upid;
+			upid = 10 * upid + (c - '0');
+			/* integer overflow */
+			if (upid / 10 != opid)
+				return (NULL);
+		}
+
+		/* reject values that do not survive conversion to pid_t */
+		pid = (pid_t)upid;
+		if (pid < 0 || (uint_t)pid != upid)
+			return (NULL);
+
+		/*
+		 * Can't continue if the process is still loading or it doesn't
+		 * really exist yet (or maybe it just died!)
+		 */
+		p = lxpr_lock(pid);
+		if (p == NULL)
+			return (NULL);
+
+		if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			lxpr_unlock(p);
+			return (NULL);
+		}
+
+		/*
+		 * allocate and fill in a new lxpr node
+		 */
+		lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0);
+
+		lxpr_unlock(p);
+
+		dp = LXPTOV(lxpnp);
+		ASSERT(dp != NULL);
+
+		return (dp);
+	}
+
+	/* Lookup fixed names */
+	return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ *
+ * Validates the request and then dispatches to the readdir routine
+ * registered for the directory's node type.  Returns the error from
+ * lxpr_access() if the caller may not read dp, EINVAL for a negative
+ * offset or a non-positive residual count, ENOENT for an offset that is
+ * not a multiple of LXPR_SDSIZE, and otherwise whatever the per-type
+ * routine returns.
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	lxpr_node_t *dirnode = VTOLXP(dp);
+	lxpr_nodetype_t type = dirnode->lxpr_type;
+	off_t offset;
+	ssize_t resid;
+	int error;
+
+	ASSERT(dp->v_type == VDIR);
+	ASSERT(type < LXPR_NFILES);
+
+	/*
+	 * Readdir on these node types is done on the realvp, so they
+	 * should never arrive here.
+	 */
+	ASSERT(type != LXPR_PID_FD_FD &&
+	    type != LXPR_PID_CURDIR &&
+	    type != LXPR_PID_ROOTDIR);
+
+	/* The caller needs read permission on the directory. */
+	error = lxpr_access(dp, VREAD, 0, cr, ct);
+	if (error != 0)
+		return (error);
+
+	offset = uiop->uio_offset;
+	resid = uiop->uio_resid;
+
+	/* negative offsets and empty or negative requests are invalid */
+	if (offset < 0 || resid <= 0)
+		return (EINVAL);
+
+	/* entries only exist at multiples of LXPR_SDSIZE */
+	if ((offset % LXPR_SDSIZE) != 0)
+		return (ENOENT);
+
+	return (lxpr_readdir_function[dirnode->lxpr_type](dirnode, uiop,
+	    eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	return (ENOTDIR);	/* always fails; uiop and eofp are untouched */
+}
+
+/*
+ * This has the common logic for returning directory entries
+ *
+ * Directory offsets advance in units of LXPR_SDSIZE: slot 0 is ".", slot 1
+ * is "..", and slot n + 2 is dirtab[n].  Entries are copied out one
+ * dirent64 at a time until the caller's buffer is full or the table is
+ * exhausted.
+ *
+ * Returns 0 on success, EINVAL if the buffer cannot hold even the first
+ * entry, or the error from uiomove().  When eofp is not NULL and 0 is
+ * returned, *eofp is set to 1 once the offset has reached the end of the
+ * table and to 0 otherwise; it is not written on an error return.
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+    lxpr_dirent_t *dirtab, int dirtablen)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+
+	oresid = uiop->uio_resid;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Satisfy user request
+	 */
+	while ((uresid = uiop->uio_resid) > 0) {
+		int dirindex;
+		off_t uoffset;
+		int reclen;
+		int error;
+
+		uoffset = uiop->uio_offset;
+		dirindex  = (uoffset / LXPR_SDSIZE) - 2;	/* skip . and .. */
+
+		if (uoffset == 0) {
+
+			dirent->d_ino = lxpnp->lxpr_ino;
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '\0';
+			reclen = DIRENT64_RECLEN(1);
+
+		} else if (uoffset == LXPR_SDSIZE) {
+
+			dirent->d_ino = lxpr_parentinode(lxpnp);
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '.';
+			dirent->d_name[2] = '\0';
+			reclen = DIRENT64_RECLEN(2);
+
+		} else if (dirindex >= 0 && dirindex < dirtablen) {
+			int slen = strlen(dirtab[dirindex].d_name);
+
+			dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+			    lxpnp->lxpr_pid, 0);
+
+			VERIFY(slen < LXPNSIZ);	/* name must fit in bp */
+			(void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+			reclen = DIRENT64_RECLEN(slen);
+
+		} else {
+			/* Run out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+		 */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid) {
+				return (EINVAL);
+			}
+			break;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount.  But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user.  So we set
+		 * uiop->uio_offset separately, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			return (error);
+
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+	}
+
+	/* Have run out of space, but could have just done last table entry */
+	if (eofp) {
+		*eofp =
+		    (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+	return (0);
+}
+
+
+/*
+ * Readdir routine for the top-level lxproc directory.
+ *
+ * The fixed entries (".", ".." and the lxpr_dir table) are produced by
+ * lxpr_readdir_common(); after those, one directory slot is reserved for
+ * each of the v.v_proc pid table indices.  Slots with no reportable process
+ * are skipped without producing an entry.
+ *
+ * Returns 0 on success, EINVAL if the buffer cannot hold even the first
+ * entry, or the error from uiomove().  When eofp is not NULL and 0 is
+ * returned from the process-entry phase, *eofp is set to 1 once every slot
+ * has been examined and to 0 otherwise.
+ */
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+	off_t uoffset;
+	zoneid_t zoneid;
+	pid_t pid;
+	int error;
+	int ceof;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+	oresid = uiop->uio_resid;
+	zoneid = LXPTOZ(lxpnp)->zone_id;
+
+	/*
+	 * We return directory entries in the order: "." and ".." then the
+	 * unique lxproc files, then the directories corresponding to the
+	 * running processes.  We have defined this as the ordering because
+	 * it allows us to more easily keep track of where we are between
+	 * calls to getdents().  If the number of processes changes between
+	 * calls then we can't lose track of where we are in the lxproc files.
+	 */
+
+	/* Do the fixed entries */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir,
+	    PROCDIRFILES);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		return (error);
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/* Do the process entries */
+	while ((uresid = uiop->uio_resid) > 0) {
+		proc_t *p;
+		pid_t real_pid;
+		zoneid_t proc_zoneid;
+		int len;
+		int reclen;
+		int i;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop when entire proc table has been examined.
+		 */
+		i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+		if (i < 0 || i >= v.v_proc) {
+			/* Run out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+		mutex_enter(&pidlock);
+
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, a PID of 0,
+		 * and anything the security policy doesn't allow
+		 * us to look at.
+		 */
+		if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+		    p->p_pid == 0 ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			mutex_exit(&pidlock);
+			goto next;
+		}
+
+		/*
+		 * Copy out everything we need from the proc while pidlock
+		 * is still held.  Once the lock is dropped nothing keeps
+		 * the process from exiting, so p must not be dereferenced
+		 * again.
+		 */
+		ASSERT(p->p_stat != 0);
+		real_pid = p->p_pid;
+		proc_zoneid = p->p_zone->zone_id;
+		mutex_exit(&pidlock);
+
+		/*
+		 * If this /proc was mounted in the global zone, view
+		 * all procs; otherwise, only view zone member procs.
+		 */
+		if (zoneid != GLOBAL_ZONEID && proc_zoneid != zoneid) {
+			goto next;
+		}
+
+		/*
+		 * Convert pid to the Linux default of 1 if we're the zone's
+		 * init process, otherwise use the value from the proc
+		 * structure
+		 */
+		pid = ((real_pid != curproc->p_zone->zone_proc_initpid) ?
+		    real_pid : 1);
+
+		dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+		 */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid)
+				return (EINVAL);
+			break;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount.  But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user.  So we set
+		 * uiop->uio_offset separately, in the increment of this for
+		 * the loop, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			return (error);
+next:
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+	}
+
+	if (eofp != NULL) {
+		*eofp = (uiop->uio_offset >=
+		    ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+
+	return (0);
+}
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	proc_t *p;
+	/* Readdir for /proc/<pid>: ENOENT unless the process still exists */
+	ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+	/* can't read its contents if it died */
+	mutex_enter(&pidlock);
+	/* pid 1 stands for the init of the caller's zone (zone_proc_initpid) */
+	p = prfind((lxpnp->lxpr_pid == 1) ?
+	    curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid);
+
+	if (p == NULL || p->p_stat == SIDL) {
+		mutex_exit(&pidlock);
+		return (ENOENT);
+	}
+	mutex_exit(&pidlock);	/* p is not used past this point */
+
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES));
+}
+
+static int
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);	/* /proc/net only */
+	return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure sized for an LXPNSIZ name */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;	/* uio_resid as of the current loop pass */
+	off_t uoffset;	/* uio_offset as of the current loop pass */
+	int error;
+	int ceof;	/* eof flag reported by lxpr_readdir_common() */
+	proc_t *p;
+	int fddirsize = -1;	/* -1 until the number of fd slots is known */
+	uf_info_t *fip;
+	/* List /proc/<pid>/fd: ".", ".." and one entry per open descriptor */
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+	oresid = uiop->uio_resid;
+
+	/* can't read its contents if it died; on success p_lock is held */
+	p = lxpr_lock(lxpnp->lxpr_pid);
+	if (p == NULL)
+		return (ENOENT);
+	/* Zombie, SSYS and kas-backed processes are listed with no fds */
+	if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas))
+		fddirsize = 0;
+
+	/*
+	 * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+	 * going away while we iterate over its fi_list.
+	 */
+	mutex_exit(&p->p_lock);
+
+	/* Get open file info; fi_lock stays held until "out" */
+	fip = (&(p)->p_user.u_finfo);
+	mutex_enter(&fip->fi_lock);
+
+	if (fddirsize == -1)
+		fddirsize = fip->fi_nfiles;
+
+	/* Do the fixed entries (in this case just "." & "..") */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		goto out;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Loop until user's request is satisfied or until
+	 * all file descriptors have been examined.
+	 */
+	for (; (uresid = uiop->uio_resid) > 0;
+	    uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+		int reclen;
+		int fd;
+		int len;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop at the end of the fd list (the -2 skips "." and "..")
+		 */
+		fd = (uoffset / LXPR_SDSIZE) - 2;
+		if (fd < 0 || fd >= fddirsize) {
+			if (eofp) {
+				*eofp = 1;
+			}
+			goto out;
+		}
+		/* empty slot; "continue" still advances uio_offset */
+		if (fip->fi_list[fd].uf_file == NULL)
+			continue;
+
+		dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+		/* the entry does not fit in what is left of the request */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid)
+				error = EINVAL;
+			goto out;
+		}
+		/* uiomove() also moves uio_offset; the loop resets it */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			goto out;
+	}
+	/* buffer exhausted: eof only if the offset is past the last slot */
+	if (eofp != NULL) {
+		*eofp =
+		    (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+
+out:
+	mutex_exit(&fip->fi_lock);
+	mutex_enter(&p->p_lock);	/* retaken before lxpr_unlock() */
+	lxpr_unlock(p);
+	return (error);
+}
+
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK(); uses CRED(), not cr
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+	char bp[MAXPATHLEN + 1];	/* link text; read back with strlen() */
+	size_t buflen = sizeof (bp);
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	vnode_t *rvp = lxpnp->lxpr_realvp;
+	pid_t pid;
+	int error = 0;
+
+	/* must be a symbolic link file */
+	if (vp->v_type != VLNK)
+		return (EINVAL);
+
+	/* Try to produce a symlink name for anything that has a realvp */
+	if (rvp != NULL) {
+		if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0)
+			return (error);
+		if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0)
+			return (error);
+	} else {	/* no realvp: only /proc/self resolves */
+		switch (lxpnp->lxpr_type) {
+		case LXPR_SELF:
+			/*
+			 * Convert pid to the Linux default of 1 if we're the
+			 * zone's init process
+			 */
+			pid = ((curproc->p_pid !=
+			    curproc->p_zone->zone_proc_initpid)
+			    ? curproc->p_pid : 1);
+
+			/*
+			 * Don't need to check result as every possible int
+			 * will fit within MAXPATHLEN bytes.
+			 */
+			(void) snprintf(bp, buflen, "%d", pid);
+			break;
+		case LXPR_PID_CURDIR:
+		case LXPR_PID_ROOTDIR:
+		case LXPR_PID_EXE:
+			return (EACCES);	/* no realvp to resolve */
+		default:
+			/*
+			 * Need to return error so that nothing thinks
+			 * that the symlink is empty and hence "."
+			 */
+			return (EINVAL);
+		}
+	}
+
+	/* copy the link data (without its NUL) to user space */
+	return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, so hand its lxpr_node_t to
+ * lxpr_freenode() to deallocate the file and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	lxpr_freenode(VTOLXP(vp));
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC(); declares no parameters
+ */
+static int
+lxpr_sync()
+{
+	/*
+	 * Nothing to sync, but this function must never fail: always 0.
+	 */
+	return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP(); nonzero means "same vnode"
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+	vnode_t *rvp;
+	/* Swap each lxproc vnode for its realvp, for as long as it has one */
+	while (vn_matchops(vp1, lxpr_vnodeops) &&
+	    (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) {
+		vp1 = rvp;
+	}
+
+	while (vn_matchops(vp2, lxpr_vnodeops) &&
+	    (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) {
+		vp2 = rvp;
+	}
+	/* an lxproc vnode with no realvp can only match by identity */
+	if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops))
+		return (vp1 == vp2);
+	/* neither is an lxproc vnode any more: defer to VOP_CMP() */
+	return (VOP_CMP(vp1, vp2, ct));
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP(); always returns 0
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+	vnode_t *rvp;
+	/* Use the realvp, then whatever its own VOP_REALVP() yields */
+	if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) {
+		vp = rvp;
+		if (VOP_REALVP(vp, &rvp, ct) == 0)
+			vp = rvp;
+	}
+	/* with no realvp, vp itself is handed back */
+	*vpp = vp;
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h
new file mode 100644
index 0000000000..eadb2ccd27
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxproc.h
@@ -0,0 +1,278 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef	_LXPROC_H
+#define	_LXPROC_H
+
+#ifdef _LXPROC_BRANDED_H
+#error Attempted to include native lxproc.h after branded lx_proc.h
+#endif
+
+#define	_LXPROC_NATIVE_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxproc.h: declarations, data structures and macros for lxprocfs
+ */
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+#define	LX_SIGHUP	1
+#define	LX_SIGINT	2
+#define	LX_SIGQUIT	3
+#define	LX_SIGILL	4
+#define	LX_SIGTRAP	5
+#define	LX_SIGABRT	6
+#define	LX_SIGIOT	6	/* same number as LX_SIGABRT */
+#define	LX_SIGBUS	7
+#define	LX_SIGFPE	8
+#define	LX_SIGKILL	9
+#define	LX_SIGUSR1	10
+#define	LX_SIGSEGV	11
+#define	LX_SIGUSR2	12
+#define	LX_SIGPIPE	13
+#define	LX_SIGALRM	14
+#define	LX_SIGTERM	15
+#define	LX_SIGSTKFLT	16
+#define	LX_SIGCHLD	17
+#define	LX_SIGCONT	18
+#define	LX_SIGSTOP	19
+#define	LX_SIGTSTP	20
+#define	LX_SIGTTIN	21
+#define	LX_SIGTTOU	22
+#define	LX_SIGURG	23
+#define	LX_SIGXCPU	24
+#define	LX_SIGXFSZ	25
+#define	LX_SIGVTALRM	26
+#define	LX_SIGPROF	27
+#define	LX_SIGWINCH	28
+#define	LX_SIGIO	29
+#define	LX_SIGPOLL	LX_SIGIO
+#define	LX_SIGPWR	30
+#define	LX_SIGSYS	31
+#define	LX_SIGUNUSED	31	/* same number as LX_SIGSYS */
+
+#define	LX_NSIG		64	/* Linux _NSIG */
+
+#define	LX_SIGRTMIN	32	/* one past LX_SIGSYS */
+#define	LX_SIGRTMAX	LX_NSIG	/* i.e. 64 */
+
+/*
+ * Convert a vnode into the lxpr_mnt_t of its vfs (v_vfsp->vfs_data)
+ */
+#define	VTOLXPM(vp)	((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * Convert a vnode into its lxpr_node_t (kept in v_data)
+ */
+#define	VTOLXP(vp)	((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * Convert an lxpr_node_t into its vnode
+ */
+#define	LXPTOV(lxpnp)	((lxpnp)->lxpr_vnode)
+
+/*
+ * Convert an lxpr_node_t into the zone recorded for its mount (lxprm_zone)
+ */
+#define	LXPTOZ(lxpnp) \
+	(((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
+
+#define	LXPNSIZ		256	/* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes (readdir offsets use this)
+ */
+#define	LXPR_SDSIZE	16
+
+/*
+ * Node/file types for lx /proc files (directories and files contained
+ * therein).  LXPR_NFILES counts the types and so must stay last.
+ */
+typedef enum lxpr_nodetype {
+	LXPR_PROCDIR,		/* /proc		*/
+	LXPR_PIDDIR,		/* /proc/<pid>		*/
+	LXPR_PID_CMDLINE,	/* /proc/<pid>/cmdline	*/
+	LXPR_PID_CPU,		/* /proc/<pid>/cpu	*/
+	LXPR_PID_CURDIR,	/* /proc/<pid>/cwd	*/
+	LXPR_PID_ENV,		/* /proc/<pid>/environ	*/
+	LXPR_PID_EXE,		/* /proc/<pid>/exe	*/
+	LXPR_PID_MAPS,		/* /proc/<pid>/maps	*/
+	LXPR_PID_MEM,		/* /proc/<pid>/mem	*/
+	LXPR_PID_ROOTDIR,	/* /proc/<pid>/root	*/
+	LXPR_PID_STAT,		/* /proc/<pid>/stat	*/
+	LXPR_PID_STATM,		/* /proc/<pid>/statm	*/
+	LXPR_PID_STATUS,	/* /proc/<pid>/status	*/
+	LXPR_PID_FDDIR,		/* /proc/<pid>/fd	*/
+	LXPR_PID_FD_FD,		/* /proc/<pid>/fd/nn	*/
+	LXPR_CMDLINE,		/* /proc/cmdline	*/
+	LXPR_CPUINFO,		/* /proc/cpuinfo	*/
+	LXPR_DEVICES,		/* /proc/devices	*/
+	LXPR_DMA,		/* /proc/dma		*/
+	LXPR_FILESYSTEMS,	/* /proc/filesystems	*/
+	LXPR_INTERRUPTS,	/* /proc/interrupts	*/
+	LXPR_IOPORTS,		/* /proc/ioports	*/
+	LXPR_KCORE,		/* /proc/kcore		*/
+	LXPR_KMSG,		/* /proc/kmsg		*/
+	LXPR_LOADAVG,		/* /proc/loadavg	*/
+	LXPR_MEMINFO,		/* /proc/meminfo	*/
+	LXPR_MOUNTS,		/* /proc/mounts		*/
+	LXPR_NETDIR,		/* /proc/net		*/
+	LXPR_NET_ARP,		/* /proc/net/arp	*/
+	LXPR_NET_DEV,		/* /proc/net/dev	*/
+	LXPR_NET_DEV_MCAST,	/* /proc/net/dev_mcast	*/
+	LXPR_NET_IGMP,		/* /proc/net/igmp	*/
+	LXPR_NET_IP_MR_CACHE,	/* /proc/net/ip_mr_cache */
+	LXPR_NET_IP_MR_VIF,	/* /proc/net/ip_mr_vif	*/
+	LXPR_NET_MCFILTER,	/* /proc/net/mcfilter	*/
+	LXPR_NET_NETSTAT,	/* /proc/net/netstat	*/
+	LXPR_NET_RAW,		/* /proc/net/raw	*/
+	LXPR_NET_ROUTE,		/* /proc/net/route	*/
+	LXPR_NET_RPC,		/* /proc/net/rpc	*/
+	LXPR_NET_RT_CACHE,	/* /proc/net/rt_cache	*/
+	LXPR_NET_SOCKSTAT,	/* /proc/net/sockstat	*/
+	LXPR_NET_SNMP,		/* /proc/net/snmp	*/
+	LXPR_NET_STAT,		/* /proc/net/stat	*/
+	LXPR_NET_TCP,		/* /proc/net/tcp	*/
+	LXPR_NET_UDP,		/* /proc/net/udp	*/
+	LXPR_NET_UNIX,		/* /proc/net/unix	*/
+	LXPR_PARTITIONS,	/* /proc/partitions	*/
+	LXPR_SELF,		/* /proc/self		*/
+	LXPR_STAT,		/* /proc/stat		*/
+	LXPR_UPTIME,		/* /proc/uptime		*/
+	LXPR_VERSION,		/* /proc/version	*/
+	LXPR_NFILES		/* number of lx /proc file types */
+} lxpr_nodetype_t;
+
+/*
+ * Number of fds per process allowed for in the inode number
+ * calculation (if a process has more fds than this, inode numbers
+ * may be duplicated)
+ */
+#define	LXPR_FD_PERPROC 2000
+
+/*
+ * external dirent characteristics: a node type paired with its name
+ */
+#define	LXPRMAXNAMELEN	14	/* size of d_name[], in bytes */
+typedef struct {
+	lxpr_nodetype_t	d_type;	/* node type of the entry */
+	char		d_name[LXPRMAXNAMELEN];	/* name of the entry */
+} lxpr_dirent_t;
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure (see VTOLXP()).
+ */
+typedef struct lxpr_node {
+	lxpr_nodetype_t	lxpr_type;	/* type of this node 		*/
+	vnode_t		*lxpr_vnode;	/* vnode for the node		*/
+	vnode_t		*lxpr_parent;	/* parent directory		*/
+	vnode_t		*lxpr_realvp;	/* real vnode, file in dirs	*/
+	timestruc_t	lxpr_time;	/* creation etc time for file	*/
+	mode_t		lxpr_mode;	/* file mode bits		*/
+	uid_t		lxpr_uid;	/* file owner			*/
+	gid_t		lxpr_gid;	/* file group owner		*/
+	pid_t		lxpr_pid;	/* pid of proc referred to	*/
+	ino_t		lxpr_ino;	/* node id 			*/
+} lxpr_node_t;
+
+struct zone;    /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure (see VTOLXPM()).
+ */
+typedef struct lxpr_mnt {
+	lxpr_node_t	*lxprm_node;	/* node at root of proc mount */
+	struct zone	*lxprm_zone;	/* zone for this mount */
+	ldi_ident_t	lxprm_li;	/* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t	*lxpr_vnodeops;	/* ops vector of lxproc vnodes */
+extern int		nproc_highbit;	/* highbit(v.v_proc)? TODO confirm */
+
+typedef struct mounta	mounta_t;
+
+extern void lxpr_initnodecache();	/* NOTE(review): () is not (void) */
+extern void lxpr_fininodecache();	/* NOTE(review): () is not (void) */
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);	/* type, pid, fd */
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+
+typedef struct lxpr_uiobuf lxpr_uiobuf_t;
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+proc_t *lxpr_lock(pid_t);	/* callers treat NULL as ENOENT */
+void lxpr_unlock(proc_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _LXPROC_H */
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
index 207a708771..2176dcb9de 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
  */
 
 /*
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index b7354c168a..d3b12817ba 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -29,7 +29,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
 		if (nvp)
 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
 
-		if (odvp != ndvp)
-			vnevent_rename_dest_dir(ndvp, ct);
 		ASSERT(ovp != NULL);
 		vnevent_rename_src(ovp, odvp, onm, ct);
+		vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
 	}
 
 	if (nvp) {
@@ -5523,8 +5522,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
 			va.va_size = bfp->l_start;
 			error = nfs3setattr(vp, &va, 0, cr);
 
-			if (error == 0 && bfp->l_start == 0)
-				vnevent_truncate(vp, ct);
+			if (error == 0) {
+				if (bfp->l_start == 0) {
+					vnevent_truncate(vp, ct);
+				} else {
+					vnevent_resize(vp, ct);
+				}
+			}
 		} else
 			error = EINVAL;
 	}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
index bc19d5a116..7b97b090af 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 #include <sys/systm.h>
@@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head,
 	kex = &exi->exi_export;
 	kex->ex_flags = EX_PSEUDO;
 
-	vpathlen = vp->v_path ? strlen(vp->v_path) : 0;
+	vpathlen = strlen(vp->v_path);
 	kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX);
 	kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP);
 
 	if (vpathlen)
-		(void) strcpy(kex->ex_path, vp->v_path);
+		(void) strncpy(kex->ex_path, vp->v_path, vpathlen);
 	(void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX);
 
 	/* Transfer the secinfo data from exdata to this new pseudo node */
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
index 151cb62403..55f6c95289 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
  */
 
 /*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index d6bf384a8b..107fe97b95 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -34,7 +34,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/param.h>
@@ -3737,8 +3737,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 	 */
 	error = nfs4setattr(vp, vap, flags, cr, NULL);
 
-	if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0)
-		vnevent_truncate(vp, ct);
+	if (error == 0 && (vap->va_mask & AT_SIZE)) {
+		if (vap->va_size == 0) {
+			vnevent_truncate(vp, ct);
+		} else {
+			vnevent_resize(vp, ct);
+		}
+	}
 
 	return (error);
 }
@@ -8061,8 +8066,9 @@ link_call:
 	 * vnode if it already existed.
 	 */
 	if (error == 0) {
-		vnode_t *tvp;
+		vnode_t *tvp, *tovp;
 		rnode4_t *trp;
+
 		/*
 		 * Notify the vnode. Each links is represented by
 		 * a different vnode, in nfsv4.
@@ -8075,23 +8081,20 @@ link_call:
 			vnevent_rename_dest(tvp, ndvp, nnm, ct);
 		}
 
-		/*
-		 * if the source and destination directory are not the
-		 * same notify the destination directory.
-		 */
-		if (VTOR4(odvp) != VTOR4(ndvp)) {
-			trp = VTOR4(ndvp);
-			tvp = ndvp;
-			if (IS_SHADOW(ndvp, trp))
-				tvp = RTOV4(trp);
-			vnevent_rename_dest_dir(tvp, ct);
-		}
-
 		trp = VTOR4(ovp);
-		tvp = ovp;
+		tovp = ovp;
 		if (IS_SHADOW(ovp, trp))
+			tovp = RTOV4(trp);
+
+		vnevent_rename_src(tovp, odvp, onm, ct);
+
+		trp = VTOR4(ndvp);
+		tvp = ndvp;
+
+		if (IS_SHADOW(ndvp, trp))
 			tvp = RTOV4(trp);
-		vnevent_rename_src(tvp, odvp, onm, ct);
+
+		vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
 	}
 
 	if (nvp) {
@@ -11000,8 +11003,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
 			va.va_size = bfp->l_start;
 			error = nfs4setattr(vp, &va, 0, cr, NULL);
 
-			if (error == 0 && bfp->l_start == 0)
-				vnevent_truncate(vp, ct);
+			if (error == 0) {
+				if (bfp->l_start == 0) {
+					vnevent_truncate(vp, ct);
+				} else {
+					vnevent_resize(vp, ct);
+				}
+			}
 		} else
 			error = EINVAL;
 	}
diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c
index 268badd6c0..da60a0ccd0 100644
--- a/usr/src/uts/common/fs/nfs/nfs_auth.c
+++ b/usr/src/uts/common/fs/nfs/nfs_auth.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/param.h>
@@ -558,11 +559,16 @@ retry:
 			*access = res.ares.auth_perm;
 			*srv_uid = res.ares.auth_srv_uid;
 			*srv_gid = res.ares.auth_srv_gid;
-			*srv_gids_cnt = res.ares.auth_srv_gids.len;
-			*srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t),
-			    KM_SLEEP);
-			bcopy(res.ares.auth_srv_gids.val, *srv_gids,
-			    *srv_gids_cnt * sizeof (gid_t));
+
+			if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) {
+				*srv_gids = kmem_alloc(*srv_gids_cnt *
+				    sizeof (gid_t), KM_SLEEP);
+				bcopy(res.ares.auth_srv_gids.val, *srv_gids,
+				    *srv_gids_cnt * sizeof (gid_t));
+			} else {
+				*srv_gids = NULL;
+			}
+
 			break;
 
 		case NFSAUTH_DR_EFAIL:
@@ -1051,9 +1057,13 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
 		if (gid != NULL)
 			*gid = p->auth_srv_gid;
 		if (ngids != NULL && gids != NULL) {
-			*ngids = p->auth_srv_ngids;
-			*gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP);
-			bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t));
+			if ((*ngids = p->auth_srv_ngids) != 0) {
+				size_t sz = *ngids * sizeof (gid_t);
+				*gids = kmem_alloc(sz, KM_SLEEP);
+				bcopy(p->auth_srv_gids, *gids, sz);
+			} else {
+				*gids = NULL;
+			}
 		}
 
 		access = p->auth_access;
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index be28ac9071..5d2efc71b2 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -24,6 +24,7 @@
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 /*
@@ -2649,6 +2650,9 @@ nfs_srvinit(void)
 {
 	int error;
 
+	if (getzoneid() != GLOBAL_ZONEID)
+		return (EACCES);
+
 	error = nfs_exportinit();
 	if (error != 0)
 		return (error);
@@ -3287,7 +3291,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi)
 	char *path;
 
 	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
+	if (vp->v_path != vn_vpath_empty) {
 		zone = zone_find_by_any_path(vp->v_path, B_FALSE);
 		mutex_exit(&vp->v_lock);
 	} else {
diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
index 57b21778b4..ffd5380a86 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
  *
  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *	All rights reserved.
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 1a1082bcb8..ee3bac484f 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -26,7 +26,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
 
 	error = nfssetattr(vp, vap, flags, cr);
 
-	if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0)
-		vnevent_truncate(vp, ct);
+	if (error == 0 && (mask & AT_SIZE)) {
+		if (vap->va_size == 0) {
+			vnevent_truncate(vp, ct);
+		} else {
+			vnevent_resize(vp, ct);
+		}
+	}
 
 	return (error);
 }
@@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
 		if (nvp)
 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
 
-		if (odvp != ndvp)
-			vnevent_rename_dest_dir(ndvp, ct);
-
 		ASSERT(ovp != NULL);
 		vnevent_rename_src(ovp, odvp, onm, ct);
+		vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
 	}
 
 	if (nvp) {
@@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
 			va.va_size = bfp->l_start;
 			error = nfssetattr(vp, &va, 0, cr);
 
-			if (error == 0 && bfp->l_start == 0)
-				vnevent_truncate(vp, ct);
+			if (error == 0) {
+				if (bfp->l_start == 0) {
+					vnevent_truncate(vp, ct);
+				} else {
+					vnevent_resize(vp, ct);
+				}
+			}
 		} else
 			error = EINVAL;
 	}
diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c
index 976715e346..275330a0ae 100644
--- a/usr/src/uts/common/fs/pcfs/pc_dir.c
+++ b/usr/src/uts/common/fs/pcfs/pc_dir.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/param.h>
@@ -826,8 +826,7 @@ top:
 
 	if (error == 0) {
 		vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp);
-		if (dp != tdp)
-			vnevent_rename_dest_dir(PCTOV(tdp), ctp);
+		vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp);
 	}
 
 done:
diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c
index a8743b245a..ae72cada7a 100644
--- a/usr/src/uts/common/fs/pcfs/pc_vnops.c
+++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c
@@ -781,8 +781,11 @@ pcfs_setattr(
 		if (error)
 			goto out;
 
-		if (vap->va_size == 0)
+		if (vap->va_size == 0) {
 			vnevent_truncate(vp, ct);
+		} else {
+			vnevent_resize(vp, ct);
+		}
 	}
 	/*
 	 * Change file modified times.
diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c
index 14be8cbbae..11b7386269 100644
--- a/usr/src/uts/common/fs/portfs/port.c
+++ b/usr/src/uts/common/fs/portfs/port.c
@@ -24,7 +24,9 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
+ */
 
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -1381,12 +1383,18 @@ portnowait:
 
 	if (model == DATAMODEL_NATIVE) {
 		eventsz = sizeof (port_event_t);
-		kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
-		if (kevp == NULL) {
-			if (nmax > pp->port_max_list)
-				nmax = pp->port_max_list;
-			kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+		if (nmax == 0) {
+			kevp = NULL;
+		} else {
+			kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+			if (kevp == NULL) {
+				if (nmax > pp->port_max_list)
+					nmax = pp->port_max_list;
+				kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+			}
 		}
+
 		results = kevp;
 		lev = NULL;	/* start with first event in the queue */
 		for (nevents = 0; nevents < nmax; ) {
@@ -1423,12 +1431,18 @@ portnowait:
 		port_event32_t	*kevp32;
 
 		eventsz = sizeof (port_event32_t);
-		kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
-		if (kevp32 == NULL) {
-			if (nmax > pp->port_max_list)
-				nmax = pp->port_max_list;
-			kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+		if (nmax == 0) {
+			kevp32 = NULL;
+		} else {
+			kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+			if (kevp32 == NULL) {
+				if (nmax > pp->port_max_list)
+					nmax = pp->port_max_list;
+				kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+			}
 		}
+
 		results = kevp32;
 		lev = NULL;	/* start with first event in the queue */
 		for (nevents = 0; nevents < nmax; ) {
diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c
new file mode 100644
index 0000000000..b09a9c8afc
--- /dev/null
+++ b/usr/src/uts/common/fs/proc/prargv.c
@@ -0,0 +1,441 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/sysmacros.h>
+#include <vm/as.h>
+
+/*
+ * Safely read 'sz' bytes starting at 'ustart' in the address space of process
+ * 'p' into the kernel buffer 'buf', one page-bounded uread() at a time.  A
+ * partially mapped region yields a partial read that ends at the first hole
+ * (ENXIO from uread()) and still returns 0; any other uread() error is
+ * returned as-is.  If 'rdsz' is not NULL, the byte count read is stored there.
+ */
+int
+prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz)
+{
+	int error = 0;
+	off_t done = 0;
+	size_t left;
+
+	if (rdsz != NULL)
+		*rdsz = 0;
+
+	for (left = sz; left != 0; ) {
+		uintptr_t uaddr = ustart + done;
+		size_t chunk = PAGESIZE - (uaddr & PAGEOFFSET);
+
+		/* Never let a single uread() cross a page boundary. */
+		if (chunk > left)
+			chunk = left;
+		error = uread(p, buf + done, chunk, uaddr);
+		if (error == ENXIO) {
+			/* Unmapped page: a short read, not a failure. */
+			error = 0;
+			break;
+		}
+		if (error != 0)
+			break;
+
+		left -= chunk;
+		done += chunk;
+	}
+
+	if (rdsz != NULL)
+		*rdsz = done;
+
+	return (error);
+}
+
+/*
+ * Attempt to read the argument vector (argv) from this process.  The caller
+ * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via
+ * prlock or lx_prlock).  p_lock is dropped while the target's memory is read
+ * and is held again on return.
+ *
+ * The caller must provide a buffer (buf, bufsz).  We will concatenate each
+ * argument string (including the NUL terminator) into this buffer.  The number
+ * of bytes written to this buffer (including the final NUL terminator) will
+ * be stored in 'slen'.  Returns 0 on success or the prreadbuf() error.
+ */
+int
+prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+	int error;
+	user_t *up;
+	struct as *as;
+	size_t pos = 0;
+	caddr_t *argv = NULL;
+	size_t argvsz = 0;
+	int i;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+	up = PTOU(p);
+	as = p->p_as;
+
+	if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) {
+		/*
+		 * Return the regular psargs string to the caller.
+		 */
+		bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs)));
+		buf[bufsz - 1] = '\0';
+		*slen = strlen(buf) + 1;
+
+		return (0);
+	}
+
+	/*
+	 * Allocate zeroed space to store the argv array: prreadbuf() may stop
+	 * short, and unfilled entries must read as NULL, not heap garbage.
+	 */
+	argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ?
+	    sizeof (caddr32_t) : sizeof (caddr_t));
+	argv = kmem_zalloc(argvsz, KM_SLEEP);
+
+	/*
+	 * Extract the argv array from the target process.  Drop p_lock
+	 * while we do I/O to avoid deadlock with the clock thread.
+	 */
+	mutex_exit(&p->p_lock);
+	if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz,
+	    NULL)) != 0) {
+		kmem_free(argv, argvsz);
+		mutex_enter(&p->p_lock);
+		VERIFY(p->p_proc_flag & P_PR_LOCK);
+		return (error);
+	}
+
+	/*
+	 * Read each argument string from the pointers in the argv array.
+	 */
+	pos = 0;
+	for (i = 0; i < up->u_argc; i++) {
+		size_t rdsz, trysz;
+		uintptr_t arg;
+		off_t j;
+		boolean_t found_nul;
+		boolean_t do_retry = B_TRUE;
+
+#ifdef	_SYSCALL32_IMPL
+		if (p->p_model == DATAMODEL_ILP32) {
+			arg = (uintptr_t)((caddr32_t *)argv)[i];
+		} else {
+			arg = (uintptr_t)argv[i];
+		}
+#else
+		arg = (uintptr_t)argv[i];
+#endif
+
+		/*
+		 * Stop trying to read arguments if we reach a NULL
+		 * pointer in the vector.
+		 */
+		if (arg == NULL)
+			break;
+
+		/*
+		 * Stop reading if we have read the maximum length
+		 * we can return to the user.
+		 */
+		if (pos >= bufsz)
+			break;
+
+		/*
+		 * Initially we try a short read, on the assumption that
+		 * most individual argument strings are less than 80
+		 * characters long.
+		 */
+		if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+			/*
+			 * We don't have room in the target buffer for even
+			 * an entire short read, so there is no need to retry
+			 * with a longer read.
+			 */
+			do_retry = B_FALSE;
+		}
+
+retry:
+		/*
+		 * Read string data for this argument.  Leave room
+		 * in the buffer for a final NUL terminator.
+		 */
+		if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz,
+		    &rdsz)) != 0) {
+			/*
+			 * There was a problem reading this string
+			 * from the process.  Give up.
+			 */
+			break;
+		}
+
+		/*
+		 * Find the NUL terminator.
+		 */
+		found_nul = B_FALSE;
+		for (j = 0; j < rdsz; j++) {
+			if (buf[pos + j] == '\0') {
+				found_nul = B_TRUE;
+				break;
+			}
+		}
+
+		if (!found_nul && do_retry) {
+			/*
+			 * We did not find a NUL terminator, but this
+			 * was a first pass short read.  Try once more
+			 * with feeling.
+			 */
+			trysz = bufsz - pos - 1;
+			do_retry = B_FALSE;
+			goto retry;
+		}
+
+		/*
+		 * Commit the string we read to the buffer.  'j' is the offset
+		 * of the NUL we found or, failing that, the number of bytes
+		 * read.  trysz never exceeds bufsz - pos - 1, so there is
+		 * always room to terminate right after the data, with no gap.
+		 */
+		pos += j;
+		buf[pos++] = '\0';
+	}
+
+	/*
+	 * Ensure the entire string is NUL-terminated.
+	 */
+	buf[bufsz - 1] = '\0';
+
+	mutex_enter(&p->p_lock);
+	VERIFY(p->p_proc_flag & P_PR_LOCK);
+	kmem_free(argv, argvsz);
+
+	/*
+	 * If the operation was a success, return the copied string length
+	 * to the caller.
+	 */
+	*slen = (error == 0) ? pos : 0;
+
+	return (error);
+}
+
+/*
+ * Similar to prreadargv except reads the env vector. This is slightly more
+ * complex because there is no count for the env vector that corresponds to
+ * u_argc.
+ */
+int
+prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+	int error;
+	user_t *up;
+	struct as *as;
+	size_t pos = 0;
+	caddr_t *envp = NULL;
+	uintptr_t tmpp = NULL;
+	size_t envpsz = 0, rdsz = 0;
+	int i;
+	int cnt, bound;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+	up = PTOU(p);
+	as = p->p_as;
+
+	if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) {
+		/*
+		 * Return empty string.
+		 */
+		buf[0] = '\0';
+		*slen = 1;
+
+		return (0);
+	}
+
+	/*
+	 * Drop p_lock while we do I/O to avoid deadlock with the clock thread.
+	 */
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * We first have to count how many env entries we have. This is
+	 * somewhat painful. We extract the env entries from the target process
+	 * one entry at a time. Stop trying to read env entries if we reach a
+	 * NULL pointer in the vector or hit our upper bound (which we take
+	 * as the bufsz/4) to ensure we don't run off.
+	 */
+	rdsz = (p->p_model == DATAMODEL_ILP32 ?
+	    sizeof (caddr32_t) : sizeof (caddr_t));
+	bound = (int)(bufsz / 4);
+	for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) {
+		caddr_t tmp = NULL;
+
+		if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz,
+		    NULL)) != 0) {
+			mutex_enter(&p->p_lock);
+			VERIFY(p->p_proc_flag & P_PR_LOCK);
+			return (error);
+		}
+
+		if (tmp == NULL)
+			break;
+	}
+	if (cnt == 0) {
+		/* Return empty string. */
+		buf[0] = '\0';
+		*slen = 1;
+		mutex_enter(&p->p_lock);
+		VERIFY(p->p_proc_flag & P_PR_LOCK);
+		return (0);
+	}
+
+	/*
+	 * Allocate zeroed space to store the env array.  prreadbuf() can
+	 * return success after a short read, and zeroing ensures any entries
+	 * it did not fill are NULL (ending the walk below), not heap garbage.
+	 */
+	envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ?
+	    sizeof (caddr32_t) : sizeof (caddr_t));
+	envp = kmem_zalloc(envpsz, KM_SLEEP);
+
+	/*
+	 * Extract the env array from the target process.
+	 */
+	if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz,
+	    NULL)) != 0) {
+		kmem_free(envp, envpsz);
+		mutex_enter(&p->p_lock);
+		VERIFY(p->p_proc_flag & P_PR_LOCK);
+		return (error);
+	}
+
+	/*
+	 * Read each env string from the pointers in the env array.
+	 */
+	pos = 0;
+	for (i = 0; i < cnt; i++) {
+		size_t rdsz, trysz;
+		uintptr_t ev;
+		off_t j;
+		boolean_t found_nul;
+		boolean_t do_retry = B_TRUE;
+
+#ifdef	_SYSCALL32_IMPL
+		if (p->p_model == DATAMODEL_ILP32) {
+			ev = (uintptr_t)((caddr32_t *)envp)[i];
+		} else {
+			ev = (uintptr_t)envp[i];
+		}
+#else
+		ev = (uintptr_t)envp[i];
+#endif
+
+		/*
+		 * Stop trying to read env entries if we reach a NULL
+		 * pointer in the vector.
+		 */
+		if (ev == NULL)
+			break;
+
+		/*
+		 * Stop reading if we have read the maximum length
+		 * we can return to the user.
+		 */
+		if (pos >= bufsz)
+			break;
+
+		/*
+		 * Initially we try a short read, on the assumption that
+		 * most individual env strings are less than 80
+		 * characters long.
+		 */
+		if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+			/*
+			 * We don't have room in the target buffer for even
+			 * an entire short read, so there is no need to retry
+			 * with a longer read.
+			 */
+			do_retry = B_FALSE;
+		}
+
+retry:
+		/*
+		 * Read string data for this env var.  Leave room
+		 * in the buffer for a final NUL terminator.
+		 */
+		if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz,
+		    &rdsz)) != 0) {
+			/*
+			 * There was a problem reading this string
+			 * from the process.  Give up.
+			 */
+			break;
+		}
+
+		/*
+		 * Find the NUL terminator.
+		 */
+		found_nul = B_FALSE;
+		for (j = 0; j < rdsz; j++) {
+			if (buf[pos + j] == '\0') {
+				found_nul = B_TRUE;
+				break;
+			}
+		}
+
+		if (!found_nul && do_retry) {
+			/*
+			 * We did not find a NUL terminator, but this
+			 * was a first pass short read.  Try once more
+			 * with feeling.
+			 */
+			trysz = bufsz - pos - 1;
+			do_retry = B_FALSE;
+			goto retry;
+		}
+
+		/*
+		 * Commit the string we read to the buffer.  'j' is the offset
+		 * of the NUL we found or, failing that, the number of bytes
+		 * read.  trysz never exceeds bufsz - pos - 1, so there is
+		 * always room to terminate right after the data, with no gap.
+		 */
+		pos += j;
+		buf[pos++] = '\0';
+	}
+
+	/*
+	 * Ensure the entire string is NUL-terminated.
+	 */
+	buf[bufsz - 1] = '\0';
+
+	mutex_enter(&p->p_lock);
+	VERIFY(p->p_proc_flag & P_PR_LOCK);
+	kmem_free(envp, envpsz);
+
+	/*
+	 * If the operation was a success, return the copied string length
+	 * to the caller.
+	 */
+	*slen = (error == 0) ? pos : 0;
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c
index 6b151a6369..07dcb1e7db 100644
--- a/usr/src/uts/common/fs/proc/prcontrol.c
+++ b/usr/src/uts/common/fs/proc/prcontrol.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip)
 		} else if (t->t_state == TS_STOPPED && sig == SIGKILL) {
 			/* If SIGKILL, set stopped lwp running */
 			p->p_stopsig = 0;
-			t->t_schedflag |= TS_XSTART | TS_PSTART;
+			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
 			t->t_dtrace_stop = 0;
 			setrun_locked(t);
 		}
@@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr)
 		return (EPERM);
 	if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id)
 		return (EINVAL);
-	if ((zptr = zone_find_by_id(zoneid)) == NULL)
-		return (EINVAL);
+	/*
+	 * We cannot hold p_lock when we call zone_find_by_id since that can
+	 * lead to a deadlock. zone_find_by_id() takes zonehash_lock.
+	 * zone_enter() can hold the zonehash_lock and needs p_lock when it
+	 * calls task_join.
+	 */
 	mutex_exit(&p->p_lock);
+	if ((zptr = zone_find_by_id(zoneid)) == NULL) {
+		mutex_enter(&p->p_lock);
+		return (EINVAL);
+	}
 	mutex_enter(&p->p_crlock);
 	oldcred = p->p_cred;
 	crhold(oldcred);
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h
index 8ea516bf82..72f26b3c05 100644
--- a/usr/src/uts/common/fs/proc/prdata.h
+++ b/usr/src/uts/common/fs/proc/prdata.h
@@ -27,7 +27,7 @@
 /*	  All Rights Reserved  	*/
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef _SYS_PROC_PRDATA_H
@@ -123,6 +123,7 @@ typedef enum prnodetype {
 #if defined(__i386) || defined(__amd64)
 	PR_LDT,			/* /proc/<pid>/ldt			*/
 #endif
+	PR_ARGV,		/* /proc/<pid>/argv			*/
 	PR_USAGE,		/* /proc/<pid>/usage			*/
 	PR_LUSAGE,		/* /proc/<pid>/lusage			*/
 	PR_PAGEDATA,		/* /proc/<pid>/pagedata			*/
@@ -347,6 +348,8 @@ extern	int	pr_unset(proc_t *, long);
 extern	void	pr_sethold(prnode_t *, sigset_t *);
 extern	void	pr_setfault(proc_t *, fltset_t *);
 extern	int	prusrio(proc_t *, enum uio_rw, struct uio *, int);
+extern	int	prreadargv(proc_t *, char *, size_t, size_t *);
+extern	int	prreadenvv(proc_t *, char *, size_t, size_t *);
 extern	int	prwritectl(vnode_t *, struct uio *, cred_t *);
 extern	int	prlock(prnode_t *, int);
 extern	void	prunmark(proc_t *);
diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c
index 28950bf972..21c25a01e3 100644
--- a/usr/src/uts/common/fs/proc/prsubr.c
+++ b/usr/src/uts/common/fs/proc/prsubr.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -201,6 +201,7 @@ prchoose(proc_t *p)
 			case PR_SYSEXIT:
 			case PR_SIGNALLED:
 			case PR_FAULTED:
+			case PR_BRAND:
 				/*
 				 * Make an lwp calling exit() be the
 				 * last lwp seen in the process.
@@ -534,6 +535,12 @@ prexecend(void)
 			pcp->prc_tslot = tslot;
 		}
 	}
+
+	/*
+	 * There may be threads waiting for the flag change blocked behind the
+	 * pr_pid_cv as well.
+	 */
+	cv_signal(&pr_pid_cv[p->p_slot]);
 }
 
 /*
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index 39f2abbc32..245133abf4 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984,	 1986, 1987, 1988, 1989 AT&T	*/
@@ -96,6 +96,11 @@ struct prdirect {
 #define	PRSDSIZE	(sizeof (struct prdirect))
 
 /*
+ * Maximum length of the /proc/$$/argv file:
+ */
+int prmaxargvlen = 4096;
+
+/*
  * Directory characteristics.
  */
 typedef struct prdirent {
@@ -166,6 +171,8 @@ static prdirent_t piddir[] = {
 	{ PR_LDT,	27 * sizeof (prdirent_t), sizeof (prdirent_t),
 		"ldt" },
 #endif
+	{ PR_ARGV,	28 * sizeof (prdirent_t), sizeof (prdirent_t),
+		"argv" },
 };
 
 #define	NPIDDIRFILES	(sizeof (piddir) / sizeof (piddir[0]) - 2)
@@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(),
 #if defined(__x86)
 	pr_read_ldt(),
 #endif
+	pr_read_argv(),
 	pr_read_usage(), pr_read_lusage(), pr_read_pagedata(),
 	pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(),
 	pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(),
@@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = {
 #if defined(__x86)
 	pr_read_ldt,		/* /proc/<pid>/ldt			*/
 #endif
+	pr_read_argv,		/* /proc/<pid>/argv			*/
 	pr_read_usage,		/* /proc/<pid>/usage			*/
 	pr_read_lusage,		/* /proc/<pid>/lusage			*/
 	pr_read_pagedata,	/* /proc/<pid>/pagedata			*/
@@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop)
 }
 
 static int
+pr_read_argv(prnode_t *pnp, uio_t *uiop)
+{
+	size_t bufsz = prmaxargvlen;
+	size_t len;
+	char *scratch;
+	int err;
+
+	ASSERT(pnp->pr_type == PR_ARGV);
+
+	/*
+	 * Gather the argument strings into a scratch buffer while the
+	 * process is locked; the copy-out to the caller happens only after
+	 * the process has been unlocked again.
+	 */
+	scratch = kmem_alloc(bufsz, KM_SLEEP);
+
+	err = prlock(pnp, ZNO);
+	if (err != 0) {
+		kmem_free(scratch, bufsz);
+		return (err);
+	}
+
+	err = prreadargv(pnp->pr_common->prc_proc, scratch, bufsz, &len);
+	prunlock(pnp);
+
+	if (err == 0) {
+		/* 'len' counts the bytes prreadargv() stored in scratch. */
+		err = pr_uioread(scratch, len, uiop);
+	}
+
+	kmem_free(scratch, bufsz);
+	return (err);
+}
+
+static int
 pr_read_as(prnode_t *pnp, uio_t *uiop)
 {
 	int error;
@@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = {
 #if defined(__x86)
 	pr_read_ldt,		/* /proc/<pid>/ldt			*/
 #endif
+	pr_read_argv,		/* /proc/<pid>/argv			*/
 	pr_read_usage_32,	/* /proc/<pid>/usage			*/
 	pr_read_lusage_32,	/* /proc/<pid>/lusage			*/
 	pr_read_pagedata_32,	/* /proc/<pid>/pagedata			*/
@@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
 #endif
 }
 
+/*
+ * Writing pr_fname copies PRFNSZ bytes straight into u_comm, so insist at
+ * compile time that PRFNSZ has the same definition as MAXCOMLEN.
+ */
+#if PRFNSZ != MAXCOMLEN
+#error PRFNSZ/MAXCOMLEN mismatch
+#endif
+
+static int
+pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop)
+{
+	char newname[PRFNSZ];
+	int fldoff, err;
+
+	fldoff = offsetof(psinfo_t, pr_fname);
+#ifdef _SYSCALL32_IMPL
+	if (curproc->p_model != DATAMODEL_LP64)
+		fldoff = offsetof(psinfo32_t, pr_fname);
+#endif
+
+	/* Ignore anything but an exact, whole-field write of pr_fname. */
+	if (uiop->uio_resid != PRFNSZ || uiop->uio_offset != fldoff)
+		return (0);
+
+	err = uiomove(newname, PRFNSZ, UIO_WRITE, uiop);
+	if (err != 0)
+		return (err);
+
+	/* Whatever the writer supplied, the stored name is terminated. */
+	newname[PRFNSZ - 1] = '\0';
+
+	err = prlock(pnp, ZNO);
+	if (err != 0)
+		return (err);
+
+	bcopy(newname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ);
+	prunlock(pnp);
+
+	return (0);
+}
+
+/*
+ * Writing pr_psargs copies PRARGSZ bytes straight into u_psargs, so insist
+ * at compile time that PRARGSZ has the same definition as PSARGSZ.
+ */
+#if PRARGSZ != PSARGSZ
+#error PRARGSZ/PSARGSZ mismatch
+#endif
+
+static int
+pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop)
+{
+	char newargs[PRARGSZ];
+	int fldoff, err;
+
+	fldoff = offsetof(psinfo_t, pr_psargs);
+#ifdef _SYSCALL32_IMPL
+	if (curproc->p_model != DATAMODEL_LP64)
+		fldoff = offsetof(psinfo32_t, pr_psargs);
+#endif
+
+	/* Ignore anything but an exact, whole-field write of pr_psargs. */
+	if (uiop->uio_resid != PRARGSZ || uiop->uio_offset != fldoff)
+		return (0);
+
+	err = uiomove(newargs, PRARGSZ, UIO_WRITE, uiop);
+	if (err != 0)
+		return (err);
+
+	/* Whatever the writer supplied, the stored string is terminated. */
+	newargs[PRARGSZ - 1] = '\0';
+
+	err = prlock(pnp, ZNO);
+	if (err != 0)
+		return (err);
+
+	bcopy(newargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ);
+	prunlock(pnp);
+
+	return (0);
+}
+
+int
+pr_write_psinfo(prnode_t *pnp, uio_t *uiop)
+{
+	int err;
+
+	/*
+	 * Try each writable psinfo field in turn; a handler returns 0
+	 * without consuming the uio when the write is not for its field.
+	 */
+	if ((err = pr_write_psinfo_fname(pnp, uiop)) == 0)
+		err = pr_write_psinfo_psargs(pnp, uiop);
+	return (err);
+}
+
+
 /* ARGSUSED */
 static int
 prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
 			uiop->uio_resid = resid;
 		return (error);
 
+	case PR_PSINFO:
+		return (pr_write_psinfo(pnp, uiop));
+
 	default:
 		return ((vp->v_type == VDIR)? EISDIR : EBADF);
 	}
@@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	case PR_AUXV:
 		vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t);
 		break;
+	case PR_ARGV:
+		if ((p->p_flag & SSYS) || p->p_as == &kas) {
+			vap->va_size = PSARGSZ;
+		} else {
+			vap->va_size = prmaxargvlen;
+		}
+		break;
 #if defined(__x86)
 	case PR_LDT:
 		mutex_exit(&p->p_lock);
@@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
 	case PR_USAGE:
 	case PR_LUSAGE:
 	case PR_LWPUSAGE:
+	case PR_ARGV:
 		p = pr_p_lock(pnp);
 		mutex_exit(&pr_pidlock);
 		if (p == NULL)
@@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = {
 #if defined(__x86)
 	pr_lookup_notdir,	/* /proc/<pid>/ldt			*/
 #endif
+	pr_lookup_notdir,	/* /proc/<pid>/argv			*/
 	pr_lookup_notdir,	/* /proc/<pid>/usage			*/
 	pr_lookup_notdir,	/* /proc/<pid>/lusage			*/
 	pr_lookup_notdir,	/* /proc/<pid>/pagedata			*/
@@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type)
 		break;
 
 	case PR_PSINFO:
+		pnp->pr_mode = 0644;	/* readable by all + owner can write */
+		break;
+
 	case PR_LPSINFO:
 	case PR_LWPSINFO:
 	case PR_USAGE:
 	case PR_LUSAGE:
 	case PR_LWPUSAGE:
+	case PR_ARGV:
 		pnp->pr_mode = 0444;	/* read-only by all */
 		break;
 
@@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = {
 #if defined(__x86)
 	pr_readdir_notdir,	/* /proc/<pid>/ldt			*/
 #endif
+	pr_readdir_notdir,	/* /proc/<pid>/argv			*/
 	pr_readdir_notdir,	/* /proc/<pid>/usage			*/
 	pr_readdir_notdir,	/* /proc/<pid>/lusage			*/
 	pr_readdir_notdir,	/* /proc/<pid>/pagedata			*/
@@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp)
 			case PR_PROCDIR:
 			case PR_PSINFO:
 			case PR_USAGE:
+			case PR_ARGV:
 				break;
 			default:
 				continue;
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
index 703e26ea61..682f1d867b 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags)
 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
 
+	so->so_krecv_cb = NULL;
+	so->so_krecv_arg = NULL;
+
 	return (0);
 }
 
@@ -654,6 +658,10 @@ sonode_fini(struct sonode *so)
 	if (so->so_filter_top != NULL)
 		sof_sonode_cleanup(so);
 
+	/* Clean up any remnants of krecv callbacks */
+	so->so_krecv_cb = NULL;
+	so->so_krecv_arg = NULL;
+
 	ASSERT(list_is_empty(&so->so_acceptq_list));
 	ASSERT(list_is_empty(&so->so_acceptq_defer));
 	ASSERT(!list_link_active(&so->so_acceptq_node));
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index e5bc6dc845..9b8186a8a0 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/types.h>
@@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
 {
 	int error;
 
-	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
+	SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr));
 
 	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
 
@@ -305,7 +305,7 @@ so_connect(struct sonode *so, struct sockaddr *name,
 	 * This can happen if a non blocking operation caused an error.
 	 */
 
-	if (so->so_error != 0) {
+	if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
 		mutex_enter(&so->so_lock);
 		error = sogeterr(so, B_TRUE);
 		mutex_exit(&so->so_lock);
@@ -404,7 +404,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
 			break;
 		}
 
-		if (so->so_error != 0) {
+		if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
 			mutex_enter(&so->so_lock);
 			error = sogeterr(so, B_TRUE);
 			mutex_exit(&so->so_lock);
@@ -513,7 +513,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag,
 			error = EPIPE;
 			break;
 		}
-		if (so->so_error != 0) {
+		if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
 			mutex_enter(&so->so_lock);
 			error = sogeterr(so, B_TRUE);
 			mutex_exit(&so->so_lock);
@@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
 
 	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
 
-	if ((so->so_mode & SM_SENDFILESUPP) == 0) {
-		SO_UNBLOCK_FALLBACK(so);
-		return (EOPNOTSUPP);
-	}
-
 	error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top,
 	    B_FALSE);
 
@@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr,
 {
 	int error;
 
-	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
+	SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
 
 	if (so->so_filter_active == 0 ||
 	    (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0)
@@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name,
 	if (level == SOL_FILTER)
 		return (sof_getsockopt(so, option_name, optval, optlenp, cr));
 
-	SO_BLOCK_FALLBACK(so,
+	SO_BLOCK_FALLBACK_SAFE(so,
 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
 
 	if ((so->so_filter_active == 0 ||
@@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name,
 	if (level == SOL_FILTER)
 		return (sof_setsockopt(so, option_name, optval, optlen, cr));
 
-	SO_BLOCK_FALLBACK(so,
+	SO_BLOCK_FALLBACK_SAFE(so,
 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
 
 	/* X/Open requires this check */
@@ -876,7 +871,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
 	 * If there is a pending error, return error
 	 * This can happen if a non blocking operation caused an error.
 	 */
-	if (so->so_error != 0) {
+	if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) {
 		mutex_enter(&so->so_lock);
 		error = sogeterr(so, B_TRUE);
 		mutex_exit(&so->so_lock);
@@ -1329,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp,
 		}
 	}
 
+	mutex_enter(&so->so_lock);
+	if (so->so_krecv_cb != NULL) {
+		boolean_t cont;
+		so_krecv_f func = so->so_krecv_cb;
+		void *arg = so->so_krecv_arg;
+
+		mutex_exit(&so->so_lock);
+		cont = func(so, mp, msg_size, flags & MSG_OOB, arg);
+		mutex_enter(&so->so_lock);
+		if (cont == B_TRUE) {
+			space_left = so->so_rcvbuf;
+		} else {
+			so->so_rcv_queued = so->so_rcvlowat;
+			*errorp = ENOSPC;
+			space_left = -1;
+		}
+		goto done_unlock;
+	}
+	mutex_exit(&so->so_lock);
+
 	if (flags & MSG_OOB) {
 		so_queue_oob(so, mp, msg_size);
 		mutex_enter(&so->so_lock);
@@ -1607,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
 		return (ENOTCONN);
 	}
 
+	mutex_enter(&so->so_lock);
+	if (so->so_krecv_cb != NULL) {
+		mutex_exit(&so->so_lock);
+		return (EOPNOTSUPP);
+	}
+	mutex_exit(&so->so_lock);
+
 	if (msg->msg_flags & MSG_PEEK)
 		msg->msg_flags &= ~MSG_WAITALL;
 
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index 957c8f93b4..7bdd64393b 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -670,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
 	int more = 0;
 	int error;
 	ssize_t oobmark;
+	ssize_t copied = 0;
 	sodirect_t *sodp = so->so_direct;
+	xuio_t *xuio = NULL;
 
 	partial_read = B_FALSE;
 	*mctlp = NULL;
+	if ((uiop->uio_extflg & UIO_XUIO) != 0) {
+		xuio = (xuio_t *)uiop;
+	}
 again:
 	mutex_enter(&so->so_lock);
 again1:
@@ -784,8 +790,6 @@ again1:
 		 * enabled socket, uio_resid can be 0.
 		 */
 		if (uiop->uio_resid >= 0) {
-			ssize_t copied = 0;
-
 			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
 				mutex_enter(&so->so_lock);
 				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
@@ -843,6 +847,18 @@ again1:
 		}
 		if (mp != NULL) { /* more data blocks in msg */
 			more |= MOREDATA;
+
+			/*
+			 * If requested, tally up remaining data along with the
+			 * amount already copied.
+			 */
+			if (xuio != NULL &&
+			    xuio->xu_type == UIOTYPE_PEEKSIZE) {
+				xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE;
+				xuio->xu_ext.xu_ps.xu_ps_size =
+				    copied + msgdsize(mp);
+			}
+
 			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
 				if (flags & MSG_PEEK) {
 					freemsg(mp);
@@ -2276,9 +2292,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
 
 	/*
-	 * Cannot fallback if the socket has active filters
+	 * Cannot fallback if the socket has active filters or a krecv callback.
 	 */
-	if (so->so_filter_active > 0)
+	if (so->so_filter_active > 0 || so->so_krecv_cb != NULL)
 		return (EINVAL);
 
 	switch (so->so_family) {
@@ -2456,3 +2472,50 @@ out:
 
 	return (error);
 }
+
+int
+so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg)
+{
+	int err = 0;
+
+	/* An argument is meaningless without a callback to hand it to. */
+	if (arg != NULL && cb == NULL)
+		return (EINVAL);
+
+	SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg));
+	mutex_enter(&so->so_lock);
+
+	if ((so->so_state & SS_FALLBACK_COMP) != 0) {
+		err = ENOTSUP;
+	} else {
+		int rv = so_lock_read(so, 0);
+
+		VERIFY(rv == 0);
+
+		/*
+		 * Extant received data is flushed here rather than delivered
+		 * to the new callback.  Consumers that care about such data
+		 * should, when they come along, work out the best API for it.
+		 */
+		so_rcv_flush(so);
+		so->so_krecv_cb = cb;
+		so->so_krecv_arg = arg;
+		so_unlock_read(so);
+	}
+
+	mutex_exit(&so->so_lock);
+	SO_UNBLOCK_FALLBACK(so);
+
+	return (err);
+}
+
+void
+so_krecv_unblock(sonode_t *so)
+{
+	mutex_enter(&so->so_lock);
+	VERIFY(so->so_krecv_cb != NULL);
+	so->so_rcv_queued = 0;
+
+	/* so_check_flow_control() always drops so_lock before returning. */
+	(void) so_check_flow_control(so);
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c
index 971523945e..7dca6ae6fc 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter.c
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/systm.h>
@@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name,
 
 		/* Module loaded OK, so there must be an ops vector */
 		ASSERT(ent->sofe_mod != NULL);
+
+		/*
+		 * Check again to confirm ATTACH is ok. See if the module
+		 * is not SOF_ATT_SAFE after an unsafe operation has taken
+		 * place.
+		 */
+		if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 &&
+		    so->so_state & SS_FILOP_UNSF) {
+			sof_instance_destroy(inst);
+			return (EINVAL);
+		}
+
 		inst->sofi_ops = &ent->sofe_mod->sofm_ops;
 
 		SOF_STAT_ADD(inst, tot_active_attach, 1);
@@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
  * sof_register(version, name, ops, flags)
  *
  * Register a socket filter identified by name `name' and which should use
- * the ops vector `ops' for event notification. `flags' should be set to 0.
+ * the ops vector `ops' for event notification. `flags' should be set to 0
+ * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An
+ * unsafe filter is one that cannot be attached after any socket operation has
+ * occurred. This is the legacy default. A "safe" filter can be attached even
+ * after some basic initial socket operations have taken place. This set is
+ * currently bind, getsockname, getsockopt and setsockopt. The order in which
+ * a "safe" filter can be attached is more relaxed, and thus more flexible.
  * On success 0 is returned, otherwise an errno is returned.
  */
 int
@@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags)
 {
 	sof_module_t *mod;
 
-	_NOTE(ARGUNUSED(flags));
-
 	if (version != SOF_VERSION)
 		return (EINVAL);
 
 	mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP);
 	mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 	(void) strcpy(mod->sofm_name, name);
+	mod->sofm_flags = flags;
 	mod->sofm_ops = *ops;
 
 	mutex_enter(&sof_module_lock);
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
index 7f7aece1f1..cf2ad8b20d 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SOCKFS_SOCKFILTER_H
@@ -51,6 +52,7 @@ typedef struct sof_kstat	sof_kstat_t;
 
 struct sof_module {
 	char		*sofm_name;
+	int		sofm_flags;
 	sof_ops_t	sofm_ops;
 	uint_t		sofm_refcnt;
 	list_node_t	sofm_node;
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 30027200b6..eea86672b8 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -436,10 +437,12 @@ sogetoff(mblk_t *mp, t_uscalar_t offset,
  *
  * The underlying filesystem VSOCK vnode has a v_stream pointer that
  * references the actual stream head (hence indirectly the actual sonode).
+ *
+ * This function is non-static so it can be used by brand emulation.
  */
-static int
+int
 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
-		vnode_t **vpp)
+    vnode_t **vpp)
 {
 	vnode_t		*vp;	/* Underlying filesystem vnode */
 	vnode_t		*rvp;	/* real vnode */
@@ -1879,7 +1882,7 @@ ssize_t
 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
 {
 	struct uio auio;
-	struct iovec aiov[MSG_MAXIOVLEN];
+	struct iovec aiov[1];
 	register vnode_t *vp;
 	int ioflag, rwflag;
 	ssize_t cnt;
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index 21f3744895..854dd040b5 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -21,6 +21,8 @@
 
 /*
  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.  All rights reserved.
  */
 
 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
@@ -54,6 +56,7 @@
 #include <sys/cmn_err.h>
 #include <sys/vmsystm.h>
 #include <sys/policy.h>
+#include <sys/limits.h>
 
 #include <sys/socket.h>
 #include <sys/socketvar.h>
@@ -86,12 +89,6 @@ extern void	nl7c_init(void);
 extern int	sockfs_defer_nl7c_init;
 
 /*
- * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
- *	 as there isn't a formal definition of IOV_MAX ???
- */
-#define	MSG_MAXIOVLEN	16
-
-/*
  * Kernel component of socket creation.
  *
  * The socket library determines which version number to use.
@@ -1026,9 +1023,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
 	STRUCT_HANDLE(nmsghdr, umsgptr);
 	struct nmsghdr lmsg;
 	struct uio auio;
-	struct iovec aiov[MSG_MAXIOVLEN];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	ssize_t iovsize = 0;
 	int iovcnt;
-	ssize_t len;
+	ssize_t len, rval;
 	int i;
 	int *flagsp;
 	model_t	model;
@@ -1071,22 +1069,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
 
 	iovcnt = lmsg.msg_iovlen;
 
-	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+	if (iovcnt <= 0 || iovcnt > IOV_MAX) {
 		return (set_errno(EMSGSIZE));
 	}
 
+	if (iovcnt > IOV_MAX_STACK) {
+		iovsize = iovcnt * sizeof (struct iovec);
+		aiov = kmem_alloc(iovsize, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded, while ensuring
 	 * that they can't move more than 2Gbytes of data in a single call.
 	 */
 	if (model == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[MSG_MAXIOVLEN];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		ssize_t iov32size;
 		ssize32_t count32;
 
-		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
-		    iovcnt * sizeof (struct iovec32)))
+		iov32size = iovcnt * sizeof (struct iovec32);
+		if (iovsize != 0)
+			aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
+		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+			if (iovsize != 0) {
+				kmem_free(aiov32, iov32size);
+				kmem_free(aiov, iovsize);
+			}
+
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
@@ -1094,15 +1107,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
 
 			iovlen32 = aiov32[i].iov_len;
 			count32 += iovlen32;
-			if (iovlen32 < 0 || count32 < 0)
+			if (iovlen32 < 0 || count32 < 0) {
+				if (iovsize != 0) {
+					kmem_free(aiov32, iov32size);
+					kmem_free(aiov, iovsize);
+				}
+
 				return (set_errno(EINVAL));
+			}
+
 			aiov[i].iov_len = iovlen32;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+
+		if (iovsize != 0)
+			kmem_free(aiov32, iov32size);
 	} else
 #endif /* _SYSCALL32_IMPL */
 	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
+		if (iovsize != 0)
+			kmem_free(aiov, iovsize);
+
 		return (set_errno(EFAULT));
 	}
 	len = 0;
@@ -1110,6 +1136,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
 		ssize_t iovlen = aiov[i].iov_len;
 		len += iovlen;
 		if (iovlen < 0 || len < 0) {
+			if (iovsize != 0)
+				kmem_free(aiov, iovsize);
+
 			return (set_errno(EINVAL));
 		}
 	}
@@ -1124,12 +1153,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
 	    (do_useracc == 0 ||
 	    useracc(lmsg.msg_control, lmsg.msg_controllen,
 	    B_WRITE) != 0)) {
+		if (iovsize != 0)
+			kmem_free(aiov, iovsize);
+
 		return (set_errno(EFAULT));
 	}
 
-	return (recvit(sock, &lmsg, &auio, flags,
+	rval = recvit(sock, &lmsg, &auio, flags,
 	    STRUCT_FADDR(umsgptr, msg_namelen),
-	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
+	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
+
+	if (iovsize != 0)
+		kmem_free(aiov, iovsize);
+
+	return (rval);
 }
 
 /*
@@ -1267,9 +1304,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 	struct nmsghdr lmsg;
 	STRUCT_DECL(nmsghdr, u_lmsg);
 	struct uio auio;
-	struct iovec aiov[MSG_MAXIOVLEN];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	ssize_t iovsize = 0;
 	int iovcnt;
-	ssize_t len;
+	ssize_t len, rval;
 	int i;
 	model_t	model;
 
@@ -1312,7 +1350,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 
 	iovcnt = lmsg.msg_iovlen;
 
-	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+	if (iovcnt <= 0 || iovcnt > IOV_MAX) {
 		/*
 		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
 		 * be compatible with SunOS 4.X and 4.4BSD.
@@ -1321,19 +1359,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 			return (set_errno(EMSGSIZE));
 	}
 
+	if (iovcnt > IOV_MAX_STACK) {
+		iovsize = iovcnt * sizeof (struct iovec);
+		aiov = kmem_alloc(iovsize, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded, while ensuring
 	 * that they can't move more than 2Gbytes of data in a single call.
 	 */
 	if (model == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[MSG_MAXIOVLEN];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		ssize_t iov32size;
 		ssize32_t count32;
 
+		iov32size = iovcnt * sizeof (struct iovec32);
+		if (iovsize != 0)
+			aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
 		if (iovcnt != 0 &&
-		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
-		    iovcnt * sizeof (struct iovec32)))
+		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+			if (iovsize != 0) {
+				kmem_free(aiov32, iov32size);
+				kmem_free(aiov, iovsize);
+			}
+
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
@@ -1341,17 +1394,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 
 			iovlen32 = aiov32[i].iov_len;
 			count32 += iovlen32;
-			if (iovlen32 < 0 || count32 < 0)
+			if (iovlen32 < 0 || count32 < 0) {
+				if (iovsize != 0) {
+					kmem_free(aiov32, iov32size);
+					kmem_free(aiov, iovsize);
+				}
+
 				return (set_errno(EINVAL));
+			}
+
 			aiov[i].iov_len = iovlen32;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+
+		if (iovsize != 0)
+			kmem_free(aiov32, iov32size);
 	} else
 #endif /* _SYSCALL32_IMPL */
 	if (iovcnt != 0 &&
 	    copyin(lmsg.msg_iov, aiov,
 	    (unsigned)iovcnt * sizeof (struct iovec))) {
+		if (iovsize != 0)
+			kmem_free(aiov, iovsize);
+
 		return (set_errno(EFAULT));
 	}
 	len = 0;
@@ -1359,6 +1425,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 		ssize_t iovlen = aiov[i].iov_len;
 		len += iovlen;
 		if (iovlen < 0 || len < 0) {
+			if (iovsize != 0)
+				kmem_free(aiov, iovsize);
+
 			return (set_errno(EINVAL));
 		}
 	}
@@ -1369,7 +1438,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_limit = 0;
 
-	return (sendit(sock, &lmsg, &auio, flags));
+	rval = sendit(sock, &lmsg, &auio, flags);
+
+	if (iovsize != 0)
+		kmem_free(aiov, iovsize);
+
+	return (rval);
 }
 
 ssize_t
diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
index 6a515be122..24acb81a0a 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi_impl.h
+++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef _SOCKFS_SOCKTPI_IMPL_H
@@ -56,6 +57,8 @@ extern int	sogetrderr(vnode_t *, int, int *);
 extern int	sogetwrerr(vnode_t *, int, int *);
 extern int	so_addr_verify(struct sonode *, const struct sockaddr *,
 			socklen_t);
+extern int	so_ux_lookup(struct sonode *, struct sockaddr_un *, int,
+		vnode_t **);
 extern int	so_ux_addr_xlate(struct sonode *, struct sockaddr *,
 			socklen_t, int, void **, socklen_t *);
 extern void	so_unix_close(struct sonode *);
diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c
index 74c4302da9..a4d983665b 100644
--- a/usr/src/uts/common/fs/swapfs/swap_subr.c
+++ b/usr/src/uts/common/fs/swapfs/swap_subr.c
@@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs)
 		 * memory that can be used as swap space should do so by
 		 * setting swapfs_desfree at boot time, not swapfs_minfree.
 		 * However, swapfs_minfree is tunable by install as a
-		 * workaround for bugid 1147463.
+		 * workaround for bugid 1147463. Note swapfs_minfree is set
+		 * to 1/8th of memory, but clamped at the limit of 256 MB.
 		 */
-		new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
+		new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3),
+		    btopr(256 * 1024 * 1024));
 	}
 
 	/*
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
index f6621c8097..387cc6ae54 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
@@ -516,7 +516,7 @@ tdirdelete(
 	 */
 	namelen = strlen(tpdp->td_name) + 1;
 
-	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
+	kmem_free(tpdp, sizeof (struct tdirent) + namelen);
 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
 	dir->tn_dirents--;
 
@@ -549,8 +549,8 @@ tdirinit(
 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
 	ASSERT(dir->tn_type == VDIR);
 
-	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
-	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
+	dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP);
+	dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP);
 
 	/*
 	 * Initialize the entries
@@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir)
 
 		tmpfs_hash_out(tdp);
 
-		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
+		kmem_free(tdp, sizeof (struct tdirent) + namelen);
 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
 		dir->tn_dirents--;
 	}
@@ -925,7 +925,7 @@ tdiraddentry(
 	 */
 	namelen = strlen(name) + 1;
 	alloc_size = namelen + sizeof (struct tdirent);
-	tdp = tmp_memalloc(alloc_size, 0);
+	tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
 	if (tdp == NULL)
 		return (ENOSPC);
 
@@ -1025,7 +1025,7 @@ tdirmaketnode(
 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
 		return (EOVERFLOW);
 	type = va->va_type;
-	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+	tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
 	tmpnode_init(tm, tp, va, cred);
 
 	/* setup normal file/dir's extended attribute directory */
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
index 2e59d28d80..e6e2b392fe 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -40,9 +41,19 @@
 #include <sys/policy.h>
 #include <sys/fs/tmp.h>
 #include <sys/fs/tmpnode.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#define	KILOBYTE	1024
+#define	MEGABYTE	(1024 * KILOBYTE)
+#define	GIGABYTE	(1024 * MEGABYTE)
 
 #define	MODESHIFT	3
 
+#define	VALIDMODEBITS	07777
+
+extern pgcnt_t swapfs_minfree;
+
 int
 tmp_taccess(void *vtp, int mode, struct cred *cred)
 {
@@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred)
  * a plain file and you have write access to that file.
  * Function returns 0 if remove access is granted.
  */
-
 int
 tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
 	struct cred *cr)
@@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
 }
 
 /*
- * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
- * or the 'musthave' flag is set.  'musthave' allocations should
- * always be subordinate to normal allocations so that tmpfs_maxkmem
- * can't be exceeded by more than a few KB.  Example: when creating
- * a new directory, the tmpnode is a normal allocation; if that
- * succeeds, the dirents for "." and ".." are 'musthave' allocations.
- */
-void *
-tmp_memalloc(size_t size, int musthave)
-{
-	static time_t last_warning;
-	time_t now;
-
-	if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
-	    musthave)
-		return (kmem_zalloc(size, KM_SLEEP));
-
-	atomic_add_long(&tmp_kmemspace, -size);
-	now = gethrestime_sec();
-	if (last_warning != now) {
-		last_warning = now;
-		cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
-	}
-	return (NULL);
-}
-
-void
-tmp_memfree(void *cp, size_t size)
-{
-	kmem_free(cp, size);
-	atomic_add_long(&tmp_kmemspace, -size);
-}
-
-/*
  * Convert a string containing a number (number of bytes) to a pgcnt_t,
  * containing the corresponding number of pages. On 32-bit kernels, the
  * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value
  * returned in 'maxpg' is at most ULONG_MAX.
  *
- * If the number is followed by a "k" or "K", the value is converted from
- * kilobytes to bytes.  If it is followed by an "m" or "M" it is converted
- * from megabytes to bytes.  If it is not followed by a character it is
- * assumed to be in bytes. Multiple letter options are allowed, so for instance
- * '2mk' is interpreted as 2gb.
+ * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes;
+ * "m" or "M" for megabytes; "g" or "G" for gigabytes.  This interface allows
+ * for an arguably esoteric interpretation of multiple suffix characters:
+ * namely, they cascade.  For example, the caller may specify "2mk", which is
+ * interpreted as 2 gigabytes.  It would seem, at this late stage, that the
+ * horse has left not only the barn but indeed the country, and possibly the
+ * entire planetary system. Alternatively, the number may be followed by a
+ * single '%' sign, indicating the size is a percentage of either the zone's
+ * swap limit or the system's overall swap size.
  *
  * Parse and overflow errors are detected and a non-zero number returned on
  * error.
  */
-
 int
 tmp_convnum(char *str, pgcnt_t *maxpg)
 {
-	uint64_t num = 0, oldnum;
+	u_longlong_t num = 0;
 #ifdef _LP64
-	uint64_t max_bytes = ULONG_MAX;
+	u_longlong_t max_bytes = ULONG_MAX;
 #else
-	uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
+	u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
 #endif
 	char *c;
-
-	if (str == NULL)
+	const struct convchar {
+		char *cc_char;
+		uint64_t cc_factor;
+	} convchars[] = {
+		{ "kK", KILOBYTE },
+		{ "mM", MEGABYTE },
+		{ "gG", GIGABYTE },
+		{ NULL, 0 }
+	};
+
+	if (str == NULL) {
 		return (EINVAL);
+	}
 	c = str;
 
 	/*
-	 * Convert str to number
+	 * Convert the initial numeric portion of the input string.
 	 */
-	while ((*c >= '0') && (*c <= '9')) {
-		oldnum = num;
-		num = num * 10 + (*c++ - '0');
-		if (oldnum > num) /* overflow */
+	if (ddi_strtoull(str, &c, 10, &num) != 0) {
+		return (EINVAL);
+	}
+
+	/*
+	 * Handle a size in percent. Anything other than a single percent
+	 * modifier is invalid. We use either the zone's swap limit or the
+	 * system's total available swap size as the initial value. Perform the
+	 * intermediate calculation in pages to avoid overflow.
+	 */
+	if (*c == '\%') {
+		u_longlong_t cap;
+
+		if (*(c + 1) != '\0')
+			return (EINVAL);
+
+		if (num > 100)
 			return (EINVAL);
+
+		cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl;
+		if (cap == UINT64_MAX) {
+			/*
+			 * Use the amount of available physical memory and swap
+			 */
+			mutex_enter(&anoninfo_lock);
+			cap = TOTAL_AVAILABLE_SWAP;
+			mutex_exit(&anoninfo_lock);
+		} else {
+			cap = btop(cap);
+		}
+
+		num = ptob(cap * num / 100);
+		goto done;
 	}
 
 	/*
-	 * Terminate on null
+	 * Apply the (potentially cascading) magnitude suffixes until an
+	 * invalid character is found, or the string comes to an end.
 	 */
-	while (*c != '\0') {
-		switch (*c++) {
+	for (; *c != '\0'; c++) {
+		int i;
+
+		for (i = 0; convchars[i].cc_char != NULL; i++) {
+			/*
+			 * Check if this character matches this multiplier
+			 * class:
+			 */
+			if (strchr(convchars[i].cc_char, *c) != NULL) {
+				/*
+				 * Check for overflow:
+				 */
+				if (num > max_bytes / convchars[i].cc_factor) {
+					return (EINVAL);
+				}
+
+				num *= convchars[i].cc_factor;
+				goto valid_char;
+			}
+		}
 
 		/*
-		 * convert from kilobytes
+		 * This was not a valid multiplier suffix character.
 		 */
-		case 'k':
-		case 'K':
-			if (num > max_bytes / 1024) /* will overflow */
-				return (EINVAL);
-			num *= 1024;
-			break;
+		return (EINVAL);
 
-		/*
-		 * convert from megabytes
-		 */
-		case 'm':
-		case 'M':
-			if (num > max_bytes / (1024 * 1024)) /* will overflow */
-				return (EINVAL);
-			num *= 1024 * 1024;
-			break;
-
-		default:
-			return (EINVAL);
-		}
+valid_char:
+		continue;
 	}
 
+done:
 	/*
 	 * Since btopr() rounds up to page granularity, this round-up can
 	 * cause an overflow only if 'num' is between (max_bytes - PAGESIZE)
@@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg)
 		return (EINVAL);
 	return (0);
 }
+
+/*
+ * Parse an octal mode string for use as the permissions of the root of the
+ * tmpfs mount.  The conversion must consume all of 'str' and may set only
+ * VALIDMODEBITS bits; else EINVAL is returned and '*mode' is left untouched.
+ */
+int
+tmp_convmode(char *str, mode_t *mode)
+{
+	ulong_t num;
+	char *c;
+
+	if (str == NULL) {
+		return (EINVAL);
+	}
+
+	if (ddi_strtoul(str, &c, 8, &num) != 0 || *c != '\0') {
+		return (EINVAL);
+	}
+	if ((num & ~VALIDMODEBITS) != 0) {
+		return (EINVAL);
+	}
+
+	*mode = VALIDMODEBITS & num;
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index f8a36a528f..3c088c442c 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -55,6 +56,15 @@
 static int tmpfsfstype;
 
 /*
+ * tmpfs_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. With forced umount support, the
+ * filesystem module must not be allowed to go away before the last
+ * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
+ * there's no need for locking.
+ */
+static uint32_t	tmpfs_mountcount;
+
+/*
  * tmpfs vfs operations.
  */
 static int tmpfsinit(int, char *);
@@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *);
 static int tmp_root(struct vfs *, struct vnode **);
 static int tmp_statvfs(struct vfs *, struct statvfs64 *);
 static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
+static void tmp_freevfs(vfs_t *vfsp);
 
 /*
  * Loadable module wrapper
@@ -76,7 +87,7 @@ static vfsdef_t vfw = {
 	VFSDEF_VERSION,
 	"tmpfs",
 	tmpfsinit,
-	VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT,
+	VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
 	&tmpfs_proto_opttbl
 };
 
@@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = {
 	/* Option name		Cancel Opt	Arg	Flags		Data */
 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,	MO_DEFAULT,	NULL},
 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,	NULL,		NULL},
-	{ "size",		NULL,		"0",	MO_HASVALUE,	NULL}
+	{ "size",		NULL,		"0",	MO_HASVALUE,	NULL},
+	{ "mode",		NULL,		NULL,	MO_HASVALUE,	NULL}
 };
 
 
@@ -121,6 +133,14 @@ _fini()
 {
 	int error;
 
+	/*
+	 * If a forcibly unmounted instance is still hanging around, we cannot
+	 * allow the module to be unloaded because that would cause panics once
+	 * the VFS framework decides it's time to call into VFS_FREEVFS().
+	 */
+	if (tmpfs_mountcount)
+		return (EBUSY);
+
 	error = mod_remove(&modlinkage);
 	if (error)
 		return (error);
@@ -139,14 +159,6 @@ _info(struct modinfo *modinfop)
 }
 
 /*
- * The following are patchable variables limiting the amount of system
- * resources tmpfs can use.
- *
- * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
- * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries)
- * It is not determined by setting a hard limit but rather as a percentage of
- * physical memory which is determined when tmpfs is first used in the system.
- *
  * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
  * the rest of the system.  In other words, if the amount of free swap space
  * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
@@ -155,9 +167,7 @@ _info(struct modinfo *modinfop)
  * There is also a per mount limit on the amount of swap space
  * (tmount.tm_anonmax) settable via a mount option.
  */
-size_t tmpfs_maxkmem = 0;
 size_t tmpfs_minfree = 0;
-size_t tmp_kmemspace;		/* bytes of kernel heap used by all tmpfs */
 
 static major_t tmpfs_major;
 static minor_t tmpfs_minor;
@@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name)
 		VFSNAME_ROOT,		{ .vfs_root = tmp_root },
 		VFSNAME_STATVFS,	{ .vfs_statvfs = tmp_statvfs },
 		VFSNAME_VGET,		{ .vfs_vget = tmp_vget },
+		VFSNAME_FREEVFS,	{ .vfs_freevfs = tmp_freevfs },
 		NULL,			NULL
 	};
 	int error;
@@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name)
 		tmpfs_minfree = btopr(TMPMINFREE);
 	}
 
-	/*
-	 * The maximum amount of space tmpfs can allocate is
-	 * TMPMAXPROCKMEM percent of kernel memory
-	 */
-	if (tmpfs_maxkmem == 0)
-		tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
-
 	if ((tmpfs_major = getudev()) == (major_t)-1) {
 		cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
 		tmpfs_major = 0;
 	}
 	mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+	tmpfs_mountcount = 0;
 	return (0);
 }
 
 static int
-tmp_mount(
-	struct vfs *vfsp,
-	struct vnode *mvp,
-	struct mounta *uap,
-	struct cred *cr)
+tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 {
 	struct tmount *tm = NULL;
 	struct tmpnode *tp;
@@ -239,8 +240,9 @@ tmp_mount(
 	pgcnt_t anonmax;
 	struct vattr rattr;
 	int got_attrs;
-
-	char *sizestr;
+	boolean_t mode_arg = B_FALSE;
+	mode_t root_mode = 0777;
+	char *argstr;
 
 	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
 		return (error);
@@ -249,7 +251,7 @@ tmp_mount(
 		return (ENOTDIR);
 
 	mutex_enter(&mvp->v_lock);
-	if ((uap->flags & MS_OVERLAY) == 0 &&
+	if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 		mutex_exit(&mvp->v_lock);
 		return (EBUSY);
@@ -275,18 +277,45 @@ tmp_mount(
 	 * tm_anonmax is set according to the mount arguments
 	 * if any.  Otherwise, it is set to a maximum value.
 	 */
-	if (vfs_optionisset(vfsp, "size", &sizestr)) {
-		if ((error = tmp_convnum(sizestr, &anonmax)) != 0)
+	if (vfs_optionisset(vfsp, "size", &argstr)) {
+		if ((error = tmp_convnum(argstr, &anonmax)) != 0)
 			goto out;
 	} else {
 		anonmax = ULONG_MAX;
 	}
 
+	/*
+	 * The "mode" mount argument allows the operator to override the
+	 * permissions of the root of the tmpfs mount.
+	 */
+	if (vfs_optionisset(vfsp, "mode", &argstr)) {
+		if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
+			goto out;
+		}
+		mode_arg = B_TRUE;
+	}
+
 	if (error = pn_get(uap->dir,
 	    (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
 		goto out;
 
-	if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
+	if (uap->flags & MS_REMOUNT) {
+		tm = (struct tmount *)VFSTOTM(vfsp);
+
+		/*
+		 * If we change the size so it's less than what is currently
+		 * being used, we allow that. The file system will simply be
+		 * full until enough files have been removed to get below the
+		 * new max.
+		 */
+		mutex_enter(&tm->tm_contents);
+		tm->tm_anonmax = anonmax;
+		mutex_exit(&tm->tm_contents);
+		goto out;
+	}
+
+	if ((tm = kmem_zalloc(sizeof (struct tmount),
+	    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 		pn_free(&dpn);
 		error = ENOMEM;
 		goto out;
@@ -318,17 +347,17 @@ tmp_mount(
 	vfsp->vfs_bsize = PAGESIZE;
 	vfsp->vfs_flag |= VFS_NOTRUNC;
 	vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
-	tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
+	tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
 	(void) strcpy(tm->tm_mntpath, dpn.pn_path);
 
 	/*
 	 * allocate and initialize root tmpnode structure
 	 */
 	bzero(&rattr, sizeof (struct vattr));
-	rattr.va_mode = (mode_t)(S_IFDIR | 0777);	/* XXX modes */
+	rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
 	rattr.va_type = VDIR;
 	rattr.va_rdev = 0;
-	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+	tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
 	tmpnode_init(tm, tp, &rattr, cr);
 
 	/*
@@ -345,7 +374,14 @@ tmp_mount(
 	 * the previously set hardwired defaults to prevail.
 	 */
 	if (got_attrs == 0) {
-		tp->tn_mode = rattr.va_mode;
+		if (!mode_arg) {
+			/*
+			 * Only use the underlying mount point for the
+			 * mode if the "mode" mount argument was not
+			 * provided.
+			 */
+			tp->tn_mode = rattr.va_mode;
+		}
 		tp->tn_uid = rattr.va_uid;
 		tp->tn_gid = rattr.va_gid;
 	}
@@ -366,6 +402,7 @@ tmp_mount(
 
 	pn_free(&dpn);
 	error = 0;
+	atomic_inc_32(&tmpfs_mountcount);
 
 out:
 	if (error == 0)
@@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 	struct tmpnode *tnp, *cancel;
 	struct vnode	*vp;
 	int error;
+	uint_t cnt;
+	int i;
 
 	if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
 		return (error);
 
-	/*
-	 * forced unmount is not supported by this file system
-	 * and thus, ENOTSUP, is being returned.
-	 */
-	if (flag & MS_FORCE)
-		return (ENOTSUP);
-
 	mutex_enter(&tm->tm_contents);
 
 	/*
-	 * If there are no open files, only the root node should have
-	 * a reference count.
+	 * In the normal unmount case (non-forced unmount), if there are no
+	 * open files, only the root node should have a reference count.
+	 *
 	 * With tm_contents held, nothing can be added or removed.
 	 * There may be some dirty pages.  To prevent fsflush from
 	 * disrupting the unmount, put a hold on each node while scanning.
 	 * If we find a previously referenced node, undo the holds we have
 	 * placed and fail EBUSY.
+	 *
+	 * However, in the case of a forced umount, things are a bit different.
+	 * An additional VFS_HOLD is added for each outstanding VN_HOLD to
+	 * ensure that the file system is not cleaned up (tmp_freevfs) until
+	 * the last vfs hold is dropped. This happens in tmp_inactive as the
+	 * vnodes are released. Also, we can't add an additional VN_HOLD in
+	 * this case since that would prevent tmp_inactive from ever being
+	 * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
+	 * so that the zone is not blocked waiting for the final file system
+	 * cleanup.
 	 */
 	tnp = tm->tm_rootnode;
-	if (TNTOV(tnp)->v_count > 1) {
+
+	vp = TNTOV(tnp);
+	mutex_enter(&vp->v_lock);
+	cnt = vp->v_count;
+	if (flag & MS_FORCE) {
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		/* Extra hold which we rele below when we drop the zone ref */
+		VFS_HOLD(vfsp);
+
+		for (i = 1; i < cnt; i++)
+			VFS_HOLD(vfsp);
+
+		/* drop the mutex now because no one can find this mount */
+		mutex_exit(&tm->tm_contents);
+	} else if (cnt > 1) {
+		mutex_exit(&vp->v_lock);
 		mutex_exit(&tm->tm_contents);
 		return (EBUSY);
 	}
+	mutex_exit(&vp->v_lock);
 
+	/*
+	 * Check for open files. An open file causes everything to unwind
+	 * unless this is a forced umount.
+	 */
 	for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
-		if ((vp = TNTOV(tnp))->v_count > 0) {
+		vp = TNTOV(tnp);
+		mutex_enter(&vp->v_lock);
+		cnt = vp->v_count;
+		if (flag & MS_FORCE) {
+			for (i = 0; i < cnt; i++)
+				VFS_HOLD(vfsp);
+
+			/*
+			 * In the case of a forced umount don't add an
+			 * additional VN_HOLD on the already held vnodes, like
+			 * we do in the non-forced unmount case. If the
+			 * cnt > 0, then the vnode already has at least one
+			 * hold and we need tmp_inactive to get called when the
+			 * last pre-existing hold on the node is released so
+			 * that we can VFS_RELE the VFS holds we just added.
+			 */
+			if (cnt == 0) {
+				/* directly add VN_HOLD since we hold v_lock */
+				vp->v_count++;
+			}
+
+			mutex_exit(&vp->v_lock);
+
+			/*
+			 * If the tmpnode has any pages associated with it
+			 * (i.e. if it's a normal file with non-zero size), the
+			 * tmpnode could still be discovered by pageout or
+			 * fsflush via the page vnode pointers. To prevent this
+			 * from interfering with the tmp_freevfs, truncate the
+			 * tmpnode now.
+			 */
+			if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
+				rw_enter(&tnp->tn_rwlock, RW_WRITER);
+				rw_enter(&tnp->tn_contents, RW_WRITER);
+
+				(void) tmpnode_trunc(tm, tnp, 0);
+
+				rw_exit(&tnp->tn_contents);
+				rw_exit(&tnp->tn_rwlock);
+
+				ASSERT(tnp->tn_size == 0);
+				ASSERT(tnp->tn_nblocks == 0);
+			}
+		} else if (cnt > 0) {
+			/* An open file; unwind the holds we've been adding. */
+			mutex_exit(&vp->v_lock);
 			cancel = tm->tm_rootnode->tn_forw;
 			while (cancel != tnp) {
 				vp = TNTOV(cancel);
@@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 			}
 			mutex_exit(&tm->tm_contents);
 			return (EBUSY);
+		} else {
+			/* directly add a VN_HOLD since we have the lock */
+			vp->v_count++;
+			mutex_exit(&vp->v_lock);
 		}
-		VN_HOLD(vp);
 	}
 
-	/*
-	 * We can drop the mutex now because no one can find this mount
-	 */
-	mutex_exit(&tm->tm_contents);
+	if (flag & MS_FORCE) {
+		/*
+		 * Drop the zone ref now since we don't know how long it will
+		 * be until the final vfs_rele is called by tmp_inactive.
+		 */
+		if (vfsp->vfs_zone) {
+			zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
+			    ZONE_REF_VFS);
+			vfsp->vfs_zone = 0;
+		}
+		/* We can now drop the extra hold we added above. */
+		VFS_RELE(vfsp);
+	} else {
+		/*
+		 * For the non-forced case, we can drop the mutex now because
+		 * no one can find this mount anymore
+		 */
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		mutex_exit(&tm->tm_contents);
+	}
+
+	return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
+ * the vfs framework after umount and the last VFS_RELE, to trigger the release
+ * of any resources still associated with the given vfs_t. We only add
+ * additional VFS_HOLDs during the forced umount case, so this is normally
+ * called immediately after tmp_unmount.
+ */
+void
+tmp_freevfs(vfs_t *vfsp)
+{
+	struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
+	struct tmpnode *tnp;
+	struct vnode	*vp;
 
 	/*
 	 * Free all kmemalloc'd and anonalloc'd memory associated with
@@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 	 * tmpnode_free which assumes that the directory entry has been
 	 * removed before the file.
 	 */
+
+	/*
+	 * Now that we are tearing ourselves down we need to remove the
+	 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
+	 * files from the system causing us to have a negative value. Doing this
+	 * seems a bit better than trying to set a flag on the tmount that says
+	 * we're tearing down.
+	 */
+	vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
 	/*
 	 * Remove all directory entries
 	 */
@@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 
 	ASSERT(tm->tm_mntpath);
 
-	tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+	kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
 
 	ASSERT(tm->tm_anonmem == 0);
 
 	mutex_destroy(&tm->tm_contents);
 	mutex_destroy(&tm->tm_renamelck);
-	tmp_memfree(tm, sizeof (struct tmount));
+	kmem_free(tm, sizeof (struct tmount));
 
-	return (0);
+	/* Allow _fini() to succeed now */
+	atomic_dec_32(&tmpfs_mountcount);
 }
 
 /*
@@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
 	 * available to tmpfs.  This is fairly inaccurate since it doesn't
 	 * take into account the names stored in the directory entries.
 	 */
-	if (tmpfs_maxkmem > tmp_kmemspace)
-		sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
-		    (sizeof (struct tmpnode) + sizeof (struct tdirent));
-	else
-		sbp->f_ffree = 0;
-
-	sbp->f_files = tmpfs_maxkmem /
+	sbp->f_ffree = sbp->f_files = ptob(availrmem) /
 	    (sizeof (struct tmpnode) + sizeof (struct tdirent));
 	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
 	(void) cmpldev(&d32, vfsp->vfs_dev);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index 3c251df0cc..18e037ee22 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
 	struct tmount *tm = (struct tmount *)VTOTM(vp);
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	/*
 	 * We don't currently support reading non-regular files
 	 */
@@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
 	struct tmount *tm = (struct tmount *)VTOTM(vp);
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	/*
 	 * We don't currently support writing to non-regular files
 	 */
@@ -786,8 +794,13 @@ tmp_setattr(
 		rw_exit(&tp->tn_contents);
 		rw_exit(&tp->tn_rwlock);
 
-		if (error == 0 && vap->va_size == 0)
-			vnevent_truncate(vp, ct);
+		if (error == 0) {
+			if (vap->va_size == 0) {
+				vnevent_truncate(vp, ct);
+			} else {
+				vnevent_resize(vp, ct);
+			}
+		}
 
 		goto out1;
 	}
@@ -833,6 +846,9 @@ tmp_lookup(
 	struct tmpnode *ntp = NULL;
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
 
 	/* allow cd into @ dir */
 	if (flags & LOOKUP_XATTR) {
@@ -871,8 +887,7 @@ tmp_lookup(
 				return (error);
 			}
 
-			xdp = tmp_memalloc(sizeof (struct tmpnode),
-			    TMP_MUSTHAVE);
+			xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
 			tm = VTOTM(dvp);
 			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
 			/*
@@ -1302,10 +1317,8 @@ tmp_rename(
 		vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
 		/*
 		 * vnevent_rename_dest is called in tdirenter().
-		 * Notify the target dir if not same as source dir.
 		 */
-		if (ndvp != odvp)
-			vnevent_rename_dest_dir(ndvp, ct);
+		vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);
 	}
 
 done:
@@ -1474,6 +1487,10 @@ tmp_readdir(
 	int reclen;
 	caddr_t outbuf;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	if (uiop->uio_loffset >= MAXOFF_T) {
 		if (eofp)
 			*eofp = 1;
@@ -1612,7 +1629,7 @@ tmp_symlink(
 		return (error);
 	}
 	len = strlen(tnm) + 1;
-	cp = tmp_memalloc(len, 0);
+	cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI);
 	if (cp == NULL) {
 		tmpnode_rele(self);
 		return (ENOSPC);
@@ -1677,10 +1694,27 @@ top:
 	 * there's little to do -- just drop our hold.
 	 */
 	if (vp->v_count > 1 || tp->tn_nlink != 0) {
-		vp->v_count--;
+		if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) {
+			/*
+			 * Since the file system was forcibly unmounted, we can
+			 * have a case (v_count == 1, tn_nlink != 0) where this
+			 * file was open so we didn't add an extra hold on the
+			 * file in tmp_unmount. We are counting on the
+			 * interaction of the hold made in tmp_unmount and
+			 * rele-ed in tmp_freevfs so we need to be sure we
+			 * don't decrement in this case.
+			 */
+			if (vp->v_count > 1)
+				vp->v_count--;
+		} else {
+			vp->v_count--;
+		}
 		mutex_exit(&vp->v_lock);
 		mutex_exit(&tp->tn_tlock);
 		rw_exit(&tp->tn_rwlock);
+		/* If the filesystem was umounted by force, rele the vfs ref */
+		if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+			VFS_RELE(tm->tm_vfsp);
 		return;
 	}
 
@@ -1705,7 +1739,7 @@ top:
 			goto top;
 		}
 		if (tp->tn_type == VLNK)
-			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);
+			kmem_free(tp->tn_symlink, tp->tn_size + 1);
 	}
 
 	/*
@@ -1739,7 +1773,11 @@ top:
 	rw_destroy(&tp->tn_rwlock);
 	mutex_destroy(&tp->tn_tlock);
 	vn_free(TNTOV(tp));
-	tmp_memfree(tp, sizeof (struct tmpnode));
+	kmem_free(tp, sizeof (struct tmpnode));
+
+	/* If the filesystem was umounted by force, rele the vfs ref */
+	if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+		VFS_RELE(tm->tm_vfsp);
 }
 
 /* ARGSUSED2 */
@@ -1861,6 +1899,10 @@ tmp_getapage(
 	struct vnode *pvp;
 	u_offset_t poff;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	if (protp != NULL)
 		*protp = PROT_ALL;
 again:
@@ -2082,6 +2124,10 @@ tmp_putapage(
 	u_offset_t offset;
 	u_offset_t tmpoff;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	ASSERT(PAGE_LOCKED(pp));
 
 	/* Kluster in tmp_klustsize chunks */
@@ -2342,8 +2388,13 @@ tmp_space(
 			return (EFBIG);
 		error = tmp_freesp(vp, bfp, flag);
 
-		if (error == 0 && bfp->l_start == 0)
-			vnevent_truncate(vp, ct);
+		if (error == 0) {
+			if (bfp->l_start == 0) {
+				vnevent_truncate(vp, ct);
+			} else {
+				vnevent_resize(vp, ct);
+			}
+		}
 	}
 	return (error);
 }
diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c
index c1e2c74a87..def046a0bf 100644
--- a/usr/src/uts/common/fs/udfs/udf_dir.c
+++ b/usr/src/uts/common/fs/udfs/udf_dir.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -562,9 +563,8 @@ out:
 				    namep, ctp);
 			}
 
-			if (sdp != tdp) {
-				vnevent_rename_dest_dir(ITOV(tdp), ctp);
-			}
+			vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip),
+			    namep, ctp);
 		}
 
 		/*
diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c
index 93cc4d49e8..7e17f16ce2 100644
--- a/usr/src/uts/common/fs/udfs/udf_vnops.c
+++ b/usr/src/uts/common/fs/udfs/udf_vnops.c
@@ -569,8 +569,11 @@ udf_setattr(
 			goto update_inode;
 		}
 
-		if (vap->va_size == 0)
+		if (vap->va_size == 0) {
 			vnevent_truncate(vp, ct);
+		} else {
+			vnevent_resize(vp, ct);
+		}
 	}
 	/*
 	 * Change file access or modified times.
@@ -1649,8 +1652,13 @@ udf_space(
 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
 		error = ud_freesp(vp, bfp, flag, cr);
 
-		if (error == 0 && bfp->l_start == 0)
-			vnevent_truncate(vp, ct);
+		if (error == 0) {
+			if (bfp->l_start == 0) {
+				vnevent_truncate(vp, ct);
+			} else {
+				vnevent_resize(vp, ct);
+			}
+		}
 	}
 
 	return (error);
diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c
index cf45b48e3c..d689a8173b 100644
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c
+++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -2193,8 +2193,13 @@ again:
 			goto update_inode;
 		}
 
-		if (error == 0 && vap->va_size)
-			vnevent_truncate(vp, ct);
+		if (error == 0) {
+			if (vap->va_size == 0) {
+				vnevent_truncate(vp, ct);
+			} else {
+				vnevent_resize(vp, ct);
+			}
+		}
 	}
 
 	if (ulp) {
@@ -3726,12 +3731,7 @@ retry_firstlock:
 
 	if (error == 0) {
 		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
-		/*
-		 * Notify the target directory of the rename event
-		 * if source and target directories are not the same.
-		 */
-		if (sdvp != tdvp)
-			vnevent_rename_dest_dir(tdvp, ct);
+		vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
 	}
 
 errout:
@@ -4478,8 +4478,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
 				return (error);
 			error = ufs_freesp(vp, bfp, flag, cr);
 
-			if (error == 0 && bfp->l_start == 0)
-				vnevent_truncate(vp, ct);
+			if (error == 0) {
+				if (bfp->l_start == 0) {
+					vnevent_truncate(vp, ct);
+				} else {
+					vnevent_resize(vp, ct);
+				}
+			}
 		} else if (cmd == F_ALLOCSP) {
 			error = ufs_lockfs_begin(ufsvfsp, &ulp,
 			    ULOCKFS_FALLOCATE_MASK);
diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c
index e179d934ed..35e65f15e6 100644
--- a/usr/src/uts/common/fs/vfs.c
+++ b/usr/src/uts/common/fs/vfs.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
@@ -236,7 +236,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp)
 	 * Make sure this root has a path.  With lofs, it is possible to have
 	 * a NULL mountpoint.
 	 */
-	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
+	if (ret == 0 && vfsp->vfs_mntpt != NULL &&
+	    (*vpp)->v_path == vn_vpath_empty) {
 		mntpt = vfs_getmntpoint(vfsp);
 		vn_setpath_str(*vpp, refstr_value(mntpt),
 		    strlen(refstr_value(mntpt)));
@@ -3901,6 +3902,8 @@ vfs_to_modname(const char *vfstype)
 		vfstype = "fdfs";
 	} else if (strncmp(vfstype, "nfs", 3) == 0) {
 		vfstype = "nfs";
+	} else if (strcmp(vfstype, "lxproc") == 0) {
+		vfstype = "lxprocfs";
 	}
 
 	return (vfstype);
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index e6b6adf56b..77b30da871 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -66,6 +66,7 @@
 #include <fs/fs_subr.h>
 #include <sys/taskq.h>
 #include <fs/fs_reparse.h>
+#include <sys/time.h>
 
 /* Determine if this vnode is a file that is read-only */
 #define	ISROFILE(vp)	\
@@ -102,6 +103,9 @@ kmutex_t	vskstat_tree_lock;
 /* Global variable which enables/disables the vopstats collection */
 int vopstats_enabled = 1;
 
+/* Global used for empty/invalid v_path */
+char *vn_vpath_empty = "";
+
 /*
  * forward declarations for internal vnode specific data (vsd)
  */
@@ -200,6 +204,11 @@ static void 		(**vsd_destructor)(void *);
 		cr = crgetmapped(cr);					\
 	}
 
+#define	VOP_LATENCY_10MS	10000000
+#define	VOP_LATENCY_100MS	100000000
+#define	VOP_LATENCY_1S		1000000000
+#define	VOP_LATENCY_10S		10000000000
+
 /*
  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
  * numerical order of S_IFMT and vnode types.)
@@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
-	vp->v_path = NULL;
+	vp->v_path = vn_vpath_empty;
 	vp->v_mpssdata = NULL;
 	vp->v_vsd = NULL;
 	vp->v_fopdata = NULL;
@@ -2331,6 +2340,7 @@ void
 vn_recycle(vnode_t *vp)
 {
 	ASSERT(vp->v_pages == NULL);
+	VERIFY(vp->v_path != NULL);
 
 	/*
 	 * XXX - This really belongs in vn_reinit(), but we have some issues
@@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp)
 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
 		vp->v_femhead = NULL;
 	}
-	if (vp->v_path) {
+	if (vp->v_path != vn_vpath_empty) {
 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
-		vp->v_path = NULL;
+		vp->v_path = vn_vpath_empty;
 	}
 
 	if (vp->v_fopdata != NULL) {
@@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp)
 	 */
 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
 	ASSERT(vp->v_count_dnlc == 0);
-	if (vp->v_path != NULL) {
+	VERIFY(vp->v_path != NULL);
+	if (vp->v_path != vn_vpath_empty) {
 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
-		vp->v_path = NULL;
+		vp->v_path = vn_vpath_empty;
 	}
 
 	/* If FEM was in use, make sure everything gets cleaned up */
@@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
 	if (vp == NULL || vp->v_femhead == NULL) {
 		return;
 	}
+	(void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
 	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
 }
 
@@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
 }
 
 void
-vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
+vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
+    caller_context_t *ct)
 {
 	if (vp == NULL || vp->v_femhead == NULL) {
 		return;
 	}
-	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
+	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
 }
 
 void
@@ -2622,6 +2635,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct)
 	(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
 }
 
+void
+vnevent_resize(vnode_t *vp, caller_context_t *ct)
+{
+	if (vp == NULL || vp->v_femhead == NULL) {
+		return;
+	}
+	(void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
+}
+
 /*
  * Vnode accessors.
  */
@@ -2981,7 +3003,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
 	 * the potential for deadlock.
 	 */
 	mutex_enter(&base->v_lock);
-	if (base->v_path == NULL) {
+	if (base->v_path == vn_vpath_empty) {
 		mutex_exit(&base->v_lock);
 		return;
 	}
@@ -3008,7 +3030,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
 	rpath = kmem_alloc(rpathalloc, KM_SLEEP);
 
 	mutex_enter(&base->v_lock);
-	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
+	if (base->v_path == vn_vpath_empty ||
+	    strlen(base->v_path) != rpathlen) {
 		mutex_exit(&base->v_lock);
 		kmem_free(rpath, rpathalloc);
 		return;
@@ -3022,7 +3045,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
 	rpath[rpathlen + plen] = '\0';
 
 	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
+	if (vp->v_path != vn_vpath_empty) {
 		mutex_exit(&vp->v_lock);
 		kmem_free(rpath, rpathalloc);
 	} else {
@@ -3042,7 +3065,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len)
 	char *buf = kmem_alloc(len + 1, KM_SLEEP);
 
 	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
+	if (vp->v_path != vn_vpath_empty) {
 		mutex_exit(&vp->v_lock);
 		kmem_free(buf, len + 1);
 		return;
@@ -3066,10 +3089,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
 
 	mutex_enter(&vp->v_lock);
 	tmp = vp->v_path;
-	vp->v_path = NULL;
+	vp->v_path = vn_vpath_empty;
 	mutex_exit(&vp->v_lock);
 	vn_setpath(rootdir, dvp, vp, nm, len);
-	if (tmp != NULL)
+	if (tmp != vn_vpath_empty)
 		kmem_free(tmp, strlen(tmp) + 1);
 }
 
@@ -3084,7 +3107,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
 	int alloc;
 
 	mutex_enter(&src->v_lock);
-	if (src->v_path == NULL) {
+	if (src->v_path == vn_vpath_empty) {
 		mutex_exit(&src->v_lock);
 		return;
 	}
@@ -3094,7 +3117,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
 	mutex_exit(&src->v_lock);
 	buf = kmem_alloc(alloc, KM_SLEEP);
 	mutex_enter(&src->v_lock);
-	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
+	if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) {
 		mutex_exit(&src->v_lock);
 		kmem_free(buf, alloc);
 		return;
@@ -3103,7 +3126,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
 	mutex_exit(&src->v_lock);
 
 	mutex_enter(&dst->v_lock);
-	if (dst->v_path != NULL) {
+	if (dst->v_path != vn_vpath_empty) {
 		mutex_exit(&dst->v_lock);
 		kmem_free(buf, alloc);
 		return;
@@ -3261,14 +3284,57 @@ fop_read(
 	cred_t *cr,
 	caller_context_t *ct)
 {
-	int	err;
 	ssize_t	resid_start = uiop->uio_resid;
+	zone_t	*zonep = curzone;
+	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+	hrtime_t start, lat;
+	ssize_t len;
+	int err;
+
+	if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+		start = gethrtime();
+
+		mutex_enter(&zonep->zone_vfs_lock);
+		kstat_runq_enter(&zonep->zone_vfs_rwstats);
+		mutex_exit(&zonep->zone_vfs_lock);
+	}
 
 	VOPXID_MAP_CR(vp, cr);
 
 	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
-	VOPSTATS_UPDATE_IO(vp, read,
-	    read_bytes, (resid_start - uiop->uio_resid));
+	len = resid_start - uiop->uio_resid;
+
+	VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
+
+	if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+		mutex_enter(&zonep->zone_vfs_lock);
+		zonep->zone_vfs_rwstats.reads++;
+		zonep->zone_vfs_rwstats.nread += len;
+		kstat_runq_exit(&zonep->zone_vfs_rwstats);
+		mutex_exit(&zonep->zone_vfs_lock);
+
+		lat = gethrtime() - start;
+
+		if (lat >= VOP_LATENCY_10MS) {
+			if (lat < VOP_LATENCY_100MS)
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			else if (lat < VOP_LATENCY_1S) {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			} else if (lat < VOP_LATENCY_10S) {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+			} else {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+			}
+		}
+	}
+
 	return (err);
 }
 
@@ -3280,14 +3346,62 @@ fop_write(
 	cred_t *cr,
 	caller_context_t *ct)
 {
-	int	err;
 	ssize_t	resid_start = uiop->uio_resid;
+	zone_t	*zonep = curzone;
+	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+	hrtime_t start, lat;
+	ssize_t len;
+	int	err;
+
+	/*
+	 * For the purposes of VFS kstat consumers, the "waitq" calculation is
+	 * repurposed as the active queue for VFS write operations.  There's no
+	 * actual wait queue for VFS operations.
+	 */
+	if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+		start = gethrtime();
+
+		mutex_enter(&zonep->zone_vfs_lock);
+		kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+		mutex_exit(&zonep->zone_vfs_lock);
+	}
 
 	VOPXID_MAP_CR(vp, cr);
 
 	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
-	VOPSTATS_UPDATE_IO(vp, write,
-	    write_bytes, (resid_start - uiop->uio_resid));
+	len = resid_start - uiop->uio_resid;
+
+	VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
+
+	if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+		mutex_enter(&zonep->zone_vfs_lock);
+		zonep->zone_vfs_rwstats.writes++;
+		zonep->zone_vfs_rwstats.nwritten += len;
+		kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+		mutex_exit(&zonep->zone_vfs_lock);
+
+		lat = gethrtime() - start;
+
+		if (lat >= VOP_LATENCY_10MS) {
+			if (lat < VOP_LATENCY_100MS)
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			else if (lat < VOP_LATENCY_1S) {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			} else if (lat < VOP_LATENCY_10S) {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+			} else {
+				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+				atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+			}
+		}
+	}
+
 	return (err);
 }
 
@@ -3451,7 +3565,7 @@ fop_lookup(
 	}
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, lookup);
-		if ((*vpp)->v_path == NULL) {
+		if ((*vpp)->v_path == vn_vpath_empty) {
 			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
 		}
 	}
@@ -3493,7 +3607,7 @@ fop_create(
 	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, create);
-		if ((*vpp)->v_path == NULL) {
+		if ((*vpp)->v_path == vn_vpath_empty) {
 			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
 		}
 	}
@@ -3615,7 +3729,7 @@ fop_mkdir(
 	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, mkdir);
-		if ((*vpp)->v_path == NULL) {
+		if ((*vpp)->v_path == vn_vpath_empty) {
 			vn_setpath(rootdir, dvp, *vpp, dirname,
 			    strlen(dirname));
 		}
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 3c3cbdf4c1..5c06a1bb29 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -129,6 +129,7 @@
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
 #include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
@@ -4343,6 +4344,14 @@ top:
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, zio_flags, zb);
 
+		/*
+		 * At this point, this read I/O has already missed in the ARC
+		 * and will be going through to the disk.  The I/O throttle
+		 * should delay this I/O if this zone is using more than its I/O
+		 * priority allows.
+		 */
+		zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
 		if (*arc_flags & ARC_FLAG_WAIT)
 			return (zio_wait(rzio));
 
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 4f469fc750..64a4cb74d0 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -678,8 +678,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
-		if (bonuslen)
-			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+
+		if (bonuslen) {
+			/*
+			 * Absent byzantine on-disk corruption, we fully expect
+			 * our bonuslen to be no more than DN_MAX_BONUSLEN --
+			 * but we nonetheless explicitly clamp it on the bcopy()
+			 * to prevent any on-disk corruption from becoming
+			 * rampant in-kernel corruption.
+			 */
+			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+			    MIN(bonuslen, DN_MAX_BONUSLEN));
+		}
+
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 35015825b4..8ce9178ad2 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1818,7 +1818,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
-
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 50b8aba876..f54d67202b 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -22,7 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright 2016 RackTop Systems.
  * Copyright (c) 2014 Integros [integros.com]
@@ -2492,8 +2492,12 @@ receive_read_record(struct receive_arg *ra)
 	{
 		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
 		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
-		void *buf = kmem_zalloc(size, KM_SLEEP);
+		void *buf = NULL;
 		dmu_object_info_t doi;
+
+		if (size > 0)
+			buf = kmem_zalloc(size, KM_SLEEP);
+
 		err = receive_read_payload_and_next_header(ra, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 15b9459ce2..9d3db4212e 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -39,11 +39,11 @@
 #include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
 #include <sys/varargs.h>
+#include <sys/zfs_zone.h>
 
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
-
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
 {
@@ -224,6 +224,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	if (len == 0)
 		return;
 
+	zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
 	min_bs = SPA_MINBLOCKSHIFT;
 	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 7d86f72ad1..ca7d8b9bee 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -42,6 +42,7 @@
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
 #include <sys/zfeature.h>
 #include <sys/policy.h>
 #include <sys/zfs_znode.h>
@@ -1262,7 +1263,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
 			 * locks are held.
 			 */
 			txg_delay(dd->dd_pool, tx->tx_txg,
-			    MSEC2NSEC(10), MSEC2NSEC(10));
+			    zfs_zone_txg_delay(), MSEC2NSEC(10));
 			err = SET_ERROR(ERESTART);
 		}
 	}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index ee2d8ee9eb..242db1c9f1 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -43,6 +43,7 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 9030b855a1..296f0006fc 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -64,6 +64,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 int zfs_condense_pct = 200;
 
 /*
+ * Never condense any space map.  This is for debugging/recovery only.
+ */
+int zfs_condense_never = 0;
+
+/*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
@@ -1657,6 +1662,9 @@ metaslab_should_condense(metaslab_t *msp)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
+	if (zfs_condense_never != 0)
+		return (B_FALSE);
+
 	/*
 	 * Use the ms_size_tree range tree, which is ordered by size, to
 	 * obtain the largest segment in the free tree. We always condense
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index 7ddf806ec5..3168b47304 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -24,6 +24,7 @@
  * Portions Copyright 2011 iXsystems, Inc
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -407,15 +408,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, *findtb;
-	int i;
+	int i, size;
 	avl_index_t loc;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
 	tb->lot_attr_count = attr_count;
-	tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
-	    KM_SLEEP);
-	bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+
+	if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) {
+		tb->lot_attrs = kmem_alloc(size, KM_SLEEP);
+		bcopy(attrs, tb->lot_attrs, size);
+	}
+
 	tb->lot_num = lot_num;
 	tb->lot_hash = hash;
 	tb->lot_instance = 0;
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 3f1b7d8a54..0b99d08e72 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -116,6 +116,7 @@ struct vdev_queue {
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
 	uint64_t	vq_last_offset;
+	zoneid_t	vq_last_zone_id;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	kmutex_t	vq_lock;
 };
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..f1431b3f55
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef	_SYS_FS_ZFS_ZONE_H
+#define	_SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	ZFS_ZONE_IOP_READ = 0,
+	ZFS_ZONE_IOP_WRITE,
+	ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern hrtime_t zfs_zone_txg_delay(void);
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t,
+    avl_tree_t *);
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 6d8f7601f3..6d02c95b22 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -421,6 +421,7 @@ struct zio {
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;
 	hrtime_t	io_target_timestamp;
+	hrtime_t	io_dispatched;	/* time I/O was dispatched to disk */
 	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
 
@@ -449,6 +450,7 @@ struct zio {
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
+	zoneid_t	io_zoneid;	/* zone which originated this I/O */
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 191259e75b..915c9bb4b2 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -31,6 +31,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/callb.h>
+#include <sys/zfs_zone.h>
 
 /*
  * ZFS Transaction Groups
@@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp)
 		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 		mutex_exit(&tx->tx_sync_lock);
 
+		zfs_zone_report_txg_sync(dp);
+
 		start = ddi_get_lbolt();
 		spa_sync(spa, txg);
 		delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index a6af0101e7..ab305ed694 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -26,6 +26,7 @@
  */
 
 #include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
 #include <sys/spa_impl.h>
 #include <sys/refcount.h>
 #include <sys/vdev_disk.h>
@@ -44,6 +45,11 @@ extern ldi_ident_t zfs_li;
 
 static void vdev_disk_close(vdev_t *);
 
+typedef struct vdev_disk_buf {
+	buf_t	vdb_buf;
+	zio_t	*vdb_io;
+} vdev_disk_buf_t;
+
 typedef struct vdev_disk_ldi_cb {
 	list_node_t		lcb_next;
 	ldi_callback_id_t	lcb_id;
@@ -127,6 +133,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
     int ldi_result, void *arg, void *ev_data)
 {
 	vdev_t *vd = (vdev_t *)arg;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_ldi_cb_t *lcb;
 
 	/*
 	 * Ignore events other than offline.
@@ -586,6 +594,7 @@ static void
 vdev_disk_close(vdev_t *vd)
 {
 	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_ldi_cb_t *lcb;
 
 	if (vd->vdev_reopening || dvd == NULL)
 		return;
@@ -815,6 +824,8 @@ vdev_disk_io_start(zio_t *zio)
 	bp->b_bufsize = zio->io_size;
 	bp->b_iodone = (int (*)())vdev_disk_io_intr;
 
+	zfs_zone_zio_start(zio);
+
 	/* ldi_strategy() will return non-zero only on programming errors */
 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
 }
@@ -824,6 +835,8 @@ vdev_disk_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
+	zfs_zone_zio_done(zio);
+
 	/*
 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
 	 * the device has been removed.  If this is the case, then we trigger an
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 4917cc9284..cc415e2ca0 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
  */
 
 /*
@@ -34,6 +35,7 @@
 #include <sys/zio.h>
 #include <sys/avl.h>
 #include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
 
 /*
  * ZFS I/O Scheduler
@@ -142,7 +144,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10;
 uint32_t zfs_vdev_sync_write_max_active = 10;
 uint32_t zfs_vdev_async_read_min_active = 1;
 uint32_t zfs_vdev_async_read_max_active = 3;
-uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_min_active = 3;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
@@ -238,6 +240,8 @@ vdev_queue_init(vdev_t *vd)
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 
+	vq->vq_last_zone_id = 0;
+
 	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		int (*compfn) (const void *, const void *);
 
@@ -275,6 +279,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	zfs_zone_zio_enqueue(zio);
 	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
@@ -290,6 +295,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	zfs_zone_zio_dequeue(zio);
 	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
@@ -649,7 +655,11 @@ again:
 	search.io_timestamp = 0;
 	search.io_offset = vq->vq_last_offset + 1;
 	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+#ifdef _KERNEL
+	zio = zfs_zone_schedule(vq, p, idx, tree);
+#else
 	zio = avl_nearest(tree, idx, AVL_AFTER);
+#endif
 	if (zio == NULL)
 		zio = avl_first(tree);
 	ASSERT3U(zio->io_priority, ==, p);
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index 2b18ecb01c..132e84b111 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
@@ -853,9 +854,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 		if (zp->z_links <= zp_is_dir) {
 			zfs_panic_recover("zfs: link count on %s is %u, "
 			    "should be at least %u",
-			    zp->z_vnode->v_path ? zp->z_vnode->v_path :
-			    "<unknown>", (int)zp->z_links,
-			    zp_is_dir + 1);
+			    zp->z_vnode->v_path != vn_vpath_empty ?
+			    zp->z_vnode->v_path : "<unknown>",
+			    (int)zp->z_links, zp_is_dir + 1);
 			zp->z_links = zp_is_dir + 1;
 		}
 		if (--zp->z_links == zp_is_dir) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index a7feada44f..6d28956707 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -615,9 +615,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
 	 * Check permissions for special properties.
 	 */
 	switch (prop) {
+	case ZFS_PROP_DEDUP:
 	case ZFS_PROP_ZONED:
 		/*
-		 * Disallow setting of 'zoned' from within a local zone.
+		 * Disallow setting these properties from within a local zone.
 		 */
 		if (!INGLOBALZONE(curproc))
 			return (SET_ERROR(EPERM));
@@ -947,6 +948,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
+	if (secpolicy_fs_import(cr) != 0)
+		return (SET_ERROR(EPERM));
+
 	if ((error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
 		return (error);
@@ -2037,7 +2041,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
 }
 
 static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+    boolean_t cachedpropsonly)
 {
 	int error = 0;
 	nvlist_t *nv;
@@ -2055,7 +2060,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 		 * XXX reading with out owning
 		 */
 		if (!zc->zc_objset_stats.dds_inconsistent &&
-		    dmu_objset_type(os) == DMU_OST_ZVOL) {
+		    dmu_objset_type(os) == DMU_OST_ZVOL &&
+		    !cachedpropsonly) {
 			error = zvol_get_stats(os, nv);
 			if (error == EIO)
 				return (error);
@@ -2082,11 +2088,24 @@ static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
 	objset_t *os;
+	nvlist_t *nvl = NULL;
+	boolean_t cachedpropsonly = B_FALSE;
 	int error;
 
+	if (zc->zc_nvlist_src != 0 &&
+	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+	    zc->zc_iflags, &nvl)) != 0)
+		return (error);
+
+	if (nvl != NULL) {
+		(void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+		    &cachedpropsonly);
+		nvlist_free(nvl);
+	}
+
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error == 0) {
-		error = zfs_ioc_objset_stats_impl(zc, os);
+		error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
 		dmu_objset_rele(os, FTAG);
 	}
 
@@ -2281,8 +2300,21 @@ static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
+	nvlist_t *nvl = NULL;
+	boolean_t cachedpropsonly = B_FALSE;
 	int error;
 
+	if (zc->zc_nvlist_src != 0 &&
+	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+	    zc->zc_iflags, &nvl)) != 0)
+		return (error);
+
+	if (nvl != NULL) {
+		(void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+		    &cachedpropsonly);
+		nvlist_free(nvl);
+	}
+
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
 	if (error != 0) {
 		return (error == ENOENT ? ESRCH : error);
@@ -2311,8 +2343,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 			objset_t *ossnap;
 
 			error = dmu_objset_from_ds(ds, &ossnap);
-			if (error == 0)
-				error = zfs_ioc_objset_stats_impl(zc, ossnap);
+			if (error == 0) {
+				error = zfs_ioc_objset_stats_impl(zc,
+				    ossnap, cachedpropsonly);
+			}
 			dsl_dataset_rele(ds, FTAG);
 		}
 	} else if (error == ENOENT) {
@@ -3022,6 +3056,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
 	uint64_t sense = ZFS_PROP_UNDEFINED;
 	uint64_t norm = ZFS_PROP_UNDEFINED;
 	uint64_t u8 = ZFS_PROP_UNDEFINED;
+	int error;
 
 	ASSERT(zplprops != NULL);
 
@@ -3065,8 +3100,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
 
-	if (norm == ZFS_PROP_UNDEFINED)
-		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+	if (norm == ZFS_PROP_UNDEFINED &&
+	    (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+		return (error);
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
 
@@ -3075,13 +3111,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
 	 */
 	if (norm)
 		u8 = 1;
-	if (u8 == ZFS_PROP_UNDEFINED)
-		VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+	if (u8 == ZFS_PROP_UNDEFINED &&
+	    (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+		return (error);
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
 
-	if (sense == ZFS_PROP_UNDEFINED)
-		VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+	if (sense == ZFS_PROP_UNDEFINED &&
+	    (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+		return (error);
 	VERIFY(nvlist_add_uint64(zplprops,
 	    zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
 
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 0d02fd5bec..ace4bf8173 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
@@ -1903,6 +1904,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 
+	/*
+	 * If we're doing a forced unmount on a dataset which still has
+	 * references and is in a zone, then we need to cleanup the zone
+	 * reference at this point or else the zone will never be able to
+	 * shutdown.
+	 */
+	if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) {
+		zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS);
+		vfsp->vfs_zone = NULL;
+	}
+
 	return (0);
 }
 
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 9cba49b402..829d57b760 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -664,6 +664,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error = 0;
+	int		prev_error;
 	arc_buf_t	*abuf;
 	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
@@ -685,6 +686,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
+	/*
+	 * Pre-fault the pages to ensure slow (eg NFS) pages
+	 * don't hold up txg.
+	 * Skip this if uio contains loaned arc_buf.
+	 */
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+		xuio = (xuio_t *)uio;
+	else
+		uio_prefaultpages(n, uio);
+
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
@@ -737,17 +749,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	}
 
 	/*
-	 * Pre-fault the pages to ensure slow (eg NFS) pages
-	 * don't hold up txg.
-	 * Skip this if uio contains loaned arc_buf.
-	 */
-	if ((uio->uio_extflg == UIO_XUIO) &&
-	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
-		xuio = (xuio_t *)uio;
-	else
-		uio_prefaultpages(MIN(n, max_blksz), uio);
-
-	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & FAPPEND) {
@@ -968,7 +969,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
-			ASSERT(error == 0);
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
@@ -978,18 +978,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
+		/*
+		 * Keep track of a possible pre-existing error from a partial
+		 * write via dmu_write_uio_dbuf above.
+		 */
+		prev_error = error;
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
-		if (error != 0)
+		if (prev_error != 0 || error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
-
-		if (!xuio && n > 0)
-			uio_prefaultpages(MIN(n, max_blksz), uio);
 	}
 
 	zfs_range_unlock(rl);
@@ -2832,8 +2834,11 @@ top:
 			return (err);
 		}
 
-		if (vap->va_size == 0)
+		if (vap->va_size == 0) {
 			vnevent_truncate(ZTOV(zp), ct);
+		} else {
+			vnevent_resize(ZTOV(zp), ct);
+		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME) ||
@@ -3761,9 +3766,7 @@ top:
 
 	if (error == 0) {
 		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
-		/* notify the target dir if it is not the same as source dir */
-		if (tdvp != sdvp)
-			vnevent_rename_dest_dir(tdvp, ct);
+		vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
 	}
 out:
 	if (zl != NULL)
@@ -4255,6 +4258,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
+		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
 	dmu_tx_commit(tx);
@@ -4790,10 +4795,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
 	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
 
-	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
-	    vn_has_cached_data(vp))
-		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
 	return (0);
 }
 
@@ -4859,8 +4860,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
 
 	error = zfs_freesp(zp, off, len, flag, TRUE);
 
-	if (error == 0 && off == 0 && len == 0)
-		vnevent_truncate(ZTOV(zp), ct);
+	if (error == 0 && len == 0) {
+		if (off == 0) {
+			vnevent_truncate(ZTOV(zp), ct);
+		} else {
+			vnevent_resize(ZTOV(zp), ct);
+		}
+	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..4861c64f8e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1336 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be a major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes, which
+ * disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability; the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ *     1) Ensure consistent and predictable I/O latency across all zones.
+ *     2) Sequential and random workloads have very different characteristics,
+ *        so it is a non-starter to track IOPS or throughput.
+ *     3) A zone should be able to use the full disk bandwidth if no other zone
+ *        is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O. When the throttle detects that a zone is
+ * consuming more than is appropriate, each read or write system call is
+ * delayed by up to 100 microseconds, which we've found is sufficient to allow
+ * other zones to interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed to determine the overall I/O
+ * utilization for each zone.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ *     (# of read syscalls) x (Average read latency) +
+ *     (# of write syscalls) x (Average write latency)
+ *
+ * Once each zone has its utilization metric, the I/O throttle will compare I/O
+ * utilization across all zones, and if a zone has a higher-than-average I/O
+ * utilization, system calls from that zone are throttled. That is, if one
+ * zone has a much higher utilization, that zone's delay is increased by 5
+ * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
+ * already throttled and has a lower utilization than average, its delay will
+ * be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last update
+ * was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if any)
+ * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
+ * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues – the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler maintains how many I/Os in the queue are from each zone, and
+ * if one zone has a disproportionately large number of I/Os in the queue, the
+ * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
+ * and pulled from the middle of the queue. This bump allows zones with a small
+ * number of I/Os (so small they may not even be taken into account by the
+ * throttle) to complete quickly instead of waiting behind dozens of I/Os from
+ * other zones.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+hrtime_t
+zfs_zone_txg_delay()
+{
+	return (MSEC2NSEC(10));
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization.  Once a cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t	zfs_zone_delay_enable = B_TRUE;	/* enable IO throttle */
+uint16_t	zfs_zone_delay_step = 5;	/* usec amnt to change delay */
+uint16_t	zfs_zone_delay_ceiling = 100;	/* usec delay max */
+
+boolean_t	zfs_zone_priority_enable = B_TRUE;  /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O.  The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down.  As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency.  If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t		zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level.  This tunable controls the
+ * threshold at which the throttle will start delaying zones.  When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems).  We
+ * therefore use our derived utilization conservatively:  we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
+ */
+uint_t		zfs_zone_util_threshold = 80;
+uint_t		zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here:  zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle.  The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t		zfs_zone_laggard_threshold = 50000;	/* 50 ms */
+uint_t		zfs_zone_laggard_recent = 1000000;	/* 1000 ms */
+uint_t		zfs_zone_laggard_ancient = 5000000;	/* 5000 ms */
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds.  Our system
+ * average cycle is one second or 1 million microseconds.  Our zone counter
+ * update cycle is two seconds or 2 million microseconds.  We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t 		zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
+uint_t 		zfs_zone_cycle_time = 2000000;		/* 2 s */
+
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
+uint_t 		zfs_zone_adjust_time = 250000;		/* 250 ms */
+
+typedef struct {
+	hrtime_t	cycle_start;	/* timestamp when cycle began */
+	int		cycle_cnt;	/* ops recorded this cycle */
+	hrtime_t	cycle_lat;	/* summed latency this cycle */
+	hrtime_t	sys_avg_lat;	/* decayed average latency */
+} sys_lat_cycle_t;
+
+typedef struct {
+	hrtime_t zi_now;
+	uint_t zi_avgrlat;
+	uint_t zi_avgwlat;
+	uint64_t zi_totpri;
+	uint64_t zi_totutil;
+	int zi_active;
+	uint_t zi_diskutil;
+	boolean_t zi_underutil;
+	boolean_t zi_overutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t	rd_lat;
+static sys_lat_cycle_t	wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ *    ((now - zfs_zone_last_checked) * 1000);
+ */
+kmutex_t	zfs_disk_lock;		/* protects the following: */
+uint_t		zfs_disk_rcnt;		/* Number of outstanding IOs */
+hrtime_t	zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t	zfs_disk_rlastupdate = 0; /* time last IO dispatched */
+
+hrtime_t	zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+hrtime_t	zfs_zone_last_checked = 0;
+hrtime_t	zfs_disk_last_laggard = 0;
+
+/*
+ * Data used to keep track of how often txg sync is running.
+ */
+extern int	zfs_txg_timeout;
+static uint_t	txg_last_check;
+static uint_t	txg_cnt;
+static uint_t	txg_sync_rate;
+
+boolean_t	zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on the zfs_vdev_sync_read_max_active value for the
+ * number of I/Os that can be pending on a device.  If there are more than the
+ * max_active ops already queued up, beyond those already issued to the vdev,
+ * then use zone-based scheduling to get the next synchronous zio.
+ */
+uint32_t	zfs_zone_schedule_thresh = 10;
+
+/*
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define	SCHED_WEIGHT_MAX	20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG syncing,
+ * then throttle for longer than normal. The zone's wait time is multiplied
+ * by the scale (zfs_zone_txg_throttle_scale).
+ */
+int		zfs_zone_txg_throttle_scale = 2;
+hrtime_t	zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
+
+typedef struct {
+	int		zq_qdepth;
+	zio_priority_t	zq_queue;
+	int		zq_priority;
+	int		zq_wt;
+	zoneid_t	zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define	GET_USEC_TIME		(gethrtime() / 1000)
+#define	NANO_TO_MICRO(x)	((x) / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
+ * accounted for.
+ *
+ * If the number of ops is >1 then we can just use that value.  However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system.  We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_cycle_time) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made.  If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t delta;
+	int	gen_cnt;
+
+	/*
+	 * Check if it's time to recompute a new zone count.  If we're still
+	 * collecting data for the current cycle, return the elapsed time.
+	 */
+	delta = unow - cp->cycle_start;
+	if (delta < zfs_zone_cycle_time)
+		return (delta);
+
+	/* A previous cycle is past, compute the new zone count. */
+
+	/*
+	 * Figure out how many generations we have to decay the historical
+	 * count, since multiple cycles may have elapsed since our last IO.
+	 * We depend on int rounding here.
+	 */
+	gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+	/* If more than 5 cycles since the last IO, reset count. */
+	if (gen_cnt > 5) {
+		cp->zone_avg_cnt = 0;
+	} else {
+		/* Update the count. */
+		int	i;
+
+		/*
+		 * If the zone did more than 1 IO, just use its current count
+		 * as the historical value, otherwise decay the historical
+		 * count and factor that into the new historical count.  We
+		 * pick a threshold > 1 so that we don't lose track of IO due
+		 * to int rounding.
+		 */
+		if (cp->cycle_cnt > 1)
+			cp->zone_avg_cnt = cp->cycle_cnt;
+		else
+			cp->zone_avg_cnt = cp->cycle_cnt +
+			    (cp->zone_avg_cnt / 2);
+
+		/*
+		 * If more than one generation has elapsed since the last
+		 * update, decay the values further.
+		 */
+		for (i = 1; i < gen_cnt; i++)
+			cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+	}
+
+	/* A new cycle begins. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+
+	return (0);
+}
+
+/*
+ * Count one IO of type 'op' for the zone, starting a new cycle if due.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+		zonep->zone_rd_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+		zonep->zone_wr_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_LOGICAL_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+		zonep->zone_lwr_ops.cycle_cnt++;
+		break;
+	}
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static boolean_t
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	hrtime_t elapsed = unow - cp->cycle_start;
+	int cycles, n;
+
+	/*
+	 * Still inside the active cycle: samples keep accumulating in
+	 * cycle_cnt/cycle_lat and there is no new average to compute yet.
+	 */
+	if (elapsed < zfs_zone_sys_avg_cycle)
+		return (B_FALSE);
+
+	/*
+	 * One or more whole cycles have passed since this tracker was last
+	 * rolled over.  The integer division deliberately truncates, giving
+	 * the number of complete cycles to account for.
+	 */
+	cycles = (int)(elapsed / zfs_zone_sys_avg_cycle);
+
+	if (cycles <= 5) {
+		/*
+		 * Fold the finished cycle's total latency into the running
+		 * average, with the old average weighing as a single sample.
+		 */
+		cp->sys_avg_lat = (cp->sys_avg_lat + cp->cycle_lat) /
+		    (1 + cp->cycle_cnt);
+
+		/*
+		 * Every additional cycle that went by without an update
+		 * halves the average once more.
+		 */
+		for (n = 1; n < cycles; n++)
+			cp->sys_avg_lat /= 2;
+	} else {
+		/*
+		 * Idle for more than 5 cycles: the old average is stale.
+		 */
+		cp->sys_avg_lat = 0;
+	}
+
+	/* Open a fresh, empty cycle that starts now. */
+	cp->cycle_cnt = 0;
+	cp->cycle_lat = 0;
+	cp->cycle_start = unow;
+
+	return (B_TRUE);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+	sys_lat_cycle_t *tracker;
+
+	if (op == ZFS_ZONE_IOP_READ)
+		tracker = &rd_lat;
+	else if (op == ZFS_ZONE_IOP_WRITE)
+		tracker = &wr_lat;
+	else
+		return;		/* no system latency kept for other op types */
+
+	(void) compute_new_sys_avg(unow, tracker);
+	tracker->cycle_cnt++;
+	tracker->cycle_lat += lat;
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t into_cycle = compute_historical_zone_cnt(unow, cp);
+
+	/*
+	 * A zero return means there is no activity to report for the current
+	 * cycle.  The historical count is already up to date, so that is
+	 * what we hand back.
+	 */
+	if (into_cycle == 0)
+		return (cp->zone_avg_cnt);
+
+	/*
+	 * At or past the half way point of the cycle, the current count is
+	 * representative by itself.
+	 */
+	if (into_cycle >= (zfs_zone_cycle_time / 2))
+		return (cp->cycle_cnt);
+
+	/*
+	 * Before the half way point the current count is still thin, so pad
+	 * it with half of the historical count.
+	 */
+	return (cp->cycle_cnt + (cp->zone_avg_cnt / 2));
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	/*
+	 * If the tracker just rolled over to a new cycle, the freshly computed
+	 * average is the best estimate: the new cycle holds no samples yet.
+	 */
+	if (compute_new_sys_avg(unow, cp))
+		return (cp->sys_avg_lat);
+
+	/*
+	 * Otherwise we're mid-cycle.  Blend the running average with the
+	 * samples gathered so far, counting each current sample 8 times so
+	 * that recent activity dominates the historical value.
+	 */
+	DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+	    uintptr_t, cp->sys_avg_lat,
+	    uintptr_t, cp->cycle_lat,
+	    uintptr_t, cp->cycle_cnt);
+
+	return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+	    (1 + (cp->cycle_cnt * 8)));
+}
+
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+	/* Every op counts toward the issuing zone's per-type tallies. */
+	add_zone_iop(zonep, unow, op);
+	if (op == ZFS_ZONE_IOP_LOGICAL_WRITE)
+		return;		/* logical writes don't feed system latency */
+
+	add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Store the zone's read, physical write and logical write op counts (as of
+ * unow) in *rops, *wops and *lwops, firing a DTrace probe with the values.
+ * Returns non-zero if any of the three counts is non-zero, otherwise 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+    uint_t *lwops)
+{
+	*rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+	*wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+	*lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+	DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+	    uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
+
+	return (*rops | *wops | *lwops); /* bitwise OR: 0 only if all are 0 */
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+	uint_t rd, wr;
+
+	rd = calc_avg_lat(unow, &rd_lat);
+	wr = calc_avg_lat(unow, &wr_lat);
+
+	/*
+	 * A real IO never completes in zero time, and a zero average would
+	 * skew the throttling math.  When no latency has been observed, fall
+	 * back to an assumed floor of 1000 usecs for that operation type;
+	 * any non-zero observed average is passed through untouched.
+	 */
+	*rlat = (rd != 0) ? rd : 1000;
+	*wlat = (wr != 0) ? wr : 1000;
+
+	DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+	    uintptr_t, *wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *stats = arg;
+	uint_t nrd, nwr, nlwr;
+
+	if (zonep->zone_id == GLOBAL_ZONEID ||
+	    get_zone_io_cnt(stats->zi_now, zonep, &nrd, &nwr, &nlwr) == 0) {
+		zonep->zone_io_util = 0;
+		return (0);	/* global zone or idle zone: nothing to add */
+	}
+
+	zonep->zone_io_util = (nrd * stats->zi_avgrlat) +
+	    ((nwr + nlwr) * stats->zi_avgwlat); /* both kinds of write */
+	stats->zi_totutil += zonep->zone_io_util;
+
+	if (zonep->zone_io_util > 0) {
+		stats->zi_totpri += zonep->zone_zfs_io_pri;
+		stats->zi_active++;
+	}
+
+	/*
+	 * sdt:::zfs-zone-utilization
+	 *
+	 *	arg0: zone ID
+	 *	arg1: reads seen in the time window
+	 *	arg2: physical writes seen in the time window
+	 *	arg3: logical writes seen in the time window
+	 *	arg4: utilization computed from the op counts above
+	 *	arg5: the zone's configured I/O priority
+	 */
+	DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+	    uint_t, nrd, uint_t, nwr, uint_t, nlwr,
+	    uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
+
+	return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+	if (zonep->zone_io_delay < zfs_zone_delay_ceiling) /* below the cap */
+		zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+	zonep->zone_io_delay = (zonep->zone_io_delay > zfs_zone_delay_step) ?
+	    zonep->zone_io_delay - zfs_zone_delay_step : 0; /* clamp at 0 */
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zones delay.  Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *stats = arg;
+	uint16_t prev_delay = zonep->zone_io_delay;
+	uint_t share = 0;
+
+	zonep->zone_io_util_above_avg = B_FALSE;
+
+	/*
+	 * Work out this zone's fair share of the total utilization: weighted
+	 * by priority when enabled (and any is set), else split by zone.
+	 */
+	if (zfs_zone_priority_enable && stats->zi_totpri > 0) {
+		share = (stats->zi_totutil * zonep->zone_zfs_io_pri) /
+		    stats->zi_totpri;
+	} else if (stats->zi_active > 0) {
+		share = stats->zi_totutil / stats->zi_active;
+	}
+
+	/*
+	 * Throttle harder only when the zone is over its share and the disk
+	 * is overutilized; a lone active zone is flagged but not slowed.
+	 */
+	if (zonep->zone_io_util > share && stats->zi_overutil) {
+		zonep->zone_io_util_above_avg = B_TRUE;
+
+		if (stats->zi_active > 1)
+			zfs_zone_delay_inc(zonep);
+	} else if (zonep->zone_io_util < share || stats->zi_underutil ||
+	    stats->zi_active <= 1) {
+		zfs_zone_delay_dec(zonep);
+	}
+
+	/*
+	 * sdt:::zfs-zone-throttle
+	 *
+	 *	arg0: zone ID
+	 *	arg1: delay before this adjustment
+	 *	arg2: delay after this adjustment
+	 *	arg3: the zone's fair share of I/O utilization
+	 *	arg4: the zone's actual I/O utilization
+	 */
+	DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+	    uintptr_t, prev_delay, uintptr_t, zonep->zone_io_delay,
+	    uintptr_t, share, uintptr_t, zonep->zone_io_util);
+
+	return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
+{
+	zoneio_stats_t stats;
+	hrtime_t laggard_udelta = 0;
+
+	(void) bzero(&stats, sizeof (stats));
+
+	stats.zi_now = unow;
+	get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+	/* Cap each average at zfs_zone_rw_lat_limit times the other one. */
+	if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+		stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+	else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+		stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+	/* First pass: per-zone utilization plus the system-wide totals. */
+	if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+		return;
+
+	/* Calculate disk utilization for the most recent period. */
+	if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
+		stats.zi_diskutil = 0;
+	} else {
+		stats.zi_diskutil =
+		    ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+		    ((unow - last_checked) * 1000);
+	}
+	zfs_disk_last_rtime = zfs_disk_rtime;
+
+	if (unow > zfs_disk_last_laggard)
+		laggard_udelta = unow - zfs_disk_last_laggard;
+
+	/*
+	 * To minimize porpoising, we have three separate states for our
+	 * assessment of I/O performance:  overutilized, underutilized, and
+	 * neither overutilized nor underutilized.  We will increment the
+	 * throttle if a zone is using more than its fair share _and_ I/O
+	 * is overutilized; we will decrement the throttle if a zone is using
+	 * less than its fair share _or_ I/O is underutilized.
+	 */
+	stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+	    laggard_udelta > zfs_zone_laggard_ancient;
+
+	stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+	    laggard_udelta < zfs_zone_laggard_recent;
+
+	/*
+	 * sdt:::zfs-zone-stats
+	 *
+	 * Statistics observed over the last period:
+	 *
+	 *	arg0: average system read latency
+	 *	arg1: average system write latency
+	 *	arg2: number of active zones
+	 *	arg3: total I/O 'utilization' for all zones
+	 *	arg4: total I/O priority of all active zones
+	 *	arg5: calculated disk utilization
+	 */
+	DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+	    uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+	    uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+	    uintptr_t, stats.zi_diskutil);
+
+	(void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue.  Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+	zone_q_bump_t *best = arg;
+	uint_t queued = zonep->zone_zfs_queued[best->zq_queue];
+	int score;
+
+	/*
+	 * A zone with nothing in this queue is not a candidate, and it stops
+	 * accumulating weight.
+	 */
+	if (queued == 0) {
+		zonep->zone_zfs_weight = 0;
+		return (0);
+	}
+
+	/*
+	 * Age the zone: each pass in which it has ops waiting raises its
+	 * weight, up to SCHED_WEIGHT_MAX.  The weight is reset whenever an
+	 * IO is issued for the zone, so it only builds up for zones that
+	 * keep waiting without being serviced.  Feeding it into the score
+	 * below is what protects those zones from starvation.
+	 */
+	if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
+		zonep->zone_zfs_weight++;
+
+	/*
+	 * Score = (scaled queue depth / ops this zone has queued) *
+	 * configured priority * weight.  The caller has already scaled the
+	 * depth by 10 to limit the damage from integer truncation.
+	 *
+	 * Fewer queued ops means a higher score, so lightly loaded zones
+	 * tend to win unless another zone's configured priority or
+	 * accumulated weight outweighs that.
+	 */
+	score = (best->zq_qdepth / queued) *
+	    zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+
+	/* Not better than the current leader: leave the pick alone. */
+	if (score <= best->zq_priority)
+		return (0);
+
+	/* This zone becomes the new leading contender. */
+	best->zq_zoneid = zonep->zone_id;
+	best->zq_priority = score;
+	best->zq_wt = zonep->zone_zfs_weight;
+
+	return (0);
+}
+
+/*
+ * See if we need to bump a zone's zio to the head of the queue. This is only
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
+ *
+ * For single-threaded synchronous processes a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple processes
+ * in parallel.  This can cause an imbalance in performance if there are zones
+ * with many parallel processes (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded processes, such as interactive tasks in the
+ * shell.  These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result.  This can make the
+ * zone work badly for interactive behavior.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority.  We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p,
+    avl_tree_t *tree)
+{
+	zone_q_bump_t pick;
+	zio_t *head, *cand = NULL;
+	int skipped = 0;
+
+	/* Scale the depth by 10 so the integer priority math rounds less. */
+	pick.zq_queue = p;
+	pick.zq_zoneid = 0;
+	pick.zq_priority = 0;
+	pick.zq_qdepth = qdepth * 10;
+	(void) zone_walk(get_sched_pri_cb, &pick);
+
+	head = avl_first(tree);
+
+	/*
+	 * A zone ID of 0 means the walk selected nothing.  Otherwise find the
+	 * chosen zone's first zio, counting the zios it would jump ahead of.
+	 */
+	if (pick.zq_zoneid != 0) {
+		cand = head;
+		while (cand != NULL && cand->io_zoneid != pick.zq_zoneid) {
+			skipped++;
+			cand = avl_walk(tree, cand, AVL_AFTER);
+		}
+	}
+
+	/* Nothing chosen or nothing found: fall back to the queue head. */
+	if (cand == NULL)
+		return (head);
+
+	if (cand != head) {	/* only trace an actual bump */
+		DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, cand->io_zoneid,
+		    uint_t, skipped, int, pick.zq_priority, int, pick.zq_wt);
+	}
+
+	return (cand);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+	zoneid_t creator = curzone->zone_id;
+
+	zp->io_zoneid = creator;	/* tag with the current zone's ID */
+}
+
+/*
+ * Track and throttle IO operations per zone. Called from:
+ *   - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes
+ *     go through this path)
+ *   - arc_read for read ops that miss the ARC (both dataset and zvol)
+ * For each operation, increment that zone's counter based on the type of
+ * operation, then delay the operation, if necessary.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls.  Those ops go into a TXG which
+ *    we'll count here.  Sometime later a kernel taskq thread (we'll see the
+ *    vdev IO as zone 0) will perform some number of physical writes to commit
+ *    the TXG to disk.  Those writes are not associated with the zone which
+ *    made the write syscalls and the number of operations is not correlated
+ *    between the taskq and the zone. We only see logical writes in this
+ *    function; we see the physical writes in the zfs_zone_zio_start and
+ *    zfs_zone_zio_done functions.
+ * 2) An application opens a file with O_SYNC.  Each write will result in
+ *    an operation which we'll see here plus a low-level vdev write from
+ *    that zone.
+ * 3) An application does write syscalls followed by an fsync().  We'll
+ *    count the writes going into a TXG here.  We'll also see some number
+ *    (usually much smaller, maybe only 1) of low-level vdev writes from this
+ *    zone when the fsync is performed, plus some other low-level vdev writes
+ *    from the taskq in zone 0 (are these metadata writes?).
+ *
+ * 4) In addition to the above, there are misc. system-level writes, such as
+ *    writing out dirty pages to swap, or sync(2) calls, which will be handled
+ *    by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice; first because this function
+ * is always called by a zone thread for logical writes, but then we also will
+ * count the physical writes that are performed at a low level via
+ * zfs_zone_zio_start. Without this, it can look like a non-global zone never
+ * writes (case 1). Depending on when the TXG is synced, the counts may be in
+ * the same sample bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics.  The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through arc_read and we only come into this
+ * function when we have an arc miss.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+	zone_t *zonep = curzone;
+	hrtime_t unow, last_checked;
+	uint16_t wait;
+
+	unow = GET_USEC_TIME;
+
+	/*
+	 * Only bump the counter for logical writes here.  The counters for
+	 * tracking physical IO operations are handled in zfs_zone_zio_done.
+	 */
+	if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+		mutex_enter(&zonep->zone_stg_io_lock);
+		add_iop(zonep, unow, type, 0);
+		mutex_exit(&zonep->zone_stg_io_lock);
+	}
+
+	if (!zfs_zone_delay_enable)
+		return;
+
+	/*
+	 * If the zone's I/O priority is set to zero, don't throttle that zone's
+	 * operations at all.
+	 */
+	if (zonep->zone_zfs_io_pri == 0)
+		return;
+
+	/*
+	 * XXX There's a potential race here in that more than one thread may
+	 * update the zone delays concurrently.  The worst outcome is corruption
+	 * of our data to track each zone's IO, so the algorithm may make
+	 * incorrect throttling decisions until the data is refreshed.
+	 */
+	last_checked = zfs_zone_last_checked;
+	if ((unow - last_checked) > zfs_zone_adjust_time) {
+		zfs_zone_last_checked = unow;
+		zfs_zone_wait_adjust(unow, last_checked);
+	}
+
+	if ((wait = zonep->zone_io_delay) > 0) {
+		/*
+		 * Logical writes are delayed for longer (scaled) while TXG
+		 * syncing is above normal.  NOTE(review): wait is 16 bits.
+		 */
+		if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+		    (txg_cnt > 1 || txg_sync_rate > 1))
+			wait *= zfs_zone_txg_throttle_scale;
+
+		/*
+		 * sdt:::zfs-zone-wait
+		 *
+		 *	arg0: zone ID
+		 *	arg1: type of IO operation
+		 *	arg2: time to delay (in us)
+		 */
+		DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
+		    uintptr_t, type, uintptr_t, wait);
+
+		drv_usecwait(wait);	/* NOTE(review): busy-wait; confirm */
+
+		if (zonep->zone_vfs_stats != NULL) {
+			atomic_inc_64(&zonep->zone_vfs_stats->
+			    zv_delay_cnt.value.ui64);
+			atomic_add_64(&zonep->zone_vfs_stats->
+			    zv_delay_time.value.ui64, wait);
+		}
+	}
+}
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXG's at a high rate due to a heavy
+ * write workload.  We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load.  In this case, the sync rate is going to be 1.  When there
+ * is a heavy write load, TXG's fill up fast and the sync thread will write
+ * the TXG more frequently (perhaps once a second).  In this case the rate
+ * will be > 1.  The sync rate is a lagging indicator since it can be up
+ * to 5 seconds old.  We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_sync_rate to keep track of the previous
+ * 5 second interval.  In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+	uint_t secs = (uint_t)(gethrtime() / NANOSEC);
+
+	txg_cnt++;
+	if ((secs - txg_last_check) < zfs_txg_timeout)
+		return;		/* still inside the current sampling window */
+
+	txg_sync_rate = txg_cnt / 2;	/* publish the finished window's rate */
+	txg_cnt = 0;
+	txg_last_check = secs;
+}
+
+hrtime_t
+zfs_zone_txg_delay(void)
+{
+	/* Zones flagged as above-average IO users get the tunable delay. */
+	if (curzone->zone_io_util_above_avg)
+		return (zfs_zone_txg_delay_nsec);
+	return (MSEC2NSEC(10));		/* all other zones: a fixed 10ms */
+}
+
+/*
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
+ * and is issued.
+ * Keep track of start time for latency calculation in zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+	zone_t	*zonep;
+
+	/*
+	 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
+	 * an actual I/O operation.  Ignore those operations as they relate to
+	 * throttling and scheduling.
+	 */
+	if (zp->io_type == ZIO_TYPE_IOCTL)
+		return;
+
+	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+		return;
+
+	mutex_enter(&zonep->zone_zfs_lock);
+	if (zp->io_type == ZIO_TYPE_READ)
+		kstat_runq_enter(&zonep->zone_zfs_rwstats);
+	zonep->zone_zfs_weight = 0;	/* IO issued: reset sched weight */
+	mutex_exit(&zonep->zone_zfs_lock);
+
+	mutex_enter(&zfs_disk_lock);
+	zp->io_dispatched = gethrtime();
+
+	if (zfs_disk_rcnt++ != 0)	/* disk already had IOs in flight */
+		zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+	zfs_disk_rlastupdate = zp->io_dispatched;
+	mutex_exit(&zfs_disk_lock);
+
+	zone_rele(zonep);	/* pairs with zone_find_by_id() above */
+}
+
+/*
+ * Called from vdev_disk_io_done when an IO completes.
+ * Increment our counter for zone ops.
+ * Calculate the IO latency avg. for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+	zone_t	*zonep;
+	hrtime_t now, unow, udelta;
+
+	if (zp->io_type == ZIO_TYPE_IOCTL)
+		return;
+
+	if (zp->io_dispatched == 0)	/* no dispatch time recorded */
+		return;
+
+	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+		return;
+
+	now = gethrtime();
+	unow = NANO_TO_MICRO(now);
+	udelta = unow - NANO_TO_MICRO(zp->io_dispatched); /* latency, usec */
+
+	mutex_enter(&zonep->zone_zfs_lock);
+
+	/*
+	 * To calculate the wsvc_t average, keep a cumulative sum of all the
+	 * wait time before each I/O was dispatched.  Since most writes are
+	 * asynchronous, only track the wait time for read I/Os.
+	 */
+	if (zp->io_type == ZIO_TYPE_READ) {
+		zonep->zone_zfs_rwstats.reads++;
+		zonep->zone_zfs_rwstats.nread += zp->io_size;
+
+		zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
+		    zp->io_dispatched - zp->io_timestamp;
+
+		kstat_runq_exit(&zonep->zone_zfs_rwstats);
+	} else {
+		zonep->zone_zfs_rwstats.writes++;
+		zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+	}
+
+	mutex_exit(&zonep->zone_zfs_lock);
+
+	mutex_enter(&zfs_disk_lock);
+	zfs_disk_rcnt--;
+	zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+	zfs_disk_rlastupdate = now;
+
+	if (udelta > zfs_zone_laggard_threshold)
+		zfs_disk_last_laggard = unow;	/* latest slow IO */
+
+	mutex_exit(&zfs_disk_lock);
+
+	if (zfs_zone_delay_enable) {
+		mutex_enter(&zonep->zone_stg_io_lock);
+		add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+		    ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+		mutex_exit(&zonep->zone_stg_io_lock);
+	}
+
+	zone_rele(zonep);	/* pairs with zone_find_by_id() above */
+
+	/*
+	 * sdt:::zfs-zone-latency
+	 *
+	 *	arg0: zone ID
+	 *	arg1: type of I/O operation
+	 *	arg2: I/O latency (in us)
+	 */
+	DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
+	    uintptr_t, zp->io_type, uintptr_t, udelta);
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+	zio_priority_t prio = zp->io_priority;
+	zone_t *zone;
+
+	/* Per-zone queued-op counts are kept for the two sync queues only. */
+	if (prio != ZIO_PRIORITY_SYNC_READ && prio != ZIO_PRIORITY_SYNC_WRITE)
+		return;
+
+	ASSERT(prio < 2);	/* prio is used as an array index: 0 or 1 */
+
+	zone = zone_find_by_id(zp->io_zoneid);
+	if (zone == NULL)
+		return;
+
+	mutex_enter(&zone->zone_stg_io_lock);
+	ASSERT(zone->zone_zfs_queued[prio] > 0);
+	if (zone->zone_zfs_queued[prio] != 0)
+		zone->zone_zfs_queued[prio]--;
+	else
+		cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+	mutex_exit(&zone->zone_stg_io_lock);
+	zone_rele(zone);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+	zio_priority_t prio = zp->io_priority;
+	zone_t *zone;
+
+	/* Per-zone queued-op counts are kept for the two sync queues only. */
+	if (prio != ZIO_PRIORITY_SYNC_READ && prio != ZIO_PRIORITY_SYNC_WRITE)
+		return;
+
+	ASSERT(prio < 2);	/* prio is used as an array index: 0 or 1 */
+
+	zone = zone_find_by_id(zp->io_zoneid);
+	if (zone == NULL)
+		return;
+
+	mutex_enter(&zone->zone_stg_io_lock);
+	zone->zone_zfs_queued[prio]++;
+	mutex_exit(&zone->zone_stg_io_lock);
+	zone_rele(zone);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. That function is where zio's are listed
+ * in FIFO order on one of the sync queues, then pulled off (by
+ * vdev_queue_io_remove) and issued.  We potentially do zone-based scheduling
+ * here to find a zone's zio deeper in the sync queue and issue that instead
+ * of simply doing FIFO.
+ *
+ * We only do zone-based zio scheduling for the two synchronous I/O queues
+ * (read & write). These queues are normally serviced in FIFO order but we
+ * may decide to move a zone's zio to the head of the line. A typical I/O
+ * load will be mostly synchronous reads and some asynchronous writes (which
+ * are scheduled differently due to transaction groups). There will also be
+ * some synchronous writes for those apps which want to ensure their data is on
+ * disk. We want to make sure that a zone with a single-threaded app (e.g. the
+ * shell) that is doing synchronous I/O (typically reads) isn't penalized by
+ * other zones which are doing lots of synchronous I/O because they have many
+ * running threads.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx,
+    avl_tree_t *tree)
+{
+	vdev_queue_class_t *vqc = &vq->vq_class[p];
+	zoneid_t prev_zone;
+	uint_t depth;
+	zio_t *next;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	/* Queues other than the two sync ones keep their existing order. */
+	if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+		return (avl_nearest(tree, idx, AVL_AFTER));
+
+	/* The code relies on the sync priorities being 0 and 1. */
+	ASSERT(p < 2);
+
+	prev_zone = vq->vq_last_zone_id;
+	depth = avl_numnodes(tree);
+
+	/*
+	 * Zone-based selection only pays off once the queue has some depth.
+	 * Below the threshold (or with scheduling disabled) simply take the
+	 * next zio in queue order.
+	 */
+	if (zfs_zone_schedule_enable && depth >= zfs_zone_schedule_thresh)
+		next = get_next_zio(vqc, depth, p, tree);
+	else
+		next = avl_nearest(tree, idx, AVL_AFTER);
+
+	vq->vq_last_zone_id = next->io_zoneid;
+
+	/*
+	 * Probe args: the queue depth, the zone last scheduled off this
+	 * queue, the zone owning the zio being scheduled now, and the queue
+	 * (priority) itself.
+	 */
+	DTRACE_PROBE4(zfs__zone__sched, uint_t, depth, uint_t, prev_zone,
+	    uint_t, next->io_zoneid, uint_t, p);
+
+	return (next);
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index e3889b3a30..abbb31a199 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -1802,9 +1803,18 @@ zil_close(zilog_t *zilog)
 	if (lwb != NULL)
 		txg = lwb->lwb_max_txg;
 	mutex_exit(&zilog->zl_lock);
-	if (txg)
+
+	if (zilog_is_dirty(zilog)) {
+		/*
+		 * If we're dirty, always wait for the current transaction --
+		 * our lwb_max_txg may be in the past.
+		 */
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	} else if (txg) {
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
-	ASSERT(!zilog_is_dirty(zilog));
+	}
+
+	VERIFY(!zilog_is_dirty(zilog));
 
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 1acc8b2e6a..bfbcdfb511 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -40,6 +41,7 @@
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
+#include <sys/zfs_zone.h>
 
 /*
  * ==========================================================================
@@ -561,11 +563,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
+		zio->io_zoneid = pio->io_zoneid;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child(pio, zio);
+	} else {
+		zfs_zone_zio_init(zio);
 	}
 
 	return (zio);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 95bb26c211..535bc057b9 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,7 +25,7 @@
  *
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -84,6 +84,7 @@
 #include <sys/zvol.h>
 #include <sys/dumphdr.h>
 #include <sys/zil_impl.h>
+#include <sys/sdt.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_tx.h>
 #include <sys/zfeature.h>
@@ -138,6 +139,11 @@ typedef struct zvol_state {
 #define	ZVOL_EXCL	0x4
 #define	ZVOL_WCE	0x8
 
+#define	VOP_LATENCY_10MS	10000000
+#define	VOP_LATENCY_100MS	100000000
+#define	VOP_LATENCY_1S		1000000000
+#define	VOP_LATENCY_10S		10000000000
+
 /*
  * zvol maximum transfer in one DMU tx.
  */
@@ -1379,6 +1385,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
+	zone_t *zonep = curzone;
+	uint64_t tot_bytes;
+	hrtime_t start, lat;
 
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
@@ -1395,6 +1404,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 		return (error);
 	}
 
+	DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
+	mutex_enter(&zonep->zone_vfs_lock);
+	kstat_runq_enter(&zonep->zone_vfs_rwstats);
+	mutex_exit(&zonep->zone_vfs_lock);
+	start = gethrtime();
+	tot_bytes = 0;
+
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1404,6 +1421,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 		if (bytes > volsize - uio->uio_loffset)
 			bytes = volsize - uio->uio_loffset;
 
+		tot_bytes += bytes;
 		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
@@ -1413,6 +1431,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 		}
 	}
 	zfs_range_unlock(rl);
+
+	mutex_enter(&zonep->zone_vfs_lock);
+	zonep->zone_vfs_rwstats.reads++;
+	zonep->zone_vfs_rwstats.nread += tot_bytes;
+	kstat_runq_exit(&zonep->zone_vfs_rwstats);
+	mutex_exit(&zonep->zone_vfs_lock);
+
+	lat = gethrtime() - start;
+
+	if (lat >= VOP_LATENCY_10MS) {
+		zone_vfs_kstat_t *zvp;
+
+		zvp = zonep->zone_vfs_stats;
+		if (lat < VOP_LATENCY_100MS) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+		} else if (lat < VOP_LATENCY_1S) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+		} else if (lat < VOP_LATENCY_10S) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+		} else {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+		}
+	}
+
+	DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+	    error);
+
 	return (error);
 }
 
@@ -1426,6 +1477,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 	rl_t *rl;
 	int error = 0;
 	boolean_t sync;
+	zone_t *zonep = curzone;
+	uint64_t tot_bytes;
+	hrtime_t start, lat;
 
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
@@ -1442,6 +1496,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 		return (error);
 	}
 
+	DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
+	/*
+	 * For the purposes of VFS kstat consumers, the "waitq" calculation is
+	 * repurposed as the active queue for zvol write operations. There's no
+	 * actual wait queue for zvol operations.
+	 */
+	mutex_enter(&zonep->zone_vfs_lock);
+	kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+	mutex_exit(&zonep->zone_vfs_lock);
+	start = gethrtime();
+	tot_bytes = 0;
+
 	sync = !(zv->zv_flags & ZVOL_WCE) ||
 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 
@@ -1455,6 +1522,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 		if (bytes > volsize - off)	/* don't write past the end */
 			bytes = volsize - off;
 
+		tot_bytes += bytes;
 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
@@ -1472,6 +1540,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 	zfs_range_unlock(rl);
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+	DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+	    error);
+
+	mutex_enter(&zonep->zone_vfs_lock);
+	zonep->zone_vfs_rwstats.writes++;
+	zonep->zone_vfs_rwstats.nwritten += tot_bytes;
+	kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+	mutex_exit(&zonep->zone_vfs_lock);
+
+	lat = gethrtime() - start;
+
+	if (lat >= VOP_LATENCY_10MS) {
+		zone_vfs_kstat_t *zvp;
+
+		zvp = zonep->zone_vfs_stats;
+		if (lat < VOP_LATENCY_100MS) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+		} else if (lat < VOP_LATENCY_1S) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+		} else if (lat < VOP_LATENCY_10S) {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+		} else {
+			atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+			atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+		}
+	}
+
 	return (error);
 }
 
diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h
new file mode 100644
index 0000000000..a790a797d1
--- /dev/null
+++ b/usr/src/uts/common/inet/inet_hash.h
@@ -0,0 +1,37 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _INET_INET_HASH_H
+#define	_INET_INET_HASH_H
+
+/*
+ * Common packet hashing routines shared across MAC, UDP, and others.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	INET_PKT_HASH_L2	0x01
+#define	INET_PKT_HASH_L3	0x02
+#define	INET_PKT_HASH_L4	0x04
+
+extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_INET_HASH_H */
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index bcbc1c4949..b4bff4d7b4 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
 		case SO_REUSEADDR:
 			*i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
 			break;	/* goto sizeof (int) option return */
+		case SO_REUSEPORT:
+			*i1 = connp->conn_reuseport;
+			break;	/* goto sizeof (int) option return */
 		case SO_TYPE:
 			*i1 = connp->conn_so_type;
 			break;	/* goto sizeof (int) option return */
@@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
 	int		error;
 
-	if (connp->conn_family != AF_INET)
+	if (connp->conn_family == AF_INET6 &&
+	    connp->conn_ipversion == IPV4_VERSION) {
+		/*
+		 * Allow certain IPv4 options to be set on an AF_INET6 socket
+		 * if the connection is still IPv4.
+		 */
+		switch (name) {
+		case IP_TOS:
+		case T_IP_TOS:
+		case IP_TTL:
+		case IP_DONTFRAG:
+			break;
+		default:
+			return (EINVAL);
+		}
+	} else if (connp->conn_family != AF_INET) {
 		return (EINVAL);
+	}
 
 	switch (name) {
 	case IP_TTL:
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index f006e83a1f..73081b9c1c 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -12577,6 +12577,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
 	ip_ioctl_cmd_t *ipip = arg;
 	ip_extract_func_t *extract_funcp;
+	ill_t *ill;
 	cmd_info_t ci;
 	int err;
 	boolean_t entered_ipsq = B_FALSE;
@@ -12697,6 +12698,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
 	ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
 
 	/*
+	 * We need to cache the ill_t that we're going to use as the argument
+	 * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
+	 * blown away by calling ipi_func.
+	 */
+	ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
+
+	/*
 	 * A return value of EINPROGRESS means the ioctl is
 	 * either queued and waiting for some reason or has
 	 * already completed.
@@ -12704,9 +12712,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
 	err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
 
 	DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
-	    int, ipip->ipi_cmd,
-	    ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
-	    ipif_t *, ci.ci_ipif);
+	    int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
 	ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
 
 	if (entered_ipsq)
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c
index 85ee142dfc..c350d67c2d 100644
--- a/usr/src/uts/common/inet/ip/ip_attr.c
+++ b/usr/src/uts/common/inet/ip/ip_attr.c
@@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
 	 */
 	if (ixa->ixa_free_flags & IXA_FREE_CRED)
 		crhold(ixa->ixa_cred);
+
+	/*
+	 * There is no cleanup in progress on this new copy.
+	 */
+	ixa->ixa_tcpcleanup = IXATC_IDLE;
 }
 
 /*
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 33a2fa5935..dedb4dadcc 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri)
 {
 	squeue_t *sqp;
 
-	sqp = squeue_create(ip_squeue_worker_wait, pri);
+	sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE);
 	ASSERT(sqp != NULL);
 	if (ip_squeue_create_callback != NULL)
 		ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index bc2173ff24..3a12e58c3a 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*
@@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
 	mutex_exit(&(connfp)->connf_lock);				\
 }
 
-#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
-	conn_t *pconnp = NULL, *nconnp;					\
-	IPCL_HASH_REMOVE((connp));					\
-	mutex_enter(&(connfp)->connf_lock);				\
-	nconnp = (connfp)->connf_head;					\
-	while (nconnp != NULL &&					\
-	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
-		pconnp = nconnp;					\
-		nconnp = nconnp->conn_next;				\
-	}								\
-	if (pconnp != NULL) {						\
-		pconnp->conn_next = (connp);				\
-		(connp)->conn_prev = pconnp;				\
-	} else {							\
-		(connfp)->connf_head = (connp);				\
-	}								\
-	if (nconnp != NULL) {						\
-		(connp)->conn_next = nconnp;				\
-		nconnp->conn_prev = (connp);				\
-	}								\
-	(connp)->conn_fanout = (connfp);				\
-	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
-	    IPCL_BOUND;							\
-	CONN_INC_REF(connp);						\
-	mutex_exit(&(connfp)->connf_lock);				\
-}
+/*
+ * When inserting bound or wildcard entries into the hash, ordering rules are
+ * used to facilitate timely and correct lookups.  The order is as follows:
+ * 1. Entries bound to a specific address
+ * 2. Entries bound to INADDR_ANY
+ * 3. Entries bound to ADDR_UNSPECIFIED
+ * Entries in a category which share conn_lport (such as those using
+ * SO_REUSEPORT) will be ordered such that the newest inserted is first.
+ */
 
-#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
-	conn_t **list, *prev, *next;					\
-	boolean_t isv4mapped =						\
-	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
-	IPCL_HASH_REMOVE((connp));					\
-	mutex_enter(&(connfp)->connf_lock);				\
-	list = &(connfp)->connf_head;					\
-	prev = NULL;							\
-	while ((next = *list) != NULL) {				\
-		if (isv4mapped &&					\
-		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
-		    connp->conn_zoneid == next->conn_zoneid) {		\
-			(connp)->conn_next = next;			\
-			if (prev != NULL)				\
-				prev = next->conn_prev;			\
-			next->conn_prev = (connp);			\
-			break;						\
-		}							\
-		list = &next->conn_next;				\
-		prev = next;						\
-	}								\
-	(connp)->conn_prev = prev;					\
-	*list = (connp);						\
-	(connp)->conn_fanout = (connfp);				\
-	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
-	    IPCL_BOUND;							\
-	CONN_INC_REF((connp));						\
-	mutex_exit(&(connfp)->connf_lock);				\
+void
+ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
+{
+	conn_t *pconnp, *nconnp;
+
+	IPCL_HASH_REMOVE(connp);
+	mutex_enter(&connfp->connf_lock);
+	nconnp = connfp->connf_head;
+	pconnp = NULL;
+	while (nconnp != NULL) {
+		/*
+		 * Walk through entries associated with the fanout until one is
+		 * found which fulfills any of these conditions:
+		 * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
+		 * 2. Listen port the same as connp
+		 */
+		if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
+		    connp->conn_lport == nconnp->conn_lport)
+			break;
+		pconnp = nconnp;
+		nconnp = nconnp->conn_next;
+	}
+	if (pconnp != NULL) {
+		pconnp->conn_next = connp;
+		connp->conn_prev = pconnp;
+	} else {
+		connfp->connf_head = connp;
+	}
+	if (nconnp != NULL) {
+		connp->conn_next = nconnp;
+		nconnp->conn_prev = connp;
+	}
+	connp->conn_fanout = connfp;
+	connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+	CONN_INC_REF(connp);
+	mutex_exit(&connfp->connf_lock);
 }
 
 void
 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 {
-	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+	conn_t **list, *prev, *next;
+	conn_t *pconnp = NULL, *nconnp;
+	boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
+
+	IPCL_HASH_REMOVE(connp);
+	mutex_enter(&connfp->connf_lock);
+	nconnp = connfp->connf_head;
+	pconnp = NULL;
+	while (nconnp != NULL) {
+		if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
+		    isv4mapped && connp->conn_lport == nconnp->conn_lport)
+			break;
+		if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
+		    (isv4mapped ||
+		    connp->conn_lport == nconnp->conn_lport))
+			break;
+
+		pconnp = nconnp;
+		nconnp = nconnp->conn_next;
+	}
+	if (pconnp != NULL) {
+		pconnp->conn_next = connp;
+		connp->conn_prev = pconnp;
+	} else {
+		connfp->connf_head = connp;
+	}
+	if (nconnp != NULL) {
+		connp->conn_next = nconnp;
+		nconnp->conn_prev = connp;
+	}
+	connp->conn_fanout = connfp;
+	connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+	CONN_INC_REF(connp);
+	mutex_exit(&connfp->connf_lock);
 }
 
 /*
@@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		}
 	} else {
 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
@@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp)
 		if (connp->conn_faddr_v4 != INADDR_ANY) {
 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		if (protocol == IPPROTO_RSVP)
 			ill_set_inputfn_all(ipst);
@@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp)
 		connfp = &ipst->ips_ipcl_bind_fanout[
 		    IPCL_BIND_HASH(lport, ipst)];
 		if (connp->conn_laddr_v4 != INADDR_ANY) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		if (cl_inet_listen != NULL) {
 			ASSERT(connp->conn_ipversion == IPV4_VERSION);
@@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp)
 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		break;
 
@@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp)
 		connfp = &ipst->ips_ipcl_bind_fanout[
 		    IPCL_BIND_HASH(lport, ipst)];
 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		if (cl_inet_listen != NULL) {
 			sa_family_t	addr_family;
@@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp)
 		if (connp->conn_faddr_v4 != INADDR_ANY) {
 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		break;
 	}
@@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp)
 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
-			IPCL_HASH_INSERT_BOUND(connfp, connp);
+			ipcl_hash_insert_bound(connfp, connp);
 		} else {
-			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+			ipcl_hash_insert_wildcard(connfp, connp);
 		}
 		break;
 	}
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index c325e8dc26..2ca770ebe9 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
 {
 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
 	    "net", KSTAT_TYPE_NAMED,
-	    sizeof (esp_kstats_t) / sizeof (kstat_named_t),
-	    KSTAT_FLAG_PERSISTENT, stackid);
+	    sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
 
 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
 		return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index f6466434f6..c3139d9288 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_INET_IPCLASSIFIER_H
@@ -293,7 +294,8 @@ struct conn_s {
 		conn_ipv6_recvpathmtu : 1,	/* IPV6_RECVPATHMTU */
 		conn_mcbc_bind : 1,		/* Bound to multi/broadcast */
 
-		conn_pad_to_bit_31 : 12;
+		conn_reuseport : 1,		/* SO_REUSEPORT state */
+		conn_pad_to_bit_31 : 11;
 
 	boolean_t	conn_blocked;		/* conn is flow-controlled */
 
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index f958ca2261..227d2075f8 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -83,6 +83,14 @@ static	int	ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
 static	int	ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
     void *));
 static	int     ipf_hook6 __P((hook_data_t, int, int, void *));
+static	int	ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+    void *));
+static	int	ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+    void *));
+static	int	ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+    void *));
+static	int	ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+    void *));
 extern	int	ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
 extern	int	ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
 
@@ -152,6 +160,16 @@ char *hook6_loop_in_gz = 	"ipfilter_hook6_loop_in_gz";
 char *hook6_loop_out = 		"ipfilter_hook6_loop_out";
 char *hook6_loop_out_gz = 	"ipfilter_hook6_loop_out_gz";
 
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in =		"ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz =		"ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in =		"ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz =		"ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out =		"ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz =	"ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out =		"ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz =	"ipfilter_hookvndl3v6_out_gz";
+
 /* ------------------------------------------------------------------------ */
 /* Function:    ipldetach                                                   */
 /* Returns:     int - 0 == success, else error.                             */
@@ -248,6 +266,31 @@ ipf_stack_t *ifs;
 		ifs->ifs_ipf_ipv4 = NULL;
 	}
 
+	/*
+	 * Remove VND hooks
+	 */
+	if (ifs->ifs_ipf_vndl3v4 != NULL) {
+		UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+		    NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+		UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+		    NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+		if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+			goto detach_failed;
+		ifs->ifs_ipf_vndl3v4 = NULL;
+	}
+
+	if (ifs->ifs_ipf_vndl3v6 != NULL) {
+		UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+		    NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+		UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+		    NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+		if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+			goto detach_failed;
+		ifs->ifs_ipf_vndl3v6 = NULL;
+	}
+
 #undef UNDO_HOOK
 
 #ifdef	IPFDEBUG
@@ -445,6 +488,48 @@ ipf_stack_t *ifs;
 	}
 
 	/*
+	 * Add VND INET hooks
+	 */
+	ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+	if (ifs->ifs_ipf_vndl3v4 == NULL)
+		goto hookup_failed;
+
+	HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+	    hook4_vnd_in, hook4_vnd_in_gz, ifs);
+	HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+	    hook4_vnd_out, hook4_vnd_out_gz, ifs);
+	ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+	    NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+	if (!ifs->ifs_hookvndl3v4_physical_in)
+		goto hookup_failed;
+
+	ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+	    NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+	if (!ifs->ifs_hookvndl3v4_physical_out)
+		goto hookup_failed;
+
+
+	/*
+	 * VND INET6 hooks
+	 */
+	ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+	if (ifs->ifs_ipf_vndl3v6 == NULL)
+		goto hookup_failed;
+
+	HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+	    hook6_vnd_in, hook6_vnd_in_gz, ifs);
+	HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+	    hook6_vnd_out, hook6_vnd_out_gz, ifs);
+	ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+	    NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+	if (!ifs->ifs_hookvndl3v6_physical_in)
+		goto hookup_failed;
+
+	ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+	    NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+	if (!ifs->ifs_hookvndl3v6_physical_out)
+		goto hookup_failed;
+	/*
 	 * Reacquire ipf_global, now it is safe.
 	 */
 	WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -1011,7 +1096,6 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
 
-
         /*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
 	 */
@@ -2045,6 +2129,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
 }
 
 /* ------------------------------------------------------------------------ */
+/* Function:    ipf_hookvndl3_in					    */
+/* Returns:     int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters:  event(I)     - pointer to event                             */
+/*              info(I)      - pointer to hook information for firewalling  */
+/*                                                                          */
+/* The vnd hooks are private hooks to ON. They represent a layer 2          */
+/* datapath generally used to implement virtual machines. The driver sends  */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the            */
+/* traditional packet hook flags.                                           */
+/*                                                                          */
+/* They end up calling the appropriate traditional ip hooks.                */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
 /* Function:    ipf_hook4_loop_in                                           */
 /* Returns:     int - 0 == packet ok, else problem, free packet if not done */
 /* Parameters:  event(I)     - pointer to event                             */
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
 #
 #
 name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..9aa2478c6a 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -125,6 +125,10 @@ struct ipf_stack {
 	hook_t		*ifs_ipfhook6_loop_in;
 	hook_t		*ifs_ipfhook6_loop_out;
 	hook_t		*ifs_ipfhook6_nicevents;
+	hook_t		*ifs_ipfhookvndl3v4_in;
+	hook_t		*ifs_ipfhookvndl3v6_in;
+	hook_t		*ifs_ipfhookvndl3v4_out;
+	hook_t		*ifs_ipfhookvndl3v6_out;
 
 	/* flags to indicate whether hooks are registered. */
 	boolean_t	ifs_hook4_physical_in;
@@ -137,10 +141,16 @@ struct ipf_stack {
 	boolean_t	ifs_hook6_nic_events;
 	boolean_t	ifs_hook6_loopback_in;
 	boolean_t	ifs_hook6_loopback_out;
+	boolean_t	ifs_hookvndl3v4_physical_in;
+	boolean_t	ifs_hookvndl3v6_physical_in;
+	boolean_t	ifs_hookvndl3v4_physical_out;
+	boolean_t	ifs_hookvndl3v6_physical_out;
 
 	int		ifs_ipf_loopback;
 	net_handle_t	ifs_ipf_ipv4;
 	net_handle_t	ifs_ipf_ipv6;
+	net_handle_t	ifs_ipf_vndl3v4;
+	net_handle_t	ifs_ipf_vndl3v6;
 
 	/* ip_auth.c */
 	int			ifs_fr_authsize;
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5d56debc31 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
 /*
  * Destroy things for ipf for one stack.
  */
-/* ARGSUSED */
 static void
 ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
 {
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * This file implements a socket filter used to defer TCP connections.
+ * To defer a connection means to delay the return of accept(3SOCKET)
+ * until at least one byte is ready to be read(2). This filter may be
+ * applied automatically or programmatically through the use of
+ * soconfig(1M) and setsockopt(3SOCKET).
+ */
+
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+#include <sys/note.h>
+#include <sys/taskq.h>
+
+#define	DATAFILT_MODULE "datafilt"
+
+static struct modlmisc dataf_modlmisc = {
+	&mod_miscops,
+	"Kernel data-ready socket filter"
+};
+
+static struct modlinkage dataf_modlinkage = {
+	MODREV_1,
+	&dataf_modlmisc,
+	NULL
+};
+
+static sof_rval_t
+dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph,
+    void *parg, struct sockaddr *laddr, socklen_t laddrlen,
+    struct sockaddr *faddr, socklen_t faddrlen, void **cookiep)
+{
+	_NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen,
+	cookiep));
+	return (SOF_RVAL_DEFER);
+}
+
+static void
+dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr)
+{
+	_NOTE(ARGUNUSED(handle, cookie, cr));
+}
+
+static mblk_t *
+dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags,
+    size_t *lenp)
+{
+	_NOTE(ARGUNUSED(cookie, flags, lenp));
+
+	if (mp != NULL && MBLKL(mp) > 0) {
+		sof_newconn_ready(handle);
+		sof_bypass(handle);
+	}
+
+	return (mp);
+}
+
+static sof_ops_t dataf_ops = {
+	.sofop_attach_passive = dataf_attach_passive_cb,
+	.sofop_detach = dataf_detach_cb,
+	.sofop_data_in = dataf_data_in_cb
+};
+
+int
+_init(void)
+{
+	int err;
+
+	/*
+	 * This module is safe to attach even after some preliminary socket
+	 * setup calls have taken place. See the comment for SOF_ATT_SAFE.
+	 */
+	err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops,
+	    SOF_ATT_SAFE);
+	if (err != 0)
+		return (err);
+	if ((err = mod_install(&dataf_modlinkage)) != 0)
+		(void) sof_unregister(DATAFILT_MODULE);
+
+	return (err);
+}
+
+int
+_fini(void)
+{
+	int err;
+
+	if ((err = sof_unregister(DATAFILT_MODULE)) != 0)
+		return (err);
+
+	return (mod_remove(&dataf_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&dataf_modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 2e08dc359b..1009f0700f 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,7 +23,7 @@
  */
 
 /*
- * Copyright 2012 Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -61,6 +61,10 @@
  * connection are processed on that squeue. The connection ("conn") to
  * squeue mapping is stored in "conn_t" member "conn_sqp".
  *
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
  * Since the processing of the connection cuts across multiple layers
  * but still allows packets for different connnection to be processed on
  * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -244,7 +248,7 @@ squeue_init(void)
 
 /* ARGSUSED */
 squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri, boolean_t isip)
 {
 	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
 
@@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri)
 
 	sqp->sq_enter = squeue_enter;
 	sqp->sq_drain = squeue_drain;
+	sqp->sq_isip = isip;
 
 	return (sqp);
 }
 
 /*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+	kt_did_t worker, poll;
+	mutex_enter(&sqp->sq_lock);
+	VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+	    SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+	worker = sqp->sq_worker->t_did;
+	poll = sqp->sq_poll_thr->t_did;
+	sqp->sq_state |= SQS_EXIT;
+	cv_signal(&sqp->sq_poll_cv);
+	cv_signal(&sqp->sq_worker_cv);
+	mutex_exit(&sqp->sq_lock);
+
+	thread_join(poll);
+	thread_join(worker);
+	kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
  * Bind squeue worker thread to the specified CPU, given by CPU id.
  * If the CPU id  value is -1, bind the worker thread to the value
  * specified in sq_bind field. If a thread is already bound to a
@@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 			 * Handle squeue switching. More details in the
 			 * block comment at the top of the file
 			 */
-			if (connp->conn_sqp == sqp) {
+			if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
 				SQUEUE_DBG_SET(sqp, mp, proc, connp,
 				    tag);
-				connp->conn_on_sqp = B_TRUE;
+				if (sqp->sq_isip == B_TRUE)
+					connp->conn_on_sqp = B_TRUE;
 				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
 				    sqp, mblk_t *, mp, conn_t *, connp);
 				(*proc)(connp, mp, sqp, ira);
 				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
 				    sqp, conn_t *, connp);
-				connp->conn_on_sqp = B_FALSE;
+				if (sqp->sq_isip == B_TRUE) {
+					connp->conn_on_sqp = B_FALSE;
+					CONN_DEC_REF(connp);
+				}
 				SQUEUE_DBG_CLEAR(sqp);
-				CONN_DEC_REF(connp);
 			} else {
 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
 				    connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 				return;
 			}
 		} else {
-			if (ira != NULL) {
+			if (sqp->sq_isip == B_TRUE && ira != NULL) {
 				mblk_t	*attrmp;
 
 				ASSERT(cnt == 1);
@@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 		if (!(sqp->sq_state & SQS_REENTER) &&
 		    (process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
 		    (sqp->sq_run == curthread) && (cnt == 1) &&
-		    (connp->conn_on_sqp == B_FALSE)) {
+		    (sqp->sq_isip == B_FALSE ||
+		    connp->conn_on_sqp == B_FALSE)) {
 			sqp->sq_state |= SQS_REENTER;
 			mutex_exit(&sqp->sq_lock);
 
@@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 			 * Handle squeue switching. More details in the
 			 * block comment at the top of the file
 			 */
-			if (connp->conn_sqp == sqp) {
-				connp->conn_on_sqp = B_TRUE;
+			if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+				SQUEUE_DBG_SET(sqp, mp, proc, connp,
+				    tag);
+				if (sqp->sq_isip == B_TRUE)
+					connp->conn_on_sqp = B_TRUE;
 				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
 				    sqp, mblk_t *, mp, conn_t *, connp);
 				(*proc)(connp, mp, sqp, ira);
 				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
 				    sqp, conn_t *, connp);
-				connp->conn_on_sqp = B_FALSE;
-				CONN_DEC_REF(connp);
+				if (sqp->sq_isip == B_TRUE) {
+					connp->conn_on_sqp = B_FALSE;
+					CONN_DEC_REF(connp);
+				}
+				SQUEUE_DBG_CLEAR(sqp);
 			} else {
 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
 				    connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 #ifdef DEBUG
 		mp->b_tag = tag;
 #endif
-		if (ira != NULL) {
+		if (sqp->sq_isip && ira != NULL) {
 			mblk_t	*attrmp;
 
 			ASSERT(cnt == 1);
@@ -779,7 +818,7 @@ again:
 		mp->b_prev = NULL;
 
 		/* Is there an ip_recv_attr_t to handle? */
-		if (ip_recv_attr_is_mblk(mp)) {
+		if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
 			mblk_t	*attrmp = mp;
 
 			ASSERT(attrmp->b_cont != NULL);
@@ -804,20 +843,25 @@ again:
 
 
 		/*
-		 * Handle squeue switching. More details in the
-		 * block comment at the top of the file
+		 * Handle squeue switching. More details in the block comment at
+		 * the top of the file. Non-IP squeues cannot switch, as there
+		 * is no conn_t.
 		 */
-		if (connp->conn_sqp == sqp) {
+		if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
 			SQUEUE_DBG_SET(sqp, mp, proc, connp,
 			    mp->b_tag);
-			connp->conn_on_sqp = B_TRUE;
+			if (sqp->sq_isip == B_TRUE)
+				connp->conn_on_sqp = B_TRUE;
 			DTRACE_PROBE3(squeue__proc__start, squeue_t *,
 			    sqp, mblk_t *, mp, conn_t *, connp);
 			(*proc)(connp, mp, sqp, ira);
 			DTRACE_PROBE2(squeue__proc__end, squeue_t *,
 			    sqp, conn_t *, connp);
-			connp->conn_on_sqp = B_FALSE;
-			CONN_DEC_REF(connp);
+			if (sqp->sq_isip == B_TRUE) {
+				connp->conn_on_sqp = B_FALSE;
+				CONN_DEC_REF(connp);
+			}
+			SQUEUE_DBG_CLEAR(sqp);
 		} else {
 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
 			    SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp)
 		cv_wait(async, lock);
 		CALLB_CPR_SAFE_END(&cprinfo, lock);
 
+		if (sqp->sq_state & SQS_EXIT) {
+			mutex_exit(lock);
+			thread_exit();
+		}
+
 		ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
 		    SQS_POLL_THR_QUIESCED);
 		if (ctl_state != 0) {
@@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp)
 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
 
+		/* Only IP related squeues should reach this point */
+		VERIFY(sqp->sq_isip == B_TRUE);
+
 poll_again:
 		sq_rx_ring = sqp->sq_rx_ring;
 		sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp)
 	ill_rx_ring_t	*rx_ring;
 
 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
+	VERIFY(sqp->sq_isip == B_TRUE);
 
 	if (sqp->sq_state & SQS_POLL_RESTART) {
 		/* Restart implies a previous quiesce. */
@@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp)
 
 	for (;;) {
 		for (;;) {
+			if (sqp->sq_state & SQS_EXIT) {
+				mutex_exit(lock);
+				thread_exit();
+			}
+
 			/*
 			 * If the poll thread has handed control to us
 			 * we need to break out of the wait.
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
 
 again:
 	sqp = connp->conn_sqp;
+	VERIFY(sqp->sq_isip == B_TRUE);
 
 	mutex_enter(&sqp->sq_lock);
 	if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1487,6 +1546,7 @@ void
 squeue_synch_exit(conn_t *connp)
 {
 	squeue_t *sqp = connp->conn_sqp;
+	VERIFY(sqp->sq_isip == B_TRUE);
 
 	mutex_enter(&sqp->sq_lock);
 	if (sqp->sq_run == curthread) {
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index b2b9973291..6ec2e6b2d7 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2014 by Delphix. All rights reserved.
  */
@@ -134,6 +134,7 @@ typedef	struct tcphdra_s {
 
 struct conn_s;
 struct tcp_listen_cnt_s;
+struct tcp_rg_s;
 
 /*
  * Control structure for each open TCP stream,
@@ -404,6 +405,13 @@ typedef struct tcp_s {
 	struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
 	struct tcp_s **tcp_ptpbhn;
 
+	/*
+	 * Group of tcp_t entries bound to the same address and port via
+	 * SO_REUSEPORT.  The pointer itself is protected by tf_lock in the
+	 * containing tcps_bind_fanout slot.
+	 */
+	struct tcp_rg_s	*tcp_rg_bind;
+
 	uint_t		tcp_maxpsz_multiplier;
 
 	uint32_t	tcp_lso_max; /* maximum LSO payload */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index fba7125690..cf046c968e 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2013,2014 by Delphix. All rights reserved.
  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
@@ -1423,6 +1423,21 @@ tcp_free(tcp_t *tcp)
 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
 
 	/*
+	 * Destroy any association with SO_REUSEPORT group.
+	 */
+	if (tcp->tcp_rg_bind != NULL) {
+		/*
+		 * This is only necessary for connections which enabled
+		 * SO_REUSEPORT but were never bound.  Such connections should
+		 * be the one and only member of the tcp_rg_t to which they
+		 * have been associated.
+		 */
+		VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+		tcp_rg_destroy(tcp->tcp_rg_bind);
+		tcp->tcp_rg_bind = NULL;
+	}
+
+	/*
 	 * If this is a non-STREAM socket still holding on to an upper
 	 * handle, release it. As a result of fallback we might also see
 	 * STREAMS based conns with upper handles, in which case there is
@@ -2054,8 +2069,7 @@ tcp_reinit(tcp_t *tcp)
  * structure!
  */
 static void
-tcp_reinit_values(tcp)
-	tcp_t *tcp;
+tcp_reinit_values(tcp_t *tcp)
 {
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 	conn_t		*connp = tcp->tcp_connp;
diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c
index c6df39b91e..adc201eebb 100644
--- a/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ b/usr/src/uts/common/inet/tcp/tcp_bind.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -55,6 +56,7 @@ static uint32_t tcp_random_anon_port = 1;
 static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
 		    cred_t *cr);
 static in_port_t	tcp_get_next_priv_port(const tcp_t *);
+static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
 
 /*
  * Hash list insertion routine for tcp_t structures. Each hash bucket
@@ -172,6 +174,16 @@ tcp_bind_hash_remove(tcp_t *tcp)
 
 	ASSERT(lockp != NULL);
 	mutex_enter(lockp);
+
+	/* destroy any association with SO_REUSEPORT group */
+	if (tcp->tcp_rg_bind != NULL) {
+		if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+			/* Last one out turns off the lights */
+			tcp_rg_destroy(tcp->tcp_rg_bind);
+		}
+		tcp->tcp_rg_bind = NULL;
+	}
+
 	if (tcp->tcp_ptpbhn) {
 		tcpnext = tcp->tcp_bind_hash_port;
 		if (tcpnext != NULL) {
@@ -636,13 +648,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
 }
 
 /*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
  *
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it.  If not, return
- * the first anonymous port we happen across.  If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it.  If not, return the first anonymous port we
+ * happen across.  If no anonymous ports are available, return 0.
  *
  * In either case, when succeeding update the tcp_t to record the port number
  * and insert it in the bind hash table.
@@ -662,6 +673,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 	int loopmax;
 	conn_t *connp = tcp->tcp_connp;
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
+	boolean_t reuseport = connp->conn_reuseport;
 
 	/*
 	 * Lookup for free addresses is done in a loop and "loopmax"
@@ -698,6 +710,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 		tf_t		*tbf;
 		tcp_t		*ltcp;
 		conn_t		*lconnp;
+		boolean_t	attempt_reuse = B_FALSE;
 
 		lport = htons(port);
 
@@ -724,6 +737,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 			boolean_t not_socket;
 			boolean_t exclbind;
+			boolean_t addrmatch;
 
 			lconnp = ltcp->tcp_connp;
 
@@ -829,22 +843,34 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 			    &lconnp->conn_faddr_v6)))
 				continue;
 
+			addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+			    &lconnp->conn_bound_addr_v6);
+
+			if (addrmatch && reuseport && bind_to_req_port_only &&
+			    (ltcp->tcp_state == TCPS_BOUND ||
+			    ltcp->tcp_state == TCPS_LISTEN)) {
+				/*
+				 * This entry is bound to the exact same
+				 * address and port.  If SO_REUSEPORT is set on
+				 * the calling socket, attempt to reuse this
+				 * binding if it too appears to be willing.
+				 */
+				attempt_reuse = B_TRUE;
+				break;
+			}
+
 			if (!reuseaddr) {
 				/*
-				 * No socket option SO_REUSEADDR.
-				 * If existing port is bound to
-				 * a non-wildcard IP address
-				 * and the requesting stream is
-				 * bound to a distinct
-				 * different IP addresses
-				 * (non-wildcard, also), keep
-				 * going.
+				 * No socket option SO_REUSEADDR.  If an
+				 * existing port is bound to a non-wildcard IP
+				 * address and the requesting stream is bound
+				 * to a distinct different IP address
+				 * (non-wildcard, also), keep going.
 				 */
 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 				    !V6_OR_V4_INADDR_ANY(
 				    lconnp->conn_bound_addr_v6) &&
-				    !IN6_ARE_ADDR_EQUAL(laddr,
-				    &lconnp->conn_bound_addr_v6))
+				    !addrmatch)
 					continue;
 				if (ltcp->tcp_state >= TCPS_BOUND) {
 					/*
@@ -859,27 +885,47 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 				 * socket option SO_REUSEADDR is set on the
 				 * binding tcp_t.
 				 *
-				 * If two streams are bound to
-				 * same IP address or both addr
-				 * and bound source are wildcards
-				 * (INADDR_ANY), we want to stop
-				 * searching.
-				 * We have found a match of IP source
-				 * address and source port, which is
-				 * refused regardless of the
-				 * SO_REUSEADDR setting, so we break.
+				 * If two streams are bound to the same IP
+				 * address or both addr and bound source are
+				 * wildcards (INADDR_ANY), we want to stop
+				 * searching.  We have found a match of IP
+				 * source address and source port, which is
+				 * refused regardless of the SO_REUSEADDR
+				 * setting, so we break.
 				 */
-				if (IN6_ARE_ADDR_EQUAL(laddr,
-				    &lconnp->conn_bound_addr_v6) &&
+				if (addrmatch &&
 				    (ltcp->tcp_state == TCPS_LISTEN ||
 				    ltcp->tcp_state == TCPS_BOUND))
 					break;
 			}
 		}
-		if (ltcp != NULL) {
+		if (ltcp != NULL && !attempt_reuse) {
 			/* The port number is busy */
 			mutex_exit(&tbf->tf_lock);
 		} else {
+			if (attempt_reuse) {
+				int err;
+
+				ASSERT(ltcp != NULL);
+				ASSERT(ltcp->tcp_rg_bind != NULL);
+				ASSERT(tcp->tcp_rg_bind != NULL);
+				ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+				err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+				if (err != 0) {
+					mutex_exit(&tbf->tf_lock);
+					return (0);
+				}
+				/*
+				 * Now that the newly-binding socket has joined
+				 * the existing reuseport group on ltcp, it
+				 * should clean up its own (empty) group.
+				 */
+				VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+				tcp_rg_destroy(tcp->tcp_rg_bind);
+				tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+			}
+
 			/*
 			 * This port is ours. Insert in fanout and mark as
 			 * bound to prevent others from getting the port
@@ -944,3 +990,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 	} while (++count < loopmax);
 	return (0);
 }
+
+/* Max number of members in TCP SO_REUSEPORT group */
+#define	TCP_RG_SIZE_MAX		64
+/* Step size when expanding members array */
+#define	TCP_RG_SIZE_STEP	2
+
+
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+	tcp_rg_t *grp = kmem_alloc(sizeof (*grp), KM_NOSLEEP|KM_NORMALPRI);
+	tcp_t **slots;
+
+	if (grp == NULL)
+		return (NULL);
+	slots = kmem_zalloc(2 * sizeof (*slots), KM_NOSLEEP|KM_NORMALPRI);
+	if (slots == NULL) {
+		kmem_free(grp, sizeof (*grp));
+		return (NULL);
+	}
+	/* The creating tcp_t starts out as the sole, active member. */
+	slots[0] = tcp;
+	grp->tcprg_members = slots;
+	grp->tcprg_size = 2;
+	grp->tcprg_count = grp->tcprg_active = 1;
+	mutex_init(&grp->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+	return (grp);
+}
+
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+	mutex_enter(&rg->tcprg_lock);
+	ASSERT(rg->tcprg_count == 0);	/* group must already be empty */
+	ASSERT(rg->tcprg_active == 0);
+	kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+	mutex_destroy(&rg->tcprg_lock);	/* NOTE(review): lock is still held */
+	kmem_free(rg, sizeof (struct tcp_rg_s));
+}
+
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+	int err = 0;
+
+	mutex_enter(&rg->tcprg_lock);
+	VERIFY(rg->tcprg_size > 0);
+	VERIFY(rg->tcprg_count <= rg->tcprg_size);
+	if (rg->tcprg_count != 0) {
+		cred_t *grp_cr = rg->tcprg_members[0]->tcp_connp->conn_cred;
+		cred_t *new_cr = tcp->tcp_connp->conn_cred;
+
+		if (crgetuid(grp_cr) != crgetuid(new_cr) ||
+		    crgetzoneid(grp_cr) != crgetzoneid(new_cr)) {
+			err = EPERM;
+			goto out;
+		}
+	}
+
+	/* When the member array is full, grow it by one step (capped). */
+	if (rg->tcprg_count == rg->tcprg_size) {
+		unsigned int cur_bytes = rg->tcprg_size * sizeof (tcp_t *);
+		unsigned int grown = rg->tcprg_size + TCP_RG_SIZE_STEP;
+		tcp_t **bigger;
+
+		if (grown > TCP_RG_SIZE_MAX) {
+			err = EINVAL;
+			goto out;
+		}
+		bigger = kmem_zalloc(grown * sizeof (tcp_t *),
+		    KM_NOSLEEP|KM_NORMALPRI);
+		if (bigger == NULL) {
+			err = ENOMEM;
+			goto out;
+		}
+		bcopy(rg->tcprg_members, bigger, cur_bytes);
+		kmem_free(rg->tcprg_members, cur_bytes);
+		rg->tcprg_members = bigger;
+		rg->tcprg_size = grown;
+	}
+	rg->tcprg_members[rg->tcprg_count++] = tcp;
+	rg->tcprg_active++;
+out:
+	mutex_exit(&rg->tcprg_lock);
+	return (err);
+}
+
+/* Remove tcp, which must be a member, from rg; B_TRUE if rg is now empty. */
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+	int i;
+	boolean_t is_empty;
+
+	mutex_enter(&rg->tcprg_lock);
+	for (i = 0; i < rg->tcprg_count; i++) {
+		if (rg->tcprg_members[i] == tcp)
+			break;
+	}
+	VERIFY(i < rg->tcprg_count);	/* a miss would corrupt the array */
+	/* Move the last member into this position */
+	rg->tcprg_count--;
+	rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+	rg->tcprg_members[rg->tcprg_count] = NULL;
+	if (tcp->tcp_connp->conn_reuseport != 0)
+		rg->tcprg_active--;
+	is_empty = (rg->tcprg_count == 0);
+	mutex_exit(&rg->tcprg_lock);
+	return (is_empty);
+}
+
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+	/* Adjust the group's active-member count under its lock. */
+	mutex_enter(&rg->tcprg_lock);
+	if (!is_active)
+		rg->tcprg_active--;
+	else
+		rg->tcprg_active++;
+	mutex_exit(&rg->tcprg_lock);
+}
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index cf8e0c6bd4..7cfdb9a4a2 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  * Copyright (c) 2014 by Delphix. All rights reserved.
  */
 
@@ -99,7 +99,7 @@
  * tcps_time_wait_interval since the period before upper layer closes the
  * connection is not accounted for when tcp_time_wait_append() is called.
  *
- * If uppser layer has closed the connection, call tcp_time_wait_append()
+ * If upper layer has closed the connection, call tcp_time_wait_append()
  * directly.
  *
  */
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 1a5363bedc..835acd1b12 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -62,7 +63,8 @@ opdes_t	tcp_opt_arr[] = {
 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
 	},
 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
@@ -483,6 +485,42 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
 	return (retval);
 }
 
+/*
+ * Enable or disable SO_REUSEPORT on a TCP connection.  Returns 0 on success
+ * (including when nothing changes), EINVAL if the option cannot be enabled in
+ * the connection's current mode or state, or ENOMEM if a group cannot be
+ * allocated.
+ */
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+	tcp_t *tcp = connp->conn_tcp;
+	struct tcp_rg_s *grp = tcp->tcp_rg_bind;
+	boolean_t was_on = (connp->conn_reuseport != 0);
+
+	/* Sockets which fell back to the STREAMS API may not enable this. */
+	if (do_enable && !IPCL_IS_NONSTR(connp))
+		return (EINVAL);
+	/* Asking for the state we are already in is a no-op. */
+	if ((was_on && do_enable) || (!was_on && !do_enable))
+		return (0);
+
+	if (grp != NULL) {
+		/* Already in a group: just adjust its active count. */
+		tcp_rg_setactive(grp, do_enable);
+	} else if (do_enable) {
+		/* Only valid strictly between TCPS_CLOSED and TCPS_BOUND. */
+		if (tcp->tcp_state >= TCPS_BOUND ||
+		    tcp->tcp_state <= TCPS_CLOSED)
+			return (EINVAL);
+		if ((grp = tcp_rg_init(tcp)) == NULL)
+			return (ENOMEM);
+		tcp->tcp_rg_bind = grp;
+	}
+	connp->conn_reuseport = do_enable ? 1 : 0;
+	return (0);
+}
+
 /*
  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
  * Parameters are assumed to be verified by the caller.
@@ -653,6 +691,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 			}
 			*outlenp = inlen;
 			return (0);
+		case SO_REUSEPORT:
+			if (!checkonly) {
+				return (tcp_set_reuseport(connp, *i1 != 0));
+			}
+			return (0);
 		}
 		break;
 	case IPPROTO_TCP:
@@ -769,14 +812,37 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 			if (*i1 == 0) {
 				return (EINVAL);
 			} else if (tcp->tcp_ka_rinterval == 0) {
-				if ((tcp->tcp_ka_abort_thres / *i1) <
-				    tcp->tcp_rto_min ||
-				    (tcp->tcp_ka_abort_thres / *i1) >
-				    tcp->tcp_rto_max)
-					return (EINVAL);
+				/*
+				 * When TCP_KEEPCNT is specified without first
+				 * specifying a TCP_KEEPINTVL, we infer an
+				 * interval based on a tunable specific to our
+				 * stack: the tcp_keepalive_abort_interval.
+				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
+				 * the unlikely event that that has been set.)
+				 * Given the abort interval's default value of
+				 * 480 seconds, low TCP_KEEPCNT values can
+				 * result in intervals that exceed the default
+				 * maximum RTO of 60 seconds.  Rather than
+				 * fail in these cases, we (implicitly) clamp
+				 * the interval at the maximum RTO; if the
+				 * TCP_KEEPCNT is shortly followed by a
+				 * TCP_KEEPINTVL (as we expect), the abort
+				 * threshold will be recalculated correctly --
+				 * and if a TCP_KEEPINTVL is not forthcoming,
+				 * keep-alive will at least operate reasonably
+				 * given the underconfigured state.
+				 */
+				uint32_t interval;
 
-				tcp->tcp_ka_rinterval =
-				    tcp->tcp_ka_abort_thres / *i1;
+				interval = tcp->tcp_ka_abort_thres / *i1;
+
+				if (interval < tcp->tcp_rto_min)
+					interval = tcp->tcp_rto_min;
+
+				if (interval > tcp->tcp_rto_max)
+					interval = tcp->tcp_rto_max;
+
+				tcp->tcp_ka_rinterval = interval;
 			} else {
 				if ((*i1 * tcp->tcp_ka_rinterval) <
 				    tcps->tcps_keepalive_abort_interval_low ||
@@ -953,10 +1019,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
 		}
 		break;
 	case IPPROTO_IP:
-		if (connp->conn_family != AF_INET) {
-			*outlenp = 0;
-			return (EINVAL);
-		}
 		switch (name) {
 		case IP_SEC_OPT:
 			/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index a431bf63d1..8f535a5dd1 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /* This file contains all TCP kernel socket related functions. */
@@ -1022,6 +1023,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 	}
 
 	/*
+	 * Do not allow fallback on connections making use of SO_REUSEPORT.
+	 */
+	if (tcp->tcp_rg_bind != NULL) {
+		freeb(stropt_mp);
+		freeb(ordrel_mp);
+		squeue_synch_exit(connp);
+		return (EINVAL);
+	}
+
+	/*
 	 * Both endpoints must be of the same type (either STREAMS or
 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
 	 * we have to unfuse.
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index b470934da0..6600296b18 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -41,13 +41,13 @@
 #include <inet/tcp_impl.h>
 #include <inet/tcp_cluster.h>
 
-static void	tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
+
+#define	TW_BUCKET(t)					\
+	(((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
+
+#define	TW_BUCKET_NEXT(b)	(((b) + 1) % TCP_TIME_WAIT_BUCKETS)
 
-/*
- * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
- * Running it every 5 seconds seems to give the best results.
- */
-#define	TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
 
 /*
  * Remove a connection from the list of detached TIME_WAIT connections.
@@ -56,17 +56,17 @@ static void	tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
  */
 boolean_t
-tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
+tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
 {
 	boolean_t	locked = B_FALSE;
 
-	if (tcp_time_wait == NULL) {
-		tcp_time_wait = *((tcp_squeue_priv_t **)
+	if (tsp == NULL) {
+		tsp = *((tcp_squeue_priv_t **)
 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
-		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+		mutex_enter(&tsp->tcp_time_wait_lock);
 		locked = B_TRUE;
 	} else {
-		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
+		ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
 	}
 
 	/* 0 means that the tcp_t has not been added to the time wait list. */
@@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
 		ASSERT(tcp->tcp_time_wait_next == NULL);
 		ASSERT(tcp->tcp_time_wait_prev == NULL);
 		if (locked)
-			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+			mutex_exit(&tsp->tcp_time_wait_lock);
 		return (B_FALSE);
 	}
 	ASSERT(TCP_IS_DETACHED(tcp));
 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
+	ASSERT(tsp->tcp_time_wait_cnt > 0);
 
-	if (tcp == tcp_time_wait->tcp_time_wait_head) {
-		ASSERT(tcp->tcp_time_wait_prev == NULL);
-		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
-		if (tcp_time_wait->tcp_time_wait_head != NULL) {
-			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
-			    NULL;
-		} else {
-			tcp_time_wait->tcp_time_wait_tail = NULL;
-		}
-	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
-		ASSERT(tcp->tcp_time_wait_next == NULL);
-		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
-		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
-		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
-	} else {
-		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
-		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
-		tcp->tcp_time_wait_prev->tcp_time_wait_next =
-		    tcp->tcp_time_wait_next;
+	if (tcp->tcp_time_wait_next != NULL) {
 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
 		    tcp->tcp_time_wait_prev;
 	}
+	if (tcp->tcp_time_wait_prev != NULL) {
+		tcp->tcp_time_wait_prev->tcp_time_wait_next =
+		    tcp->tcp_time_wait_next;
+	} else {
+		unsigned int bucket;
+
+		bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+		ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
+		tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
+	}
 	tcp->tcp_time_wait_next = NULL;
 	tcp->tcp_time_wait_prev = NULL;
 	tcp->tcp_time_wait_expire = 0;
+	tsp->tcp_time_wait_cnt--;
 
 	if (locked)
-		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+		mutex_exit(&tsp->tcp_time_wait_lock);
 	return (B_TRUE);
 }
 
@@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
 	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
 	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
 
+
 /*
  * Add a connection to the list of detached TIME_WAIT connections
  * and set its time to expire.
@@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp)
 {
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 	squeue_t	*sqp = tcp->tcp_connp->conn_sqp;
-	tcp_squeue_priv_t *tcp_time_wait =
+	tcp_squeue_priv_t *tsp =
 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
-	hrtime_t firetime = 0;
+	int64_t		now, schedule;
+	unsigned int	bucket;
 
 	tcp_timers_stop(tcp);
 
@@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp)
 	ASSERT(tcp->tcp_ack_tid == 0);
 
 	/* must have happened at the time of detaching the tcp */
+	ASSERT(TCP_IS_DETACHED(tcp));
+	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
 	ASSERT(tcp->tcp_ptpahn == NULL);
 	ASSERT(tcp->tcp_flow_stopped == 0);
 	ASSERT(tcp->tcp_time_wait_next == NULL);
@@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp)
 	ASSERT(tcp->tcp_time_wait_expire == 0);
 	ASSERT(tcp->tcp_listener == NULL);
 
-	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
-	if (IS_LOCAL_HOST(tcp)) {
-		/*
-		 * This is the fastpath for handling localhost connections.
-		 * Since we don't have to worry about packets on the localhost
-		 * showing up after a long network delay, we want to expire
-		 * these quickly so the port range on the localhost doesn't
-		 * get starved by short-running, local apps.
-		 *
-		 * Leave tcp_time_wait_expire at the current time. This
-		 * essentially means the connection is expired now and it will
-		 * clean up the next time tcp_time_wait_collector runs.  We set
-		 * firetime to use a short delay so that if we have to start a
-		 * tcp_time_wait_collector thread below, it runs soon instead
-		 * of after a delay of time_wait_interval. firetime being set
-		 * to a non-0 value is also our indicator that we should add
-		 * this connection to the head of the time wait list (since we
-		 * are already expired) so that its sure to get cleaned up on
-		 * the next run of tcp_time_wait_collector (which expects the
-		 * entries to appear in time-order and stops when it hits the
-		 * first non-expired entry).
-		 */
-		firetime = TCP_TIME_WAIT_DELAY;
-	} else {
-		/*
-		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
-		 * around in practice.  Hence it cannot be 0.  Note that zero
-		 * means that the tcp_t is not in the TIME_WAIT list.
-		 */
-		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
-		    tcps->tcps_time_wait_interval);
+	TCP_DBGSTAT(tcps, tcp_time_wait);
+	mutex_enter(&tsp->tcp_time_wait_lock);
+
+	/*
+	 * Immediately expire loopback connections.  Since there is no worry
+	 * about packets on the local host showing up after a long network
+	 * delay, this is safe and allows much higher rates of connection churn
+	 * for applications operating locally.
+	 *
+	 * This typically bypasses the tcp_free_list fast path due to squeue
+	 * re-entry for the loopback close operation.
+	 */
+	if (tcp->tcp_loopback) {
+		tcp_time_wait_purge(tcp, tsp);
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
 	}
 
-	ASSERT(TCP_IS_DETACHED(tcp));
-	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
-	ASSERT(tcp->tcp_time_wait_next == NULL);
-	ASSERT(tcp->tcp_time_wait_prev == NULL);
-	TCP_DBGSTAT(tcps, tcp_time_wait);
+	/*
+	 * In order to reap TIME_WAITs reliably, we should use a source of time
+	 * that is not adjustable by the user.  While it would be more accurate
+	 * to grab this timestamp before (potentially) sleeping on the
+	 * tcp_time_wait_lock, doing so complicates bucket addressing later.
+	 */
+	now = ddi_get_lbolt64();
 
-	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
-	if (tcp_time_wait->tcp_time_wait_head == NULL) {
-		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
-		tcp_time_wait->tcp_time_wait_head = tcp;
+	/*
+	 * Each squeue uses an arbitrary time offset when scheduling
+	 * expiration timers.  This prevents the bucketing from forcing
+	 * tcp_time_wait_collector to run in lock-step across squeues.
+	 *
+	 * This offset is (re)initialized when a new TIME_WAIT connection is
+	 * added to an squeue which has no connections waiting to expire.
+	 */
+	if (tsp->tcp_time_wait_tid == 0) {
+		ASSERT(tsp->tcp_time_wait_cnt == 0);
+		tsp->tcp_time_wait_offset =
+		    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+	}
+	now -= tsp->tcp_time_wait_offset;
+
+	/*
+	 * Use the netstack-defined timeout, rounded up to the minimum
+	 * time_wait_collector interval.
+	 */
+	schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
+	tcp->tcp_time_wait_expire = schedule;
+
+	/*
+	 * Insert the connection at the head of the appropriate bucket.
+	 */
+	bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+	tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
+	tsp->tcp_time_wait_bucket[bucket] = tcp;
+	if (tcp->tcp_time_wait_next != NULL) {
+		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
+		tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
+	}
+	tsp->tcp_time_wait_cnt++;
+
+	/*
+	 * Round delay up to the nearest bucket boundary.
+	 */
+	schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+	schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+
+	/*
+	 * The newly inserted entry may require a tighter schedule for the
+	 * expiration timer.
+	 */
+	if (schedule < tsp->tcp_time_wait_schedule) {
+		callout_id_t old_tid = tsp->tcp_time_wait_tid;
+
+		tsp->tcp_time_wait_schedule = schedule;
+		tsp->tcp_time_wait_tid =
+		    timeout_generic(CALLOUT_NORMAL,
+		    tcp_time_wait_collector, sqp,
+		    TICK_TO_NSEC(schedule - now),
+		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 
 		/*
-		 * Even if the list was empty before, there may be a timer
-		 * running since a tcp_t can be removed from the list
-		 * in other places, such as tcp_clean_death().  So check if
-		 * a timer is needed.
-		 */
-		if (tcp_time_wait->tcp_time_wait_tid == 0) {
-			if (firetime == 0)
-				firetime = (hrtime_t)
-				    (tcps->tcps_time_wait_interval + 1) *
-				    MICROSEC;
-
-			tcp_time_wait->tcp_time_wait_tid =
-			    timeout_generic(CALLOUT_NORMAL,
-			    tcp_time_wait_collector, sqp, firetime,
-			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
-		}
-		tcp_time_wait->tcp_time_wait_tail = tcp;
-	} else {
-		/*
-		 * The list is not empty, so a timer must be running.  If not,
-		 * tcp_time_wait_collector() must be running on this
-		 * tcp_time_wait list at the same time.
+		 * It is possible for the timer to fire before the untimeout
+		 * action is able to complete.  In that case, the exclusion
+		 * offered by the tcp_time_wait_collector_active flag will
+		 * prevent multiple collector threads from processing records
+		 * simultaneously from the same squeue.
 		 */
-		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
-		    tcp_time_wait->tcp_time_wait_running);
-		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
-		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
-		    TCPS_TIME_WAIT);
-
-		if (firetime == 0) {
-			/* add at end */
-			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
-			    tcp;
-			tcp->tcp_time_wait_prev =
-			    tcp_time_wait->tcp_time_wait_tail;
-			tcp_time_wait->tcp_time_wait_tail = tcp;
-		} else {
-			/* add at head */
-			tcp->tcp_time_wait_next =
-			    tcp_time_wait->tcp_time_wait_head;
-			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
-			    tcp;
-			tcp_time_wait->tcp_time_wait_head = tcp;
-		}
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		(void) untimeout_default(old_tid, 0);
+		return;
+	}
+
+	/*
+	 * Start a fresh timer if none exists.
+	 */
+	if (tsp->tcp_time_wait_schedule == 0) {
+		ASSERT(tsp->tcp_time_wait_tid == 0);
+
+		tsp->tcp_time_wait_schedule = schedule;
+		tsp->tcp_time_wait_tid =
+		    timeout_generic(CALLOUT_NORMAL,
+		    tcp_time_wait_collector, sqp,
+		    TICK_TO_NSEC(schedule - now),
+		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 	}
-	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+	mutex_exit(&tsp->tcp_time_wait_lock);
 }
 
 /*
@@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 	tcp_close_detached(tcp);
 }
 
+
+static void
+tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
+{
+	mblk_t *mp;
+	conn_t *connp = tcp->tcp_connp;
+	kmutex_t *lock;
+
+	ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
+	ASSERT(connp->conn_fanout != NULL);
+
+	lock = &connp->conn_fanout->connf_lock;
+
+	/*
+	 * This is essentially a TIME_WAIT reclaim fast path optimization for
+	 * performance where the connection is checked under the fanout lock
+	 * (so that no one else can get access to the conn_t) that the refcnt
+	 * is 2 (one each for TCP and the classifier hash list).  If that is
+	 * the case and clustering callbacks are not enabled, the conn can be
+	 * removed under the fanout lock and avoid clean-up under the squeue.
+	 *
+	 * This optimization is forgone when clustering is enabled since the
+	 * clustering callback must be made before setting the CONDEMNED flag
+	 * and after dropping all locks.
+	 *
+	 * See the comments in tcp_closei_local for additional information
+	 * regarding the refcnt logic.
+	 */
+	if (mutex_tryenter(lock)) {
+		mutex_enter(&connp->conn_lock);
+		if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
+			ipcl_hash_remove_locked(connp, connp->conn_fanout);
+			/*
+			 * Set the CONDEMNED flag now itself so that the refcnt
+			 * cannot increase due to any walker.
+			 */
+			connp->conn_state_flags |= CONN_CONDEMNED;
+			mutex_exit(&connp->conn_lock);
+			mutex_exit(lock);
+			if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
+				/*
+				 * Add to head of tcp_free_list
+				 */
+				tcp_cleanup(tcp);
+				ASSERT(connp->conn_latch == NULL);
+				ASSERT(connp->conn_policy == NULL);
+				ASSERT(tcp->tcp_tcps == NULL);
+				ASSERT(connp->conn_netstack == NULL);
+
+				tcp->tcp_time_wait_next = tsp->tcp_free_list;
+				tcp->tcp_in_free_list = B_TRUE;
+				tsp->tcp_free_list = tcp;
+				tsp->tcp_free_list_cnt++;
+			} else {
+				/*
+				 * Do not add to tcp_free_list
+				 */
+				tcp_bind_hash_remove(tcp);
+				ixa_cleanup(tcp->tcp_connp->conn_ixa);
+				tcp_ipsec_cleanup(tcp);
+				CONN_DEC_REF(tcp->tcp_connp);
+			}
+
+			/*
+			 * With the fast-path complete, we can bail.
+			 */
+			return;
+		} else {
+			/*
+			 * Fall back to slow path.
+			 */
+			CONN_INC_REF_LOCKED(connp);
+			mutex_exit(&connp->conn_lock);
+			mutex_exit(lock);
+		}
+	} else {
+		CONN_INC_REF(connp);
+	}
+
+	/*
+	 * We can reuse the closemp here since conn has detached (otherwise we
+	 * wouldn't even be in time_wait list). It is safe to change
+	 * tcp_closemp_used without taking a lock as no other thread can
+	 * concurrently access it at this point in the connection lifecycle.
+	 */
+	if (tcp->tcp_closemp.b_prev == NULL) {
+		tcp->tcp_closemp_used = B_TRUE;
+	} else {
+		cmn_err(CE_PANIC,
+		    "tcp_timewait_collector: concurrent use of tcp_closemp: "
+		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
+	}
+
+	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+	mp = &tcp->tcp_closemp;
+	mutex_exit(&tsp->tcp_time_wait_lock);
+	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
+	    SQ_FILL, SQTAG_TCP_TIMEWAIT);
+	mutex_enter(&tsp->tcp_time_wait_lock);
+}
+
 /*
- * Blows away all tcps whose TIME_WAIT has expired. List traversal
- * is done forwards from the head.
- * This walks all stack instances since
- * tcp_time_wait remains global across all stacks.
+ * Purge any tcp_t instances associated with this squeue which have expired
+ * from the TIME_WAIT state.
  */
-/* ARGSUSED */
 void
 tcp_time_wait_collector(void *arg)
 {
 	tcp_t *tcp;
-	int64_t now;
-	mblk_t *mp;
-	conn_t *connp;
-	kmutex_t *lock;
-	boolean_t removed;
-	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
-	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
+	int64_t now, active_schedule, new_schedule;
+	unsigned int idx;
 
 	squeue_t *sqp = (squeue_t *)arg;
-	tcp_squeue_priv_t *tcp_time_wait =
+	tcp_squeue_priv_t *tsp =
 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
 
-	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
-	tcp_time_wait->tcp_time_wait_tid = 0;
-#ifdef DEBUG
-	tcp_time_wait->tcp_time_wait_running = B_TRUE;
-#endif
+	mutex_enter(&tsp->tcp_time_wait_lock);
+
+	/*
+	 * Because of timer scheduling complexity and the fact that the
+	 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
+	 * possible for multiple tcp_time_wait_collector threads to run against
+	 * the same squeue.  This flag is used to exclude other collectors from
+	 * the squeue during execution.
+	 */
+	if (tsp->tcp_time_wait_collector_active) {
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	}
+	tsp->tcp_time_wait_collector_active = B_TRUE;
 
-	if (tcp_time_wait->tcp_free_list != NULL &&
-	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
+	/*
+	 * Purge the free list if necessary
+	 */
+	if (tsp->tcp_free_list != NULL) {
 		TCP_G_STAT(tcp_freelist_cleanup);
-		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
-			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
+		while ((tcp = tsp->tcp_free_list) != NULL) {
+			tsp->tcp_free_list = tcp->tcp_time_wait_next;
 			tcp->tcp_time_wait_next = NULL;
-			tcp_time_wait->tcp_free_list_cnt--;
+			tsp->tcp_free_list_cnt--;
 			ASSERT(tcp->tcp_tcps == NULL);
 			CONN_DEC_REF(tcp->tcp_connp);
 		}
-		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
+		ASSERT(tsp->tcp_free_list_cnt == 0);
 	}
 
 	/*
-	 * In order to reap time waits reliably, we should use a
-	 * source of time that is not adjustable by the user -- hence
-	 * the call to ddi_get_lbolt64().
+	 * If there are no connections pending, clear timer-related state to be
+	 * reinitialized by the next caller.
 	 */
-	now = ddi_get_lbolt64();
-	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
+	if (tsp->tcp_time_wait_cnt == 0) {
+		tsp->tcp_time_wait_offset = 0;
+		tsp->tcp_time_wait_schedule = 0;
+		tsp->tcp_time_wait_tid = 0;
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	}
+
+	/*
+	 * Grab the bucket which we were scheduled to cleanse.
+	 */
+	active_schedule = tsp->tcp_time_wait_schedule;
+	idx = TW_BUCKET(active_schedule - 1);
+	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+retry:
+	tcp = tsp->tcp_time_wait_bucket[idx];
+
+	while (tcp != NULL) {
 		/*
-		 * lbolt64 should not wrap around in practice...  So we can
-		 * do a direct comparison.
+		 * Since the bucket count is sized to prevent wrap-around
+		 * during typical operation and timers are scheduled to process
+		 * buckets with only expired connections, there is only one
+		 * reason to encounter a connection expiring in the future:
+		 * The tcp_time_wait_collector thread has been so delayed in
+		 * its processing that connections have wrapped around the
+		 * timing wheel into this bucket.
+		 *
+		 * In that case, the remaining entries in the bucket can be
+		 * ignored since, being appended sequentially, they should all
+		 * expire in the future.
 		 */
-		if (now < tcp->tcp_time_wait_expire)
+		if (now < tcp->tcp_time_wait_expire) {
 			break;
+		}
 
-		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
-		ASSERT(removed);
+		/*
+		 * Pull the connection out of the bucket.
+		 */
+		VERIFY(tcp_time_wait_remove(tcp, tsp));
 
-		connp = tcp->tcp_connp;
-		ASSERT(connp->conn_fanout != NULL);
-		lock = &connp->conn_fanout->connf_lock;
 		/*
-		 * This is essentially a TW reclaim fast path optimization for
-		 * performance where the timewait collector checks under the
-		 * fanout lock (so that no one else can get access to the
-		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
-		 * the classifier hash list. If ref count is indeed 2, we can
-		 * just remove the conn under the fanout lock and avoid
-		 * cleaning up the conn under the squeue, provided that
-		 * clustering callbacks are not enabled. If clustering is
-		 * enabled, we need to make the clustering callback before
-		 * setting the CONDEMNED flag and after dropping all locks and
-		 * so we forego this optimization and fall back to the slow
-		 * path. Also please see the comments in tcp_closei_local
-		 * regarding the refcnt logic.
+		 * Purge the connection.
 		 *
-		 * Since we are holding the tcp_time_wait_lock, its better
-		 * not to block on the fanout_lock because other connections
-		 * can't add themselves to time_wait list. So we do a
-		 * tryenter instead of mutex_enter.
+		 * While tcp_time_wait_lock will be temporarily dropped as part
+		 * of the process, there is no risk of the timer being
+		 * (re)scheduled while the collector is running since a value
+		 * corresponding to the past is left in tcp_time_wait_schedule.
 		 */
-		if (mutex_tryenter(lock)) {
-			mutex_enter(&connp->conn_lock);
-			if ((connp->conn_ref == 2) &&
-			    (cl_inet_disconnect == NULL)) {
-				ipcl_hash_remove_locked(connp,
-				    connp->conn_fanout);
-				/*
-				 * Set the CONDEMNED flag now itself so that
-				 * the refcnt cannot increase due to any
-				 * walker.
-				 */
-				connp->conn_state_flags |= CONN_CONDEMNED;
-				mutex_exit(lock);
-				mutex_exit(&connp->conn_lock);
-				if (tcp_time_wait->tcp_free_list_cnt <
-				    tcp_free_list_max_cnt) {
-					/* Add to head of tcp_free_list */
-					mutex_exit(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp_cleanup(tcp);
-					ASSERT(connp->conn_latch == NULL);
-					ASSERT(connp->conn_policy == NULL);
-					ASSERT(tcp->tcp_tcps == NULL);
-					ASSERT(connp->conn_netstack == NULL);
-
-					mutex_enter(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp->tcp_time_wait_next =
-					    tcp_time_wait->tcp_free_list;
-					tcp_time_wait->tcp_free_list = tcp;
-					tcp_time_wait->tcp_free_list_cnt++;
-					continue;
-				} else {
-					/* Do not add to tcp_free_list */
-					mutex_exit(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp_bind_hash_remove(tcp);
-					ixa_cleanup(tcp->tcp_connp->conn_ixa);
-					tcp_ipsec_cleanup(tcp);
-					CONN_DEC_REF(tcp->tcp_connp);
-				}
-			} else {
-				CONN_INC_REF_LOCKED(connp);
-				mutex_exit(lock);
-				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
-				mutex_exit(&connp->conn_lock);
-				/*
-				 * We can reuse the closemp here since conn has
-				 * detached (otherwise we wouldn't even be in
-				 * time_wait list). tcp_closemp_used can safely
-				 * be changed without taking a lock as no other
-				 * thread can concurrently access it at this
-				 * point in the connection lifecycle.
-				 */
+		tcp_time_wait_purge(tcp, tsp);
 
-				if (tcp->tcp_closemp.b_prev == NULL)
-					tcp->tcp_closemp_used = B_TRUE;
-				else
-					cmn_err(CE_PANIC,
-					    "tcp_timewait_collector: "
-					    "concurrent use of tcp_closemp: "
-					    "connp %p tcp %p\n", (void *)connp,
-					    (void *)tcp);
-
-				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
-				mp = &tcp->tcp_closemp;
-				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
-				    tcp_timewait_close, connp, NULL,
-				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
-			}
-		} else {
-			mutex_enter(&connp->conn_lock);
-			CONN_INC_REF_LOCKED(connp);
-			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
-			mutex_exit(&connp->conn_lock);
-			/*
-			 * We can reuse the closemp here since conn has
-			 * detached (otherwise we wouldn't even be in
-			 * time_wait list). tcp_closemp_used can safely
-			 * be changed without taking a lock as no other
-			 * thread can concurrently access it at this
-			 * point in the connection lifecycle.
-			 */
+		/*
+		 * Because tcp_time_wait_remove clears the tcp_time_wait_next
+		 * field, the next item must be grabbed directly from the
+		 * bucket itself.
+		 */
+		tcp = tsp->tcp_time_wait_bucket[idx];
+	}
+
+	if (tsp->tcp_time_wait_cnt == 0) {
+		/*
+		 * There is not a need for the collector to schedule a new
+		 * timer if no pending items remain.  The timer state can be
+		 * cleared only if it was untouched while the collector dropped
+		 * its locks during tcp_time_wait_purge.
+		 */
+		if (tsp->tcp_time_wait_schedule == active_schedule) {
+			tsp->tcp_time_wait_offset = 0;
+			tsp->tcp_time_wait_schedule = 0;
+			tsp->tcp_time_wait_tid = 0;
+		}
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	} else {
+		unsigned int nidx;
 
-			if (tcp->tcp_closemp.b_prev == NULL)
-				tcp->tcp_closemp_used = B_TRUE;
-			else
-				cmn_err(CE_PANIC, "tcp_timewait_collector: "
-				    "concurrent use of tcp_closemp: "
-				    "connp %p tcp %p\n", (void *)connp,
-				    (void *)tcp);
-
-			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
-			mp = &tcp->tcp_closemp;
-			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
-			    tcp_timewait_close, connp, NULL,
-			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
+		/*
+		 * Locate the next bucket containing entries.
+		 */
+		new_schedule = active_schedule
+		    + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+		nidx = TW_BUCKET_NEXT(idx);
+		while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
+			if (nidx == idx) {
+				break;
+			}
+			nidx = TW_BUCKET_NEXT(nidx);
+			new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
 		}
-		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+		ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
 	}
 
-	if (tcp_time_wait->tcp_free_list != NULL)
-		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
+	/*
+	 * It is possible that the system is under such dire load that between
+	 * the timer scheduling and TIME_WAIT processing delay, execution
+	 * overran the interval allocated to this bucket.
+	 */
+	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+	if (new_schedule <= now) {
+		/*
+		 * Attempt to right the situation by immediately performing a
+		 * purge on the next bucket.  This loop will continue as needed
+		 * until the schedule can be pushed out ahead of the clock.
+		 */
+		idx = TW_BUCKET(new_schedule - 1);
+		goto retry;
+	}
 
 	/*
-	 * If the time wait list is not empty and there is no timer running,
-	 * restart it.
+	 * Another thread may have snuck in to reschedule the timer while locks
+	 * were dropped during tcp_time_wait_purge.  Defer to the running timer
+	 * if that is the case.
 	 */
-	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
-	    tcp_time_wait->tcp_time_wait_tid == 0) {
-		hrtime_t firetime;
-
-		/* shouldn't be necessary, but just in case */
-		if (tcp->tcp_time_wait_expire < now)
-			tcp->tcp_time_wait_expire = now;
-
-		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
-		/* This ensures that we won't wake up too often. */
-		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
-		tcp_time_wait->tcp_time_wait_tid =
-		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
-		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
-		    CALLOUT_FLAG_ROUNDUP);
+	if (tsp->tcp_time_wait_schedule != active_schedule) {
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
 	}
-#ifdef DEBUG
-	tcp_time_wait->tcp_time_wait_running = B_FALSE;
-#endif
-	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+
+	/*
+	 * Schedule the next timer.
+	 */
+	tsp->tcp_time_wait_schedule = new_schedule;
+	tsp->tcp_time_wait_tid =
+	    timeout_generic(CALLOUT_NORMAL,
+	    tcp_time_wait_collector, sqp,
+	    TICK_TO_NSEC(new_schedule - now),
+	    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
+	tsp->tcp_time_wait_collector_active = B_FALSE;
+	mutex_exit(&tsp->tcp_time_wait_lock);
 }
 
 /*
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index be75f1f663..f4d6c71914 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
@@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
 	/* tunable - 0 */
 	{ "_time_wait_interval", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
-	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
+	    {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} },
 
 	{ "_conn_req_max_q", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
@@ -307,7 +307,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
 
 	{ "_keepalive_interval", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
-	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
+	    {1*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
 
 	{ "_maxpsz_multiplier", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 0f0f915a2b..cb83b91fad 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
  * by setting it to 0.
  */
 #define	TCP_XMIT_LOWATER	4096
-#define	TCP_XMIT_HIWATER	49152
+#define	TCP_XMIT_HIWATER	128000
 #define	TCP_RECV_LOWATER	2048
-#define	TCP_RECV_HIWATER	128000
+#define	TCP_RECV_HIWATER	1048576
 
 /*
  * Bind hash list size and has function.  It has to be a power of 2 for
@@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls;
  */
 #define	TCP_IS_DETACHED(tcp)	((tcp)->tcp_detached)
 
-/* TCP timers related data strucutres.  Refer to tcp_timers.c. */
+/* TCP timers related data structures.  Refer to tcp_timers.c. */
 typedef struct tcp_timer_s {
 	conn_t	*connp;
 	void 	(*tcpt_proc)(void *);
@@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache;
 	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl));	\
 }
 
+
+/*
+ * Maximum TIME_WAIT timeout.  It is defined here (instead of tcp_tunables.c)
+ * so that other parameters can be derived from it.
+ */
+#define	TCP_TIME_WAIT_MAX	(10 * MINUTES)
+
+/*
+ * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
+ * Running it every 5 seconds seems to yield a reasonable balance between
+ * cleanup liveliness and system load.
+ */
+#define	TCP_TIME_WAIT_DELAY	(5 * SECONDS)
+
+#define	TCP_TIME_WAIT_BUCKETS	((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)
+
 /*
  * For scalability, we must not run a timer for every TCP connection
  * in TIME_WAIT state.  To see why, consider (for time wait interval of
  * 1 minutes):
  *	10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's
  *
- * This list is ordered by time, so you need only delete from the head
- * until you get to entries which aren't old enough to delete yet.
- * The list consists of only the detached TIME_WAIT connections.
+ * Since TIME_WAIT expiration occurs on a per-squeue basis, handling
+ * connections from all netstacks on the system, a simple queue is inadequate
+ * for pending entries.  This is because tcp_time_wait_interval may differ
+ * between connections, causing tail insertion to violate expiration order.
+ *
+ * Instead of performing expensive sorting or unnecessary list traversal to
+ * counteract interval variance between netstacks, a timing wheel structure is
+ * used.  The duration covered by each bucket in the wheel is determined by the
+ * TCP_TIME_WAIT_DELAY (5 seconds).  The number of buckets in the wheel is
+ * determined by dividing the maximum TIME_WAIT interval (10 minutes) by
+ * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection.
+ * (Yielding 121 buckets with the current parameters)  When items are inserted
+ * into the set of buckets, they are indexed by using their expiration time
+ * divided by the bucket size, modulo the number of buckets.  This means that
+ * when each bucket is processed, all items within should have expired within
+ * the last TCP_TIME_WAIT_DELAY interval.
+ *
+ * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY
+ * interval to ensure all connections in the pending bucket will be expired, a
+ * per-squeue offset is used when doing TIME_WAIT scheduling.  This offset is
+ * between 0 and the TCP_TIME_WAIT_DELAY and is designed to avoid scheduling
+ * all of the tcp_time_wait_collector threads to run in lock-step.  The offset
+ * is fixed while there are any connections present in the buckets.
  *
  * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is
  * tcps_time_wait_interval).  When the tcp_t is detached (upper layer closes
- * the end point), it is moved to the time wait list and another timer is
- * started (expiry time is set at tcp_time_wait_expire, which is
- * also calculated using tcps_time_wait_interval).  This means that the
- * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't
- * become detached for a long time.
+ * the end point), it is scheduled to be cleaned up by the squeue-driving
+ * tcp_time_wait_collector (also using tcps_time_wait_interval).  This means
+ * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t
+ * doesn't become detached for a long time.
  *
  * The list manipulations (including tcp_time_wait_next/prev)
  * are protected by the tcp_time_wait_lock. The content of the
  * detached TIME_WAIT connections is protected by the normal perimeters.
  *
- * This list is per squeue and squeues are shared across the tcp_stack_t's.
- * Things on tcp_time_wait_head remain associated with the tcp_stack_t
- * and conn_netstack.
- * The tcp_t's that are added to tcp_free_list are disassociated and
- * have NULL tcp_tcps and conn_netstack pointers.
+ * These connection lists are per squeue and squeues are shared across the
+ * tcp_stack_t instances.  Things in a tcp_time_wait_bucket remain associated
+ * with the tcp_stack_t and conn_netstack.  Any tcp_t connections stored in the
+ * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack
+ * pointers.
  */
 typedef struct tcp_squeue_priv_s {
 	kmutex_t	tcp_time_wait_lock;
+	boolean_t	tcp_time_wait_collector_active;
 	callout_id_t	tcp_time_wait_tid;
-	tcp_t		*tcp_time_wait_head;
-	tcp_t		*tcp_time_wait_tail;
+	uint64_t	tcp_time_wait_cnt;
+	int64_t		tcp_time_wait_schedule;
+	int64_t		tcp_time_wait_offset;
+	tcp_t		*tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS];
 	tcp_t		*tcp_free_list;
 	uint_t		tcp_free_list_cnt;
-#ifdef DEBUG
-	/*
-	 * For debugging purpose, true when tcp_time_wait_collector() is
-	 * running.
-	 */
-	boolean_t	tcp_time_wait_running;
-#endif
 } tcp_squeue_priv_t;
 
 /*
@@ -375,6 +406,22 @@ typedef struct tcp_listen_cnt_s {
 	uint32_t	tlc_drop;
 } tcp_listen_cnt_t;
 
+/*
+ * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT.
+ * - tcprg_lock:	Protects the other fields
+ * - tcprg_size:	Allocated size (in entries) of tcprg_members array
+ * - tcprg_count:	Count of occupied tcprg_members slots
+ * - tcprg_active:	Count of members which still have SO_REUSEPORT set
+ * - tcprg_members:	Connections associated with address/port group
+ */
+typedef struct tcp_rg_s {
+	kmutex_t	tcprg_lock;
+	unsigned int	tcprg_size;
+	unsigned int	tcprg_count;
+	unsigned int	tcprg_active;
+	tcp_t		**tcprg_members;
+} tcp_rg_t;
+
 #define	TCP_TLC_REPORT_INTERVAL	(30 * MINUTES)
 
 #define	TCP_DECR_LISTEN_CNT(tcp)					\
@@ -618,6 +665,10 @@ extern in_port_t	tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *,
 			    int, boolean_t, boolean_t, boolean_t);
 extern in_port_t	tcp_update_next_port(in_port_t, const tcp_t *,
 			    boolean_t);
+extern tcp_rg_t *tcp_rg_init(tcp_t *);
+extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *);
+extern void tcp_rg_destroy(tcp_rg_t *);
+extern void tcp_rg_setactive(tcp_rg_t *, boolean_t);
 
 /*
  * Fusion related functions in tcp_fusion.c.
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 5a15aea4de..a88bac932c 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -22,6 +22,7 @@
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -76,7 +77,8 @@
 #include <inet/ipclassifier.h>
 #include <sys/squeue_impl.h>
 #include <inet/ipnet.h>
-#include <sys/ethernet.h>
+#include <sys/vxlan.h>
+#include <inet/inet_hash.h>
 
 #include <sys/tsol/label.h>
 #include <sys/tsol/tnet.h>
@@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
 typedef union T_primitives *t_primp_t;
 
 /*
+ * Various protocols that encapsulate UDP have no real use for the source port.
+ * Instead, they want to vary the source port to provide better equal-cost
+ * multipathing and other systems that use fanout. Consider something like
+ * VXLAN. If you're actually sending multiple different streams to a single
+ * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP,
+ * SRC Port, DST Port) will always be the same.
+ *
+ * Here, we return a port to hash this to, if we know how to hash it. If for
+ * some reason we can't perform an L4 hash, then we just return the default
+ * value, usually the default port. After we determine the hash we transform it
+ * so that it's in the range of [ min, max ].
+ *
+ * We'd like to avoid a pull up for the sake of performing the hash. If the
+ * first mblk_t doesn't have the full protocol header, then we just send it to
+ * the default. If for some reason we have an encapsulated packet that has its
+ * protocol header in different parts of an mblk_t, then we'll go with the
+ * default port. This means that if a driver isn't consistent about how it
+ * generates the frames for a given flow, it will not always be consistently
+ * hashed. That should be an uncommon event.
+ */
+uint16_t
+udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
+    uint16_t def)
+{
+	size_t szused = 0;	/* encap bytes that precede the MAC header */
+	uint64_t hash;
+	uint32_t mod;
+
+	ASSERT(min <= max);
+
+	/* VXLAN is the only encapsulation we know how to look inside. */
+	if (type != UDP_HASH_VXLAN)
+		return (def);
+
+	if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
+		return (def);
+
+	/*
+	 * The following logic is VXLAN specific to get at the header; if we
+	 * have other formats, e.g. GENEVE, then we should ignore this.
+	 *
+	 * The kernel overlay device often puts a first mblk_t for the data
+	 * which is just the encap. If so, then we're going to use that and try
+	 * to avoid a pull up.
+	 */
+	if (MBLKL(mp) == VXLAN_HDR_LEN) {
+		if (mp->b_cont == NULL)
+			return (def);
+		mp = mp->b_cont;
+	} else if (MBLKL(mp) < VXLAN_HDR_LEN) {
+		return (def);
+	} else {
+		szused = VXLAN_HDR_LEN;
+	}
+
+	/*
+	 * Can we hold a MAC header? Only the bytes that follow the encap
+	 * header count, so szused is subtracted from the block length. The
+	 * branches above guarantee that MBLKL(mp) >= szused.
+	 */
+	if (MBLKL(mp) - szused < sizeof (struct ether_header))
+		return (def);
+
+	/*
+	 * We need to lie about the starting offset into the message block for
+	 * convenience. Undo it at the end. We know that inet_pkt_hash() won't
+	 * modify the mblk_t.
+	 */
+	mp->b_rptr += szused;
+	hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 |
+	    INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
+	mp->b_rptr -= szused;
+
+	/* A zero hash falls back to the default port. */
+	if (hash == 0)
+		return (def);
+
+	/* Fold the hash into the inclusive range [ min, max ]. */
+	mod = max - min + 1;
+	return ((hash % mod) + min);
+}
+
+/*
  * Return the next anonymous port in the privileged port range for
  * bind checking.
  *
@@ -1583,6 +1668,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
 			*i1 = udp->udp_rcvhdr ? 1 : 0;
 			mutex_exit(&connp->conn_lock);
 			return (sizeof (int));
+		case UDP_SRCPORT_HASH:
+			mutex_enter(&connp->conn_lock);
+			*i1 = udp->udp_vxlanhash;
+			mutex_exit(&connp->conn_lock);
+			return (sizeof (int));
+		case UDP_SND_TO_CONNECTED:
+			mutex_enter(&connp->conn_lock);
+			*i1 = udp->udp_snd_to_conn ? 1 : 0;
+			mutex_exit(&connp->conn_lock);
+			return (sizeof (int));
 		}
 	}
 	mutex_enter(&connp->conn_lock);
@@ -1718,6 +1813,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
 			udp->udp_rcvhdr = onoff;
 			mutex_exit(&connp->conn_lock);
 			return (0);
+		case UDP_SRCPORT_HASH:
+			/*
+			 * This should have already been verified, but double
+			 * check.
+			 */
+			if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
+				return (error);
+			}
+
+			/* First see if the val is something we understand */
+			if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
+				return (EINVAL);
+
+			if (!checkonly) {
+				mutex_enter(&connp->conn_lock);
+				udp->udp_vxlanhash = *i1;
+				mutex_exit(&connp->conn_lock);
+			}
+			/* Fully handled this option. */
+			return (0);
+		case UDP_SND_TO_CONNECTED:
+			mutex_enter(&connp->conn_lock);
+			udp->udp_snd_to_conn = onoff;
+			mutex_exit(&connp->conn_lock);
+			return (0);
 		}
 		break;
 	}
@@ -2001,13 +2121,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
 	uint32_t	cksum;
 	udp_t		*udp = connp->conn_udp;
 	boolean_t	insert_spi = udp->udp_nat_t_endpoint;
+	boolean_t	hash_srcport = udp->udp_vxlanhash;
 	uint_t		ulp_hdr_len;
+	uint16_t	srcport;
 
 	data_len = msgdsize(data_mp);
 	ulp_hdr_len = UDPH_SIZE;
 	if (insert_spi)
 		ulp_hdr_len += sizeof (uint32_t);
 
+	/*
+	 * If source port hashing is enabled, hash the caller's payload
+	 * (data_mp) now; mp is only assigned by conn_prepend_hdr() below.
+	 */
+	if (hash_srcport == B_TRUE) {
+		srcport = udp_srcport_hash(data_mp, UDP_HASH_VXLAN,
+		    IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+		    ntohs(connp->conn_lport));
+	}
+
 	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
 	    ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
 	if (mp == NULL) {
@@ -2019,7 +2151,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
 	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
 
 	udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
-	udpha->uha_src_port = connp->conn_lport;
+	if (hash_srcport == B_TRUE) {
+		udpha->uha_src_port = htons(srcport);
+	} else {
+		udpha->uha_src_port = connp->conn_lport;
+	}
 	udpha->uha_dst_port = dstport;
 	udpha->uha_checksum = 0;
 	udpha->uha_length = htons(data_len);
@@ -3194,6 +3330,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
 	udp_t		*udp = connp->conn_udp;
 	udp_stack_t	*us = udp->udp_us;
 	boolean_t	insert_spi = udp->udp_nat_t_endpoint;
+	boolean_t	hash_srcport = udp->udp_vxlanhash;
 	uint_t		pktlen;
 	uint_t		alloclen;
 	uint_t		copylen;
@@ -3202,10 +3339,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
 	udpha_t		*udpha;
 	uint32_t	cksum;
 	ip_pkt_t	*ipp;
+	uint16_t	srcport;
 
 	ASSERT(MUTEX_HELD(&connp->conn_lock));
 
 	/*
+	 * If we have source port hashing going on, determine the hash before
+	 * we modify the mblk_t.
+	 */
+	if (hash_srcport == B_TRUE) {
+		srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
+		    IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+		    ntohs(connp->conn_lport));
+	}
+
+	/*
 	 * Copy the header template and leave space for an SPI
 	 */
 	copylen = connp->conn_ht_iphc_len;
@@ -3303,6 +3451,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
 		*((uint32_t *)(udpha + 1)) = 0;
 
 	udpha->uha_dst_port = dstport;
+	if (hash_srcport == B_TRUE)
+		udpha->uha_src_port = htons(srcport);
+
 	return (mp);
 }
 
@@ -5947,10 +6098,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 		else
 			return (error);
 	}
-	if (udp->udp_state == TS_DATA_XFER) {
+
+	/*
+	 * Check if we're allowed to send to a connection on which we've
+	 * already called 'connect'. The posix spec. allows both behaviors but
+	 * historically we've returned an error if already connected. The
+	 * client can allow this via a sockopt.
+	 */
+	if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {
 		UDPS_BUMP_MIB(us, udpOutErrors);
 		return (EISCONN);
 	}
+
 	error = proto_verify_ip_addr(connp->conn_family,
 	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
 	if (error != 0) {
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index c279bb4a21..847e2cdde6 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -292,6 +293,9 @@ opdes_t	udp_opt_arr[] = {
 	},
 { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int),
 	0 },
+{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 },
+{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
+	0 }
 };
 
 /*
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 6a31ce5c22..ebba10c0f7 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef	_UDP_IMPL_H
@@ -178,8 +179,12 @@ typedef	struct udp_s {
 		udp_issocket : 1,	/* socket mode; sockfs is on top */
 		udp_nat_t_endpoint : 1,	/* UDP_NAT_T_ENDPOINT option */
 		udp_rcvhdr : 1,		/* UDP_RCVHDR option */
+		udp_vxlanhash: 1,	/* UDP_SRCPORT_HASH option */
+					/* Because there's only VXLAN, cheat */
+					/* and only use a single bit */
+		udp_snd_to_conn: 1,	/* UDP_SND_TO_CONNECTED option */
 
-		udp_pad_to_bit_31 : 29;
+		udp_pad_to_bit_31 : 27;
 
 	/* Following 2 fields protected by the uf_lock */
 	struct udp_s	*udp_bind_hash; /* Bind hash chain */
diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c
index 00545d2c03..a39110255a 100644
--- a/usr/src/uts/common/io/aggr/aggr_port.c
+++ b/usr/src/uts/common/io/aggr/aggr_port.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc  All rights reserved.
  */
 
 /*
@@ -528,8 +529,13 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on)
 
 	if (on) {
 		mac_rx_clear(port->lp_mch);
+		/* We use the promisc callback because without hardware
+		 * rings, we deliver through flows that will cause duplicate
+		 * delivery of packets when we've flipped into this mode
+		 * to compensate for the lack of hardware MAC matching
+		 */
 		rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL,
-		    aggr_recv_cb, port, &port->lp_mphp,
+		    aggr_recv_promisc_cb, port, &port->lp_mphp,
 		    MAC_PROMISC_FLAGS_NO_TX_LOOP);
 		if (rc != 0) {
 			mac_rx_set(port->lp_mch, aggr_recv_cb, port);
diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c
index 2bdb7872e3..0dfe234b70 100644
--- a/usr/src/uts/common/io/aggr/aggr_recv.c
+++ b/usr/src/uts/common/io/aggr/aggr_recv.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc  All rights reserved.
  */
 
 /*
@@ -68,16 +69,27 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp)
 
 /*
  * Callback function invoked by MAC service module when packets are
- * made available by a MAC port.
+ * made available by a MAC port, both in promisc_on mode and not.
  */
 /* ARGSUSED */
-void
-aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
-    boolean_t loopback)
+static void
+aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+    boolean_t loopback, boolean_t promisc_path)
 {
 	aggr_port_t *port = (aggr_port_t *)arg;
 	aggr_grp_t *grp = port->lp_grp;
 
+	/* In the case where lp_promisc_on has been turned on to
+	 * compensate for insufficient hardware MAC matching and
+	 * hardware rings are not in use we will fall back to
+	 * using flows for delivery which can result in duplicates
+	 * pushed up the stack. Only respect the chosen path.
+	 */
+	if (port->lp_promisc_on != promisc_path) {
+		freemsgchain(mp);
+		return;
+	}
+
 	if (grp->lg_lacp_mode == AGGR_LACP_OFF) {
 		aggr_mac_rx(grp->lg_mh, mrh, mp);
 	} else {
@@ -161,3 +173,19 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
 		}
 	}
 }
+
+/* ARGSUSED */
+void
+aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+    boolean_t loopback)
+{
+	aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE);
+}
+
+/* ARGSUSED */
+void
+aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+    boolean_t loopback)
+{
+	aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE);
+}
diff --git a/usr/src/uts/common/io/axf/ax88172reg.h b/usr/src/uts/common/io/axf/ax88172reg.h
new file mode 100644
index 0000000000..8ca6ebc187
--- /dev/null
+++ b/usr/src/uts/common/io/axf/ax88172reg.h
@@ -0,0 +1,163 @@
+/*
+ * @(#)ax88172reg.h	1.1 09/06/15
+ * Macro definitions for ASIX AX88172 USB to fast ethernet controller
+ * based on ASIX AX88172/88772 data sheet
+ * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com)
+ */
+
+#ifndef __AX88172_H__
+#define __AX88172_H__
+
+/*
+ * Vendor command definitions
+ */
+#define	VCMD_READ_SRAM			0x02
+#define	VCMD_WRITE_RXSRAM		0x03
+#define	VCMD_WRITE_TXSRAM		0x04
+#define	VCMD_SOFTWARE_MII_OP		0x06
+#define	VCMD_READ_MII_REG		0x07
+#define	VCMD_WRITE_MII_REG		0x08
+#define	VCMD_READ_MII_OPMODE		0x09
+#define	VCMD_HARDWARE_MII_OP		0x0a
+#define	VCMD_READ_SROM			0x0b
+#define	VCMD_WRITE_SROM			0x0c
+#define	VCMD_WRITE_SROM_ENABLE		0x0d
+#define	VCMD_WRITE_SROM_DISABLE		0x0e
+#define	VCMD_READ_RXCTRL		0x0f
+#define	VCMD_WRITE_RXCTRL		0x10
+#define	VCMD_READ_IPGS			0x11
+#define	VCMD_WRITE_IPG			0x12
+#define	VCMD_WRITE_IPG1			0x13
+#define	VCMD_WRITE_IPG2			0x14
+#define	VCMD_READ_MCAST_FILTER		0x15
+#define	VCMD_WRITE_MCAST_FILTER		0x16
+#define	VCMD_READ_NODE_ID		0x17
+#define	VCMD_READ_PHY_IDS		0x19
+#define	VCMD_READ_MEDIUM_STATUS		0x1a
+#define	VCMD_WRITE_MEDIUM_STATUS	0x1b
+#define	VCMD_SET_MONITOR_MODE		0x1c
+#define	VCMD_GET_MONITOR_MODE		0x1d
+#define	VCMD_READ_GPIO			0x1e
+#define	VCMD_WRITE_GPIO			0x1f
+
+/* ax88772 only,  currently not supported */
+#define	VCMD_WRITE_IPGS_88772		0x12
+#define	VCMD_READ_NODE_ID_88772		0x13
+#define	VCMD_WRITE_NODE_ID_88772	0x14
+#define	VCMD_WRITE_TEST_REG_88772	0x17
+#define	VCMD_SOFTWARE_RESET_88772	0x20
+#define	VCMD_READ_PHY_SELECT_88772	0x21
+#define	VCMD_WRITE_PHY_SELECT_88772	0x22
+
+
+/*
+ * Register definitions
+ */
+
+/* Rx control register */
+#define	RCR_SO		0x80	/* Start Operation */
+#define	RCR_AP_88772	0x20	/* accept physical address from mcast filter */
+#define	RCR_AM		0x10	/* accept multicast address */
+#define	RCR_AB		0x08	/* accept broadcast address */
+#define	RCR_SEP		0x04	/* save error packet */
+#define	RCR_AMALL	0x02	/* accept all multicast address */
+#define	RCR_PRO		0x01	/* promiscuous, all frames received */
+
+#define	RCR_MFB	0x0300
+#define		RCR_MFB_SHIFT	8
+#define		RCR_MFB_2K	(0U << RCR_MFB_SHIFT)
+#define		RCR_MFB_4K	(1U << RCR_MFB_SHIFT)
+#define		RCR_MFB_8K	(2U << RCR_MFB_SHIFT)
+#define		RCR_MFB_16K	(3U << RCR_MFB_SHIFT)
+
+#define	RCR_BITS	\
+	"\020"	\
+	"\010SO"	\
+	"\006AP"	\
+	"\005AM"	\
+	"\004AB"	\
+	"\003SEP"	\
+	"\002AMALL"	\
+	"\001PRO"
+
+/* Medium status register */
+#define	MSR_SM		0x1000	/* super mac support */
+#define	MSR_SBP		0x0800	/* stop backpressure */
+#define	MSR_PS		0x0200	/* port speed in mii mode */
+#define	MSR_RE		0x0100	/* rx enable */
+#define	MSR_PF		0x0080	/* check only length/type for pause frame */
+#define	MSR_JFE		0x0040	/* jumbo frame enable */
+#define	MSR_TFC		0x0020	/* tx flow control enable */
+#define	MSR_RFC		0x0010	/* rx flow control enable (178) */
+#define	MSR_FCEN	0x0010	/* flow control enable (172/772) */
+#define	MSR_ENCK	0x0008	/* Enable GTX_CLK and TXC clock output (178) */
+#define	MSR_TXABT	0x0004	/* Tx abort allow, always set */
+#define	MSR_FDPX	0x0002	/* full duplex */
+#define	MSR_GM		0x0001	/* Gigabit mode (178) */
+
+#define	MSR_BITS	\
+	"\020"	\
+	"\015SM"	\
+	"\014SBP"	\
+	"\012PS"	\
+	"\011RE"	\
+	"\005FCEN"	\
+	"\004ENCK"	\
+	"\003TXABT"	\
+	"\002FDPX"	\
+	"\001GM"
+
+/* monitor mode register */
+#define	MMR_RWMP	0x04	/* remote wakeup by magic pkt */
+#define	MMR_RWLU	0x02	/* remote wakeup by linkup */
+#define	MMR_MOM		0x01	/* monitor mode 1:en, 0:dis */
+
+#define	MMR_BITS	\
+	"\020"	\
+	"\003RWMP"	\
+	"\002RWLU"	\
+	"\001MOM"
+
+/* GPIO register */
+#define	GPIO_RSE	0x80	/* reload serial eeprom (88772)*/
+#define	GPIO_DATA2	0x20
+#define	GPIO_EN2	0x10
+#define	GPIO_DATA1	0x08
+#define	GPIO_EN1	0x04
+#define	GPIO_DATA0	0x02
+#define	GPIO_EN0	0x01
+
+#define	GPIO_BITS	\
+	"\020"		\
+	"\010RSE"	\
+	"\006DATA2"	\
+	"\005EN2"	\
+	"\004DATA1"	\
+	"\003EN1"	\
+	"\002DATA0"	\
+	"\001EN0"
+
+/* Software reset register */
+#define	SWRST_IPPD	0x40	/* internal phy power down control */
+#define	SWRST_IPRL	0x20	/* internal phy reset control */
+#define	SWRST_BZ	0x10	/* force Bulk In to return zero-length pkt */
+#define	SWRST_PRL	0x08	/* external phy reset pin level */
+#define	SWRST_PRTE	0x04	/* external phy tri-state enable */
+#define	SWRST_RT	0x02	/* clear frame length error for Bulk-Out */
+#define	SWRST_RR	0x01	/* clear frame length error for Bulk-In */
+
+#define	SWRST_BITS	\
+	"\020"		\
+	"\007IPPD"	\
+	"\006IPRL"	\
+	"\005BZ"	\
+	"\004PRL"	\
+	"\003PRTE"	\
+	"\002RT"	\
+	"\001RR"
+
+/* Software PHY Select Status register */
+#define	SPSS_ASEL	0x02	/* 1:auto select 0:manual select */
+#define	SPSS_PSEL	0x01	/* 1:internal phy, 0:external (when ASEL=0) */
+
+#endif /* __AX88172_H__ */
diff --git a/usr/src/uts/common/io/axf/axf_usbgem.c b/usr/src/uts/common/io/axf/axf_usbgem.c
new file mode 100644
index 0000000000..28963f6849
--- /dev/null
+++ b/usr/src/uts/common/io/axf/axf_usbgem.c
@@ -0,0 +1,1539 @@
+/*
+ * axf_usbgem.c : ASIX AX88172/772 USB to Fast Ethernet Driver for Solaris
+ *
+ * Copyright (c) 2004-2012 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#pragma ident "@(#)axf_usbgem.c	1.3 12/02/09"
+
+/*
+ *  Changelog:
+ */
+
+/*
+ * TODO
+ * handle RXMODE_ENABLE in set_rx_filter()
+ */
+/* ======================================================= */
+
+/*
+ * Solaris system header files and macros
+ */
+
+/* minimum kernel headers for drivers */
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/byteorder.h>
+
+/* ethernet stuff */
+#include <sys/ethernet.h>
+
+/* interface card depend stuff */
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strlog.h>
+#include <sys/usb/usba.h>
+#include "usbgem.h"
+
+/* hardware stuff */
+#include "usbgem_mii.h"
+#include "ax88172reg.h"
+
+char	ident[] = "ax88x72 usbnic driver v" VERSION;
+
+/*
+ * Useful macros
+ */
+#define	CHECK_AND_JUMP(err, label)	if (err != USB_SUCCESS) goto label
+#define	LE16P(p)	((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0])
+
+#define	AX88172(dp)	\
+	(((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88172)
+
+#define	AX88772(dp)	\
+	(((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88772)
+
+/*
+ * Debugging
+ */
+#ifdef DEBUG_LEVEL
+static int axf_debug = DEBUG_LEVEL;
+#define	DPRINTF(n, args)	if (axf_debug > (n)) cmn_err args
+#else
+#define	DPRINTF(n, args)
+#endif
+
+/*
+ * Our configuration for ax88172
+ */
+/* timeouts */
+#define	ONESEC		(drv_usectohz(1*1000000))
+
+/*
+ * RX/TX buffer size
+ */
+
+/*
+ * Local device definitions
+ */
+struct chip_info {
+	uint16_t	vid;	/* usb vendor id */
+	uint16_t	pid;	/* usb product id */
+	int		type;
+	uint8_t		gpio_reset[2];
+	uint8_t		gpio_speed[2];
+	uint8_t		gpio_duplex[2];
+	char		*name;
+#define	CHIP_TYPE_AX88172	0
+#define	CHIP_TYPE_AX88772	1
+#define	CHIP_TYPE_AX88178	2
+};
+
+#define	GPIO_DEFAULT	{0x00, 0x15}, {0, 0}, {0, 0}
+struct chip_info chiptbl_88x7x[] = {
+/* AX88172 */
+{
+	/* Planex UE2-100TX, Hawking UF200, TrendNet TU2-ET100 */
+	0x07b8, 0x420a, CHIP_TYPE_AX88172,
+
+	/*
+	 * the default setting covers below:
+	 * gpio bit2 has to be 0 and gpio bit0 has to be 1
+	 */
+	{0, 0},
+	{GPIO_EN1, GPIO_DATA1 | GPIO_EN1},
+	{0, 0},
+	"Planex UE2-100TX",	/* tested */
+},
+{
+	0x2001, 0x1a00, CHIP_TYPE_AX88172,
+	{0x9f, 0x9e}, {0, 0}, {0, 0},
+	"D-Link dube100",	/* XXX */
+},
+{
+	0x077b, 0x2226, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Linksys USB200M",
+},
+{
+	0x0846, 0x1040, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Netgear FA120",
+},
+{
+	0x0b95, 0x1720, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Intellinet, ST Lab USB Ethernet",
+},
+{
+	0x08dd, 0x90ff, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Billionton Systems, USB2AR",
+},
+{
+	0x0557, 0x2009, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"ATEN UC210T",
+},
+{
+	0x0411, 0x003d, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Buffalo LUA-U2-KTX",
+},
+{
+	0x6189, 0x182d, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Sitecom LN-029 USB 2.0 10/100 Ethernet adapter",
+},
+{
+	0x07aa, 0x0017, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"corega FEther USB2-TX",
+},
+{
+	0x1189, 0x0893, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"Surecom EP-1427X-2",
+},
+{
+	0x1631, 0x6200, CHIP_TYPE_AX88172,
+	GPIO_DEFAULT,
+	"goodway corp usb gwusb2e",
+},
+/* AX88772 and AX88178 */
+{
+	0x13b1, 0x0018, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"Linksys USB200M rev.2",
+},
+{
+	0x1557, 0x7720, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"0Q0 cable ethernet",
+},
+{
+	0x07d1, 0x3c05, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"DLink DUB E100 ver B1",
+},
+{
+	0x2001, 0x3c05, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"DLink DUB E100 ver B1(2)",
+},
+{
+	0x05ac, 0x1402, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"Apple Ethernet USB Adapter",
+},
+{
+	0x1737, 0x0039, CHIP_TYPE_AX88178,
+	{0, 0}, {0, 0}, {0, 0},
+	"Linksys USB1000",
+},
+{
+	0x0411, 0x006e, CHIP_TYPE_AX88178,
+	{0, 0}, {0, 0}, {0, 0},
+	"Buffalo LUA-U2-KGT/LUA-U2-GT",
+},
+{
+	0x04bb, 0x0930, CHIP_TYPE_AX88178,
+	{0, 0}, {0, 0}, {0, 0},
+	"I/O DATA ETG-US2",
+},
+{
+	0x050d, 0x5055, CHIP_TYPE_AX88178,
+	{0, 0}, {0, 0}, {0, 0},
+	"Belkin F5D5055",
+},
+{
+	/* generic ax88772 must be the last entry */
+	/* planex UE-200TX-G */
+	0x0b95, 0x7720, CHIP_TYPE_AX88772,
+	{0, 0}, {0, 0}, {0, 0},
+	"ASIX AX88772/AX88178",	/* tested */
+},
+};
+
+#define	CHIPTABLESIZE	(sizeof (chiptbl_88x7x) / sizeof (struct chip_info))
+
+struct axf_dev {
+	/*
+	 * Misc HW information
+	 */
+	struct chip_info	*chip;
+	uint8_t			ipg[3];
+	uint8_t			gpio;
+	uint16_t		rcr;
+	uint16_t		msr;
+	uint8_t			last_link_state;
+	boolean_t		phy_has_reset;
+};
+
+/*
+ * private functions
+ */
+
+/* mii operations */
+static uint16_t axf_mii_read(struct usbgem_dev *, uint_t, int *errp);
+static void axf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp);
+
+/* nic operations */
+static int axf_reset_chip(struct usbgem_dev *);
+static int axf_init_chip(struct usbgem_dev *);
+static int axf_start_chip(struct usbgem_dev *);
+static int axf_stop_chip(struct usbgem_dev *);
+static int axf_set_media(struct usbgem_dev *);
+static int axf_set_rx_filter(struct usbgem_dev *);
+static int axf_get_stats(struct usbgem_dev *);
+static void  axf_interrupt(struct usbgem_dev *, mblk_t *);
+
+/* packet operations */
+static mblk_t *axf_tx_make_packet(struct usbgem_dev *, mblk_t *);
+static mblk_t *axf_rx_make_packet(struct usbgem_dev *, mblk_t *);
+
+/* =============================================================== */
+/*
+ * I/O functions
+ */
+/* =============================================================== */
+#define	OUT(dp, req, val, ix, len, buf, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ (req),	\
+	/* wValue */   (val),	\
+	/* wIndex */   (ix),	\
+	/* wLength */  (len),	\
+	/* value */    (buf),	\
+	/* size */     (len))) != USB_SUCCESS) goto label
+
+#define	IN(dp, req, val, ix, len, buf, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ (req),	\
+	/* wValue */   (val),	\
+	/* wIndex */   (ix),	\
+	/* wLength */  (len),	\
+	/* valuep */   (buf),	\
+	/* size */     (len))) != USB_SUCCESS) goto label
+
+/* =============================================================== */
+/*
+ * Hardware manipulation
+ */
+/* =============================================================== */
+static int
+axf_reset_phy(struct usbgem_dev *dp)
+{
+	uint8_t	phys[2];
+	uint8_t	val8;
+	int	err;
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	if (AX88172(dp)) {
+		delay(drv_usectohz(5000));
+		IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr);
+
+		DPRINTF(0, (CE_CONT, "!%s: %s: gpio 0x%b",
+		    dp->name, __func__, val8, GPIO_BITS));
+
+		/* reset MII PHY */
+		val8 = lp->chip->gpio_reset[1]
+		    | lp->chip->gpio_speed[dp->speed]
+		    | lp->chip->gpio_duplex[dp->full_duplex];
+
+		OUT(dp, VCMD_WRITE_GPIO,
+		    val8, 0, 0, NULL, &err, usberr);
+		delay(drv_usectohz(5000));
+
+		val8 = lp->chip->gpio_reset[0]
+		    | lp->chip->gpio_speed[dp->speed]
+		    | lp->chip->gpio_duplex[dp->full_duplex];
+
+		OUT(dp, VCMD_WRITE_GPIO,
+		    val8, 0, 0, NULL, &err, usberr);
+		delay(drv_usectohz(5000));
+	} else {
+		lp->gpio = GPIO_RSE | GPIO_DATA2 | GPIO_EN2;
+		OUT(dp, VCMD_WRITE_GPIO, lp->gpio, 0,
+		    0, NULL, &err, usberr);
+		drv_usecwait(1000);
+
+		OUT(dp, VCMD_WRITE_PHY_SELECT_88772,
+		    dp->mii_phy_addr == 16 ? 1 : 0, 0, 0, NULL, &err, usberr);
+
+		OUT(dp, VCMD_SOFTWARE_RESET_88772,
+		    SWRST_IPPD | SWRST_PRL, 0, 0, NULL, &err, usberr);
+		delay(drv_usectohz(150*1000));
+		OUT(dp, VCMD_SOFTWARE_RESET_88772,
+		    0, 0, 0, NULL, &err, usberr);
+
+		OUT(dp, VCMD_SOFTWARE_RESET_88772,
+		    dp->mii_phy_addr == 16 ? SWRST_IPRL : SWRST_PRTE,
+		    0, 0, NULL, &err, usberr);
+		delay(drv_usectohz(150*1000));
+	}
+
+
+	return (USB_SUCCESS);
+
+usberr:
+	return (USB_FAILURE);
+}
+
+static int
+axf_reset_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+
+	if (AX88172(dp)) {
+		/* there are no ways to reset nic */
+		return (USB_SUCCESS);
+	}
+#ifdef NEVER
+	OUT(dp, VCMD_SOFTWARE_RESET_88772,
+	    SWRST_RR | SWRST_RT, 0, 0, NULL, &err, usberr);
+	OUT(dp, VCMD_SOFTWARE_RESET_88772,
+	    0, 0, 0, NULL, &err, usberr);
+usberr:
+#endif
+	return (err);
+}
+
+/*
+ * Set up the chip: fetch the default rx control value (leaving Start
+ * Operation cleared), pick the medium status bits, apply the media settings
+ * and, on ax88172, write the IPG registers. Returns USB_SUCCESS, or the
+ * status of the first request that failed.
+ */
+static int
+axf_init_chip(struct usbgem_dev *dp)
+{
+	int		err = USB_SUCCESS;
+	uint8_t		buf[2];
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* rx control register: read default value */
+	if (!AX88172(dp)) {
+		/* clear rx control */
+		OUT(dp, VCMD_WRITE_RXCTRL, 0, 0, 0, NULL, &err, usberr);
+	}
+
+	IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr);
+	lp->rcr = LE16P(buf);
+	DPRINTF(0, (CE_CONT, "!%s: %s: rcr(default):%b",
+	    dp->name, __func__, lp->rcr, RCR_BITS));
+
+	/* keep Start Operation clear in the cached value */
+	lp->rcr &= ~RCR_SO;
+
+	/* Media status register */
+	if (AX88172(dp)) {
+#ifdef notdef
+		lp->msr = MSR_TXABT;
+#else
+		lp->msr = 0;
+#endif
+	} else {
+		lp->msr = MSR_RE | MSR_TXABT;
+	}
+	DPRINTF(0, (CE_CONT, "!%s: %s: msr:%b",
+	    dp->name, __func__, lp->msr, MSR_BITS));
+	err = axf_set_media(dp);
+	CHECK_AND_JUMP(err, usberr);
+
+	/* write IPG0-2 registers */
+	if (AX88172(dp)) {
+		OUT(dp, VCMD_WRITE_IPG, lp->ipg[0], 0, 0, NULL, &err, usberr);
+		OUT(dp, VCMD_WRITE_IPG1, lp->ipg[1], 0, 0, NULL, &err, usberr);
+		OUT(dp, VCMD_WRITE_IPG2, lp->ipg[2], 0, 0, NULL, &err, usberr);
+	} else {
+		/* EMPTY */
+	}
+#ifdef ENABLE_RX_IN_INIT_CHIP
+	/* enable Rx */
+	lp->rcr |= RCR_SO;
+	OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr);
+#endif
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (%s)",
+	    dp->name, __func__,
+	    err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+static int
+axf_start_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+	struct axf_dev	*lp = dp->private;
+#ifndef ENABLE_RX_IN_INIT_CHIP
+	/* enable Rx: set Start Operation and push rcr to the chip */
+	lp->rcr |= RCR_SO;
+	OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr);
+
+usberr:
+	/* reached on success as well as on failure */
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (%s)",
+	    dp->name, __func__, err == USB_SUCCESS ? "success" : "error"));
+#endif
+	return (err);
+}
+
+static int
+axf_stop_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+	struct axf_dev	*lp = dp->private;
+
+	/* Disable Rx */
+	lp->rcr &= ~RCR_SO;
+	OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr);
+
+	/*
+	 * Restore factory mac address (dev_addr) if we have changed the
+	 * current mac address (cur_addr). Writing cur_addr back here would
+	 * leave the override programmed in the chip.
+	 */
+	if (!AX88172(dp) && bcmp(dp->dev_addr.ether_addr_octet,
+	    dp->cur_addr.ether_addr_octet, ETHERADDRL) != 0) {
+		OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0,
+		    ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr);
+	}
+usberr:
+	/* err is not reported; the result of the chip reset is returned */
+	return (axf_reset_chip(dp));
+}
+
+static int
+axf_get_stats(struct usbgem_dev *dp)
+{
+	/* EMPTY */
+	return (USB_SUCCESS);
+}
+
+static uint_t
+axf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr)
+{
+	return (usbgem_ether_crc_be(addr) >> (32 - 6));
+}
+
+/*
+ * Program the receive filter from dp->rxmode and the multicast list: always
+ * accept broadcast, then add promiscuous, all-multicast or hashed multicast
+ * reception, write the node address on chips other than ax88172, and update
+ * rcr. Returns USB_SUCCESS, or the status of the first request that failed.
+ */
+static int
+axf_set_rx_filter(struct usbgem_dev *dp)
+{
+	int		i;
+	uint8_t		mode;
+	uint8_t		mhash[8];
+	uint8_t		buf[2];
+	uint_t		h;
+	int		err = USB_SUCCESS;
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x",
+	    dp->name, __func__, dp->rxmode));
+
+	if (lp->rcr & RCR_SO) {
+		/* set promiscuous mode before changing it. */
+		OUT(dp, VCMD_WRITE_RXCTRL,
+		    lp->rcr | RCR_PRO, 0, 0, NULL, &err, usberr);
+	}
+
+	lp->rcr &= ~(RCR_AP_88772 | RCR_AM | RCR_SEP | RCR_AMALL | RCR_PRO);
+	mode = RCR_AB;	/* accept broadcast packets */
+
+	bzero(mhash, sizeof (mhash));
+
+	if (dp->rxmode & RXMODE_PROMISC) {
+		/* promiscuous mode implies all multicast and all physical */
+		mode |= RCR_PRO;
+	} else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) {
+		/* accept all multicast packets */
+		mode |= RCR_AMALL;
+	} else if (dp->mc_count > 0) {
+		/* build a hash table of the interesting multicast addresses */
+		mode |= RCR_AM;
+		for (i = 0; i < dp->mc_count; i++) {
+			h = dp->mc_list[i].hash;
+			mhash[h / 8] |= 1 << (h % 8);
+		}
+	}
+	if (AX88172(dp)) {
+		if (bcmp(dp->dev_addr.ether_addr_octet,
+		    dp->cur_addr.ether_addr_octet, ETHERADDRL) != 0) {
+			/*
+			 * we use promiscuous mode instead of changing the
+			 * mac address in ax88172
+			 */
+			mode |= RCR_PRO;
+		}
+	} else {
+		OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0,
+		    ETHERADDRL, dp->cur_addr.ether_addr_octet, &err, usberr);
+	}
+	lp->rcr |= mode;
+
+	/* set multicast hash table */
+	if (mode & RCR_AM) {
+		OUT(dp, VCMD_WRITE_MCAST_FILTER, 0, 0,
+		    sizeof (mhash), mhash, &err, usberr);
+	}
+
+	/* update rcr */
+	OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr);
+
+#if DEBUG_LEVEL > 1
+	/* verify rxctrl reg */
+	IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr);
+	cmn_err(CE_CONT, "!%s: %s: rcr:%b returned",
+	    dp->name, __func__, LE16P(buf), RCR_BITS);
+#endif
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (%s)",
+	    dp->name, __func__, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+/*
+ * axf_set_media: apply the negotiated speed, duplex and flow control.
+ *
+ * A new medium status value is derived from the cached lp->msr and
+ * written to the chip; a GPIO value is assembled from the per-chip
+ * gpio_reset/gpio_speed/gpio_duplex tables and written only when it
+ * differs from the value read at entry.
+ *
+ * Returns err, which starts as USB_SUCCESS and is handed to every OUT/IN
+ * request issued here.
+ */
+static int
+axf_set_media(struct usbgem_dev *dp)
+{
+	uint8_t	gpio;
+	uint8_t	gpio_old;
+	int	err = USB_SUCCESS;
+	uint16_t	msr;
+	struct axf_dev	*lp = dp->private;
+
+	IN(dp, VCMD_READ_GPIO, 0, 0, 1, &gpio, &err, usberr);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called, gpio:%b",
+	    dp->name, __func__, gpio, GPIO_BITS));
+
+	msr = lp->msr;
+	gpio_old = gpio;
+	gpio = lp->chip->gpio_reset[0];
+
+	/* setup speed */
+	if (AX88172(dp)) {
+		/* EMPTY */
+	} else {
+		msr &= ~(MSR_PS | MSR_GM | MSR_ENCK);
+
+		switch (dp->speed) {
+		case USBGEM_SPD_1000:
+			msr |= MSR_GM | MSR_ENCK;
+			break;
+
+		case USBGEM_SPD_100:
+			msr |= MSR_PS;
+			break;
+
+		case USBGEM_SPD_10:
+			break;
+		}
+	}
+	/* gpio_speed[1] is used for 100Mbps, gpio_speed[0] otherwise */
+	gpio |= lp->chip->gpio_speed[dp->speed == USBGEM_SPD_100 ? 1 : 0];
+
+	/* select duplex */
+	msr &= ~MSR_FDPX;
+	if (dp->full_duplex) {
+		msr |= MSR_FDPX;
+
+		/* select flow control */
+		if (AX88172(dp)) {
+			/* a single enable bit covers every pause mode */
+			msr &= ~MSR_FCEN;
+			switch (dp->flow_control) {
+			case FLOW_CONTROL_TX_PAUSE:
+			case FLOW_CONTROL_SYMMETRIC:
+			case FLOW_CONTROL_RX_PAUSE:
+				msr |= MSR_FCEN;
+				break;
+			}
+		} else {
+			msr &= ~(MSR_RFC | MSR_TFC);
+			switch (dp->flow_control) {
+			case FLOW_CONTROL_TX_PAUSE:
+				msr |= MSR_TFC;
+				break;
+
+			case FLOW_CONTROL_SYMMETRIC:
+				msr |= MSR_TFC | MSR_RFC;
+				break;
+
+			case FLOW_CONTROL_RX_PAUSE:
+				msr |= MSR_RFC;
+				break;
+			}
+		}
+	}
+	gpio |= lp->chip->gpio_duplex[dp->full_duplex ? 1 : 0];
+
+	/* update medium status register */
+	lp->msr = msr;
+	OUT(dp, VCMD_WRITE_MEDIUM_STATUS, lp->msr, 0,
+	    0, NULL, &err, usberr);
+
+	if (gpio != gpio_old) {
+		/* LED control required for some products */
+		OUT(dp, VCMD_WRITE_GPIO,
+		    gpio, 0, 0, NULL, &err, usberr);
+	}
+
+usberr:
+	/*
+	 * The format needs one conversion per argument: err is an int
+	 * and must not be consumed by a %s.
+	 */
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d (%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+/*
+ * FILL_PKT_HEADER: store a 4-byte header at bp.
+ * Bytes 0-1 hold the low and high byte of len, bytes 2-3 hold the low
+ * and high byte of its bitwise complement.
+ * NOTE(review): len is evaluated four times and the body is a bare
+ * brace block, so keep arguments free of side effects and do not use
+ * the macro as the unbraced body of an if/else.
+ */
+#define	FILL_PKT_HEADER(bp, len)	{	\
+	(bp)[0] = (uint8_t)(len);	\
+	(bp)[1] = (uint8_t)((len) >> 8);	\
+	(bp)[2] = (uint8_t)(~(len));	\
+	(bp)[3] = (uint8_t)((~(len)) >> 8);	\
+}
+
+/* size in bytes of the header written by FILL_PKT_HEADER */
+#define	PKT_HEADER_SIZE	4
+
+/*
+ * send/receive packet check
+ */
+/*
+ * axf_tx_make_packet: build the buffer handed to the bulk-out pipe.
+ *
+ * On AX88172 a single-block message of at least ETHERMIN bytes whose
+ * length is not a multiple of (mask + 1) is returned unchanged.  In
+ * every other case a new block is allocated and filled with
+ *   [4-byte length header, non-AX88172 only]
+ *   [payload, zero padded up to ETHERMIN]
+ *   [4-byte zero-length header when header + payload is a multiple of
+ *    (mask + 1)]
+ * Returns the message to send, or NULL when allocb() fails.  The
+ * original mp is not freed here.
+ */
+static mblk_t *
+axf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	mblk_t		*nmp;
+	mblk_t		*seg;
+	uint8_t		*wp;
+	uint8_t		*pkt_end;
+	size_t		msg_len;
+	size_t		frame_len;
+	size_t		hdr_len;
+	size_t		tail_len;
+	uint_t		mask;
+	int		seg_len;
+
+	msg_len = msgdsize(mp);
+
+	if (!AX88172(dp)) {
+		mask = 511;
+		hdr_len = PKT_HEADER_SIZE;
+	} else {
+#ifdef notdef
+		mask = 63;
+#else
+		mask = 511;
+#endif
+		hdr_len = 0;
+
+		/* nothing to rebuild: send the caller's message untouched */
+		if (mp->b_cont == NULL && msg_len >= ETHERMIN &&
+		    (msg_len & mask) != 0) {
+			return (mp);
+		}
+	}
+
+	/* never send less than the minimum ethernet packet size */
+	frame_len = (msg_len < ETHERMIN) ? ETHERMIN : msg_len;
+
+	/* an exact multiple of (mask + 1) gets a trailing dummy header */
+	tail_len = (((frame_len + hdr_len) & mask) == 0) ?
+	    PKT_HEADER_SIZE : 0;
+
+	nmp = allocb(hdr_len + frame_len + tail_len, 0);
+	if (nmp == NULL) {
+		return (NULL);
+	}
+
+	wp = nmp->b_rptr;
+	pkt_end = wp + hdr_len + frame_len;
+
+	if (hdr_len != 0) {
+		uint16_t	hdr_val = (uint16_t)frame_len;
+
+		/* length header in front of the ethernet packet */
+		FILL_PKT_HEADER(wp, hdr_val);
+		wp += hdr_len;
+	}
+
+	/* gather every segment of the message into the new block */
+	seg = mp;
+	while (seg != NULL) {
+		seg_len = seg->b_wptr - seg->b_rptr;
+		bcopy(seg->b_rptr, wp, seg_len);
+		wp += seg_len;
+		seg = seg->b_cont;
+	}
+
+	/* zero fill short packets up to the minimum size */
+	if (wp < pkt_end) {
+		bzero(wp, pkt_end - wp);
+		wp = pkt_end;
+	}
+
+	if (tail_len != 0) {
+		/* dummy header describing a zero-length packet */
+		FILL_PKT_HEADER(wp, 0);
+		wp += tail_len;
+	}
+
+	/* close the payload of the packet */
+	nmp->b_wptr = wp;
+
+	return (nmp);
+}
+
+/*
+ * axf_dump_packet: log a buffer as rows of eight hex bytes.
+ * Rows are always printed whole, so up to 7 bytes past bp + n are read
+ * when n is not a multiple of 8.
+ */
+static void
+axf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n)
+{
+	uint8_t	*row = bp;
+	int	off = 0;
+
+	while (off < n) {
+		cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x",
+		    row[0], row[1], row[2], row[3],
+		    row[4], row[5], row[6], row[7]);
+		row += 8;
+		off += 8;
+	}
+}
+
+/*
+ * axf_rx_make_packet: split a bulk-in frame into ethernet packets.
+ *
+ * AX88172 frames are returned untouched.  For the other chips the frame
+ * is a sequence of [4-byte header][packet][pad byte when the length is
+ * odd]; the header carries the packet length and its complement.  The
+ * first packet reuses mp, every further packet gets a dupb() of mp
+ * linked through b_next, with b_rptr/b_wptr narrowed to that packet.
+ *
+ * Returns mp (head of the b_next chain), or NULL when the frame holds
+ * no usable packet or a header fails the complement check.
+ *
+ * The length field comes from the device, so it is checked against the
+ * number of bytes actually received before b_wptr is moved; a packet
+ * that claims more data than is left is dropped together with whatever
+ * follows it, while the packets extracted before it are still
+ * delivered.  Likewise a failed dupb() ends the extraction instead of
+ * being dereferenced.
+ */
+static mblk_t *
+axf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	mblk_t	*tp;
+	mblk_t	*prev;
+	mblk_t	*np;
+	int	rest;
+
+	if (AX88172(dp)) {
+		return (mp);
+	}
+
+	tp = mp;
+	prev = NULL;
+	rest = tp->b_wptr - tp->b_rptr;
+
+	if (rest <= PKT_HEADER_SIZE) {
+		/*
+		 * the usb bulk-in frame doesn't include any valid
+		 * ethernet packets.
+		 */
+		return (NULL);
+	}
+
+	for (; ; ) {
+		uint16_t	len;
+		uint16_t	cksum;
+
+		/* analyse the header of the received usb frame */
+		len = LE16P(tp->b_rptr + 0);
+		cksum = LE16P(tp->b_rptr + 2);
+
+		/* test if the header is valid */
+		if (len + cksum != 0xffff) {
+			/* discard the whole packet */
+			cmn_err(CE_WARN,
+			    "!%s: %s: corrupted header:%04x %04x",
+			    dp->name, __func__, len, cksum);
+			return (NULL);
+		}
+
+		/*
+		 * rest is larger than PKT_HEADER_SIZE at this point, so
+		 * the subtraction cannot go negative.
+		 */
+		if (len > rest - PKT_HEADER_SIZE) {
+			cmn_err(CE_WARN,
+			    "!%s: %s: truncated packet, len:%d rest:%d",
+			    dp->name, __func__, len, rest - PKT_HEADER_SIZE);
+			if (prev == NULL) {
+				/* even the first packet is unusable */
+				return (NULL);
+			}
+			/* unlink and release the block made for it */
+			prev->b_next = NULL;
+			freeb(tp);
+			break;
+		}
+#if DEBUG_LEVEL > 0
+		if (len < ETHERMIN || len > ETHERMAX) {
+			cmn_err(CE_NOTE,
+			    "!%s: %s: incorrect pktsize:%d",
+			    dp->name, __func__, len);
+		}
+#endif
+		/* extract a ethernet packet from the bulk-in frame */
+		tp->b_rptr += PKT_HEADER_SIZE;
+		tp->b_wptr = tp->b_rptr + len;
+
+		if (len & 1) {
+			/*
+			 * skip a trailing pad byte if the packet
+			 * length is odd
+			 */
+			len++;
+		}
+		rest -= len + PKT_HEADER_SIZE;
+
+		if (rest <= PKT_HEADER_SIZE) {
+			/* no more valid ethernet packets */
+			break;
+		}
+
+#if DEBUG_LEVEL > 10
+		axf_dump_packet(dp, tp->b_wptr, 18);
+#endif
+		/* allocate a mblk_t header for the next ethernet packet */
+		np = dupb(mp);
+		if (np == NULL) {
+			/* no memory: deliver what has been extracted */
+			break;
+		}
+		np->b_rptr = tp->b_rptr + len;
+		tp->b_next = np;
+		prev = tp;
+		tp = np;
+	}
+
+	return (mp);
+}
+
+/*
+ * MII Interfaces
+ */
+/*
+ * axf_mii_read: read MII register `index' of the PHY selected by
+ * dp->mii_phy_addr.
+ *
+ * The access is bracketed by the software/hardware MII operation mode
+ * commands.  Returns the two reply bytes decoded with LE16P(), or 0
+ * after logging when a request branches to usberr.  When the register
+ * read itself branches there, the switch back to hardware MII mode is
+ * skipped.
+ * NOTE(review): OUT/IN are given errp and the usberr label; presumably
+ * they store the request status in *errp -- confirm in the macro
+ * definitions.
+ */
+static uint16_t
+axf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp)
+{
+	uint8_t		buf[2];
+	uint16_t	val;	/* not referenced in this routine */
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d",
+	    dp->name, __func__, index));
+
+	/* switch to software MII operation mode */
+	OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+
+	/* Read MII register */
+	IN(dp, VCMD_READ_MII_REG, dp->mii_phy_addr, index,
+	    2, buf, errp, usberr);
+
+	/* switch to hardware MII operation mode */
+	OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+
+	return (LE16P(buf));
+
+usberr:
+	cmn_err(CE_CONT,
+	    "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp);
+	return (0);
+}
+
+/*
+ * axf_mii_write: write val to MII register `index' of the PHY selected
+ * by dp->mii_phy_addr.
+ *
+ * The value is sent low byte first, bracketed by the software/hardware
+ * MII operation mode commands.  Nothing is returned; the outcome is
+ * only visible through errp, which is handed to every OUT request.
+ * NOTE(review): presumably OUT stores the request status in *errp and
+ * jumps to usberr on failure -- confirm in the macro definition.
+ */
+static void
+axf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp)
+{
+	uint8_t		buf[2];
+
+	DPRINTF(4, (CE_CONT, "!%s: %s called, reg:%x val:%x",
+	    dp->name, __func__, index, val));
+
+	/* switch software MII operation mode */
+	OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+
+	/* Write to the specified MII register */
+	buf[0] = (uint8_t)val;
+	buf[1] = (uint8_t)(val >> 8);
+	OUT(dp, VCMD_WRITE_MII_REG, dp->mii_phy_addr, index,
+	    2, buf, errp, usberr);
+
+	/* switch to hardware MII operation mode */
+	OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+
+usberr:
+	;
+}
+
+/*
+ * axf_interrupt: handle a message from the interrupt pipe (installed as
+ * usbgc_interrupt).
+ *
+ * Byte 2 of the message is compared with the value remembered from the
+ * previous call; any difference triggers usbgem_mii_update_link().
+ * NOTE(review): the debug print reads bytes 0-7 and the code reads
+ * byte 2 without checking the message length -- presumably the
+ * interrupt packet is always at least 8 bytes; confirm.
+ */
+static void
+axf_interrupt(struct usbgem_dev *dp, mblk_t *mp)
+{
+	uint8_t	*bp;
+	struct axf_dev	*lp = dp->private;
+
+	bp = mp->b_rptr;
+
+	DPRINTF(2, (CE_CONT,
+	    "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x %02x %02x",
+	    dp->name, __func__, mp->b_wptr - mp->b_rptr,
+	    bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]));
+
+	/* non-zero XOR means at least one bit changed since last time */
+	if (lp->last_link_state ^ bp[2]) {
+		usbgem_mii_update_link(dp);
+	}
+
+	lp->last_link_state = bp[2];
+}
+
+/* ======================================================== */
+/*
+ * OS depend (device driver DKI) routine
+ */
+/* ======================================================== */
+#ifdef DEBUG_LEVEL
+/*
+ * axf_eeprom_dump: debug helper that logs the first `size' SROM words,
+ * four per line.
+ * Each word is fetched with its own VCMD_READ_SROM request and printed
+ * as (high byte << 8) | low byte.  The dump stops silently at the first
+ * request that branches to usberr.
+ */
+static void
+axf_eeprom_dump(struct usbgem_dev *dp, int size)
+{
+	int	i;
+	int	err;
+	uint8_t	w0[2], w1[2], w2[2], w3[2];
+
+	cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name);
+
+	err = USB_SUCCESS;
+
+	for (i = 0; i < size; i += 4) {
+		IN(dp, VCMD_READ_SROM, i + 0, 0, 2, w0, &err, usberr);
+		IN(dp, VCMD_READ_SROM, i + 1, 0, 2, w1, &err, usberr);
+		IN(dp, VCMD_READ_SROM, i + 2, 0, 2, w2, &err, usberr);
+		IN(dp, VCMD_READ_SROM, i + 3, 0, 2, w3, &err, usberr);
+		cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x",
+		    i,
+		    (w0[1] << 8) | w0[0],
+		    (w1[1] << 8) | w1[0],
+		    (w2[1] << 8) | w2[0],
+		    (w3[1] << 8) | w3[0]);
+	}
+usberr:
+	;
+}
+#endif
+
+/*
+ * axf_attach_chip: one-time chip setup (installed as usbgc_attach_chip).
+ *
+ * Reads the factory mac address into dp->dev_addr, fills in the IPG
+ * values, selects the PHY address (taken from the chip on AX88172,
+ * fixed at 0x10 otherwise), sets USBGEM_VLAN and forces a 2048-byte rx
+ * buffer on every chip except AX88172.
+ *
+ * Returns ret (USB_SUCCESS unless the CONFIG_FULLSIZE_VLAN branch
+ * rewrote the SROM, which yields USB_FAILURE), or USB_FAILURE after
+ * logging when a request branches to usberr.
+ */
+static int
+axf_attach_chip(struct usbgem_dev *dp)
+{
+	uint8_t	phys[2];
+	int	err;
+	uint_t	vcmd;
+	int	ret;
+#ifdef CONFIG_FULLSIZE_VLAN
+	uint8_t	maxpktsize[2];
+	uint16_t	vlan_pktsize;
+#endif
+#ifdef DEBUG_LEVEL
+	uint8_t	val8;
+#endif
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__));
+
+	ret = USB_SUCCESS;
+	/*
+	 * mac address in EEPROM has loaded to ID registers.
+	 */
+	vcmd = AX88172(dp) ? VCMD_READ_NODE_ID : VCMD_READ_NODE_ID_88772;
+	IN(dp, vcmd, 0, 0,
+	    ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr);
+
+	/*
+	 * setup IPG values
+	 */
+	lp->ipg[0] = 0x15;
+	lp->ipg[1] = 0x0c;
+	lp->ipg[2] = 0x12;
+
+	/*
+	 * We cannot scan phy because the nic returns undefined
+	 * value, i.e. remained garbage, when MII phy is not at the
+	 * specified index.
+	 */
+#ifdef DEBUG_LEVELx
+	if (lp->chip->vid == 0x07b8 && lp->chip->pid == 0x420a) {
+		/*
+		 * restore the original phy address of brain
+		 * damaged Planex UE2-100TX
+		 */
+		OUT(dp, VCMD_WRITE_SROM_ENABLE, 0, 0, 0, NULL, &err, usberr);
+		OUT(dp, VCMD_WRITE_SROM, 0x11, 0xe004, 0, NULL, &err, usberr);
+		OUT(dp, VCMD_WRITE_SROM_DISABLE, 0, 0, 0, NULL, &err, usberr);
+	}
+#endif
+	if (AX88172(dp)) {
+		/* the second byte of the reply is used as the phy address */
+		IN(dp, VCMD_READ_PHY_IDS, 0, 0, 2, &phys, &err, usberr);
+		dp->mii_phy_addr = phys[1];
+		DPRINTF(0, (CE_CONT, "!%s: %s: phys_addr:%d %d",
+		    dp->name, __func__, phys[0], phys[1]));
+	} else {
+		/* use built-in phy */
+		dp->mii_phy_addr = 0x10;
+	}
+
+	dp->misc_flag |= USBGEM_VLAN;
+#ifdef CONFIG_FULLSIZE_VLAN
+	if (AX88172(dp) || AX88772(dp)) {
+		/* check max packet size in srom */
+		IN(dp, VCMD_READ_SROM, 0x10, 0, 2, maxpktsize, &err, usberr);
+		vlan_pktsize = ETHERMAX + ETHERFCSL + 4 /* VTAG_SIZE */;
+
+		if (LE16P(maxpktsize) < vlan_pktsize) {
+			cmn_err(CE_CONT,
+			    "!%s: %s: max packet size in srom is too small, "
+			    "changing %d -> %d, do power cycle for the device",
+			    dp->name, __func__,
+			    LE16P(maxpktsize), vlan_pktsize);
+			OUT(dp, VCMD_WRITE_SROM_ENABLE,
+			    0, 0, 0, NULL, &err, usberr);
+			OUT(dp, VCMD_WRITE_SROM, 0x10,
+			    vlan_pktsize, 0, NULL, &err, usberr);
+			OUT(dp, VCMD_WRITE_SROM_DISABLE,
+			    0, 0, 0, NULL, &err, usberr);
+
+			/* need to power off the device */
+			ret = USB_FAILURE;
+		}
+	}
+#endif
+#ifdef DEBUG_LEVEL
+	IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr);
+	cmn_err(CE_CONT,
+	    "!%s: %s: ipg 0x%02x 0x%02x 0x%02x, gpio 0x%b",
+	    dp->name, __func__, lp->ipg[0], lp->ipg[1], lp->ipg[2],
+	    val8, GPIO_BITS);
+#endif
+	/* fix rx buffer size */
+	if (!AX88172(dp)) {
+		dp->rx_buf_len = 2048;
+	}
+
+#if DEBUG_LEVEL > 0
+	axf_eeprom_dump(dp, 0x20);
+#endif
+	return (ret);
+
+usberr:
+	cmn_err(CE_WARN, "%s: %s: usb error detected (%d)",
+	    dp->name, __func__, err);
+	return (USB_FAILURE);
+}
+
+/*
+ * axf_scan_phy: look for the MII address (0-31) at which a PHY answers.
+ *
+ * For each address MII_AN_ADVERT is written with 0 and must read back
+ * as 0, then written with PROBE_PAT and must read back with the same
+ * technology bits.  dp->mii_phy_addr is used as the scratch address
+ * during the scan and restored before returning.
+ *
+ * Returns the first address that passes both checks, or -1 when none
+ * does or when an mii access reports an error.
+ * NOTE(review): the function is declared boolean_t although it returns
+ * an address or -1 and the caller stores the result in an int; the
+ * return type looks like it should be int -- confirm there is no
+ * matching prototype elsewhere before changing it.
+ */
+static boolean_t
+axf_scan_phy(struct usbgem_dev *dp)
+{
+	int	i;
+	int	err;
+	uint16_t	val;
+	int	phy_addr_saved;
+	struct axf_dev	*lp = dp->private;	/* not referenced below */
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	phy_addr_saved = dp->mii_phy_addr;
+
+	/* special probe routine for unreliable MII addr */
+#define	PROBE_PAT	\
+	(MII_ABILITY_100BASE_TX_FD |	\
+	MII_ABILITY_100BASE_TX |	\
+	MII_ABILITY_10BASE_T_FD |	\
+	MII_ABILITY_10BASE_T)
+
+	for (i = 0; i < 32; i++) {
+		dp->mii_phy_addr = i;
+		/* first pass: the register must accept and return 0 */
+		axf_mii_write(dp, MII_AN_ADVERT, 0, &err);
+		if (err != USBGEM_SUCCESS) {
+			break;
+		}
+		val = axf_mii_read(dp, MII_AN_ADVERT, &err);
+		if (err != USBGEM_SUCCESS) {
+			break;
+		}
+		if (val != 0) {
+			DPRINTF(0, (CE_CONT, "!%s: %s: index:%d,  val %b != 0",
+			    dp->name, __func__, i, val, MII_ABILITY_BITS));
+			continue;
+		}
+
+		/* second pass: the probe pattern must read back intact */
+		axf_mii_write(dp, MII_AN_ADVERT, PROBE_PAT, &err);
+		if (err != USBGEM_SUCCESS) {
+			break;
+		}
+		val = axf_mii_read(dp, MII_AN_ADVERT, &err);
+		if (err != USBGEM_SUCCESS) {
+			break;
+		}
+		if ((val & MII_ABILITY_TECH) != PROBE_PAT) {
+			DPRINTF(0, (CE_CONT, "!%s: %s: "
+			    "index:%d,  pat:%x != val:%b",
+			    dp->name, __func__, i,
+			    PROBE_PAT, val, MII_ABILITY_BITS));
+			continue;
+		}
+
+		/* found */
+		dp->mii_phy_addr = phy_addr_saved;
+		return (i);
+	}
+#undef PROBE_PAT
+	/* i == 32 means the loop ran out; anything else was a break */
+	if (i == 32) {
+		cmn_err(CE_CONT, "!%s: %s: no mii phy found",
+		    dp->name, __func__);
+	} else {
+		cmn_err(CE_CONT, "!%s: %s: i/o error while scanning phy",
+		    dp->name, __func__);
+	}
+	dp->mii_phy_addr = phy_addr_saved;
+	return (-1);
+}
+
+/*
+ * axf_mii_probe: PHY probe routine (installed as usbgc_mii_probe).
+ *
+ * The PHY is reset first and the fact is recorded in
+ * lp->phy_has_reset.  On AX88172 the MII bus is then scanned; when the
+ * PHY answers at an address other than dp->mii_phy_addr, the low byte
+ * of SROM word 0x11 is rewritten with the address found and
+ * USBGEM_FAILURE is returned, whether or not the rewrite succeeded.
+ * In every other case the result of usbgem_mii_probe_default() is
+ * returned.
+ */
+static int
+axf_mii_probe(struct usbgem_dev *dp)
+{
+	int	my_guess;
+	int	err;
+	uint8_t	old_11th[2];
+	uint8_t	new_11th[2];
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+	(void) axf_reset_phy(dp);
+	lp->phy_has_reset = B_TRUE;
+
+	if (AX88172(dp)) {
+		my_guess = axf_scan_phy(dp);
+		if (my_guess >= 0 && my_guess < 32 &&
+		    my_guess != dp->mii_phy_addr) {
+			/*
+			 * old_11th is printed on the usberr path as well;
+			 * clear it so that a failure of the very first
+			 * read does not log stale stack contents.
+			 */
+			old_11th[0] = 0;
+			old_11th[1] = 0;
+
+			/*
+			 * phy addr in srom is wrong, need to fix it
+			 */
+			IN(dp, VCMD_READ_SROM,
+			    0x11, 0, 2, old_11th, &err, usberr);
+
+			/* replace the low byte only, keep the high byte */
+			new_11th[0] = my_guess;
+			new_11th[1] = old_11th[1];
+
+			OUT(dp, VCMD_WRITE_SROM_ENABLE,
+			    0, 0, 0, NULL, &err, usberr);
+			OUT(dp, VCMD_WRITE_SROM,
+			    0x11, LE16P(new_11th), 0, NULL, &err, usberr);
+			OUT(dp, VCMD_WRITE_SROM_DISABLE,
+			    0, 0, 0, NULL, &err, usberr);
+#if 1
+			/* XXX - read back, but it doesn't work, why? */
+			delay(drv_usectohz(1000*1000));
+			IN(dp, VCMD_READ_SROM,
+			    0x11, 0, 2, new_11th, &err, usberr);
+#endif
+			cmn_err(CE_NOTE, "!%s: %s: phy addr in srom fixed: "
+			    "%04x -> %04x",
+			    dp->name, __func__,
+			    LE16P(old_11th), LE16P(new_11th));
+			return (USBGEM_FAILURE);
+usberr:
+			cmn_err(CE_NOTE,
+			    "!%s: %s:  failed to patch phy addr, "
+			    "current: %04x",
+			    dp->name, __func__, LE16P(old_11th));
+			return (USBGEM_FAILURE);
+		}
+	}
+	return (usbgem_mii_probe_default(dp));
+}
+
+/*
+ * axf_mii_init: PHY init routine (installed as usbgc_mii_init).
+ * Resets the PHY unless lp->phy_has_reset says a reset has already been
+ * done, then clears the flag.  Always returns USB_SUCCESS; the result
+ * of axf_reset_phy() is ignored.
+ */
+static int
+axf_mii_init(struct usbgem_dev *dp)
+{
+	struct axf_dev	*lp = dp->private;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	if (!lp->phy_has_reset) {
+		(void) axf_reset_phy(dp);
+	}
+
+	/* prepare to reset phy on the next reconnect or resume */
+	lp->phy_has_reset = B_FALSE;
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * axfattach: attach(9E) entry point.
+ *
+ * DDI_ATTACH: looks the device up in chiptbl_88x7x by usb vendor and
+ * product id (falling back to the last table entry when it is not
+ * listed), fills in a temporary usbgem_conf with the driver's
+ * parameters and callbacks, allocates the private axf_dev and calls
+ * usbgem_do_attach().  The private data is freed here when that call
+ * returns NULL.
+ * DDI_RESUME: returns the result of usbgem_resume().
+ * Any other command fails with DDI_FAILURE.
+ */
+static int
+axfattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int			i;
+	ddi_iblock_cookie_t	c;
+	int			ret;
+	int			revid;
+	int			unit;
+	int			vid;
+	int			pid;
+	struct chip_info	*p;
+	int			len;
+	const char		*drv_name;
+	struct usbgem_dev	*dp;
+	void			*base;
+	struct usbgem_conf	*ugcp;
+	struct axf_dev		*lp;
+
+	unit = ddi_get_instance(dip);
+	drv_name = ddi_driver_name(dip);
+
+	DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d",
+	    drv_name, unit, __func__, cmd));
+
+	if (cmd == DDI_ATTACH) {
+		/*
+		 * Check if the chip is supported.
+		 */
+		vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+		    "usb-vendor-id", -1);
+		pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+		    "usb-product-id", -1);
+		revid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+		    "usb-revision-id", -1);
+
+		for (i = 0, p = chiptbl_88x7x; i < CHIPTABLESIZE; i++, p++) {
+			if (p->vid == vid && p->pid == pid) {
+				/* found */
+				cmn_err(CE_CONT, "!%s%d: %s "
+				    "(vid: 0x%04x, did: 0x%04x, revid: 0x%02x)",
+				    drv_name, unit, p->name, vid, pid, revid);
+				goto chip_found;
+			}
+		}
+
+		/* Not found */
+		cmn_err(CE_WARN, "!%s: %s: wrong usb venid/prodid (0x%x, 0x%x)",
+		    drv_name, __func__, vid, pid);
+
+		/* assume 88772 */
+		p = &chiptbl_88x7x[CHIPTABLESIZE - 1];
+chip_found:
+		/*
+		 * construct usbgem configuration
+		 */
+		ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP);
+
+		/* name */
+		/*
+		 * softmac requires that ppa is the instance number
+		 * of the device, otherwise it hangs in searching the device.
+		 */
+		/*
+		 * NOTE(review): sprintf() does not bound the write; this
+		 * assumes usbgc_name is large enough for the driver name
+		 * plus the instance number -- confirm against its size.
+		 */
+		sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit);
+		ugcp->usbgc_ppa = unit;
+
+		ugcp->usbgc_ifnum = 0;
+		ugcp->usbgc_alt = 0;
+
+		ugcp->usbgc_tx_list_max = 64;
+
+		ugcp->usbgc_rx_header_len = 0;
+		ugcp->usbgc_rx_list_max = 64;
+
+		/* time out parameters */
+		ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT;
+		ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL;
+
+		/* flow control */
+		/*
+		 * XXX - flow control caused link down frequently under
+		 * heavy traffic
+		 */
+		ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE;
+
+		/* MII timeout parameters */
+		ugcp->usbgc_mii_link_watch_interval = ONESEC;
+		ugcp->usbgc_mii_an_watch_interval = ONESEC/5;
+		ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */
+		ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT;	/* 5 sec */
+		ugcp->usbgc_mii_an_wait = 0;
+		ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT;
+
+		ugcp->usbgc_mii_an_delay = ONESEC/10;
+		ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA;
+		ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET;
+		ugcp->usbgc_mii_dont_reset = B_FALSE;
+		ugcp->usbgc_mii_hw_link_detection = B_TRUE;
+		ugcp->usbgc_mii_stop_mac_on_linkdown = B_FALSE;
+
+		/* I/O methods */
+
+		/* mac operation */
+		ugcp->usbgc_attach_chip = &axf_attach_chip;
+		ugcp->usbgc_reset_chip = &axf_reset_chip;
+		ugcp->usbgc_init_chip = &axf_init_chip;
+		ugcp->usbgc_start_chip = &axf_start_chip;
+		ugcp->usbgc_stop_chip = &axf_stop_chip;
+		ugcp->usbgc_multicast_hash = &axf_mcast_hash;
+
+		ugcp->usbgc_set_rx_filter = &axf_set_rx_filter;
+		ugcp->usbgc_set_media = &axf_set_media;
+		ugcp->usbgc_get_stats = &axf_get_stats;
+		ugcp->usbgc_interrupt = &axf_interrupt;
+
+		/* packet operation */
+		ugcp->usbgc_tx_make_packet = &axf_tx_make_packet;
+		ugcp->usbgc_rx_make_packet = &axf_rx_make_packet;
+
+		/* mii operations */
+		ugcp->usbgc_mii_probe = &axf_mii_probe;
+		ugcp->usbgc_mii_init = &axf_mii_init;
+		ugcp->usbgc_mii_config = &usbgem_mii_config_default;
+		ugcp->usbgc_mii_read = &axf_mii_read;
+		ugcp->usbgc_mii_write = &axf_mii_write;
+
+		/* mtu */
+		ugcp->usbgc_min_mtu = ETHERMTU;
+		ugcp->usbgc_max_mtu = ETHERMTU;
+		ugcp->usbgc_default_mtu = ETHERMTU;
+
+		lp = kmem_zalloc(sizeof (struct axf_dev), KM_SLEEP);
+		lp->chip = p;
+		lp->last_link_state = 0;
+		lp->phy_has_reset = B_FALSE;
+
+		dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct axf_dev));
+
+		/* the configuration is released whatever the outcome */
+		kmem_free(ugcp, sizeof (*ugcp));
+
+		if (dp != NULL) {
+			return (DDI_SUCCESS);
+		}
+
+		/* only reached by falling through; no goto targets these */
+err_free_mem:
+		kmem_free(lp, sizeof (struct axf_dev));
+err_close_pipe:
+err:
+		return (DDI_FAILURE);
+	}
+
+	if (cmd == DDI_RESUME) {
+		return (usbgem_resume(dip));
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * axfdetach: detach(9E) entry point.
+ * DDI_DETACH and DDI_SUSPEND are handed to usbgem; every other command
+ * fails with DDI_FAILURE.
+ */
+static int
+axfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		/* any non-success code from usbgem becomes DDI_FAILURE */
+		return (usbgem_do_detach(dip) == DDI_SUCCESS ?
+		    DDI_SUCCESS : DDI_FAILURE);
+
+	case DDI_SUSPEND:
+		return (usbgem_suspend(dip));
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/* ======================================================== */
+/*
+ * OS depend (loadable streams driver) routine
+ */
+/* ======================================================== */
+/*
+ * Definition of axf_ops, the dev_ops referenced by modldrv and passed
+ * to usbgem_mod_init()/usbgem_mod_fini().
+ * With USBGEM_CONFIG_GLDv3 it is produced by the USBGEM_STREAM_OPS
+ * macro; otherwise the STREAMS tables below are spelled out and wired
+ * to the usbgem_* entry points.
+ */
+#ifdef USBGEM_CONFIG_GLDv3
+USBGEM_STREAM_OPS(axf_ops, axfattach, axfdetach);
+#else
+static	struct module_info axfminfo = {
+	0,			/* mi_idnum */
+	"axf",			/* mi_idname */
+	0,			/* mi_minpsz */
+	ETHERMTU,		/* mi_maxpsz */
+	ETHERMTU*128,		/* mi_hiwat */
+	1,			/* mi_lowat */
+};
+
+/* read side queue */
+static	struct qinit axfrinit = {
+	(int (*)()) NULL,	/* qi_putp */
+	usbgem_rsrv,		/* qi_srvp */
+	usbgem_open,		/* qi_qopen */
+	usbgem_close,		/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&axfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+/* write side queue */
+static	struct qinit axfwinit = {
+	usbgem_wput,		/* qi_putp */
+	usbgem_wsrv,		/* qi_srvp */
+	(int (*)()) NULL,	/* qi_qopen */
+	(int (*)()) NULL,	/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&axfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+static struct streamtab	axf_info = {
+	&axfrinit,	/* st_rdinit */
+	&axfwinit,	/* st_wrinit */
+	NULL,		/* st_muxrinit */
+	NULL		/* st_muxwrinit */
+};
+
+static	struct cb_ops cb_axf_ops = {
+	nulldev,	/* cb_open */
+	nulldev,	/* cb_close */
+	nodev,		/* cb_strategy */
+	nodev,		/* cb_print */
+	nodev,		/* cb_dump */
+	nodev,		/* cb_read */
+	nodev,		/* cb_write */
+	nodev,		/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	&axf_info,	/* cb_stream */
+	D_NEW|D_MP	/* cb_flag */
+};
+
+static	struct dev_ops axf_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	usbgem_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	axfattach,	/* devo_attach */
+	axfdetach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&cb_axf_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	usbgem_power,	/* devo_power */
+#if DEVO_REV >= 4
+	usbgem_quiesce,	/* devo_quiesce */
+#endif
+};
+#endif
+
+/* driver linkage: ties the ident string and axf_ops to mod_driverops */
+static struct modldrv modldrv = {
+	&mod_driverops,	/* Type of module.  This one is a driver */
+	ident,
+	&axf_ops,	/* driver ops */
+};
+
+/* module linkage handed to mod_install(), mod_remove() and mod_info() */
+static struct modlinkage modlinkage = {
+	MODREV_1, &modldrv, NULL
+};
+
+/* ======================================================== */
+/*
+ * _init : done
+ */
+/* ======================================================== */
+int
+_init(void)
+{
+	int	rc;
+
+	DPRINTF(2, (CE_CONT, "!axf: _init: called"));
+
+	/* usbgem_mod_init() has to succeed before the module is installed */
+	if ((rc = usbgem_mod_init(&axf_ops, "axf")) != DDI_SUCCESS) {
+		return (rc);
+	}
+
+	/* a failed install undoes usbgem_mod_init() */
+	if ((rc = mod_install(&modlinkage)) != DDI_SUCCESS) {
+		usbgem_mod_fini(&axf_ops);
+	}
+	return (rc);
+}
+
+/*
+ * _fini : done
+ */
+int
+_fini(void)
+{
+	int	rc;
+
+	DPRINTF(2, (CE_CONT, "!axf: _fini: called"));
+
+	rc = mod_remove(&modlinkage);
+	if (rc != DDI_SUCCESS) {
+		/* removal was refused: skip usbgem_mod_fini() */
+		return (rc);
+	}
+
+	usbgem_mod_fini(&axf_ops);
+	return (rc);
+}
+
+/*
+ * _info: report module information by passing modlinkage to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c
index 6ef1b0b9f7..495ae93cf9 100644
--- a/usr/src/uts/common/io/cons.c
+++ b/usr/src/uts/common/io/cons.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
  */
 
 /*
@@ -53,6 +54,7 @@
 #include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/stat.h>
+#include <sys/limits.h>
 
 #include <sys/console.h>
 #include <sys/consdev.h>
@@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred)
 	 */
 	if (vsconsvp != NULL && vsconsvp->v_stream != NULL) {
 		struiod_t uiod;
+		struct iovec buf[IOV_MAX_STACK];
+		int iovlen = 0;
+
+		if (uio->uio_iovcnt > IOV_MAX_STACK) {
+			iovlen = uio->uio_iovcnt * sizeof (iovec_t);
+			uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+		} else {
+			uiod.d_iov = buf;
+		}
 
 		/*
 		 * strwrite modifies uio so need to make copy.
 		 */
-		(void) uiodup(uio, &uiod.d_uio, uiod.d_iov,
-		    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+		(void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt);
 
 		(void) strwrite(vsconsvp, &uiod.d_uio, cred);
+		if (iovlen != 0)
+			kmem_free(uiod.d_iov, iovlen);
 	}
 
 	if (rconsvp->v_stream != NULL)
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index e00ac1d1e9..f8d9f1cff8 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -670,15 +670,26 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 
 	uiosize = uiop->uio_resid;
 	pollfdnum = uiosize / size;
-	mutex_enter(&curproc->p_lock);
-	if (pollfdnum > (uint_t)rctl_enforced_value(
-	    rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
-		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
-		    curproc->p_rctls, curproc, RCA_SAFE);
+
+	/*
+	 * We want to make sure that pollfdnum isn't large enough to DoS us,
+	 * but we also don't want to grab p_lock unnecessarily -- so we
+	 * perform the full check against our resource limits if and only if
+	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
+	 */
+	if (pollfdnum > UINT8_MAX) {
+		mutex_enter(&curproc->p_lock);
+		if (pollfdnum >
+		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
+		    curproc->p_rctls, curproc)) {
+			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+			    curproc->p_rctls, curproc, RCA_SAFE);
+			mutex_exit(&curproc->p_lock);
+			return (EINVAL);
+		}
 		mutex_exit(&curproc->p_lock);
-		return (EINVAL);
 	}
-	mutex_exit(&curproc->p_lock);
+
 	/*
 	 * Copy in the pollfd array.  Walk through the array and add
 	 * each polled fd to the cached set.
@@ -1112,14 +1123,18 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
 			void *setp = STRUCT_FGETP(dvpoll, dp_setp);
 
 			if (setp != NULL) {
-				if (copyin(setp, &set, sizeof (set))) {
-					DP_REFRELE(dpep);
-					return (EFAULT);
+				if ((mode & FKIOCTL) != 0) {
+					/* Use the signal set directly */
+					ksetp = (k_sigset_t *)setp;
+				} else {
+					if (copyin(setp, &set, sizeof (set))) {
+						DP_REFRELE(dpep);
+						return (EFAULT);
+					}
+					sigutok(&set, &kset);
+					ksetp = &kset;
 				}
 
-				sigutok(&set, &kset);
-				ksetp = &kset;
-
 				mutex_enter(&p->p_lock);
 				schedctl_finish_sigblock(t);
 				lwp->lwp_sigoldmask = t->t_hold;
@@ -1268,6 +1283,10 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
 		DP_SIGMASK_RESTORE(ksetp);
 
 		if (error == 0 && fdcnt > 0) {
+			/*
+			 * It should be noted that FKIOCTL does not influence
+			 * the copyout (vs bcopy) of dp_fds at this time.
+			 */
 			if (copyout(ps->ps_dpbuf,
 			    STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
 				DP_REFRELE(dpep);
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index 40cbe86170..62bc4a8ecf 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent Inc.
  */
 
 /*
@@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
 				err = EACCES;
 				goto done;
 			}
-			err = dls_devnet_setzid(dlh, dzp->diz_zid);
+			err = dls_devnet_setzid(dlh, dzp->diz_zid,
+			    dzp->diz_transient);
 		} else {
 			kprop->pr_perm_flags = MAC_PROP_PERM_RW;
 			(*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh);
@@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
 		return (err);
 
 	if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2,
-	    dir->dir_link)) != 0)
+	    dir->dir_link, dir->dir_zoneinit)) != 0)
 		return (err);
 
 	if (dir->dir_linkid2 == DATALINK_INVALID_LINKID)
@@ -1376,7 +1378,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = {
 	{SIMNET_IOC,	"simnet", 0, NULL, 0},
 	{BRIDGE_IOC,	"bridge", 0, NULL, 0},
 	{IPTUN_IOC,	"iptun", 0, NULL, 0},
-	{IBPART_IOC,	"ibp", -1, NULL, 0}
+	{IBPART_IOC,	"ibp", -1, NULL, 0},
+	{OVERLAY_IOC,	"overlay", 0, NULL, 0}
 };
 #define	DLDIOC_CNT	\
 	(sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t))
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index a438e43d91..661d8b2f4f 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -41,7 +41,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
     proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req,
     proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req,
     proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req,
-    proto_notify_req, proto_passive_req;
+    proto_notify_req, proto_passive_req, proto_exclusive_req;
 
 static void proto_capability_advertise(dld_str_t *, mblk_t *);
 static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *);
@@ -121,6 +121,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp)
 	case DL_PASSIVE_REQ:
 		proto_passive_req(dsp, mp);
 		break;
+	case DL_EXCLUSIVE_REQ:
+		proto_exclusive_req(dsp, mp);
+		break;
 	default:
 		proto_req(dsp, mp);
 		break;
@@ -605,6 +608,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp)
 		new_flags |= DLS_PROMISC_PHYS;
 		break;
 
+	case DL_PROMISC_RX_ONLY:
+		new_flags |= DLS_PROMISC_RX_ONLY;
+		break;
+
+	case DL_PROMISC_FIXUPS:
+		new_flags |= DLS_PROMISC_FIXUPS;
+		break;
+
 	default:
 		dl_err = DL_NOTSUPPORTED;
 		goto failed2;
@@ -692,6 +703,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp)
 		new_flags &= ~DLS_PROMISC_PHYS;
 		break;
 
+	case DL_PROMISC_RX_ONLY:
+		if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) {
+			dl_err = DL_NOTENAB;
+			goto failed;
+		}
+		new_flags &= ~DLS_PROMISC_RX_ONLY;
+		break;
+
+	case DL_PROMISC_FIXUPS:
+		if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) {
+			dl_err = DL_NOTENAB;
+			goto failed;
+		}
+		new_flags &= ~DLS_PROMISC_FIXUPS;
+		break;
+
 	default:
 		dl_err = DL_NOTSUPPORTED;
 		mac_perim_exit(mph);
@@ -1295,7 +1322,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp)
 	 * If we've already become active by issuing an active primitive,
 	 * then it's too late to try to become passive.
 	 */
-	if (dsp->ds_passivestate == DLD_ACTIVE) {
+	if (dsp->ds_passivestate == DLD_ACTIVE ||
+	    dsp->ds_passivestate == DLD_EXCLUSIVE) {
 		dl_err = DL_OUTSTATE;
 		goto failed;
 	}
@@ -1354,7 +1382,12 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
 		dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
 		    direct->di_rx_ch);
 
-		direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+		if (direct->di_flags & DI_DIRECT_RAW) {
+			direct->di_tx_df =
+			    (uintptr_t)str_mdata_raw_fastpath_put;
+		} else {
+			direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+		}
 		direct->di_tx_dh = dsp;
 		direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
 		direct->di_tx_cb_dh = dsp->ds_mch;
@@ -1516,8 +1549,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
 	 * completes. So we limit the check to DLD_ENABLE case.
 	 */
 	if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) &&
-	    (dsp->ds_sap != ETHERTYPE_IP ||
-	    !check_mod_above(dsp->ds_rq, "ip"))) {
+	    ((dsp->ds_sap != ETHERTYPE_IP ||
+	    !check_mod_above(dsp->ds_rq, "ip")) &&
+	    !check_mod_above(dsp->ds_rq, "vnd"))) {
 		return (ENOTSUP);
 	}
 
@@ -1599,9 +1633,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
 	}
 
 	/*
-	 * Direct capability negotiation interface between IP and DLD
+	 * Direct capability negotiation interface between IP/VND and DLD. Note
+	 * that for vnd we only allow the case where the media type is the
+	 * native media type so we know that there are no transformations that
+	 * would have to happen to the mac header that it receives.
 	 */
-	if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) {
+	if ((dsp->ds_sap == ETHERTYPE_IP &&
+	    check_mod_above(dsp->ds_rq, "ip")) ||
+	    (check_mod_above(dsp->ds_rq, "vnd") &&
+	    dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) {
 		dld_capable = B_TRUE;
 		subsize += sizeof (dl_capability_sub_t) +
 		    sizeof (dl_capab_dld_t);
@@ -1720,3 +1760,36 @@ dld_capabilities_disable(dld_str_t *dsp)
 	if (dsp->ds_polling)
 		(void) dld_capab_poll_disable(dsp, NULL);
 }
+
+static void
+proto_exclusive_req(dld_str_t *dsp, mblk_t *mp)
+{
+	int ret = 0;
+	t_uscalar_t dl_err;
+	mac_perim_handle_t mph;
+
+	if (dsp->ds_passivestate != DLD_UNINITIALIZED) {
+		dl_err = DL_OUTSTATE;
+		goto failed;
+	}
+
+	if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) {
+		dl_err = DL_BADPRIM;
+		goto failed;
+	}
+
+	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+	ret = dls_exclusive_set(dsp, B_TRUE);
+	mac_perim_exit(mph);
+
+	if (ret != 0) {
+		dl_err = DL_SYSERR;
+		goto failed;
+	}
+
+	dsp->ds_passivestate = DLD_EXCLUSIVE;
+	dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ);
+	return;
+failed:
+	dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret);
+}
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index 6f0d0b9a6c..f5308e70ff 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -854,6 +854,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
 	return (mp);
 }
 
+static boolean_t
+i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp)
+{
+	mblk_t *mp = *mpp;
+	mblk_t *newmp;
+	uint_t pri, vid, dvid;
+
+	dvid = mac_client_vid(dsp->ds_mch);
+
+	/*
+	 * Discard the packet if this is a VLAN stream but the VID in
+	 * the packet is not correct.
+	 */
+	vid = VLAN_ID(mhip->mhi_tci);
+	if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
+		return (B_FALSE);
+
+	/*
+	 * Discard the packet if this packet is a tagged packet
+	 * but both pri and VID are 0.
+	 */
+	pri = VLAN_PRI(mhip->mhi_tci);
+	if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 &&
+	    vid == VLAN_ID_NONE)
+		return (B_FALSE);
+
+	/*
+	 * Update the priority bits to the per-stream priority if
+	 * priority is not set in the packet. Update the VID for
+	 * packets on a VLAN stream.
+	 */
+	pri = (pri == 0) ? dsp->ds_pri : 0;
+	if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
+		if ((newmp = i_dld_ether_header_update_tag(mp, pri,
+		    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
+			return (B_FALSE);
+		}
+		*mpp = newmp;
+	}
+
+	return (B_TRUE);
+}
+
+mac_tx_cookie_t
+str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
+    uint16_t flag)
+{
+	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
+	mac_header_info_t mhi;
+	mac_tx_cookie_t cookie;
+
+	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
+		goto discard;
+
+	if (is_ethernet) {
+		if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
+			goto discard;
+	}
+
+	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
+		DLD_SETQFULL(dsp);
+	}
+	return (cookie);
+discard:
+	/* TODO: bump kstat? */
+	freemsg(mp);
+	return (NULL);
+}
+
+
+
 /*
  * M_DATA put (IP fast-path mode)
  */
@@ -902,7 +973,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
 	mblk_t *bp, *newmp;
 	size_t size;
 	mac_header_info_t mhi;
-	uint_t pri, vid, dvid;
 	uint_t max_sdu;
 
 	/*
@@ -948,38 +1018,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
 		goto discard;
 
 	if (is_ethernet) {
-		dvid = mac_client_vid(dsp->ds_mch);
-
-		/*
-		 * Discard the packet if this is a VLAN stream but the VID in
-		 * the packet is not correct.
-		 */
-		vid = VLAN_ID(mhi.mhi_tci);
-		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
-			goto discard;
-
-		/*
-		 * Discard the packet if this packet is a tagged packet
-		 * but both pri and VID are 0.
-		 */
-		pri = VLAN_PRI(mhi.mhi_tci);
-		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
-		    vid == VLAN_ID_NONE)
+		if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
 			goto discard;
-
-		/*
-		 * Update the priority bits to the per-stream priority if
-		 * priority is not set in the packet. Update the VID for
-		 * packets on a VLAN stream.
-		 */
-		pri = (pri == 0) ? dsp->ds_pri : 0;
-		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
-			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
-			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
-				goto discard;
-			}
-			mp = newmp;
-		}
 	}
 
 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 92993ada58..0f8dbcb57a 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -248,19 +248,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
 {
 	int err = 0;
 	uint32_t old_flags = dsp->ds_promisc;
+	uint32_t new_type = new_flags &
+	    ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS);
 	mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL;
+	uint16_t mac_flags = 0;
+	boolean_t doremove = B_FALSE;
 
 	ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
 	ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
-	    DLS_PROMISC_PHYS)));
+	    DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)));
+
+	/*
+	 * If we only have the non-data receive flags set or are only changing
+	 * them, then there's nothing to do other than update the flags here.
+	 * Basically when we only have something in the set of
+	 * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's
+	 * nothing else for us to do other than toggle it, as there's no need to
+	 * talk to MAC and we don't have to do anything else.
+	 */
+	if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+	    (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) {
+		dsp->ds_promisc = new_flags;
+		return (0);
+	}
 
 	/*
 	 * If the user has only requested DLS_PROMISC_MULTI then we need to make
 	 * sure that they don't see all packets.
 	 */
-	if (new_flags == DLS_PROMISC_MULTI)
+	if (new_type == DLS_PROMISC_MULTI)
 		mptype = MAC_CLIENT_PROMISC_MULTI;
 
+	/*
+	 * Look at new flags and figure out the correct mac promisc flags.
+	 * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS,
+	 * don't turn on physical promisc mode.
+	 */
+	if (new_flags & DLS_PROMISC_RX_ONLY)
+		mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
+	if (new_flags & DLS_PROMISC_FIXUPS)
+		mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS;
+	if (new_type == DLS_PROMISC_SAP)
+		mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;
+
+	/*
+	 * If we're being asked to transition to a state where the only DLS
+	 * flags that would be enabled are ones that change what we do with
+	 * promiscuous packets (DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS)
+	 * rather than which packets we should receive, then we need to remove
+	 * the MAC layer promiscuous handler.
+	 */
+	if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+	    (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 &&
+	    new_flags != 0) {
+		doremove = B_TRUE;
+	}
+
+	/*
+	 * There are three cases we care about here with respect to MAC. Going
+	 * from nothing to something, something to nothing, something to
+	 * something where we need to change how we're getting stuff from mac.
+	 * In the last case, as long as they're not equal, we need to assume
+	 * something has changed and do something about it.
+	 */
 	if (dsp->ds_promisc == 0 && new_flags != 0) {
 		/*
 		 * If only DLS_PROMISC_SAP, we don't turn on the
@@ -268,9 +318,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
 		 */
 		dsp->ds_promisc = new_flags;
 		err = mac_promisc_add(dsp->ds_mch, mptype,
-		    dls_rx_promisc, dsp, &dsp->ds_mph,
-		    (new_flags != DLS_PROMISC_SAP) ? 0 :
-		    MAC_PROMISC_FLAGS_NO_PHYS);
+		    dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
 		if (err != 0) {
 			dsp->ds_promisc = old_flags;
 			return (err);
@@ -281,7 +329,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
 			mac_promisc_remove(dsp->ds_vlan_mph);
 			dsp->ds_vlan_mph = NULL;
 		}
-	} else if (dsp->ds_promisc != 0 && new_flags == 0) {
+	} else if (dsp->ds_promisc != 0 &&
+	    (new_flags == 0 || doremove == B_TRUE)) {
 		ASSERT(dsp->ds_mph != NULL);
 
 		mac_promisc_remove(dsp->ds_mph);
@@ -296,19 +345,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
 			    MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
 			    &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
 		}
-	} else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 &&
-	    new_flags != dsp->ds_promisc) {
-		/*
-		 * If the old flag is PROMISC_SAP, but the current flag has
-		 * changed to some new non-zero value, we need to turn the
-		 * physical promiscuous mode.
-		 */
+	} else if (new_flags != 0 && new_flags != old_flags) {
 		ASSERT(dsp->ds_mph != NULL);
 		mac_promisc_remove(dsp->ds_mph);
 		/* Honors both after-remove and before-add semantics! */
 		dsp->ds_promisc = new_flags;
 		err = mac_promisc_add(dsp->ds_mch, mptype,
-		    dls_rx_promisc, dsp, &dsp->ds_mph, 0);
+		    dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
 		if (err != 0)
 			dsp->ds_promisc = old_flags;
 	} else {
@@ -629,6 +672,22 @@ boolean_t
 dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
     void **ds_rx_arg, boolean_t loopback)
 {
+	if (dsp->ds_promisc == 0) {
+		/*
+		 * If there are active walkers of the mi_promisc_list when
+		 * promiscuousness is disabled, ds_promisc will be cleared,
+		 * but the DLS will remain on the mi_promisc_list until the
+		 * walk is completed.  If we do not recognize this case here,
+		 * we won't properly execute the ds_promisc case in the common
+		 * accept routine -- and we will potentially accept a packet
+		 * that has originated with this DLS (which in turn can
+		 * induce recursion and death by stack overflow).  If
+		 * ds_promisc is zero, we know that we are in this window --
+		 * and we refuse to accept the packet.
+		 */
+		return (B_FALSE);
+	}
+
 	return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE,
 	    loopback));
 }
@@ -659,7 +718,10 @@ dls_mac_active_set(dls_link_t *dlp)
 		 * Set the function to start receiving packets.
 		 */
 		mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
+	} else if (dlp->dl_exclusive == B_TRUE) {
+		return (EBUSY);
 	}
+
 	dlp->dl_nactive++;
 	return (0);
 }
@@ -685,7 +747,11 @@ dls_active_set(dld_str_t *dsp)
 	if (dsp->ds_passivestate == DLD_PASSIVE)
 		return (0);
 
-	/* If we're already active, then there's nothing more to do. */
+	if (dsp->ds_dlp->dl_exclusive == B_TRUE &&
+	    dsp->ds_passivestate != DLD_EXCLUSIVE)
+		return (EBUSY);
+
+	/* Only a stream's first activation needs to activate the link. */
 	if ((dsp->ds_nactive == 0) &&
 	    ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) {
 		/* except for ENXIO all other errors are mapped to EBUSY */
@@ -694,7 +760,8 @@ dls_active_set(dld_str_t *dsp)
 		return (err);
 	}
 
-	dsp->ds_passivestate = DLD_ACTIVE;
+	dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ?
+	    DLD_EXCLUSIVE : DLD_ACTIVE;
 	dsp->ds_nactive++;
 	return (0);
 }
@@ -725,7 +792,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all)
 	if (dsp->ds_nactive != 0)
 		return;
 
-	ASSERT(dsp->ds_passivestate == DLD_ACTIVE);
+	ASSERT(dsp->ds_passivestate == DLD_ACTIVE ||
+	    dsp->ds_passivestate == DLD_EXCLUSIVE);
 	dls_mac_active_clear(dsp->ds_dlp);
+	/*
+	 * We verify below to ensure that no other part of DLS has mucked with
+	 * our exclusive state.
+	 */
+	if (dsp->ds_passivestate == DLD_EXCLUSIVE)
+		VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0);
 	dsp->ds_passivestate = DLD_UNINITIALIZED;
 }
+
+int
+dls_exclusive_set(dld_str_t *dsp, boolean_t enable)
+{
+	ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+	if (enable == B_FALSE) {
+		dsp->ds_dlp->dl_exclusive = B_FALSE;
+		return (0);
+	}
+
+	if (dsp->ds_dlp->dl_nactive != 0)
+		return (EBUSY);
+
+	dsp->ds_dlp->dl_exclusive = B_TRUE;
+
+	return (0);
+}
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 6b92a81e77..4a735d870e 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*
@@ -34,6 +35,9 @@
 #include	<sys/dld_impl.h>
 #include	<sys/sdt.h>
 #include	<sys/atomic.h>
+#include	<sys/sysevent.h>
+#include	<sys/sysevent/eventdefs.h>
+#include	<sys/sysevent/datalink.h>
 
 static kmem_cache_t	*i_dls_link_cachep;
 mod_hash_t		*i_dls_link_hash;
@@ -579,6 +583,67 @@ drop:
 	freemsg(mp);
 }
 
+/*
+ * We'd like to notify via sysevents that a link state change has occurred.
+ * There are a couple of challenges associated with this. The first is that if
+ * the link is flapping a lot, we may not see an accurate state when we launch
+ * the notification: we're told that it changed, not what it changed to.
+ *
+ * The next problem is that all of the information that a user has associated
+ * with this device is the exact opposite of what we have on the dls_link_t. We
+ * have the name of the mac device, which has no bearing on what users see.
+ * Likewise, we don't have the datalink id either. So we're going to have to get
+ * this from dls.
+ *
+ * This is all further complicated by the fact that this could be going on in
+ * another thread at the same time as someone is tearing down the dls_link_t
+ * that we're associated with. We need to be careful not to grab the mac
+ * perimeter, otherwise we stand a good chance of deadlock.
+ */
+static void
+dls_link_notify(void *arg, mac_notify_type_t type)
+{
+	dls_link_t 	*dlp = arg;
+	dls_dl_handle_t	dhp;
+	nvlist_t	*nvp;
+	sysevent_t	*event;
+	sysevent_id_t	eid;
+
+	if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK)
+		return;
+
+	/*
+	 * If we can't find a devnet handle for this link, then there is no user
+	 * knowable device for this at the moment and there's nothing we can
+	 * really share with them that will make sense.
+	 */
+	if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0)
+		return;
+
+	/*
+	 * Because we're attaching this nvlist_t to the sysevent, it'll get
+	 * cleaned up when we call sysevent_free.
+	 */
+	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID,
+	    dls_devnet_linkid(dhp)) == 0);
+	VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME,
+	    dls_devnet_link(dhp)) == 0);
+	VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID,
+	    dls_devnet_getzid(dhp)) == 0);
+
+	dls_devnet_rele_tmp(dhp);
+
+	event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE,
+	    ILLUMOS_KERN_PUB"dls", SE_SLEEP);
+	VERIFY(event != NULL);
+	(void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp);
+
+	(void) log_sysevent(event, SE_SLEEP, &eid);
+	sysevent_free(event);
+
+}
+
 static void
 i_dls_link_destroy(dls_link_t *dlp)
 {
@@ -589,6 +654,9 @@ i_dls_link_destroy(dls_link_t *dlp)
 	/*
 	 * Free the structure back to the cache.
 	 */
+	if (dlp->dl_mnh != NULL)
+		mac_notify_remove(dlp->dl_mnh, B_TRUE);
+
 	if (dlp->dl_mch != NULL)
 		mac_client_close(dlp->dl_mch, 0);
 
@@ -600,8 +668,10 @@ i_dls_link_destroy(dls_link_t *dlp)
 	dlp->dl_mh = NULL;
 	dlp->dl_mch = NULL;
 	dlp->dl_mip = NULL;
+	dlp->dl_mnh = NULL;
 	dlp->dl_unknowns = 0;
 	dlp->dl_nonip_cnt = 0;
+	dlp->dl_exclusive = B_FALSE;
 	kmem_cache_free(i_dls_link_cachep, dlp);
 }
 
@@ -640,6 +710,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp)
 	if (err != 0)
 		goto bail;
 
+	dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp);
+
 	DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
 	    dlp->dl_mch);
 
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index 049c4bd757..105c55c7ce 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*
@@ -105,12 +106,13 @@ typedef struct dls_devnet_s {
 	zoneid_t	dd_zid;		/* current zone */
 	boolean_t	dd_prop_loaded;
 	taskqid_t	dd_prop_taskid;
+	boolean_t	dd_transient;	/* link goes away when zone does */
 } dls_devnet_t;
 
 static int i_dls_devnet_create_iptun(const char *, const char *,
     datalink_id_t *);
 static int i_dls_devnet_destroy_iptun(datalink_id_t);
-static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t);
+static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t);
 static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t);
 
 /*ARGSUSED*/
@@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg)
 	dls_devnet_t *ddp;
 
 	if (dls_devnet_hold_tmp(linkid, &ddp) == 0) {
-		(void) dls_devnet_setzid(ddp, GLOBAL_ZONEID);
+		/*
+		 * Don't bother moving transient links back to the global zone
+		 * since we will simply delete them in dls_devnet_unset.
+		 */
+		if (!ddp->dd_transient)
+			(void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
 		dls_devnet_rele_tmp(ddp);
 	}
 	return (0);
@@ -526,6 +533,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
 
 	getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
 	(void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+	getlinkid.ld_zoneid = getzoneid();
 
 	if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
 	    sizeof (retval))) == 0) {
@@ -534,6 +542,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
 	return (err);
 }
 
+int
+dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid,
+    zoneid_t zid)
+{
+	dlmgmt_door_getlinkid_t		getlinkid;
+	dlmgmt_getlinkid_retval_t	retval;
+	int				err;
+
+	ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid());
+	getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
+	(void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+	getlinkid.ld_zoneid = zid;
+
+	if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
+	    sizeof (retval))) == 0) {
+		*linkid = retval.lr_linkid;
+	}
+	return (err);
+}
+
+
 datalink_id_t
 dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class,
     datalink_media_t dmedia, uint32_t flags)
@@ -740,12 +769,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw)
  * Create the "link" kstats.
  */
 static void
-dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid)
+dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid)
 {
 	kstat_t	*ksp;
+	char	*nm;
+	char	kname[MAXLINKNAMELEN];
+
+	if (zoneid != newzoneid) {
+		ASSERT(zoneid == GLOBAL_ZONEID);
+		(void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid,
+		    ddp->dd_linkname);
+		nm = kname;
+	} else {
+		nm = ddp->dd_linkname;
+	}
 
-	if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid,
-	    dls_devnet_stat_update, ddp, &ksp) == 0) {
+	if (dls_stat_create("link", 0, nm, zoneid,
+	    dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) {
 		ASSERT(ksp != NULL);
 		if (zoneid == ddp->dd_owner_zid) {
 			ASSERT(ddp->dd_ksp == NULL);
@@ -765,12 +805,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
 {
 	if (zoneid == ddp->dd_owner_zid) {
 		if (ddp->dd_ksp != NULL) {
-			kstat_delete(ddp->dd_ksp);
+			dls_stat_delete(ddp->dd_ksp);
 			ddp->dd_ksp = NULL;
 		}
 	} else {
 		if (ddp->dd_zone_ksp != NULL) {
-			kstat_delete(ddp->dd_zone_ksp);
+			dls_stat_delete(ddp->dd_zone_ksp);
 			ddp->dd_zone_ksp = NULL;
 		}
 	}
@@ -781,15 +821,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
  * and create the new set using the new name.
  */
 static void
-dls_devnet_stat_rename(dls_devnet_t *ddp)
+dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit)
 {
 	if (ddp->dd_ksp != NULL) {
-		kstat_delete(ddp->dd_ksp);
+		dls_stat_delete(ddp->dd_ksp);
 		ddp->dd_ksp = NULL;
 	}
-	/* We can't rename a link while it's assigned to a non-global zone. */
+	if (zoneinit && ddp->dd_zone_ksp != NULL) {
+		dls_stat_delete(ddp->dd_zone_ksp);
+		ddp->dd_zone_ksp = NULL;
+	}
+	/*
+	 * We can't rename a link while it's assigned to a non-global zone
+	 * unless we're first initializing the zone while readying it.
+	 */
 	ASSERT(ddp->dd_zone_ksp == NULL);
-	dls_devnet_stat_create(ddp, ddp->dd_owner_zid);
+	dls_devnet_stat_create(ddp, ddp->dd_owner_zid,
+	    (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid));
+	if (zoneinit)
+		dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid);
 }
 
 /*
@@ -878,7 +928,8 @@ done:
 	rw_exit(&i_dls_devnet_lock);
 	if (err == 0) {
 		if (zoneid != GLOBAL_ZONEID &&
-		    (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0)
+		    (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE,
+		    B_FALSE)) != 0)
 			(void) dls_devnet_unset(macname, &linkid, B_TRUE);
 		/*
 		 * The kstat subsystem holds its own locks (rather perimeter)
@@ -887,7 +938,7 @@ done:
 		 * lock hierarchy is kstat locks -> i_dls_devnet_lock.
 		 */
 		if (stat_create)
-			dls_devnet_stat_create(ddp, zoneid);
+			dls_devnet_stat_create(ddp, zoneid, zoneid);
 		if (ddpp != NULL)
 			*ddpp = ddp;
 	}
@@ -924,17 +975,78 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
 	ASSERT(ddp->dd_ref != 0);
 	if ((ddp->dd_ref != 1) || (!wait &&
 	    (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) {
-		mutex_exit(&ddp->dd_mutex);
-		rw_exit(&i_dls_devnet_lock);
-		return (EBUSY);
+		int zstatus = 0;
+
+		/*
+		 * There are a couple of alternatives that might be going on
+		 * here; a) the zone is shutting down and it has a transient
+		 * link assigned, in which case we want to clean it up instead
+		 * of moving it back to the global zone, or b) it's possible
+		 * that we're trying to clean up an orphaned vnic that was
+		 * delegated to a zone and which wasn't cleaned up properly
+		 * when the zone went away.  Check for either of these cases
+		 * before we simply return EBUSY.
+		 *
+		 * zstatus indicates which situation we are dealing with:
+		 *	 0 - means return EBUSY
+		 *	 1 - means case (a), cleanup transient link
+		 *	-1 - means case (b), orphaned VNIC
+		 */
+		if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) {
+			zone_t	*zp;
+
+			if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) {
+				zstatus = -1;
+			} else {
+				if (ddp->dd_transient) {
+					zone_status_t s = zone_status_get(zp);
+
+					if (s >= ZONE_IS_SHUTTING_DOWN)
+						zstatus = 1;
+				}
+				zone_rele(zp);
+			}
+		}
+
+		if (zstatus == 0) {
+			mutex_exit(&ddp->dd_mutex);
+			rw_exit(&i_dls_devnet_lock);
+			return (EBUSY);
+		}
+
+		/*
+		 * We want to delete the link, so reset ref to 1.
+		 */
+		if (zstatus == -1)
+			/* Log a warning, but continue in this case */
+			cmn_err(CE_WARN, "clear orphaned datalink: %s\n",
+			    ddp->dd_linkname);
+		ddp->dd_ref = 1;
 	}
 
 	ddp->dd_flags |= DD_CONDEMNED;
 	ddp->dd_ref--;
 	*id = ddp->dd_linkid;
 
-	if (ddp->dd_zid != GLOBAL_ZONEID)
-		(void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
+	if (ddp->dd_zid != GLOBAL_ZONEID) {
+		/*
+		 * We need to release the dd_mutex before we try and destroy the
+		 * stat. When we destroy it, we'll need to grab the lock for the
+		 * kstat but if there's a concurrent reader of the kstat, we'll
+		 * be blocked on it. This will lead to deadlock because these
+		 * kstats employ a ks_update function (dls_devnet_stat_update)
+		 * which needs the dd_mutex that we currently hold.
+		 *
+		 * Because we've already flagged the dls_devnet_t as
+		 * DD_CONDEMNED and we still have a write lock on
+		 * i_dls_devnet_lock, we should be able to release the dd_mutex.
+		 */
+		mutex_exit(&ddp->dd_mutex);
+		dls_devnet_stat_destroy(ddp, ddp->dd_zid);
+		mutex_enter(&ddp->dd_mutex);
+		(void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE,
+		    B_FALSE);
+	}
 
 	/*
 	 * Remove this dls_devnet_t from the hash table.
@@ -960,8 +1072,15 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
 		ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
 	}
 
-	if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+	if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
+		/*
+		 * See the earlier call in this function for an explanation.
+		 */
+		mutex_exit(&ddp->dd_mutex);
 		dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid);
+		mutex_enter(&ddp->dd_mutex);
+	}
+
 
 	ddp->dd_prop_loaded = B_FALSE;
 	ddp->dd_linkid = DATALINK_INVALID_LINKID;
@@ -972,6 +1091,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
 	return (0);
 }
 
+/*
+ * This is a private hold routine used when we already have the dls_link_t, thus
+ * we know that it cannot go away.
+ */
+int
+dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp)
+{
+	int err;
+	dls_devnet_t *ddp = NULL;
+
+	rw_enter(&i_dls_devnet_lock, RW_WRITER);
+	if ((err = mod_hash_find(i_dls_devnet_hash,
+	    (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) {
+		ASSERT(err == MH_ERR_NOTFOUND);
+		rw_exit(&i_dls_devnet_lock);
+		return (ENOENT);
+	}
+
+	mutex_enter(&ddp->dd_mutex);
+	ASSERT(ddp->dd_ref > 0);
+	if (ddp->dd_flags & DD_CONDEMNED) {
+		mutex_exit(&ddp->dd_mutex);
+		rw_exit(&i_dls_devnet_lock);
+		return (ENOENT);
+	}
+	ddp->dd_tref++;
+	mutex_exit(&ddp->dd_mutex);
+	rw_exit(&i_dls_devnet_lock);
+
+	*ddhp = ddp;
+	return (0);
+}
+
 static int
 dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp,
     boolean_t tmp_hold)
@@ -1111,7 +1263,7 @@ dls_devnet_rele(dls_devnet_t *ddp)
 }
 
 static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
 {
 	char			drv[MAXLINKNAMELEN];
 	uint_t			ppa;
@@ -1121,7 +1273,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
 	dls_dev_handle_t	ddh;
 	int			err;
 
-	if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0)
+	if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0)
 		return (dls_devnet_hold(linkid, ddpp));
 
 	/*
@@ -1261,9 +1413,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp)
  *
  *    This case does not change the <link name, linkid> mapping, so the link's
  *    kstats need to be updated with using name associated the given id2.
+ *
+ * The zoneinit parameter is used to allow us to create a VNIC in the global
+ * zone which is assigned to a non-global zone.  Since there is a race condition
+ * in the create process if two VNICs have the same name, we need to rename it
+ * after it has been assigned to the zone.
  */
 int
-dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
+dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link,
+    boolean_t zoneinit)
 {
 	dls_dev_handle_t	ddh = NULL;
 	int			err = 0;
@@ -1313,13 +1471,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
 	 * is currently accessing the link kstats, or if the link is on-loan
 	 * to a non-global zone. Then set the DD_KSTAT_CHANGING flag to
 	 * prevent any access to the kstats while we delete and recreate
-	 * kstats below.
+	 * kstats below.  However, we skip this check if we're renaming the
+	 * vnic as part of bringing it up for a zone.
 	 */
 	mutex_enter(&ddp->dd_mutex);
-	if (ddp->dd_ref > 1) {
-		mutex_exit(&ddp->dd_mutex);
-		err = EBUSY;
-		goto done;
+	if (!zoneinit) {
+		if (ddp->dd_ref > 1) {
+			mutex_exit(&ddp->dd_mutex);
+			err = EBUSY;
+			goto done;
+		}
 	}
 
 	ddp->dd_flags |= DD_KSTAT_CHANGING;
@@ -1333,7 +1494,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
 		/* rename mac client name and its flow if exists */
 		if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
 			goto done;
-		(void) mac_rename_primary(mh, link);
+		if (zoneinit) {
+			char tname[MAXLINKNAMELEN];
+
+			(void) snprintf(tname, sizeof (tname), "z%d_%s",
+			    ddp->dd_zid, link);
+			(void) mac_rename_primary(mh, tname);
+		} else {
+			(void) mac_rename_primary(mh, link);
+		}
 		mac_close(mh);
 		goto done;
 	}
@@ -1406,7 +1575,7 @@ done:
 	 */
 	rw_exit(&i_dls_devnet_lock);
 	if (err == 0)
-		dls_devnet_stat_rename(ddp);
+		dls_devnet_stat_rename(ddp, zoneinit);
 
 	if (clear_dd_flag) {
 		mutex_enter(&ddp->dd_mutex);
@@ -1421,7 +1590,8 @@ done:
 }
 
 static int
-i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
+i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop,
+    boolean_t transient)
 {
 	int			err;
 	mac_perim_handle_t	mph;
@@ -1454,6 +1624,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
 	}
 	if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) {
 		ddp->dd_zid = new_zoneid;
+		ddp->dd_transient = transient;
 		devnet_need_rebuild = B_TRUE;
 	}
 
@@ -1468,7 +1639,7 @@ done:
 }
 
 int
-dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
+dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient)
 {
 	dls_devnet_t	*ddp;
 	int		err;
@@ -1490,7 +1661,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
 		refheld = B_TRUE;
 	}
 
-	if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) {
+	if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) {
 		if (refheld)
 			dls_devnet_rele(ddp);
 		return (err);
@@ -1507,7 +1678,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
 	if (old_zid != GLOBAL_ZONEID)
 		dls_devnet_stat_destroy(ddh, old_zid);
 	if (new_zid != GLOBAL_ZONEID)
-		dls_devnet_stat_create(ddh, new_zid);
+		dls_devnet_stat_create(ddh, new_zid, new_zid);
 
 	return (0);
 }
@@ -1545,15 +1716,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid)
  * Access a vanity naming node.
  */
 int
-dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp,
+    zoneid_t zid)
 {
 	dls_devnet_t	*ddp;
 	dls_link_t	*dlp;
-	zoneid_t	zid = getzoneid();
+	zoneid_t	czid = getzoneid();
 	int		err;
 	mac_perim_handle_t	mph;
 
-	if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
+	if (czid != GLOBAL_ZONEID && czid != zid)
+		return (ENOENT);
+
+	if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0)
 		return (err);
 
 	dls_devnet_prop_task_wait(ddp);
@@ -1586,6 +1761,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
 	return (0);
 }
 
+int
+dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+{
+	return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid()));
+}
+
 /*
  * Close access to a vanity naming node.
  */
@@ -1765,6 +1946,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid)
 }
 
 const char *
+dls_devnet_link(dls_dl_handle_t ddh)
+{
+	return (ddh->dd_linkname);
+}
+
+const char *
 dls_devnet_mac(dls_dl_handle_t ddh)
 {
 	return (ddh->dd_mac);
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index 51e4be7260..82dceff278 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -30,30 +31,33 @@
 #include <sys/dld_impl.h>
 #include <sys/mac_ether.h>
 
-static mac_stat_info_t	i_dls_si[] = {
-	{ MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 },
-	{ MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 },
-	{ MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 },
-	{ MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 },
-	{ MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 },
-	{ MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 },
-	{ MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32,
-	    (uint64_t)LINK_STATE_UNKNOWN}
-};
-
-#define	STAT_INFO_COUNT	(sizeof (i_dls_si) / sizeof (i_dls_si[0]))
+/*
+ * Named kstats exported for a link; values are set by dls_stat_update().
+ */
+typedef struct {
+	kstat_named_t	dk_ifspeed;
+	kstat_named_t	dk_multircv;
+	kstat_named_t	dk_brdcstrcv;
+	kstat_named_t	dk_multixmt;
+	kstat_named_t	dk_brdcstxmt;
+	kstat_named_t	dk_norcvbuf;
+	kstat_named_t	dk_ierrors;
+	kstat_named_t	dk_noxmtbuf;
+	kstat_named_t	dk_oerrors;
+	kstat_named_t	dk_collisions;
+	kstat_named_t	dk_rbytes;	/* 32-bit view of MAC_STAT_RBYTES */
+	kstat_named_t	dk_ipackets;	/* 32-bit view of MAC_STAT_IPACKETS */
+	kstat_named_t	dk_obytes;	/* 32-bit view of MAC_STAT_OBYTES */
+	kstat_named_t	dk_opackets;	/* 32-bit view of MAC_STAT_OPACKETS */
+	kstat_named_t	dk_rbytes64;	/* 64-bit view of MAC_STAT_RBYTES */
+	kstat_named_t	dk_ipackets64;	/* 64-bit view of MAC_STAT_IPACKETS */
+	kstat_named_t	dk_obytes64;	/* 64-bit view of MAC_STAT_OBYTES */
+	kstat_named_t	dk_opackets64;	/* 64-bit view of MAC_STAT_OPACKETS */
+	kstat_named_t	dk_link_state;
+	kstat_named_t	dk_link_duplex;	/* LINK_DUPLEX_UNKNOWN if not DL_ETHER */
+	kstat_named_t	dk_unknowns;
+	kstat_named_t	dk_zonename;	/* string; set in dls_stat_create() */
+} dls_kstat_t;
 
 /*
  * Exported functions.
@@ -61,42 +65,54 @@ static mac_stat_info_t	i_dls_si[] = {
 int
 dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
 {
-	kstat_named_t	*knp;
-	uint_t		i;
-	uint64_t	val;
+	dls_kstat_t *dkp = ksp->ks_data;
 
 	if (rw != KSTAT_READ)
 		return (EACCES);
 
-	knp = (kstat_named_t *)ksp->ks_data;
-	for (i = 0; i < STAT_INFO_COUNT; i++) {
-		val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat);
-
-		switch (i_dls_si[i].msi_type) {
-		case KSTAT_DATA_UINT64:
-			knp->value.ui64 = val;
-			break;
-		case KSTAT_DATA_UINT32:
-			knp->value.ui32 = (uint32_t)val;
-			break;
-		default:
-			ASSERT(B_FALSE);
-		}
-
-		knp++;
-	}
+	dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED);
+	dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_MULTIRCV);
+	dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_BRDCSTRCV);
+	dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_MULTIXMT);
+	dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_BRDCSTXMT);
+	dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_NORCVBUF);
+	dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS);
+	dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_NOXMTBUF);
+	dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS);
+	dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_COLLISIONS);
+	dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+	dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_IPACKETS);
+	dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+	dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_OPACKETS);
+	dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+	dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_IPACKETS);
+	dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+	dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_OPACKETS);
+	dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh,
+	    MAC_STAT_LINK_STATE);
 
 	/*
 	 * Ethernet specific kstat "link_duplex"
 	 */
 	if (dlp->dl_mip->mi_nativemedia != DL_ETHER) {
-		knp->value.ui32 = LINK_DUPLEX_UNKNOWN;
+		dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN;
 	} else {
-		val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
-		knp->value.ui32 = (uint32_t)val;
+		dkp->dk_link_duplex.value.ui32 =
+		    (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
 	}
-	knp++;
-	knp->value.ui32 = dlp->dl_unknowns;
+
+	dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns;
 
 	return (0);
 }
@@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
 int
 dls_stat_create(const char *module, int instance, const char *name,
     zoneid_t zoneid, int (*update)(struct kstat *, int), void *private,
-    kstat_t **kspp)
+    kstat_t **kspp, zoneid_t newzoneid)
 {
 	kstat_t		*ksp;
-	kstat_named_t	*knp;
-	uint_t		i;
+	zone_t		*zone;
+	dls_kstat_t	*dkp;
 
 	if ((ksp = kstat_create_zone(module, instance, name, "net",
-	    KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) {
+	    KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) {
 		return (EINVAL);
 	}
 
 	ksp->ks_update = update;
 	ksp->ks_private = private;
+	dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP);
+	if ((zone = zone_find_by_id(newzoneid)) != NULL) {
+		ksp->ks_data_size += strlen(zone->zone_name) + 1;
+	}
 
-	knp = (kstat_named_t *)ksp->ks_data;
-	for (i = 0; i < STAT_INFO_COUNT; i++) {
-		kstat_named_init(knp, i_dls_si[i].msi_name,
-		    i_dls_si[i].msi_type);
-		knp++;
+	kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64);
+	kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64);
+	kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64);
+	kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64);
+	kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64);
+	kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_link_duplex, "link_duplex",
+		    KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32);
+	kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING);
+
+	if (zone != NULL) {
+		kstat_named_setstr(&dkp->dk_zonename, zone->zone_name);
+		zone_rele(zone);
 	}
 
-	kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32);
-	kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32);
 	kstat_install(ksp);
 	*kspp = ksp;
 	return (0);
 }
+
+void
+dls_stat_delete(kstat_t *ksp)
+{
+	void *kdata;
+	if (ksp == NULL)
+		return;
+	kdata = ksp->ks_data;	/* grab the buffer before the kstat is gone */
+	kstat_delete(ksp);
+	kmem_free(kdata, sizeof (dls_kstat_t));
+}
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..00aefb6f51
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
@@ -0,0 +1,32 @@
+/*
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..ac6d2d1b15
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+DR_SAS DRIVER
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c
new file mode 100644
index 0000000000..5b1dc82938
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.c
@@ -0,0 +1,5506 @@
+/*
+ * dr_sas.c: source for dr_sas driver
+ *
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Version:
+ * Author:
+ *		Arun Chandrashekhar
+ *		Manju R
+ *        	Rajesh Prabhakaran
+ *        	Seokmann Ju
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/mkdev.h>
+#include <sys/pci.h>
+#include <sys/scsi/scsi.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/signal.h>
+#include <sys/fs/dv_node.h>	/* devfs_clean */
+
+#include "dr_sas.h"
+
+/*
+ * FMA header files
+ */
+#include <sys/ddifm.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/fm/io/ddi.h>
+
+/*
+ * File-scope driver state shared by every dr_sas instance
+ */
+static void	*drsas_state = NULL;	/* soft-state anchor, see _init() */
+static int 	debug_level_g = CL_NONE; /* presumably con_log()'s threshold */
+
+#pragma weak scsi_hba_open
+#pragma weak scsi_hba_close
+#pragma weak scsi_hba_ioctl
+
+static ddi_dma_attr_t drsas_generic_dma_attr = {
+	DMA_ATTR_V0,		/* dma_attr_version */
+	0,			/* low DMA address range */
+	0xFFFFFFFFU,		/* high DMA address range */
+	0xFFFFFFFFU,		/* DMA counter register  */
+	8,			/* DMA address alignment */
+	0x07,			/* DMA burstsizes  */
+	1,			/* min DMA size */
+	0xFFFFFFFFU,		/* max DMA size */
+	0xFFFFFFFFU,		/* segment boundary */
+	DRSAS_MAX_SGE_CNT,	/* dma_attr_sgllen (attach overrides copy) */
+	512,			/* granularity of device */
+	0			/* bus specific DMA flags */
+};
+
+int32_t drsas_max_cap_maxxfer = 0x1000000;
+
+/*
+ * cb_ops: only open, close and ioctl are implemented; the rest are stubs
+ */
+static struct cb_ops drsas_cb_ops = {
+	drsas_open,		/* open */
+	drsas_close,		/* close */
+	nodev,			/* strategy */
+	nodev,			/* print */
+	nodev,			/* dump */
+	nodev,			/* read */
+	nodev,			/* write */
+	drsas_ioctl,		/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	nochpoll,		/* poll */
+	nodev,			/* cb_prop_op */
+	0,			/* streamtab  */
+	D_NEW | D_HOTPLUG,	/* cb_flag */
+	CB_REV,			/* cb_rev */
+	nodev,			/* cb_aread */
+	nodev			/* cb_awrite */
+};
+
+/*
+ * dev_ops: autoconfiguration entry points; quiesce is not supported
+ */
+static struct dev_ops drsas_ops = {
+	DEVO_REV,		/* rev, */
+	0,			/* refcnt */
+	drsas_getinfo,		/* getinfo */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	drsas_attach,		/* attach */
+	drsas_detach,		/* detach */
+	drsas_reset,		/* reset */
+	&drsas_cb_ops,		/* char/block ops */
+	NULL,			/* bus ops */
+	NULL,			/* power */
+	ddi_quiesce_not_supported,		/* quiesce */
+};
+
+char _depends_on[] = "misc/scsi";
+
+static struct modldrv modldrv = {
+	&mod_driverops,		/* module type - driver */
+	DRSAS_VERSION,		/* driver version string */
+	&drsas_ops,		/* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,	/* ml_rev - must be MODREV_1 */
+	&modldrv,	/* ml_linkage */
+	NULL		/* end of driver linkage */
+};
+
+static struct ddi_device_acc_attr endian_attr = {
+	DDI_DEVICE_ATTR_V0,	/* used when attach maps the registers */
+	DDI_STRUCTURE_LE_ACC,	/* little-endian structure access */
+	DDI_STRICTORDER_ACC	/* strictly ordered access */
+};
+
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *         common entry points - for loadable kernel modules                  *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+
+int
+_init(void)
+{
+	int rc;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* Soft-state anchor that drsas_attach() allocates instances from. */
+	rc = ddi_soft_state_init(&drsas_state,
+	    sizeof (struct drsas_instance), 0);
+	if (rc != DDI_SUCCESS) {
+		con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state"));
+		return (rc);
+	}
+
+	rc = scsi_hba_init(&modlinkage);
+	if (rc != DDI_SUCCESS) {
+		con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba"));
+		ddi_soft_state_fini(&drsas_state);
+		return (rc);
+	}
+
+	/* Undo both earlier steps, newest first, if the install fails. */
+	rc = mod_install(&modlinkage);
+	if (rc == DDI_SUCCESS)
+		return (rc);
+	con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed"));
+	scsi_hba_fini(&modlinkage);
+	ddi_soft_state_fini(&drsas_state);
+	return (rc);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+	/* Let mod_info() report on this module through its modlinkage. */
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int rc;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* Global state is torn down only once mod_remove() has succeeded. */
+	rc = mod_remove(&modlinkage);
+	if (rc == DDI_SUCCESS) {
+		scsi_hba_fini(&modlinkage);
+		ddi_soft_state_fini(&drsas_state);
+	}
+
+	return (rc);
+}
+
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *               common entry points - for autoconfiguration                  *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+
+/* attach(9E): bring up one controller; resume commands are only logged. */
+static int
+drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int		instance_no;
+	int		nregs;
+	uint8_t		added_isr_f = 0;
+	uint8_t		added_soft_isr_f = 0;
+	uint8_t		create_devctl_node_f = 0;
+	uint8_t		create_scsi_node_f = 0;
+	uint8_t		create_ioc_node_f = 0;
+	uint8_t		tran_alloc_f = 0;
+	uint8_t 	irq;
+	uint16_t	vendor_id;
+	uint16_t	device_id;
+	uint16_t	subsysvid;
+	uint16_t	subsysid;
+	uint16_t	command;
+	off_t		reglength = 0;
+	int		intr_types = 0;
+	char		*data;
+	int		msi_enable = 0;
+	scsi_hba_tran_t		*tran;
+	ddi_dma_attr_t  tran_dma_attr;
+	struct drsas_instance	*instance;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* CONSTCOND */
+	ASSERT(NO_COMPETING_THREADS);
+
+	instance_no = ddi_get_instance(dip);
+
+	/*
+	 * check to see whether this device is in a DMA-capable slot.
+	 */
+	if (ddi_slaveonly(dip) == DDI_SUCCESS) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas%d: Device in slave-only slot, unused",
+		    instance_no));
+		return (DDI_FAILURE);
+	}
+
+	switch (cmd) {
+		case DDI_ATTACH:
+			con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH"));
+			/* allocate the soft state for the instance */
+			if (ddi_soft_state_zalloc(drsas_state, instance_no)
+			    != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas%d: Failed to allocate soft state",
+				    instance_no));
+
+				return (DDI_FAILURE);
+			}
+
+			instance = (struct drsas_instance *)ddi_get_soft_state
+			    (drsas_state, instance_no);
+
+			if (instance == NULL) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas%d: Bad soft state", instance_no));
+
+				ddi_soft_state_free(drsas_state, instance_no);
+
+				return (DDI_FAILURE);
+			}
+
+			/* ddi_soft_state_zalloc() hands back zeroed memory */
+			instance->func_ptr = kmem_zalloc(
+			    sizeof (struct drsas_func_ptr), KM_SLEEP);
+			ASSERT(instance->func_ptr);
+
+			/* Setup the PCI configuration space handles */
+			if (pci_config_setup(dip, &instance->pci_handle) !=
+			    DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas%d: pci config setup failed ",
+				    instance_no));
+
+				kmem_free(instance->func_ptr,
+				    sizeof (struct drsas_func_ptr));
+				ddi_soft_state_free(drsas_state, instance_no);
+
+				return (DDI_FAILURE);
+			}
+
+			if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to get registers."));
+
+				pci_config_teardown(&instance->pci_handle);
+				kmem_free(instance->func_ptr,
+				    sizeof (struct drsas_func_ptr));
+				ddi_soft_state_free(drsas_state, instance_no);
+
+				return (DDI_FAILURE);
+			}
+
+			vendor_id = pci_config_get16(instance->pci_handle,
+			    PCI_CONF_VENID);
+			device_id = pci_config_get16(instance->pci_handle,
+			    PCI_CONF_DEVID);
+
+			subsysvid = pci_config_get16(instance->pci_handle,
+			    PCI_CONF_SUBVENID);
+			subsysid = pci_config_get16(instance->pci_handle,
+			    PCI_CONF_SUBSYSID);
+
+			pci_config_put16(instance->pci_handle, PCI_CONF_COMM,
+			    (pci_config_get16(instance->pci_handle,
+			    PCI_CONF_COMM) | PCI_COMM_ME));
+			irq = pci_config_get8(instance->pci_handle,
+			    PCI_CONF_ILINE);
+
+			con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+			    "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s",
+			    instance_no, vendor_id, device_id, subsysvid,
+			    subsysid, irq, DRSAS_VERSION));
+
+			/* enable bus-mastering */
+			command = pci_config_get16(instance->pci_handle,
+			    PCI_CONF_COMM);
+
+			if (!(command & PCI_COMM_ME)) {
+				command |= PCI_COMM_ME;
+
+				pci_config_put16(instance->pci_handle,
+				    PCI_CONF_COMM, command);
+
+				con_log(CL_ANN, (CE_CONT, "dr_sas%d: "
+				    "enable bus-mastering", instance_no));
+			} else {
+				con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+				"bus-mastering already set", instance_no));
+			}
+
+			/* initialize function pointers */
+			if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) ||
+			    (device_id == PCI_DEVICE_ID_LSI_2108V)) {
+				con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+				    "2108V/DE detected", instance_no));
+				instance->func_ptr->read_fw_status_reg =
+				    read_fw_status_reg_ppc;
+				instance->func_ptr->issue_cmd = issue_cmd_ppc;
+				instance->func_ptr->issue_cmd_in_sync_mode =
+				    issue_cmd_in_sync_mode_ppc;
+				instance->func_ptr->issue_cmd_in_poll_mode =
+				    issue_cmd_in_poll_mode_ppc;
+				instance->func_ptr->enable_intr =
+				    enable_intr_ppc;
+				instance->func_ptr->disable_intr =
+				    disable_intr_ppc;
+				instance->func_ptr->intr_ack = intr_ack_ppc;
+			} else {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: Invalid device detected"));
+
+				pci_config_teardown(&instance->pci_handle);
+				kmem_free(instance->func_ptr,
+				    sizeof (struct drsas_func_ptr));
+				ddi_soft_state_free(drsas_state, instance_no);
+
+				return (DDI_FAILURE);
+			}
+
+			instance->baseaddress = pci_config_get32(
+			    instance->pci_handle, PCI_CONF_BASE0);
+			instance->baseaddress &= 0x0fffc;
+
+			instance->dip		= dip;
+			instance->vendor_id	= vendor_id;
+			instance->device_id	= device_id;
+			instance->subsysvid	= subsysvid;
+			instance->subsysid	= subsysid;
+			instance->instance	= instance_no;
+
+			/* Initialize FMA */
+			instance->fm_capabilities = ddi_prop_get_int(
+			    DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS,
+			    "fm-capable", DDI_FM_EREPORT_CAPABLE |
+			    DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE
+			    | DDI_FM_ERRCB_CAPABLE);
+
+			drsas_fm_init(instance);
+
+			/* Size and map the controller registers */
+			if ((ddi_dev_regsize(instance->dip,
+			    REGISTER_SET_IO_2108, &reglength) != DDI_SUCCESS) ||
+			    reglength < MINIMUM_MFI_MEM_SZ) {
+				/* unwind instead of leaking state */
+				goto fail_attach;
+			}
+			if (reglength > DEFAULT_MFI_MEM_SZ) {
+				reglength = DEFAULT_MFI_MEM_SZ;
+				con_log(CL_DLEVEL1, (CE_NOTE,
+				    "dr_sas: register length to map is "
+				    "0x%lx bytes", reglength));
+			}
+			if (ddi_regs_map_setup(instance->dip,
+			    REGISTER_SET_IO_2108, &instance->regmap, 0,
+			    reglength, &endian_attr, &instance->regmap_handle)
+			    != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_NOTE,
+				    "dr_sas: couldn't map control registers"));
+				goto fail_attach;
+			}
+
+			/*
+			 * Disable Interrupt Now.
+			 * Setup Software interrupt
+			 */
+			instance->func_ptr->disable_intr(instance);
+
+			msi_enable = 0;
+			if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
+			    "drsas-enable-msi", &data) == DDI_SUCCESS) {
+				if (strncmp(data, "yes", 3) == 0) {
+					msi_enable = 1;
+					con_log(CL_ANN, (CE_WARN,
+					    "msi_enable = %d ENABLED",
+					    msi_enable));
+				}
+				ddi_prop_free(data);
+			}
+
+			con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d",
+			    msi_enable));
+
+			/* Check for all supported interrupt types */
+			if (ddi_intr_get_supported_types(
+			    dip, &intr_types) != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "ddi_intr_get_supported_types() failed"));
+				goto fail_attach;
+			}
+
+			con_log(CL_DLEVEL1, (CE_NOTE,
+			    "ddi_intr_get_supported_types() ret: 0x%x",
+			    intr_types));
+
+			/* Initialize and Setup Interrupt handler */
+			if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) {
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "MSIX interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_MSIX;
+			} else if (msi_enable && (intr_types &
+			    DDI_INTR_TYPE_MSI)) {
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_MSI) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "MSI interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_MSI;
+			} else if (intr_types & DDI_INTR_TYPE_FIXED) {
+				msi_enable = 0;
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "FIXED interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_FIXED;
+			} else {
+				con_log(CL_ANN, (CE_WARN, "Device cannot "
+				    "suppport either FIXED or MSI/X "
+				    "interrupts"));
+				goto fail_attach;
+			}
+
+			added_isr_f = 1;
+
+			/* setup the mfi based low level driver */
+			if (init_mfi(instance) != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN, "dr_sas: "
+				"could not initialize the low level driver"));
+
+				goto fail_attach;
+			}
+
+			/* Initialize all Mutex */
+			INIT_LIST_HEAD(&instance->completed_pool_list);
+			mutex_init(&instance->completed_pool_mtx,
+			    "completed_pool_mtx", MUTEX_DRIVER,
+			    DDI_INTR_PRI(instance->intr_pri));
+
+			mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx",
+			    MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+			cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL);
+
+			mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx",
+			    MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+
+			/* Register our soft-isr for highlevel interrupts. */
+			instance->isr_level = instance->intr_pri;
+			if (instance->isr_level == HIGH_LEVEL_INTR) {
+				if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH,
+				    &instance->soft_intr_id, NULL, NULL,
+				    drsas_softintr, (caddr_t)instance) !=
+				    DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    " Software ISR did not register"));
+
+					goto fail_attach;
+				}
+
+				added_soft_isr_f = 1;
+			}
+
+			/* Allocate a transport structure */
+			tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP);
+
+			if (tran == NULL) {
+				con_log(CL_ANN, (CE_WARN,
+				    "scsi_hba_tran_alloc failed"));
+				goto fail_attach;
+			}
+
+			tran_alloc_f = 1;
+
+			instance->tran = tran;
+
+			tran->tran_hba_private	= instance;
+			tran->tran_tgt_init	= drsas_tran_tgt_init;
+			tran->tran_tgt_probe	= scsi_hba_probe;
+			tran->tran_tgt_free	= drsas_tran_tgt_free;
+			tran->tran_init_pkt	= drsas_tran_init_pkt;
+			tran->tran_start	= drsas_tran_start;
+			tran->tran_abort	= drsas_tran_abort;
+			tran->tran_reset	= drsas_tran_reset;
+			tran->tran_getcap	= drsas_tran_getcap;
+			tran->tran_setcap	= drsas_tran_setcap;
+			tran->tran_destroy_pkt	= drsas_tran_destroy_pkt;
+			tran->tran_dmafree	= drsas_tran_dmafree;
+			tran->tran_sync_pkt	= drsas_tran_sync_pkt;
+			tran->tran_bus_config	= drsas_tran_bus_config;
+
+			tran_dma_attr = drsas_generic_dma_attr;
+			tran_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+
+			/* Attach this instance of the hba */
+			if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0)
+			    != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "scsi_hba_attach failed"));
+
+				goto fail_attach;
+			}
+
+			/* create devctl node for cfgadm command */
+			if (ddi_create_minor_node(dip, "devctl",
+			    S_IFCHR, INST2DEVCTL(instance_no),
+			    DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to create devctl node."));
+
+				goto fail_attach;
+			}
+
+			create_devctl_node_f = 1;
+
+			/* create scsi node for cfgadm command */
+			if (ddi_create_minor_node(dip, "scsi", S_IFCHR,
+			    INST2SCSI(instance_no),
+			    DDI_NT_SCSI_ATTACHMENT_POINT, 0) ==
+			    DDI_FAILURE) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to create scsi node."));
+
+				goto fail_attach;
+			}
+
+			create_scsi_node_f = 1;
+
+			(void) sprintf(instance->iocnode, "%d:lsirdctl",
+			    instance_no);
+
+			/*
+			 * Create a node for applications
+			 * for issuing ioctl to the driver.
+			 */
+			if (ddi_create_minor_node(dip, instance->iocnode,
+			    S_IFCHR, INST2LSIRDCTL(instance_no),
+			    DDI_PSEUDO, 0) == DDI_FAILURE) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to create ioctl node."));
+
+				goto fail_attach;
+			}
+
+			create_ioc_node_f = 1;
+
+			/* Create a taskq to handle dr events */
+			if ((instance->taskq = ddi_taskq_create(dip,
+			    "drsas_dr_taskq", 1,
+			    TASKQ_DEFAULTPRI, 0)) == NULL) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to create taskq "));
+				instance->taskq = NULL;
+				goto fail_attach;
+			}
+
+			/* enable interrupt */
+			instance->func_ptr->enable_intr(instance);
+
+			/* initiate AEN */
+			if (start_mfi_aen(instance)) {
+				con_log(CL_ANN, (CE_WARN,
+				    "dr_sas: failed to initiate AEN."));
+				goto fail_initiate_aen;
+			}
+
+			con_log(CL_DLEVEL1, (CE_NOTE,
+			    "AEN started for instance %d.", instance_no));
+
+			/* Finally! We are on the air.  */
+			ddi_report_dev(dip);
+
+			if (drsas_check_acc_handle(instance->regmap_handle) !=
+			    DDI_SUCCESS) {
+				goto fail_attach;
+			}
+			if (drsas_check_acc_handle(instance->pci_handle) !=
+			    DDI_SUCCESS) {
+				goto fail_attach;
+			}
+			instance->dr_ld_list =
+			    kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld),
+			    KM_SLEEP);
+			break;
+		case DDI_PM_RESUME:
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: DDI_PM_RESUME"));
+			break;
+		case DDI_RESUME:
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: DDI_RESUME"));
+			break;
+		default:
+			con_log(CL_ANN, (CE_WARN,
+			    "dr_sas: invalid attach cmd=%x", cmd));
+			return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+
+fail_initiate_aen:
+fail_attach:
+	if (create_devctl_node_f) {
+		ddi_remove_minor_node(dip, "devctl");
+	}
+
+	if (create_scsi_node_f) {
+		ddi_remove_minor_node(dip, "scsi");
+	}
+
+	if (create_ioc_node_f) {
+		ddi_remove_minor_node(dip, instance->iocnode);
+	}
+
+	if (tran_alloc_f) {
+		scsi_hba_tran_free(tran);
+	}
+
+	if (added_soft_isr_f) {
+		ddi_remove_softintr(instance->soft_intr_id);
+	}
+
+	if (added_isr_f) {
+		drsas_rem_intrs(instance);
+	}
+
+	if (instance && instance->taskq) {
+		ddi_taskq_destroy(instance->taskq);
+	}
+
+	drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+	ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+	drsas_fm_fini(instance);
+
+	pci_config_teardown(&instance->pci_handle);
+
+	/* func_ptr was allocated before any path that can reach this label */
+	kmem_free(instance->func_ptr, sizeof (struct drsas_func_ptr));
+	ddi_soft_state_free(drsas_state, instance_no);
+
+	con_log(CL_ANN, (CE_NOTE,
+	    "dr_sas: return failure from drsas_attach"));
+
+	return (DDI_FAILURE);
+}
+
+/* getinfo(9E): map the dev_t in arg to its dev_info_t or instance number. */
+/*ARGSUSED*/
+static int
+drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,  void *arg, void **resultp)
+{
+	int	rval;
+	int	drsas_minor = getminor((dev_t)arg);
+	struct drsas_instance	*instance;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	switch (cmd) {
+		case DDI_INFO_DEVT2DEVINFO:
+			instance = (struct drsas_instance *)
+			    ddi_get_soft_state(drsas_state,
+			    MINOR2INST(drsas_minor));
+			if (instance == NULL) {
+				*resultp = NULL;
+				rval = DDI_FAILURE;
+			} else {
+				*resultp = instance->dip;
+				rval = DDI_SUCCESS;
+			}
+			break;
+		case DDI_INFO_DEVT2INSTANCE:
+			/* the instance number itself, not a soft-state ptr */
+			*resultp = (void *)(intptr_t)(MINOR2INST(drsas_minor));
+			rval = DDI_SUCCESS;
+			break;
+		default:
+			*resultp = NULL;
+			rval = DDI_FAILURE;
+	}
+
+	return (rval);
+}
+
+/*
+ * drsas_detach - detach(9E) entry point
+ *
+ * DDI_DETACH tears the instance down: it detaches from the SCSA framework,
+ * frees the transport structure, flushes the controller cache, aborts the
+ * outstanding AEN command, disables and removes interrupts, destroys the
+ * taskq, and releases the LD list, the MFI command space, FM state, the PCI
+ * config handle, the function table and finally the soft state.
+ *
+ * DDI_PM_SUSPEND and DDI_SUSPEND only log and report success.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when the soft state cannot be found,
+ * the command is unknown, or scsi_hba_detach()/abort_aen_cmd() fail.
+ */
+static int
+drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int	instance_no;
+
+	struct drsas_instance	*instance;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* CONSTCOND */
+	ASSERT(NO_COMPETING_THREADS);
+
+	instance_no = ddi_get_instance(dip);
+
+	instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state,
+	    instance_no);
+
+	if (!instance) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas:%d could not get instance in detach",
+		    instance_no));
+
+		return (DDI_FAILURE);
+	}
+
+	con_log(CL_ANN, (CE_NOTE,
+	    "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x",
+	    instance_no, instance->vendor_id, instance->device_id,
+	    instance->subsysvid, instance->subsysid));
+
+	switch (cmd) {
+	case DDI_DETACH:
+		con_log(CL_ANN, (CE_NOTE,
+		    "drsas_detach: DDI_DETACH"));
+
+		if (scsi_hba_detach(dip) != DDI_SUCCESS) {
+			con_log(CL_ANN, (CE_WARN,
+			    "dr_sas:%d failed to detach",
+			    instance_no));
+
+			return (DDI_FAILURE);
+		}
+
+		/* instance->tran is left dangling (not reset to NULL) */
+		scsi_hba_tran_free(instance->tran);
+
+		flush_cache(instance);
+
+		/*
+		 * NOTE(review): a failure here returns DDI_FAILURE after
+		 * scsi_hba_detach() has already succeeded and the tran has
+		 * been freed, so the instance stays allocated in a partially
+		 * torn-down state.  Also, instance->aen_cmd is passed without
+		 * a NULL check - confirm it is always registered by attach.
+		 */
+		if (abort_aen_cmd(instance, instance->aen_cmd)) {
+			con_log(CL_ANN, (CE_WARN, "drsas_detach: "
+			    "failed to abort prevous AEN command"));
+
+			return (DDI_FAILURE);
+		}
+
+		instance->func_ptr->disable_intr(instance);
+
+		/* a soft interrupt is only removed for high-level ISRs */
+		if (instance->isr_level == HIGH_LEVEL_INTR) {
+			ddi_remove_softintr(instance->soft_intr_id);
+		}
+
+		drsas_rem_intrs(instance);
+
+		if (instance->taskq) {
+			ddi_taskq_destroy(instance->taskq);
+		}
+		kmem_free(instance->dr_ld_list, MRDRV_MAX_LD
+		    * sizeof (struct drsas_ld));
+		free_space_for_mfi(instance);
+
+		drsas_fm_fini(instance);
+
+		pci_config_teardown(&instance->pci_handle);
+
+		kmem_free(instance->func_ptr,
+		    sizeof (struct drsas_func_ptr));
+
+		/* 'instance' must not be touched after this point */
+		ddi_soft_state_free(drsas_state, instance_no);
+		break;
+	case DDI_PM_SUSPEND:
+		/* no device state is saved; only logged */
+		con_log(CL_ANN, (CE_NOTE,
+		    "drsas_detach: DDI_PM_SUSPEND"));
+
+		break;
+	case DDI_SUSPEND:
+		/* no device state is saved; only logged */
+		con_log(CL_ANN, (CE_NOTE,
+		    "drsas_detach: DDI_SUSPEND"));
+
+		break;
+	default:
+		con_log(CL_ANN, (CE_WARN,
+		    "invalid detach command:0x%x", cmd));
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *             common entry points - for character driver types               *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+/*
+ * drsas_open - open(9E) entry point
+ *
+ * The caller must pass drv_priv(), the open type must be OTYP_CHR and the
+ * minor number must map to an existing soft state.  When all of that holds
+ * the request is forwarded to scsi_hba_open().
+ *
+ * Returns 0 on success; EPERM, EINVAL, ENXIO or the scsi_hba_open() result
+ * otherwise.
+ */
+static  int
+drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+	void	*softs;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* privileged callers only */
+	if (drv_priv(credp) != 0) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas: Non-root ioctl access denied!"));
+		return (EPERM);
+	}
+
+	/* the node is a character device */
+	if (otyp != OTYP_CHR) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas: ioctl node must be a char node"));
+		return (EINVAL);
+	}
+
+	/* the minor has to name a known instance */
+	softs = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev)));
+	if (softs == NULL) {
+		return (ENXIO);
+	}
+
+	if (!scsi_hba_open) {
+		return (0);
+	}
+
+	return (scsi_hba_open(dev, openflags, otyp, credp));
+}
+
+/*
+ * drsas_close - close(9E) entry point
+ *
+ * No driver state is touched; the request is simply forwarded to
+ * scsi_hba_close() and its result returned (0 when that symbol is absent).
+ */
+static  int
+drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* nothing here needs locking */
+	if (!scsi_hba_close) {
+		return (0);
+	}
+
+	return (scsi_hba_close(dev, openflags, otyp, credp));
+}
+
+/*
+ * drsas_ioctl - ioctl(9E) entry point
+ *
+ * DRSAS_IOCTL_FIRMWARE: copies a struct drsas_ioctl in from 'arg', hands it
+ *	to handle_drv_ioctl() (control code DRSAS_DRIVER_IOCTL_COMMON) or to
+ *	handle_mfi_ioctl(), and copies the structure back out.
+ * DRSAS_IOCTL_AEN: copies a struct drsas_aen in, passes it to
+ *	handle_mfi_aen(), and copies it back out.
+ * anything else: forwarded to scsi_hba_ioctl().
+ *
+ * Returns ENXIO for an unknown minor, EFAULT when user memory cannot be
+ * copied, otherwise the value produced by the selected handler.
+ */
+static int
+drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	int	rval = 0;
+
+	struct drsas_instance	*instance;
+	struct drsas_ioctl	*ioctl;
+	struct drsas_aen	aen;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev)));
+
+	if (instance == NULL) {
+		/* invalid minor number */
+		con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found."));
+		return (ENXIO);
+	}
+
+	switch ((uint_t)cmd) {
+		case DRSAS_IOCTL_FIRMWARE:
+			/* the scratch copy is only needed on this path */
+			ioctl = kmem_zalloc(sizeof (struct drsas_ioctl),
+			    KM_SLEEP);
+
+			if (ddi_copyin((void *)arg, ioctl,
+			    sizeof (struct drsas_ioctl), mode)) {
+				con_log(CL_ANN, (CE_WARN, "drsas_ioctl "
+				    "ERROR IOCTL copyin"));
+				kmem_free(ioctl, sizeof (struct drsas_ioctl));
+				return (EFAULT);
+			}
+
+			if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) {
+				rval = handle_drv_ioctl(instance, ioctl, mode);
+			} else {
+				rval = handle_mfi_ioctl(instance, ioctl, mode);
+			}
+
+			/*
+			 * NOTE(review): the last byte of the structure is
+			 * deliberately left out, exactly as the original
+			 * per-byte loop did - confirm whether that is
+			 * intended.
+			 */
+			if (ddi_copyout(ioctl, (void *)arg,
+			    sizeof (struct drsas_ioctl) - 1, mode)) {
+				con_log(CL_ANN, (CE_WARN,
+				    "drsas_ioctl: ddi_copyout "
+				    "failed"));
+				rval = EFAULT;
+			}
+
+			kmem_free(ioctl, sizeof (struct drsas_ioctl));
+			break;
+		case DRSAS_IOCTL_AEN:
+			if (ddi_copyin((void *)arg, &aen,
+			    sizeof (struct drsas_aen), mode)) {
+				con_log(CL_ANN, (CE_WARN,
+				    "drsas_ioctl: "
+				    "ERROR AEN copyin"));
+				return (EFAULT);
+			}
+
+			rval = handle_mfi_aen(instance, &aen);
+
+			if (ddi_copyout(&aen, (void *)arg,
+			    sizeof (struct drsas_aen), mode)) {
+				con_log(CL_ANN, (CE_WARN,
+				    "drsas_ioctl: "
+				    "ddi_copyout failed"));
+				rval = EFAULT;
+			}
+
+			break;
+		default:
+			rval = scsi_hba_ioctl(dev, cmd, arg,
+			    mode, credp, rvalp);
+
+			con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: "
+			    "scsi_hba_ioctl called, ret = %x.", rval));
+	}
+
+	return (rval);
+}
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *               common entry points - for block driver types                 *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+/*
+ * drsas_reset - reset(9E) entry point
+ *
+ * Disables the adapter's interrupts and flushes its cache.  Fails only when
+ * no soft state exists for the instance of 'dip'.
+ */
+/*ARGSUSED*/
+static int
+drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd)
+{
+	int			inst_no = ddi_get_instance(dip);
+	struct drsas_instance	*softs;
+
+	softs = (struct drsas_instance *)ddi_get_soft_state(drsas_state,
+	    inst_no);
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	if (softs == NULL) {
+		con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter "
+		    "in reset", inst_no));
+		return (DDI_FAILURE);
+	}
+
+	/* quiesce interrupts before pushing the cache out */
+	softs->func_ptr->disable_intr(softs);
+
+	con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d",
+	    inst_no));
+
+	flush_cache(softs);
+
+	return (DDI_SUCCESS);
+}
+
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *                          entry points (SCSI HBA)                           *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+/*
+ * drsas_tran_tgt_init - tran_tgt_init(9E) entry point
+ *
+ * Non-persistent target nodes are merged with drsas_name_node(), have their
+ * unit address cleared and are rejected with DDI_FAILURE.  For a persistent
+ * node addressing LUN 0 of a target below MRDRV_MAX_LD, the node is recorded
+ * in dr_ld_list[] when that slot is still empty and the child is bound to
+ * the "sd" driver.
+ *
+ * Returns DDI_SUCCESS for every persistent node.
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+		scsi_hba_tran_t *tran, struct scsi_device *sd)
+{
+	struct drsas_instance *instance;
+	uint16_t tgt = sd->sd_address.a_target;
+	uint8_t lun = sd->sd_address.a_lun;
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d",
+	    tgt, lun));
+
+	instance = ADDR2MR(&sd->sd_address);
+
+	if (ndi_dev_is_persistent_node(tgt_dip) == 0) {
+		(void) ndi_merge_node(tgt_dip, drsas_name_node);
+		ddi_set_name_addr(tgt_dip, NULL);
+
+		con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in "
+		    "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d",
+		    tgt, lun));
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * dr_ld_list[] has MRDRV_MAX_LD entries (see the kmem_free() in
+	 * detach), so it may only be indexed - even for logging - once the
+	 * target number has been range checked.
+	 */
+	if (tgt < MRDRV_MAX_LD) {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "drsas_tgt_init dev_dip %p tgt_dip %p",
+		    (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip));
+
+		if (lun == 0 && instance->dr_ld_list[tgt].dip == NULL &&
+		    strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) {
+			instance->dr_ld_list[tgt].dip = tgt_dip;
+			instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN;
+		}
+	}
+	return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_tran_tgt_free - tran_tgt_free(9E) entry point
+ *
+ * Forgets the dev_info recorded in dr_ld_list[] for LUN 0 of an in-range
+ * target, provided the recorded node is the one being freed.
+ */
+/*ARGSUSED*/
+static void
+drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+    scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+	int			target = sd->sd_address.a_target;
+	int			unit = sd->sd_address.a_lun;
+	struct drsas_instance	*softs = ADDR2MR(&sd->sd_address);
+
+	con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", target, unit));
+
+	/* only LUN 0 of a valid logical drive is ever tracked */
+	if (!(target < MRDRV_MAX_LD) || unit != 0) {
+		return;
+	}
+
+	if (softs->dr_ld_list[target].dip == tgt_dip) {
+		softs->dr_ld_list[target].dip = NULL;
+	}
+}
+
+/*
+ * drsas_find_child - look up the child node for a target/lun pair
+ *
+ * Walks the children of the HBA node and returns the first one whose
+ * "target,lun" unit address (as produced by drsas_name_node()) equals the
+ * requested one.  Children for which drsas_name_node() fails are skipped.
+ *
+ * Returns the matching dev_info pointer, or NULL when there is none.
+ */
+static dev_info_t *
+drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun)
+{
+	dev_info_t *child = NULL;
+	char addr[SCSI_MAXNAMELEN];
+	char tmp[MAXNAMELEN];
+
+	/* bounded formatting, same format as drsas_name_node() */
+	(void) snprintf(addr, sizeof (addr), "%x,%x", tgt, lun);
+	for (child = ddi_get_child(instance->dip); child;
+	    child = ddi_get_next_sibling(child)) {
+
+		if (drsas_name_node(child, tmp, MAXNAMELEN) !=
+		    DDI_SUCCESS) {
+			continue;
+		}
+
+		if (strcmp(addr, tmp) == 0) {
+			break;
+		}
+	}
+	con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p",
+	    (void *)child));
+	return (child);
+}
+
+/*
+ * drsas_name_node - build the "target,lun" unit address of a child node
+ *
+ * Reads the "target" and "lun" integer properties of 'dip' and formats them
+ * in hex into 'name' (at most 'len' bytes).
+ *
+ * Returns DDI_FAILURE when either property is missing (reads back as -1),
+ * DDI_SUCCESS otherwise.
+ */
+static int
+drsas_name_node(dev_info_t *dip, char *name, int len)
+{
+	int	target;
+	int	unit;
+
+	target = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "target", -1);
+	con_log(CL_ANN1, (CE_NOTE,
+	    "drsas_name_node: dip %p tgt %d", (void *)dip, target));
+	if (target == -1)
+		return (DDI_FAILURE);
+
+	unit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "lun", -1);
+	con_log(CL_ANN1,
+	    (CE_NOTE, "drsas_name_node: tgt %d lun %d", target, unit));
+	if (unit == -1)
+		return (DDI_FAILURE);
+
+	(void) snprintf(name, len, "%x,%x", target, unit);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_tran_init_pkt - tran_init_pkt(9E) entry point
+ *
+ * When 'pkt' is NULL a new packet (with a struct scsa_cmd as HBA private
+ * data) is allocated through scsi_hba_pkt_alloc() and initialized; otherwise
+ * the caller's packet is reused.  If 'bp' describes a non-empty transfer,
+ * DMA resources are either set up for the first time (drsas_dma_alloc) or
+ * advanced (drsas_dma_move) depending on whether the command already owns a
+ * DMA handle.
+ *
+ * Returns the packet, or NULL on failure.  A packet allocated by this call
+ * is freed again when drsas_dma_alloc() fails; a caller-supplied packet is
+ * never freed here.
+ */
+static struct scsi_pkt *
+drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt,
+	struct buf *bp, int cmdlen, int statuslen, int tgtlen,
+	int flags, int (*callback)(), caddr_t arg)
+{
+	struct scsa_cmd	*acmd;
+	struct drsas_instance	*instance;
+	struct scsi_pkt	*new_pkt;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	instance = ADDR2MR(ap);
+
+	/* step #1 : pkt allocation */
+	if (pkt == NULL) {
+		pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen,
+		    tgtlen, sizeof (struct scsa_cmd), callback, arg);
+		if (pkt == NULL) {
+			return (NULL);
+		}
+
+		acmd = PKT2CMD(pkt);
+
+		/*
+		 * Initialize the new pkt - we redundantly initialize
+		 * all the fields for illustrative purposes.
+		 */
+		acmd->cmd_pkt		= pkt;
+		acmd->cmd_flags		= 0;
+		acmd->cmd_scblen	= statuslen;
+		acmd->cmd_cdblen	= cmdlen;
+		acmd->cmd_dmahandle	= NULL;
+		acmd->cmd_ncookies	= 0;
+		acmd->cmd_cookie	= 0;
+		acmd->cmd_cookiecnt	= 0;
+		acmd->cmd_nwin		= 0;
+
+		pkt->pkt_address	= *ap;
+		pkt->pkt_comp		= (void (*)())NULL;
+		pkt->pkt_flags		= 0;
+		pkt->pkt_time		= 0;
+		pkt->pkt_resid		= 0;
+		pkt->pkt_state		= 0;
+		pkt->pkt_statistics	= 0;
+		pkt->pkt_reason		= 0;
+		/* remember that this call owns the packet */
+		new_pkt			= pkt;
+	} else {
+		acmd = PKT2CMD(pkt);
+		new_pkt = NULL;
+	}
+
+	/* step #2 : dma allocation/move */
+	if (bp && bp->b_bcount != 0) {
+		if (acmd->cmd_dmahandle == NULL) {
+			/* first binding for this command */
+			if (drsas_dma_alloc(instance, pkt, bp, flags,
+			    callback) == DDI_FAILURE) {
+				if (new_pkt) {
+					scsi_hba_pkt_free(ap, new_pkt);
+				}
+				return ((struct scsi_pkt *)NULL);
+			}
+		} else {
+			/* a handle already exists: continue the transfer */
+			if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) {
+				return ((struct scsi_pkt *)NULL);
+			}
+		}
+	}
+
+	return (pkt);
+}
+
+/*
+ * drsas_tran_start - tran_start(9E) entry point
+ *
+ * Builds an MFI command for 'pkt' via build_cmd() and issues it.
+ *
+ *  - If build_cmd() reports the request as already done, the packet is
+ *    completed immediately with STATUS_GOOD.
+ *  - If no command could be built, TRAN_BUSY is returned.
+ *  - Without FLAG_NOINTR the command is handed to the firmware through
+ *    issue_cmd(); this function returns without completing the packet.
+ *  - With FLAG_NOINTR the command is issued in polled mode, the MFI status
+ *    is translated into the packet, the command is returned to the pool and
+ *    the completion callback (if any) is invoked.
+ *
+ * Returns TRAN_ACCEPT or TRAN_BUSY.
+ */
+static int
+drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt)
+{
+	uchar_t 	cmd_done = 0;
+
+	struct drsas_instance	*instance = ADDR2MR(ap);
+	struct drsas_cmd	*cmd;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x",
+	    __func__, __LINE__, pkt->pkt_cdbp[0]));
+
+	pkt->pkt_reason	= CMD_CMPLT;
+	*pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */
+
+	cmd = build_cmd(instance, ap, pkt, &cmd_done);
+
+	/*
+	 * Check if the command is already completed by the drsas_build_cmd()
+	 * routine. In which case the busy_flag would be clear and scb will be
+	 * NULL and appropriate reason provided in pkt_reason field
+	 *
+	 * NOTE(review): pkt_reason and the status byte are overwritten with
+	 * CMD_CMPLT / STATUS_GOOD below, so whatever build_cmd() stored there
+	 * is not preserved - confirm that is intended.
+	 */
+	if (cmd_done) {
+		pkt->pkt_reason = CMD_CMPLT;
+		pkt->pkt_scbp[0] = STATUS_GOOD;
+		pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET
+		    | STATE_SENT_CMD;
+		if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) {
+			(*pkt->pkt_comp)(pkt);
+		}
+
+		return (TRAN_ACCEPT);
+	}
+
+	if (cmd == NULL) {
+		return (TRAN_BUSY);
+	}
+
+	if ((pkt->pkt_flags & FLAG_NOINTR) == 0) {
+		/* interrupt-driven path */
+		if (instance->fw_outstanding > instance->max_fw_cmds) {
+			con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy"));
+			return_mfi_pkt(instance, cmd);
+			return (TRAN_BUSY);
+		}
+
+		/* Synchronize the Cmd frame for the controller */
+		(void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0,
+		    DDI_DMA_SYNC_FORDEV);
+
+		instance->func_ptr->issue_cmd(cmd, instance);
+
+	} else {
+		/* polled path: the command completes before we return */
+		struct drsas_header *hdr = &cmd->frame->hdr;
+
+		cmd->sync_cmd = DRSAS_TRUE;
+
+		instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd);
+
+		pkt->pkt_reason		= CMD_CMPLT;
+		pkt->pkt_statistics	= 0;
+		pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+		/* translate the MFI completion status into the packet */
+		switch (ddi_get8(cmd->frame_dma_obj.acc_handle,
+		    &hdr->cmd_status)) {
+		case MFI_STAT_OK:
+			pkt->pkt_scbp[0] = STATUS_GOOD;
+			break;
+
+		case MFI_STAT_SCSI_DONE_WITH_ERROR:
+
+			pkt->pkt_reason	= CMD_CMPLT;
+			pkt->pkt_statistics = 0;
+
+			((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1;
+			break;
+
+		case MFI_STAT_DEVICE_NOT_FOUND:
+			pkt->pkt_reason		= CMD_DEV_GONE;
+			pkt->pkt_statistics	= STAT_DISCON;
+			break;
+
+		default:
+			((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1;
+		}
+
+		/*
+		 * NOTE(review): the command goes back to the free pool before
+		 * drsas_common_check() inspects it, so another thread could
+		 * already have taken it from the pool - verify this ordering.
+		 */
+		return_mfi_pkt(instance, cmd);
+		(void) drsas_common_check(instance, cmd);
+
+		if (pkt->pkt_comp) {
+			(*pkt->pkt_comp)(pkt);
+		}
+
+	}
+
+	return (TRAN_ACCEPT);
+}
+
+/*
+ * drsas_tran_abort - tran_abort(9E) entry point
+ *
+ * The hardware cannot abort commands, so this always returns DDI_FAILURE.
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * drsas_tran_reset - tran_reset(9E) entry point
+ *
+ * The hardware cannot reset a bus or target on request, so this always
+ * returns DDI_FAILURE.
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_reset(struct scsi_address *ap, int level)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * drsas_tran_getcap - tran_getcap(9E) entry point
+ *
+ * Reports the value of the named capability.  'whom' is ignored, so the
+ * answer is the same for every target.
+ *
+ * Returns the capability value, or -1 when 'cap' is NULL, names the
+ * geometry capability, or is not handled here.
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom)
+{
+	struct drsas_instance	*instance = ADDR2MR(ap);
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* we do allow inquiring about capabilities for other targets */
+	if (cap == NULL) {
+		return (-1);
+	}
+
+	switch (scsi_hba_lookup_capstr(cap)) {
+	case SCSI_CAP_DMA_MAX:
+		/* Limit to 16MB max transfer */
+		return (drsas_max_cap_maxxfer);
+
+	case SCSI_CAP_INITIATOR_ID:
+		return (instance->init_id);
+
+	/* capabilities reported as present */
+	case SCSI_CAP_MSG_OUT:
+	case SCSI_CAP_WIDE_XFER:
+	case SCSI_CAP_TAGGED_QING:
+	case SCSI_CAP_UNTAGGED_QING:
+	case SCSI_CAP_PARITY:
+	case SCSI_CAP_ARQ:
+	case SCSI_CAP_RESET_NOTIFICATION:
+		return (1);
+
+	/* capabilities reported as absent */
+	case SCSI_CAP_DISCONNECT:
+	case SCSI_CAP_SYNCHRONOUS:
+	case SCSI_CAP_LINKED_CMDS:
+		return (0);
+
+	case SCSI_CAP_GEOMETRY:
+		return (-1);
+
+	default:
+		con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x",
+		    scsi_hba_lookup_capstr(cap)));
+		return (-1);
+	}
+}
+
+/*
+ * drsas_tran_setcap - tran_setcap(9E) entry point
+ *
+ * No capability is actually changed by this routine; it only answers with
+ * a status code for the named capability.
+ *
+ * Returns -1 when 'cap' is NULL, 'whom' is 0 or the capability is not
+ * listed below, and 1 for every listed capability.
+ *
+ * NOTE(review): the capabilities described as "not settable" still report
+ * 1 - check against tran_setcap(9E) whether 0 was meant.
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* We don't allow setting capabilities for other targets */
+	if (cap == NULL || whom == 0) {
+		return (-1);
+	}
+
+	switch (scsi_hba_lookup_capstr(cap)) {
+	/* None of these are settable via the capability interface. */
+	case SCSI_CAP_DMA_MAX:
+	case SCSI_CAP_MSG_OUT:
+	case SCSI_CAP_PARITY:
+	case SCSI_CAP_LINKED_CMDS:
+	case SCSI_CAP_RESET_NOTIFICATION:
+	case SCSI_CAP_DISCONNECT:
+	case SCSI_CAP_SYNCHRONOUS:
+	case SCSI_CAP_UNTAGGED_QING:
+	case SCSI_CAP_WIDE_XFER:
+	case SCSI_CAP_INITIATOR_ID:
+	case SCSI_CAP_ARQ:
+		return (1);
+
+	/* accepted without any further action */
+	case SCSI_CAP_TAGGED_QING:
+	case SCSI_CAP_SECTOR_SIZE:
+	case SCSI_CAP_TOTAL_SECTORS:
+		return (1);
+
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * drsas_tran_destroy_pkt - tran_destroy_pkt(9E) entry point
+ *
+ * Releases the DMA binding and handle of the command (when it holds valid
+ * DMA resources) and then frees the packet itself.
+ */
+static void
+drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+	struct scsa_cmd	*scmd = PKT2CMD(pkt);
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	if ((scmd->cmd_flags & CFLAG_DMAVALID) != 0) {
+		/* drop the flag first, then unbind and free the handle */
+		scmd->cmd_flags &= ~CFLAG_DMAVALID;
+		(void) ddi_dma_unbind_handle(scmd->cmd_dmahandle);
+		ddi_dma_free_handle(&scmd->cmd_dmahandle);
+		scmd->cmd_dmahandle = NULL;
+	}
+
+	scsi_hba_pkt_free(ap, pkt);
+}
+
+/*
+ * drsas_tran_dmafree - tran_dmafree(9E) entry point
+ *
+ * Releases the DMA binding and handle of the command; the packet itself is
+ * left alone.  Does nothing when the command holds no valid DMA resources.
+ */
+/*ARGSUSED*/
+static void
+drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+	register struct scsa_cmd	*scmd = PKT2CMD(pkt);
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	if ((scmd->cmd_flags & CFLAG_DMAVALID) == 0) {
+		return;
+	}
+
+	scmd->cmd_flags &= ~CFLAG_DMAVALID;
+	(void) ddi_dma_unbind_handle(scmd->cmd_dmahandle);
+	ddi_dma_free_handle(&scmd->cmd_dmahandle);
+	scmd->cmd_dmahandle = NULL;
+}
+
+/*
+ * drsas_tran_sync_pkt - tran_sync_pkt(9E) entry point
+ *
+ * Synchronizes the command's DMA window: for the device when the command
+ * is flagged CFLAG_DMASEND, for the CPU otherwise.  Does nothing when the
+ * command holds no valid DMA resources.
+ */
+/*ARGSUSED*/
+static void
+drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+	register struct scsa_cmd	*scmd = PKT2CMD(pkt);
+	uint_t				dir;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	if ((scmd->cmd_flags & CFLAG_DMAVALID) == 0) {
+		return;
+	}
+
+	if (scmd->cmd_flags & CFLAG_DMASEND) {
+		dir = DDI_DMA_SYNC_FORDEV;
+	} else {
+		dir = DDI_DMA_SYNC_FORCPU;
+	}
+
+	(void) ddi_dma_sync(scmd->cmd_dmahandle, scmd->cmd_dma_offset,
+	    scmd->cmd_dma_len, dir);
+}
+
+/*
+ * drsas_isr(caddr_t)
+ *
+ * The Interrupt Service Routine
+ *
+ * Collect status for all completed commands and do callback
+ *
+ * Every reply-queue entry between the consumer and producer index is taken
+ * as an index into cmd_list[]; the matching command is appended to the
+ * completed pool, the consumer index is written back, and completion
+ * processing is then run through drsas_softintr() - triggered as a soft
+ * interrupt for high-level interrupts, called directly otherwise.
+ *
+ * Returns DDI_INTR_CLAIMED when completions were collected, and
+ * DDI_INTR_UNCLAIMED when a fixed interrupt is not acknowledged by
+ * intr_ack(), the DMA handle check fails, or the queue is empty.
+ */
+static uint_t
+drsas_isr(struct drsas_instance *instance)
+{
+	int		need_softintr;
+	uint32_t	producer;
+	uint32_t	consumer;
+	uint32_t	context;
+
+	struct drsas_cmd	*cmd;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	ASSERT(instance);
+	/* intr_ack() is only consulted for fixed (possibly shared) interrupts */
+	if ((instance->intr_type == DDI_INTR_TYPE_FIXED) &&
+	    !instance->func_ptr->intr_ack(instance)) {
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	/* make the indices and reply queue written by the device visible */
+	(void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+	    0, 0, DDI_DMA_SYNC_FORCPU);
+
+	if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+	    != DDI_SUCCESS) {
+		drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+	    instance->producer);
+	consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+	    instance->consumer);
+
+	con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ",
+	    producer, consumer));
+	/* equal indices mean there is nothing new to collect */
+	if (producer == consumer) {
+		con_log(CL_ANN1, (CE_WARN, "producer = consumer case"));
+		return (DDI_INTR_UNCLAIMED);
+	}
+	mutex_enter(&instance->completed_pool_mtx);
+
+	while (consumer != producer) {
+		/*
+		 * NOTE(review): 'context' (and the indices above) come
+		 * straight from DMA memory and are used as array indices
+		 * without a range check.
+		 */
+		context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+		    &instance->reply_queue[consumer]);
+		cmd = instance->cmd_list[context];
+		mlist_add_tail(&cmd->list, &instance->completed_pool_list);
+
+		/* the reply queue is a ring of max_fw_cmds + 1 entries */
+		consumer++;
+		if (consumer == (instance->max_fw_cmds + 1)) {
+			consumer = 0;
+		}
+	}
+
+	mutex_exit(&instance->completed_pool_mtx);
+
+	/* publish the new consumer index to the device */
+	ddi_put32(instance->mfi_internal_dma_obj.acc_handle,
+	    instance->consumer, consumer);
+	(void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+	    0, 0, DDI_DMA_SYNC_FORDEV);
+
+	/* softint_running is read here without holding a lock */
+	if (instance->softint_running) {
+		need_softintr = 0;
+	} else {
+		need_softintr = 1;
+	}
+
+	if (instance->isr_level == HIGH_LEVEL_INTR) {
+		if (need_softintr) {
+			ddi_trigger_softintr(instance->soft_intr_id);
+		}
+	} else {
+		/*
+		 * Not a high-level interrupt, therefore call the soft level
+		 * interrupt explicitly
+		 */
+		(void) drsas_softintr(instance);
+	}
+
+	return (DDI_INTR_CLAIMED);
+}
+
+
+/*
+ * ************************************************************************** *
+ *                                                                            *
+ *                                  libraries                                 *
+ *                                                                            *
+ * ************************************************************************** *
+ */
+/*
+ * get_mfi_pkt : take one command off the free command pool
+ *
+ * Returns NULL when the pool is empty.  The returned command has its pkt
+ * pointer cleared; nothing else is reset, so the caller must zero the frame
+ * buffer before use.
+ *
+ * ***** Note *****
+ * Zeroing the frame also wipes its context id, which the caller SHOULD
+ * write back afterwards.
+ */
+static struct drsas_cmd *
+get_mfi_pkt(struct drsas_instance *instance)
+{
+	struct drsas_cmd	*taken = NULL;
+	mlist_t			*pool = &instance->cmd_pool_list;
+
+	mutex_enter(&instance->cmd_pool_mtx);
+	ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+	if (!mlist_empty(pool)) {
+		taken = mlist_entry(pool->next, struct drsas_cmd, list);
+		mlist_del_init(pool->next);
+	}
+
+	if (taken != NULL) {
+		taken->pkt = NULL;
+	}
+
+	mutex_exit(&instance->cmd_pool_mtx);
+
+	return (taken);
+}
+
+/*
+ * return_mfi_pkt : put a command back at the head of the free command pool
+ */
+static void
+return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+	mlist_t	*pool = &instance->cmd_pool_list;
+
+	mutex_enter(&instance->cmd_pool_mtx);
+	ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+	mlist_add(&cmd->list, pool);
+
+	mutex_exit(&instance->cmd_pool_mtx);
+}
+
+/*
+ * destroy_mfi_frame_pool
+ *
+ * Frees the frame DMA object of every command in cmd_list[] (max_fw_cmds + 1
+ * entries) that is still marked allocated, and marks every entry as freed.
+ */
+static void
+destroy_mfi_frame_pool(struct drsas_instance *instance)
+{
+	uint32_t		max_cmd = instance->max_fw_cmds;
+	int			slot = 0;
+	struct drsas_cmd	*entry;
+
+	while (slot < max_cmd+1) {
+		entry = instance->cmd_list[slot];
+
+		if (entry->frame_dma_obj_status == DMA_OBJ_ALLOCATED) {
+			(void) drsas_free_dma_obj(instance,
+			    entry->frame_dma_obj);
+		}
+
+		entry->frame_dma_obj_status = DMA_OBJ_FREED;
+		slot++;
+	}
+}
+
+/*
+ * create_mfi_frame_pool
+ *
+ * Allocates one DMA object per entry of cmd_list[] (max_fw_cmds + 1 entries).
+ * Each object holds the MFI frame, room for max_num_sge 64-bit SGEs, and a
+ * sense buffer of SENSE_LENGTH bytes placed at the very end.  The frame is
+ * zeroed and the command's index is stored as its context.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE as soon as one allocation fails.
+ * Objects allocated before the failure stay marked DMA_OBJ_ALLOCATED; this
+ * routine does not free them itself.
+ */
+static int
+create_mfi_frame_pool(struct drsas_instance *instance)
+{
+	int		i = 0;
+	int		cookie_cnt;
+	uint16_t	max_cmd;
+	uint16_t	sge_sz;
+	uint32_t	sgl_sz;
+	uint32_t	tot_frame_size;
+
+	struct drsas_cmd	*cmd;
+
+	max_cmd = instance->max_fw_cmds;
+
+	sge_sz	= sizeof (struct drsas_sge64);
+
+	/* calculated the number of 64byte frames required for SGL */
+	sgl_sz		= sge_sz * instance->max_num_sge;
+	tot_frame_size	= sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH;
+
+	con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: "
+	    "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size));
+
+	while (i < max_cmd+1) {
+		cmd = instance->cmd_list[i];
+
+		cmd->frame_dma_obj.size	= tot_frame_size;
+		cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr;
+		cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		cmd->frame_dma_obj.dma_attr.dma_attr_align = 64;
+
+
+		cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC);
+
+		/* the frame has to be covered by exactly one cookie */
+		if (cookie_cnt == -1 || cookie_cnt > 1) {
+			con_log(CL_ANN, (CE_WARN,
+			    "create_mfi_frame_pool: could not alloc."));
+			return (DDI_FAILURE);
+		}
+
+		/* validate the buffer before it is written to */
+		if (cmd->frame_dma_obj.buffer == NULL) {
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: pci_pool_alloc failed"));
+
+			return (DDI_FAILURE);
+		}
+
+		bzero(cmd->frame_dma_obj.buffer, tot_frame_size);
+
+		cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED;
+		cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer;
+		cmd->frame_phys_addr =
+		    cmd->frame_dma_obj.dma_cookie[0].dmac_address;
+
+		/* the sense area is the last SENSE_LENGTH bytes */
+		cmd->sense = (uint8_t *)(((unsigned long)
+		    cmd->frame_dma_obj.buffer) +
+		    tot_frame_size - SENSE_LENGTH);
+		cmd->sense_phys_addr =
+		    cmd->frame_dma_obj.dma_cookie[0].dmac_address +
+		    tot_frame_size - SENSE_LENGTH;
+
+		ddi_put32(cmd->frame_dma_obj.acc_handle,
+		    &cmd->frame->io.context, cmd->index);
+		i++;
+
+		con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x",
+		    cmd->index, cmd->frame_phys_addr));
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * free_additional_dma_buffer
+ *
+ * Releases the internal (reply queue / scratch) DMA object and the event
+ * detail DMA object.  Each is freed only while its status reads
+ * DMA_OBJ_ALLOCATED and is marked DMA_OBJ_FREED afterwards.
+ */
+static void
+free_additional_dma_buffer(struct drsas_instance *instance)
+{
+	/* reply queue + internal buffer */
+	if (DMA_OBJ_ALLOCATED == instance->mfi_internal_dma_obj.status) {
+		(void) drsas_free_dma_obj(instance,
+		    instance->mfi_internal_dma_obj);
+
+		instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED;
+	}
+
+	/* event detail buffer */
+	if (DMA_OBJ_ALLOCATED == instance->mfi_evt_detail_obj.status) {
+		(void) drsas_free_dma_obj(instance,
+		    instance->mfi_evt_detail_obj);
+
+		instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED;
+	}
+}
+
+/*
+ * alloc_additional_dma_buffer
+ *
+ * Allocates two DMA objects:
+ *
+ *  - mfi_internal_dma_obj (2 * PAGESIZE), laid out as
+ *	offset 0		producer index (uint32_t)
+ *	offset 4		consumer index (uint32_t)
+ *	offset 8		reply queue
+ *	offset reply_q_sz + 8	internal_buf (remainder of the object)
+ *  - mfi_evt_detail_obj, sized for one struct drsas_evt_detail.
+ *
+ * Both are zeroed and flagged DMA_OBJ_ALLOCATED.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when either allocation does not yield
+ * exactly one cookie.  The first object is not released here if the second
+ * allocation fails.
+ */
+static int
+alloc_additional_dma_buffer(struct drsas_instance *instance)
+{
+	uint32_t	reply_q_sz;
+	uint32_t	internal_buf_size = PAGESIZE*2;
+
+	/* max cmds plus 1 + producer & consumer */
+	reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2);
+
+	instance->mfi_internal_dma_obj.size = internal_buf_size;
+	instance->mfi_internal_dma_obj.dma_attr	= drsas_generic_dma_attr;
+	instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+	instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max =
+	    0xFFFFFFFFU;
+	instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen	= 1;
+
+	if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj,
+	    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas: could not alloc reply queue"));
+		return (DDI_FAILURE);
+	}
+
+	bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size);
+
+	/*
+	 * NOTE(review): the status is OR-ed in here but compared with '=='
+	 * in free_additional_dma_buffer() - this relies on the field holding
+	 * no other bits.
+	 */
+	instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED;
+
+	/* carve the object up: producer, consumer, reply queue, scratch */
+	instance->producer = (uint32_t *)((unsigned long)
+	    instance->mfi_internal_dma_obj.buffer);
+	instance->consumer = (uint32_t *)((unsigned long)
+	    instance->mfi_internal_dma_obj.buffer + 4);
+	instance->reply_queue = (uint32_t *)((unsigned long)
+	    instance->mfi_internal_dma_obj.buffer + 8);
+	/*
+	 * NOTE(review): nothing verifies that reply_q_sz + 8 is smaller than
+	 * internal_buf_size; a large max_fw_cmds would make the subtraction
+	 * below wrap.
+	 */
+	instance->internal_buf = (caddr_t)(((unsigned long)
+	    instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8);
+	instance->internal_buf_dmac_add =
+	    instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address +
+	    (reply_q_sz + 8);
+	instance->internal_buf_size = internal_buf_size -
+	    (reply_q_sz + 8);
+
+	/* allocate evt_detail */
+	instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail);
+	instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr;
+	instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+	instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+	instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1;
+	instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1;
+
+	if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj,
+	    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+		con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: "
+		    "could not allocate data transfer buffer."));
+		return (DDI_FAILURE);
+	}
+
+	bzero(instance->mfi_evt_detail_obj.buffer,
+	    sizeof (struct drsas_evt_detail));
+
+	instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * free_space_for_mfi
+ *
+ * Undoes alloc_space_for_mfi(): releases the additional DMA buffers, the
+ * per-command frame DMA objects, every command structure and finally the
+ * cmd_list array, then resets the command pool list head.  A NULL cmd_list
+ * means everything is already gone and the call is a no-op.
+ */
+static void
+free_space_for_mfi(struct drsas_instance *instance)
+{
+	uint32_t	max_cmd = instance->max_fw_cmds;
+	int		slot;
+
+	if (instance->cmd_list == NULL) {
+		/* already freed */
+		return;
+	}
+
+	free_additional_dma_buffer(instance);
+
+	/* the frame pool goes before the commands that reference it */
+	destroy_mfi_frame_pool(instance);
+
+	/* release each command structure */
+	slot = 0;
+	while (slot < instance->max_fw_cmds+1) {
+		kmem_free(instance->cmd_list[slot],
+		    sizeof (struct drsas_cmd));
+		instance->cmd_list[slot] = NULL;
+		slot++;
+	}
+
+	/* and finally the pointer array itself */
+	kmem_free(instance->cmd_list,
+	    sizeof (struct drsas_cmd *) * (max_cmd+1));
+	instance->cmd_list = NULL;
+
+	INIT_LIST_HEAD(&instance->cmd_pool_list);
+}
+
+/*
+ * alloc_space_for_mfi
+ *
+ * Allocates cmd_list[] with max_fw_cmds + 1 command structures.  The first
+ * max_fw_cmds commands are queued on the free command pool; the extra one
+ * is reserved for flush_cache and stays off the pool.  Afterwards the frame
+ * DMA pool and the additional DMA buffers are created.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when either DMA step fails.  Nothing
+ * allocated so far is released on failure by this routine.
+ */
+static int
+alloc_space_for_mfi(struct drsas_instance *instance)
+{
+	int			n;
+	uint32_t		max_cmd = instance->max_fw_cmds;
+	size_t			list_sz;
+	struct drsas_cmd	*cmd;
+
+	/* one slot more than the firmware limit, for flush_cache */
+	list_sz = sizeof (struct drsas_cmd *) * (max_cmd+1);
+
+	/*
+	 * cmd_list is an array of pointers; the array comes first, then a
+	 * command structure is allocated for every slot.
+	 */
+	instance->cmd_list = kmem_zalloc(list_sz, KM_SLEEP);
+	ASSERT(instance->cmd_list);
+
+	n = 0;
+	while (n < max_cmd+1) {
+		instance->cmd_list[n] = kmem_zalloc(sizeof (struct drsas_cmd),
+		    KM_SLEEP);
+		ASSERT(instance->cmd_list[n]);
+		n++;
+	}
+
+	INIT_LIST_HEAD(&instance->cmd_pool_list);
+
+	/* queue the regular commands on the free pool */
+	for (n = 0; n < max_cmd; n++) {
+		cmd = instance->cmd_list[n];
+		cmd->index = n;
+
+		mlist_add_tail(&cmd->list, &instance->cmd_pool_list);
+	}
+
+	/* the reserved flush_cache slot only gets its index (n == max_cmd) */
+	cmd = instance->cmd_list[max_cmd];
+	cmd->index = n;
+
+	/* one DMA frame per command */
+	if (create_mfi_frame_pool(instance)) {
+		con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+		return (DDI_FAILURE);
+	}
+
+	/* reply queue, internal buffer and event detail buffer */
+	if (alloc_additional_dma_buffer(instance)) {
+		con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * get_ctrl_info
+ *
+ * Issues a DR_DCMD_CTRL_GET_INFO DCMD in polled mode, using the instance's
+ * internal buffer as the DMA target, and copies the result into *ctrl_info.
+ *
+ * Returns 0 on success.  Returns DDI_FAILURE when no command or internal
+ * buffer is available, and -1 when the command fails or
+ * drsas_common_check() reports a problem.
+ */
+static int
+get_ctrl_info(struct drsas_instance *instance,
+    struct drsas_ctrl_info *ctrl_info)
+{
+	int	ret = 0;
+
+	struct drsas_cmd		*cmd;
+	struct drsas_dcmd_frame	*dcmd;
+	struct drsas_ctrl_info	*ci;
+
+	cmd = get_mfi_pkt(instance);
+
+	if (!cmd) {
+		con_log(CL_ANN, (CE_WARN,
+		    "Failed to get a cmd for ctrl info"));
+		return (DDI_FAILURE);
+	}
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+
+	dcmd = &cmd->frame->dcmd;
+
+	/* the controller writes its answer into the internal buffer */
+	ci = (struct drsas_ctrl_info *)instance->internal_buf;
+
+	if (!ci) {
+		con_log(CL_ANN, (CE_WARN,
+		    "Failed to alloc mem for ctrl info"));
+		return_mfi_pkt(instance, cmd);
+		return (DDI_FAILURE);
+	}
+
+	(void) memset(ci, 0, sizeof (struct drsas_ctrl_info));
+
+	/* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */
+	(void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+	/* build a read DCMD with a single 32-bit SGE */
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status,
+	    MFI_CMD_STATUS_POLL_MODE);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+	    MFI_FRAME_DIR_READ);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+	    sizeof (struct drsas_ctrl_info));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+	    DR_DCMD_CTRL_GET_INFO);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+	    instance->internal_buf_dmac_add);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+	    sizeof (struct drsas_ctrl_info));
+
+	cmd->frame_count = 1;
+
+	/* a zero return from issue_cmd_in_poll_mode() is treated as success */
+	if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+		ret = 0;
+		/*
+		 * NOTE(review): the copy reads the internal buffer through
+		 * the command frame's access handle rather than the handle
+		 * of mfi_internal_dma_obj - confirm this is intended.
+		 */
+		ddi_rep_get8(cmd->frame_dma_obj.acc_handle,
+		    (uint8_t *)ctrl_info, (uint8_t *)ci,
+		    sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR);
+	} else {
+		con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed"));
+		ret = -1;
+	}
+
+	/*
+	 * NOTE(review): the command is handed back to the pool before
+	 * drsas_common_check() looks at it - verify this ordering.
+	 */
+	return_mfi_pkt(instance, cmd);
+	if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+		ret = -1;
+	}
+
+	return (ret);
+}
+
+/*
+ * abort_aen_cmd - abort the outstanding AEN registration command
+ *
+ * Sends a synchronous MFI_CMD_OP_ABORT frame naming cmd_to_abort by index and
+ * frame address, then clears instance->aen_cmd.  Returns 0 when the abort was
+ * issued, -1 when issuing it failed, or DDI_FAILURE if no command packet
+ * could be obtained (instance->aen_cmd is left untouched in that case).
+ */
+static int
+abort_aen_cmd(struct drsas_instance *instance,
+    struct drsas_cmd *cmd_to_abort)
+{
+	int	ret = 0;
+	struct drsas_cmd		*cmd;
+	struct drsas_abort_frame	*abort_fr;
+
+	cmd = get_mfi_pkt(instance);
+	if (!cmd) {
+		con_log(CL_ANN, (CE_WARN,
+		    "Failed to get a cmd for abort aen"));
+		return (DDI_FAILURE);
+	}
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+	abort_fr = &cmd->frame->abort;
+
+	/* prepare and issue the abort frame */
+	ddi_put8(cmd->frame_dma_obj.acc_handle,
+	    &abort_fr->cmd, MFI_CMD_OP_ABORT);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status,
+	    MFI_CMD_STATUS_SYNC_MODE);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context,
+	    cmd_to_abort->index);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &abort_fr->abort_mfi_phys_addr_hi, 0);
+
+	/* drsas_softintr checks this flag when the AEN command completes */
+	instance->aen_cmd->abort_aen = 1;
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN,
+		    "abort_aen_cmd: issue_cmd_in_sync_mode failed"));
+		ret = -1;
+	}
+
+	instance->aen_cmd->abort_aen = 1;
+	instance->aen_cmd = NULL;
+
+	return_mfi_pkt(instance, cmd);
+	(void) drsas_common_check(instance, cmd);
+
+	return (ret);
+}
+
+/*
+ * init_mfi - move the FW to READY, allocate the command pool, send the
+ * polled MFI_CMD_OP_INIT frame describing the reply queue and query the
+ * controller info.  Returns DDI_SUCCESS, or DDI_FAILURE after freeing the
+ * register mapping (and the MFI space once past the READY check).
+ */
+static int
+init_mfi(struct drsas_instance *instance)
+{
+	struct drsas_cmd		*cmd;
+	struct drsas_ctrl_info		ctrl_info;
+	struct drsas_init_frame		*init_frame;
+	struct drsas_init_queue_info	*initq_info;
+
+	/* we expect the FW state to be READY */
+	if (mfi_state_transition_to_ready(instance)) {
+		con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready"));
+		goto fail_ready_state;
+	}
+
+	/* get various operational parameters from status register */
+	instance->max_num_sge =
+	    (instance->func_ptr->read_fw_status_reg(instance) &
+	    0xFF0000) >> 0x10;
+	/*
+	 * Reduce the max supported cmds by 1. This is to ensure that the
+	 * reply_q_sz (1 more than the max cmd that driver may send)
+	 * does not exceed max cmds that the FW can support
+	 */
+	instance->max_fw_cmds =
+	    (instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF) - 1;
+
+	instance->max_num_sge =
+	    (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ?
+	    DRSAS_MAX_SGE_CNT : instance->max_num_sge;
+
+	/* create a pool of commands */
+	if (alloc_space_for_mfi(instance) != DDI_SUCCESS)
+		goto fail_alloc_fw_space;
+
+	/*
+	 * Prepare a init frame. Note the init frame points to queue info
+	 * structure. Each frame has SGL allocated after first 64 bytes. For
+	 * this frame - since we don't need any SGL - we use SGL's space as
+	 * queue info structure
+	 */
+	cmd = get_mfi_pkt(instance);
+	if (cmd == NULL) {
+		con_log(CL_ANN, (CE_WARN, "init_mfi: failed to get a cmd"));
+		goto fail_fw_init;
+	}
+
+	init_frame = (struct drsas_init_frame *)cmd->frame;
+	initq_info = (struct drsas_init_queue_info *)
+	    ((unsigned long)init_frame + 64);
+
+	/*
+	 * Clear the frame and queue info first; the context id is assigned
+	 * afterwards so that the clearing cannot wipe it out again.
+	 */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	(void) memset(init_frame, 0, MRMFI_FRAME_SIZE);
+	(void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->reply_queue_entries, instance->max_fw_cmds + 1);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->producer_index_phys_addr_hi, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->producer_index_phys_addr_lo,
+	    instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address);
+
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->consumer_index_phys_addr_hi, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->consumer_index_phys_addr_lo,
+	    instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4);
+
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->reply_queue_start_phys_addr_hi, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &initq_info->reply_queue_start_phys_addr_lo,
+	    instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8);
+
+	ddi_put8(cmd->frame_dma_obj.acc_handle,
+	    &init_frame->cmd, MFI_CMD_OP_INIT);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status,
+	    MFI_CMD_STATUS_POLL_MODE);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &init_frame->queue_info_new_phys_addr_lo,
+	    cmd->frame_phys_addr + 64);
+	ddi_put32(cmd->frame_dma_obj.acc_handle,
+	    &init_frame->queue_info_new_phys_addr_hi, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len,
+	    sizeof (struct drsas_init_queue_info));
+	cmd->frame_count = 1;
+
+	/* issue the init frame in polled mode */
+	if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN, "failed to init firmware"));
+		goto fail_fw_init;
+	}
+
+	return_mfi_pkt(instance, cmd);
+	if (drsas_common_check(instance, cmd) != DDI_SUCCESS)
+		goto fail_fw_init;
+
+	/* gather misc FW related information */
+	if (!get_ctrl_info(instance, &ctrl_info)) {
+		instance->max_sectors_per_req = ctrl_info.max_request_size;
+		con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d",
+		    ctrl_info.product_name, ctrl_info.ld_present_count));
+	} else {
+		instance->max_sectors_per_req = instance->max_num_sge *
+		    PAGESIZE / 512;
+	}
+
+	if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS)
+		goto fail_fw_init;
+
+	return (DDI_SUCCESS);
+
+fail_fw_init:
+fail_alloc_fw_space:
+	free_space_for_mfi(instance);
+
+fail_ready_state:
+	ddi_regs_map_free(&instance->regmap_handle);
+	return (DDI_FAILURE);
+}
+
+/*
+ * mfi_state_transition_to_ready	: Move the FW to READY state
+ * @instance			: Adapter soft state
+ * Returns DDI_SUCCESS once READY is reached, ENODEV otherwise.
+ */
+static int
+mfi_state_transition_to_ready(struct drsas_instance *instance)
+{
+	int		i;
+	uint8_t		max_wait;
+	uint32_t	fw_ctrl;
+	uint32_t	fw_state;
+	uint32_t	cur_state;
+
+	fw_state =
+	    instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK;
+	con_log(CL_ANN1, (CE_NOTE,
+	    "mfi_state_transition_to_ready:FW state = 0x%x", fw_state));
+
+	while (fw_state != MFI_STATE_READY) {
+		con_log(CL_ANN, (CE_NOTE,
+		    "mfi_state_transition_to_ready:FW state%x", fw_state));
+
+		switch (fw_state) {
+		case MFI_STATE_FAULT:
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: FW in FAULT state!!"));
+
+			return (ENODEV);
+		case MFI_STATE_WAIT_HANDSHAKE:
+			/* acknowledge the handshake through the doorbell */
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: FW waiting for HANDSHAKE"));
+			/*
+			 * PCI_Hot Plug: MFI F/W requires
+			 * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG)
+			 * to be set
+			 */
+			/* both flags are written in a single doorbell write */
+			WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE |
+			    MFI_INIT_HOTPLUG, instance);
+
+			max_wait	= 2;
+			cur_state	= MFI_STATE_WAIT_HANDSHAKE;
+			break;
+		case MFI_STATE_BOOT_MESSAGE_PENDING:
+			/* only MFI_INIT_HOTPLUG is written for this state */
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: FW state boot message pending"));
+			/*
+			 * NOTE(review): unlike WAIT_HANDSHAKE above, only
+			 * MFI_INIT_HOTPLUG is written here, without
+			 * MFI_INIT_CLEAR_HANDSHAKE - confirm this is intended
+			 */
+			WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance);
+
+			max_wait	= 10;
+			cur_state	= MFI_STATE_BOOT_MESSAGE_PENDING;
+			break;
+		case MFI_STATE_OPERATIONAL:
+			/* bring it to READY state; wait up to 10 secs */
+			instance->func_ptr->disable_intr(instance);
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: FW in OPERATIONAL state"));
+			/*
+			 * PCI_Hot Plug: MFI F/W requires
+			 * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT)
+			 * to be set
+			 */
+			/* MFI_RESET_FLAGS is presumably that set - verify */
+			WR_IB_DOORBELL(MFI_RESET_FLAGS, instance);
+
+			max_wait	= 10;
+			cur_state	= MFI_STATE_OPERATIONAL;
+			break;
+		case MFI_STATE_UNDEFINED:
+			/* this state should not last for more than 2 seconds */
+			con_log(CL_ANN, (CE_NOTE, "FW state undefined"));
+
+			max_wait	= 2;
+			cur_state	= MFI_STATE_UNDEFINED;
+			break;
+		case MFI_STATE_BB_INIT:
+			max_wait	= 2;
+			cur_state	= MFI_STATE_BB_INIT;
+			break;
+		case MFI_STATE_FW_INIT:
+			max_wait	= 2;
+			cur_state	= MFI_STATE_FW_INIT;
+			break;
+		case MFI_STATE_DEVICE_SCAN:
+			max_wait	= 10;
+			cur_state	= MFI_STATE_DEVICE_SCAN;
+			break;
+		default:
+			con_log(CL_ANN, (CE_NOTE,
+			    "dr_sas: Unknown state 0x%x", fw_state));
+			return (ENODEV);
+		}
+
+		/* the cur_state should not last for more than max_wait secs */
+		for (i = 0; i < (max_wait * MILLISEC); i++) {
+			/* re-read the state; delay only while it is unchanged */
+			fw_state =
+			    instance->func_ptr->read_fw_status_reg(instance) &
+			    MFI_STATE_MASK;
+
+			if (fw_state == cur_state) {
+				delay(1 * drv_usectohz(MILLISEC));
+			} else {
+				break;
+			}
+		}
+
+		/* return error if fw_state hasn't changed after max_wait */
+		if (fw_state == cur_state) {
+			con_log(CL_ANN, (CE_NOTE,
+			    "FW state hasn't changed in %d secs", max_wait));
+			return (ENODEV);
+		}
+	};
+
+	fw_ctrl = RD_IB_DOORBELL(instance);
+
+	con_log(CL_ANN1, (CE_NOTE,
+	    "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl));
+
+	/*
+	 * Write 0xF to the doorbell register to do the following.
+	 * - Abort all outstanding commands (bit 0).
+	 * - Transition from OPERATIONAL to READY state (bit 1).
+	 * - Discard (possible) low MFA posted in 64-bit mode (bit-2).
+	 * - Set to release FW to continue running (i.e. BIOS handshake
+	 *   (bit 3).
+	 */
+	WR_IB_DOORBELL(0xF, instance);
+
+	if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+		return (ENODEV);
+	}
+	return (DDI_SUCCESS);
+}
+
+/*
+ * get_seq_num - fetch the firmware event log info into *eli
+ *
+ * Issues DR_DCMD_CTRL_EVENT_GET_INFO in sync mode through a temporary DMA
+ * buffer.  Returns DDI_SUCCESS on success, ENOMEM when no command packet
+ * is available and DDI_FAILURE for every other error.
+ */
+static int
+get_seq_num(struct drsas_instance *instance,
+    struct drsas_evt_log_info *eli)
+{
+	int	ret = DDI_SUCCESS;
+	dma_obj_t			dcmd_dma_obj;
+	struct drsas_cmd		*cmd;
+	struct drsas_dcmd_frame		*dcmd;
+
+	cmd = get_mfi_pkt(instance);
+	if (!cmd) {
+		cmn_err(CE_WARN, "dr_sas: failed to get a cmd");
+		return (ENOMEM);
+	}
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+
+	dcmd	= &cmd->frame->dcmd;
+
+	/* allocate the data transfer buffer */
+	dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info);
+	dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+	dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+	dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+	dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+	dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+	if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+	    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+		con_log(CL_ANN, (CE_WARN,
+		    "get_seq_num: could not allocate data transfer buffer."));
+		return_mfi_pkt(instance, cmd);
+		return (DDI_FAILURE);
+	}
+
+	(void) memset(dcmd_dma_obj.buffer, 0,
+	    sizeof (struct drsas_evt_log_info));
+	(void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+	    MFI_FRAME_DIR_READ);
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+	    sizeof (struct drsas_evt_log_info));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+	    DR_DCMD_CTRL_EVENT_GET_INFO);
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+	    sizeof (struct drsas_evt_log_info));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+	    dcmd_dma_obj.dma_cookie[0].dmac_address);
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		cmn_err(CE_WARN, "get_seq_num: "
+		    "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO");
+		ret = DDI_FAILURE;
+	} else {
+		/* copy the data back via the buffer's own access handle */
+		ddi_rep_get8(dcmd_dma_obj.acc_handle, (uint8_t *)eli,
+		    (uint8_t *)dcmd_dma_obj.buffer,
+		    sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR);
+	}
+
+	if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+		ret = DDI_FAILURE;
+
+	return_mfi_pkt(instance, cmd);
+	if (drsas_common_check(instance, cmd) != DDI_SUCCESS)
+		ret = DDI_FAILURE;
+	return (ret);
+}
+
+/*
+ * start_mfi_aen - arm asynchronous event notification
+ *
+ * Asks the FW for its newest event sequence number and registers an AEN
+ * for the one following it (DR_EVT_LOCALE_ALL, DR_EVT_CLASS_INFO).
+ * Returns 0 on success and -1 if either step fails.
+ */
+static int
+start_mfi_aen(struct drsas_instance *instance)
+{
+	union drsas_evt_class_locale	filter;
+	struct drsas_evt_log_info	log_info;
+
+	/* newest sequence number known to the FW */
+	(void) memset(&log_info, 0, sizeof (log_info));
+	if (get_seq_num(instance, &log_info) != 0) {
+		cmn_err(CE_WARN, "start_mfi_aen: failed to get seq num");
+		return (-1);
+	}
+
+	/* filter: DR_EVT_LOCALE_ALL at DR_EVT_CLASS_INFO */
+	filter.members.reserved = 0;
+	filter.members.locale = DR_EVT_LOCALE_ALL;
+	filter.members.class = DR_EVT_CLASS_INFO;
+
+	if (register_mfi_aen(instance, log_info.newest_seq_num + 1,
+	    filter.word) != 0) {
+		cmn_err(CE_WARN, "start_mfi_aen: aen registration failed");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * flush_cache - ask the FW to flush the controller and disk caches
+ *
+ * Uses the command stored at index max_fw_cmds of cmd_list (nothing is
+ * done when that slot is NULL) to issue DR_DCMD_CTRL_CACHE_FLUSH in polled
+ * mode.  A failure is only logged.
+ */
+static void
+flush_cache(struct drsas_instance *instance)
+{
+	uint32_t		slot = instance->max_fw_cmds;
+	struct drsas_cmd	*flush_cmd = instance->cmd_list[slot];
+	struct drsas_dcmd_frame	*frame;
+	ddi_acc_handle_t	acc;
+
+	if (flush_cmd == NULL)
+		return;
+
+	acc = flush_cmd->frame_dma_obj.acc_handle;
+	frame = &flush_cmd->frame->dcmd;
+	(void) memset(frame->mbox.b, 0, DCMD_MBOX_SZ);
+	/* no data transfer: zero SGEs, zero length, direction NONE */
+	ddi_put8(acc, &frame->cmd, MFI_CMD_OP_DCMD);
+	ddi_put8(acc, &frame->cmd_status, 0x0);
+	ddi_put8(acc, &frame->sge_count, 0);
+	ddi_put16(acc, &frame->flags, MFI_FRAME_DIR_NONE);
+	ddi_put16(acc, &frame->timeout, 0);
+	ddi_put32(acc, &frame->data_xfer_len, 0);
+	ddi_put32(acc, &frame->opcode, DR_DCMD_CTRL_CACHE_FLUSH);
+	ddi_put8(acc, &frame->mbox.b[0],
+	    DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE);
+
+	flush_cmd->frame_count = 1;
+	if (instance->func_ptr->issue_cmd_in_poll_mode(instance, flush_cmd)) {
+		con_log(CL_ANN1, (CE_WARN,
+	    "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH"));
+	}
+	con_log(CL_DLEVEL1, (CE_NOTE, "done"));
+}
+
+/*
+ * service_mfi_aen-	Completes an AEN command and re-issues it
+ * @instance:			Adapter soft state
+ * @cmd:			Command to be completed
+ * Handles CFG_CLEARED / LD_DELETED / LD_CREATED via drsas_service_evt().
+ */
+static void
+service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+	uint32_t	seq_num;
+	struct drsas_evt_detail *evt_detail =
+	    (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer;
+	int		rval = 0;
+	int		tgt = 0;
+	ddi_acc_handle_t		acc_handle;
+
+	acc_handle = cmd->frame_dma_obj.acc_handle;
+
+	cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status);
+
+	if (cmd->cmd_status == ENODATA) {
+		cmd->cmd_status = 0;
+	}
+
+	/*
+	 * log the MFI AEN event to the sysevent queue so that
+	 * applications get notified; a failure is only logged
+	 */
+	if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS",
+	    NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) {
+		int	instance_no = ddi_get_instance(instance->dip);
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas%d: Failed to log AEN event", instance_no));
+	}
+	/*
+	 * Check for any ld devices that have changed state, i.e. online
+	 * or offline.
+	 */
+	con_log(CL_ANN1, (CE_NOTE,
+	    "AEN: code = %x class = %x locale = %x args = %x",
+	    ddi_get32(acc_handle, &evt_detail->code),
+	    evt_detail->cl.members.class,
+	    ddi_get16(acc_handle, &evt_detail->cl.members.locale),
+	    ddi_get8(acc_handle, &evt_detail->arg_type)));
+
+	switch (ddi_get32(acc_handle, &evt_detail->code)) {
+	case DR_EVT_CFG_CLEARED: {
+		for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+			if (instance->dr_ld_list[tgt].dip != NULL) {
+				rval = drsas_service_evt(instance, tgt, 0,
+				    DRSAS_EVT_UNCONFIG_TGT, NULL);
+				con_log(CL_ANN1, (CE_WARN,
+				    "dr_sas: CFG CLEARED AEN rval = %d "
+				    "tgt id = %d", rval, tgt));
+			}
+		}
+		break;
+	}
+
+	case DR_EVT_LD_DELETED: {
+		rval = drsas_service_evt(instance,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+		    DRSAS_EVT_UNCONFIG_TGT, NULL);
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d "
+		    "tgt id = %d index = %d", rval,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+		    ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+		break;
+	} /* End of DR_EVT_LD_DELETED */
+
+	case DR_EVT_LD_CREATED: {
+		rval = drsas_service_evt(instance,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+		    DRSAS_EVT_CONFIG_TGT, NULL);
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d "
+		    "tgt id = %d index = %d", rval,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+		    ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+		break;
+	} /* End of DR_EVT_LD_CREATED */
+	} /* End of Main Switch */
+
+	/* re-register for the event following the one just handled */
+	seq_num = ddi_get32(acc_handle, &evt_detail->seq_num);
+	seq_num++;
+	(void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+	    sizeof (struct drsas_evt_detail));
+
+	ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0);
+	ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num);
+
+	instance->aen_seq_num = seq_num;
+
+	cmd->frame_count = 1;
+
+	/* Issue the aen registration frame */
+	instance->func_ptr->issue_cmd(cmd, instance);
+}
+
+/*
+ * complete_cmd_in_sync_mode -	Completes an internal command
+ * @instance:			Adapter soft state
+ * @cmd:			Command to be completed
+ *
+ * Records the completion status from the frame, clears the sync_cmd flag
+ * and wakes every thread blocked on int_cmd_cv (the waiter is presumably
+ * issue_cmd_in_sync_mode()).  A status of ENODATA is stored as 0.
+ */
+static void
+complete_cmd_in_sync_mode(struct drsas_instance *instance,
+    struct drsas_cmd *cmd)
+{
+	ddi_acc_handle_t	frame_acc = cmd->frame_dma_obj.acc_handle;
+
+	cmd->cmd_status = ddi_get8(frame_acc, &cmd->frame->io.cmd_status);
+	cmd->sync_cmd = DRSAS_FALSE;
+
+	/* an ENODATA status is normalised to 0 */
+	if (cmd->cmd_status == ENODATA)
+		cmd->cmd_status = 0;
+
+	cv_broadcast(&instance->int_cmd_cv);
+}
+
+/*
+ * drsas_softintr - The Software ISR
+ * @instance	: HBA soft state
+ * Processes the commands queued on completed_pool_list.  Returns
+ * DDI_INTR_CLAIMED, or DDI_INTR_UNCLAIMED if the list is empty or a frame
+ * DMA handle check fails (softint_running is cleared in that case too).
+ */
+static uint_t
+drsas_softintr(struct drsas_instance *instance)
+{
+	struct scsi_pkt		*pkt;
+	struct scsa_cmd		*acmd;
+	struct drsas_cmd	*cmd;
+	struct mlist_head	*pos, *next;
+	mlist_t			process_list;
+	struct drsas_header	*hdr;
+	struct scsi_arq_status	*arqstat;
+
+	con_log(CL_ANN1, (CE_CONT, "drsas_softintr called"));
+
+	ASSERT(instance);
+	mutex_enter(&instance->completed_pool_mtx);
+
+	if (mlist_empty(&instance->completed_pool_list)) {
+		mutex_exit(&instance->completed_pool_mtx);
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	instance->softint_running = 1;
+
+	INIT_LIST_HEAD(&process_list);
+	mlist_splice(&instance->completed_pool_list, &process_list);
+	INIT_LIST_HEAD(&instance->completed_pool_list);
+
+	mutex_exit(&instance->completed_pool_mtx);
+
+	/* perform all callbacks first, before releasing the SCBs */
+	mlist_for_each_safe(pos, next, &process_list) {
+		cmd = mlist_entry(pos, struct drsas_cmd, list);
+
+		/* synchronize the cmd frame for the CPU */
+		(void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle,
+		    0, 0, DDI_DMA_SYNC_FORCPU);
+
+		if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+		    DDI_SUCCESS) {
+			drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+			ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+			instance->softint_running = 0;
+			return (DDI_INTR_UNCLAIMED);
+		}
+
+		hdr = &cmd->frame->hdr;
+
+		/* remove the internal command from the process list */
+		mlist_del_init(&cmd->list);
+
+		switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) {
+		case MFI_CMD_OP_PD_SCSI:
+		case MFI_CMD_OP_LD_SCSI:
+		case MFI_CMD_OP_LD_READ:
+		case MFI_CMD_OP_LD_WRITE:
+			/*
+			 * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI
+			 * could have been issued either through an
+			 * IO path or an IOCTL path. If it was via IOCTL,
+			 * we will send it to internal completion.
+			 */
+			if (cmd->sync_cmd == DRSAS_TRUE) {
+				complete_cmd_in_sync_mode(instance, cmd);
+				break;
+			}
+
+			/* regular commands */
+			acmd =	cmd->cmd;
+			pkt =	CMD2PKT(acmd);
+
+			if (acmd->cmd_flags & CFLAG_DMAVALID) {
+				if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+					(void) ddi_dma_sync(acmd->cmd_dmahandle,
+					    acmd->cmd_dma_offset,
+					    acmd->cmd_dma_len,
+					    DDI_DMA_SYNC_FORCPU);
+				}
+			}
+
+			pkt->pkt_reason		= CMD_CMPLT;
+			pkt->pkt_statistics	= 0;
+			pkt->pkt_state = STATE_GOT_BUS
+			    | STATE_GOT_TARGET | STATE_SENT_CMD
+			    | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+			con_log(CL_ANN1, (CE_CONT,
+			    "CDB[0] = %x completed for %s: size %lx context %x",
+			    pkt->pkt_cdbp[0], ((acmd->islogical) ? "LD" : "PD"),
+			    acmd->cmd_dmacount, hdr->context));
+
+			if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) {
+				struct scsi_inquiry	*inq;
+
+				if (acmd->cmd_dmacount != 0) {
+					bp_mapin(acmd->cmd_buf);
+					inq = (struct scsi_inquiry *)
+					    acmd->cmd_buf->b_un.b_addr;
+
+					/* don't expose physical drives to OS */
+					if (acmd->islogical &&
+					    (hdr->cmd_status == MFI_STAT_OK)) {
+						display_scsi_inquiry(
+						    (caddr_t)inq);
+					} else if ((hdr->cmd_status ==
+					    MFI_STAT_OK) && inq->inq_dtype ==
+					    DTYPE_DIRECT) {
+						display_scsi_inquiry(
+						    (caddr_t)inq);
+
+						/* for physical disk */
+						hdr->cmd_status =
+						    MFI_STAT_DEVICE_NOT_FOUND;
+					}
+				}
+			}
+
+			switch (hdr->cmd_status) {
+			case MFI_STAT_OK:
+				pkt->pkt_scbp[0] = STATUS_GOOD;
+				break;
+			case MFI_STAT_LD_CC_IN_PROGRESS:
+			case MFI_STAT_LD_RECON_IN_PROGRESS:
+				pkt->pkt_scbp[0] = STATUS_GOOD;
+				break;
+			case MFI_STAT_LD_INIT_IN_PROGRESS:
+				con_log(CL_ANN,
+				    (CE_WARN, "Initialization in Progress"));
+				pkt->pkt_reason	= CMD_TRAN_ERR;
+
+				break;
+			case MFI_STAT_SCSI_DONE_WITH_ERROR:
+				con_log(CL_ANN1, (CE_CONT, "scsi_done error"));
+
+				pkt->pkt_reason	= CMD_CMPLT;
+				((struct scsi_status *)
+				    pkt->pkt_scbp)->sts_chk = 1;
+
+				if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) {
+
+					con_log(CL_ANN,
+					    (CE_WARN, "TEST_UNIT_READY fail"));
+
+				} else {
+					pkt->pkt_state |= STATE_ARQ_DONE;
+					arqstat = (void *)(pkt->pkt_scbp);
+					arqstat->sts_rqpkt_reason = CMD_CMPLT;
+					arqstat->sts_rqpkt_resid = 0;
+					arqstat->sts_rqpkt_state |=
+					    STATE_GOT_BUS | STATE_GOT_TARGET
+					    | STATE_SENT_CMD
+					    | STATE_XFERRED_DATA;
+					*(uint8_t *)&arqstat->sts_rqpkt_status =
+					    STATUS_GOOD;
+					ddi_rep_get8(
+					    cmd->frame_dma_obj.acc_handle,
+					    (uint8_t *)
+					    &(arqstat->sts_sensedata),
+					    cmd->sense,
+					    acmd->cmd_scblen -
+					    offsetof(struct scsi_arq_status,
+					    sts_sensedata), DDI_DEV_AUTOINCR);
+				}
+				break;
+			case MFI_STAT_LD_OFFLINE:
+			case MFI_STAT_DEVICE_NOT_FOUND:
+				con_log(CL_ANN1, (CE_CONT,
+				    "device not found error"));
+				pkt->pkt_reason	= CMD_DEV_GONE;
+				pkt->pkt_statistics  = STAT_DISCON;
+				break;
+			case MFI_STAT_LD_LBA_OUT_OF_RANGE:
+				pkt->pkt_state |= STATE_ARQ_DONE;
+				pkt->pkt_reason	= CMD_CMPLT;
+				((struct scsi_status *)
+				    pkt->pkt_scbp)->sts_chk = 1;
+
+				arqstat = (void *)(pkt->pkt_scbp);
+				arqstat->sts_rqpkt_reason = CMD_CMPLT;
+				arqstat->sts_rqpkt_resid = 0;
+				arqstat->sts_rqpkt_state |= STATE_GOT_BUS
+				    | STATE_GOT_TARGET | STATE_SENT_CMD
+				    | STATE_XFERRED_DATA;
+				*(uint8_t *)&arqstat->sts_rqpkt_status =
+				    STATUS_GOOD;
+
+				arqstat->sts_sensedata.es_valid = 1;
+				arqstat->sts_sensedata.es_key =
+				    KEY_ILLEGAL_REQUEST;
+				arqstat->sts_sensedata.es_class =
+				    CLASS_EXTENDED_SENSE;
+
+				/*
+				 * LOGICAL BLOCK ADDRESS OUT OF RANGE:
+				 * ASC: 0x21h; ASCQ: 0x00h;
+				 */
+				arqstat->sts_sensedata.es_add_code = 0x21;
+				arqstat->sts_sensedata.es_qual_code = 0x00;
+
+				break;
+
+			default:
+				con_log(CL_ANN, (CE_CONT, "Unknown status!"));
+				pkt->pkt_reason	= CMD_TRAN_ERR;
+
+				break;
+			}
+
+			atomic_add_16(&instance->fw_outstanding, (-1));
+
+			return_mfi_pkt(instance, cmd);
+
+			(void) drsas_common_check(instance, cmd);
+
+			if (acmd->cmd_dmahandle) {
+				if (drsas_check_dma_handle(
+				    acmd->cmd_dmahandle) != DDI_SUCCESS) {
+					ddi_fm_service_impact(instance->dip,
+					    DDI_SERVICE_UNAFFECTED);
+					pkt->pkt_reason = CMD_TRAN_ERR;
+					pkt->pkt_statistics = 0;
+				}
+			}
+
+			/* Call the callback routine */
+			if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+			    pkt->pkt_comp) {
+				(*pkt->pkt_comp)(pkt);
+			}
+
+			break;
+		case MFI_CMD_OP_SMP:
+		case MFI_CMD_OP_STP:
+			complete_cmd_in_sync_mode(instance, cmd);
+			break;
+		case MFI_CMD_OP_DCMD:
+			/* see if got an event notification */
+			if (ddi_get32(cmd->frame_dma_obj.acc_handle,
+			    &cmd->frame->dcmd.opcode) ==
+			    DR_DCMD_CTRL_EVENT_WAIT) {
+				if ((instance->aen_cmd == cmd) &&
+				    (instance->aen_cmd->abort_aen)) {
+					con_log(CL_ANN, (CE_WARN,
+					    "drsas_softintr: "
+					    "aborted_aen returned"));
+				} else {
+					atomic_add_16(&instance->fw_outstanding,
+					    (-1));
+					service_mfi_aen(instance, cmd);
+				}
+			} else {
+				complete_cmd_in_sync_mode(instance, cmd);
+			}
+
+			break;
+		case MFI_CMD_OP_ABORT:
+			con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete"));
+			/*
+			 * MFI_CMD_OP_ABORT successfully completed
+			 * in the synchronous mode
+			 */
+			complete_cmd_in_sync_mode(instance, cmd);
+			break;
+		default:
+			drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+			ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+			if (cmd->pkt != NULL) {
+				pkt = cmd->pkt;
+				if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+				    pkt->pkt_comp) {
+					(*pkt->pkt_comp)(pkt);
+				}
+			}
+			con_log(CL_ANN, (CE_WARN, "Cmd type unknown !"));
+			break;
+		}
+	}
+
+	instance->softint_running = 0;
+
+	return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * drsas_alloc_dma_obj
+ *
+ * Allocate the memory and other resources for a dma object: a DMA handle
+ * for obj->dma_attr, obj->size bytes of DMA memory accessed with
+ * endian_flags, and a binding of that memory to the handle.
+ *
+ * Returns the cookie count of the binding, or -1 on failure.
+ */
+static int
+drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj,
+    uchar_t endian_flags)
+{
+	int	i;
+	size_t	alen = 0;
+	uint_t	cookie_cnt;
+	struct ddi_device_acc_attr tmp_endian_attr;
+
+	tmp_endian_attr = endian_attr;
+	tmp_endian_attr.devacc_attr_endian_flags = endian_flags;
+
+	i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr,
+	    DDI_DMA_SLEEP, NULL, &obj->dma_handle);
+	if (i != DDI_SUCCESS) {
+		switch (i) {
+			case DDI_DMA_BADATTR :
+				con_log(CL_ANN, (CE_WARN,
+				"Failed ddi_dma_alloc_handle- Bad attribute"));
+				break;
+			case DDI_DMA_NORESOURCES :
+				con_log(CL_ANN, (CE_WARN,
+				"Failed ddi_dma_alloc_handle- No Resources"));
+				break;
+			default :
+				con_log(CL_ANN, (CE_WARN,
+				"Failed ddi_dma_alloc_handle: "
+				"unknown status %d", i));
+				break;
+		}
+
+		return (-1);
+	}
+
+	if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr,
+	    DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
+	    &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) ||
+	    alen < obj->size) {
+		ddi_dma_free_handle(&obj->dma_handle);
+		con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc"));
+		return (-1);
+	}
+
+	if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer,
+	    obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP,
+	    NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) {
+		ddi_dma_mem_free(&obj->acc_handle);
+		ddi_dma_free_handle(&obj->dma_handle);
+		con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle"));
+		return (-1);
+	}
+
+	if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS ||
+	    drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) {
+		/*
+		 * Undo the binding and allocations made above so that a
+		 * failed FM check does not leak them.
+		 */
+		(void) ddi_dma_unbind_handle(obj->dma_handle);
+		ddi_dma_mem_free(&obj->acc_handle);
+		ddi_dma_free_handle(&obj->dma_handle);
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+		return (-1);
+	}
+
+	return (cookie_cnt);
+}
+
+/*
+ * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t)
+ *
+ * Release a dma object previously set up by drsas_alloc_dma_obj(): the
+ * binding is removed, then the memory and the DMA handle are freed.
+ *
+ * If either handle check does not return DDI_SUCCESS, the service impact is
+ * reported as DDI_SERVICE_UNAFFECTED, nothing is released and DDI_FAILURE is
+ * returned; otherwise the result is DDI_SUCCESS.
+ */
+static int
+drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj)
+{
+	if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS ||
+	    drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) {
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+		return (DDI_FAILURE);
+	}
+
+	/* tear down in the reverse order of drsas_alloc_dma_obj() */
+	(void) ddi_dma_unbind_handle(obj.dma_handle);
+	ddi_dma_mem_free(&obj.acc_handle);
+	ddi_dma_free_handle(&obj.dma_handle);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_dma_alloc - allocate and bind dma resources for a new scsi command
+ *
+ * Collects up to max_num_sge cookies of bp into the scsa_cmd of pkt and sets
+ * pkt_resid.  Returns DDI_SUCCESS, or DDI_FAILURE with the handle released.
+ */
+static int
+drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt,
+    struct buf *bp, int flags, int (*callback)())
+{
+	int	dma_flags;
+	int	(*cb)(caddr_t);
+	int	i;
+
+	ddi_dma_attr_t	tmp_dma_attr = drsas_generic_dma_attr;
+	struct scsa_cmd	*acmd = PKT2CMD(pkt);
+
+	acmd->cmd_buf = bp;
+
+	if (bp->b_flags & B_READ) {
+		acmd->cmd_flags &= ~CFLAG_DMASEND;
+		dma_flags = DDI_DMA_READ;
+	} else {
+		acmd->cmd_flags |= CFLAG_DMASEND;
+		dma_flags = DDI_DMA_WRITE;
+	}
+
+	if (flags & PKT_CONSISTENT) {
+		acmd->cmd_flags |= CFLAG_CONSISTENT;
+		dma_flags |= DDI_DMA_CONSISTENT;
+	}
+
+	if (flags & PKT_DMA_PARTIAL) {
+		dma_flags |= DDI_DMA_PARTIAL;
+	}
+
+	dma_flags |= DDI_DMA_REDZONE;
+
+	cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
+
+	tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+	tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull;
+
+	if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr,
+	    cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) {
+		switch (i) {
+		case DDI_DMA_BADATTR:
+			bioerror(bp, EFAULT);
+			return (DDI_FAILURE);
+		case DDI_DMA_NORESOURCES:
+			bioerror(bp, 0);
+			return (DDI_FAILURE);
+		default:
+			con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: "
+			    "impossible result (0x%x)", i));
+			bioerror(bp, EFAULT);
+			return (DDI_FAILURE);
+		}
+	}
+
+	i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags,
+	    cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies);
+
+	switch (i) {
+	case DDI_DMA_PARTIAL_MAP:
+		if ((dma_flags & DDI_DMA_PARTIAL) == 0) {
+			con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+			    "DDI_DMA_PARTIAL_MAP impossible"));
+			goto unbind_dma;
+		}
+
+		if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) ==
+		    DDI_FAILURE) {
+			con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed"));
+			goto unbind_dma;
+		}
+
+		if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+		    &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+		    &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+		    DDI_FAILURE) {
+			con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed"));
+			goto unbind_dma;
+		}
+
+		goto get_dma_cookies;
+	case DDI_DMA_MAPPED:
+		acmd->cmd_nwin = 1;
+		acmd->cmd_dma_len = 0;
+		acmd->cmd_dma_offset = 0;
+
+get_dma_cookies:
+		i = 0;
+		acmd->cmd_dmacount = 0;
+		for (;;) {
+			acmd->cmd_dmacount +=
+			    acmd->cmd_dmacookies[i++].dmac_size;
+
+			if (i == instance->max_num_sge ||
+			    i == acmd->cmd_ncookies)
+				break;
+
+			ddi_dma_nextcookie(acmd->cmd_dmahandle,
+			    &acmd->cmd_dmacookies[i]);
+		}
+
+		acmd->cmd_cookie = i;
+		acmd->cmd_cookiecnt = i;
+		acmd->cmd_flags |= CFLAG_DMAVALID;
+
+		if (bp->b_bcount >= acmd->cmd_dmacount) {
+			pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+		} else {
+			pkt->pkt_resid = 0;
+		}
+
+		return (DDI_SUCCESS);
+	case DDI_DMA_NORESOURCES:
+		bioerror(bp, 0);
+		break;
+	case DDI_DMA_NOMAPPING:
+		bioerror(bp, EFAULT);
+		break;
+	case DDI_DMA_TOOBIG:
+		bioerror(bp, EINVAL);
+		break;
+	case DDI_DMA_INUSE:
+		con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:"
+		    " DDI_DMA_INUSE impossible"));
+		break;
+	default:
+		con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+		    "impossible result (0x%x)", i));
+		break;
+	}
+	goto no_dma_cookies;
+
+unbind_dma:
+	/* the handle is bound on this path; unbind before freeing it */
+	(void) ddi_dma_unbind_handle(acmd->cmd_dmahandle);
+no_dma_cookies:
+	ddi_dma_free_handle(&acmd->cmd_dmahandle);
+	acmd->cmd_dmahandle = NULL;
+	acmd->cmd_flags &= ~CFLAG_DMAVALID;
+	return (DDI_FAILURE);
+}
+
+/*
+ * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *)
+ *
+ * Advance the command's DMA resources: load the next batch of cookies
+ * (at most instance->max_num_sge) into acmd->cmd_dmacookies[], moving on
+ * to the next DMA window first when the current one has been consumed.
+ * cmd_dmacount, cmd_cookie, cmd_cookiecnt and pkt->pkt_resid are updated.
+ *
+ * Returns DDI_FAILURE when there is no further window or ddi_dma_getwin()
+ * fails, DDI_SUCCESS otherwise.
+ */
+static int
+drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt,
+    struct buf *bp)
+{
+	struct scsa_cmd	*acmd = PKT2CMD(pkt);
+	int		n = 0;
+
+	if (acmd->cmd_cookie != acmd->cmd_ncookies) {
+		/* the current window still has cookies: fetch the next */
+		ddi_dma_nextcookie(acmd->cmd_dmahandle,
+		    &acmd->cmd_dmacookies[0]);
+	} else {
+		/* this window is used up; a new window is required */
+		if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) {
+			return (DDI_SUCCESS);
+		}
+
+		/* already at the last window: nowhere left to move */
+		acmd->cmd_curwin++;
+		if (acmd->cmd_curwin >= acmd->cmd_nwin) {
+			return (DDI_FAILURE);
+		}
+
+		if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+		    &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+		    &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+		    DDI_FAILURE) {
+			return (DDI_FAILURE);
+		}
+
+		acmd->cmd_cookie = 0;
+	}
+
+	/*
+	 * Slot 0 is already populated above.  Account for it, then keep
+	 * pulling cookies until the SGL limit is hit or the window is empty.
+	 */
+	do {
+		if (n != 0) {
+			ddi_dma_nextcookie(acmd->cmd_dmahandle,
+			    &acmd->cmd_dmacookies[n]);
+		}
+
+		acmd->cmd_dmacount += acmd->cmd_dmacookies[n].dmac_size;
+		n++;
+		acmd->cmd_cookie++;
+	} while (n != instance->max_num_sge &&
+	    acmd->cmd_cookie != acmd->cmd_ncookies);
+
+	acmd->cmd_cookiecnt = n;
+
+	/* bytes of the buf not yet covered by any cookie handed out */
+	if (bp->b_bcount < acmd->cmd_dmacount) {
+		pkt->pkt_resid = 0;
+	} else {
+		pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * build_cmd
+ *
+ * Obtain an MFI command from get_mfi_pkt() and fill in its frame for the
+ * SCSI packet pkt addressed to ap.
+ *
+ * READ/WRITE (6 and 10 byte opcodes) to a logical drive are built as an
+ * LD I/O frame; everything else is built as a pass-through (DCDB) frame.
+ * MODE SENSE for page codes 0x3 and 0x4 is answered locally through
+ * drsas_mode_sense_build(); in that case the command is returned to the
+ * pool, *cmd_done is set to 1 and NULL is returned.
+ *
+ * Returns the prepared command, or NULL when no command was available
+ * (*cmd_done == 0) or the request was completed locally (*cmd_done == 1).
+ */
+static struct drsas_cmd *
+build_cmd(struct drsas_instance *instance, struct scsi_address *ap,
+    struct scsi_pkt *pkt, uchar_t *cmd_done)
+{
+	uint16_t	flags = 0;
+	uint32_t	i;
+	uint32_t 	context;
+	uint32_t	sge_bytes;
+	ddi_acc_handle_t acc_handle;
+	struct drsas_cmd		*cmd;
+	struct drsas_sge64		*mfi_sgl;
+	struct scsa_cmd			*acmd = PKT2CMD(pkt);
+	struct drsas_pthru_frame 	*pthru;
+	struct drsas_io_frame		*ldio;
+
+	/* find out if this is logical or physical drive command.  */
+	acmd->islogical = MRDRV_IS_LOGICAL(ap);
+	acmd->device_id = MAP_DEVICE_ID(instance, ap);
+	*cmd_done = 0;
+
+	/* get the command packet */
+	if (!(cmd = get_mfi_pkt(instance))) {
+		return (NULL);
+	}
+
+	acc_handle = cmd->frame_dma_obj.acc_handle;
+
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index);
+
+	cmd->pkt = pkt;
+	cmd->cmd = acmd;
+
+	/*
+	 * lets get the command directions
+	 *
+	 * NOTE(review): the second test is true whenever any flag other
+	 * than CFLAG_DMASEND is set, so MFI_FRAME_DIR_NONE is only chosen
+	 * when cmd_flags is otherwise empty.  Left unchanged; confirm this
+	 * is the intended way to detect a read.
+	 */
+	if (acmd->cmd_flags & CFLAG_DMASEND) {
+		flags = MFI_FRAME_DIR_WRITE;
+
+		if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+			(void) ddi_dma_sync(acmd->cmd_dmahandle,
+			    acmd->cmd_dma_offset, acmd->cmd_dma_len,
+			    DDI_DMA_SYNC_FORDEV);
+		}
+	} else if (acmd->cmd_flags & ~CFLAG_DMASEND) {
+		flags = MFI_FRAME_DIR_READ;
+
+		if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+			(void) ddi_dma_sync(acmd->cmd_dmahandle,
+			    acmd->cmd_dma_offset, acmd->cmd_dma_len,
+			    DDI_DMA_SYNC_FORCPU);
+		}
+	} else {
+		flags = MFI_FRAME_DIR_NONE;
+	}
+
+	/* the scatter-gather list below is always built from 64-bit SGEs */
+	flags |= MFI_FRAME_SGL64;
+
+	switch (pkt->pkt_cdbp[0]) {
+
+	/*
+	 * case SCMD_SYNCHRONIZE_CACHE:
+	 * 	flush_cache(instance);
+	 *	return_mfi_pkt(instance, cmd);
+	 *	*cmd_done = 1;
+	 *
+	 *	return (NULL);
+	 */
+
+	case SCMD_READ:
+	case SCMD_WRITE:
+	case SCMD_READ_G1:
+	case SCMD_WRITE_G1:
+		if (acmd->islogical) {
+			ldio = (struct drsas_io_frame *)cmd->frame;
+
+			/*
+			 * prepare the Logical IO frame:
+			 * 2nd bit is zero for all read cmds
+			 */
+			ddi_put8(acc_handle, &ldio->cmd,
+			    (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE
+			    : MFI_CMD_OP_LD_READ);
+			ddi_put8(acc_handle, &ldio->cmd_status, 0x0);
+			ddi_put8(acc_handle, &ldio->scsi_status, 0x0);
+			ddi_put8(acc_handle, &ldio->target_id, acmd->device_id);
+			ddi_put16(acc_handle, &ldio->timeout, 0);
+			ddi_put8(acc_handle, &ldio->reserved_0, 0);
+			ddi_put16(acc_handle, &ldio->pad_0, 0);
+			ddi_put16(acc_handle, &ldio->flags, flags);
+
+			/* Initialize sense Information */
+			bzero(cmd->sense, SENSE_LENGTH);
+			ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH);
+			ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0);
+			ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo,
+			    cmd->sense_phys_addr);
+			ddi_put32(acc_handle, &ldio->start_lba_hi, 0);
+			ddi_put8(acc_handle, &ldio->access_byte,
+			    (acmd->cmd_cdblen != 6) ? pkt->pkt_cdbp[1] : 0);
+			ddi_put8(acc_handle, &ldio->sge_count,
+			    acmd->cmd_cookiecnt);
+			mfi_sgl = (struct drsas_sge64	*)&ldio->sgl;
+
+			context = ddi_get32(acc_handle, &ldio->context);
+
+			/*
+			 * Decode the big-endian transfer length and LBA
+			 * from the CDB; the byte positions depend on the
+			 * CDB length.
+			 */
+			if (acmd->cmd_cdblen == CDB_GROUP0) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    (uint16_t)(pkt->pkt_cdbp[4])));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[3])) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 8) |
+				    ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F)
+				    << 16)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP1) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint16_t)(pkt->pkt_cdbp[8])) |
+				    ((uint16_t)(pkt->pkt_cdbp[7]) << 8)));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP2) {
+				/*
+				 * 32-bit count: widen to uint32_t before
+				 * shifting so that << 24 cannot overflow a
+				 * promoted signed int.
+				 */
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint32_t)(pkt->pkt_cdbp[9])) |
+				    ((uint32_t)(pkt->pkt_cdbp[8]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[7]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[6]) << 24)));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP3) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint32_t)(pkt->pkt_cdbp[13])) |
+				    ((uint32_t)(pkt->pkt_cdbp[12]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[11]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[10]) << 24)));
+
+				/* CDB bytes 6..9: low 32 bits of the LBA */
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[9])) |
+				    ((uint32_t)(pkt->pkt_cdbp[8]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[7]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[6]) << 24)));
+
+				/*
+				 * CDB bytes 2..5: high 32 bits of the LBA.
+				 * These belong in start_lba_hi; storing them
+				 * in start_lba_lo would overwrite the low
+				 * word written just above.
+				 */
+				ddi_put32(acc_handle, &ldio->start_lba_hi, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			}
+
+			break;
+		}
+		/* fall through For all non-rd/wr cmds */
+	default:
+
+		switch (pkt->pkt_cdbp[0]) {
+		case SCMD_MODE_SENSE:
+		case SCMD_MODE_SENSE_G1: {
+			union scsi_cdb	*cdbp;
+			uint16_t	page_code;
+
+			cdbp = (void *)pkt->pkt_cdbp;
+			page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0];
+			switch (page_code) {
+			case 0x3:
+			case 0x4:
+				/* answered by the driver, not the firmware */
+				(void) drsas_mode_sense_build(pkt);
+				return_mfi_pkt(instance, cmd);
+				*cmd_done = 1;
+				return (NULL);
+			}
+			break;
+		}
+		default:
+			break;
+		}
+
+		pthru	= (struct drsas_pthru_frame *)cmd->frame;
+
+		/* prepare the DCDB frame */
+		ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ?
+		    MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI);
+		ddi_put8(acc_handle, &pthru->cmd_status, 0x0);
+		ddi_put8(acc_handle, &pthru->scsi_status, 0x0);
+		ddi_put8(acc_handle, &pthru->target_id, acmd->device_id);
+		ddi_put8(acc_handle, &pthru->lun, 0);
+		ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen);
+		ddi_put16(acc_handle, &pthru->timeout, 0);
+		ddi_put16(acc_handle, &pthru->flags, flags);
+		ddi_put32(acc_handle, &pthru->data_xfer_len,
+		    acmd->cmd_dmacount);
+		ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt);
+		mfi_sgl			= (struct drsas_sge64 *)&pthru->sgl;
+
+		bzero(cmd->sense, SENSE_LENGTH);
+		ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH);
+		ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+		ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo,
+		    cmd->sense_phys_addr);
+
+		context = ddi_get32(acc_handle, &pthru->context);
+		ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp,
+		    (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR);
+
+		break;
+	}
+#ifdef lint
+	context = context;
+#endif
+	/* prepare the scatter-gather list for the firmware */
+	for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) {
+		ddi_put64(acc_handle, &mfi_sgl->phys_addr,
+		    acmd->cmd_dmacookies[i].dmac_laddress);
+		ddi_put32(acc_handle, &mfi_sgl->length,
+		    acmd->cmd_dmacookies[i].dmac_size);
+	}
+
+	sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt;
+
+	/* frames needed for the SGL (rounded up) plus one; capped at 8 */
+	cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) +
+	    ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1;
+
+	if (cmd->frame_count >= 8) {
+		cmd->frame_count = 8;
+	}
+
+	return (cmd);
+}
+
+/*
+ * issue_mfi_pthru
+ *
+ * Issue a pass-through frame supplied through an ioctl.  The frame copy
+ * in ioctl->frame carries the user buffer address and length in its first
+ * SGE (32- or 64-bit form, chosen from the caller's data model).  When a
+ * length is given, a kernel DMA buffer is allocated, filled from the user
+ * buffer for MFI_FRAME_DIR_WRITE, and copied back to the user buffer for
+ * MFI_FRAME_DIR_READ after the command has been issued synchronously.
+ *
+ * cmd_status and scsi_status are copied back into the ioctl frame.
+ *
+ * Returns DDI_FAILURE if the buffer cannot be allocated, a user copy
+ * fails, or the buffer cannot be freed; DDI_SUCCESS otherwise (including
+ * when the firmware call itself reports failure, which is only logged).
+ * The DMA buffer is released on every path once it has been allocated.
+ */
+static int
+issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    struct drsas_cmd *cmd, int mode)
+{
+	void		*ubuf;
+	uint32_t	kphys_addr = 0;
+	uint32_t	xferlen = 0;
+	uint_t		model;
+	ddi_acc_handle_t	acc_handle = cmd->frame_dma_obj.acc_handle;
+	dma_obj_t			pthru_dma_obj;
+	struct drsas_pthru_frame	*kpthru;
+	struct drsas_pthru_frame	*pthru;
+
+	pthru = &cmd->frame->pthru;
+	kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0];
+
+	/* pick the SGE layout that matches the caller's data model */
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32"));
+
+		xferlen	= kpthru->sgl.sge32[0].length;
+
+		ubuf	= (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32"));
+		xferlen	= kpthru->sgl.sge32[0].length;
+		ubuf	= (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+#else
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64"));
+		xferlen	= kpthru->sgl.sge64[0].length;
+		ubuf	= (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr;
+#endif
+	}
+
+	if (xferlen) {
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		pthru_dma_obj.size = xferlen;
+		pthru_dma_obj.dma_attr = drsas_generic_dma_attr;
+		pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		pthru_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		pthru_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &pthru_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: "
+			    "could not allocate data transfer buffer."));
+			return (DDI_FAILURE);
+		}
+
+		/* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+		if ((kpthru->flags & MFI_FRAME_DIR_WRITE) &&
+		    ddi_copyin(ubuf, pthru_dma_obj.buffer, xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_pthru : "
+			    "copy from user space failed"));
+			/* do not leak the DMA buffer on the error path */
+			(void) drsas_free_dma_obj(instance, pthru_dma_obj);
+			return (DDI_FAILURE);
+		}
+
+		kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd);
+	ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len);
+	ddi_put8(acc_handle, &pthru->cmd_status, 0);
+	ddi_put8(acc_handle, &pthru->scsi_status, 0);
+	ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id);
+	ddi_put8(acc_handle, &pthru->lun, kpthru->lun);
+	ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len);
+	ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count);
+	ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout);
+	ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len);
+
+	ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+	/* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */
+	ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0);
+
+	/*
+	 * NOTE(review): cdb_len comes straight from the ioctl frame and is
+	 * not checked against the size of pthru->cdb here - confirm the
+	 * caller validates it.
+	 */
+	ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb,
+	    pthru->cdb_len, DDI_DEV_AUTOINCR);
+
+	/* the firmware is always handed a single 32-bit SGE */
+	ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64);
+	ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen);
+	ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr);
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN,
+		    "issue_mfi_pthru: fw_ioctl failed"));
+	} else if (xferlen && (kpthru->flags & MFI_FRAME_DIR_READ)) {
+		if (ddi_copyout(pthru_dma_obj.buffer, ubuf, xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_pthru : "
+			    "copy to user space failed"));
+			/* do not leak the DMA buffer on the error path */
+			(void) drsas_free_dma_obj(instance, pthru_dma_obj);
+			return (DDI_FAILURE);
+		}
+	}
+
+	kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status);
+	kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status);
+
+	con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, "
+	    "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status));
+
+	if (xferlen) {
+		/* free kernel buffer */
+		if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS)
+			return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_dcmd
+ *
+ * Issue a DCMD frame supplied through an ioctl.  The frame copy in
+ * ioctl->frame carries the user buffer address and length in its first
+ * SGE (32- or 64-bit form, chosen from the caller's data model).  When a
+ * length is given, a kernel DMA buffer is allocated, filled from the user
+ * buffer for MFI_FRAME_DIR_WRITE, and copied back to the user buffer for
+ * MFI_FRAME_DIR_READ after the command has been issued synchronously.
+ *
+ * cmd_status is copied back into the ioctl frame.
+ *
+ * Returns DDI_FAILURE if the buffer cannot be allocated, a user copy
+ * fails, or the buffer cannot be freed; DDI_SUCCESS otherwise (including
+ * when the firmware call itself reports failure, which is only logged).
+ * The DMA buffer is released on every path once it has been allocated.
+ */
+static int
+issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    struct drsas_cmd *cmd, int mode)
+{
+	void		*ubuf;
+	uint32_t	kphys_addr = 0;
+	uint32_t	xferlen = 0;
+	uint32_t	model;
+	dma_obj_t	dcmd_dma_obj;
+	struct drsas_dcmd_frame	*kdcmd;
+	struct drsas_dcmd_frame	*dcmd;
+	ddi_acc_handle_t	acc_handle = cmd->frame_dma_obj.acc_handle;
+
+	dcmd = &cmd->frame->dcmd;
+	kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+	/* pick the SGE layout that matches the caller's data model */
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+
+		xferlen	= kdcmd->sgl.sge32[0].length;
+
+		ubuf	= (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+		xferlen	= kdcmd->sgl.sge32[0].length;
+		ubuf	= (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64"));
+		xferlen	= kdcmd->sgl.sge64[0].length;
+		ubuf	= (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+	}
+	if (xferlen) {
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		dcmd_dma_obj.size = xferlen;
+		dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+		dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: "
+			    "could not allocate data transfer buffer."));
+			return (DDI_FAILURE);
+		}
+
+		/* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+		if ((kdcmd->flags & MFI_FRAME_DIR_WRITE) &&
+		    ddi_copyin(ubuf, dcmd_dma_obj.buffer, xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_dcmd : "
+			    "copy from user space failed"));
+			/* do not leak the DMA buffer on the error path */
+			(void) drsas_free_dma_obj(instance, dcmd_dma_obj);
+			return (DDI_FAILURE);
+		}
+
+		kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd);
+	ddi_put8(acc_handle, &dcmd->cmd_status, 0);
+	ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count);
+	ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout);
+	ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len);
+	ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode);
+
+	ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b,
+	    (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR);
+
+	/* the firmware is always handed a single 32-bit SGE */
+	ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64);
+	ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen);
+	ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr);
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed"));
+	} else if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) {
+		if (ddi_copyout(dcmd_dma_obj.buffer, ubuf, xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_dcmd : "
+			    "copy to user space failed"));
+			/* do not leak the DMA buffer on the error path */
+			(void) drsas_free_dma_obj(instance, dcmd_dma_obj);
+			return (DDI_FAILURE);
+		}
+	}
+
+	kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status);
+
+	if (xferlen) {
+		/* free kernel buffer */
+		if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+			return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_smp
+ *
+ * Issue an SMP frame supplied through an ioctl.  SGE 0 of the ioctl frame
+ * describes the user response buffer and SGE 1 the user request buffer
+ * (32- or 64-bit form, chosen from the caller's data model).  For each
+ * non-zero length a kernel DMA buffer is allocated and pre-loaded from the
+ * user buffer; after the command has been issued synchronously and
+ * succeeded, both buffers are copied back out to user space.
+ *
+ * cmd_status is copied back into the ioctl frame.
+ *
+ * Returns DDI_FAILURE if a buffer cannot be allocated, a user copy fails,
+ * or a buffer cannot be freed; DDI_SUCCESS otherwise (including when the
+ * firmware call itself reports failure, which is only logged).  Every DMA
+ * buffer that was allocated is released on all paths.
+ */
+static int
+issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    struct drsas_cmd *cmd, int mode)
+{
+	void		*request_ubuf;
+	void		*response_ubuf;
+	uint32_t	request_xferlen = 0;
+	uint32_t	response_xferlen = 0;
+	/* device addresses stay 0 for a buffer that is not allocated */
+	uint32_t	request_phys = 0;
+	uint32_t	response_phys = 0;
+	uint_t		model;
+	int		request_allocated = 0;
+	int		response_allocated = 0;
+	int		rval = DDI_FAILURE;
+	dma_obj_t			request_dma_obj;
+	dma_obj_t			response_dma_obj;
+	ddi_acc_handle_t	acc_handle = cmd->frame_dma_obj.acc_handle;
+	struct drsas_smp_frame		*ksmp;
+	struct drsas_smp_frame		*smp;
+	struct drsas_sge32		*sge32;
+#ifndef _ILP32
+	struct drsas_sge64		*sge64;
+#endif
+	uint64_t			tmp_sas_addr;
+
+	smp = &cmd->frame->smp;
+	ksmp = (struct drsas_smp_frame *)&ioctl->frame[0];
+
+	/* pick the SGE layout that matches the caller's data model */
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+		sge32			= &ksmp->sgl[0].sge32[0];
+		response_xferlen	= sge32[0].length;
+		request_xferlen		= sge32[1].length;
+		con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+		    "response_xferlen = %x, request_xferlen = %x",
+		    response_xferlen, request_xferlen));
+
+		response_ubuf	= (void *)(ulong_t)sge32[0].phys_addr;
+		request_ubuf	= (void *)(ulong_t)sge32[1].phys_addr;
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+		    "response_ubuf = %p, request_ubuf = %p",
+		    response_ubuf, request_ubuf));
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+		sge32			= &ksmp->sgl[0].sge32[0];
+		response_xferlen	= sge32[0].length;
+		request_xferlen		= sge32[1].length;
+		con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+		    "response_xferlen = %x, request_xferlen = %x",
+		    response_xferlen, request_xferlen));
+
+		response_ubuf	= (void *)(ulong_t)sge32[0].phys_addr;
+		request_ubuf	= (void *)(ulong_t)sge32[1].phys_addr;
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+		    "response_ubuf = %p, request_ubuf = %p",
+		    response_ubuf, request_ubuf));
+#else
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64"));
+
+		sge64			= &ksmp->sgl[0].sge64[0];
+		response_xferlen	= sge64[0].length;
+		request_xferlen		= sge64[1].length;
+
+		response_ubuf	= (void *)(ulong_t)sge64[0].phys_addr;
+		request_ubuf	= (void *)(ulong_t)sge64[1].phys_addr;
+#endif
+	}
+	if (request_xferlen) {
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		request_dma_obj.size = request_xferlen;
+		request_dma_obj.dma_attr = drsas_generic_dma_attr;
+		request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		request_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		request_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &request_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+			    "could not allocate data transfer buffer."));
+			goto out;
+		}
+		request_allocated = 1;
+
+		/* pre-load the kernel buffer from the user request buffer */
+		if (ddi_copyin(request_ubuf, request_dma_obj.buffer,
+		    request_xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+			    "copy from user space failed"));
+			goto out;
+		}
+
+		request_phys = request_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	if (response_xferlen) {
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		response_dma_obj.size = response_xferlen;
+		response_dma_obj.dma_attr = drsas_generic_dma_attr;
+		response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		response_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		response_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &response_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+			    "could not allocate data transfer buffer."));
+			goto out;
+		}
+		response_allocated = 1;
+
+		/* pre-load the kernel buffer from the user response buffer */
+		if (ddi_copyin(response_ubuf, response_dma_obj.buffer,
+		    response_xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+			    "copy from user space failed"));
+			goto out;
+		}
+
+		response_phys = response_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	ddi_put8(acc_handle, &smp->cmd, ksmp->cmd);
+	ddi_put8(acc_handle, &smp->cmd_status, 0);
+	ddi_put8(acc_handle, &smp->connection_status, 0);
+	ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count);
+	/* smp->context		= ksmp->context; */
+	ddi_put16(acc_handle, &smp->timeout, ksmp->timeout);
+	ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len);
+
+	/* go through a local so the 64-bit put gets an aligned value */
+	bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr,
+	    sizeof (uint64_t));
+	ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr);
+
+	/*
+	 * NOTE(review): MFI_FRAME_SGL64 is cleared here, yet the LP64
+	 * branch below fills in 64-bit SGEs - confirm which layout the
+	 * firmware expects for this frame.
+	 */
+	ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64);
+
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "handle_drv_ioctl: DDI_MODEL_ILP32"));
+
+		sge32 = &smp->sgl[0].sge32[0];
+		ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+		ddi_put32(acc_handle, &sge32[0].phys_addr, response_phys);
+		ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+		ddi_put32(acc_handle, &sge32[1].phys_addr, request_phys);
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE,
+		    "handle_drv_ioctl: DDI_MODEL_ILP32"));
+		sge32 = &smp->sgl[0].sge32[0];
+		ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+		ddi_put32(acc_handle, &sge32[0].phys_addr, response_phys);
+		ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+		ddi_put32(acc_handle, &sge32[1].phys_addr, request_phys);
+#else
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_LP64"));
+		sge64 = &smp->sgl[0].sge64[0];
+		ddi_put32(acc_handle, &sge64[0].length, response_xferlen);
+		ddi_put64(acc_handle, &sge64[0].phys_addr, response_phys);
+		ddi_put32(acc_handle, &sge64[1].length, request_xferlen);
+		ddi_put64(acc_handle, &sge64[1].phys_addr, request_phys);
+#endif
+	}
+	/*
+	 * Log the lengths just written to the frame.  The local values are
+	 * used because sge32 is not assigned on the LP64 path.
+	 */
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : "
+	    "smp->response_xferlen = %d, smp->request_xferlen = %d "
+	    "smp->data_xfer_len = %d", response_xferlen,
+	    request_xferlen,
+	    ddi_get32(acc_handle, &smp->data_xfer_len)));
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN,
+		    "issue_mfi_smp: fw_ioctl failed"));
+	} else {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: copy to user space"));
+
+		if (request_xferlen &&
+		    ddi_copyout(request_dma_obj.buffer, request_ubuf,
+		    request_xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_smp : copy to user space"
+			    " failed"));
+			goto out;
+		}
+
+		if (response_xferlen &&
+		    ddi_copyout(response_dma_obj.buffer, response_ubuf,
+		    response_xferlen, mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_smp : copy to "
+			    "user space failed"));
+			goto out;
+		}
+	}
+
+	ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status);
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d",
+	    ddi_get8(acc_handle, &smp->cmd_status)));
+
+	rval = DDI_SUCCESS;
+
+out:
+	/* free whichever kernel buffers were allocated */
+	if (request_allocated &&
+	    drsas_free_dma_obj(instance, request_dma_obj) != DDI_SUCCESS) {
+		rval = DDI_FAILURE;
+	}
+
+	if (response_allocated &&
+	    drsas_free_dma_obj(instance, response_dma_obj) != DDI_SUCCESS) {
+		rval = DDI_FAILURE;
+	}
+
+	return (rval);
+}
+
+/*
+ * issue_mfi_stp
+ *
+ * Issue an STP frame supplied through an ioctl.  SGE 0 of the ioctl frame
+ * describes the user FIS buffer and SGE 1 the user data buffer (32- or
+ * 64-bit form, chosen from the caller's data model).  For each non-zero
+ * length a kernel DMA buffer is allocated and pre-loaded from the user
+ * buffer.  After the command has been issued synchronously the FIS buffer
+ * is copied back out on success, and the data buffer is copied back out
+ * regardless of the firmware result.
+ *
+ * cmd_status is copied back into the ioctl frame.
+ *
+ * Returns DDI_FAILURE if a buffer cannot be allocated, a user copy fails,
+ * or a buffer cannot be freed; DDI_SUCCESS otherwise (including when the
+ * firmware call itself reports failure, which is only logged).  Every DMA
+ * buffer that was allocated is released on all paths.
+ */
+static int
+issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    struct drsas_cmd *cmd, int mode)
+{
+	void		*fis_ubuf;
+	void		*data_ubuf;
+	uint32_t	fis_xferlen = 0;
+	uint32_t	data_xferlen = 0;
+	/* device addresses stay 0 for a buffer that is not allocated */
+	uint32_t	fis_phys = 0;
+	uint32_t	data_phys = 0;
+	uint_t		model;
+	int		fis_allocated = 0;
+	int		data_allocated = 0;
+	int		rval = DDI_FAILURE;
+	dma_obj_t	fis_dma_obj;
+	dma_obj_t	data_dma_obj;
+	struct drsas_stp_frame	*kstp;
+	struct drsas_stp_frame	*stp;
+	ddi_acc_handle_t	acc_handle = cmd->frame_dma_obj.acc_handle;
+
+	stp = &cmd->frame->stp;
+	kstp = (struct drsas_stp_frame *)&ioctl->frame[0];
+
+	/* pick the SGE layout that matches the caller's data model */
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+		fis_xferlen	= kstp->sgl.sge32[0].length;
+		data_xferlen	= kstp->sgl.sge32[1].length;
+
+		fis_ubuf	= (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+		data_ubuf	= (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+		fis_xferlen	= kstp->sgl.sge32[0].length;
+		data_xferlen	= kstp->sgl.sge32[1].length;
+
+		fis_ubuf	= (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+		data_ubuf	= (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+#else
+		con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64"));
+
+		fis_xferlen	= kstp->sgl.sge64[0].length;
+		data_xferlen	= kstp->sgl.sge64[1].length;
+
+		fis_ubuf	= (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr;
+		data_ubuf	= (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr;
+#endif
+	}
+
+
+	if (fis_xferlen) {
+		con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: "
+		    "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen));
+
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		fis_dma_obj.size = fis_xferlen;
+		fis_dma_obj.dma_attr = drsas_generic_dma_attr;
+		fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		fis_dma_obj.dma_attr.dma_attr_count_max	= 0xFFFFFFFFU;
+		fis_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		fis_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &fis_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : "
+			    "could not allocate data transfer buffer."));
+			goto out;
+		}
+		fis_allocated = 1;
+
+		/* pre-load the kernel buffer from the user FIS buffer */
+		if (ddi_copyin(fis_ubuf, fis_dma_obj.buffer, fis_xferlen,
+		    mode)) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+			    "copy from user space failed"));
+			goto out;
+		}
+
+		fis_phys = fis_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	if (data_xferlen) {
+		con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p "
+		    "data_xferlen = %x", data_ubuf, data_xferlen));
+
+		/* means IOCTL requires DMA */
+		/* allocate the data transfer buffer */
+		data_dma_obj.size = data_xferlen;
+		data_dma_obj.dma_attr = drsas_generic_dma_attr;
+		data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+		data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+		data_dma_obj.dma_attr.dma_attr_sgllen = 1;
+		data_dma_obj.dma_attr.dma_attr_align = 1;
+
+		/* allocate kernel buffer for DMA */
+		if (drsas_alloc_dma_obj(instance, &data_dma_obj,
+		    (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+			    "could not allocate data transfer buffer."));
+			goto out;
+		}
+		data_allocated = 1;
+
+		/* pre-load the kernel buffer from the user data buffer */
+		if (ddi_copyin(data_ubuf, data_dma_obj.buffer, data_xferlen,
+		    mode)) {
+			con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+			    "copy from user space failed"));
+			goto out;
+		}
+
+		data_phys = data_dma_obj.dma_cookie[0].dmac_address;
+	}
+
+	ddi_put8(acc_handle, &stp->cmd, kstp->cmd);
+	ddi_put8(acc_handle, &stp->cmd_status, 0);
+	ddi_put8(acc_handle, &stp->connection_status, 0);
+	ddi_put8(acc_handle, &stp->target_id, kstp->target_id);
+	ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count);
+
+	ddi_put16(acc_handle, &stp->timeout, kstp->timeout);
+	ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len);
+
+	ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10,
+	    DDI_DEV_AUTOINCR);
+
+	/* the firmware is always handed 32-bit SGEs for this frame */
+	ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64);
+	ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags);
+	ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen);
+	ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, fis_phys);
+	ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen);
+	ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, data_phys);
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed"));
+	} else if (fis_xferlen) {
+		if (ddi_copyout(fis_dma_obj.buffer, fis_ubuf, fis_xferlen,
+		    mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_stp : copy to "
+			    "user space failed"));
+			goto out;
+		}
+	}
+
+	/*
+	 * NOTE(review): unlike the FIS buffer, the data buffer is copied
+	 * out even when the firmware call failed.  Behaviour preserved;
+	 * confirm whether this was meant to sit in the success branch.
+	 */
+	if (data_xferlen) {
+		if (ddi_copyout(data_dma_obj.buffer, data_ubuf, data_xferlen,
+		    mode)) {
+			con_log(CL_ANN, (CE_WARN,
+			    "issue_mfi_stp : copy to"
+			    " user space failed"));
+			goto out;
+		}
+	}
+
+	kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status);
+
+	rval = DDI_SUCCESS;
+
+out:
+	/* free whichever kernel buffers were allocated */
+	if (fis_allocated &&
+	    drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) {
+		rval = DDI_FAILURE;
+	}
+
+	if (data_allocated &&
+	    drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) {
+		rval = DDI_FAILURE;
+	}
+
+	return (rval);
+}
+
+/*
+ * fill_up_drv_ver
+ *
+ * Zero the caller's version structure and fill in the signature, OS name,
+ * driver name, driver version and release date.  Only the characters of
+ * each string are copied (no terminator); the preceding memset supplies
+ * the zero bytes that follow them.
+ */
+static void
+fill_up_drv_ver(struct drsas_drv_ver *dv)
+{
+	const char	*signature = "$LSI LOGIC$";
+	const char	*os_name = "Solaris";
+	const char	*drv_name = "dr_sas";
+	const char	*drv_ver = DRSAS_VERSION;
+	const char	*rel_date = DRSAS_RELDATE;
+
+	(void) memset(dv, 0, sizeof (*dv));
+
+	(void) memcpy(dv->signature, signature, strlen(signature));
+	(void) memcpy(dv->os_name, os_name, strlen(os_name));
+	(void) memcpy(dv->drv_name, drv_name, strlen(drv_name));
+	(void) memcpy(dv->drv_ver, drv_ver, strlen(drv_ver));
+	(void) memcpy(dv->drv_rel_date, rel_date, strlen(rel_date));
+}
+
+/*
+ * handle_drv_ioctl
+ *
+ * Service a driver-private ioctl carried in a DCMD frame.  The user
+ * buffer address and transfer length are read from the first SGE of the
+ * frame: the 32-bit SGE for ILP32 callers (or an ILP32 kernel), the
+ * 64-bit SGE otherwise.
+ *
+ * DRSAS_DRIVER_IOCTL_DRIVER_VERSION copies out a struct drsas_drv_ver and
+ * DRSAS_DRIVER_IOCTL_PCI_INFORMATION copies out a struct
+ * drsas_pci_information.  kdcmd->cmd_status is set to 0 after a
+ * successful copy-out and to 1 on a copy failure or an unknown opcode.
+ *
+ * The transfer length is supplied by the caller, so it is clamped to the
+ * size of the structure being returned; copying more would expose
+ * whatever follows that structure on the kernel stack.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    int mode)
+{
+	int	i;
+	int	rval = DDI_SUCCESS;
+	int	*props = NULL;
+	void	*ubuf;
+
+	uint8_t		*pci_conf_buf;
+	uint32_t	xferlen;
+	uint32_t	num_props;
+	uint_t		model;
+	size_t		copylen;
+	struct drsas_dcmd_frame	*kdcmd;
+	struct drsas_drv_ver	dv;
+	struct drsas_pci_information pi;
+
+	kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "handle_drv_ioctl: DDI_MODEL_ILP32"));
+
+		xferlen	= kdcmd->sgl.sge32[0].length;
+
+		ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE,
+		    "handle_drv_ioctl: DDI_MODEL_ILP32"));
+		xferlen	= kdcmd->sgl.sge32[0].length;
+		ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+		con_log(CL_ANN1, (CE_NOTE,
+		    "handle_drv_ioctl: DDI_MODEL_LP64"));
+		xferlen	= kdcmd->sgl.sge64[0].length;
+		ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+	}
+	con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+	    "dataBuf=%p size=%d bytes", ubuf, xferlen));
+
+	switch (kdcmd->opcode) {
+	case DRSAS_DRIVER_IOCTL_DRIVER_VERSION:
+		con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+		    "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"));
+
+		fill_up_drv_ver(&dv);
+
+		/* Never copy out more than the structure holds. */
+		copylen = xferlen;
+		if (copylen > sizeof (dv))
+			copylen = sizeof (dv);
+
+		if (ddi_copyout(&dv, ubuf, copylen, mode)) {
+			con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+			    "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"
+			    " : copy to user space failed"));
+			kdcmd->cmd_status = 1;
+			rval = DDI_FAILURE;
+		} else {
+			kdcmd->cmd_status = 0;
+		}
+		break;
+	case DRSAS_DRIVER_IOCTL_PCI_INFORMATION:
+		con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+		    "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON"));
+
+		/*
+		 * Start from a clean structure so that padding, and the
+		 * bus/device/function fields when the "reg" lookup fails,
+		 * do not carry stale stack contents to user space.
+		 */
+		(void) memset(&pi, 0, sizeof (pi));
+
+		if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip,
+		    0, "reg", &props, &num_props)) {
+			con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+			    "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : "
+			    "ddi_prop_look_int_array failed"));
+			rval = DDI_FAILURE;
+		} else {
+
+			pi.busNumber = (props[0] >> 16) & 0xFF;
+			pi.deviceNumber = (props[0] >> 11) & 0x1f;
+			pi.functionNumber = (props[0] >> 8) & 0x7;
+			ddi_prop_free((void *)props);
+		}
+
+		/* Fill the header area byte by byte from PCI config space. */
+		pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo;
+
+		for (i = 0; i < (sizeof (struct drsas_pci_information) -
+		    offsetof(struct drsas_pci_information, pciHeaderInfo));
+		    i++) {
+			pci_conf_buf[i] =
+			    pci_config_get8(instance->pci_handle, i);
+		}
+
+		/* Never copy out more than the structure holds. */
+		copylen = xferlen;
+		if (copylen > sizeof (pi))
+			copylen = sizeof (pi);
+
+		if (ddi_copyout(&pi, ubuf, copylen, mode)) {
+			con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+			    "DRSAS_DRIVER_IOCTL_PCI_INFORMATION"
+			    " : copy to user space failed"));
+			kdcmd->cmd_status = 1;
+			rval = DDI_FAILURE;
+		} else {
+			kdcmd->cmd_status = 0;
+		}
+
+		break;
+	default:
+		con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+		    "invalid driver specific IOCTL opcode = 0x%x",
+		    kdcmd->opcode));
+		kdcmd->cmd_status = 1;
+		rval = DDI_FAILURE;
+		break;
+	}
+
+	return (rval);
+}
+
+/*
+ * handle_mfi_ioctl
+ *
+ * Take a command packet, reset its frame, and dispatch the user's MFI
+ * frame to the matching issue_mfi_*() routine according to the command
+ * code in the frame header.  Unknown command codes fail.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when no packet is available, the
+ * command code is invalid, the issue routine fails, or the FMA handle
+ * checks report an error.
+ */
+static int
+handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    int mode)
+{
+	int	rval = DDI_SUCCESS;
+
+	struct drsas_header	*hdr;
+	struct drsas_cmd	*cmd;
+
+	cmd = get_mfi_pkt(instance);
+
+	if (!cmd) {
+		con_log(CL_ANN, (CE_WARN, "dr_sas: "
+		    "failed to get a cmd packet"));
+		return (DDI_FAILURE);
+	}
+
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+
+	hdr = (struct drsas_header *)&ioctl->frame[0];
+
+	switch (hdr->cmd) {
+	case MFI_CMD_OP_DCMD:
+		rval = issue_mfi_dcmd(instance, ioctl, cmd, mode);
+		break;
+	case MFI_CMD_OP_SMP:
+		rval = issue_mfi_smp(instance, ioctl, cmd, mode);
+		break;
+	case MFI_CMD_OP_STP:
+		rval = issue_mfi_stp(instance, ioctl, cmd, mode);
+		break;
+	case MFI_CMD_OP_LD_SCSI:
+	case MFI_CMD_OP_PD_SCSI:
+		rval = issue_mfi_pthru(instance, ioctl, cmd, mode);
+		break;
+	default:
+		con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: "
+		    "invalid mfi ioctl hdr->cmd = %d", hdr->cmd));
+		rval = DDI_FAILURE;
+		break;
+	}
+
+	/*
+	 * Run the FMA checks while this thread still owns the packet.
+	 * drsas_common_check() reads cmd->frame_dma_obj and may write
+	 * through cmd->pkt, so it must not run after return_mfi_pkt()
+	 * has given the packet back (presumably for reuse by others).
+	 */
+	if (drsas_common_check(instance, cmd) != DDI_SUCCESS)
+		rval = DDI_FAILURE;
+
+	return_mfi_pkt(instance, cmd);
+
+	return (rval);
+}
+
+/*
+ * AEN
+ *
+ * Register for asynchronous event notification using the instance's
+ * current sequence number and the class/locale word from the request.
+ * The result is stored (truncated to 8 bits) in aen->cmd_status and also
+ * returned to the caller.
+ */
+static int
+handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen)
+{
+	int	status;
+
+	status = register_mfi_aen(instance, instance->aen_seq_num,
+	    aen->class_locale_word);
+	aen->cmd_status = (uint8_t)status;
+
+	return (status);
+}
+
+/*
+ * register_mfi_aen
+ *
+ * Issue an event-wait DCMD to the firmware for events starting at seq_num
+ * that match class_locale_word.
+ *
+ * Returns 0 when the registration was issued or an existing one already
+ * covers the request, the abort_aen_cmd() result when the pending command
+ * could not be aborted, and ENOMEM when no command packet is available.
+ */
+static int
+register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num,
+    uint32_t class_locale_word)
+{
+	int	abort_rc;
+	int	locale_covered;
+
+	struct drsas_cmd	*new_cmd, *pending;
+	struct drsas_dcmd_frame	*dcmd;
+	union drsas_evt_class_locale	wanted;
+	union drsas_evt_class_locale	active;
+
+	/*
+	 * If an AEN command is already pending, check whether its
+	 * class_locale already includes everything the new request
+	 * subscribes to.  If so there is nothing to do.
+	 *
+	 * Otherwise the pending command is aborted and a new one is
+	 * issued whose class_locale is a superset of both.
+	 */
+	wanted.word = class_locale_word;
+	pending = instance->aen_cmd;
+
+	if (pending != NULL) {
+		active.word = ddi_get32(pending->frame_dma_obj.acc_handle,
+		    &pending->frame->dcmd.mbox.w[1]);
+
+		/*
+		 * Locales are bitmaps: the pending registration covers
+		 * the request when every requested bit is already set.
+		 */
+		locale_covered = ((active.members.locale &
+		    wanted.members.locale) == wanted.members.locale);
+
+		/*
+		 * Classes are ordered: a smaller value includes all
+		 * higher ones, so a pending class that is <= the
+		 * requested class already covers it.
+		 */
+		if ((active.members.class <= wanted.members.class) &&
+		    locale_covered) {
+			return (0);
+		}
+
+		/* Merge the two subscriptions before re-registering. */
+		wanted.members.locale |= active.members.locale;
+		if (active.members.class < wanted.members.class)
+			wanted.members.class = active.members.class;
+
+		abort_rc = abort_aen_cmd(instance, pending);
+		if (abort_rc) {
+			con_log(CL_ANN, (CE_WARN, "register_mfi_aen: "
+			    "failed to abort prevous AEN command"));
+
+			return (abort_rc);
+		}
+	}
+
+	new_cmd = get_mfi_pkt(instance);
+	if (new_cmd == NULL)
+		return (ENOMEM);
+
+	/* Reset the frame, then restore its context id. */
+	(void) memset((char *)&new_cmd->frame[0], 0,
+	    sizeof (union drsas_frame));
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle,
+	    &new_cmd->frame->hdr.context, new_cmd->index);
+
+	dcmd = &new_cmd->frame->dcmd;
+
+	(void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+	/* The event detail buffer receives the firmware's reply. */
+	(void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+	    sizeof (struct drsas_evt_detail));
+
+	/* Build the event-wait DCMD. */
+	ddi_put8(new_cmd->frame_dma_obj.acc_handle, &dcmd->cmd,
+	    MFI_CMD_OP_DCMD);
+	ddi_put8(new_cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+	ddi_put8(new_cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+	ddi_put16(new_cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+	    MFI_FRAME_DIR_READ);
+	ddi_put16(new_cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+	    sizeof (struct drsas_evt_detail));
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+	    DR_DCMD_CTRL_EVENT_WAIT);
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0],
+	    seq_num);
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1],
+	    wanted.word);
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle,
+	    &dcmd->sgl.sge32[0].phys_addr,
+	    instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address);
+	ddi_put32(new_cmd->frame_dma_obj.acc_handle,
+	    &dcmd->sgl.sge32[0].length, sizeof (struct drsas_evt_detail));
+
+	instance->aen_seq_num = seq_num;
+
+	/*
+	 * Remember the command used for this registration so that a later
+	 * request with a different class/locale can abort and replace it.
+	 */
+	instance->aen_cmd = new_cmd;
+
+	new_cmd->frame_count = 1;
+
+	/* Hand the registration frame to the firmware. */
+	instance->func_ptr->issue_cmd(new_cmd, instance);
+
+	return (0);
+}
+
+/*
+ * display_scsi_inquiry
+ *
+ * Format the vendor (bytes 8-15), model (bytes 16-31), revision (bytes
+ * 32-35), device type (low five bits of byte 0) and ANSI revision (low
+ * three bits of byte 2) of a SCSI INQUIRY response and emit the text
+ * through con_log().  scsi_inq must hold at least 36 bytes.
+ */
+static void
+display_scsi_inquiry(caddr_t scsi_inq)
+{
+#define	MAX_SCSI_DEVICE_CODE	14
+	int		i;
+	char		inquiry_buf[256] = {0};
+	int		len;
+	/* Bound every snprintf by the real buffer size. */
+	const size_t	buflen = sizeof (inquiry_buf);
+	const char	*const scsi_device_types[] = {
+		"Direct-Access    ",
+		"Sequential-Access",
+		"Printer          ",
+		"Processor        ",
+		"WORM             ",
+		"CD-ROM           ",
+		"Scanner          ",
+		"Optical Device   ",
+		"Medium Changer   ",
+		"Communications   ",
+		"Unknown          ",
+		"Unknown          ",
+		"Unknown          ",
+		"Enclosure        ",
+	};
+
+	len = 0;
+
+	len += snprintf(inquiry_buf + len, buflen - len, "  Vendor: ");
+	for (i = 8; i < 16; i++) {
+		len += snprintf(inquiry_buf + len, buflen - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, buflen - len, "  Model: ");
+
+	for (i = 16; i < 32; i++) {
+		len += snprintf(inquiry_buf + len, buflen - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, buflen - len, "  Rev: ");
+
+	for (i = 32; i < 36; i++) {
+		len += snprintf(inquiry_buf + len, buflen - len, "%c",
+		    scsi_inq[i]);
+	}
+
+	len += snprintf(inquiry_buf + len, buflen - len, "\n");
+
+	/* Peripheral device type selects the name from the table above. */
+	i = scsi_inq[0] & 0x1f;
+
+	len += snprintf(inquiry_buf + len, buflen - len, "  Type:   %s ",
+	    i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] :
+	    "Unknown          ");
+
+	len += snprintf(inquiry_buf + len, buflen - len,
+	    "                 ANSI SCSI revision: %02x", scsi_inq[2] & 0x07);
+
+	if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) {
+		len += snprintf(inquiry_buf + len, buflen - len, " CCS\n");
+	} else {
+		len += snprintf(inquiry_buf + len, buflen - len, "\n");
+	}
+
+	/*
+	 * The text contains device-supplied bytes, so it is passed as an
+	 * argument rather than as the format string.
+	 */
+	con_log(CL_ANN1, (CE_CONT, "%s", inquiry_buf));
+}
+
+/*
+ * read_fw_status_reg_ppc
+ *
+ * Return the value read through RD_OB_SCRATCH_PAD_0(), cast to int.
+ */
+static int
+read_fw_status_reg_ppc(struct drsas_instance *instance)
+{
+	int	fw_status;
+
+	fw_status = (int)RD_OB_SCRATCH_PAD_0(instance);
+
+	return (fw_status);
+}
+
+/*
+ * issue_cmd_ppc
+ *
+ * Count the command as outstanding, then post its frame to the inbound
+ * queue port.  The value written is the frame's physical address OR'ed
+ * with (frame_count - 1) shifted left by one, with bit 0 set.
+ */
+static void
+issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance)
+{
+	/* Account for the command before the firmware can see it. */
+	atomic_add_16(&instance->fw_outstanding, 1);
+
+	WR_IB_QPORT(cmd->frame_phys_addr |
+	    ((cmd->frame_count - 1) << 1) | 1, instance);
+}
+
+/*
+ * issue_cmd_in_sync_mode
+ *
+ * Post the command frame to the inbound queue port and block on
+ * int_cmd_cv until cmd->cmd_status is no longer ENODATA or the wakeup
+ * budget is used up.
+ *
+ * NOTE(review): cv_wait() takes no timeout, so the loop bound counts
+ * wakeups rather than elapsed milliseconds -- confirm this is intended.
+ *
+ * Returns DDI_SUCCESS when the loop ended with fewer than (budget - 1)
+ * wakeups, DDI_FAILURE otherwise.
+ */
+static int
+issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance,
+    struct drsas_cmd *cmd)
+{
+	int		wakeups = 0;
+	uint32_t	budget = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC);
+
+	con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called"));
+
+	/* Sentinel value; the loop below waits for it to change. */
+	cmd->cmd_status	= ENODATA;
+
+	WR_IB_QPORT(cmd->frame_phys_addr |
+	    ((cmd->frame_count - 1) << 1) | 1, instance);
+
+	mutex_enter(&instance->int_cmd_mtx);
+
+	while (wakeups < budget && cmd->cmd_status == ENODATA) {
+		cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx);
+		wakeups++;
+	}
+
+	mutex_exit(&instance->int_cmd_mtx);
+
+	con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done"));
+
+	return ((wakeups < (budget - 1)) ? DDI_SUCCESS : DDI_FAILURE);
+}
+
+/*
+ * issue_cmd_in_poll_mode
+ *
+ * Mark the frame for polled completion (status preset to
+ * MFI_CMD_STATUS_POLL_MODE, MFI_FRAME_DONT_POST_IN_REPLY_QUEUE set in the
+ * flags), post it to the inbound queue port, and poll the frame's status
+ * byte once per millisecond for up to MFI_POLL_TIMEOUT_SECS seconds.
+ *
+ * Returns DDI_SUCCESS once the status byte has changed, DDI_FAILURE if it
+ * still holds MFI_CMD_STATUS_POLL_MODE after the last poll.
+ */
+static int
+issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance,
+    struct drsas_cmd *cmd)
+{
+	int		polls;
+	uint16_t	frame_flags;
+	uint32_t	max_polls = MFI_POLL_TIMEOUT_SECS * MILLISEC;
+	struct drsas_header *hdr;
+
+	con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called"));
+
+	hdr = (struct drsas_header *)cmd->frame;
+
+	/* Preset the status so a change can be detected below. */
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &hdr->cmd_status,
+	    MFI_CMD_STATUS_POLL_MODE);
+
+	frame_flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &hdr->flags);
+	frame_flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
+	ddi_put16(cmd->frame_dma_obj.acc_handle, &hdr->flags, frame_flags);
+
+	/* Post the frame through the inbound queue port. */
+	WR_IB_QPORT(cmd->frame_phys_addr |
+	    ((cmd->frame_count - 1) << 1) | 1, instance);
+
+	/* Poll until the status byte moves away from the preset value. */
+	polls = 0;
+	while (polls < max_polls &&
+	    ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd_status) ==
+	    MFI_CMD_STATUS_POLL_MODE) {
+		drv_usecwait(MILLISEC);	/* 1000 usecs between polls */
+		polls++;
+	}
+
+	if (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd_status) ==
+	    MFI_CMD_STATUS_POLL_MODE) {
+		con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: "
+		    "cmd polling timed out"));
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * enable_intr_ppc
+ *
+ * Write OB_DOORBELL_CLEAR_MASK to the outbound doorbell clear register,
+ * then write the complement of MFI_REPLY_2108_MESSAGE_INTR_MASK to the
+ * outbound interrupt mask register.  The mask is read back afterwards
+ * and logged.
+ */
+static void
+enable_intr_ppc(struct drsas_instance *instance)
+{
+	uint32_t	ob_mask;
+
+	con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called"));
+
+	WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance);
+
+	WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance);
+
+	/* Read back so the preceding PCI writes are flushed. */
+	ob_mask = RD_OB_INTR_MASK(instance);
+
+	con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: "
+	    "outbound_intr_mask = 0x%x", ob_mask));
+}
+
+/*
+ * disable_intr_ppc
+ *
+ * Write OB_INTR_MASK to the outbound interrupt mask register, logging the
+ * register value before and after, then read the register once more.
+ */
+static void
+disable_intr_ppc(struct drsas_instance *instance)
+{
+	uint32_t	readback;
+
+	con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called"));
+
+	con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : "
+	    "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+	WR_OB_INTR_MASK(OB_INTR_MASK, instance);
+
+	con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : "
+	    "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+	/* Read back so the preceding PCI write is flushed. */
+	readback = RD_OB_INTR_MASK(instance);
+#ifdef lint
+	readback = readback;
+#endif
+}
+
+/*
+ * intr_ack_ppc
+ *
+ * Read the outbound interrupt status.  If MFI_REPLY_2108_MESSAGE_INTR is
+ * not set the interrupt is not ours and DDI_INTR_UNCLAIMED is returned.
+ * Otherwise the status value is written back to the doorbell clear
+ * register, the status is read once more, and DDI_INTR_CLAIMED is
+ * returned.
+ */
+static int
+intr_ack_ppc(struct drsas_instance *instance)
+{
+	uint32_t	intr_status;
+
+	con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called"));
+
+	intr_status = RD_OB_INTR_STATUS(instance);
+
+	con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x",
+	    intr_status));
+
+	/* Not raised by this controller. */
+	if ((intr_status & MFI_REPLY_2108_MESSAGE_INTR) == 0) {
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	/* Writing the same value back clears the interrupt. */
+	WR_OB_DOORBELL_CLEAR(intr_status, instance);
+
+	/* Dummy read following the clear. */
+	intr_status = RD_OB_INTR_STATUS(instance);
+
+	con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared"));
+
+	return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * If the command carries a SCSI packet, mark it as failed with a
+ * transport error and clear its statistics.
+ */
+static void
+drsas_common_check_fail_pkt(struct drsas_cmd *cmd)
+{
+	if (cmd->pkt != NULL) {
+		cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+		cmd->pkt->pkt_statistics = 0;
+	}
+}
+
+/*
+ * drsas_common_check
+ *
+ * Check the FMA state of the command's frame DMA handle, the instance's
+ * internal and event-detail DMA handles, and the register access handle.
+ * Each handle in error is reported with DDI_SERVICE_UNAFFECTED and, when
+ * the command has a packet, the packet is marked CMD_TRAN_ERR.  An error
+ * on the register access handle is also cleared.
+ *
+ * Returns DDI_SUCCESS when every handle is clean, DDI_FAILURE otherwise.
+ */
+static int
+drsas_common_check(struct drsas_instance *instance,
+    struct  drsas_cmd *cmd)
+{
+	int	result = DDI_SUCCESS;
+
+	/* DMA handle of the command frame. */
+	if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+	    DDI_SUCCESS) {
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+		drsas_common_check_fail_pkt(cmd);
+		result = DDI_FAILURE;
+	}
+
+	/* DMA handle of the instance's internal buffer. */
+	if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+	    != DDI_SUCCESS) {
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+		drsas_common_check_fail_pkt(cmd);
+		result = DDI_FAILURE;
+	}
+
+	/* DMA handle of the event detail buffer. */
+	if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) !=
+	    DDI_SUCCESS) {
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+		drsas_common_check_fail_pkt(cmd);
+		result = DDI_FAILURE;
+	}
+
+	/* Register access handle; its error state is cleared as well. */
+	if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+		ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+		ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0);
+		drsas_common_check_fail_pkt(cmd);
+		result = DDI_FAILURE;
+	}
+
+	return (result);
+}
+
+/*
+ * drsas_fm_error_cb
+ *
+ * FMA error callback.  Posts a PCI ereport for the error and hands the
+ * resulting fme_status back; the driver does no further handling here
+ * because it checks its DMA and access handles itself.
+ */
+/*ARGSUSED*/
+static int
+drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data)
+{
+	int	status;
+
+	pci_ereport_post(dip, err, NULL);
+	status = err->fme_status;
+
+	return (status);
+}
+
+/*
+ * drsas_fm_init
+ *
+ * Set the access and DMA attribute flags to match the instance's FMA
+ * capabilities.  With no capabilities the defaults are used and nothing
+ * is registered.  Otherwise the driver registers with IO Fault Services,
+ * sets up PCI ereports when ereport or error-callback capable, and
+ * registers drsas_fm_error_cb() when error-callback capable.
+ */
+static void
+drsas_fm_init(struct drsas_instance *instance)
+{
+	/* Need to change iblock to priority for new MSI intr */
+	ddi_iblock_cookie_t fm_ibc;
+
+	/* No capabilities requested: plain attributes, no registration. */
+	if (!instance->fm_capabilities) {
+		endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+		drsas_generic_dma_attr.dma_attr_flags = 0;
+		return;
+	}
+
+	/* Error-flagging access and DMA attributes for FMA. */
+	endian_attr.devacc_attr_access = DDI_FLAGERR_ACC;
+	drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR;
+
+	/*
+	 * ddi_fm_init() updates fm_capabilities to the capabilities that
+	 * are actually supported, which may differ from those requested.
+	 */
+	ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc);
+
+	/* PCI ereport setup (expected to always be needed). */
+	if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+	    DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+		pci_ereport_setup(instance->dip);
+	}
+
+	/* Error callback, when supported. */
+	if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+		ddi_fm_handler_register(instance->dip,
+		    drsas_fm_error_cb, (void*) instance);
+	}
+}
+
+/*
+ * drsas_fm_fini
+ *
+ * Undo drsas_fm_init(): unregister the error callback, tear down PCI
+ * ereport resources, unregister from IO Fault Services and restore the
+ * default access and DMA attribute flags.  Does nothing when the
+ * instance has no FMA capabilities.
+ */
+static void
+drsas_fm_fini(struct drsas_instance *instance)
+{
+	/* Nothing was registered without capabilities. */
+	if (!instance->fm_capabilities)
+		return;
+
+	/* Drop the error callback first, if one was registered. */
+	if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+		ddi_fm_handler_unregister(instance->dip);
+	}
+
+	/* Release what pci_ereport_setup() allocated. */
+	if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+	    DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+		pci_ereport_teardown(instance->dip);
+	}
+
+	/* Leave IO Fault Services. */
+	ddi_fm_fini(instance->dip);
+
+	/* Back to plain access and DMA attributes. */
+	endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+	drsas_generic_dma_attr.dma_attr_flags = 0;
+}
+
+/*
+ * drsas_check_acc_handle
+ *
+ * Return the FMA error status recorded for an access handle, or
+ * DDI_FAILURE when the handle is NULL.
+ */
+int
+drsas_check_acc_handle(ddi_acc_handle_t handle)
+{
+	ddi_fm_error_t	fm_err;
+
+	if (handle != NULL) {
+		ddi_fm_acc_err_get(handle, &fm_err, DDI_FME_VERSION);
+		return (fm_err.fme_status);
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * drsas_check_dma_handle
+ *
+ * Return the FMA error status recorded for a DMA handle, or DDI_FAILURE
+ * when the handle is NULL.
+ */
+int
+drsas_check_dma_handle(ddi_dma_handle_t handle)
+{
+	ddi_fm_error_t	fm_err;
+
+	if (handle != NULL) {
+		ddi_fm_dma_err_get(handle, &fm_err, DDI_FME_VERSION);
+		return (fm_err.fme_status);
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * drsas_fm_ereport
+ *
+ * Build the ereport class "<DDI_FM_DEVICE>.<detail>" and post it for the
+ * instance.  Nothing is posted when the instance is not ereport capable.
+ */
+void
+drsas_fm_ereport(struct drsas_instance *instance, char *detail)
+{
+	char		class_name[FM_MAX_CLASS];
+	uint64_t	ena;
+
+	(void) snprintf(class_name, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE,
+	    detail);
+	ena = fm_ena_generate(0, FM_ENA_FMT1);
+
+	if (!DDI_FM_EREPORT_CAP(instance->fm_capabilities))
+		return;
+
+	ddi_fm_ereport_post(instance->dip, class_name, ena, DDI_NOSLEEP,
+	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL);
+}
+
+/*
+ * drsas_add_intrs
+ *
+ * Allocate, register and enable the interrupt(s) of the given type for
+ * the instance.  Only one interrupt is requested even when the device
+ * offers more.  On success intr_htable, intr_size, intr_cnt, intr_pri and
+ * intr_cap of the instance describe the result.
+ *
+ * On failure everything acquired so far is released in reverse order:
+ * handlers that were added are removed, allocated interrupts are freed
+ * and the handle table is freed.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+drsas_add_intrs(struct drsas_instance *instance, int intr_type)
+{
+
+	dev_info_t *dip = instance->dip;
+	int	avail, actual, count;
+	int	i, flag, ret;
+	int	nhandlers = 0;	/* handlers added so far */
+
+	con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x",
+	    intr_type));
+
+	/* Get number of interrupts */
+	ret = ddi_intr_get_nintrs(dip, intr_type, &count);
+	if ((ret != DDI_SUCCESS) || (count == 0)) {
+		con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:"
+		    "ret %d count %d", ret, count));
+
+		return (DDI_FAILURE);
+	}
+
+	con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count));
+
+	/* Get number of available interrupts */
+	ret = ddi_intr_get_navail(dip, intr_type, &avail);
+	if ((ret != DDI_SUCCESS) || (avail == 0)) {
+		con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:"
+		    "ret %d avail %d", ret, avail));
+
+		return (DDI_FAILURE);
+	}
+	con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail));
+
+	/* Only one interrupt routine. So limit the count to 1 */
+	if (count > 1) {
+		count = 1;
+	}
+
+	/*
+	 * Allocate an array of interrupt handlers. Currently we support
+	 * only one interrupt. The framework can be extended later.
+	 */
+	instance->intr_size = count * sizeof (ddi_intr_handle_t);
+	instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP);
+	ASSERT(instance->intr_htable);
+
+	/* MSI and MSI-X must get exactly the number requested. */
+	flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type ==
+	    DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL;
+
+	/* Allocate interrupt */
+	ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0,
+	    count, &actual, flag);
+
+	if ((ret != DDI_SUCCESS) || (actual == 0)) {
+		con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+		    "avail = %d", avail));
+		/* Nothing was allocated; only the table needs freeing. */
+		actual = 0;
+		goto fail;
+	}
+	if (actual < count) {
+		con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+		    "Requested = %d  Received = %d", count, actual));
+	}
+	instance->intr_cnt = actual;
+
+	/*
+	 * Get the priority of the interrupt allocated.
+	 */
+	if ((ret = ddi_intr_get_pri(instance->intr_htable[0],
+	    &instance->intr_pri)) != DDI_SUCCESS) {
+		con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+		    "get priority call failed"));
+		goto fail;
+	}
+
+	/*
+	 * Test for high level mutex. we don't support them.
+	 */
+	if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) {
+		con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+		    "High level interrupts not supported."));
+		goto fail;
+	}
+
+	con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ",
+	    instance->intr_pri));
+
+	/* Call ddi_intr_add_handler() */
+	for (i = 0; i < actual; i++) {
+		ret = ddi_intr_add_handler(instance->intr_htable[i],
+		    (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance,
+		    (caddr_t)(uintptr_t)i);
+
+		if (ret != DDI_SUCCESS) {
+			con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:"
+			    "failed %d", ret));
+			goto fail;
+		}
+
+		/* Track how many handlers must be removed on failure. */
+		nhandlers++;
+	}
+
+	con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done"));
+
+	if ((ret = ddi_intr_get_cap(instance->intr_htable[0],
+	    &instance->intr_cap)) != DDI_SUCCESS) {
+		con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d",
+		    ret));
+		goto fail;
+	}
+
+	if (instance->intr_cap &  DDI_INTR_FLAG_BLOCK) {
+		con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable"));
+
+		(void) ddi_intr_block_enable(instance->intr_htable,
+		    instance->intr_cnt);
+	} else {
+		con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable"));
+
+		for (i = 0; i < instance->intr_cnt; i++) {
+			(void) ddi_intr_enable(instance->intr_htable[i]);
+			con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns "
+			    "%d", i));
+		}
+	}
+
+	return (DDI_SUCCESS);
+
+fail:
+	/* Handlers must be removed before their interrupts are freed. */
+	for (i = 0; i < nhandlers; i++) {
+		(void) ddi_intr_remove_handler(instance->intr_htable[i]);
+	}
+	for (i = 0; i < actual; i++) {
+		(void) ddi_intr_free(instance->intr_htable[i]);
+	}
+	kmem_free(instance->intr_htable, instance->intr_size);
+	return (DDI_FAILURE);
+
+}
+
+
+/*
+ * drsas_rem_intrs
+ *
+ * Disable the instance's interrupts (as a block when the capability flag
+ * says so, one by one otherwise), remove each handler, free each
+ * interrupt and finally free the handle table.
+ */
+static void
+drsas_rem_intrs(struct drsas_instance *instance)
+{
+	int	n;
+
+	con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called"));
+
+	/* Quiesce every interrupt before touching the handlers. */
+	if ((instance->intr_cap & DDI_INTR_FLAG_BLOCK) == 0) {
+		for (n = 0; n < instance->intr_cnt; n++) {
+			(void) ddi_intr_disable(instance->intr_htable[n]);
+		}
+	} else {
+		(void) ddi_intr_block_disable(instance->intr_htable,
+		    instance->intr_cnt);
+	}
+
+	/* Detach the handler from each interrupt, then release it. */
+	n = 0;
+	while (n < instance->intr_cnt) {
+		(void) ddi_intr_remove_handler(instance->intr_htable[n]);
+		(void) ddi_intr_free(instance->intr_htable[n]);
+		n++;
+	}
+
+	kmem_free(instance->intr_htable, instance->intr_size);
+}
+
+/*
+ * drsas_tran_bus_config
+ *
+ * Bus config entry point.  For BUS_CONFIG_ONE the child name in arg must
+ * contain an '@' unit address; the target and LUN are parsed from it and
+ * the logical drive is configured (only LUN 0 is accepted, and a name
+ * without a LUN part is treated as LUN 0).  For BUS_CONFIG_DRIVER and
+ * BUS_CONFIG_ALL every logical drive is probed.  Any other operation is
+ * rejected.  When the driver-specific step succeeds the request is
+ * passed on to ndi_busop_bus_config().
+ *
+ * Returns an NDI_* status.
+ */
+static int
+drsas_tran_bus_config(dev_info_t *parent, uint_t flags,
+    ddi_bus_config_op_t op, void *arg, dev_info_t **childp)
+{
+	struct drsas_instance *instance;
+	int config;
+	int rval = NDI_FAILURE;
+
+	/*
+	 * drsas_parse_devname() leaves *lun untouched when the name has
+	 * no ",lun" part, so both values need a defined starting point.
+	 */
+	int tgt = 0;
+	int lun = 0;
+
+	con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op));
+
+	if ((instance = ddi_get_soft_state(drsas_state,
+	    ddi_get_instance(parent))) == NULL) {
+		return (NDI_FAILURE);
+	}
+
+	/* Hold nexus during bus_config */
+	ndi_devi_enter(parent, &config);
+	switch (op) {
+	case BUS_CONFIG_ONE: {
+
+		/* The name must carry a unit address. */
+		if (strchr((char *)arg, '@') == NULL) {
+			rval = NDI_FAILURE;
+			break;
+		}
+
+		if (drsas_parse_devname(arg, &tgt, &lun) != 0) {
+			rval = NDI_FAILURE;
+			break;
+		}
+
+		if (lun == 0) {
+			rval = drsas_config_ld(instance, tgt, lun, childp);
+		} else {
+			rval = NDI_FAILURE;
+		}
+
+		break;
+	}
+	case BUS_CONFIG_DRIVER:
+	case BUS_CONFIG_ALL: {
+
+		/* Best effort: individual probe failures are not fatal. */
+		(void) drsas_config_all_devices(instance);
+
+		rval = NDI_SUCCESS;
+		break;
+	}
+	default:
+		/* Operations this driver does not handle. */
+		rval = NDI_FAILURE;
+		break;
+	}
+
+	if (rval == NDI_SUCCESS) {
+		rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0);
+
+	}
+	ndi_devi_exit(parent, config);
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x",
+	    rval));
+	return (rval);
+}
+
+/*
+ * drsas_config_all_devices
+ *
+ * Try to configure LUN 0 of every target below MRDRV_MAX_LD.  Individual
+ * results are ignored; the function always returns NDI_SUCCESS.
+ */
+static int
+drsas_config_all_devices(struct drsas_instance *instance)
+{
+	int	ld = 0;
+
+	while (ld < MRDRV_MAX_LD) {
+		(void) drsas_config_ld(instance, ld, 0, NULL);
+		ld++;
+	}
+
+	return (NDI_SUCCESS);
+}
+
+/*
+ * drsas_parse_devname
+ *
+ * Parse a device name of the form "name@target[,lun][:minor]".  The
+ * target and LUN are hexadecimal.  *tgt is set when tgt is non-NULL;
+ * *lun is set only when lun is non-NULL and the name has a ",lun" part,
+ * and is otherwise left untouched.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when the name does not fit in the
+ * local SCSI_MAXNAMELEN buffer or a number cannot be converted.
+ */
+static int
+drsas_parse_devname(char *devnm, int *tgt, int *lun)
+{
+	char devbuf[SCSI_MAXNAMELEN];
+	char *addr;
+	char *p,  *tp, *lp;
+	long num;
+
+	/*
+	 * Work on a private copy because the string is split in place.
+	 * A name that would be truncated is rejected rather than
+	 * overrunning devbuf or being parsed in part.
+	 */
+	if (strlcpy(devbuf, devnm, sizeof (devbuf)) >= sizeof (devbuf)) {
+		return (DDI_FAILURE);
+	}
+
+	/* Parse dev name and address */
+	addr = "";
+	for (p = devbuf; *p != '\0'; p++) {
+		if (*p == '@') {
+			addr = p + 1;
+			*p = '\0';
+		} else if (*p == ':') {
+			*p = '\0';
+			break;
+		}
+	}
+
+	/* Parse target and lun */
+	for (p = tp = addr, lp = NULL; *p != '\0'; p++) {
+		if (*p == ',') {
+			lp = p + 1;
+			*p = '\0';
+			break;
+		}
+	}
+	if (tgt != NULL) {
+		if (ddi_strtol(tp, NULL, 0x10, &num)) {
+			return (DDI_FAILURE);
+		}
+		*tgt = (int)num;
+	}
+	if (lun != NULL && lp != NULL) {
+		if (ddi_strtol(lp, NULL, 0x10, &num)) {
+			return (DDI_FAILURE);
+		}
+		*lun = (int)num;
+	}
+	return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_config_ld
+ *
+ * Configure the logical drive at tgt/lun.  If a child node already
+ * exists it is returned through ldip (when ldip is non-NULL) and
+ * NDI_SUCCESS is returned.  Otherwise a temporary scsi_device is probed
+ * and, when the probe reports SCSIPROBE_EXISTS, the child is created by
+ * drsas_config_scsi_device().
+ *
+ * Returns NDI_SUCCESS, NDI_FAILURE when the probe finds no device, or
+ * the result of drsas_config_scsi_device().
+ */
+static int
+drsas_config_ld(struct drsas_instance *instance, uint16_t tgt,
+    uint8_t lun, dev_info_t **ldip)
+{
+	struct scsi_device *probe_sd;
+	dev_info_t *existing;
+	int status;
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d",
+	    tgt, lun));
+
+	/* Already configured: hand back the existing node. */
+	existing = drsas_find_child(instance, tgt, lun);
+	if (existing != NULL) {
+		if (ldip != NULL)
+			*ldip = existing;
+		con_log(CL_ANN1, (CE_NOTE,
+		    "drsas_config_ld: Child = %p found t = %d l = %d",
+		    (void *)existing, tgt, lun));
+		return (NDI_SUCCESS);
+	}
+
+	/* Temporary scsi_device used only for the probe. */
+	probe_sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP);
+	probe_sd->sd_address.a_hba_tran = instance->tran;
+	probe_sd->sd_address.a_target = (uint16_t)tgt;
+	probe_sd->sd_address.a_lun = (uint8_t)lun;
+
+	status = NDI_FAILURE;
+	if (scsi_hba_probe(probe_sd, NULL) == SCSIPROBE_EXISTS) {
+		status = drsas_config_scsi_device(instance, probe_sd, ldip);
+	}
+
+	/* sd_unprobe is blank now. Free the inquiry buffer manually */
+	if (probe_sd->sd_inq != NULL) {
+		kmem_free(probe_sd->sd_inq, SUN_INQSIZE);
+		probe_sd->sd_inq = (struct scsi_inquiry *)NULL;
+	}
+	kmem_free(probe_sd, sizeof (struct scsi_device));
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d",
+	    status));
+	return (status);
+}
+
+/*
+ * drsas_config_scsi_device
+ *
+ * Create and online a child devinfo node for a probed SCSI device.  The
+ * node name comes from scsi_hba_nodename_compatible_get() ("sd" is used
+ * for direct-access devices) and the node gets "target", "lun" and
+ * "compatible" properties before it is onlined.
+ *
+ * On any failure after the node has been allocated its properties are
+ * removed and the node is freed, so no half-built node is left behind.
+ * When dipp is non-NULL it receives the new node on success and NULL on
+ * failure.
+ *
+ * Returns NDI_SUCCESS or a failing NDI_* status.
+ */
+static int
+drsas_config_scsi_device(struct drsas_instance *instance,
+    struct scsi_device *sd, dev_info_t **dipp)
+{
+	char *nodename = NULL;
+	char **compatible = NULL;
+	int ncompatible = 0;
+	char *childname;
+	dev_info_t *ldip = NULL;
+	int tgt = sd->sd_address.a_target;
+	int lun = sd->sd_address.a_lun;
+	int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK;
+	int rval;
+
+	con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun));
+	scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype,
+	    NULL, &nodename, &compatible, &ncompatible);
+
+	if (nodename == NULL) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver "
+		    "for t%dL%d", tgt, lun));
+		rval = NDI_FAILURE;
+		goto finish;
+	}
+
+	childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename;
+	con_log(CL_ANN1, (CE_WARN,
+	    "dr_sas: Childname = %2s nodename = %s", childname, nodename));
+
+	/* Create a dev node */
+	rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip);
+	con_log(CL_ANN1, (CE_WARN,
+	    "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval));
+	if (rval != NDI_SUCCESS) {
+		/* No node was created; report none to the caller. */
+		ldip = NULL;
+		goto finish;
+	}
+
+	if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) !=
+	    DDI_PROP_SUCCESS) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+		    "property for t%dl%d target", tgt, lun));
+		rval = NDI_FAILURE;
+		goto free_node;
+	}
+	if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) !=
+	    DDI_PROP_SUCCESS) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+		    "property for t%dl%d lun", tgt, lun));
+		rval = NDI_FAILURE;
+		goto free_node;
+	}
+
+	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip,
+	    "compatible", compatible, ncompatible) !=
+	    DDI_PROP_SUCCESS) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+		    "property for t%dl%d compatible", tgt, lun));
+		rval = NDI_FAILURE;
+		goto free_node;
+	}
+
+	rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH);
+	if (rval != NDI_SUCCESS) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online "
+		    "t%dl%d", tgt, lun));
+		goto free_node;
+	}
+
+	con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :"
+	    "0 t%dl%d", tgt, lun));
+	goto finish;
+
+free_node:
+	/* Discard the node so the caller never sees a stale pointer. */
+	ndi_prop_remove_all(ldip);
+	(void) ndi_devi_free(ldip);
+	ldip = NULL;
+
+finish:
+	if (dipp) {
+		*dipp = ldip;
+	}
+
+	con_log(CL_DLEVEL1, (CE_WARN,
+	    "dr_sas: config_scsi_device rval = %d t%dL%d",
+	    rval, tgt, lun));
+	scsi_hba_nodename_compatible_free(nodename, compatible);
+	return (rval);
+}
+
+/*
+ * Queue an event for (tgt, lun) on instance->taskq; wwn is not used.
+ * Returns ENOMEM (no taskq/memory), DDI_FAILURE (dispatch) or DDI_SUCCESS.
+ */
+/*ARGSUSED*/
+static int
+drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event,
+    uint64_t wwn)
+{
+	struct drsas_eventinfo *req;
+
+	con_log(CL_ANN1, (CE_NOTE,
+	    "drsas_service_evt called for t%dl%d event = %d",
+	    tgt, lun, event));
+	req = (instance->taskq == NULL) ? NULL :
+	    kmem_zalloc(sizeof (*req), KM_NOSLEEP);
+	if (req == NULL)
+		return (ENOMEM);
+	req->instance = instance;
+	req->tgt = tgt;
+	req->lun = lun;
+	req->event = event;
+	if (ddi_taskq_dispatch(instance->taskq,
+	    (void (*)(void *))drsas_issue_evt_taskq, req,
+	    DDI_NOSLEEP) == DDI_SUCCESS)
+		return (DDI_SUCCESS);
+	con_log(CL_ANN1, (CE_NOTE,
+	    "dr_sas: Event task failed for t%dl%d event = %d",
+	    tgt, lun, event));
+	kmem_free(req, sizeof (*req));
+	return (DDI_FAILURE);
+}
+
+/*
+ * Taskq routine: configure or unconfigure the logical drive named by mrevt.
+ * Only LUN 0 of targets below MRDRV_MAX_LD is handled; mrevt is freed here.
+ */
+static void
+drsas_issue_evt_taskq(struct drsas_eventinfo *mrevt)
+{
+	struct drsas_instance *instance = mrevt->instance;
+	dev_info_t *dip, *pdip;
+	int circ1 = 0;
+	char *devname;
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for"
+	    " tgt %d lun %d event %d",
+	    mrevt->tgt, mrevt->lun, mrevt->event));
+
+	/* Unsupported target/LUN: nothing to do, but the event must be freed. */
+	if (mrevt->tgt < 0 || mrevt->tgt >= MRDRV_MAX_LD || mrevt->lun != 0) {
+		kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+		return;
+	}
+	dip = instance->dr_ld_list[mrevt->tgt].dip;
+
+	ndi_devi_enter(instance->dip, &circ1);
+	switch (mrevt->event) {
+	case DRSAS_EVT_CONFIG_TGT:
+		if (dip == NULL) {
+			/* lun was checked to be 0 above */
+			(void) drsas_config_ld(instance, mrevt->tgt, 0, NULL);
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_CONFIG_TGT called:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		} else {
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_CONFIG_TGT dip != NULL:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		}
+		break;
+	case DRSAS_EVT_UNCONFIG_TGT:
+		if (dip) {
+			if (i_ddi_devi_attached(dip)) {
+				pdip = ddi_get_parent(dip);
+				devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP);
+				(void) ddi_deviname(dip, devname);
+				(void) devfs_clean(pdip, devname + 1,
+				    DV_CLEAN_FORCE);
+				kmem_free(devname, MAXNAMELEN + 1);
+			}
+			(void) ndi_devi_offline(dip, NDI_DEVI_REMOVE);
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_UNCONFIG_TGT called:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		} else {
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_UNCONFIG_TGT dip == NULL:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		}
+		break;
+	default:
+		break;
+	}
+	kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+	ndi_devi_exit(instance->dip, circ1);
+}
+
+/*
+ * Emulate MODE SENSE pages 0x3 (format) and 0x4 (geometry) by filling the
+ * packet's data buffer; any other page leaves it zeroed.  Always returns 0.
+ */
+static int
+drsas_mode_sense_build(struct scsi_pkt *pkt)
+{
+	union scsi_cdb *cdbp = (void *)pkt->pkt_cdbp;
+	uint16_t page_code = cdbp->cdb_un.sg.scsi[0];
+	struct buf *bp = PKT2CMD(pkt)->cmd_buf;
+	size_t pgoff = MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH;
+	struct mode_header *modehdrp;
+
+	/* No buffer to map or fill: fail instead of dereferencing NULL. */
+	if (bp == NULL) {
+		con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command"));
+		/* ADD pkt statistics as Command failed. */
+		return (0);
+	}
+	bp_mapin(bp);
+	bzero(bp->b_un.b_addr, bp->b_bcount);
+	modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+
+	switch (page_code) {
+	case 0x3: {
+		struct mode_format *page3p;
+		/* never write past the b_bcount bytes zeroed above */
+		if (bp->b_bcount < pgoff + sizeof (struct mode_format))
+			break;
+		modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+		page3p = (void *)((caddr_t)modehdrp + pgoff);
+		page3p->mode_page.code = 0x3;
+		page3p->mode_page.length =
+		    (uchar_t)(sizeof (struct mode_format));
+		page3p->data_bytes_sect = 512;
+		page3p->sect_track = 63;
+		break;
+	}
+	case 0x4: {
+		struct mode_geometry *page4p;
+		if (bp->b_bcount < pgoff + sizeof (struct mode_geometry))
+			break;
+		modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+		page4p = (void *)((caddr_t)modehdrp + pgoff);
+		page4p->mode_page.code = 0x4;
+		page4p->mode_page.length =
+		    (uchar_t)(sizeof (struct mode_geometry));
+		page4p->heads = 255;
+		page4p->rpm = 10000;
+		break;
+	}
+	default:
+		break;
+	}
+	return (0);
+}
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf
new file mode 100644
index 0000000000..3792f43ca4
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2008-2009, LSI Logic Corporation.
+# All rights reserved.
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# dr_sas.conf for sol 10 (and later) for all supported architectures
+#
+# global definitions
+
+# MSI specific flag. Uncomment the following line (value "yes") to enable MSI.
+#drsas-enable-msi="yes";
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h
new file mode 100644
index 0000000000..8f78658edf
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.h
@@ -0,0 +1,1766 @@
+/*
+ * dr_sas.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_DR_SAS_H_
+#define	_DR_SAS_H_
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/scsi/scsi.h>
+#include "dr_sas_list.h"
+
+/*
+ * MegaRAID SAS2.0 Driver meta data
+ */
+#define	DRSAS_VERSION				"LSIv2.0"
+#define	DRSAS_RELDATE				"Jan 9, 2009"
+
+#define	DRSAS_TRUE				1
+#define	DRSAS_FALSE				0
+
+/*
+ * MegaRAID SAS2.0 device id conversion definitions.
+ */
+#define	INST2LSIRDCTL(x)		((x) << INST_MINOR_SHIFT)
+
+/*
+ * MegaRAID SAS2.0 supported controllers
+ */
+#define	PCI_DEVICE_ID_LSI_2108VDE		0x0078
+#define	PCI_DEVICE_ID_LSI_2108V			0x0079
+
+/*
+ * Register Index for 2108 Controllers.
+ */
+#define	REGISTER_SET_IO_2108			(2)
+
+#define	DRSAS_MAX_SGE_CNT			0x50
+
+#define	DRSAS_IOCTL_DRIVER			0x12341234
+#define	DRSAS_IOCTL_FIRMWARE			0x12345678
+#define	DRSAS_IOCTL_AEN				0x87654321
+
+#define	DRSAS_1_SECOND				1000000
+
+/* Dynamic Enumeration Flags */
+#define	DRSAS_PD_LUN		1
+#define	DRSAS_LD_LUN		0
+#define	DRSAS_PD_TGT_MAX	255
+#define	DRSAS_GET_PD_MAX(s)	((s)->dr_pd_max)
+#define	WWN_STRLEN		17
+
+/*
+ * =====================================
+ * MegaRAID SAS2.0 MFI firmware definitions
+ * =====================================
+ */
+/*
+ * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for
+ * the protocol between the software and firmware. Commands are issued using
+ * "message frames".
+ */
+
+/*
+ * FW posts its state in upper 4 bits of outbound_msg_0 register
+ */
+#define	MFI_STATE_SHIFT 			28
+#define	MFI_STATE_MASK				((uint32_t)0xF<<MFI_STATE_SHIFT)
+#define	MFI_STATE_UNDEFINED			((uint32_t)0x0<<MFI_STATE_SHIFT)
+#define	MFI_STATE_BB_INIT			((uint32_t)0x1<<MFI_STATE_SHIFT)
+#define	MFI_STATE_FW_INIT			((uint32_t)0x4<<MFI_STATE_SHIFT)
+#define	MFI_STATE_WAIT_HANDSHAKE		((uint32_t)0x6<<MFI_STATE_SHIFT)
+#define	MFI_STATE_FW_INIT_2			((uint32_t)0x7<<MFI_STATE_SHIFT)
+#define	MFI_STATE_DEVICE_SCAN			((uint32_t)0x8<<MFI_STATE_SHIFT)
+#define	MFI_STATE_BOOT_MESSAGE_PENDING		((uint32_t)0x9<<MFI_STATE_SHIFT)
+#define	MFI_STATE_FLUSH_CACHE			((uint32_t)0xA<<MFI_STATE_SHIFT)
+#define	MFI_STATE_READY				((uint32_t)0xB<<MFI_STATE_SHIFT)
+#define	MFI_STATE_OPERATIONAL			((uint32_t)0xC<<MFI_STATE_SHIFT)
+#define	MFI_STATE_FAULT				((uint32_t)0xF<<MFI_STATE_SHIFT)
+
+#define	MRMFI_FRAME_SIZE			64
+
+/*
+ * During FW init, clear pending cmds & reset state using inbound_msg_0
+ * ABORT	: Abort all pending cmds
+ * READY	: Move from OPERATIONAL to READY state; discard queue info
+ * MFIMODE	: Discard (possible) low MFA posted in 64-bit mode (??)
+ * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver
+ * RESET_FLAGS	: READY, MFIMODE and ABORT combined into one mask
+ */
+#define	MFI_INIT_ABORT				0x00000001
+#define	MFI_INIT_READY				0x00000002
+#define	MFI_INIT_MFIMODE			0x00000004
+#define	MFI_INIT_CLEAR_HANDSHAKE		0x00000008
+#define	MFI_INIT_HOTPLUG			0x00000010
+#define	MFI_STOP_ADP				0x00000020
+#define	MFI_RESET_FLAGS	(MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT)
+
+/*
+ * MFI frame flags
+ */
+#define	MFI_FRAME_POST_IN_REPLY_QUEUE		0x0000
+#define	MFI_FRAME_DONT_POST_IN_REPLY_QUEUE	0x0001
+#define	MFI_FRAME_SGL32				0x0000
+#define	MFI_FRAME_SGL64				0x0002
+#define	MFI_FRAME_SENSE32			0x0000
+#define	MFI_FRAME_SENSE64			0x0004
+#define	MFI_FRAME_DIR_NONE			0x0000
+#define	MFI_FRAME_DIR_WRITE			0x0008
+#define	MFI_FRAME_DIR_READ			0x0010
+#define	MFI_FRAME_DIR_BOTH			0x0018
+
+/*
+ * Definition for cmd_status
+ */
+#define	MFI_CMD_STATUS_POLL_MODE		0xFF
+#define	MFI_CMD_STATUS_SYNC_MODE		0xFF
+
+/*
+ * MFI command opcodes
+ */
+#define	MFI_CMD_OP_INIT				0x00
+#define	MFI_CMD_OP_LD_READ			0x01
+#define	MFI_CMD_OP_LD_WRITE			0x02
+#define	MFI_CMD_OP_LD_SCSI			0x03
+#define	MFI_CMD_OP_PD_SCSI			0x04
+#define	MFI_CMD_OP_DCMD				0x05
+#define	MFI_CMD_OP_ABORT			0x06
+#define	MFI_CMD_OP_SMP				0x07
+#define	MFI_CMD_OP_STP				0x08
+
+#define	DR_DCMD_CTRL_GET_INFO			0x01010000
+
+#define	DR_DCMD_CTRL_CACHE_FLUSH		0x01101000
+#define	DR_FLUSH_CTRL_CACHE			0x01
+#define	DR_FLUSH_DISK_CACHE			0x02
+
+#define	DR_DCMD_CTRL_SHUTDOWN			0x01050000
+#define	DRSAS_ENABLE_DRIVE_SPINDOWN		0x01
+
+#define	DR_DCMD_CTRL_EVENT_GET_INFO		0x01040100
+#define	DR_DCMD_CTRL_EVENT_GET			0x01040300
+#define	DR_DCMD_CTRL_EVENT_WAIT			0x01040500
+#define	DR_DCMD_LD_GET_PROPERTIES		0x03030000
+#define	DR_DCMD_PD_GET_INFO			0x02020000
+
+/*
+ * Solaris Specific MAX values
+ */
+#define	MAX_SGL					24
+/*
+ * MFI command completion codes
+ */
+enum MFI_STAT {
+	MFI_STAT_OK				= 0x00,
+	MFI_STAT_INVALID_CMD			= 0x01,
+	MFI_STAT_INVALID_DCMD			= 0x02,
+	MFI_STAT_INVALID_PARAMETER		= 0x03,
+	MFI_STAT_INVALID_SEQUENCE_NUMBER	= 0x04,
+	MFI_STAT_ABORT_NOT_POSSIBLE		= 0x05,
+	MFI_STAT_APP_HOST_CODE_NOT_FOUND	= 0x06,
+	MFI_STAT_APP_IN_USE			= 0x07,
+	MFI_STAT_APP_NOT_INITIALIZED		= 0x08,
+	MFI_STAT_ARRAY_INDEX_INVALID		= 0x09,
+	MFI_STAT_ARRAY_ROW_NOT_EMPTY		= 0x0a,
+	MFI_STAT_CONFIG_RESOURCE_CONFLICT	= 0x0b,
+	MFI_STAT_DEVICE_NOT_FOUND		= 0x0c,
+	MFI_STAT_DRIVE_TOO_SMALL		= 0x0d,
+	MFI_STAT_FLASH_ALLOC_FAIL		= 0x0e,
+	MFI_STAT_FLASH_BUSY			= 0x0f,
+	MFI_STAT_FLASH_ERROR			= 0x10,
+	MFI_STAT_FLASH_IMAGE_BAD		= 0x11,
+	MFI_STAT_FLASH_IMAGE_INCOMPLETE		= 0x12,
+	MFI_STAT_FLASH_NOT_OPEN			= 0x13,
+	MFI_STAT_FLASH_NOT_STARTED		= 0x14,
+	MFI_STAT_FLUSH_FAILED			= 0x15,
+	MFI_STAT_HOST_CODE_NOT_FOUNT		= 0x16,
+	MFI_STAT_LD_CC_IN_PROGRESS		= 0x17,
+	MFI_STAT_LD_INIT_IN_PROGRESS		= 0x18,
+	MFI_STAT_LD_LBA_OUT_OF_RANGE		= 0x19,
+	MFI_STAT_LD_MAX_CONFIGURED		= 0x1a,
+	MFI_STAT_LD_NOT_OPTIMAL			= 0x1b,
+	MFI_STAT_LD_RBLD_IN_PROGRESS		= 0x1c,
+	MFI_STAT_LD_RECON_IN_PROGRESS		= 0x1d,
+	MFI_STAT_LD_WRONG_RAID_LEVEL		= 0x1e,
+	MFI_STAT_MAX_SPARES_EXCEEDED		= 0x1f,
+	MFI_STAT_MEMORY_NOT_AVAILABLE		= 0x20,
+	MFI_STAT_MFC_HW_ERROR			= 0x21,
+	MFI_STAT_NO_HW_PRESENT			= 0x22,
+	MFI_STAT_NOT_FOUND			= 0x23,
+	MFI_STAT_NOT_IN_ENCL			= 0x24,
+	MFI_STAT_PD_CLEAR_IN_PROGRESS		= 0x25,
+	MFI_STAT_PD_TYPE_WRONG			= 0x26,
+	MFI_STAT_PR_DISABLED			= 0x27,
+	MFI_STAT_ROW_INDEX_INVALID		= 0x28,
+	MFI_STAT_SAS_CONFIG_INVALID_ACTION	= 0x29,
+	MFI_STAT_SAS_CONFIG_INVALID_DATA	= 0x2a,
+	MFI_STAT_SAS_CONFIG_INVALID_PAGE	= 0x2b,
+	MFI_STAT_SAS_CONFIG_INVALID_TYPE	= 0x2c,
+	MFI_STAT_SCSI_DONE_WITH_ERROR		= 0x2d,
+	MFI_STAT_SCSI_IO_FAILED			= 0x2e,
+	MFI_STAT_SCSI_RESERVATION_CONFLICT	= 0x2f,
+	MFI_STAT_SHUTDOWN_FAILED		= 0x30,
+	MFI_STAT_TIME_NOT_SET			= 0x31,
+	MFI_STAT_WRONG_STATE			= 0x32,
+	MFI_STAT_LD_OFFLINE			= 0x33,
+	/* UNUSED: 0x34 to 0xfe */
+	MFI_STAT_INVALID_STATUS			= 0xFF
+};
+
+enum DR_EVT_CLASS {
+	DR_EVT_CLASS_DEBUG		= -2,
+	DR_EVT_CLASS_PROGRESS		= -1,
+	DR_EVT_CLASS_INFO		=  0,
+	DR_EVT_CLASS_WARNING		=  1,
+	DR_EVT_CLASS_CRITICAL		=  2,
+	DR_EVT_CLASS_FATAL		=  3,
+	DR_EVT_CLASS_DEAD		=  4
+};
+
+enum DR_EVT_LOCALE {
+	DR_EVT_LOCALE_LD		= 0x0001,
+	DR_EVT_LOCALE_PD		= 0x0002,
+	DR_EVT_LOCALE_ENCL		= 0x0004,
+	DR_EVT_LOCALE_BBU		= 0x0008,
+	DR_EVT_LOCALE_SAS		= 0x0010,
+	DR_EVT_LOCALE_CTRL		= 0x0020,
+	DR_EVT_LOCALE_CONFIG		= 0x0040,
+	DR_EVT_LOCALE_CLUSTER		= 0x0080,
+	DR_EVT_LOCALE_ALL		= 0xffff
+};
+
+#define	DR_EVT_CFG_CLEARED		0x0004
+#define	DR_EVT_LD_CREATED		0x008a
+#define	DR_EVT_LD_DELETED		0x008b
+#define	DR_EVT_PD_REMOVED_EXT		0x00f8
+#define	DR_EVT_PD_INSERTED_EXT		0x00f7
+
+enum LD_STATE {
+	LD_OFFLINE		= 0,
+	LD_PARTIALLY_DEGRADED	= 1,
+	LD_DEGRADED		= 2,
+	LD_OPTIMAL		= 3,
+	LD_INVALID		= 0xFF
+};
+
+enum DRSAS_EVT {
+	DRSAS_EVT_CONFIG_TGT	= 0,	/* taskq calls drsas_config_ld() */
+	DRSAS_EVT_UNCONFIG_TGT	= 1,	/* taskq offlines the target's dip */
+	DRSAS_EVT_UNCONFIG_SMP	= 2	/* no case in drsas_issue_evt_taskq */
+};
+
+#define	DMA_OBJ_ALLOCATED	1
+#define	DMA_OBJ_REALLOCATED	2
+#define	DMA_OBJ_FREED		3
+
+/*
+ * dma_obj_t	- Our DMA object
+ * @param buffer	: kernel virtual address
+ * @param size		: size of the data to be allocated
+ * @param acc_handle	: access handle
+ * @param dma_handle	: dma handle
+ * @param dma_cookie	: scatter-gather list
+ * @param dma_attr	: dma attributes for this buffer
+ * Our DMA object. The caller must initialize the size and dma attributes
+ * (dma_attr) fields before allocating the resources.
+ */
+typedef struct {
+	caddr_t			buffer;
+	uint32_t		size;
+	ddi_acc_handle_t	acc_handle;
+	ddi_dma_handle_t	dma_handle;
+	ddi_dma_cookie_t	dma_cookie[DRSAS_MAX_SGE_CNT];
+	ddi_dma_attr_t		dma_attr;
+	uint8_t			status;
+	uint8_t			reserved[3];
+} dma_obj_t;
+
+struct drsas_eventinfo {
+	struct drsas_instance	*instance;	/* controller instance */
+	int 			tgt;		/* target number */
+	int 			lun;		/* logical unit number */
+	int 			event;		/* enum DRSAS_EVT value */
+};
+
+struct drsas_ld {
+	dev_info_t		*dip;		/* devinfo node, NULL if none */
+	uint8_t 		lun_type;
+	uint8_t 		reserved[3];
+};
+
+struct drsas_pd {
+	dev_info_t		*dip;
+	uint8_t 		lun_type;
+	uint8_t 		dev_id;
+	uint8_t 		flags;
+	uint8_t 		reserved;
+};
+
+struct drsas_pd_info {
+	uint16_t	deviceId;
+	uint16_t	seqNum;
+	uint8_t		inquiryData[96];
+	uint8_t		vpdPage83[64];
+	uint8_t		notSupported;
+	uint8_t		scsiDevType;
+	uint8_t		a;
+	uint8_t		device_speed;
+	uint32_t	mediaerrcnt;
+	uint32_t	other;
+	uint32_t	pred;
+	uint32_t	lastpred;
+	uint16_t	fwState;
+	uint8_t		disabled;
+	uint8_t		linkspwwd;
+	uint32_t	ddfType;
+	struct {
+		uint8_t	count;
+		uint8_t	isPathBroken;
+		uint8_t	connectorIndex[2];
+		uint8_t	reserved[4];
+		uint64_t sasAddr[2];
+		uint8_t	reserved2[16];
+	} pathInfo;
+};
+
+typedef struct drsas_instance {
+	uint32_t	*producer;
+	uint32_t	*consumer;
+
+	uint32_t	*reply_queue;
+	dma_obj_t	mfi_internal_dma_obj;
+
+	uint8_t		init_id;
+	uint8_t		reserved[3];
+
+	uint16_t	max_num_sge;
+	uint16_t	max_fw_cmds;
+	uint32_t	max_sectors_per_req;
+
+	struct drsas_cmd **cmd_list;
+
+	mlist_t		cmd_pool_list;
+	kmutex_t	cmd_pool_mtx;
+
+	mlist_t		cmd_pend_list;
+	kmutex_t	cmd_pend_mtx;
+
+	dma_obj_t	mfi_evt_detail_obj;
+	struct drsas_cmd *aen_cmd;
+
+	uint32_t	aen_seq_num;
+	uint32_t	aen_class_locale_word;
+
+	scsi_hba_tran_t		*tran;
+
+	kcondvar_t	int_cmd_cv;
+	kmutex_t	int_cmd_mtx;
+
+	kcondvar_t	aen_cmd_cv;
+	kmutex_t	aen_cmd_mtx;
+
+	kcondvar_t	abort_cmd_cv;
+	kmutex_t	abort_cmd_mtx;
+
+	dev_info_t		*dip;
+	ddi_acc_handle_t	pci_handle;
+
+	timeout_id_t	timeout_id;
+	uint32_t	unique_id;
+	uint16_t	fw_outstanding;
+	caddr_t		regmap;
+	ddi_acc_handle_t	regmap_handle;
+	uint8_t		isr_level;
+	ddi_iblock_cookie_t	iblock_cookie;
+	ddi_iblock_cookie_t	soft_iblock_cookie;
+	ddi_softintr_t		soft_intr_id;
+	uint8_t		softint_running;
+	kmutex_t	completed_pool_mtx;
+	mlist_t		completed_pool_list;
+
+	caddr_t		internal_buf;
+	uint32_t	internal_buf_dmac_add;
+	uint32_t	internal_buf_size;
+
+	uint16_t	vendor_id;
+	uint16_t	device_id;
+	uint16_t	subsysvid;
+	uint16_t	subsysid;
+	int		instance;
+	int		baseaddress;
+	char		iocnode[16];
+
+	int		fm_capabilities;
+
+	struct drsas_func_ptr *func_ptr;
+	/* MSI interrupts specific */
+	ddi_intr_handle_t *intr_htable;
+	int		intr_type;
+	int		intr_cnt;
+	size_t		intr_size;
+	uint_t		intr_pri;
+	int		intr_cap;
+
+	ddi_taskq_t	*taskq;
+	struct drsas_ld	*dr_ld_list;
+} drsas_t;
+
+struct drsas_func_ptr {
+	int (*read_fw_status_reg)(struct drsas_instance *);
+	void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *);
+	int (*issue_cmd_in_sync_mode)(struct drsas_instance *,
+	    struct drsas_cmd *);
+	int (*issue_cmd_in_poll_mode)(struct drsas_instance *,
+	    struct drsas_cmd *);
+	void (*enable_intr)(struct drsas_instance *);
+	void (*disable_intr)(struct drsas_instance *);
+	int (*intr_ack)(struct drsas_instance *);
+};
+
+/*
+ * ### Helper routines ###
+ */
+
+/*
+ * con_log() - console log routine
+ * @param level		: debug level (CL_*) compared with debug_level_g.
+ * @param fmt		: parenthesized cmn_err() argument list
+ *
+ * con_log displays the error messages on the console based on the current
+ * debug level. Also it attaches the appropriate kernel severity level with
+ * the message.
+ *
+ *
+ * console messages debug levels
+ */
+#define	CL_NONE		0	/* No debug information */
+#define	CL_ANN		1	/* print unconditionally, announcements */
+#define	CL_ANN1		2	/* No o/p  */
+#define	CL_DLEVEL1	3	/* debug level 1, informative */
+#define	CL_DLEVEL2	4	/* debug level 2, verbose */
+#define	CL_DLEVEL3	5	/* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define	__func__ ""
+#endif
+
+#define	con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
+
+/*
+ * ### SCSA definitions ###
+ */
+#define	PKT2TGT(pkt)	((pkt)->pkt_address.a_target)
+#define	PKT2LUN(pkt)	((pkt)->pkt_address.a_lun)
+#define	PKT2TRAN(pkt)	((pkt)->pkt_address.a_hba_tran)
+#define	ADDR2TRAN(ap)	((ap)->a_hba_tran)
+
+/* tran_hba_private holds the struct drsas_instance pointer */
+#define	TRAN2MR(tran)	((struct drsas_instance *)(tran)->tran_hba_private)
+#define	ADDR2MR(ap)	(TRAN2MR(ADDR2TRAN(ap)))
+/* pkt_ha_private holds the per-command struct scsa_cmd */
+#define	PKT2CMD(pkt)	((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define	CMD2PKT(sp)	((sp)->cmd_pkt)
+#define	PKT2REQ(pkt)	(&(PKT2CMD(pkt)->request))
+/* NOTE(review): struct scsa_cmd declares no 'request' member for PKT2REQ */
+#define	CMD2ADDR(cmd)	(&CMD2PKT(cmd)->pkt_address)
+#define	CMD2TRAN(cmd)	(CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define	CMD2MR(cmd)	(TRAN2MR(CMD2TRAN(cmd)))
+
+#define	CFLAG_DMAVALID		0x0001	/* requires a dma operation */
+#define	CFLAG_DMASEND		0x0002	/* Transfer from the device */
+#define	CFLAG_CONSISTENT	0x0040	/* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl interface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define	UIOC_RD		0x00001
+#define	UIOC_WR		0x00002
+
+#define	SCP2HOST(scp)		(scp)->device->host	/* to host */
+#define	SCP2HOSTDATA(scp)	SCP2HOST(scp)->hostdata	/* to soft state */
+#define	SCP2CHANNEL(scp)	(scp)->device->channel	/* to channel */
+#define	SCP2TARGET(scp)		(scp)->device->id	/* to target */
+#define	SCP2LUN(scp)		(scp)->device->lun	/* to LUN */
+
+#define	SCSIHOST2ADAP(host)	(((caddr_t *)(host->hostdata))[0])
+#define	SCP2ADAPTER(scp)				\
+	(struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+/* Logical-drive tests evaluate to 1 or 0; 'instance' is not used. */
+#define	MRDRV_IS_LOGICAL_SCSA(instance, acmd)		\
+	(((acmd)->device_id < MRDRV_MAX_LD) ? 1 : 0)
+#define	MRDRV_IS_LOGICAL(ap)				\
+	((((ap)->a_target < MRDRV_MAX_LD) && ((ap)->a_lun == 0)) ? 1 : 0)
+#define	MAP_DEVICE_ID(instance, ap)	((ap)->a_target)
+
+#define	HIGH_LEVEL_INTR			1
+#define	NORMAL_LEVEL_INTR		0
+
+/*
+ * scsa_cmd  - Per-command mr private data
+ * @param cmd_dmahandle		:  dma handle
+ * @param cmd_dmacookies	:  current dma cookies
+ * @param cmd_pkt		:  scsi_pkt reference
+ * @param cmd_dmacount		:  dma count
+ * @param cmd_cookie		:  next cookie
+ * @param cmd_ncookies		:  cookies per window
+ * @param cmd_cookiecnt		:  cookies per sub-win
+ * @param cmd_nwin		:  number of dma windows
+ * @param cmd_curwin		:  current dma window
+ * @param cmd_dma_offset	:  current window offset
+ * @param cmd_dma_len		:  current window length
+ * @param cmd_flags		:  private flags
+ * @param cmd_cdblen		:  length of cdb
+ * @param cmd_scblen		:  length of scb
+ * @param cmd_buf		:  command buffer
+ * @param channel		:  channel for scsi sub-system
+ * @param target		:  target for scsi sub-system
+ * @param lun			:  LUN for scsi sub-system
+ *
+ * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E)
+ * - Pointed to by pkt_ha_private field in scsi_pkt
+ */
+struct scsa_cmd {
+	ddi_dma_handle_t	cmd_dmahandle;
+	ddi_dma_cookie_t	cmd_dmacookies[DRSAS_MAX_SGE_CNT];
+	struct scsi_pkt		*cmd_pkt;
+	ulong_t			cmd_dmacount;
+	uint_t			cmd_cookie;
+	uint_t			cmd_ncookies;
+	uint_t			cmd_cookiecnt;
+	uint_t			cmd_nwin;
+	uint_t			cmd_curwin;
+	off_t			cmd_dma_offset;
+	ulong_t			cmd_dma_len;
+	ulong_t			cmd_flags;
+	uint_t			cmd_cdblen;
+	uint_t			cmd_scblen;
+	struct buf		*cmd_buf;
+	ushort_t		device_id;
+	uchar_t			islogical;
+	uchar_t			lun;
+	struct drsas_device	*drsas_dev;
+};
+
+
+struct drsas_cmd {
+	union drsas_frame	*frame;
+	uint32_t		frame_phys_addr;
+	uint8_t			*sense;
+	uint32_t		sense_phys_addr;
+	dma_obj_t		frame_dma_obj;
+	uint8_t			frame_dma_obj_status;
+
+	uint32_t		index;
+	uint8_t			sync_cmd;
+	uint8_t			cmd_status;
+	uint16_t		abort_aen;
+	mlist_t			list;
+	uint32_t		frame_count;
+	struct scsa_cmd		*cmd;
+	struct scsi_pkt		*pkt;
+};
+
+#define	MAX_MGMT_ADAPTERS			1024
+#define	IOC_SIGNATURE				"MR-SAS"
+
+#define	IOC_CMD_FIRMWARE			0x0
+#define	DRSAS_DRIVER_IOCTL_COMMON		0xF0010000
+#define	DRSAS_DRIVER_IOCTL_DRIVER_VERSION	0xF0010100
+#define	DRSAS_DRIVER_IOCTL_PCI_INFORMATION	0xF0010200
+#define	DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS	0xF0010300
+
+
+#define	DRSAS_MAX_SENSE_LENGTH			32
+
+struct drsas_mgmt_info {
+
+	uint16_t			count;
+	struct drsas_instance		*instance[MAX_MGMT_ADAPTERS];
+	uint16_t			map[MAX_MGMT_ADAPTERS];
+	int				max_index;
+};
+
+#pragma pack(1)
+
+/*
+ * SAS controller properties
+ */
+struct drsas_ctrl_prop {
+	uint16_t	seq_num;
+	uint16_t	pred_fail_poll_interval;
+	uint16_t	intr_throttle_count;
+	uint16_t	intr_throttle_timeouts;
+
+	uint8_t		rebuild_rate;
+	uint8_t		patrol_read_rate;
+	uint8_t		bgi_rate;
+	uint8_t		cc_rate;
+	uint8_t		recon_rate;
+
+	uint8_t		cache_flush_interval;
+
+	uint8_t		spinup_drv_count;
+	uint8_t		spinup_delay;
+
+	uint8_t		cluster_enable;
+	uint8_t		coercion_mode;
+	uint8_t		disk_write_cache_disable;
+	uint8_t		alarm_enable;
+
+	uint8_t		reserved[44];
+};
+
+/*
+ * SAS controller information
+ */
+struct drsas_ctrl_info {
+	/* PCI device information */
+	struct {
+		uint16_t	vendor_id;
+		uint16_t	device_id;
+		uint16_t	sub_vendor_id;
+		uint16_t	sub_device_id;
+		uint8_t	reserved[24];
+	} pci;
+
+	/* Host interface information */
+	struct {
+		uint8_t	PCIX		: 1;
+		uint8_t	PCIE		: 1;
+		uint8_t	iSCSI		: 1;
+		uint8_t	SAS_3G		: 1;
+		uint8_t	reserved_0	: 4;
+		uint8_t	reserved_1[6];
+		uint8_t	port_count;
+		uint64_t	port_addr[8];
+	} host_interface;
+
+	/* Device (backend) interface information */
+	struct {
+		uint8_t	SPI		: 1;
+		uint8_t	SAS_3G		: 1;
+		uint8_t	SATA_1_5G	: 1;
+		uint8_t	SATA_3G		: 1;
+		uint8_t	reserved_0	: 4;
+		uint8_t	reserved_1[6];
+		uint8_t	port_count;
+		uint64_t	port_addr[8];
+	} device_interface;
+
+	/* List of components residing in flash. All str are null terminated */
+	uint32_t	image_check_word;
+	uint32_t	image_component_count;
+
+	struct {
+		char	name[8];
+		char	version[32];
+		char	build_date[16];
+		char	built_time[16];
+	} image_component[8];
+
+	/*
+	 * List of flash components that have been flashed on the card, but
+	 * are not in use, pending reset of the adapter. This list will be
+	 * empty if a flash operation has not occurred. All strings are null
+	 * terminated
+	 */
+	uint32_t	pending_image_component_count;
+
+	struct {
+		char	name[8];
+		char	version[32];
+		char	build_date[16];
+		char	build_time[16];
+	} pending_image_component[8];
+
+	uint8_t		max_arms;
+	uint8_t		max_spans;
+	uint8_t		max_arrays;
+	uint8_t		max_lds;
+
+	char		product_name[80];
+	char		serial_no[32];
+
+	/*
+	 * Other physical/controller/operation information. Indicates the
+	 * presence of the hardware
+	 */
+	struct {
+		uint32_t	bbu		: 1;
+		uint32_t	alarm		: 1;
+		uint32_t	nvram		: 1;
+		uint32_t	uart		: 1;
+		uint32_t	reserved	: 28;
+	} hw_present;
+
+	uint32_t	current_fw_time;
+
+	/* Maximum data transfer sizes */
+	uint16_t		max_concurrent_cmds;
+	uint16_t		max_sge_count;
+	uint32_t		max_request_size;
+
+	/* Logical and physical device counts */
+	uint16_t		ld_present_count;
+	uint16_t		ld_degraded_count;
+	uint16_t		ld_offline_count;
+
+	uint16_t		pd_present_count;
+	uint16_t		pd_disk_present_count;
+	uint16_t		pd_disk_pred_failure_count;
+	uint16_t		pd_disk_failed_count;
+
+	/* Memory size information */
+	uint16_t		nvram_size;
+	uint16_t		memory_size;
+	uint16_t		flash_size;
+
+	/* Error counters */
+	uint16_t		mem_correctable_error_count;
+	uint16_t		mem_uncorrectable_error_count;
+
+	/* Cluster information */
+	uint8_t		cluster_permitted;
+	uint8_t		cluster_active;
+	uint8_t		reserved_1[2];
+
+	/* Controller capabilities structures */
+	struct {
+		uint32_t	raid_level_0	: 1;
+		uint32_t	raid_level_1	: 1;
+		uint32_t	raid_level_5	: 1;
+		uint32_t	raid_level_1E	: 1;
+		uint32_t	reserved	: 28;
+	} raid_levels;
+
+	struct {
+		uint32_t	rbld_rate		: 1;
+		uint32_t	cc_rate			: 1;
+		uint32_t	bgi_rate		: 1;
+		uint32_t	recon_rate		: 1;
+		uint32_t	patrol_rate		: 1;
+		uint32_t	alarm_control		: 1;
+		uint32_t	cluster_supported	: 1;
+		uint32_t	bbu			: 1;
+		uint32_t	spanning_allowed	: 1;
+		uint32_t	dedicated_hotspares	: 1;
+		uint32_t	revertible_hotspares	: 1;
+		uint32_t	foreign_config_import	: 1;
+		uint32_t	self_diagnostic		: 1;
+		uint32_t	reserved		: 19;
+	} adapter_operations;
+
+	struct {
+		uint32_t	read_policy	: 1;
+		uint32_t	write_policy	: 1;
+		uint32_t	io_policy	: 1;
+		uint32_t	access_policy	: 1;
+		uint32_t	reserved	: 28;
+	} ld_operations;
+
+	struct {
+		uint8_t	min;
+		uint8_t	max;
+		uint8_t	reserved[2];
+	} stripe_size_operations;
+
+	struct {
+		uint32_t	force_online	: 1;
+		uint32_t	force_offline	: 1;
+		uint32_t	force_rebuild	: 1;
+		uint32_t	reserved	: 29;
+	} pd_operations;
+
+	struct {
+		uint32_t	ctrl_supports_sas	: 1;
+		uint32_t	ctrl_supports_sata	: 1;
+		uint32_t	allow_mix_in_encl	: 1;
+		uint32_t	allow_mix_in_ld		: 1;
+		uint32_t	allow_sata_in_cluster	: 1;
+		uint32_t	reserved		: 27;
+	} pd_mix_support;
+
+	/* Include the controller properties (changeable items) */
+	uint8_t				reserved_2[12];
+	struct drsas_ctrl_prop		properties;
+
+	uint8_t				pad[0x800 - 0x640];
+};
+
+/*
+ * ==================================
+ * MegaRAID SAS2.0 driver definitions
+ * ==================================
+ */
+#define	MRDRV_MAX_NUM_CMD			1024
+
+#define	MRDRV_MAX_PD_CHANNELS			2
+#define	MRDRV_MAX_LD_CHANNELS			2
+#define	MRDRV_MAX_CHANNELS			(MRDRV_MAX_PD_CHANNELS + \
+						MRDRV_MAX_LD_CHANNELS)
+#define	MRDRV_MAX_DEV_PER_CHANNEL		128
+#define	MRDRV_DEFAULT_INIT_ID			-1
+#define	MRDRV_MAX_CMD_PER_LUN			1000
+#define	MRDRV_MAX_LUN				1
+#define	MRDRV_MAX_LD				64
+
+#define	MRDRV_RESET_WAIT_TIME			300
+#define	MRDRV_RESET_NOTICE_INTERVAL		5
+
+#define	DRSAS_IOCTL_CMD				0
+
+/*
+ * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit
+ * SGLs based on the size of dma_addr_t
+ */
+#define	IS_DMA64		(sizeof (dma_addr_t) == 8)
+
+#define	IB_MSG_0_OFF			0x10	/* XScale */
+#define	OB_MSG_0_OFF			0x18	/* XScale */
+#define	IB_DOORBELL_OFF			0x20	/* XScale & ROC */
+#define	OB_INTR_STATUS_OFF		0x30	/* XScale & ROC */
+#define	OB_INTR_MASK_OFF		0x34	/* XScale & ROC */
+#define	IB_QPORT_OFF			0x40	/* XScale & ROC */
+#define	OB_DOORBELL_CLEAR_OFF		0xA0	/* ROC */
+#define	OB_SCRATCH_PAD_0_OFF		0xB0	/* ROC */
+#define	OB_INTR_MASK			0xFFFFFFFF
+#define	OB_DOORBELL_CLEAR_MASK		0xFFFFFFFF
+
+/*
+ * All MFI register access macros take a struct drsas_instance *
+ */
+#define	WR_IB_MSG_0(v, instance) 	ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v))
+
+#define	RD_OB_MSG_0(instance) 		ddi_get32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF))
+
+#define	WR_IB_DOORBELL(v, instance)	ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v))
+
+#define	RD_IB_DOORBELL(instance)	ddi_get32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF))
+
+#define	WR_OB_INTR_STATUS(v, instance) 	ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v))
+
+#define	RD_OB_INTR_STATUS(instance) 	ddi_get32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF))
+
+#define	WR_OB_INTR_MASK(v, instance) 	ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v))
+
+#define	RD_OB_INTR_MASK(instance) 	ddi_get32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF))
+
+#define	WR_IB_QPORT(v, instance) 	ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v))
+
+#define	WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \
+	(v))
+
+#define	RD_OB_SCRATCH_PAD_0(instance) 	ddi_get32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF))
+
+/*
+ * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data
+ * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs
+ * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled.
+ */
+#define	MFI_OB_INTR_STATUS_MASK		0x00000002
+
+/*
+ * The MFI_REPLY_2108_MESSAGE_INTR flag is also used in
+ * enable_intr_ppc. Hence 0x4 is set along with 0x1 in
+ * MFI_REPLY_2108_MESSAGE_INTR_MASK.
+ */
+#define	MFI_REPLY_2108_MESSAGE_INTR		0x00000001
+#define	MFI_REPLY_2108_MESSAGE_INTR_MASK	0x00000005
+
+#define	MFI_POLL_TIMEOUT_SECS		60
+
+#define	MFI_ENABLE_INTR(instance)  ddi_put32((instance)->regmap_handle, \
+	(uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1)
+/* Read-modify-write that clears bit 0 of the outbound interrupt mask. */
+#define	MFI_DISABLE_INTR(instance)					\
+{									\
+	uint32_t mask =  ddi_get32((instance)->regmap_handle,		\
+	    (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\
+	mask &= ~(uint32_t)1;						\
+	ddi_put32((instance)->regmap_handle, (uint32_t *)		\
+	    ((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), mask);	\
+}
+
+/* By default, the firmware programs for 8 Kbytes of memory */
+#define	DEFAULT_MFI_MEM_SZ	8192
+#define	MINIMUM_MFI_MEM_SZ	4096
+
+/* DCMD Message Frame MAILBOX0-11 */
+#define	DCMD_MBOX_SZ		12
+
+
+struct drsas_register_set {
+	uint32_t	reserved_0[4];
+
+	uint32_t	inbound_msg_0;
+	uint32_t	inbound_msg_1;
+	uint32_t	outbound_msg_0;
+	uint32_t	outbound_msg_1;
+
+	uint32_t	inbound_doorbell;
+	uint32_t	inbound_intr_status;
+	uint32_t	inbound_intr_mask;
+
+	uint32_t	outbound_doorbell;
+	uint32_t	outbound_intr_status;
+	uint32_t	outbound_intr_mask;
+
+	uint32_t	reserved_1[2];
+
+	uint32_t	inbound_queue_port;
+	uint32_t	outbound_queue_port;
+
+	uint32_t 	reserved_2[22];
+
+	uint32_t 	outbound_doorbell_clear;
+
+	uint32_t 	reserved_3[3];
+
+	uint32_t 	outbound_scratch_pad;
+
+	uint32_t 	reserved_4[3];
+
+	uint32_t 	inbound_low_queue_port;
+
+	uint32_t 	inbound_high_queue_port;
+
+	uint32_t 	reserved_5;
+	uint32_t 	index_registers[820];
+};
+
+struct drsas_sge32 {
+	uint32_t	phys_addr;
+	uint32_t	length;
+};
+
+struct drsas_sge64 {
+	uint64_t	phys_addr;
+	uint32_t	length;
+};
+
+union drsas_sgl {
+	struct drsas_sge32	sge32[1];
+	struct drsas_sge64	sge64[1];
+};
+
+struct drsas_header {
+	uint8_t		cmd;
+	uint8_t		sense_len;
+	uint8_t		cmd_status;
+	uint8_t		scsi_status;
+
+	uint8_t		target_id;
+	uint8_t		lun;
+	uint8_t		cdb_len;
+	uint8_t		sge_count;
+
+	uint32_t	context;
+	uint8_t		req_id;
+	uint8_t		msgvector;
+	uint16_t	pad_0;
+
+	uint16_t	flags;
+	uint16_t	timeout;
+	uint32_t	data_xferlen;
+};
+
+union drsas_sgl_frame {
+	struct drsas_sge32	sge32[8];
+	struct drsas_sge64	sge64[5];
+};
+
+struct drsas_init_frame {
+	uint8_t		cmd;
+	uint8_t		reserved_0;
+	uint8_t		cmd_status;
+
+	uint8_t		reserved_1;
+	uint32_t	reserved_2;
+
+	uint32_t	context;
+	uint8_t		req_id;
+	uint8_t		msgvector;
+	uint16_t	pad_0;
+
+	uint16_t	flags;
+	uint16_t	reserved_3;
+	uint32_t	data_xfer_len;
+
+	uint32_t	queue_info_new_phys_addr_lo;
+	uint32_t	queue_info_new_phys_addr_hi;
+	uint32_t	queue_info_old_phys_addr_lo;
+	uint32_t	queue_info_old_phys_addr_hi;
+
+	uint32_t	reserved_4[6];
+};
+
+struct drsas_init_queue_info {
+	uint32_t		init_flags;
+	uint32_t		reply_queue_entries;
+
+	uint32_t		reply_queue_start_phys_addr_lo;
+	uint32_t		reply_queue_start_phys_addr_hi;
+	uint32_t		producer_index_phys_addr_lo;
+	uint32_t		producer_index_phys_addr_hi;
+	uint32_t		consumer_index_phys_addr_lo;
+	uint32_t		consumer_index_phys_addr_hi;
+};
+
+struct drsas_io_frame {
+	uint8_t			cmd;
+	uint8_t			sense_len;
+	uint8_t			cmd_status;
+	uint8_t			scsi_status;
+
+	uint8_t			target_id;
+	uint8_t			access_byte;
+	uint8_t			reserved_0;
+	uint8_t			sge_count;
+
+	uint32_t		context;
+	uint8_t			req_id;
+	uint8_t			msgvector;
+	uint16_t		pad_0;
+
+	uint16_t		flags;
+	uint16_t		timeout;
+	uint32_t		lba_count;
+
+	uint32_t		sense_buf_phys_addr_lo;
+	uint32_t		sense_buf_phys_addr_hi;
+
+	uint32_t		start_lba_lo;
+	uint32_t		start_lba_hi;
+
+	union drsas_sgl		sgl;
+};
+
+struct drsas_pthru_frame {
+	uint8_t			cmd;
+	uint8_t			sense_len;
+	uint8_t			cmd_status;
+	uint8_t			scsi_status;
+
+	uint8_t			target_id;
+	uint8_t			lun;
+	uint8_t			cdb_len;
+	uint8_t			sge_count;
+
+	uint32_t		context;
+	uint8_t			req_id;
+	uint8_t			msgvector;
+	uint16_t		pad_0;
+
+	uint16_t		flags;
+	uint16_t		timeout;
+	uint32_t		data_xfer_len;
+
+	uint32_t		sense_buf_phys_addr_lo;
+	uint32_t		sense_buf_phys_addr_hi;
+
+	uint8_t			cdb[16];
+	union drsas_sgl		sgl;
+};
+
+struct drsas_dcmd_frame {
+	uint8_t			cmd;
+	uint8_t			reserved_0;
+	uint8_t			cmd_status;
+	uint8_t			reserved_1[4];
+	uint8_t			sge_count;
+
+	uint32_t		context;
+	uint8_t			req_id;
+	uint8_t			msgvector;
+	uint16_t		pad_0;
+
+	uint16_t		flags;
+	uint16_t		timeout;
+
+	uint32_t		data_xfer_len;
+	uint32_t		opcode;
+
+	union {
+		uint8_t b[DCMD_MBOX_SZ];
+		uint16_t s[6];
+		uint32_t w[3];
+	} mbox;
+
+	union drsas_sgl		sgl;
+};
+
+struct drsas_abort_frame {
+	uint8_t		cmd;
+	uint8_t		reserved_0;
+	uint8_t		cmd_status;
+
+	uint8_t		reserved_1;
+	uint32_t	reserved_2;
+
+	uint32_t	context;
+	uint8_t		req_id;
+	uint8_t		msgvector;
+	uint16_t	pad_0;
+
+	uint16_t	flags;
+	uint16_t	reserved_3;
+	uint32_t	reserved_4;
+
+	uint32_t	abort_context;
+	uint32_t	pad_1;
+
+	uint32_t	abort_mfi_phys_addr_lo;
+	uint32_t	abort_mfi_phys_addr_hi;
+
+	uint32_t	reserved_5[6];
+};
+
+struct drsas_smp_frame {
+	uint8_t		cmd;
+	uint8_t		reserved_1;
+	uint8_t		cmd_status;
+	uint8_t		connection_status;
+
+	uint8_t		reserved_2[3];
+	uint8_t		sge_count;
+
+	uint32_t	context;
+	uint8_t		req_id;
+	uint8_t		msgvector;
+	uint16_t	pad_0;
+
+	uint16_t	flags;
+	uint16_t	timeout;
+
+	uint32_t	data_xfer_len;
+
+	uint64_t	sas_addr;
+
+	union drsas_sgl	sgl[2];
+};
+
+struct drsas_stp_frame {
+	uint8_t		cmd;
+	uint8_t		reserved_1;
+	uint8_t		cmd_status;
+	uint8_t		connection_status;
+
+	uint8_t		target_id;
+	uint8_t		reserved_2[2];
+	uint8_t		sge_count;
+
+	uint32_t	context;
+	uint8_t		req_id;
+	uint8_t		msgvector;
+	uint16_t	pad_0;
+
+	uint16_t	flags;
+	uint16_t	timeout;
+
+	uint32_t	data_xfer_len;
+
+	uint16_t	fis[10];
+	uint32_t	stp_flags;
+	union drsas_sgl	sgl;
+};
+
+union drsas_frame {
+	struct drsas_header		hdr;
+	struct drsas_init_frame		init;
+	struct drsas_io_frame		io;
+	struct drsas_pthru_frame	pthru;
+	struct drsas_dcmd_frame		dcmd;
+	struct drsas_abort_frame	abort;
+	struct drsas_smp_frame		smp;
+	struct drsas_stp_frame		stp;
+
+	uint8_t			raw_bytes[64];
+};
+
+typedef struct drsas_pd_address {
+	uint16_t	device_id;
+	uint16_t	encl_id;
+
+	union {
+		struct {
+			uint8_t encl_index;
+			uint8_t slot_number;
+		} pd_address;
+		struct {
+			uint8_t	encl_position;
+			uint8_t	encl_connector_index;
+		} encl_address;
+	}address;
+
+	uint8_t	scsi_dev_type;
+
+	union {
+		uint8_t		port_bitmap;
+		uint8_t		port_numbers;
+	} connected;
+
+	uint64_t		sas_addr[2];
+} drsas_pd_address_t;
+
+union drsas_evt_class_locale {
+	struct {
+		uint16_t	locale;
+		uint8_t		reserved;
+		int8_t		class;
+	} members;
+
+	uint32_t	word;
+};
+
+struct drsas_evt_log_info {
+	uint32_t	newest_seq_num;
+	uint32_t	oldest_seq_num;
+	uint32_t	clear_seq_num;
+	uint32_t	shutdown_seq_num;
+	uint32_t	boot_seq_num;
+};
+
+struct drsas_progress {
+	uint16_t	progress;
+	uint16_t	elapsed_seconds;
+};
+
+struct drsas_evtarg_ld {
+	uint16_t	target_id;
+	uint8_t		ld_index;
+	uint8_t		reserved;
+};
+
+struct drsas_evtarg_pd {
+	uint16_t	device_id;
+	uint8_t		encl_index;
+	uint8_t		slot_number;
+};
+
+struct drsas_evt_detail {
+	uint32_t	seq_num;
+	uint32_t	time_stamp;
+	uint32_t	code;
+	union drsas_evt_class_locale	cl;
+	uint8_t		arg_type;
+	uint8_t		reserved1[15];
+
+	union {
+		struct {
+			struct drsas_evtarg_pd	pd;
+			uint8_t			cdb_length;
+			uint8_t			sense_length;
+			uint8_t			reserved[2];
+			uint8_t			cdb[16];
+			uint8_t			sense[64];
+		} cdbSense;
+
+		struct drsas_evtarg_ld		ld;
+
+		struct {
+			struct drsas_evtarg_ld	ld;
+			uint64_t		count;
+		} ld_count;
+
+		struct {
+			uint64_t		lba;
+			struct drsas_evtarg_ld	ld;
+		} ld_lba;
+
+		struct {
+			struct drsas_evtarg_ld	ld;
+			uint32_t		prevOwner;
+			uint32_t		newOwner;
+		} ld_owner;
+
+		struct {
+			uint64_t		ld_lba;
+			uint64_t		pd_lba;
+			struct drsas_evtarg_ld	ld;
+			struct drsas_evtarg_pd	pd;
+		} ld_lba_pd_lba;
+
+		struct {
+			struct drsas_evtarg_ld	ld;
+			struct drsas_progress	prog;
+		} ld_prog;
+
+		struct {
+			struct drsas_evtarg_ld	ld;
+			uint32_t		prev_state;
+			uint32_t		new_state;
+		} ld_state;
+
+		struct {
+			uint64_t		strip;
+			struct drsas_evtarg_ld	ld;
+		} ld_strip;
+
+		struct drsas_evtarg_pd		pd;
+
+		struct {
+			struct drsas_evtarg_pd	pd;
+			uint32_t		err;
+		} pd_err;
+
+		struct {
+			uint64_t		lba;
+			struct drsas_evtarg_pd	pd;
+		} pd_lba;
+
+		struct {
+			uint64_t		lba;
+			struct drsas_evtarg_pd	pd;
+			struct drsas_evtarg_ld	ld;
+		} pd_lba_ld;
+
+		struct {
+			struct drsas_evtarg_pd	pd;
+			struct drsas_progress	prog;
+		} pd_prog;
+
+		struct {
+			struct drsas_evtarg_pd	pd;
+			uint32_t		prevState;
+			uint32_t		newState;
+		} pd_state;
+
+		struct {
+			uint16_t	vendorId;
+			uint16_t	deviceId;
+			uint16_t	subVendorId;
+			uint16_t	subDeviceId;
+		} pci;
+
+		uint32_t	rate;
+		char		str[96];
+
+		struct {
+			uint32_t	rtc;
+			uint32_t	elapsedSeconds;
+		} time;
+
+		struct {
+			uint32_t	ecar;
+			uint32_t	elog;
+			char		str[64];
+		} ecc;
+
+		drsas_pd_address_t	pd_addr;
+
+		uint8_t		b[96];
+		uint16_t	s[48];
+		uint32_t	w[24];
+		uint64_t	d[12];
+	} args;
+
+	char	description[128];
+
+};
+
+/* only 63 are usable by the application */
+#define	MAX_LOGICAL_DRIVES			64
+/* only 255 physical devices may be used */
+#define	MAX_PHYSICAL_DEVICES			256
+#define	MAX_PD_PER_ENCLOSURE			64
+/* maximum disks per array */
+#define	MAX_ROW_SIZE				32
+/* maximum spans per logical drive */
+#define	MAX_SPAN_DEPTH				8
+/* maximum number of arrays a hot spare may be dedicated to */
+#define	MAX_ARRAYS_DEDICATED			16
+/* maximum number of arrays which may exist */
+#define	MAX_ARRAYS				128
+/* maximum number of foreign configs that may be managed at once */
+#define	MAX_FOREIGN_CONFIGS			8
+/* maximum spares (global and dedicated combined) */
+#define	MAX_SPARES_FOR_THE_CONTROLLER		MAX_PHYSICAL_DEVICES
+/* maximum possible Target IDs (i.e. 0 to 63) */
+#define	MAX_TARGET_ID				63
+/* maximum number of supported enclosures */
+#define	MAX_ENCLOSURES				32
+/* maximum number of PHYs per controller */
+#define	MAX_PHYS_PER_CONTROLLER			16
+/* maximum number of LDs per array (due to DDF limitations) */
+#define	MAX_LDS_PER_ARRAY			16
+
+/*
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ *
+ * Logical Drive commands
+ *
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ */
+#define	DR_DCMD_LD	0x03000000	/* Logical Device (LD) opcodes */
+
+/*
+ * Input:	dcmd.opcode	- DR_DCMD_LD_GET_LIST
+ *		dcmd.mbox	- reserved
+ *		dcmd.sge IN	- ptr to returned DR_LD_LIST structure
+ * Desc:	Return the logical drive list structure
+ * Status:	No error
+ */
+
+/*
+ * defines the logical drive reference structure
+ */
+typedef	union _DR_LD_REF {	/* LD reference structure */
+	struct {
+		uint8_t	targetId; /* LD target id (0 to MAX_TARGET_ID) */
+		uint8_t	reserved; /* reserved, to be in line with DR_PD_REF */
+		uint16_t seqNum;  /* Sequence Number */
+	} ld_ref;
+	uint32_t ref;		/* shorthand reference to full 32-bits */
+} DR_LD_REF;			/* 4 bytes */
+
+/*
+ * defines the logical drive list structure
+ */
+typedef struct _DR_LD_LIST {
+	uint32_t	ldCount;	/* number of LDs */
+	uint32_t	reserved;	/* pad to 8-byte boundary */
+	struct {
+		DR_LD_REF ref;	/* LD reference */
+		uint8_t	state;		/* current LD state (DR_LD_STATE) */
+		uint8_t	reserved[3];	/* pad to 8-byte boundary */
+		uint64_t size;		/* LD size */
+	} ldList[MAX_LOGICAL_DRIVES];
+} DR_LD_LIST;
+
+struct drsas_drv_ver {
+	uint8_t	signature[12];
+	uint8_t	os_name[16];
+	uint8_t	os_ver[12];
+	uint8_t	drv_name[20];
+	uint8_t	drv_ver[32];
+	uint8_t	drv_rel_date[20];
+};
+
+#define	PCI_TYPE0_ADDRESSES		6
+#define	PCI_TYPE1_ADDRESSES		2
+#define	PCI_TYPE2_ADDRESSES		5
+
+struct drsas_pci_common_header {
+	uint16_t	vendorID;		/* (ro) */
+	uint16_t	deviceID;		/* (ro) */
+	uint16_t	command;		/* Device control */
+	uint16_t	status;
+	uint8_t		revisionID;		/* (ro) */
+	uint8_t		progIf;			/* (ro) */
+	uint8_t		subClass;		/* (ro) */
+	uint8_t		baseClass;		/* (ro) */
+	uint8_t		cacheLineSize;		/* (ro+) */
+	uint8_t		latencyTimer;		/* (ro+) */
+	uint8_t		headerType;		/* (ro) */
+	uint8_t		bist;			/* Built in self test */
+
+	union {
+	    struct {
+		uint32_t	baseAddresses[PCI_TYPE0_ADDRESSES];
+		uint32_t	cis;
+		uint16_t	subVendorID;
+		uint16_t	subSystemID;
+		uint32_t	romBaseAddress;
+		uint8_t		capabilitiesPtr;
+		uint8_t		reserved1[3];
+		uint32_t	reserved2;
+		uint8_t		interruptLine;
+		uint8_t		interruptPin;	/* (ro) */
+		uint8_t		minimumGrant;	/* (ro) */
+		uint8_t		maximumLatency;	/* (ro) */
+	    } type_0;
+
+	    struct {
+		uint32_t	baseAddresses[PCI_TYPE1_ADDRESSES];
+		uint8_t		primaryBus;
+		uint8_t		secondaryBus;
+		uint8_t		subordinateBus;
+		uint8_t		secondaryLatency;
+		uint8_t		ioBase;
+		uint8_t		ioLimit;
+		uint16_t	secondaryStatus;
+		uint16_t	memoryBase;
+		uint16_t	memoryLimit;
+		uint16_t	prefetchBase;
+		uint16_t	prefetchLimit;
+		uint32_t	prefetchBaseUpper32;
+		uint32_t	prefetchLimitUpper32;
+		uint16_t	ioBaseUpper16;
+		uint16_t	ioLimitUpper16;
+		uint8_t		capabilitiesPtr;
+		uint8_t		reserved1[3];
+		uint32_t	romBaseAddress;
+		uint8_t		interruptLine;
+		uint8_t		interruptPin;
+		uint16_t	bridgeControl;
+	    } type_1;
+
+	    struct {
+		uint32_t	socketRegistersBaseAddress;
+		uint8_t		capabilitiesPtr;
+		uint8_t		reserved;
+		uint16_t	secondaryStatus;
+		uint8_t		primaryBus;
+		uint8_t		secondaryBus;
+		uint8_t		subordinateBus;
+		uint8_t		secondaryLatency;
+		struct {
+			uint32_t	base;
+			uint32_t	limit;
+		} range[PCI_TYPE2_ADDRESSES-1];
+		uint8_t		interruptLine;
+		uint8_t		interruptPin;
+		uint16_t	bridgeControl;
+	    } type_2;
+	} header;
+};
+
+struct drsas_pci_link_capability {
+	union {
+	    struct {
+		uint32_t linkSpeed		:4;
+		uint32_t linkWidth		:6;
+		uint32_t aspmSupport		:2;
+		uint32_t losExitLatency		:3;
+		uint32_t l1ExitLatency		:3;
+		uint32_t rsvdp			:6;
+		uint32_t portNumber		:8;
+	    } bits;
+
+	    uint32_t asUlong;
+	} cap;
+
+};
+
+struct drsas_pci_link_status_capability {
+	union {
+	    struct {
+		uint16_t linkSpeed		:4;
+		uint16_t negotiatedLinkWidth	:6;
+		uint16_t linkTrainingError	:1;
+		uint16_t linkTraning		:1;
+		uint16_t slotClockConfig	:1;
+		uint16_t rsvdZ			:3;
+	    } bits;
+
+	    uint16_t asUshort;
+	} stat_cap;
+
+	uint16_t reserved;
+
+};
+
+struct drsas_pci_capabilities {
+	struct drsas_pci_link_capability	linkCapability;
+	struct drsas_pci_link_status_capability linkStatusCapability;
+};
+
+struct drsas_pci_information
+{
+	uint32_t		busNumber;
+	uint8_t			deviceNumber;
+	uint8_t			functionNumber;
+	uint8_t			interruptVector;
+	uint8_t			reserved;
+	struct drsas_pci_common_header pciHeaderInfo;
+	struct drsas_pci_capabilities capability;
+	uint8_t			reserved2[32];
+};
+
+struct drsas_ioctl {
+	uint16_t	version;
+	uint16_t	controller_id;
+	uint8_t		signature[8];
+	uint32_t	reserved_1;
+	uint32_t	control_code;
+	uint32_t	reserved_2[2];
+	uint8_t		frame[64];
+	union drsas_sgl_frame sgl_frame;
+	uint8_t		sense_buff[DRSAS_MAX_SENSE_LENGTH];
+	uint8_t		data[1];
+};
+
+struct drsas_aen {
+	uint16_t	host_no;
+	uint16_t	cmd_status;
+	uint32_t	seq_num;
+	uint32_t	class_locale_word;
+};
+#pragma pack()
+
+#ifndef	DDI_VENDOR_LSI
+#define	DDI_VENDOR_LSI		"LSI"
+#endif /* DDI_VENDOR_LSI */
+
+static int	drsas_getinfo(dev_info_t *, ddi_info_cmd_t,  void *, void **);
+static int	drsas_attach(dev_info_t *, ddi_attach_cmd_t);
+static int	drsas_reset(dev_info_t *, ddi_reset_cmd_t);
+static int	drsas_detach(dev_info_t *, ddi_detach_cmd_t);
+static int	drsas_open(dev_t *, int, int, cred_t *);
+static int	drsas_close(dev_t, int, int, cred_t *);
+static int	drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+
+static int	drsas_tran_tgt_init(dev_info_t *, dev_info_t *,
+		    scsi_hba_tran_t *, struct scsi_device *);
+static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register
+		    struct scsi_pkt *, struct buf *, int, int, int, int,
+		    int (*)(), caddr_t);
+static int	drsas_tran_start(struct scsi_address *,
+		    register struct scsi_pkt *);
+static int	drsas_tran_abort(struct scsi_address *, struct scsi_pkt *);
+static int	drsas_tran_reset(struct scsi_address *, int);
+static int	drsas_tran_getcap(struct scsi_address *, char *, int);
+static int	drsas_tran_setcap(struct scsi_address *, char *, int, int);
+static void	drsas_tran_destroy_pkt(struct scsi_address *,
+		    struct scsi_pkt *);
+static void	drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *);
+static void	drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *);
+static uint_t	drsas_isr();
+static uint_t	drsas_softintr();
+
+static int	init_mfi(struct drsas_instance *);
+static int	drsas_free_dma_obj(struct drsas_instance *, dma_obj_t);
+static int	drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *,
+		    uchar_t);
+static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *);
+static void	return_mfi_pkt(struct drsas_instance *,
+		    struct drsas_cmd *);
+
+static void	free_space_for_mfi(struct drsas_instance *);
+static void	free_additional_dma_buffer(struct drsas_instance *);
+static int	alloc_additional_dma_buffer(struct drsas_instance *);
+static int	read_fw_status_reg_ppc(struct drsas_instance *);
+static void	issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *);
+static int	issue_cmd_in_poll_mode_ppc(struct drsas_instance *,
+		    struct drsas_cmd *);
+static int	issue_cmd_in_sync_mode_ppc(struct drsas_instance *,
+		    struct drsas_cmd *);
+static void	enable_intr_ppc(struct drsas_instance *);
+static void	disable_intr_ppc(struct drsas_instance *);
+static int	intr_ack_ppc(struct drsas_instance *);
+static int	mfi_state_transition_to_ready(struct drsas_instance *);
+static void	destroy_mfi_frame_pool(struct drsas_instance *);
+static int	create_mfi_frame_pool(struct drsas_instance *);
+static int	drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *,
+		    struct buf *, int, int (*)());
+static int	drsas_dma_move(struct drsas_instance *,
+			struct scsi_pkt *, struct buf *);
+static void	flush_cache(struct drsas_instance *instance);
+static void	display_scsi_inquiry(caddr_t);
+static int	start_mfi_aen(struct drsas_instance *instance);
+static int	handle_drv_ioctl(struct drsas_instance *instance,
+		    struct drsas_ioctl *ioctl, int mode);
+static int	handle_mfi_ioctl(struct drsas_instance *instance,
+		    struct drsas_ioctl *ioctl, int mode);
+static int	handle_mfi_aen(struct drsas_instance *instance,
+		    struct drsas_aen *aen);
+static void	fill_up_drv_ver(struct drsas_drv_ver *dv);
+static struct drsas_cmd *build_cmd(struct drsas_instance *instance,
+		    struct scsi_address *ap, struct scsi_pkt *pkt,
+		    uchar_t *cmd_done);
+static int	register_mfi_aen(struct drsas_instance *instance,
+		    uint32_t seq_num, uint32_t class_locale_word);
+static int	issue_mfi_pthru(struct drsas_instance *instance, struct
+		    drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int	issue_mfi_dcmd(struct drsas_instance *instance, struct
+		    drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int	issue_mfi_smp(struct drsas_instance *instance, struct
+		    drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int	issue_mfi_stp(struct drsas_instance *instance, struct
+		    drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int	abort_aen_cmd(struct drsas_instance *instance,
+		    struct drsas_cmd *cmd_to_abort);
+
+static int	drsas_common_check(struct drsas_instance *instance,
+		    struct  drsas_cmd *cmd);
+static void	drsas_fm_init(struct drsas_instance *instance);
+static void	drsas_fm_fini(struct drsas_instance *instance);
+static int	drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *,
+		    const void *);
+static void	drsas_fm_ereport(struct drsas_instance *instance,
+		    char *detail);
+static int	drsas_check_dma_handle(ddi_dma_handle_t handle);
+static int	drsas_check_acc_handle(ddi_acc_handle_t handle);
+
+static void	drsas_rem_intrs(struct drsas_instance *instance);
+static int	drsas_add_intrs(struct drsas_instance *instance, int intr_type);
+
+static void	drsas_tran_tgt_free(dev_info_t *, dev_info_t *,
+		    scsi_hba_tran_t *, struct scsi_device *);
+static int	drsas_tran_bus_config(dev_info_t *, uint_t,
+		    ddi_bus_config_op_t, void *, dev_info_t **);
+static int	drsas_parse_devname(char *, int *, int *);
+static int	drsas_config_all_devices(struct drsas_instance *);
+static int 	drsas_config_scsi_device(struct drsas_instance *,
+		    struct scsi_device *, dev_info_t **);
+static int 	drsas_config_ld(struct drsas_instance *, uint16_t,
+				uint8_t, dev_info_t **);
+static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t,
+			uint8_t);
+static int	drsas_name_node(dev_info_t *, char *, int);
+static void	drsas_issue_evt_taskq(struct drsas_eventinfo *);
+static int	drsas_service_evt(struct drsas_instance *, int, int, int,
+			uint64_t);
+static int	drsas_mode_sense_build(struct scsi_pkt *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_H_ */
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
new file mode 100644
index 0000000000..4154a77796
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
@@ -0,0 +1,212 @@
+/*
+ * dr_sas_list.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_DR_SAS_LIST_H_
+#define	_DR_SAS_LIST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct mlist_head {
+	struct mlist_head *next, *prev;
+};
+
+typedef struct mlist_head mlist_t;
+
+#define	LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define	LIST_HEAD(name) \
+	struct mlist_head name = LIST_HEAD_INIT(name)
+
+#define	INIT_LIST_HEAD(ptr) { \
+	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
+}
+
+
+/*
+ * __list_add - link @new in between two adjacent entries.
+ *
+ * Internal helper for callers that already hold both neighbours:
+ * @prev must be the entry directly before @next.
+ */
+static void __list_add(struct mlist_head *new,
+	struct mlist_head *prev,
+	struct mlist_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+
+/*
+ * mlist_add - insert @new immediately after @head.
+ * Repeated front insertion yields LIFO (stack) order.
+ */
+static void
+mlist_add(struct mlist_head *new, struct mlist_head *head)
+{
+	struct mlist_head *old_first = head->next;
+
+	/* head <-> new <-> old_first */
+	__list_add(new, head, old_first);
+}
+
+
+/*
+ * mlist_add_tail - insert @new immediately before @head.
+ * Repeated tail insertion yields FIFO (queue) order.
+ */
+static void
+mlist_add_tail(struct mlist_head *new, struct mlist_head *head)
+{
+	struct mlist_head *old_last = head->prev;
+
+	/* old_last <-> new <-> head */
+	__list_add(new, old_last, head);
+}
+
+
+
+/*
+ * __list_del - unlink whatever sits between @prev and @next by
+ * pointing the two neighbours at each other.
+ *
+ * Internal helper: the removed entry's own next/prev links are not
+ * touched here (mlist_del_init resets them after calling this).
+ */
+static void __list_del(struct mlist_head *prev,
+			struct mlist_head *next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+
+/* mlist_del_init - unlink @entry, then reset it to an empty list. */
+static void
+mlist_del_init(struct mlist_head *entry)
+{
+	struct mlist_head *before = entry->prev, *after = entry->next;
+
+	__list_del(before, after);
+	INIT_LIST_HEAD(entry);
+}
+
+
+/* mlist_empty - non-zero when @head's next link points back at @head. */
+static int
+mlist_empty(struct mlist_head *head)
+{
+	struct mlist_head *first = head->next;
+
+	return (first == head);
+}
+
+
+/*
+ * mlist_splice - join two lists
+ * @list: source list; its entries are linked in, its own links are not reset
+ * @head: the entries of @list are inserted directly after this node
+ */
+static void
+mlist_splice(struct mlist_head *list, struct mlist_head *head)
+{
+	struct mlist_head *src_first = list->next;
+	struct mlist_head *src_last, *dst_next;
+
+	if (src_first == list)
+		return;		/* source is empty: nothing to move */
+	src_last = list->prev;
+	dst_next = head->next;
+	src_first->prev = head;
+	head->next = src_first;
+	src_last->next = dst_next;
+	dst_next->prev = src_last;
+}
+
+
+/*
+ * mlist_entry - get the struct for this entry
+ * @ptr:	the &struct mlist_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the struct mlist_head field within @type.
+ */
+#define	mlist_entry(ptr, type, member) \
+	((type *)((size_t)(ptr) - offsetof(type, member)))
+
+
+/*
+ * mlist_for_each - iterate over a list; not safe if the body removes @pos
+ * @pos:	the &struct mlist_head to use as a loop counter.
+ * @head:	the head for your list.  A prefetch() must be in scope.
+ */
+#define	mlist_for_each(pos, head) \
+	for ((pos) = (head)->next, prefetch((pos)->next); (pos) != (head); \
+		(pos) = (pos)->next, prefetch((pos)->next))
+
+
+/*
+ * mlist_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct mlist_head to use as a loop counter.
+ * @n:		holds the successor of @pos, read before the body runs.
+ * @head:	the head for your list.
+ */
+#define	mlist_for_each_safe(pos, n, head) \
+	for ((pos) = (head)->next, (n) = (pos)->next; (pos) != (head); \
+		(pos) = (n), (n) = (pos)->next)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_LIST_H_ */
diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
index 634de6c6dd..87105e779d 100644
--- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c
+++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  */
 /*
  * Fibre channel Transport Library (fctl)
@@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count)
 		maxPorts ++;
 	}
 
+	if (maxPorts == 0) {
+		mutex_exit(&fctl_port_lock);
+		return (0);
+	}
+
 	/* Now allocate a buffer to store all the pointers for comparisons */
 	portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP);
 
diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c
new file mode 100644
index 0000000000..b484b16142
--- /dev/null
+++ b/usr/src/uts/common/io/gsqueue/gsqueue.c
@@ -0,0 +1,612 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * Serialization queues are a technique used in illumos to provide what's
+ * commonly known as a 'vertical' perimeter. The idea (described a bit in
+ * uts/common/inet/squeue.c) is to provide a means to make sure that message
+ * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd
+ * consume these on different policies, ip on a conn_t basis, vnd on a per
+ * device basis, and use this to ensure that only one packet is being processed
+ * at a given time.
+ *
+ * Serialization queues were originally used by ip. As part of that
+ * implementation, many of the details of ip were baked into it. That includes
+ * things like conn_t, ip receive attributes, and the notion of sets. While an
+ * individual serialization queue, or gsqueue_t, is a useful level of
+ * abstraction, it isn't the basis on which most consumers want to manage them.
+ * Instead, we have the notion of a set of serialization queues. These sets are
+ * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a
+ * gsqueue_t per CPU to fanout on without managing them all itself. In the
+ * original implementation, this existed, but they were heavily tied into the
+ * infrastructure of IP, and its notion of polling on the underlying MAC
+ * devices.
+ *
+ * The result of that past is a new interface to serialization queues and a
+ * similar, but slightly different, abstraction to sets of these
+ * (gsqueue_set_t).  When designing this there are two different approaches that
+ * one could consider. The first is that the system has one gsqueue_set_t that
+ * the entire world shares, whether IP or some other consumer. The other is that
+ * every consumer has their own set.
+ *
+ * The trade offs between these two approaches are their pathological failure
+ * modes. There is no guarantee that any two consumers here are equivalent. In
+ * fact, they very likely have very different latency profiles. If they are
+ * being processed in the same queue, that can lead to very odd behaviors. More
+ * generally, if we have a series of processing functions from one consumer
+ * which are generally short, and another which are generally long, that'll
+ * cause undue latency that's harder to observe. If we instead take the approach
+ * that each consumer should have its own set that it fans out over then we
+ * won't end up with the problem that a given serialization queue will have
+ * multiple latency profiles, but instead we'll see cpu contention for the bound
+ * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker
+ * thread is bound and it is in fact possible for it to be processed by other
+ * threads on other CPUs.
+ *
+ * We've opted to go down the second path, so each consumer has its own
+ * independent set of serialization queues that it is bound over.
+ *
+ * Structure Hierarchies
+ * ---------------------
+ *
+ * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t
+ * encapsulates all the per-CPU gsqueue_t that exist in the form of
+ * gsqueue_cpu_t.  The gsqueue_cpu_t has been designed such that it could
+ * accommodate more than one gsqueue_t, but today there is a one to one mapping.
+ *
+ * We maintain two different lists of gsqueue_cpu_t, the active and defunct
+ * sets. The active set is maintained in the array `gs_cpus`. There are NCPU
+ * entries available in `gs_cpus` with the total number of currently active cpus
+ * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant.  When
+ * there is no longer a need for a given binding (see the following section for
+ * more explanation on when this is the case) then we move the entry to the
+ * `gs_defunct` list which is just a singly linked list of gsqueue_cpu_t.
+ *
+ * In addition, each gsqueue_set_t can have a series of callbacks registered
+ * with it. These are described in the following section. Graphically, a given
+ * gsqueue_set_t looks roughly like the following:
+ *
+ *     +---------------+
+ *     | gsqueue_set_t |
+ *     +---------------+
+ *       |    |     |
+ *       |    |     * . . . gs_cpus
+ *       |    |     |
+ *       |    |     |      +-------------------------------------------------+
+ *       |    |     +----->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |...
+ *       |    |            +-------------------------------------------------+
+ *       |    |
+ *       |    * . . . gs_defunct
+ *       |    |
+ *       |    |    +---------------+   +---------------+   +---------------+
+ *       |    +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |...
+ *       |         +---------------+   +---------------+   +---------------+
+ *       * . . . gs_cbs
+ *       |
+ *       |    +--------------+   +--------------+  +--------------+
+ *       +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |...
+ *            +--------------+   +--------------+  +--------------+
+ *
+ * CPU DR, gsqueue_t, and gsqueue_cpu_t
+ * ------------------------------------
+ *
+ * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker
+ * thread that may end up doing work. As part of supporting fanout, we have one
+ * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of
+ * this binding, we need to deal with CPU DR changes.
+ *
+ * The gsqueue driver maintains a single CPU DR callback that is used for the
+ * entire sub-system. We break down CPU DR events into three groups. Offline
+ * events, online events, and events we can ignore. When the first group occurs,
+ * we need to go through every gsqueue_set_t, find the gsqueue_cpu_t that
+ * corresponds to that processor id, and unbind all of its gsqueue_t's. It's
+ * rather important that we only unbind the gsqueue_t's and not actually destroy
+ * them. When this happens, they could very easily have data queued inside of
+ * them and it's unreasonable to just throw out everything in them at this
+ * point. The data remains intact and service continues uninterrupted.
+ *
+ * When we receive an online event, we do the opposite. We try to find a
+ * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid
+ * field intact) in the defunct list. If we find one, we remove it from the
+ * defunct list and add it to the active list as well as binding the gsqueue_t
+ * to the CPU in question. If we don't find one, then we create a new one.
+ *
+ * To deal with these kinds of situations, we allow a consumer to register
+ * callbacks for the gsqueue_set_t that they are interested in. These callbacks
+ * will fire whenever we are handling a topology change. The design of the
+ * callbacks is not that the user can take any administrative action during
+ * them, but rather set something for them to do asynchronously. It is illegal
+ * to make any calls into the gsqueue system while you are in a callback.
+ *
+ * Locking
+ * -------
+ *
+ * The lock ordering here is fairly straightforward. Due to our use of CPU
+ * binding and the CPU DR callbacks, we have an additional lock to consider:
+ * cpu_lock. Because of that, the following are the rules for locking:
+ *
+ *
+ *   o If performing binding operations, you must grab cpu_lock. cpu_lock is
+ *     also at the top of the order.
+ *
+ *   o cpu_lock > gsqueue_lock > gsqueue_set_t`gs_lock > squeue_t`sq_lock
+ *     If you need to take multiple locks, you must take the greatest
+ *     (left-most) one first.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/cpuvar.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+
+#include <sys/gsqueue.h>
+#include <sys/squeue_impl.h>
+
+typedef struct gsqueue_cb {
+	struct gsqueue_cb *gcb_next;	/* next callback on the set's gs_cbs */
+	gsqueue_cb_f gcb_func;		/* function run by gsqueue_notify() */
+	void *gcb_arg;			/* opaque argument passed to gcb_func */
+} gsqueue_cb_t;
+
+typedef struct gsqueue_cpu {
+	struct gsqueue_cpu *gqc_next;	/* link on the set's gs_defunct list */
+	squeue_t *gqc_head;		/* squeue created for this CPU */
+	processorid_t gqc_cpuid;	/* CPU id gqc_head was created for */
+} gsqueue_cpu_t;
+
+struct gsqueue_set {
+	list_node_t gs_next;		/* linkage on the global gsqueue_list */
+	uint_t gs_wwait;		/* wwait value handed to squeue_create() */
+	pri_t gs_wpri;			/* wpri value handed to squeue_create() */
+	kmutex_t gs_lock;		/* guards the list/array fields below */
+	int gs_ncpus;			/* number of valid entries in gs_cpus */
+	gsqueue_cpu_t **gs_cpus;	/* NCPU slots of active entries */
+	gsqueue_cpu_t *gs_defunct;	/* unbound entries, via gqc_next */
+	gsqueue_cb_t *gs_cbs;		/* callback list, via gcb_next */
+};
+
+static kmutex_t gsqueue_lock;
+static list_t gsqueue_list;
+static kmem_cache_t *gsqueue_cb_cache;
+static kmem_cache_t *gsqueue_cpu_cache;
+static kmem_cache_t *gsqueue_set_cache;
+
+static gsqueue_cpu_t *
+gsqueue_cpu_create(uint_t wwait, pri_t wpri, processorid_t cpuid)
+{
+	gsqueue_cpu_t *scp;
+
+	scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP);
+	/* Create the squeue for this entry and bind it to cpuid. */
+	scp->gqc_next = NULL;		/* not on any list yet */
+	scp->gqc_cpuid = cpuid;
+	scp->gqc_head = squeue_create(wwait, wpri, B_FALSE);
+	scp->gqc_head->sq_state = SQS_DEFAULT;
+	squeue_bind(scp->gqc_head, cpuid);
+
+	return (scp);
+}
+
+static void
+gsqueue_cpu_destroy(gsqueue_cpu_t *scp)
+{
+	squeue_destroy(scp->gqc_head);	/* squeue goes before its container */
+	kmem_cache_free(gsqueue_cpu_cache, scp);
+}
+
+gsqueue_set_t *
+gsqueue_set_create(uint_t wwait, pri_t wpri)
+{
+	int i;
+	gsqueue_set_t *gssp;
+
+	gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP);
+	gssp->gs_wwait = wwait;		/* reused when a CPU onlines */
+	gssp->gs_wpri = wpri;
+	gssp->gs_ncpus = 0;
+
+	/*
+	 * We're grabbing CPU lock. Once we let go of it we have to ensure all
+	 * set up of the gsqueue_set_t is complete, as it'll be in there for the
+	 * various CPU DR bits.
+	 */
+	mutex_enter(&cpu_lock);
+	/* Create and bind one squeue for each CPU that is active now. */
+	for (i = 0; i < NCPU; i++) {
+		gsqueue_cpu_t *scp;
+		cpu_t *cp = cpu_get(i);
+		if (cp != NULL && CPU_ACTIVE(cp) &&
+		    cp->cpu_flags & CPU_EXISTS) {
+			scp = gsqueue_cpu_create(wwait, wpri, cp->cpu_id);
+			gssp->gs_cpus[gssp->gs_ncpus] = scp;
+			gssp->gs_ncpus++;
+		}
+	}
+
+	/* Finally we can add it to our global list and be done */
+	mutex_enter(&gsqueue_lock);
+	list_insert_tail(&gsqueue_list, gssp);
+	mutex_exit(&gsqueue_lock);
+	mutex_exit(&cpu_lock);
+
+	return (gssp);
+}
+
+void
+gsqueue_set_destroy(gsqueue_set_t *gssp)
+{
+	int i;
+	gsqueue_cpu_t *scp;
+
+	/*
+	 * Go through and unbind all of the squeues while cpu_lock is held and
+	 * move them to the defunct list. Once that's done, we don't need to do
+	 * anything else with cpu_lock.
+	 */
+	mutex_enter(&cpu_lock);
+	mutex_enter(&gsqueue_lock);
+	list_remove(&gsqueue_list, gssp);	/* no more CPU DR visits */
+	mutex_exit(&gsqueue_lock);
+
+	mutex_enter(&gssp->gs_lock);
+
+	for (i = 0; i < gssp->gs_ncpus; i++) {
+		scp = gssp->gs_cpus[i];
+		squeue_unbind(scp->gqc_head);
+		scp->gqc_next = gssp->gs_defunct;
+		gssp->gs_defunct = scp;
+		gssp->gs_cpus[i] = NULL;
+	}
+	gssp->gs_ncpus = 0;
+
+	mutex_exit(&gssp->gs_lock);
+	mutex_exit(&cpu_lock);
+	/* Nothing is bound any longer; destroy every defunct entry. */
+	while (gssp->gs_defunct != NULL) {
+		gsqueue_cpu_t *scp;
+
+		scp = gssp->gs_defunct;
+		gssp->gs_defunct = scp->gqc_next;
+		gsqueue_cpu_destroy(scp);
+	}
+	/* Free callbacks that were never removed by the consumer. */
+	while (gssp->gs_cbs != NULL) {
+		gsqueue_cb_t *cbp;
+
+		cbp = gssp->gs_cbs;
+		gssp->gs_cbs = cbp->gcb_next;
+		kmem_cache_free(gsqueue_cb_cache, cbp);
+	}
+
+	ASSERT(gssp->gs_ncpus == 0);
+	ASSERT(gssp->gs_defunct == NULL);
+	ASSERT(gssp->gs_cbs == NULL);
+	kmem_cache_free(gsqueue_set_cache, gssp);
+}
+
+gsqueue_t *
+gsqueue_set_get(gsqueue_set_t *gssp, uint_t index)
+{
+	squeue_t *sqp;
+	gsqueue_cpu_t *scp;
+	/* Pick active entry (index mod gs_ncpus); assumes gs_ncpus != 0. */
+	mutex_enter(&gssp->gs_lock);
+	scp = gssp->gs_cpus[index % gssp->gs_ncpus];
+	sqp = scp->gqc_head;
+	mutex_exit(&gssp->gs_lock);
+	return ((gsqueue_t *)sqp);
+}
+
+uintptr_t
+gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg)
+{
+	gsqueue_cb_t *cbp;
+
+	cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP);
+	cbp->gcb_func = cb;
+	cbp->gcb_arg = arg;
+	/* Push the new callback onto the head of gs_cbs. */
+	mutex_enter(&gssp->gs_lock);
+	cbp->gcb_next = gssp->gs_cbs;
+	gssp->gs_cbs = cbp;
+	mutex_exit(&gssp->gs_lock);
+	return ((uintptr_t)cbp);	/* address doubles as the id */
+}
+
+int
+gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id)
+{
+	gsqueue_cb_t *cbp, *prev;
+	mutex_enter(&gssp->gs_lock);
+	cbp = gssp->gs_cbs;
+	prev = NULL;
+	while (cbp != NULL) {
+		if ((uintptr_t)cbp != id) {	/* id is the entry's address */
+			prev = cbp;
+			cbp = cbp->gcb_next;
+			continue;
+		}
+		/* Match: unlink from the list head or from prev. */
+		if (prev == NULL) {
+			gssp->gs_cbs = cbp->gcb_next;
+		} else {
+			prev->gcb_next = cbp->gcb_next;
+		}
+
+		mutex_exit(&gssp->gs_lock);
+		kmem_cache_free(gsqueue_cb_cache, cbp);
+		return (0);
+	}
+	mutex_exit(&gssp->gs_lock);
+	return (-1);	/* id was not registered on this set */
+}
+
+void
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg,
+    int flags, uint8_t tag)
+{
+	squeue_t *sqp = (squeue_t *)gsp;	/* gsqueue_t is a squeue_t */
+
+	ASSERT(mp->b_next == NULL);
+	ASSERT(mp->b_prev == NULL);
+	mp->b_queue = (queue_t *)func;	/* func and arg ride in the mblk */
+	mp->b_prev = arg;
+	sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag);
+}
+
+static void
+gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online)
+{
+	gsqueue_cb_t *cbp;
+
+	ASSERT(MUTEX_HELD(&gssp->gs_lock));	/* caller holds gs_lock */
+	cbp = gssp->gs_cbs;		/* walk every registered callback */
+	while (cbp != NULL) {
+		cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online);
+		cbp = cbp->gcb_next;
+	}
+
+}
+
+/*
+ * When we online a processor we need to go through and either bind a defunct
+ * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the
+ * defunct list that used to be on that processor. If no such gsqueue_cpu_t
+ * exists, then we'll create a new one. We'd rather avoid taking over an
+ * existing defunct one that used to be on another CPU, as its not unreasonable
+ * to believe that its CPU will come back. More CPUs are offlined and onlined by
+ * the administrator or by creating cpu sets than actually get offlined by FMA.
+ */
+static void
+gsqueue_handle_online(processorid_t id)
+{
+	gsqueue_set_t *gssp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	mutex_enter(&gsqueue_lock);
+	for (gssp = list_head(&gsqueue_list); gssp != NULL;
+	    gssp = list_next(&gsqueue_list, gssp)) {
+		gsqueue_cpu_t *scp, **scpp;
+
+		mutex_enter(&gssp->gs_lock);
+		/* Look for a defunct entry that used to belong to this CPU. */
+		scpp = &gssp->gs_defunct;
+		while ((scp = *scpp) != NULL && scp->gqc_cpuid != id)
+			scpp = &scp->gqc_next;
+
+		if (scp == NULL) {
+			scp = gsqueue_cpu_create(gssp->gs_wwait,
+			    gssp->gs_wpri, id);
+		} else {
+			/* Unlink it so it isn't both defunct and active. */
+			*scpp = scp->gqc_next;
+			scp->gqc_next = NULL;
+			squeue_bind(scp->gqc_head, id);
+		}
+		ASSERT(gssp->gs_ncpus < NCPU);
+		gssp->gs_cpus[gssp->gs_ncpus++] = scp;
+		gsqueue_notify(gssp, scp->gqc_head, B_TRUE);
+		mutex_exit(&gssp->gs_lock);
+	}
+	mutex_exit(&gsqueue_lock);
+}
+
+static void
+gsqueue_handle_offline(processorid_t id)
+{
+	gsqueue_set_t *gssp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	mutex_enter(&gsqueue_lock);
+	for (gssp = list_head(&gsqueue_list); gssp != NULL;
+	    gssp = list_next(&gsqueue_list, gssp)) {
+		int i;
+		gsqueue_cpu_t *scp = NULL;	/* active entry for id */
+
+		mutex_enter(&gssp->gs_lock);
+		for (i = 0; i < gssp->gs_ncpus; i++) {
+			if (gssp->gs_cpus[i]->gqc_cpuid == id) {
+				scp = gssp->gs_cpus[i];
+				break;
+			}
+		}
+		/* Unbind it, park it on gs_defunct, and backfill its slot. */
+		if (scp != NULL) {
+			squeue_unbind(scp->gqc_head);
+			scp->gqc_next = gssp->gs_defunct;
+			gssp->gs_defunct = scp;
+			gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1];
+			gssp->gs_ncpus--;
+			gsqueue_notify(gssp, scp->gqc_head, B_FALSE);
+		}
+		mutex_exit(&gssp->gs_lock);
+	}
+	mutex_exit(&gsqueue_lock);
+}
+
+/* ARGSUSED */
+static int
+gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused)
+{
+	cpu_t *cp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	cp = cpu_get(id);
+	switch (what) {
+	case CPU_CONFIG:
+	case CPU_ON:
+	case CPU_INIT:
+	case CPU_CPUPART_IN:
+		if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS)
+			gsqueue_handle_online(cp->cpu_id);
+		break;
+	case CPU_UNCONFIG:
+	case CPU_OFF:
+	case CPU_CPUPART_OUT:
+		gsqueue_handle_offline(id);	/* cp may be NULL; use id */
+		break;
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+
+/* ARGSUSED */
+static int
+gsqueue_set_cache_construct(void *buf, void *arg, int kmflags)
+{
+	gsqueue_set_t *gssp = buf;
+
+	gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags);
+	if (gssp->gs_cpus == NULL)
+		return (-1);	/* allocation failed */
+	/* Start with no active entries, defunct entries, or callbacks. */
+	mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL);
+	gssp->gs_ncpus = 0;
+	gssp->gs_defunct = NULL;
+	gssp->gs_cbs = NULL;
+
+	return (0);
+}
+
+static void
+gsqueue_set_cache_destruct(void *buf, void *arg)
+{
+	gsqueue_set_t *gssp = buf;
+	/* Release what gsqueue_set_cache_construct() set up. */
+	kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU);
+	gssp->gs_cpus = NULL;
+	mutex_destroy(&gssp->gs_lock);
+}
+
+static void
+gsqueue_ddiinit(void)
+{
+	list_create(&gsqueue_list, sizeof (gsqueue_set_t),
+	    offsetof(gsqueue_set_t, gs_next));
+	mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL);
+	/* One cache each for callbacks, per-CPU entries, and sets. */
+	gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache",
+	    sizeof (gsqueue_cb_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache",
+	    sizeof (gsqueue_cpu_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	gsqueue_set_cache = kmem_cache_create("squeue_set_cache",
+	    sizeof (gsqueue_set_t),
+	    0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct,
+	    NULL, NULL, NULL, 0);
+
+	/* Ask to be told about CPU DR events via gsqueue_cpu_setup(). */
+	mutex_enter(&cpu_lock);
+	register_cpu_setup_func(gsqueue_cpu_setup, NULL);
+	mutex_exit(&cpu_lock);
+}
+
+static int
+gsqueue_ddifini(void)
+{
+	mutex_enter(&gsqueue_lock);
+	if (list_is_empty(&gsqueue_list) == 0) {
+		mutex_exit(&gsqueue_lock);
+		return (EBUSY);	/* sets still exist; refuse */
+	}
+	list_destroy(&gsqueue_list);
+	mutex_exit(&gsqueue_lock);
+	/* Remove the CPU DR callback installed by gsqueue_ddiinit(). */
+	mutex_enter(&cpu_lock);
+	unregister_cpu_setup_func(gsqueue_cpu_setup, NULL);
+	mutex_exit(&cpu_lock);
+
+	kmem_cache_destroy(gsqueue_set_cache);
+	kmem_cache_destroy(gsqueue_cpu_cache);
+	kmem_cache_destroy(gsqueue_cb_cache);
+
+	mutex_destroy(&gsqueue_lock);
+
+	return (0);
+}
+
+static struct modlmisc		gsqueue_modmisc = {
+	&mod_miscops,		/* loaded as a misc module */
+	"gsqueue"		/* name string */
+};
+
+static struct modlinkage	gsqueue_modlinkage = {
+	MODREV_1,
+	&gsqueue_modmisc,	/* the only linkage element */
+	NULL
+};
+
+int
+_init(void)
+{
+	int ret;
+	/* Set up global state, undoing it if mod_install() fails. */
+	gsqueue_ddiinit();
+	if ((ret = mod_install(&gsqueue_modlinkage)) != 0) {
+		VERIFY(gsqueue_ddifini() == 0);
+		return (ret);
+	}
+
+	return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&gsqueue_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int ret;
+
+	if ((ret = gsqueue_ddifini()) != 0)
+		return (ret);	/* sets still exist (EBUSY) */
+	/* NOTE(review): state is already torn down if this fails. */
+	if ((ret = mod_remove(&gsqueue_modlinkage)) != 0)
+		return (ret);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/io/i40e/core/README b/usr/src/uts/common/io/i40e/core/README
new file mode 100644
index 0000000000..dc0149ce62
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/README
@@ -0,0 +1,410 @@
+	ixl FreeBSD* Base Driver and ixlv VF Driver for the
+	     Intel XL710 Ethernet Controller Family
+
+/*$FreeBSD$*/
+================================================================
+
+August 26, 2014
+
+
+Contents
+========
+
+- Overview
+- Supported Adapters
+- The VF Driver
+- Building and Installation
+- Additional Configurations
+- Known Limitations
+
+
+Overview
+========
+
+This file describes the IXL FreeBSD* Base driver and the IXLV VF Driver
+for the XL710 Ethernet Family of Adapters. The Driver has been developed
+for use with FreeBSD 10.0 or later, but should be compatible with any
+supported release.
+
+For questions related to hardware requirements, refer to the documentation
+supplied with your Intel XL710 adapter. All hardware requirements listed
+apply for use with FreeBSD.
+
+
+Supported Adapters
+==================
+
+The drivers in this release are compatible with XL710 and X710-based
+Intel Ethernet Network Connections.
+
+
+SFP+ Devices with Pluggable Optics
+----------------------------------
+
+SR Modules
+----------
+  Intel     DUAL RATE 1G/10G SFP+ SR (bailed)    FTLX8571D3BCV-IT
+  Intel     DUAL RATE 1G/10G SFP+ SR (bailed)    AFBR-703SDZ-IN2
+
+LR Modules
+----------
+  Intel     DUAL RATE 1G/10G SFP+ LR (bailed)    FTLX1471D3BCV-IT
+  Intel     DUAL RATE 1G/10G SFP+ LR (bailed)    AFCT-701SDZ-IN2
+
+QSFP+ Modules
+-------------
+  Intel     TRIPLE RATE 1G/10G/40G QSFP+ SR (bailed)    E40GQSFPSR
+  Intel     TRIPLE RATE 1G/10G/40G QSFP+ LR (bailed)    E40GQSFPLR
+    QSFP+ 1G speed is not supported on XL710 based devices.
+
+X710/XL710 Based SFP+ adapters support all passive and active limiting direct
+attach cables that comply with SFF-8431 v4.1 and SFF-8472 v10.4 specifications.
+              
+The VF Driver
+==================
+The VF driver is normally used in a virtualized environment where a host
+driver manages SRIOV, and provides a VF device to the guest. With this
+first release the only host environment tested was using Linux QEMU/KVM.
+Support is planned for Xen and VMWare hosts at a later time.
+
+In the FreeBSD guest the IXLV driver would be loaded and will function
+using the VF device assigned to it.
+
+The VF driver provides most of the same functionality as the CORE driver,
+but is actually a slave to the Host, access to many controls are actually
+accomplished by a request to the Host via what is called the "Admin queue".
+These are startup and initialization events however, once in operation
+the device is self-contained and should achieve near native performance.
+
+Some notable limitations of the VF environment: for security reasons 
+the driver is never permitted to be promiscuous, therefore a tcpdump
+will not behave the same with the interface. Second, media info is not
+available from the PF, so it will always appear as auto.
+
+Tarball Building and Installation
+=========================
+
+NOTE: You must have kernel sources installed to compile the driver tarball.
+
+These instructions assume a standalone driver tarball, building the driver
+already in the kernel source is simply a matter of adding the device entry
+to the kernel config file, or building in the ixl or ixlv module directory.
+
+In the instructions below, x.x.x is the driver version
+as indicated in the name of the driver tarball. The example is
+for ixl, the same procedure applies for ixlv.
+
+1. Move the base driver tar file to the directory of your choice.
+   For example, use /home/username/ixl or /usr/local/src/ixl.
+
+2. Untar/unzip the archive:
+     tar xfz ixl-x.x.x.tar.gz
+
+3. To install man page:
+     cd ixl-x.x.x
+     gzip -c ixl.4 > /usr/share/man/man4/ixl.4.gz
+
+4. To load the driver onto a running system:
+     cd ixl-x.x.x/src
+     make load
+
+5. To assign an IP address to the interface, enter the following:
+     ifconfig ixl<interface_num> <IP_address>
+
+6. Verify that the interface works. Enter the following, where <IP_address>
+   is the IP address for another machine on the same subnet as the interface
+   that is  being tested:
+
+     ping <IP_address>
+
+7. If you want the driver to load automatically when the system is booted:
+
+     cd ixl-x.x.x/src
+     make
+     make install
+        
+    Edit /boot/loader.conf, and add the following line:
+     if_ixl_load="YES"
+
+    Edit /etc/rc.conf, and create the appropriate
+    ifconfig_ixl<interface_num> entry:
+
+     ifconfig_ixl<interface_num>="<ifconfig_settings>"
+
+     Example usage:
+
+     ifconfig_ixl0="inet 192.168.10.1 netmask 255.255.255.0"
+
+     NOTE: For assistance, see the ifconfig man page.
+
+
+
+Configuration and Tuning
+=========================
+
+Both drivers support Transmit/Receive Checksum Offload for IPv4 and IPv6,
+TSO for IPv4 and IPv6, LRO, and Jumbo Frames on all 40 Gigabit adapters.
+
+  Jumbo Frames
+  ------------
+  To enable Jumbo Frames, use the ifconfig utility to increase
+  the MTU beyond 1500 bytes.
+
+       - The Jumbo Frames setting on the switch must be set to at least
+         22 bytes larger than that of the adapter.
+
+       - The maximum MTU setting for Jumbo Frames is 9706. This value
+         coincides with the maximum jumbo frames size of 9728.
+         To modify the setting, enter the following:
+
+        ifconfig ixl<interface_num> <hostname or IP address> mtu 9000
+
+       - To confirm an interface's MTU value, use the ifconfig command.
+         To confirm the MTU used between two specific devices, use:
+
+        route get <destination_IP_address>
+
+  VLANs
+  -----
+  To create a new VLAN pseudo-interface:
+
+        ifconfig <vlan_name> create
+
+  To associate the VLAN pseudo-interface with a physical interface
+  and assign a VLAN ID, IP address, and netmask:
+
+        ifconfig <vlan_name> <ip_address> netmask <subnet_mask> vlan
+           <vlan_id> vlandev <physical_interface>
+
+  Example:
+
+        ifconfig vlan10 10.0.0.1 netmask 255.255.255.0 vlan 10 vlandev ixl0
+
+  In this example, all packets will be marked on egress with
+  802.1Q VLAN tags, specifying a VLAN ID of 10.
+
+  To remove a VLAN pseudo-interface:
+
+        ifconfig <vlan_name> destroy
+
+
+  Checksum Offload
+  ----------------
+    
+  Checksum offloading supports IPv4 and IPv6 with TCP and UDP packets
+  and is supported for both transmit and receive. Checksum offloading
+  for transmit and receive is enabled by default for both IPv4 and IPv6.
+
+  Checksum offloading can be enabled or disabled using ifconfig.
+  Transmit and receive offloading for IPv4 and IPv6 are enabled
+  and disabled separately.
+
+  NOTE: TSO requires Tx checksum, so when Tx checksum
+        is disabled, TSO will also  be disabled. 
+
+  To enable Tx checksum offloading for ipv4:
+
+         ifconfig ixl<interface_num> txcsum4 
+
+  To disable Tx checksum offloading for ipv4:
+         
+         ifconfig ixl<interface_num> -txcsum4 
+         (NOTE: This will disable TSO4)
+
+  To enable Rx checksum offloading for ipv6:
+ 
+         ifconfig ixl<interface_num> rxcsum6 
+         
+  To disable Rx checksum offloading for ipv6:
+
+         ifconfig ixl<interface_num> -rxcsum6 
+         (NOTE: This will disable TSO6)
+
+  
+  To confirm the current settings:
+
+         ifconfig ixl<interface_num>
+
+  
+  TSO
+  ---
+
+  TSO supports both IPv4 and IPv6 and is enabled by default. TSO can
+  be disabled and enabled using the ifconfig utility.
+
+  NOTE: TSO requires Tx checksum, so when Tx checksum is
+      disabled, TSO will also be disabled. 
+
+  To disable TSO IPv4:
+
+         ifconfig ixl<interface_num> -tso4
+         
+  To enable TSO IPv4:
+
+         ifconfig ixl<interface_num> tso4 
+
+  To disable TSO IPv6:
+
+         ifconfig ixl<interface_num> -tso6
+
+  To enable TSO IPv6:
+        
+         ifconfig ixl<interface_num> tso6
+
+  To disable BOTH TSO IPv4 and IPv6:
+
+         ifconfig ixl<interface_num> -tso
+
+  To enable BOTH TSO IPv4 and IPv6:
+  
+         ifconfig ixl<interface_num> tso
+
+
+  LRO
+  ---
+
+  Large Receive Offload is enabled by default. It can be enabled
+  or disabled by using the ifconfig utility.
+
+  NOTE: LRO should be disabled when forwarding packets.
+
+  To disable LRO:	
+
+         ifconfig ixl<interface_num> -lro 
+
+  To enable LRO:
+
+         ifconfig ixl<interface_num> lro 
+
+
+Flow Control  (IXL only)
+------------
+Flow control is disabled by default. To change flow control settings use sysctl.
+
+To enable flow control to Rx pause frames:     
+
+         sysctl dev.ixl.<interface_num>.fc=1
+
+To enable flow control to Tx pause frames: 
+
+         sysctl dev.ixl.<interface_num>.fc=2
+
+To enable flow control to Rx and Tx pause frames:
+
+         sysctl dev.ixl.<interface_num>.fc=3
+
+To disable flow control:
+
+         sysctl dev.ixl.<interface_num>.fc=0
+    
+
+NOTE: You must have a flow control capable link partner.
+
+NOTE: The VF driver does not have access to flow control, it must be
+	managed from the host side.
+
+   
+  Important system configuration changes:
+  =======================================
+ 
+-Change the file /etc/sysctl.conf, and add the line:  
+ 
+         hw.intr_storm_threshold: 0 (the default is 1000)
+
+-Best throughput results are seen with a large MTU; use 9706 if possible. 
+
+-The default number of descriptors per ring is 1024, increasing this may
+improve performance depending on the use case.
+
+-The VF driver uses a relatively large buf ring, this was found to eliminate
+ UDP transmit errors, it is a tuneable, and if no UDP traffic is used it can
+ be reduced. It is memory used per queue.
+
+
+Known Limitations
+=================
+
+Network Memory Buffer allocation
+--------------------------------
+  FreeBSD may have a low number of network memory buffers (mbufs) by default.
+If your mbuf value is too low, it may cause the driver to fail to initialize
+and/or cause the system to become unresponsive. You can check to see if the
+system is mbuf-starved by running 'netstat -m'. Increase the number of mbufs
+by editing the lines below in /etc/sysctl.conf:
+
+         kern.ipc.nmbclusters
+         kern.ipc.nmbjumbop    
+         kern.ipc.nmbjumbo9
+         kern.ipc.nmbjumbo16
+         kern.ipc.nmbufs
+
+The amount of memory that you allocate is system specific, and may
+require some trial and error.
+
+Also, increasing the following in /etc/sysctl.conf could help increase
+network performance:
+         
+         kern.ipc.maxsockbuf
+         net.inet.tcp.sendspace
+         net.inet.tcp.recvspace
+         net.inet.udp.maxdgram
+         net.inet.udp.recvspace
+                  
+
+UDP Stress Test Dropped Packet Issue
+------------------------------------
+Under small packet UDP stress test with the ixl driver, the FreeBSD system
+may drop UDP packets due to the fullness of socket buffers. You may want to
+change the driver's Flow Control variables to the minimum value for
+controlling packet reception.
+
+
+Disable LRO when routing/bridging
+---------------------------------
+LRO must be turned off when forwarding traffic.
+
+
+Lower than expected performance
+-------------------------------
+Some PCIe x8 slots are actually configured as x4 slots. These slots have
+insufficient bandwidth for full line rate with dual port and quad port
+devices.
+
+In addition, if you put a PCIe Generation 3-capable adapter into a PCIe
+Generation 2 slot, you cannot get full bandwidth. The driver detects this
+situation and writes the following message in the system log:
+
+  "PCI-Express bandwidth available for this card is not sufficient for
+   optimal  performance. For optimal performance a x8 PCI-Express slot
+   is required."
+
+If this error occurs, moving your adapter to a true PCIe Generation 3 x8
+slot will resolve the issue.
+
+
+Support
+=======
+
+For general information and support, go to the Intel support website at:
+
+        http://support.intel.com
+
+If an issue is identified with the released source code on the supported kernel
+with a supported adapter, email the specific information related to the issue
+to freebsdnic@mailbox.intel.com.
+
+
+License
+=======
+
+This software program is released under the terms of a license agreement
+between you ('Licensee') and Intel. Do not use or load this software or any
+associated  materials (collectively, the 'Software') until you have carefully
+read the full terms and conditions of the LICENSE located in this software
+package. By loading or using the Software, you agree to the terms of this
+Agreement. If you do not agree with the terms of this Agreement, do not
+install or use the Software.
+
+* Other names and brands may be claimed as the property of others.
+
+
diff --git a/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..04c551f1b2
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE
@@ -0,0 +1,29 @@
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..7a9537b10e
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+i40e DRIVER
diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq.c b/usr/src/uts/common/io/i40e/core/i40e_adminq.c
new file mode 100644
index 0000000000..67b72fd9f2
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_adminq.c
@@ -0,0 +1,1101 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_adminq.c 284049 2015-06-05 22:52:42Z jfv $*/
+
+#include "i40e_status.h"
+#include "i40e_type.h"
+#include "i40e_register.h"
+#include "i40e_adminq.h"
+#include "i40e_prototype.h"
+
+/**
+ * i40e_is_nvm_update_op - TRUE for an NVM erase or NVM update descriptor
+ * @desc: API request descriptor whose (little-endian) opcode is examined
+ **/
+static INLINE bool i40e_is_nvm_update_op(struct i40e_aq_desc *desc)
+{
+	return (desc->opcode == CPU_TO_LE16(i40e_aqc_opc_nvm_update) ||
+		desc->opcode == CPU_TO_LE16(i40e_aqc_opc_nvm_erase));
+}
+
+/**
+ *  i40e_adminq_init_regs - Record the AdminQ register offsets
+ *  @hw: pointer to the hardware structure
+ *
+ *  Stores the VF or PF register offsets in hw->aq; no register is accessed
+ **/
+static void i40e_adminq_init_regs(struct i40e_hw *hw)
+{
+	if (i40e_is_vf(hw)) {
+		hw->aq.asq.head = I40E_VF_ATQH1;
+		hw->aq.asq.tail = I40E_VF_ATQT1;
+		hw->aq.asq.bal  = I40E_VF_ATQBAL1;
+		hw->aq.asq.bah  = I40E_VF_ATQBAH1;
+		hw->aq.asq.len  = I40E_VF_ATQLEN1;
+		hw->aq.arq.head = I40E_VF_ARQH1;
+		hw->aq.arq.tail = I40E_VF_ARQT1;
+		hw->aq.arq.bal  = I40E_VF_ARQBAL1;
+		hw->aq.arq.bah  = I40E_VF_ARQBAH1;
+		hw->aq.arq.len  = I40E_VF_ARQLEN1;
+		return;
+	}
+
+	hw->aq.asq.head = I40E_PF_ATQH;
+	hw->aq.asq.tail = I40E_PF_ATQT;
+	hw->aq.asq.bal  = I40E_PF_ATQBAL;
+	hw->aq.asq.bah  = I40E_PF_ATQBAH;
+	hw->aq.asq.len  = I40E_PF_ATQLEN;
+	hw->aq.arq.head = I40E_PF_ARQH;
+	hw->aq.arq.tail = I40E_PF_ARQT;
+	hw->aq.arq.bal  = I40E_PF_ARQBAL;
+	hw->aq.arq.bah  = I40E_PF_ARQBAH;
+	hw->aq.arq.len  = I40E_PF_ARQLEN;
+}
+
+/**
+ *  i40e_alloc_adminq_asq_ring - Allocate the send ring and its details array
+ *  @hw: pointer to the hardware structure
+ **/
+enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw)
+{
+	enum i40e_status_code status;
+
+	/* descriptor ring: num_asq_entries descriptors in DMA memory */
+	status = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf,
+				       i40e_mem_atq_ring,
+				       (hw->aq.num_asq_entries *
+				       sizeof(struct i40e_aq_desc)),
+				       I40E_ADMINQ_DESC_ALIGNMENT);
+	if (status)
+		return status;
+
+	/* command details array: one i40e_asq_cmd_details per entry */
+	status = i40e_allocate_virt_mem(hw, &hw->aq.asq.cmd_buf,
+					(hw->aq.num_asq_entries *
+					sizeof(struct i40e_asq_cmd_details)));
+	if (status)
+		i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
+
+	return status;
+}
+
+/**
+ *  i40e_alloc_adminq_arq_ring - Allocate Admin Queue receive rings
+ *  @hw: pointer to the hardware structure
+ *
+ *  Allocates DMA memory for hw->aq.num_arq_entries descriptors and stores
+ *  it in hw->aq.arq.desc_buf.
+ *  Returns the status code of i40e_allocate_dma_mem() unchanged.
+ **/
+enum i40e_status_code i40e_alloc_adminq_arq_ring(struct i40e_hw *hw)
+{
+	return i40e_allocate_dma_mem(hw, &hw->aq.arq.desc_buf,
+				     i40e_mem_arq_ring,
+				     (hw->aq.num_arq_entries *
+				     sizeof(struct i40e_aq_desc)),
+				     I40E_ADMINQ_DESC_ALIGNMENT);
+}
+
+/**
+ *  i40e_free_adminq_asq - Free Admin Queue send rings
+ *  @hw: pointer to the hardware structure
+ *
+ *  Frees only the descriptor ring, hw->aq.asq.desc_buf.  This assumes the
+ *  posted send buffers have already been cleaned and de-allocated
+ **/
+void i40e_free_adminq_asq(struct i40e_hw *hw)
+{
+	i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
+}
+
+/**
+ *  i40e_free_adminq_arq - Free Admin Queue receive rings
+ *  @hw: pointer to the hardware structure
+ *
+ *  Frees only the descriptor ring, hw->aq.arq.desc_buf.  This assumes the
+ *  posted receive buffers have already been cleaned and de-allocated
+ **/
+void i40e_free_adminq_arq(struct i40e_hw *hw)
+{
+	i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf);
+}
+
+/**
+ *  i40e_alloc_arq_bufs - Allocate pre-posted buffers for the receive queue
+ *  @hw: pointer to the hardware structure
+ *
+ *  On failure everything allocated here is released before returning.
+ **/
+static enum i40e_status_code i40e_alloc_arq_bufs(struct i40e_hw *hw)
+{
+	enum i40e_status_code status;
+	struct i40e_aq_desc *rx_desc;
+	struct i40e_dma_mem *buf;
+	u16 flags = I40E_AQ_FLAG_BUF;
+	int idx;
+
+	/* Buffer-info array first (plain virtual memory, no alignment
+	 * requirement), then one mapped buffer per ring entry.
+	 */
+	status = i40e_allocate_virt_mem(hw, &hw->aq.arq.dma_head,
+		(hw->aq.num_arq_entries * sizeof(struct i40e_dma_mem)));
+	if (status)
+		return status;
+	hw->aq.arq.r.arq_bi = (struct i40e_dma_mem *)hw->aq.arq.dma_head.va;
+
+	/* every descriptor carries the same flags */
+	if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
+		flags |= I40E_AQ_FLAG_LB;
+
+	for (idx = 0; idx < hw->aq.num_arq_entries; idx++) {
+		buf = &hw->aq.arq.r.arq_bi[idx];
+		status = i40e_allocate_dma_mem(hw, buf,
+					       i40e_mem_arq_buf,
+					       hw->aq.arq_buf_size,
+					       I40E_ADMINQ_DESC_ALIGNMENT);
+		if (status)
+			goto unwind_alloc_arq_bufs;
+
+		/* point descriptor idx at its freshly mapped buffer */
+		rx_desc = I40E_ADMINQ_DESC(hw->aq.arq, idx);
+
+		rx_desc->flags = CPU_TO_LE16(flags);
+		rx_desc->opcode = 0;
+		/* The buffer size is advertised through datalen; the Admin
+		 * queue design has no register for it
+		 */
+		rx_desc->datalen = CPU_TO_LE16((u16)buf->size);
+		rx_desc->retval = 0;
+		rx_desc->cookie_high = 0;
+		rx_desc->cookie_low = 0;
+		rx_desc->params.external.addr_high =
+			CPU_TO_LE32(I40E_HI_DWORD(buf->pa));
+		rx_desc->params.external.addr_low =
+			CPU_TO_LE32(I40E_LO_DWORD(buf->pa));
+		rx_desc->params.external.param0 = 0;
+		rx_desc->params.external.param1 = 0;
+	}
+
+	return status;
+
+unwind_alloc_arq_bufs:
+	/* entry idx itself failed, so release only the ones before it */
+	while (--idx >= 0)
+		i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[idx]);
+	i40e_free_virt_mem(hw, &hw->aq.arq.dma_head);
+
+	return status;
+}
+
+/**
+ *  i40e_alloc_asq_bufs - Allocate the buffers used by the send queue
+ *  @hw: pointer to the hardware structure
+ *
+ *  Allocates the buffer-info array and one mapped buffer per queue entry.
+ *  On failure everything allocated here is released before returning.
+ **/
+static enum i40e_status_code i40e_alloc_asq_bufs(struct i40e_hw *hw)
+{
+	enum i40e_status_code status;
+	int idx;
+
+	/* buffer-info array: one i40e_dma_mem per send queue entry */
+	status = i40e_allocate_virt_mem(hw, &hw->aq.asq.dma_head,
+		(hw->aq.num_asq_entries * sizeof(struct i40e_dma_mem)));
+	if (status)
+		return status;
+	hw->aq.asq.r.asq_bi = (struct i40e_dma_mem *)hw->aq.asq.dma_head.va;
+
+	/* one mapped buffer (size asq_buf_size) behind each entry */
+	for (idx = 0; idx < hw->aq.num_asq_entries; idx++) {
+		status = i40e_allocate_dma_mem(hw, &hw->aq.asq.r.asq_bi[idx],
+					       i40e_mem_asq_buf,
+					       hw->aq.asq_buf_size,
+					       I40E_ADMINQ_DESC_ALIGNMENT);
+		if (status)
+			goto unwind_alloc_asq_bufs;
+	}
+
+	return status;
+
+unwind_alloc_asq_bufs:
+	/* entry idx itself failed, so release only the ones before it */
+	while (--idx >= 0)
+		i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[idx]);
+	i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);
+
+	return status;
+}
+
+/**
+ *  i40e_free_arq_bufs - Release all receive queue memory
+ *  @hw: pointer to the hardware structure
+ **/
+static void i40e_free_arq_bufs(struct i40e_hw *hw)
+{
+	int idx;
+
+	/* the mapped buffer behind every ring entry */
+	for (idx = 0; idx < hw->aq.num_arq_entries; idx++)
+		i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[idx]);
+
+	/* the descriptor ring itself */
+	i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf);
+
+	/* the buffer-info array walked above goes last */
+	i40e_free_virt_mem(hw, &hw->aq.arq.dma_head);
+}
+
+/**
+ *  i40e_free_asq_bufs - Release all send queue memory
+ *  @hw: pointer to the hardware structure
+ **/
+static void i40e_free_asq_bufs(struct i40e_hw *hw)
+{
+	struct i40e_dma_mem *buf;
+	int idx;
+
+	/* release each per-entry buffer that has a physical address set */
+	for (idx = 0; idx < hw->aq.num_asq_entries; idx++) {
+		buf = &hw->aq.asq.r.asq_bi[idx];
+		if (buf->pa)
+			i40e_free_dma_mem(hw, buf);
+	}
+
+	/* command details array and descriptor ring */
+	i40e_free_virt_mem(hw, &hw->aq.asq.cmd_buf);
+	i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
+	/* the buffer-info array walked above goes last */
+	i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);
+}
+
+/**
+ *  i40e_config_asq_regs - configure ASQ registers
+ *  @hw: pointer to the hardware structure
+ *
+ *  Programs head/tail, length and base address of the send queue; returns
+ *  I40E_ERR_ADMIN_QUEUE_ERROR if the low base address does not read back
+ **/
+static enum i40e_status_code i40e_config_asq_regs(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u32 reg = 0;
+
+	/* Clear Head and Tail */
+	wr32(hw, hw->aq.asq.head, 0);
+	wr32(hw, hw->aq.asq.tail, 0);
+
+	/* entry count plus enable bit; PF and VF use different masks */
+	if (!i40e_is_vf(hw))
+		wr32(hw, hw->aq.asq.len, (hw->aq.num_asq_entries |
+					  I40E_PF_ATQLEN_ATQENABLE_MASK));
+	if (i40e_is_vf(hw))
+		wr32(hw, hw->aq.asq.len, (hw->aq.num_asq_entries |
+					  I40E_VF_ATQLEN1_ATQENABLE_MASK));
+	wr32(hw, hw->aq.asq.bal, I40E_LO_DWORD(hw->aq.asq.desc_buf.pa));
+	wr32(hw, hw->aq.asq.bah, I40E_HI_DWORD(hw->aq.asq.desc_buf.pa));
+
+	/* Check one register to verify that config was applied */
+	reg = rd32(hw, hw->aq.asq.bal);
+	if (reg != I40E_LO_DWORD(hw->aq.asq.desc_buf.pa))
+		ret_code = I40E_ERR_ADMIN_QUEUE_ERROR;
+
+	return ret_code;
+}
+
+/**
+ *  i40e_config_arq_regs - ARQ register configuration
+ *  @hw: pointer to the hardware structure
+ *
+ *  Programs head/tail, length and base address of the receive (event) queue
+ **/
+static enum i40e_status_code i40e_config_arq_regs(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u32 reg = 0;
+
+	/* Clear Head and Tail */
+	wr32(hw, hw->aq.arq.head, 0);
+	wr32(hw, hw->aq.arq.tail, 0);
+
+	/* entry count plus enable bit; PF and VF use different masks */
+	if (!i40e_is_vf(hw))
+		wr32(hw, hw->aq.arq.len, (hw->aq.num_arq_entries |
+					  I40E_PF_ARQLEN_ARQENABLE_MASK));
+	if (i40e_is_vf(hw))
+		wr32(hw, hw->aq.arq.len, (hw->aq.num_arq_entries |
+					  I40E_VF_ARQLEN1_ARQENABLE_MASK));
+	wr32(hw, hw->aq.arq.bal, I40E_LO_DWORD(hw->aq.arq.desc_buf.pa));
+	wr32(hw, hw->aq.arq.bah, I40E_HI_DWORD(hw->aq.arq.desc_buf.pa));
+
+	/* Update tail in the HW to post pre-allocated buffers */
+	wr32(hw, hw->aq.arq.tail, hw->aq.num_arq_entries - 1);
+
+	/* Check one register to verify that config was applied */
+	reg = rd32(hw, hw->aq.arq.bal);
+	if (reg != I40E_LO_DWORD(hw->aq.arq.desc_buf.pa))
+		ret_code = I40E_ERR_ADMIN_QUEUE_ERROR;
+
+	return ret_code;
+}
+
+/**
+ *  i40e_init_asq - main initialization routine for ASQ
+ *  @hw: pointer to the hardware structure
+ *
+ *  This is the main initialization routine for the Admin Send Queue
+ *  Prior to calling this function, drivers *MUST* set the following fields
+ *  in the hw->aq structure:
+ *     - hw->aq.num_asq_entries
+ *     - hw->aq.asq_buf_size
+ *
+ *  Do *NOT* hold the lock when calling this as the memory allocation routines
+ *  called are not going to be atomic context safe
+ **/
+enum i40e_status_code i40e_init_asq(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+
+	if (hw->aq.asq.count > 0) {
+		/* queue already initialized */
+		ret_code = I40E_ERR_NOT_READY;
+		goto init_adminq_exit;
+	}
+
+	/* verify input for valid configuration */
+	if ((hw->aq.num_asq_entries == 0) ||
+	    (hw->aq.asq_buf_size == 0)) {
+		ret_code = I40E_ERR_CONFIG;
+		goto init_adminq_exit;
+	}
+
+	hw->aq.asq.next_to_use = 0;
+	hw->aq.asq.next_to_clean = 0;
+
+	/* allocate the ring memory */
+	ret_code = i40e_alloc_adminq_asq_ring(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_exit;
+
+	/* allocate buffers in the rings */
+	ret_code = i40e_alloc_asq_bufs(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_rings;
+
+	/* initialize base registers */
+	ret_code = i40e_config_asq_regs(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_rings;
+
+	/* success: only now does a non-zero count mark the queue as live */
+	hw->aq.asq.count = hw->aq.num_asq_entries;
+	goto init_adminq_exit;
+
+init_adminq_free_rings:
+	i40e_free_adminq_asq(hw);
+
+init_adminq_exit:
+	return ret_code;
+}
+
+/**
+ *  i40e_init_arq - initialize ARQ
+ *  @hw: pointer to the hardware structure
+ *
+ *  The main initialization routine for the Admin Receive (Event) Queue.
+ *  Prior to calling this function, drivers *MUST* set the following fields
+ *  in the hw->aq structure:
+ *     - hw->aq.num_arq_entries
+ *     - hw->aq.arq_buf_size
+ *
+ *  Do *NOT* hold the lock when calling this as the memory allocation routines
+ *  called are not going to be atomic context safe
+ **/
+enum i40e_status_code i40e_init_arq(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+
+	if (hw->aq.arq.count > 0) {
+		/* queue already initialized */
+		ret_code = I40E_ERR_NOT_READY;
+		goto init_adminq_exit;
+	}
+
+	/* verify input for valid configuration */
+	if ((hw->aq.num_arq_entries == 0) ||
+	    (hw->aq.arq_buf_size == 0)) {
+		ret_code = I40E_ERR_CONFIG;
+		goto init_adminq_exit;
+	}
+
+	hw->aq.arq.next_to_use = 0;
+	hw->aq.arq.next_to_clean = 0;
+
+	/* allocate the ring memory */
+	ret_code = i40e_alloc_adminq_arq_ring(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_exit;
+
+	/* allocate buffers in the rings */
+	ret_code = i40e_alloc_arq_bufs(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_rings;
+
+	/* initialize base registers */
+	ret_code = i40e_config_arq_regs(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_rings;
+
+	/* success: only now does a non-zero count mark the queue as live */
+	hw->aq.arq.count = hw->aq.num_arq_entries;
+	goto init_adminq_exit;
+
+init_adminq_free_rings:
+	i40e_free_adminq_arq(hw);
+
+init_adminq_exit:
+	return ret_code;
+}
+
+/**
+ *  i40e_shutdown_asq - shutdown the ASQ
+ *  @hw: pointer to the hardware structure
+ *
+ *  Shuts down the Admin Send Queue; I40E_ERR_NOT_READY if its count is 0
+ **/
+enum i40e_status_code i40e_shutdown_asq(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+
+	if (hw->aq.asq.count == 0)
+		return I40E_ERR_NOT_READY;
+
+	/* Stop firmware AdminQ processing by zeroing the queue registers */
+	wr32(hw, hw->aq.asq.head, 0);
+	wr32(hw, hw->aq.asq.tail, 0);
+	wr32(hw, hw->aq.asq.len, 0);
+	wr32(hw, hw->aq.asq.bal, 0);
+	wr32(hw, hw->aq.asq.bah, 0);
+
+	/* make sure spinlock is available */
+	i40e_acquire_spinlock(&hw->aq.asq_spinlock);
+
+	hw->aq.asq.count = 0; /* to indicate uninitialized queue */
+
+	/* free ring buffers */
+	i40e_free_asq_bufs(hw);
+
+	i40e_release_spinlock(&hw->aq.asq_spinlock);
+
+	return ret_code;
+}
+
+/**
+ *  i40e_shutdown_arq - shutdown ARQ
+ *  @hw: pointer to the hardware structure
+ *
+ *  Shuts down the Admin Receive Queue; I40E_ERR_NOT_READY if its count is 0
+ **/
+enum i40e_status_code i40e_shutdown_arq(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+
+	if (hw->aq.arq.count == 0)
+		return I40E_ERR_NOT_READY;
+
+	/* Stop firmware AdminQ processing by zeroing the queue registers */
+	wr32(hw, hw->aq.arq.head, 0);
+	wr32(hw, hw->aq.arq.tail, 0);
+	wr32(hw, hw->aq.arq.len, 0);
+	wr32(hw, hw->aq.arq.bal, 0);
+	wr32(hw, hw->aq.arq.bah, 0);
+
+	/* make sure spinlock is available */
+	i40e_acquire_spinlock(&hw->aq.arq_spinlock);
+
+	hw->aq.arq.count = 0; /* to indicate uninitialized queue */
+
+	/* free ring buffers */
+	i40e_free_arq_bufs(hw);
+
+	i40e_release_spinlock(&hw->aq.arq_spinlock);
+
+	return ret_code;
+}
+
+/**
+ *  i40e_init_adminq - main initialization routine for Admin Queue
+ *  @hw: pointer to the hardware structure
+ *
+ *  Prior to calling this function, drivers *MUST* set the following fields
+ *  in the hw->aq structure:
+ *     - hw->aq.num_asq_entries
+ *     - hw->aq.num_arq_entries
+ *     - hw->aq.arq_buf_size
+ *     - hw->aq.asq_buf_size
+ **/
+enum i40e_status_code i40e_init_adminq(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code;
+	u16 eetrack_lo = 0, eetrack_hi = 0;
+	u16 cfg_ptr = 0, oem_hi = 0, oem_lo = 0;
+	int retry = 0;
+	/* verify input for valid configuration */
+	if ((hw->aq.num_arq_entries == 0) ||
+	    (hw->aq.num_asq_entries == 0) ||
+	    (hw->aq.arq_buf_size == 0) ||
+	    (hw->aq.asq_buf_size == 0)) {
+		ret_code = I40E_ERR_CONFIG;
+		goto init_adminq_exit;
+	}
+
+	/* initialize spin locks */
+	i40e_init_spinlock(&hw->aq.asq_spinlock);
+	i40e_init_spinlock(&hw->aq.arq_spinlock);
+
+	/* Set up register offsets */
+	i40e_adminq_init_regs(hw);
+
+	/* setup ASQ command write back timeout */
+	hw->aq.asq_cmd_timeout = I40E_ASQ_CMD_TIMEOUT;
+
+	/* allocate the ASQ */
+	ret_code = i40e_init_asq(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_destroy_spinlocks;
+
+	/* allocate the ARQ */
+	ret_code = i40e_init_arq(hw);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_asq;
+
+	/* VF has no need of firmware */
+	if (i40e_is_vf(hw))
+		goto init_adminq_exit;
+	/* There are some cases where the firmware may not be quite ready
+	 * for AdminQ operations, so we retry the AdminQ setup a few times
+	 * if we see timeouts in this first AQ call.
+	 */
+	do {
+		ret_code = i40e_aq_get_firmware_version(hw,
+							&hw->aq.fw_maj_ver,
+							&hw->aq.fw_min_ver,
+							&hw->aq.fw_build,
+							&hw->aq.api_maj_ver,
+							&hw->aq.api_min_ver,
+							NULL);
+		if (ret_code != I40E_ERR_ADMIN_QUEUE_TIMEOUT)
+			break;
+		retry++;
+		i40e_msec_delay(100);
+		i40e_resume_aq(hw);
+	} while (retry < 10);
+	if (ret_code != I40E_SUCCESS)
+		goto init_adminq_free_arq;
+
+	/* get the NVM version info; read status is ignored, hence the 0s */
+	i40e_read_nvm_word(hw, I40E_SR_NVM_DEV_STARTER_VERSION,
+			   &hw->nvm.version);
+	i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_LO, &eetrack_lo);
+	i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_HI, &eetrack_hi);
+	hw->nvm.eetrack = ((u32)eetrack_hi << 16) | eetrack_lo;
+	i40e_read_nvm_word(hw, I40E_SR_BOOT_CONFIG_PTR, &cfg_ptr);
+	i40e_read_nvm_word(hw, (cfg_ptr + I40E_NVM_OEM_VER_OFF),
+			   &oem_hi);
+	i40e_read_nvm_word(hw, (cfg_ptr + (I40E_NVM_OEM_VER_OFF + 1)),
+			   &oem_lo);
+	hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo;
+
+	if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) {
+		ret_code = I40E_ERR_FIRMWARE_API_VERSION;
+		goto init_adminq_free_arq;
+	}
+
+	/* pre-emptive resource lock release */
+	i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL);
+	hw->aq.nvm_release_on_done = FALSE;
+	hw->nvmupd_state = I40E_NVMUPD_STATE_INIT;
+
+	ret_code = i40e_aq_set_hmc_resource_profile(hw,
+						    I40E_HMC_PROFILE_DEFAULT,
+						    0,
+						    NULL);
+	ret_code = I40E_SUCCESS;
+
+	/* success: the HMC profile status above is discarded */
+	goto init_adminq_exit;
+
+init_adminq_free_arq:
+	i40e_shutdown_arq(hw);
+init_adminq_free_asq:
+	i40e_shutdown_asq(hw);
+init_adminq_destroy_spinlocks:
+	i40e_destroy_spinlock(&hw->aq.asq_spinlock);
+	i40e_destroy_spinlock(&hw->aq.arq_spinlock);
+
+init_adminq_exit:
+	return ret_code;
+}
+
+/**
+ *  i40e_shutdown_adminq - shutdown routine for the Admin Queue
+ *  @hw: pointer to the hardware structure
+ *
+ *  Always returns I40E_SUCCESS; the results of the individual teardown
+ *  steps are not reported.
+ **/
+enum i40e_status_code i40e_shutdown_adminq(struct i40e_hw *hw)
+{
+	/* the queue-shutdown command is sent only while the ASQ is alive */
+	if (i40e_check_asq_alive(hw))
+		i40e_aq_queue_shutdown(hw, TRUE);
+
+	i40e_shutdown_asq(hw);
+	i40e_shutdown_arq(hw);
+
+	i40e_destroy_spinlock(&hw->aq.asq_spinlock);
+	i40e_destroy_spinlock(&hw->aq.arq_spinlock);
+
+	if (hw->nvm_buff.va)
+		i40e_free_virt_mem(hw, &hw->nvm_buff);
+
+	return I40E_SUCCESS;
+}
+
+/**
+ *  i40e_clean_asq - cleans Admin send queue
+ *  @hw: pointer to the hardware structure
+ *
+ *  Walks from next_to_clean up to the head register, running any stored
+ *  callback and zeroing each slot.  Returns the number of free desc.
+ **/
+u16 i40e_clean_asq(struct i40e_hw *hw)
+{
+	struct i40e_adminq_ring *ring = &(hw->aq.asq);
+	struct i40e_asq_cmd_details *slot_details;
+	struct i40e_aq_desc *slot_desc;
+	struct i40e_aq_desc cb_copy;
+	u16 idx = ring->next_to_clean;
+
+	slot_desc = I40E_ADMINQ_DESC(*ring, idx);
+	slot_details = I40E_ADMINQ_DETAILS(*ring, idx);
+
+	while (rd32(hw, ring->head) != idx) {
+		i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+			   "ntc %d head %d.\n", idx, rd32(hw, ring->head));
+
+		if (slot_details->callback) {
+			I40E_ADMINQ_CALLBACK notify =
+				(I40E_ADMINQ_CALLBACK)slot_details->callback;
+			i40e_memcpy(&cb_copy, slot_desc, sizeof(cb_copy),
+				    I40E_DMA_TO_DMA);
+			notify(hw, &cb_copy);
+		}
+		i40e_memset(slot_desc, 0, sizeof(*slot_desc), I40E_DMA_MEM);
+		i40e_memset(slot_details, 0, sizeof(*slot_details),
+			    I40E_NONDMA_MEM);
+		idx = (idx + 1 == ring->count) ? 0 : idx + 1;
+		slot_desc = I40E_ADMINQ_DESC(*ring, idx);
+		slot_details = I40E_ADMINQ_DETAILS(*ring, idx);
+	}
+
+	ring->next_to_clean = idx;
+
+	return I40E_DESC_UNUSED(ring);
+}
+
+/**
+ *  i40e_asq_done - has the FW caught up with the Admin Send Queue?
+ *  @hw: pointer to the hw struct
+ *
+ *  Returns TRUE when the ASQ head register equals next_to_use, i.e. every
+ *  descriptor handed to the firmware has been processed, FALSE otherwise.
+ **/
+bool i40e_asq_done(struct i40e_hw *hw)
+{
+	u32 head;
+
+	/* the head register is preferred over the DD bit for timing */
+	head = rd32(hw, hw->aq.asq.head);
+	return head == hw->aq.asq.next_to_use;
+}
+
+/**
+ *  i40e_asq_send_command - send command to Admin Queue
+ *  @hw: pointer to the hw struct
+ *  @desc: prefilled descriptor describing the command (non DMA mem)
+ *  @buff: buffer to use for indirect commands
+ *  @buff_size: size of buffer for indirect commands
+ *  @cmd_details: pointer to command details structure
+ *
+ *  Main send routine for the Admin Send Queue.  Unless cmd_details requests
+ *  async or postpone, it waits for completion, up to asq_cmd_timeout polls
+ **/
+enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw,
+				struct i40e_aq_desc *desc,
+				void *buff, /* can be NULL */
+				u16  buff_size,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	enum i40e_status_code status = I40E_SUCCESS;
+	struct i40e_dma_mem *dma_buff = NULL;
+	struct i40e_asq_cmd_details *details;
+	struct i40e_aq_desc *desc_on_ring;
+	bool cmd_completed = FALSE;
+	u16  retval = 0;
+	u32  val = 0;
+
+	hw->aq.asq_last_status = I40E_AQ_RC_OK;
+
+	val = rd32(hw, hw->aq.asq.head);
+	if (val >= hw->aq.num_asq_entries) {
+		i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: head overrun at %d\n", val);
+		status = I40E_ERR_QUEUE_EMPTY;
+		goto asq_send_command_exit;
+	}
+
+	if (hw->aq.asq.count == 0) {
+		i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: Admin queue not initialized.\n");
+		status = I40E_ERR_QUEUE_EMPTY;
+		goto asq_send_command_exit;
+	}
+
+	details = I40E_ADMINQ_DETAILS(hw->aq.asq, hw->aq.asq.next_to_use);
+	if (cmd_details) {
+		i40e_memcpy(details,
+			    cmd_details,
+			    sizeof(struct i40e_asq_cmd_details),
+			    I40E_NONDMA_TO_NONDMA);
+
+		/* If the cmd_details carry a cookie, copy it into the
+		 * descriptor.  The cookie is ignored by the FW and only used
+		 * by the driver, so the CPU_TO_LE32 below is not required
+		 */
+		if (details->cookie) {
+			desc->cookie_high =
+				CPU_TO_LE32(I40E_HI_DWORD(details->cookie));
+			desc->cookie_low =
+				CPU_TO_LE32(I40E_LO_DWORD(details->cookie));
+		}
+	} else {
+		i40e_memset(details, 0,
+			    sizeof(struct i40e_asq_cmd_details),
+			    I40E_NONDMA_MEM);
+	}
+
+	/* clear then set the requested flags; NOTE(review): runs unlocked */
+	desc->flags &= ~CPU_TO_LE16(details->flags_dis);
+	desc->flags |= CPU_TO_LE16(details->flags_ena);
+
+	i40e_acquire_spinlock(&hw->aq.asq_spinlock);
+
+	if (buff_size > hw->aq.asq_buf_size) {
+		i40e_debug(hw,
+			   I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: Invalid buffer size: %d.\n",
+			   buff_size);
+		status = I40E_ERR_INVALID_SIZE;
+		goto asq_send_command_error;
+	}
+
+	if (details->postpone && !details->async) {
+		i40e_debug(hw,
+			   I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: Async flag not set along with postpone flag");
+		status = I40E_ERR_PARAM;
+		goto asq_send_command_error;
+	}
+
+	/* call clean and check queue available function to reclaim the
+	 * descriptors that were processed by FW, the function returns the
+	 * number of desc available
+	 */
+	/* the clean function called here could be called in a separate thread
+	 * in case of asynchronous completions
+	 */
+	if (i40e_clean_asq(hw) == 0) {
+		i40e_debug(hw,
+			   I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: Error queue is full.\n");
+		status = I40E_ERR_ADMIN_QUEUE_FULL;
+		goto asq_send_command_error;
+	}
+
+	/* initialize the temp desc pointer with the right desc */
+	desc_on_ring = I40E_ADMINQ_DESC(hw->aq.asq, hw->aq.asq.next_to_use);
+
+	/* if the desc is available copy the temp desc to the right place */
+	i40e_memcpy(desc_on_ring, desc, sizeof(struct i40e_aq_desc),
+		    I40E_NONDMA_TO_DMA);
+
+	/* if buff is not NULL assume indirect command */
+	if (buff != NULL) {
+		dma_buff = &(hw->aq.asq.r.asq_bi[hw->aq.asq.next_to_use]);
+		/* copy the user buff into the respective DMA buff */
+		i40e_memcpy(dma_buff->va, buff, buff_size,
+			    I40E_NONDMA_TO_DMA);
+		desc_on_ring->datalen = CPU_TO_LE16(buff_size);
+
+		/* Update the address values in the desc with the pa value
+		 * for respective buffer
+		 */
+		desc_on_ring->params.external.addr_high =
+				CPU_TO_LE32(I40E_HI_DWORD(dma_buff->pa));
+		desc_on_ring->params.external.addr_low =
+				CPU_TO_LE32(I40E_LO_DWORD(dma_buff->pa));
+	}
+
+	/* bump the tail */
+	i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQTX: desc and buffer:\n");
+	i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc_on_ring,
+		      buff, buff_size);
+	(hw->aq.asq.next_to_use)++;
+	if (hw->aq.asq.next_to_use == hw->aq.asq.count)
+		hw->aq.asq.next_to_use = 0;
+	if (!details->postpone)
+		wr32(hw, hw->aq.asq.tail, hw->aq.asq.next_to_use);
+
+	/* if cmd_details are not defined or async flag is not set,
+	 * we need to wait for desc write back
+	 */
+	if (!details->async && !details->postpone) {
+		u32 total_delay = 0;
+
+		do {
+			/* AQ designers suggest use of head for better
+			 * timing reliability than DD bit
+			 */
+			if (i40e_asq_done(hw))
+				break;
+			/* ugh! delay while spin_lock */
+			i40e_msec_delay(1);
+			total_delay++;
+		} while (total_delay < hw->aq.asq_cmd_timeout);
+	}
+
+	/* if ready, copy the desc back to temp */
+	if (i40e_asq_done(hw)) {
+		i40e_memcpy(desc, desc_on_ring, sizeof(struct i40e_aq_desc),
+			    I40E_DMA_TO_NONDMA);
+		if (buff != NULL)
+			i40e_memcpy(buff, dma_buff->va, buff_size,
+				    I40E_DMA_TO_NONDMA);
+		retval = LE16_TO_CPU(desc->retval);
+		if (retval != 0) {
+			i40e_debug(hw,
+				   I40E_DEBUG_AQ_MESSAGE,
+				   "AQTX: Command completed with error 0x%X.\n",
+				   retval);
+
+			/* strip off FW internal code */
+			retval &= 0xff;
+		}
+		cmd_completed = TRUE;
+		if ((enum i40e_admin_queue_err)retval == I40E_AQ_RC_OK)
+			status = I40E_SUCCESS;
+		else
+			status = I40E_ERR_ADMIN_QUEUE_ERROR;
+		hw->aq.asq_last_status = (enum i40e_admin_queue_err)retval;
+	}
+
+	i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
+		   "AQTX: desc and buffer writeback:\n");
+	i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, buff, buff_size);
+
+	/* save writeback aq if requested */
+	if (details->wb_desc)
+		i40e_memcpy(details->wb_desc, desc_on_ring,
+			    sizeof(struct i40e_aq_desc), I40E_DMA_TO_NONDMA);
+
+	/* update the error if time out occurred */
+	if ((!cmd_completed) &&
+	    (!details->async && !details->postpone)) {
+		i40e_debug(hw,
+			   I40E_DEBUG_AQ_MESSAGE,
+			   "AQTX: Writeback timeout.\n");
+		status = I40E_ERR_ADMIN_QUEUE_TIMEOUT;
+	}
+
+asq_send_command_error:
+	i40e_release_spinlock(&hw->aq.asq_spinlock);
+asq_send_command_exit:
+	return status;
+}
+
+/**
+ *  i40e_fill_default_direct_cmd_desc - AQ descriptor helper function
+ *  @desc:     pointer to the temp descriptor (non DMA mem)
+ *  @opcode:   opcode to store in the descriptor (converted to LE16)
+ *
+ *  Zeroes the descriptor, then sets its opcode and the SI flag
+ **/
+void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc,
+				       u16 opcode)
+{
+	/* start from an all-zero descriptor */
+	i40e_memset(desc, 0, sizeof(*desc), I40E_NONDMA_MEM);
+
+	desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_SI);
+	desc->opcode = CPU_TO_LE16(opcode);
+}
+
+/**
+ *  i40e_clean_arq_element
+ *  @hw: pointer to the hw struct
+ *  @e: event info from the receive descriptor, includes any buffers
+ *  @pending: number of events that could be left to process
+ *
+ *  Cleans one Admin Receive Queue element and returns its contents through
+ *  e; 'pending', if not NULL, receives the number of events left.  e is not
+ *  touched when I40E_ERR_ADMIN_QUEUE_NO_WORK is returned.
+ **/
+enum i40e_status_code i40e_clean_arq_element(struct i40e_hw *hw,
+					     struct i40e_arq_event_info *e,
+					     u16 *pending)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u16 ntc = hw->aq.arq.next_to_clean;
+	struct i40e_aq_desc *desc;
+	struct i40e_dma_mem *bi;
+	u16 datalen;
+	u16 flags;
+	u16 ntu;
+
+	/* take the lock before we start messing with the ring */
+	i40e_acquire_spinlock(&hw->aq.arq_spinlock);
+
+	/* set next_to_use to head */
+	if (!i40e_is_vf(hw))
+		ntu = (rd32(hw, hw->aq.arq.head) & I40E_PF_ARQH_ARQH_MASK);
+	else
+		ntu = (rd32(hw, hw->aq.arq.head) & I40E_VF_ARQH1_ARQH_MASK);
+	if (ntu == ntc) {
+		/* nothing to do - shouldn't need to update ring's values */
+		ret_code = I40E_ERR_ADMIN_QUEUE_NO_WORK;
+		goto clean_arq_element_out;
+	}
+
+	/* now clean the next descriptor */
+	desc = I40E_ADMINQ_DESC(hw->aq.arq, ntc);
+
+	flags = LE16_TO_CPU(desc->flags);
+	if (flags & I40E_AQ_FLAG_ERR) {
+		ret_code = I40E_ERR_ADMIN_QUEUE_ERROR;
+		hw->aq.arq_last_status =
+			(enum i40e_admin_queue_err)LE16_TO_CPU(desc->retval);
+		i40e_debug(hw,
+			   I40E_DEBUG_AQ_MESSAGE,
+			   "AQRX: Event received with error 0x%X.\n",
+			   hw->aq.arq_last_status);
+	}
+
+	i40e_memcpy(&e->desc, desc, sizeof(struct i40e_aq_desc),
+		    I40E_DMA_TO_NONDMA);
+	datalen = LE16_TO_CPU(desc->datalen);
+	e->msg_len = min(datalen, e->buf_len);
+	if (e->msg_buf != NULL && (e->msg_len != 0))
+		i40e_memcpy(e->msg_buf,
+			    hw->aq.arq.r.arq_bi[ntc].va,
+			    e->msg_len, I40E_DMA_TO_NONDMA);
+
+	i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQRX: desc and buffer:\n");
+	i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, e->msg_buf,
+		      hw->aq.arq_buf_size);
+
+	/* Restore the original datalen and buffer address in the desc,
+	 * FW updates datalen to indicate the event message
+	 * size
+	 */
+	bi = &hw->aq.arq.r.arq_bi[ntc];
+	i40e_memset((void *)desc, 0, sizeof(struct i40e_aq_desc), I40E_DMA_MEM);
+
+	desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_BUF);
+	if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
+		desc->flags |= CPU_TO_LE16(I40E_AQ_FLAG_LB);
+	desc->datalen = CPU_TO_LE16((u16)bi->size);
+	desc->params.external.addr_high = CPU_TO_LE32(I40E_HI_DWORD(bi->pa));
+	desc->params.external.addr_low = CPU_TO_LE32(I40E_LO_DWORD(bi->pa));
+
+	/* set tail = the last cleaned desc index. */
+	wr32(hw, hw->aq.arq.tail, ntc);
+	/* ntc is updated to tail + 1 */
+	ntc++;
+	if (ntc == hw->aq.num_arq_entries)
+		ntc = 0;
+	hw->aq.arq.next_to_clean = ntc;
+	hw->aq.arq.next_to_use = ntu;
+
+clean_arq_element_out:
+	/* Set pending if needed, unlock and return */
+	if (pending != NULL)
+		*pending = (ntc > ntu ? hw->aq.arq.count : 0) + (ntu - ntc);
+	i40e_release_spinlock(&hw->aq.arq_spinlock);
+
+	/* e->desc is only fresh if an element was actually cleaned */
+	if (ret_code != I40E_ERR_ADMIN_QUEUE_NO_WORK &&
+	    i40e_is_nvm_update_op(&e->desc)) {
+		if (hw->aq.nvm_release_on_done) {
+			i40e_release_nvm(hw);
+			hw->aq.nvm_release_on_done = FALSE;
+		}
+
+		switch (hw->nvmupd_state) {
+		case I40E_NVMUPD_STATE_INIT_WAIT:
+			hw->nvmupd_state = I40E_NVMUPD_STATE_INIT;
+			break;
+
+		case I40E_NVMUPD_STATE_WRITE_WAIT:
+			hw->nvmupd_state = I40E_NVMUPD_STATE_WRITING;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return ret_code;
+}
+
+/* i40e_resume_aq - reset both ring indices and reprogram the AQ registers */
+void i40e_resume_aq(struct i40e_hw *hw)
+{
+	/* Registers are reset after PF reset: restart both rings at 0 */
+	hw->aq.asq.next_to_clean = 0;
+	hw->aq.asq.next_to_use = 0;
+	hw->aq.arq.next_to_clean = 0;
+	hw->aq.arq.next_to_use = 0;
+
+	/* the status codes of the two config calls are not checked */
+	i40e_config_asq_regs(hw);
+	i40e_config_arq_regs(hw);
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq.h b/usr/src/uts/common/io/i40e/core/i40e_adminq.h
new file mode 100644
index 0000000000..e20d6893ed
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_adminq.h
@@ -0,0 +1,125 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_adminq.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_ADMINQ_H_
+#define _I40E_ADMINQ_H_
+
+#include "i40e_osdep.h"
+#include "i40e_status.h"
+#include "i40e_adminq_cmd.h"
+
+#define I40E_ADMINQ_DESC(R, i)   \
+	(&(((struct i40e_aq_desc *)((R).desc_buf.va))[i])) /* &descriptor[i] of ring R */
+
+#define I40E_ADMINQ_DESC_ALIGNMENT 4096 /* presumably bytes - TODO confirm */
+
+struct i40e_adminq_ring {
+	struct i40e_virt_mem dma_head;	/* space for dma structures */
+	struct i40e_dma_mem desc_buf;	/* descriptor ring memory (see I40E_ADMINQ_DESC) */
+	struct i40e_virt_mem cmd_buf;	/* command buffer memory (see I40E_ADMINQ_DETAILS) */
+
+	union { /* buffer info; the ARQ path indexes arq_bi by descriptor index */
+		struct i40e_dma_mem *asq_bi;
+		struct i40e_dma_mem *arq_bi;
+	} r;
+
+	u16 count;		/* Number of descriptors */
+	u16 rx_buf_len;		/* Admin Receive Queue buffer length */
+
+	/* used for interrupt processing */
+	u16 next_to_use;
+	u16 next_to_clean;
+
+	/* used for queue tracking; tail is handed to wr32() as the register */
+	u32 head;
+	u32 tail;
+	u32 len;
+	u32 bah;
+	u32 bal;
+};
+
+/* ASQ transaction details; I40E_ADMINQ_DETAILS indexes an array of these */
+struct i40e_asq_cmd_details {
+	void *callback; /* cast from type I40E_ADMINQ_CALLBACK */
+	u64 cookie;
+	u16 flags_ena;	/* presumably descriptor flags to force on - TODO confirm */
+	u16 flags_dis;	/* presumably descriptor flags to force off - TODO confirm */
+	bool async;
+	bool postpone;
+	struct i40e_aq_desc *wb_desc; /* presumably gets the write-back desc - TODO confirm */
+};
+
+#define I40E_ADMINQ_DETAILS(R, i)   \
+	(&(((struct i40e_asq_cmd_details *)((R).cmd_buf.va))[i])) /* &details[i] of ring R */
+
+/* ARQ event information */
+struct i40e_arq_event_info {
+	struct i40e_aq_desc desc; /* checked with i40e_is_nvm_update_op() after an ARQ clean */
+	u16 msg_len;	/* presumably bytes of msg_buf actually used - TODO confirm */
+	u16 buf_len;	/* presumably capacity of msg_buf - TODO confirm */
+	u8 *msg_buf;
+};
+
+/* Admin Queue information */
+struct i40e_adminq_info {
+	struct i40e_adminq_ring arq;    /* receive queue */
+	struct i40e_adminq_ring asq;    /* send queue */
+	u32 asq_cmd_timeout;            /* send queue cmd write back timeout */
+	u16 num_arq_entries;            /* receive queue depth */
+	u16 num_asq_entries;            /* send queue depth */
+	u16 arq_buf_size;               /* receive queue buffer size */
+	u16 asq_buf_size;               /* send queue buffer size */
+	u16 fw_maj_ver;                 /* firmware major version */
+	u16 fw_min_ver;                 /* firmware minor version */
+	u32 fw_build;                   /* firmware build number */
+	u16 api_maj_ver;                /* api major version */
+	u16 api_min_ver;                /* api minor version */
+	bool nvm_release_on_done;       /* release NVM once an NVM update op event is cleaned */
+
+	struct i40e_spinlock asq_spinlock; /* Send queue spinlock */
+	struct i40e_spinlock arq_spinlock; /* Receive queue spinlock */
+
+	/* last status values on send and receive queues */
+	enum i40e_admin_queue_err asq_last_status;
+	enum i40e_admin_queue_err arq_last_status;
+};
+
+/* general information */
+#define I40E_AQ_LARGE_BUF		512  /* ARQ buffers above this get I40E_AQ_FLAG_LB */
+#define I40E_ASQ_CMD_TIMEOUT		250  /* msecs */
+
+void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc,
+				       u16 opcode); /* opcode: presumably an i40e_aqc_opc_* value */
+
+#endif /* _I40E_ADMINQ_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h b/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h
new file mode 100644
index 0000000000..af9f107597
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h
@@ -0,0 +1,2424 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_adminq_cmd.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_ADMINQ_CMD_H_
+#define _I40E_ADMINQ_CMD_H_
+
+/* This header file defines the i40e Admin Queue commands and is shared between
+ * i40e Firmware and Software.
+ *
+ * This file needs to comply with the Linux Kernel coding style.
+ */
+
+#define I40E_FW_API_VERSION_MAJOR	0x0001
+#ifdef X722_SUPPORT /* API minor is 3 when built with X722 support, 4 otherwise */
+#define I40E_FW_API_VERSION_MINOR	0x0003
+#else
+#define I40E_FW_API_VERSION_MINOR	0x0004
+#endif
+
+struct i40e_aq_desc {
+	__le16 flags;	/* I40E_AQ_FLAG_* bits */
+	__le16 opcode;	/* presumably an enum i40e_admin_queue_opc value - TODO confirm */
+	__le16 datalen;	/* buffer size; FW rewrites it with the event message size */
+	__le16 retval;	/* presumably an enum i40e_admin_queue_err value - TODO confirm */
+	__le32 cookie_high;
+	__le32 cookie_low;
+	union { /* 16 bytes; command structures overlay this area via raw */
+		struct {
+			__le32 param0;
+			__le32 param1;
+			__le32 param2;
+			__le32 param3;
+		} internal;
+		struct {
+			__le32 param0;
+			__le32 param1;
+			__le32 addr_high;	/* upper 32 bits of the buffer address */
+			__le32 addr_low;	/* lower 32 bits of the buffer address */
+		} external;
+		u8 raw[16];
+	} params;
+};
+
+/* Flags sub-structure (bit positions within the 16-bit flags field)
+ * |0  |1  |2  |3  |4  |5  |6  |7  |8  |9  |10 |11 |12 |13 |14 |15 |
+ * |DD |CMP|ERR|VFE| * *  RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE |
+ */
+
+/* command flags and offsets */
+#define I40E_AQ_FLAG_DD_SHIFT	0
+#define I40E_AQ_FLAG_CMP_SHIFT	1
+#define I40E_AQ_FLAG_ERR_SHIFT	2
+#define I40E_AQ_FLAG_VFE_SHIFT	3
+#define I40E_AQ_FLAG_LB_SHIFT	9
+#define I40E_AQ_FLAG_RD_SHIFT	10
+#define I40E_AQ_FLAG_VFC_SHIFT	11
+#define I40E_AQ_FLAG_BUF_SHIFT	12
+#define I40E_AQ_FLAG_SI_SHIFT	13
+#define I40E_AQ_FLAG_EI_SHIFT	14
+#define I40E_AQ_FLAG_FE_SHIFT	15
+
+#define I40E_AQ_FLAG_DD		(1 << I40E_AQ_FLAG_DD_SHIFT)  /* 0x1    */
+#define I40E_AQ_FLAG_CMP	(1 << I40E_AQ_FLAG_CMP_SHIFT) /* 0x2    */
+#define I40E_AQ_FLAG_ERR	(1 << I40E_AQ_FLAG_ERR_SHIFT) /* 0x4    */
+#define I40E_AQ_FLAG_VFE	(1 << I40E_AQ_FLAG_VFE_SHIFT) /* 0x8    */
+#define I40E_AQ_FLAG_LB		(1 << I40E_AQ_FLAG_LB_SHIFT)  /* 0x200  */
+#define I40E_AQ_FLAG_RD		(1 << I40E_AQ_FLAG_RD_SHIFT)  /* 0x400  */
+#define I40E_AQ_FLAG_VFC	(1 << I40E_AQ_FLAG_VFC_SHIFT) /* 0x800  */
+#define I40E_AQ_FLAG_BUF	(1 << I40E_AQ_FLAG_BUF_SHIFT) /* 0x1000 */
+#define I40E_AQ_FLAG_SI		(1 << I40E_AQ_FLAG_SI_SHIFT)  /* 0x2000 */
+#define I40E_AQ_FLAG_EI		(1 << I40E_AQ_FLAG_EI_SHIFT)  /* 0x4000 */
+#define I40E_AQ_FLAG_FE		(1 << I40E_AQ_FLAG_FE_SHIFT)  /* 0x8000 */
+
+/* error codes; presumably carried in the descriptor retval field - TODO confirm */
+enum i40e_admin_queue_err {
+	I40E_AQ_RC_OK		= 0,  /* success */
+	I40E_AQ_RC_EPERM	= 1,  /* Operation not permitted */
+	I40E_AQ_RC_ENOENT	= 2,  /* No such element */
+	I40E_AQ_RC_ESRCH	= 3,  /* Bad opcode */
+	I40E_AQ_RC_EINTR	= 4,  /* operation interrupted */
+	I40E_AQ_RC_EIO		= 5,  /* I/O error */
+	I40E_AQ_RC_ENXIO	= 6,  /* No such resource */
+	I40E_AQ_RC_E2BIG	= 7,  /* Arg too long */
+	I40E_AQ_RC_EAGAIN	= 8,  /* Try again */
+	I40E_AQ_RC_ENOMEM	= 9,  /* Out of memory */
+	I40E_AQ_RC_EACCES	= 10, /* Permission denied */
+	I40E_AQ_RC_EFAULT	= 11, /* Bad address */
+	I40E_AQ_RC_EBUSY	= 12, /* Device or resource busy */
+	I40E_AQ_RC_EEXIST	= 13, /* object already exists */
+	I40E_AQ_RC_EINVAL	= 14, /* Invalid argument */
+	I40E_AQ_RC_ENOTTY	= 15, /* Not a typewriter */
+	I40E_AQ_RC_ENOSPC	= 16, /* No space left or alloc failure */
+	I40E_AQ_RC_ENOSYS	= 17, /* Function not implemented */
+	I40E_AQ_RC_ERANGE	= 18, /* Parameter out of range */
+	I40E_AQ_RC_EFLUSHED	= 19, /* Cmd flushed due to prev cmd error */
+	I40E_AQ_RC_BAD_ADDR	= 20, /* Descriptor contains a bad pointer */
+	I40E_AQ_RC_EMODE	= 21, /* Op not allowed in current dev mode */
+	I40E_AQ_RC_EFBIG	= 22, /* File too large */
+};
+
+/* Admin Queue command opcodes, grouped by the high byte of the opcode value */
+enum i40e_admin_queue_opc {
+	/* aq commands */
+	i40e_aqc_opc_get_version	= 0x0001,
+	i40e_aqc_opc_driver_version	= 0x0002,
+	i40e_aqc_opc_queue_shutdown	= 0x0003,
+	i40e_aqc_opc_set_pf_context	= 0x0004,
+
+	/* resource ownership */
+	i40e_aqc_opc_request_resource	= 0x0008,
+	i40e_aqc_opc_release_resource	= 0x0009,
+
+	i40e_aqc_opc_list_func_capabilities	= 0x000A,
+	i40e_aqc_opc_list_dev_capabilities	= 0x000B,
+
+	/* LAA */
+	i40e_aqc_opc_mac_address_read	= 0x0107,
+	i40e_aqc_opc_mac_address_write	= 0x0108,
+
+	/* PXE */
+	i40e_aqc_opc_clear_pxe_mode	= 0x0110,
+
+	/* internal switch commands */
+	i40e_aqc_opc_get_switch_config		= 0x0200,
+	i40e_aqc_opc_add_statistics		= 0x0201,
+	i40e_aqc_opc_remove_statistics		= 0x0202,
+	i40e_aqc_opc_set_port_parameters	= 0x0203,
+	i40e_aqc_opc_get_switch_resource_alloc	= 0x0204,
+
+	i40e_aqc_opc_add_vsi			= 0x0210,
+	i40e_aqc_opc_update_vsi_parameters	= 0x0211,
+	i40e_aqc_opc_get_vsi_parameters		= 0x0212,
+
+	i40e_aqc_opc_add_pv			= 0x0220,
+	i40e_aqc_opc_update_pv_parameters	= 0x0221,
+	i40e_aqc_opc_get_pv_parameters		= 0x0222,
+
+	i40e_aqc_opc_add_veb			= 0x0230,
+	i40e_aqc_opc_update_veb_parameters	= 0x0231,
+	i40e_aqc_opc_get_veb_parameters		= 0x0232,
+
+	i40e_aqc_opc_delete_element		= 0x0243,
+
+	i40e_aqc_opc_add_macvlan		= 0x0250,
+	i40e_aqc_opc_remove_macvlan		= 0x0251,
+	i40e_aqc_opc_add_vlan			= 0x0252,
+	i40e_aqc_opc_remove_vlan		= 0x0253,
+	i40e_aqc_opc_set_vsi_promiscuous_modes	= 0x0254,
+	i40e_aqc_opc_add_tag			= 0x0255,
+	i40e_aqc_opc_remove_tag			= 0x0256,
+	i40e_aqc_opc_add_multicast_etag		= 0x0257,
+	i40e_aqc_opc_remove_multicast_etag	= 0x0258,
+	i40e_aqc_opc_update_tag			= 0x0259,
+	i40e_aqc_opc_add_control_packet_filter	= 0x025A,
+	i40e_aqc_opc_remove_control_packet_filter	= 0x025B,
+	i40e_aqc_opc_add_cloud_filters		= 0x025C,
+	i40e_aqc_opc_remove_cloud_filters	= 0x025D,
+
+	i40e_aqc_opc_add_mirror_rule	= 0x0260,
+	i40e_aqc_opc_delete_mirror_rule	= 0x0261,
+
+	/* DCB commands */
+	i40e_aqc_opc_dcb_ignore_pfc	= 0x0301,
+	i40e_aqc_opc_dcb_updated	= 0x0302,
+
+	/* TX scheduler */
+	i40e_aqc_opc_configure_vsi_bw_limit		= 0x0400,
+	i40e_aqc_opc_configure_vsi_ets_sla_bw_limit	= 0x0406,
+	i40e_aqc_opc_configure_vsi_tc_bw		= 0x0407,
+	i40e_aqc_opc_query_vsi_bw_config		= 0x0408,
+	i40e_aqc_opc_query_vsi_ets_sla_config		= 0x040A,
+	i40e_aqc_opc_configure_switching_comp_bw_limit	= 0x0410,
+
+	i40e_aqc_opc_enable_switching_comp_ets			= 0x0413,
+	i40e_aqc_opc_modify_switching_comp_ets			= 0x0414,
+	i40e_aqc_opc_disable_switching_comp_ets			= 0x0415,
+	i40e_aqc_opc_configure_switching_comp_ets_bw_limit	= 0x0416,
+	i40e_aqc_opc_configure_switching_comp_bw_config		= 0x0417,
+	i40e_aqc_opc_query_switching_comp_ets_config		= 0x0418,
+	i40e_aqc_opc_query_port_ets_config			= 0x0419,
+	i40e_aqc_opc_query_switching_comp_bw_config		= 0x041A,
+	i40e_aqc_opc_suspend_port_tx				= 0x041B,
+	i40e_aqc_opc_resume_port_tx				= 0x041C,
+	i40e_aqc_opc_configure_partition_bw			= 0x041D,
+
+	/* hmc */
+	i40e_aqc_opc_query_hmc_resource_profile	= 0x0500,
+	i40e_aqc_opc_set_hmc_resource_profile	= 0x0501,
+
+	/* phy commands */
+	i40e_aqc_opc_get_phy_abilities		= 0x0600,
+	i40e_aqc_opc_set_phy_config		= 0x0601,
+	i40e_aqc_opc_set_mac_config		= 0x0603,
+	i40e_aqc_opc_set_link_restart_an	= 0x0605,
+	i40e_aqc_opc_get_link_status		= 0x0607,
+	i40e_aqc_opc_set_phy_int_mask		= 0x0613,
+	i40e_aqc_opc_get_local_advt_reg		= 0x0614,
+	i40e_aqc_opc_set_local_advt_reg		= 0x0615,
+	i40e_aqc_opc_get_partner_advt		= 0x0616,
+	i40e_aqc_opc_set_lb_modes		= 0x0618,
+	i40e_aqc_opc_get_phy_wol_caps		= 0x0621,
+	i40e_aqc_opc_set_phy_debug		= 0x0622,
+	i40e_aqc_opc_upload_ext_phy_fm		= 0x0625,
+
+	/* NVM commands */
+	i40e_aqc_opc_nvm_read			= 0x0701,
+	i40e_aqc_opc_nvm_erase			= 0x0702,
+	i40e_aqc_opc_nvm_update			= 0x0703,
+	i40e_aqc_opc_nvm_config_read		= 0x0704,
+	i40e_aqc_opc_nvm_config_write		= 0x0705,
+	i40e_aqc_opc_oem_post_update		= 0x0720,
+
+	/* virtualization commands */
+	i40e_aqc_opc_send_msg_to_pf		= 0x0801,
+	i40e_aqc_opc_send_msg_to_vf		= 0x0802,
+	i40e_aqc_opc_send_msg_to_peer		= 0x0803,
+
+	/* alternate structure */
+	i40e_aqc_opc_alternate_write		= 0x0900,
+	i40e_aqc_opc_alternate_write_indirect	= 0x0901,
+	i40e_aqc_opc_alternate_read		= 0x0902,
+	i40e_aqc_opc_alternate_read_indirect	= 0x0903,
+	i40e_aqc_opc_alternate_write_done	= 0x0904,
+	i40e_aqc_opc_alternate_set_mode		= 0x0905,
+	i40e_aqc_opc_alternate_clear_port	= 0x0906,
+
+	/* LLDP commands */
+	i40e_aqc_opc_lldp_get_mib	= 0x0A00,
+	i40e_aqc_opc_lldp_update_mib	= 0x0A01,
+	i40e_aqc_opc_lldp_add_tlv	= 0x0A02,
+	i40e_aqc_opc_lldp_update_tlv	= 0x0A03,
+	i40e_aqc_opc_lldp_delete_tlv	= 0x0A04,
+	i40e_aqc_opc_lldp_stop		= 0x0A05,
+	i40e_aqc_opc_lldp_start		= 0x0A06,
+	i40e_aqc_opc_get_cee_dcb_cfg	= 0x0A07,
+	i40e_aqc_opc_lldp_set_local_mib	= 0x0A08,
+	i40e_aqc_opc_lldp_stop_start_spec_agent	= 0x0A09,
+
+	/* Tunnel commands */
+	i40e_aqc_opc_add_udp_tunnel	= 0x0B00,
+	i40e_aqc_opc_del_udp_tunnel	= 0x0B01,
+#ifdef X722_SUPPORT
+	i40e_aqc_opc_set_rss_key	= 0x0B02,
+	i40e_aqc_opc_set_rss_lut	= 0x0B03,
+	i40e_aqc_opc_get_rss_key	= 0x0B04,
+	i40e_aqc_opc_get_rss_lut	= 0x0B05,
+#endif
+
+	/* Async Events */
+	i40e_aqc_opc_event_lan_overflow		= 0x1001,
+
+	/* OEM commands */
+	i40e_aqc_opc_oem_parameter_change	= 0xFE00,
+	i40e_aqc_opc_oem_device_status_change	= 0xFE01,
+	i40e_aqc_opc_oem_ocsd_initialize	= 0xFE02,
+	i40e_aqc_opc_oem_ocbb_initialize	= 0xFE03,
+
+	/* debug commands */
+	i40e_aqc_opc_debug_read_reg		= 0xFF03,
+	i40e_aqc_opc_debug_write_reg		= 0xFF04,
+	i40e_aqc_opc_debug_modify_reg		= 0xFF07,
+	i40e_aqc_opc_debug_dump_internals	= 0xFF08,
+};
+
+/* command structures and indirect data structures */
+
+/* Structure naming conventions:
+ * - no suffix for direct command descriptor structures
+ * - _data for indirect sent data
+ * - _resp for indirect return data (data which is both will use _data)
+ * - _completion for direct return data
+ * - _element_ for repeated elements (may also be _data or _resp)
+ *
+ * Command structures are expected to overlay the params.raw member of the basic
+ * descriptor, and as such cannot exceed 16 bytes in length.
+ */
+
+/* Compile-time size check: n is the required size in bytes and X is a struct
+ * tag (the macro spells "struct X"). When sizeof(struct X) != n the enumerator
+ * initializer divides by zero, which fails the build; otherwise it declares
+ * enum i40e_static_assert_enum_X, which is never used.
+ */
+#define I40E_CHECK_STRUCT_LEN(n, X) enum i40e_static_assert_enum_##X \
+	{ i40e_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) }
+
+/* Shorthand for the common case: command structures must be exactly 16 bytes
+ * because they map onto the 16-byte params.raw array of struct i40e_aq_desc.
+ */
+#define I40E_CHECK_CMD_LENGTH(X)	I40E_CHECK_STRUCT_LEN(16, X)
+
+/* internal (0x00XX) commands */
+
+/* Get version (direct 0x0001, i40e_aqc_opc_get_version) */
+struct i40e_aqc_get_version {
+	__le32 rom_ver;
+	__le32 fw_build;
+	__le16 fw_major;
+	__le16 fw_minor;
+	__le16 api_major;
+	__le16 api_minor;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_version); /* 2*4 + 4*2 = 16 bytes */
+
+/* Send driver version (indirect 0x0002, i40e_aqc_opc_driver_version) */
+struct i40e_aqc_driver_version {
+	u8	driver_major_ver;
+	u8	driver_minor_ver;
+	u8	driver_build_ver;
+	u8	driver_subbuild_ver;
+	u8	reserved[4];
+	__le32	address_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32	address_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_driver_version); /* 4*1 + 4 + 2*4 = 16 bytes */
+
+/* Queue Shutdown (direct 0x0003, i40e_aqc_opc_queue_shutdown) */
+struct i40e_aqc_queue_shutdown {
+	__le32	driver_unloading; /* presumably I40E_AQ_DRIVER_UNLOADING on unload - TODO confirm */
+#define I40E_AQ_DRIVER_UNLOADING	0x1
+	u8	reserved[12];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_queue_shutdown); /* 4 + 12 = 16 bytes */
+
+/* Set PF context (direct 0x0004, i40e_aqc_opc_set_pf_context) */
+struct i40e_aqc_set_pf_context {
+	u8	pf_id;
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_pf_context); /* 1 + 15 = 16 bytes */
+
+/* Request resource ownership (direct 0x0008, i40e_aqc_opc_request_resource)
+ * Release resource ownership (direct 0x0009, i40e_aqc_opc_release_resource)
+ */
+#define I40E_AQ_RESOURCE_NVM			1
+#define I40E_AQ_RESOURCE_SDP			2
+#define I40E_AQ_RESOURCE_ACCESS_READ		1
+#define I40E_AQ_RESOURCE_ACCESS_WRITE		2
+#define I40E_AQ_RESOURCE_NVM_READ_TIMEOUT	3000
+#define I40E_AQ_RESOURCE_NVM_WRITE_TIMEOUT	180000
+
+struct i40e_aqc_request_resource {
+	__le16	resource_id;	/* presumably an I40E_AQ_RESOURCE_NVM/_SDP value - TODO confirm */
+	__le16	access_type;	/* presumably an I40E_AQ_RESOURCE_ACCESS_* value - TODO confirm */
+	__le32	timeout;	/* units are not stated in this header - TODO confirm */
+	__le32	resource_number;
+	u8	reserved[4];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_request_resource); /* 2*2 + 2*4 + 4 = 16 bytes */
+
+/* Get function capabilities (indirect 0x000A, i40e_aqc_opc_list_func_capabilities)
+ * Get device capabilities (indirect 0x000B, i40e_aqc_opc_list_dev_capabilities)
+ */
+struct i40e_aqc_list_capabilites { /* NOTE: tag is spelled "capabilites"; renaming changes the interface */
+	u8 command_flags;
+#define I40E_AQ_LIST_CAP_PF_INDEX_EN	1
+	u8 pf_index;
+	u8 reserved[2];
+	__le32 count;
+	__le32 addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32 addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_list_capabilites); /* 1 + 1 + 2 + 3*4 = 16 bytes */
+
+struct i40e_aqc_list_capabilities_element_resp { /* 32 bytes by field sum; no length check here */
+	__le16	id;	/* presumably one of the I40E_AQ_CAP_ID_* values below - TODO confirm */
+	u8	major_rev;
+	u8	minor_rev;
+	__le32	number;
+	__le32	logical_id;
+	__le32	phys_id;
+	u8	reserved[16];
+};
+
+/* list of caps (I40E_AQ_CAP_ID_* identifiers) */
+
+#define I40E_AQ_CAP_ID_SWITCH_MODE	0x0001
+#define I40E_AQ_CAP_ID_MNG_MODE		0x0002
+#define I40E_AQ_CAP_ID_NPAR_ACTIVE	0x0003
+#define I40E_AQ_CAP_ID_OS2BMC_CAP	0x0004
+#define I40E_AQ_CAP_ID_FUNCTIONS_VALID	0x0005
+#define I40E_AQ_CAP_ID_ALTERNATE_RAM	0x0006
+#define I40E_AQ_CAP_ID_SRIOV		0x0012
+#define I40E_AQ_CAP_ID_VF		0x0013
+#define I40E_AQ_CAP_ID_VMDQ		0x0014
+#define I40E_AQ_CAP_ID_8021QBG		0x0015
+#define I40E_AQ_CAP_ID_8021QBR		0x0016
+#define I40E_AQ_CAP_ID_VSI		0x0017
+#define I40E_AQ_CAP_ID_DCB		0x0018
+#define I40E_AQ_CAP_ID_FCOE		0x0021
+#define I40E_AQ_CAP_ID_ISCSI		0x0022
+#define I40E_AQ_CAP_ID_RSS		0x0040
+#define I40E_AQ_CAP_ID_RXQ		0x0041
+#define I40E_AQ_CAP_ID_TXQ		0x0042
+#define I40E_AQ_CAP_ID_MSIX		0x0043
+#define I40E_AQ_CAP_ID_VF_MSIX		0x0044
+#define I40E_AQ_CAP_ID_FLOW_DIRECTOR	0x0045
+#define I40E_AQ_CAP_ID_1588		0x0046
+#define I40E_AQ_CAP_ID_IWARP		0x0051
+#define I40E_AQ_CAP_ID_LED		0x0061
+#define I40E_AQ_CAP_ID_SDP		0x0062
+#define I40E_AQ_CAP_ID_MDIO		0x0063
+#define I40E_AQ_CAP_ID_FLEX10		0x00F1
+#define I40E_AQ_CAP_ID_CEM		0x00F2
+
+/* Set CPPM Configuration (direct 0x0103; no enumerator in i40e_admin_queue_opc) */
+struct i40e_aqc_cppm_configuration {
+	__le16	command_flags;
+#define I40E_AQ_CPPM_EN_LTRC	0x0800
+#define I40E_AQ_CPPM_EN_DMCTH	0x1000
+#define I40E_AQ_CPPM_EN_DMCTLX	0x2000
+#define I40E_AQ_CPPM_EN_HPTC	0x4000
+#define I40E_AQ_CPPM_EN_DMARC	0x8000
+	__le16	ttlx;
+	__le32	dmacr;
+	__le16	dmcth;
+	u8	hptc;
+	u8	reserved;
+	__le32	pfltrc;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_cppm_configuration); /* 2+2+4+2+1+1+4 = 16 bytes */
+
+/* Set ARP Proxy command / response (indirect 0x0104; no enumerator in i40e_admin_queue_opc) */
+struct i40e_aqc_arp_proxy_data {
+	__le16	command_flags;
+#define I40E_AQ_ARP_INIT_IPV4	0x0008
+#define I40E_AQ_ARP_UNSUP_CTL	0x0010
+#define I40E_AQ_ARP_ENA		0x0020
+#define I40E_AQ_ARP_ADD_IPV4	0x0040
+#define I40E_AQ_ARP_DEL_IPV4	0x0080
+	__le16	table_id;
+	__le32	pfpm_proxyfc;
+	__le32	ip_addr;
+	u8	mac_addr[6];
+	u8	reserved[2];
+};
+
+I40E_CHECK_STRUCT_LEN(0x14, i40e_aqc_arp_proxy_data); /* 2+2+4+4+6+2 = 20 bytes */
+
+/* Set NS Proxy Table Entry Command (indirect 0x0105; no enumerator in i40e_admin_queue_opc) */
+struct i40e_aqc_ns_proxy_data {
+	__le16	table_idx_mac_addr_0;
+	__le16	table_idx_mac_addr_1;
+	__le16	table_idx_ipv6_0;
+	__le16	table_idx_ipv6_1;
+	__le16	control;
+#define I40E_AQ_NS_PROXY_ADD_0		0x0100
+#define I40E_AQ_NS_PROXY_DEL_0		0x0200
+#define I40E_AQ_NS_PROXY_ADD_1		0x0400
+#define I40E_AQ_NS_PROXY_DEL_1		0x0800
+#define I40E_AQ_NS_PROXY_ADD_IPV6_0	0x1000
+#define I40E_AQ_NS_PROXY_DEL_IPV6_0	0x2000
+#define I40E_AQ_NS_PROXY_ADD_IPV6_1	0x4000
+#define I40E_AQ_NS_PROXY_DEL_IPV6_1	0x8000
+#define I40E_AQ_NS_PROXY_COMMAND_SEQ	0x0001
+#define I40E_AQ_NS_PROXY_INIT_IPV6_TBL	0x0002
+#define I40E_AQ_NS_PROXY_INIT_MAC_TBL	0x0004
+	u8	mac_addr_0[6];
+	u8	mac_addr_1[6];
+	u8	local_mac_addr[6];
+	u8	ipv6_addr_0[16]; /* Warning! spec specifies BE byte order */
+	u8	ipv6_addr_1[16];
+};
+
+I40E_CHECK_STRUCT_LEN(0x3c, i40e_aqc_ns_proxy_data); /* 5*2 + 3*6 + 2*16 = 60 bytes */
+
+/* Manage LAA Command (0x0106) - obsolete; no enumerator in i40e_admin_queue_opc */
+struct i40e_aqc_mng_laa {
+	__le16	command_flags;
+#define I40E_AQ_LAA_FLAG_WR	0x8000
+	u8	reserved[2];
+	__le32	sal;
+	__le16	sah;
+	u8	reserved2[6];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_mng_laa); /* 2+2+4+2+6 = 16 bytes */
+
+/* Manage MAC Address Read Command (indirect 0x0107, i40e_aqc_opc_mac_address_read) */
+struct i40e_aqc_mac_address_read {
+	__le16	command_flags;
+#define I40E_AQC_LAN_ADDR_VALID		0x10
+#define I40E_AQC_SAN_ADDR_VALID		0x20
+#define I40E_AQC_PORT_ADDR_VALID	0x40
+#define I40E_AQC_WOL_ADDR_VALID		0x80
+#define I40E_AQC_MC_MAG_EN_VALID	0x100
+#define I40E_AQC_ADDR_VALID_MASK	0x1F0 /* OR of the five *_VALID bits above */
+	u8	reserved[6];
+	__le32	addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32	addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_read); /* 2 + 6 + 2*4 = 16 bytes */
+
+struct i40e_aqc_mac_address_read_data { /* presumably the 0x0107 indirect buffer - TODO confirm */
+	u8 pf_lan_mac[6];
+	u8 pf_san_mac[6];
+	u8 port_mac[6];
+	u8 pf_wol_mac[6];
+};
+
+I40E_CHECK_STRUCT_LEN(24, i40e_aqc_mac_address_read_data); /* 4*6 = 24 bytes */
+
+/* Manage MAC Address Write Command (0x0108, i40e_aqc_opc_mac_address_write) */
+struct i40e_aqc_mac_address_write {
+	__le16	command_flags;
+#define I40E_AQC_WRITE_TYPE_LAA_ONLY	0x0000
+#define I40E_AQC_WRITE_TYPE_LAA_WOL	0x4000
+#define I40E_AQC_WRITE_TYPE_PORT	0x8000
+#define I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG	0xC000
+#define I40E_AQC_WRITE_TYPE_MASK	0xC000 /* bits 14-15 carry the write type */
+
+	__le16	mac_sah;
+	__le32	mac_sal;
+	u8	reserved[8];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_write); /* 2+2+4+8 = 16 bytes */
+
+/* PXE commands (0x011x) */
+
+/* Clear PXE Command and response (direct 0x0110, i40e_aqc_opc_clear_pxe_mode) */
+struct i40e_aqc_clear_pxe {
+	u8	rx_cnt;
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_clear_pxe); /* 1 + 15 = 16 bytes */
+
+/* Switch configuration commands (0x02xx) */
+
+/* Used by many indirect commands that only pass an seid and a buffer in the
+ * command
+ */
+struct i40e_aqc_switch_seid {
+	__le16	seid;
+	u8	reserved[6];
+	__le32	addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32	addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_switch_seid); /* 2 + 6 + 2*4 = 16 bytes */
+
+/* Get Switch Configuration command (indirect 0x0200)
+ * uses i40e_aqc_switch_seid for the descriptor; this is the response header
+ */
+struct i40e_aqc_get_switch_config_header_resp {
+	__le16	num_reported;
+	__le16	num_total;
+	u8	reserved[12];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_config_header_resp); /* 2+2+12 = 16 bytes */
+
+struct i40e_aqc_switch_config_element_resp { /* element type of i40e_aqc_get_switch_config_resp */
+	u8	element_type;
+#define I40E_AQ_SW_ELEM_TYPE_MAC	1
+#define I40E_AQ_SW_ELEM_TYPE_PF		2
+#define I40E_AQ_SW_ELEM_TYPE_VF		3
+#define I40E_AQ_SW_ELEM_TYPE_EMP	4
+#define I40E_AQ_SW_ELEM_TYPE_BMC	5
+#define I40E_AQ_SW_ELEM_TYPE_PV		16
+#define I40E_AQ_SW_ELEM_TYPE_VEB	17
+#define I40E_AQ_SW_ELEM_TYPE_PA		18
+#define I40E_AQ_SW_ELEM_TYPE_VSI	19
+	u8	revision;
+#define I40E_AQ_SW_ELEM_REV_1		1
+	__le16	seid;
+	__le16	uplink_seid;
+	__le16	downlink_seid;
+	u8	reserved[3];
+	u8	connection_type;
+#define I40E_AQ_CONN_TYPE_REGULAR	0x1
+#define I40E_AQ_CONN_TYPE_DEFAULT	0x2
+#define I40E_AQ_CONN_TYPE_CASCADED	0x3
+	__le16	scheduler_id;
+	__le16	element_info;
+};
+
+I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_config_element_resp); /* 1+1+3*2+3+1+2*2 = 16 bytes */
+
+/* Get Switch Configuration (indirect 0x0200)
+ *    an array of elements is returned in the response buffer
+ *    the first in the array is the header, remainder are elements
+ */
+struct i40e_aqc_get_switch_config_resp {
+	struct i40e_aqc_get_switch_config_header_resp	header;
+	struct i40e_aqc_switch_config_element_resp	element[1]; /* declared with one entry */
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_switch_config_resp); /* 16 + 1*16 = 32 bytes */
+
+/* Add Statistics (direct 0x0201, i40e_aqc_opc_add_statistics)
+ * Remove Statistics (direct 0x0202, i40e_aqc_opc_remove_statistics)
+ */
+struct i40e_aqc_add_remove_statistics {
+	__le16	seid;
+	__le16	vlan;
+	__le16	stat_index;
+	u8	reserved[10];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_statistics); /* 3*2 + 10 = 16 bytes */
+
+/* Set Port Parameters command (direct 0x0203, i40e_aqc_opc_set_port_parameters) */
+struct i40e_aqc_set_port_parameters {
+	__le16	command_flags;
+#define I40E_AQ_SET_P_PARAMS_SAVE_BAD_PACKETS	1
+#define I40E_AQ_SET_P_PARAMS_PAD_SHORT_PACKETS	2 /* must set! */
+#define I40E_AQ_SET_P_PARAMS_DOUBLE_VLAN_ENA	4
+	__le16	bad_frame_vsi;
+	__le16	default_seid;        /* reserved for command */
+	u8	reserved[10];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_port_parameters); /* 3*2 + 10 = 16 bytes */
+
+/* Get Switch Resource Allocation (indirect 0x0204, i40e_aqc_opc_get_switch_resource_alloc) */
+struct i40e_aqc_get_switch_resource_alloc {
+	u8	num_entries;         /* reserved for command */
+	u8	reserved[7];
+	__le32	addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32	addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_resource_alloc); /* 1 + 7 + 2*4 = 16 bytes */
+
+/* expect an array of these structs in the response buffer (no 0xE type is defined below) */
+struct i40e_aqc_switch_resource_alloc_element_resp {
+	u8	resource_type;
+#define I40E_AQ_RESOURCE_TYPE_VEB		0x0
+#define I40E_AQ_RESOURCE_TYPE_VSI		0x1
+#define I40E_AQ_RESOURCE_TYPE_MACADDR		0x2
+#define I40E_AQ_RESOURCE_TYPE_STAG		0x3
+#define I40E_AQ_RESOURCE_TYPE_ETAG		0x4
+#define I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH	0x5
+#define I40E_AQ_RESOURCE_TYPE_UNICAST_HASH	0x6
+#define I40E_AQ_RESOURCE_TYPE_VLAN		0x7
+#define I40E_AQ_RESOURCE_TYPE_VSI_LIST_ENTRY	0x8
+#define I40E_AQ_RESOURCE_TYPE_ETAG_LIST_ENTRY	0x9
+#define I40E_AQ_RESOURCE_TYPE_VLAN_STAT_POOL	0xA
+#define I40E_AQ_RESOURCE_TYPE_MIRROR_RULE	0xB
+#define I40E_AQ_RESOURCE_TYPE_QUEUE_SETS	0xC
+#define I40E_AQ_RESOURCE_TYPE_VLAN_FILTERS	0xD
+#define I40E_AQ_RESOURCE_TYPE_INNER_MAC_FILTERS	0xF
+#define I40E_AQ_RESOURCE_TYPE_IP_FILTERS	0x10
+#define I40E_AQ_RESOURCE_TYPE_GRE_VN_KEYS	0x11
+#define I40E_AQ_RESOURCE_TYPE_VN2_KEYS		0x12
+#define I40E_AQ_RESOURCE_TYPE_TUNNEL_PORTS	0x13
+	u8	reserved1;
+	__le16	guaranteed;
+	__le16	total;
+	__le16	used;
+	__le16	total_unalloced;
+	u8	reserved2[6];
+};
+
+I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_resource_alloc_element_resp); /* 1+1+4*2+6 = 16 bytes */
+
+/* Add VSI (indirect 0x0210)
+ *    this indirect command uses struct i40e_aqc_vsi_properties_data
+ *    as the indirect buffer (128 bytes)
+ *
+ * Update VSI (indirect 0x0211)
+ *     uses the same data structure as Add VSI
+ *
+ * Get VSI (indirect 0x0212)
+ *     uses the same completion and data structure as Add VSI
+ */
+struct i40e_aqc_add_get_update_vsi {
+	__le16	uplink_seid;
+	u8	connection_type;
+#define I40E_AQ_VSI_CONN_TYPE_NORMAL	0x1
+#define I40E_AQ_VSI_CONN_TYPE_DEFAULT	0x2
+#define I40E_AQ_VSI_CONN_TYPE_CASCADED	0x3
+	u8	reserved1;
+	u8	vf_id;
+	u8	reserved2;
+	__le16	vsi_flags;
+#define I40E_AQ_VSI_TYPE_SHIFT		0x0
+#define I40E_AQ_VSI_TYPE_MASK		(0x3 << I40E_AQ_VSI_TYPE_SHIFT)
+#define I40E_AQ_VSI_TYPE_VF		0x0
+#define I40E_AQ_VSI_TYPE_VMDQ2		0x1
+#define I40E_AQ_VSI_TYPE_PF		0x2
+#define I40E_AQ_VSI_TYPE_EMP_MNG	0x3
+#define I40E_AQ_VSI_FLAG_CASCADED_PV	0x4
+	__le32	addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32	addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi); /* 2+1+1+1+1+2+2*4 = 16 bytes */
+
+struct i40e_aqc_add_get_update_vsi_completion { /* "_completion": direct return data */
+	__le16 seid;
+	__le16 vsi_number;
+	__le16 vsi_used;
+	__le16 vsi_free;
+	__le32 addr_high;	/* bytes 8-11, same offset as params.external.addr_high */
+	__le32 addr_low;	/* bytes 12-15, same offset as params.external.addr_low */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi_completion); /* 4*2 + 2*4 = 16 bytes */
+
+struct i40e_aqc_vsi_properties_data {
+	/* first 96 bytes are written by SW (valid_sections through cmd_reserved) */
+	__le16	valid_sections;
+#define I40E_AQ_VSI_PROP_SWITCH_VALID		0x0001
+#define I40E_AQ_VSI_PROP_SECURITY_VALID		0x0002
+#define I40E_AQ_VSI_PROP_VLAN_VALID		0x0004
+#define I40E_AQ_VSI_PROP_CAS_PV_VALID		0x0008
+#define I40E_AQ_VSI_PROP_INGRESS_UP_VALID	0x0010
+#define I40E_AQ_VSI_PROP_EGRESS_UP_VALID	0x0020
+#define I40E_AQ_VSI_PROP_QUEUE_MAP_VALID	0x0040
+#define I40E_AQ_VSI_PROP_QUEUE_OPT_VALID	0x0080
+#define I40E_AQ_VSI_PROP_OUTER_UP_VALID		0x0100
+#define I40E_AQ_VSI_PROP_SCHED_VALID		0x0200
+	/* switch section */
+	__le16	switch_id; /* 12bit id combined with flags below */
+#define I40E_AQ_VSI_SW_ID_SHIFT		0x0000
+#define I40E_AQ_VSI_SW_ID_MASK		(0xFFF << I40E_AQ_VSI_SW_ID_SHIFT)
+#define I40E_AQ_VSI_SW_ID_FLAG_NOT_STAG	0x1000
+#define I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB	0x2000
+#define I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB	0x4000
+	u8	sw_reserved[2];
+	/* security section */
+	u8	sec_flags;
+#define I40E_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD	0x01
+#define I40E_AQ_VSI_SEC_FLAG_ENABLE_VLAN_CHK	0x02
+#define I40E_AQ_VSI_SEC_FLAG_ENABLE_MAC_CHK	0x04
+	u8	sec_reserved;
+	/* VLAN section */
+	__le16	pvid; /* VLANS include priority bits */
+	__le16	fcoe_pvid;
+	u8	port_vlan_flags;
+#define I40E_AQ_VSI_PVLAN_MODE_SHIFT	0x00
+#define I40E_AQ_VSI_PVLAN_MODE_MASK	(0x03 << \
+					 I40E_AQ_VSI_PVLAN_MODE_SHIFT)
+#define I40E_AQ_VSI_PVLAN_MODE_TAGGED	0x01
+#define I40E_AQ_VSI_PVLAN_MODE_UNTAGGED	0x02
+#define I40E_AQ_VSI_PVLAN_MODE_ALL	0x03
+#define I40E_AQ_VSI_PVLAN_INSERT_PVID	0x04
+#define I40E_AQ_VSI_PVLAN_EMOD_SHIFT	0x03
+#define I40E_AQ_VSI_PVLAN_EMOD_MASK	(0x3 << \
+					 I40E_AQ_VSI_PVLAN_EMOD_SHIFT)
+#define I40E_AQ_VSI_PVLAN_EMOD_STR_BOTH	0x0
+#define I40E_AQ_VSI_PVLAN_EMOD_STR_UP	0x08
+#define I40E_AQ_VSI_PVLAN_EMOD_STR	0x10
+#define I40E_AQ_VSI_PVLAN_EMOD_NOTHING	0x18
+	u8	pvlan_reserved[3];
+	/* ingress egress up sections */
+	__le32	ingress_table; /* bitmap, 3 bits per up */
+#define I40E_AQ_VSI_UP_TABLE_UP0_SHIFT	0
+#define I40E_AQ_VSI_UP_TABLE_UP0_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP0_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP1_SHIFT	3
+#define I40E_AQ_VSI_UP_TABLE_UP1_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP1_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP2_SHIFT	6
+#define I40E_AQ_VSI_UP_TABLE_UP2_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP2_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP3_SHIFT	9
+#define I40E_AQ_VSI_UP_TABLE_UP3_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP3_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP4_SHIFT	12
+#define I40E_AQ_VSI_UP_TABLE_UP4_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP4_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP5_SHIFT	15
+#define I40E_AQ_VSI_UP_TABLE_UP5_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP5_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP6_SHIFT	18
+#define I40E_AQ_VSI_UP_TABLE_UP6_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP6_SHIFT)
+#define I40E_AQ_VSI_UP_TABLE_UP7_SHIFT	21
+#define I40E_AQ_VSI_UP_TABLE_UP7_MASK	(0x7 << \
+					 I40E_AQ_VSI_UP_TABLE_UP7_SHIFT)
+	__le32	egress_table;   /* same defines as for ingress table */
+	/* cascaded PV section */
+	__le16	cas_pv_tag;
+	u8	cas_pv_flags;
+#define I40E_AQ_VSI_CAS_PV_TAGX_SHIFT		0x00
+#define I40E_AQ_VSI_CAS_PV_TAGX_MASK		(0x03 << \
+						 I40E_AQ_VSI_CAS_PV_TAGX_SHIFT)
+#define I40E_AQ_VSI_CAS_PV_TAGX_LEAVE		0x00
+#define I40E_AQ_VSI_CAS_PV_TAGX_REMOVE		0x01
+#define I40E_AQ_VSI_CAS_PV_TAGX_COPY		0x02
+#define I40E_AQ_VSI_CAS_PV_INSERT_TAG		0x10
+#define I40E_AQ_VSI_CAS_PV_ETAG_PRUNE		0x20
+#define I40E_AQ_VSI_CAS_PV_ACCEPT_HOST_TAG	0x40
+	u8	cas_pv_reserved;
+	/* queue mapping section */
+	__le16	mapping_flags;
+#define I40E_AQ_VSI_QUE_MAP_CONTIG	0x0
+#define I40E_AQ_VSI_QUE_MAP_NONCONTIG	0x1
+	__le16	queue_mapping[16];
+#define I40E_AQ_VSI_QUEUE_SHIFT		0x0
+#define I40E_AQ_VSI_QUEUE_MASK		(0x7FF << I40E_AQ_VSI_QUEUE_SHIFT)
+	__le16	tc_mapping[8];
+#define I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT	0
+#define I40E_AQ_VSI_TC_QUE_OFFSET_MASK	(0x1FF << \
+					 I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT)
+#define I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT	9
+#define I40E_AQ_VSI_TC_QUE_NUMBER_MASK	(0x7 << \
+					 I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT)
+	/* queueing option section */
+	u8	queueing_opt_flags;
+#ifdef X722_SUPPORT
+#define I40E_AQ_VSI_QUE_OPT_MULTICAST_UDP_ENA	0x04
+#define I40E_AQ_VSI_QUE_OPT_UNICAST_UDP_ENA	0x08
+#endif
+#define I40E_AQ_VSI_QUE_OPT_TCP_ENA	0x10
+#define I40E_AQ_VSI_QUE_OPT_FCOE_ENA	0x20
+#ifdef X722_SUPPORT
+#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_PF	0x00
+#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_VSI	0x40
+#endif
+	u8	queueing_opt_reserved[3];
+	/* scheduler section */
+	u8	up_enable_bits;
+	u8	sched_reserved;
+	/* outer up section */
+	__le32	outer_up_table; /* same structure and defines as ingress table */
+	u8	cmd_reserved[8];
+	/* last 32 bytes are written by FW (qs_handle through resp_reserved) */
+	__le16	qs_handle[8];
+#define I40E_AQ_VSI_QS_HANDLE_INVALID	0xFFFF
+	__le16	stat_counter_idx;
+	__le16	sched_id;
+	u8	resp_reserved[12];
+};
+
+I40E_CHECK_STRUCT_LEN(128, i40e_aqc_vsi_properties_data); /* 96 SW bytes + 32 FW bytes */
+
+/* Add Port Virtualizer (direct 0x0220)
+ * Update PV (direct 0x0221) reuses this struct, but only the flags are used
+ * (IS_CTRL_PORT only works on add PV)
+ */
+struct i40e_aqc_add_update_pv {
+	__le16	command_flags;	/* I40E_AQC_PV_FLAG_* bits */
+#define I40E_AQC_PV_FLAG_PV_TYPE		0x1
+#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_STAG_EN	0x2
+#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_ETAG_EN	0x4
+#define I40E_AQC_PV_FLAG_IS_CTRL_PORT		0x8
+	__le16	uplink_seid;
+	__le16	connected_seid;
+	u8	reserved[10];	/* 3 * 2 + 10 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv);
+
+struct i40e_aqc_add_update_pv_completion {
+	/* reserved for update; on add, holds the ERR_FLAG bits if rc == ENOSPC */
+	__le16	pv_seid;
+#define I40E_AQC_PV_ERR_FLAG_NO_PV	0x1
+#define I40E_AQC_PV_ERR_FLAG_NO_SCHED	0x2
+#define I40E_AQC_PV_ERR_FLAG_NO_COUNTER	0x4
+#define I40E_AQC_PV_ERR_FLAG_NO_ENTRY	0x8
+	u8	reserved[14];	/* 2 + 14 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv_completion);
+
+/* Get PV Params (direct 0x0222)
+ * the command uses i40e_aqc_switch_seid for the descriptor
+ */
+
+struct i40e_aqc_get_pv_params_completion {
+	__le16	seid;
+	__le16	default_stag;
+	__le16	pv_flags; /* I40E_AQC_GET_PV_* bits; same values as add_pv flags */
+#define I40E_AQC_GET_PV_PV_TYPE			0x1
+#define I40E_AQC_GET_PV_FRWD_UNKNOWN_STAG	0x2
+#define I40E_AQC_GET_PV_FRWD_UNKNOWN_ETAG	0x4
+	u8	reserved[8];
+	__le16	default_port_seid;	/* 3 * 2 + 8 + 2 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_pv_params_completion);
+
+/* Add VEB (direct 0x0230); fields total 3 * 2 + 1 + 9 = 16 bytes */
+struct i40e_aqc_add_veb {
+	__le16	uplink_seid;
+	__le16	downlink_seid;
+	__le16	veb_flags;	/* I40E_AQC_ADD_VEB_* bits below */
+#define I40E_AQC_ADD_VEB_FLOATING		0x1
+#define I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT	1
+#define I40E_AQC_ADD_VEB_PORT_TYPE_MASK		(0x3 << \
+					I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT)
+#define I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT	0x2 /* already shifted */
+#define I40E_AQC_ADD_VEB_PORT_TYPE_DATA		0x4 /* already shifted */
+#define I40E_AQC_ADD_VEB_ENABLE_L2_FILTER	0x8
+	u8	enable_tcs;
+	u8	reserved[9];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb);
+
+struct i40e_aqc_add_veb_completion {
+	u8	reserved[6];
+	__le16	switch_seid;
+	/* holds the ERR_FLAG bits if rc == ENOSPC; same values as add_pv */
+	__le16	veb_seid;
+#define I40E_AQC_VEB_ERR_FLAG_NO_VEB		0x1
+#define I40E_AQC_VEB_ERR_FLAG_NO_SCHED		0x2
+#define I40E_AQC_VEB_ERR_FLAG_NO_COUNTER	0x4
+#define I40E_AQC_VEB_ERR_FLAG_NO_ENTRY		0x8
+	__le16	statistic_index;
+	__le16	vebs_used;
+	__le16	vebs_free;	/* 6 + 5 * 2 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb_completion);
+
+/* Get VEB Parameters (direct 0x0232)
+ * the command uses i40e_aqc_switch_seid for the descriptor
+ */
+struct i40e_aqc_get_veb_parameters_completion {
+	__le16	seid;
+	__le16	switch_id;
+	__le16	veb_flags; /* only the first/last flags from 0x0230 are valid */
+	__le16	statistic_index;
+	__le16	vebs_used;
+	__le16	vebs_free;
+	u8	reserved[4];	/* 6 * 2 + 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_veb_parameters_completion);
+
+/* Delete Element (direct 0x0243)
+ * uses the generic i40e_aqc_switch_seid
+ */
+
+/* Add MAC-VLAN (indirect 0x0250) */
+
+/* command descriptor shared by most MAC-VLAN / VLAN commands (16 bytes) */
+struct i40e_aqc_macvlan {
+	__le16	num_addresses;	/* number of elements in the data buffer */
+	__le16	seid[3];
+#define I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_MACVLAN_CMD_SEID_NUM_MASK	(0x3FF << \
+					I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT)
+#define I40E_AQC_MACVLAN_CMD_SEID_VALID		0x8000
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_macvlan);
+
+/* indirect data for command and response; 6 + 3 * 2 + 1 + 3 = 16 bytes */
+struct i40e_aqc_add_macvlan_element_data {
+	u8	mac_addr[6];
+	__le16	vlan_tag;
+	__le16	flags;	/* I40E_AQC_MACVLAN_ADD_* bits */
+#define I40E_AQC_MACVLAN_ADD_PERFECT_MATCH	0x0001
+#define I40E_AQC_MACVLAN_ADD_HASH_MATCH		0x0002
+#define I40E_AQC_MACVLAN_ADD_IGNORE_VLAN	0x0004
+#define I40E_AQC_MACVLAN_ADD_TO_QUEUE		0x0008
+	__le16	queue_number;
+#define I40E_AQC_MACVLAN_CMD_QUEUE_SHIFT	0
+#define I40E_AQC_MACVLAN_CMD_QUEUE_MASK		(0x7FF << \
+					I40E_AQC_MACVLAN_CMD_QUEUE_SHIFT)
+	/* response section */
+	u8	match_method;	/* one of I40E_AQC_MM_* */
+#define I40E_AQC_MM_PERFECT_MATCH	0x01
+#define I40E_AQC_MM_HASH_MATCH		0x02
+#define I40E_AQC_MM_ERR_NO_RES		0xFF
+	u8	reserved1[3];
+};
+
+struct i40e_aqc_add_remove_macvlan_completion { /* 4 * 2 + 2 * 4 = 16 bytes */
+	__le16 perfect_mac_used;
+	__le16 perfect_mac_free;
+	__le16 unicast_hash_free;
+	__le16 multicast_hash_free;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_macvlan_completion);
+
+/* Remove MAC-VLAN (indirect 0x0251)
+ * uses i40e_aqc_macvlan for the descriptor
+ * data points to an array of num_addresses of the elements defined below
+ */
+
+struct i40e_aqc_remove_macvlan_element_data {
+	u8	mac_addr[6];
+	__le16	vlan_tag;
+	u8	flags;	/* I40E_AQC_MACVLAN_DEL_* bits */
+#define I40E_AQC_MACVLAN_DEL_PERFECT_MATCH	0x01
+#define I40E_AQC_MACVLAN_DEL_HASH_MATCH		0x02
+#define I40E_AQC_MACVLAN_DEL_IGNORE_VLAN	0x08 /* ADD_IGNORE_VLAN is 0x0004 */
+#define I40E_AQC_MACVLAN_DEL_ALL_VSIS		0x10
+	u8	reserved[3];
+	/* reply section */
+	u8	error_code;
+#define I40E_AQC_REMOVE_MACVLAN_SUCCESS		0x0
+#define I40E_AQC_REMOVE_MACVLAN_FAIL		0xFF
+	u8	reply_reserved[3];	/* 6 + 2 + 1 + 3 + 1 + 3 = 16 bytes */
+};
+
+/* Add VLAN (indirect 0x0252)
+ * Remove VLAN (indirect 0x0253)
+ * both use the generic i40e_aqc_macvlan for the command descriptor
+ */
+struct i40e_aqc_add_remove_vlan_element_data {
+	__le16	vlan_tag;
+	u8	vlan_flags;
+/* flags for add VLAN (TYPE/PTYPE values below are already shifted) */
+#define I40E_AQC_ADD_VLAN_LOCAL			0x1
+#define I40E_AQC_ADD_PVLAN_TYPE_SHIFT		1
+#define I40E_AQC_ADD_PVLAN_TYPE_MASK	(0x3 << I40E_AQC_ADD_PVLAN_TYPE_SHIFT)
+#define I40E_AQC_ADD_PVLAN_TYPE_REGULAR		0x0
+#define I40E_AQC_ADD_PVLAN_TYPE_PRIMARY		0x2
+#define I40E_AQC_ADD_PVLAN_TYPE_SECONDARY	0x4
+#define I40E_AQC_VLAN_PTYPE_SHIFT		3
+#define I40E_AQC_VLAN_PTYPE_MASK	(0x3 << I40E_AQC_VLAN_PTYPE_SHIFT)
+#define I40E_AQC_VLAN_PTYPE_REGULAR_VSI		0x0
+#define I40E_AQC_VLAN_PTYPE_PROMISC_VSI		0x8
+#define I40E_AQC_VLAN_PTYPE_COMMUNITY_VSI	0x10
+#define I40E_AQC_VLAN_PTYPE_ISOLATED_VSI	0x18
+/* flags for remove VLAN */
+#define I40E_AQC_REMOVE_VLAN_ALL	0x1
+	u8	reserved;
+	u8	result;
+/* result codes for add VLAN */
+#define I40E_AQC_ADD_VLAN_SUCCESS	0x0
+#define I40E_AQC_ADD_VLAN_FAIL_REQUEST	0xFE
+#define I40E_AQC_ADD_VLAN_FAIL_RESOURCE	0xFF
+/* result codes for remove VLAN */
+#define I40E_AQC_REMOVE_VLAN_SUCCESS	0x0
+#define I40E_AQC_REMOVE_VLAN_FAIL	0xFF
+	u8	reserved1[3];	/* 2 + 1 + 1 + 1 + 3 = 8 bytes */
+};
+
+struct i40e_aqc_add_remove_vlan_completion {
+	u8	reserved[4];
+	__le16	vlans_used;
+	__le16	vlans_free;
+	__le32	addr_high;
+	__le32	addr_low;	/* 4 + 2 * 2 + 2 * 4 = 16 bytes */
+};	/* NOTE(review): no I40E_CHECK_CMD_LENGTH follows this struct */
+
+/* Set VSI Promiscuous Modes (direct 0x0254); 4 * 2 + 8 = 16 bytes */
+struct i40e_aqc_set_vsi_promiscuous_modes {
+	__le16	promiscuous_flags;
+	__le16	valid_flags;
+/* flags used for both fields above */
+#define I40E_AQC_SET_VSI_PROMISC_UNICAST	0x01
+#define I40E_AQC_SET_VSI_PROMISC_MULTICAST	0x02
+#define I40E_AQC_SET_VSI_PROMISC_BROADCAST	0x04
+#define I40E_AQC_SET_VSI_DEFAULT		0x08
+#define I40E_AQC_SET_VSI_PROMISC_VLAN		0x10
+	__le16	seid;
+#define I40E_AQC_VSI_PROM_CMD_SEID_MASK		0x3FF
+	__le16	vlan_tag;
+#define I40E_AQC_SET_VSI_VLAN_MASK		0x0FFF
+#define I40E_AQC_SET_VSI_VLAN_VALID		0x8000
+	u8	reserved[8];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_vsi_promiscuous_modes);
+
+/* Add S/E-tag command (direct 0x0255)
+ * The completion uses the generic i40e_aqc_add_remove_tag_completion
+ */
+struct i40e_aqc_add_tag {
+	__le16	flags;
+#define I40E_AQC_ADD_TAG_FLAG_TO_QUEUE		0x0001
+	__le16	seid;
+#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_MASK	(0x3FF << \
+					I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT)
+	__le16	tag;
+	__le16	queue_number;
+	u8	reserved[8];	/* 4 * 2 + 8 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_tag);
+
+struct i40e_aqc_add_remove_tag_completion { /* 12 + 2 * 2 = 16 bytes */
+	u8	reserved[12];
+	__le16	tags_used;
+	__le16	tags_free;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_tag_completion);
+
+/* Remove S/E-tag command (direct 0x0256)
+ * The completion uses the generic i40e_aqc_add_remove_tag_completion
+ */
+struct i40e_aqc_remove_tag {
+	__le16	seid;
+#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_MASK	(0x3FF << \
+					I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT)
+	__le16	tag;
+	u8	reserved[12];	/* 2 * 2 + 12 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_tag);
+
+/* Add multicast E-Tag (direct 0x0257)
+ * Delete multicast E-Tag (direct 0x0258) only uses pv_seid and etag fields
+ * and no external data
+ */
+struct i40e_aqc_add_remove_mcast_etag {
+	__le16	pv_seid;
+	__le16	etag;
+	u8	num_unicast_etags;
+	u8	reserved[3];
+	__le32	addr_high;          /* address of array of 2-byte s-tags */
+	__le32	addr_low;	/* 2 * 2 + 1 + 3 + 2 * 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag);
+
+struct i40e_aqc_add_remove_mcast_etag_completion {
+	u8	reserved[4];
+	__le16	mcast_etags_used;
+	__le16	mcast_etags_free;
+	__le32	addr_high;
+	__le32	addr_low;	/* 4 + 2 * 2 + 2 * 4 = 16 bytes */
+
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag_completion);
+
+/* Update S/E-Tag (direct 0x0259); 3 * 2 + 10 = 16 bytes */
+struct i40e_aqc_update_tag {
+	__le16	seid;
+#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_MASK	(0x3FF << \
+					I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT)
+	__le16	old_tag;
+	__le16	new_tag;
+	u8	reserved[10];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag);
+
+struct i40e_aqc_update_tag_completion { /* same layout as add_remove_tag_completion */
+	u8	reserved[12];
+	__le16	tags_used;
+	__le16	tags_free;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag_completion);
+
+/* Add Control Packet filter (direct 0x025A)
+ * Remove Control Packet filter (direct 0x025B)
+ * uses the i40e_aqc_add_remove_control_packet_filter,
+ * and the generic direct completion structure
+ */
+struct i40e_aqc_add_remove_control_packet_filter {
+	u8	mac[6];
+	__le16	etype;
+	__le16	flags;	/* I40E_AQC_ADD_CONTROL_PACKET_FLAGS_* bits */
+#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC	0x0001
+#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_DROP		0x0002
+#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TO_QUEUE	0x0004
+#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX		0x0008
+#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX		0x0000
+	__le16	seid;
+#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_MASK	(0x3FF << \
+				I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT)
+	__le16	queue;
+	u8	reserved[2];	/* 6 + 4 * 2 + 2 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter);
+
+struct i40e_aqc_add_remove_control_packet_filter_completion {
+	__le16	mac_etype_used;
+	__le16	etype_used;
+	__le16	mac_etype_free;
+	__le16	etype_free;
+	u8	reserved[8];	/* 4 * 2 + 8 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter_completion);
+
+/* Add Cloud filters (indirect 0x025C)
+ * Remove Cloud filters (indirect 0x025D)
+ * both use i40e_aqc_add_remove_cloud_filters for the descriptor,
+ * and the generic indirect completion structure
+ */
+struct i40e_aqc_add_remove_cloud_filters {
+	u8	num_filters;
+	u8	reserved;
+	__le16	seid;
+#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT	0
+#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK	(0x3FF << \
+					I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT)
+	u8	reserved2[4];
+	__le32	addr_high;
+	__le32	addr_low;	/* 1 + 1 + 2 + 4 + 2 * 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters);
+
+struct i40e_aqc_add_remove_cloud_filters_element_data { /* fields total 64 bytes */
+	u8	outer_mac[6];
+	u8	inner_mac[6];
+	__le16	inner_vlan;
+	union {
+		struct {
+			u8 reserved[12];
+			u8 data[4];	/* occupies the last 4 of the 16 bytes */
+		} v4;
+		struct {
+			u8 data[16];
+		} v6;
+	} ipaddr;
+	__le16	flags;
+#define I40E_AQC_ADD_CLOUD_FILTER_SHIFT			0
+#define I40E_AQC_ADD_CLOUD_FILTER_MASK	(0x3F << \
+					I40E_AQC_ADD_CLOUD_FILTER_SHIFT)
+/* 0x0000 reserved */
+#define I40E_AQC_ADD_CLOUD_FILTER_OIP			0x0001
+/* 0x0002 reserved */
+#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN		0x0003
+#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN_TEN_ID	0x0004
+/* 0x0005 reserved */
+#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_TEN_ID		0x0006
+/* 0x0007 reserved */
+/* 0x0008 reserved */
+#define I40E_AQC_ADD_CLOUD_FILTER_OMAC			0x0009
+#define I40E_AQC_ADD_CLOUD_FILTER_IMAC			0x000A
+#define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC	0x000B
+#define I40E_AQC_ADD_CLOUD_FILTER_IIP			0x000C
+
+#define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE		0x0080
+#define I40E_AQC_ADD_CLOUD_VNK_SHIFT			6
+#define I40E_AQC_ADD_CLOUD_VNK_MASK			0x00C0 /* NOTE(review): includes TO_QUEUE bit 0x0080 - confirm */
+#define I40E_AQC_ADD_CLOUD_FLAGS_IPV4			0
+#define I40E_AQC_ADD_CLOUD_FLAGS_IPV6			0x0100
+
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT		9
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_MASK		0x1E00
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_XVLAN		0 /* TNL_TYPE values are unshifted */
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_NVGRE_OMAC		1
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_NGE			2
+#define I40E_AQC_ADD_CLOUD_TNL_TYPE_IP			3
+
+	__le32	tenant_id;
+	u8	reserved[4];
+	__le16	queue_number;
+#define I40E_AQC_ADD_CLOUD_QUEUE_SHIFT		0
+#define I40E_AQC_ADD_CLOUD_QUEUE_MASK		(0x7FF << \
+						 I40E_AQC_ADD_CLOUD_QUEUE_SHIFT)
+	u8	reserved2[14];
+	/* response section */
+	u8	allocation_result;
+#define I40E_AQC_ADD_CLOUD_FILTER_SUCCESS	0x0
+#define I40E_AQC_ADD_CLOUD_FILTER_FAIL		0xFF
+	u8	response_reserved[7];
+};
+
+struct i40e_aqc_remove_cloud_filters_completion { /* 4 * 2 + 2 * 4 = 16 bytes */
+	__le16 perfect_ovlan_used;
+	__le16 perfect_ovlan_free;
+	__le16 vlan_used;
+	__le16 vlan_free;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion);
+
+/* Add Mirror Rule (indirect or direct 0x0260)
+ * Delete Mirror Rule (indirect or direct 0x0261)
+ * note: rule types 4 and 5 (ALL_INGRESS, ALL_EGRESS) use no external buffer.
+ *       take care to set the flags correctly.
+ */
+struct i40e_aqc_add_delete_mirror_rule {
+	__le16 seid;
+	__le16 rule_type;	/* one of I40E_AQC_MIRROR_RULE_TYPE_* (1-5) */
+#define I40E_AQC_MIRROR_RULE_TYPE_SHIFT		0
+#define I40E_AQC_MIRROR_RULE_TYPE_MASK		(0x7 << \
+						I40E_AQC_MIRROR_RULE_TYPE_SHIFT)
+#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_INGRESS	1
+#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_EGRESS	2
+#define I40E_AQC_MIRROR_RULE_TYPE_VLAN		3
+#define I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS	4
+#define I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS	5
+	__le16 num_entries;
+	__le16 destination;  /* VSI for add, rule id for delete */
+	__le32 addr_high;    /* address of array of 2-byte VSI or VLAN ids */
+	__le32 addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule);
+
+struct i40e_aqc_add_delete_mirror_rule_completion {
+	u8	reserved[2];
+	__le16	rule_id;  /* only used on add */
+	__le16	mirror_rules_used;
+	__le16	mirror_rules_free;
+	__le32	addr_high;
+	__le32	addr_low;	/* 2 + 3 * 2 + 2 * 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule_completion);
+
+/* DCB 0x03xx */
+
+/* PFC Ignore (direct 0x0301)
+ *    the command and response share this descriptor layout (1 + 1 + 14 bytes)
+ */
+struct i40e_aqc_pfc_ignore {
+	u8	tc_bitmap;
+	u8	command_flags; /* unused on response */
+#define I40E_AQC_PFC_IGNORE_SET		0x80
+#define I40E_AQC_PFC_IGNORE_CLEAR	0x0
+	u8	reserved[14];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_pfc_ignore);
+
+/* DCB Update (direct 0x0302) uses the i40e_aq_desc structure
+ * with no parameters
+ */
+
+/* TX scheduler 0x04xx */
+
+/* Almost all the indirect TX scheduler commands use
+ * this generic struct to pass the SEID in param0
+ */
+struct i40e_aqc_tx_sched_ind {
+	__le16	vsi_seid;
+	u8	reserved[6];
+	__le32	addr_high;
+	__le32	addr_low;	/* 2 + 6 + 2 * 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_tx_sched_ind);
+
+/* Several commands respond with a set of eight queue set handles */
+struct i40e_aqc_qs_handles_resp {
+	__le16 qs_handles[8];
+};
+
+/* Configure VSI BW limits (direct 0x0400); fields total 16 bytes */
+struct i40e_aqc_configure_vsi_bw_limit {
+	__le16	vsi_seid;
+	u8	reserved[2];
+	__le16	credit;
+	u8	reserved1[2];
+	u8	max_credit; /* 0-3, limit = 2^max */
+	u8	reserved2[7];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_vsi_bw_limit);
+
+/* Configure VSI Bandwidth Limit per Traffic Type (indirect 0x0406)
+ *    responds with i40e_aqc_qs_handles_resp
+ */
+struct i40e_aqc_configure_vsi_ets_sla_bw_data {
+	u8	tc_valid_bits;
+	u8	reserved[15];
+	__le16	tc_bw_credits[8]; /* FW writes back QS handles here */
+
+	/* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */
+	__le16	tc_bw_max[2];
+	u8	reserved1[28];	/* 1 + 15 + 16 + 4 + 28 = 64 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_configure_vsi_ets_sla_bw_data);
+
+/* Configure VSI Bandwidth Allocation per Traffic Type (indirect 0x0407)
+ *    responds with i40e_aqc_qs_handles_resp
+ */
+struct i40e_aqc_configure_vsi_tc_bw_data {
+	u8	tc_valid_bits;
+	u8	reserved[3];
+	u8	tc_bw_credits[8];
+	u8	reserved1[4];
+	__le16	qs_handles[8];	/* 1 + 3 + 8 + 4 + 16 = 32 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_vsi_tc_bw_data);
+
+/* Query VSI BW configuration (indirect 0x0408); 64-byte response buffer */
+struct i40e_aqc_query_vsi_bw_config_resp {
+	u8	tc_valid_bits;
+	u8	tc_suspended_bits;
+	u8	reserved[14];
+	__le16	qs_handles[8];
+	u8	reserved1[4];
+	__le16	port_bw_limit;
+	u8	reserved2[2];
+	u8	max_bw; /* 0-3, limit = 2^max */
+	u8	reserved3[23];
+};
+
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_vsi_bw_config_resp);
+
+/* Query VSI Bandwidth Allocation per Traffic Type (indirect 0x040A) */
+struct i40e_aqc_query_vsi_ets_sla_config_resp {
+	u8	tc_valid_bits;
+	u8	reserved[3];
+	u8	share_credits[8];
+	__le16	credits[8];
+
+	/* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */
+	__le16	tc_bw_max[2];	/* 1 + 3 + 8 + 16 + 4 = 32 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_vsi_ets_sla_config_resp);
+
+/* Configure Switching Component Bandwidth Limit (direct 0x0410); 16 bytes */
+struct i40e_aqc_configure_switching_comp_bw_limit {
+	__le16	seid;
+	u8	reserved[2];
+	__le16	credit;
+	u8	reserved1[2];
+	u8	max_bw; /* 0-3, limit = 2^max */
+	u8	reserved2[7];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_switching_comp_bw_limit);
+
+/* Enable  Physical Port ETS (indirect 0x0413)
+ * Modify  Physical Port ETS (indirect 0x0414)
+ * Disable Physical Port ETS (indirect 0x0415)
+ */
+struct i40e_aqc_configure_switching_comp_ets_data {
+	u8	reserved[4];
+	u8	tc_valid_bits;
+	u8	seepage;	/* bit 0 is I40E_AQ_ETS_SEEPAGE_EN_MASK */
+#define I40E_AQ_ETS_SEEPAGE_EN_MASK	0x1
+	u8	tc_strict_priority_flags;
+	u8	reserved1[17];
+	u8	tc_bw_share_credits[8];
+	u8	reserved2[96];	/* 4 + 3 + 17 + 8 + 96 = 128 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_configure_switching_comp_ets_data);
+
+/* Configure Switching Component Bandwidth Limits per TC (indirect 0x0416) */
+struct i40e_aqc_configure_switching_comp_ets_bw_limit_data {
+	u8	tc_valid_bits;
+	u8	reserved[15];
+	__le16	tc_bw_credit[8];
+
+	/* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */
+	__le16	tc_bw_max[2];
+	u8	reserved1[28];	/* 1 + 15 + 16 + 4 + 28 = 64 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_configure_switching_comp_ets_bw_limit_data);
+
+/* Configure Switching Component Bandwidth Allocation per TC
+ * (indirect 0x0417); data buffer is 1 + 2 + 1 + 8 + 20 = 32 bytes
+ */
+struct i40e_aqc_configure_switching_comp_bw_config_data {
+	u8	tc_valid_bits;
+	u8	reserved[2];
+	u8	absolute_credits; /* bool */
+	u8	tc_bw_share_credits[8];
+	u8	reserved1[20];
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_switching_comp_bw_config_data);
+
+/* Query Switching Component Configuration (indirect 0x0418); 64 bytes */
+struct i40e_aqc_query_switching_comp_ets_config_resp {
+	u8	tc_valid_bits;
+	u8	reserved[35];
+	__le16	port_bw_limit;
+	u8	reserved1[2];
+	u8	tc_bw_max; /* 0-3, limit = 2^max */
+	u8	reserved2[23];
+};
+
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_switching_comp_ets_config_resp);
+
+/* Query Physical Port ETS Configuration (indirect 0x0419) */
+struct i40e_aqc_query_port_ets_config_resp {
+	u8	reserved[4];
+	u8	tc_valid_bits;
+	u8	reserved1;
+	u8	tc_strict_priority_bits;
+	u8	reserved2;
+	u8	tc_bw_share_credits[8];
+	__le16	tc_bw_limits[8];
+
+	/* 4 bits per tc 0-7, 4th bit reserved, limit = 2^max */
+	__le16	tc_bw_max[2];
+	u8	reserved3[32];	/* 4 + 4 + 8 + 16 + 4 + 32 = 68 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x44, i40e_aqc_query_port_ets_config_resp);
+
+/* Query Switching Component Bandwidth Allocation per Traffic Type
+ * (indirect 0x041A); response is 1 + 2 + 1 + 8 + 16 + 4 = 32 bytes
+ */
+struct i40e_aqc_query_switching_comp_bw_config_resp {
+	u8	tc_valid_bits;
+	u8	reserved[2];
+	u8	absolute_credits_enable; /* bool */
+	u8	tc_bw_share_credits[8];
+	__le16	tc_bw_limits[8];
+
+	/* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */
+	__le16	tc_bw_max[2];
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_switching_comp_bw_config_resp);
+
+/* Suspend/resume port TX traffic
+ * (direct 0x041B and 0x041C) uses the generic SEID struct
+ */
+
+/* Configure partition BW
+ * (indirect 0x041D); data buffer is 2 + 16 + 16 = 34 bytes
+ */
+struct i40e_aqc_configure_partition_bw_data {
+	__le16	pf_valid_bits;
+	u8	min_bw[16];      /* guaranteed bandwidth */
+	u8	max_bw[16];      /* bandwidth limit */
+};
+
+I40E_CHECK_STRUCT_LEN(0x22, i40e_aqc_configure_partition_bw_data);
+
+/* Get and set the active HMC resource profile and status.
+ * (direct 0x0500) and (direct 0x0501); 1 + 1 + 14 = 16 bytes
+ */
+struct i40e_aq_get_set_hmc_resource_profile {
+	u8	pm_profile;
+	u8	pe_vf_enabled;
+	u8	reserved[14];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aq_get_set_hmc_resource_profile);
+
+enum i40e_aq_hmc_profile {
+	/* value 0 (I40E_HMC_PROFILE_NO_CHANGE) is reserved and not defined */
+	I40E_HMC_PROFILE_DEFAULT	= 1,
+	I40E_HMC_PROFILE_FAVOR_VF	= 2,
+	I40E_HMC_PROFILE_EQUAL		= 3,
+};
+
+#define I40E_AQ_GET_HMC_RESOURCE_PROFILE_PM_MASK	0xF
+#define I40E_AQ_GET_HMC_RESOURCE_PROFILE_COUNT_MASK	0x3F
+
+/* Get PHY Abilities (indirect 0x0600) uses the generic indirect struct */
+
+/* flags set in param0 of the get phy abilities command */
+#define I40E_AQ_PHY_REPORT_QUALIFIED_MODULES	0x0001
+#define I40E_AQ_PHY_REPORT_INITIAL_VALUES	0x0002
+
+enum i40e_aq_phy_type {
+	I40E_PHY_TYPE_SGMII			= 0x0,
+	I40E_PHY_TYPE_1000BASE_KX		= 0x1,
+	I40E_PHY_TYPE_10GBASE_KX4		= 0x2,
+	I40E_PHY_TYPE_10GBASE_KR		= 0x3,
+	I40E_PHY_TYPE_40GBASE_KR4		= 0x4,
+	I40E_PHY_TYPE_XAUI			= 0x5,
+	I40E_PHY_TYPE_XFI			= 0x6,
+	I40E_PHY_TYPE_SFI			= 0x7,
+	I40E_PHY_TYPE_XLAUI			= 0x8,
+	I40E_PHY_TYPE_XLPPI			= 0x9,
+	I40E_PHY_TYPE_40GBASE_CR4_CU		= 0xA,
+	I40E_PHY_TYPE_10GBASE_CR1_CU		= 0xB,
+	I40E_PHY_TYPE_10GBASE_AOC		= 0xC,
+	I40E_PHY_TYPE_40GBASE_AOC		= 0xD,
+	I40E_PHY_TYPE_100BASE_TX		= 0x11, /* 0xE-0x10 not defined */
+	I40E_PHY_TYPE_1000BASE_T		= 0x12,
+	I40E_PHY_TYPE_10GBASE_T			= 0x13,
+	I40E_PHY_TYPE_10GBASE_SR		= 0x14,
+	I40E_PHY_TYPE_10GBASE_LR		= 0x15,
+	I40E_PHY_TYPE_10GBASE_SFPP_CU		= 0x16,
+	I40E_PHY_TYPE_10GBASE_CR1		= 0x17,
+	I40E_PHY_TYPE_40GBASE_CR4		= 0x18,
+	I40E_PHY_TYPE_40GBASE_SR4		= 0x19,
+	I40E_PHY_TYPE_40GBASE_LR4		= 0x1A,
+	I40E_PHY_TYPE_1000BASE_SX		= 0x1B,
+	I40E_PHY_TYPE_1000BASE_LX		= 0x1C,
+	I40E_PHY_TYPE_1000BASE_T_OPTICAL	= 0x1D,
+	I40E_PHY_TYPE_20GBASE_KR2		= 0x1E,
+	I40E_PHY_TYPE_MAX	/* implicit 0x1F, one past the last type */
+};
+
+#define I40E_LINK_SPEED_100MB_SHIFT	0x1
+#define I40E_LINK_SPEED_1000MB_SHIFT	0x2
+#define I40E_LINK_SPEED_10GB_SHIFT	0x3
+#define I40E_LINK_SPEED_40GB_SHIFT	0x4
+#define I40E_LINK_SPEED_20GB_SHIFT	0x5
+
+enum i40e_aq_link_speed {	/* one bit per speed; note 20GB sits above 40GB */
+	I40E_LINK_SPEED_UNKNOWN	= 0,
+	I40E_LINK_SPEED_100MB	= (1 << I40E_LINK_SPEED_100MB_SHIFT),	/* 0x02 */
+	I40E_LINK_SPEED_1GB	= (1 << I40E_LINK_SPEED_1000MB_SHIFT),	/* 0x04 */
+	I40E_LINK_SPEED_10GB	= (1 << I40E_LINK_SPEED_10GB_SHIFT),	/* 0x08 */
+	I40E_LINK_SPEED_40GB	= (1 << I40E_LINK_SPEED_40GB_SHIFT),	/* 0x10 */
+	I40E_LINK_SPEED_20GB	= (1 << I40E_LINK_SPEED_20GB_SHIFT)	/* 0x20 */
+};
+
+struct i40e_aqc_module_desc {	/* 3 + 1 + 16 + 4 + 8 = 32 bytes */
+	u8 oui[3];
+	u8 reserved1;
+	u8 part_number[16];
+	u8 revision[4];
+	u8 reserved2[8];
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_module_desc);
+
+struct i40e_aq_get_phy_abilities_resp { /* 24-byte header + 16 * 32-byte modules */
+	__le32	phy_type;       /* bitmap using the above enum for offsets */
+	u8	link_speed;     /* bitmap using the above enum bit patterns */
+	u8	abilities;	/* I40E_AQ_PHY_* bits below */
+#define I40E_AQ_PHY_FLAG_PAUSE_TX	0x01
+#define I40E_AQ_PHY_FLAG_PAUSE_RX	0x02
+#define I40E_AQ_PHY_FLAG_LOW_POWER	0x04
+#define I40E_AQ_PHY_LINK_ENABLED	0x08
+#define I40E_AQ_PHY_AN_ENABLED		0x10
+#define I40E_AQ_PHY_FLAG_MODULE_QUAL	0x20
+	__le16	eee_capability;	/* I40E_AQ_EEE_* bits below */
+#define I40E_AQ_EEE_100BASE_TX		0x0002
+#define I40E_AQ_EEE_1000BASE_T		0x0004
+#define I40E_AQ_EEE_10GBASE_T		0x0008
+#define I40E_AQ_EEE_1000BASE_KX		0x0010
+#define I40E_AQ_EEE_10GBASE_KX4		0x0020
+#define I40E_AQ_EEE_10GBASE_KR		0x0040
+	__le32	eeer_val;
+	u8	d3_lpan;
+#define I40E_AQ_SET_PHY_D3_LPAN_ENA	0x01
+	u8	reserved[3];
+	u8	phy_id[4];
+	u8	module_type[3];
+	u8	qualified_module_count;
+#define I40E_AQ_PHY_MAX_QMS		16
+	struct i40e_aqc_module_desc	qualified_module[I40E_AQ_PHY_MAX_QMS];
+};
+
+I40E_CHECK_STRUCT_LEN(0x218, i40e_aq_get_phy_abilities_resp);
+
+/* Set PHY Config (direct 0x0601); 4 + 1 + 1 + 2 + 4 + 1 + 3 = 16 bytes */
+struct i40e_aq_set_phy_config { /* same bits as above in all */
+	__le32	phy_type;
+	u8	link_speed;
+	u8	abilities;
+/* bits 0-2 use the values from get_phy_abilities_resp */
+#define I40E_AQ_PHY_ENABLE_LINK		0x08
+#define I40E_AQ_PHY_ENABLE_AN		0x10
+#define I40E_AQ_PHY_ENABLE_ATOMIC_LINK	0x20
+	__le16	eee_capability;
+	__le32	eeer;
+	u8	low_power_ctrl;
+	u8	reserved[3];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config);
+
+/* Set MAC Config command data structure (direct 0x0603); 16 bytes */
+struct i40e_aq_set_mac_config {
+	__le16	max_frame_size;
+	u8	params;
+#define I40E_AQ_SET_MAC_CONFIG_CRC_EN		0x04
+#define I40E_AQ_SET_MAC_CONFIG_PACING_MASK	0x78
+#define I40E_AQ_SET_MAC_CONFIG_PACING_SHIFT	3
+#define I40E_AQ_SET_MAC_CONFIG_PACING_NONE	0x0 /* PACING values are unshifted */
+#define I40E_AQ_SET_MAC_CONFIG_PACING_1B_13TX	0xF
+#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_9TX	0x9
+#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_4TX	0x8
+#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_7TX	0x7
+#define I40E_AQ_SET_MAC_CONFIG_PACING_2DW_3TX	0x6
+#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_1TX	0x5
+#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_2TX	0x4
+#define I40E_AQ_SET_MAC_CONFIG_PACING_7DW_3TX	0x3
+#define I40E_AQ_SET_MAC_CONFIG_PACING_4DW_1TX	0x2
+#define I40E_AQ_SET_MAC_CONFIG_PACING_9DW_1TX	0x1
+	u8	tx_timer_priority; /* bitmap */
+	__le16	tx_timer_value;
+	__le16	fc_refresh_threshold;
+	u8	reserved[8];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aq_set_mac_config);
+
+/* Restart Auto-Negotiation (direct 0x605); 1 + 15 = 16 bytes */
+struct i40e_aqc_set_link_restart_an {
+	u8	command;
+#define I40E_AQ_PHY_RESTART_AN	0x02
+#define I40E_AQ_PHY_LINK_ENABLE	0x04
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_link_restart_an);
+
+/* Get Link Status cmd & response data structure (direct 0x0607); 16 bytes */
+struct i40e_aqc_get_link_status {
+	__le16	command_flags; /* only field set on command */
+#define I40E_AQ_LSE_MASK		0x3
+#define I40E_AQ_LSE_NOP			0x0
+#define I40E_AQ_LSE_DISABLE		0x2
+#define I40E_AQ_LSE_ENABLE		0x3
+/* only response uses this flag */
+#define I40E_AQ_LSE_IS_ENABLED		0x1
+	u8	phy_type;    /* i40e_aq_phy_type   */
+	u8	link_speed;  /* i40e_aq_link_speed */
+	u8	link_info;
+#define I40E_AQ_LINK_UP			0x01    /* obsolete */
+#define I40E_AQ_LINK_UP_FUNCTION	0x01
+#define I40E_AQ_LINK_FAULT		0x02
+#define I40E_AQ_LINK_FAULT_TX		0x04
+#define I40E_AQ_LINK_FAULT_RX		0x08
+#define I40E_AQ_LINK_FAULT_REMOTE	0x10
+#define I40E_AQ_LINK_UP_PORT		0x20
+#define I40E_AQ_MEDIA_AVAILABLE		0x40
+#define I40E_AQ_SIGNAL_DETECT		0x80
+	u8	an_info;
+#define I40E_AQ_AN_COMPLETED		0x01
+#define I40E_AQ_LP_AN_ABILITY		0x02
+#define I40E_AQ_PD_FAULT		0x04
+#define I40E_AQ_FEC_EN			0x08
+#define I40E_AQ_PHY_LOW_POWER		0x10
+#define I40E_AQ_LINK_PAUSE_TX		0x20
+#define I40E_AQ_LINK_PAUSE_RX		0x40
+#define I40E_AQ_QUALIFIED_MODULE	0x80
+	u8	ext_info;
+#define I40E_AQ_LINK_PHY_TEMP_ALARM	0x01
+#define I40E_AQ_LINK_XCESSIVE_ERRORS	0x02
+#define I40E_AQ_LINK_TX_SHIFT		0x02 /* a shift count, not a flag bit */
+#define I40E_AQ_LINK_TX_MASK		(0x03 << I40E_AQ_LINK_TX_SHIFT)
+#define I40E_AQ_LINK_TX_ACTIVE		0x00 /* TX values are unshifted */
+#define I40E_AQ_LINK_TX_DRAINED		0x01
+#define I40E_AQ_LINK_TX_FLUSHED		0x03
+#define I40E_AQ_LINK_FORCED_40G		0x10
+	u8	loopback; /* use defines from i40e_aqc_set_lb_mode */
+	__le16	max_frame_size;
+	u8	config;
+#define I40E_AQ_CONFIG_CRC_ENA		0x04
+#define I40E_AQ_CONFIG_PACING_MASK	0x78
+	u8	reserved[5];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status);
+
+/* Set event mask command (direct 0x613); 8 + 2 + 6 = 16 bytes */
+struct i40e_aqc_set_phy_int_mask {
+	u8	reserved[8];
+	__le16	event_mask;	/* I40E_AQ_EVENT_* bits */
+#define I40E_AQ_EVENT_LINK_UPDOWN	0x0002
+#define I40E_AQ_EVENT_MEDIA_NA		0x0004
+#define I40E_AQ_EVENT_LINK_FAULT	0x0008
+#define I40E_AQ_EVENT_PHY_TEMP_ALARM	0x0010
+#define I40E_AQ_EVENT_EXCESSIVE_ERRORS	0x0020
+#define I40E_AQ_EVENT_SIGNAL_DETECT	0x0040
+#define I40E_AQ_EVENT_AN_COMPLETED	0x0080
+#define I40E_AQ_EVENT_MODULE_QUAL_FAIL	0x0100
+#define I40E_AQ_EVENT_PORT_TX_SUSPENDED	0x0200
+	u8	reserved1[6];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_int_mask);
+
+/* Get Local AN advt register (direct 0x0614)
+ * Set Local AN advt register (direct 0x0615)
+ * Get Link Partner AN advt register (direct 0x0616)
+ */
+struct i40e_aqc_an_advt_reg {
+	__le32	local_an_reg0;
+	__le16	local_an_reg1;
+	u8	reserved[10];	/* 4 + 2 + 10 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_an_advt_reg);
+
+/* Set Loopback mode (0x0618); 2 + 14 = 16 bytes */
+struct i40e_aqc_set_lb_mode {
+	__le16	lb_mode;	/* I40E_AQ_LB_* value */
+#define I40E_AQ_LB_PHY_LOCAL	0x01
+#define I40E_AQ_LB_PHY_REMOTE	0x02
+#define I40E_AQ_LB_MAC_LOCAL	0x04
+	u8	reserved[14];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_lb_mode);
+
+/* Set PHY Debug command (0x0622); 1 + 15 = 16 bytes */
+struct i40e_aqc_set_phy_debug {
+	u8	command_flags;
+#define I40E_AQ_PHY_DEBUG_RESET_INTERNAL	0x02
+#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT	2
+#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_MASK	(0x03 << \
+					I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT)
+#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_NONE	0x00 /* values are unshifted */
+#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_HARD	0x01
+#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SOFT	0x02
+#define I40E_AQ_PHY_DEBUG_DISABLE_LINK_FW	0x10
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_debug);
+
+enum i40e_aq_phy_reg_type {	/* sic: "EXERNAL" is part of the identifiers */
+	I40E_AQC_PHY_REG_INTERNAL	= 0x1,
+	I40E_AQC_PHY_REG_EXERNAL_BASET	= 0x2,
+	I40E_AQC_PHY_REG_EXERNAL_MODULE	= 0x3
+};
+
+/* NVM Read command (indirect 0x0701)
+ * NVM Erase commands (direct 0x0702)
+ * NVM Update commands (indirect 0x0703)
+ */
+struct i40e_aqc_nvm_update {
+	u8	command_flags;	/* I40E_AQ_NVM_* bits */
+#define I40E_AQ_NVM_LAST_CMD	0x01
+#define I40E_AQ_NVM_FLASH_ONLY	0x80
+	u8	module_pointer;
+	__le16	length;
+	__le32	offset;
+	__le32	addr_high;
+	__le32	addr_low;	/* 1 + 1 + 2 + 3 * 4 = 16 bytes */
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_update);
+
+/* NVM Config Read (indirect 0x0704); 4 * 2 + 2 * 4 = 16 bytes */
+struct i40e_aqc_nvm_config_read {
+	__le16	cmd_flags;
+#define I40E_AQ_ANVM_SINGLE_OR_MULTIPLE_FEATURES_MASK	1
+#define I40E_AQ_ANVM_READ_SINGLE_FEATURE		0
+#define I40E_AQ_ANVM_READ_MULTIPLE_FEATURES		1
+	__le16	element_count;
+	__le16	element_id;     /* Feature/field ID */
+	__le16	element_id_msw;	/* MSWord of field ID */
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_read);
+
+/* NVM Config Write (indirect 0x0705); 2 * 2 + 4 + 2 * 4 = 16 bytes */
+struct i40e_aqc_nvm_config_write {
+	__le16	cmd_flags;
+	__le16	element_count;
+	u8	reserved[4];
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_write);
+
+/* Used for 0x0704 as well as for 0x0705 commands; bit 1 is feature vs. immediate */
+#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT		1
+#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_MASK		(1 << I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT)
+#define I40E_AQ_ANVM_FEATURE				0
+#define I40E_AQ_ANVM_IMMEDIATE_FIELD			(1 << I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT)
+struct i40e_aqc_nvm_config_data_feature {	/* 3 * 2 = 6 bytes */
+	__le16 feature_id;
+#define I40E_AQ_ANVM_FEATURE_OPTION_OEM_ONLY		0x01
+#define I40E_AQ_ANVM_FEATURE_OPTION_DWORD_MAP		0x08
+#define I40E_AQ_ANVM_FEATURE_OPTION_POR_CSR		0x10
+	__le16 feature_options;	/* presumably the FEATURE_OPTION_* bits above */
+	__le16 feature_selection;
+};
+
+I40E_CHECK_STRUCT_LEN(0x6, i40e_aqc_nvm_config_data_feature);
+
+struct i40e_aqc_nvm_config_data_immediate_field {
+	__le32 field_id;
+	__le32 field_value;
+	__le16 field_options;
+	__le16 reserved;
+};
+
+I40E_CHECK_STRUCT_LEN(0xc, i40e_aqc_nvm_config_data_immediate_field);	/* 4+4+2+2 */
+
+/* OEM Post Update (indirect 0x0720)
+ * NOTE(review): originally "no command data struct used", yet one follows
+ */
+ struct i40e_aqc_nvm_oem_post_update {
+#define I40E_AQ_NVM_OEM_POST_UPDATE_EXTERNAL_DATA	0x01
+	u8 sel_data;
+	u8 reserved[7];
+};
+
+I40E_CHECK_STRUCT_LEN(0x8, i40e_aqc_nvm_oem_post_update);	/* 1+7 bytes */
+
+struct i40e_aqc_nvm_oem_post_update_buffer {
+	u8 str_len;
+	u8 dev_addr;
+	__le16 eeprom_addr;
+	u8 data[36];
+};
+
+I40E_CHECK_STRUCT_LEN(0x28, i40e_aqc_nvm_oem_post_update_buffer);	/* 1+1+2+36 */
+
+/* Send to PF command (indirect 0x0801) id is only used by PF
+ * Send to VF command (indirect 0x0802) id is only used by PF
+ * Send to Peer PF command (indirect 0x0803)
+ */
+struct i40e_aqc_pf_vf_message {
+	__le32	id;
+	u8	reserved[4];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_pf_vf_message);	/* 4+4+4+4 = 16 bytes */
+
+/* Alternate structure */
+
+/* Direct write (direct 0x0900)
+ * Direct read (direct 0x0902)
+ */
+struct i40e_aqc_alternate_write {
+	__le32 address0;	/* first address/data pair */
+	__le32 data0;
+	__le32 address1;	/* second address/data pair */
+	__le32 data1;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write);	/* 4 x 4 = 16 bytes */
+
+/* Indirect write (indirect 0x0901)
+ * Indirect read (indirect 0x0903)
+ */
+
+struct i40e_aqc_alternate_ind_write {
+	__le32 address;
+	__le32 length;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_ind_write);	/* 4 x 4 = 16 bytes */
+
+/* Done alternate write (direct 0x0904)
+ * uses i40e_aq_desc
+ */
+struct i40e_aqc_alternate_write_done {
+	__le16	cmd_flags;
+#define I40E_AQ_ALTERNATE_MODE_BIOS_MASK	1	/* bit 0 */
+#define I40E_AQ_ALTERNATE_MODE_BIOS_LEGACY	0
+#define I40E_AQ_ALTERNATE_MODE_BIOS_UEFI	1
+#define I40E_AQ_ALTERNATE_RESET_NEEDED		2	/* bit 1 */
+	u8	reserved[14];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write_done);	/* 2+14 = 16 bytes */
+
+/* Set OEM mode (direct 0x0905) */
+struct i40e_aqc_alternate_set_mode {
+	__le32	mode;
+#define I40E_AQ_ALTERNATE_MODE_NONE	0
+#define I40E_AQ_ALTERNATE_MODE_OEM	1
+	u8	reserved[12];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_set_mode);	/* 4+12 = 16 bytes */
+
+/* Clear port Alternate RAM (direct 0x0906) uses i40e_aq_desc */
+
+/* async events 0x10xx */
+
+/* Lan Queue Overflow Event (direct, 0x1001) */
+struct i40e_aqc_lan_overflow {
+	__le32	prtdcb_rupto;
+	__le32	otx_ctl;
+	u8	reserved[8];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lan_overflow);	/* 4+4+8 = 16 bytes */
+
+/* Get LLDP MIB (indirect 0x0A00) */
+struct i40e_aqc_lldp_get_mib {
+	u8	type;	/* NOTE(review): presumably the I40E_AQ_LLDP_* bits below */
+	u8	reserved1;
+#define I40E_AQ_LLDP_MIB_TYPE_MASK		0x3	/* bits 0-1 */
+#define I40E_AQ_LLDP_MIB_LOCAL			0x0
+#define I40E_AQ_LLDP_MIB_REMOTE			0x1
+#define I40E_AQ_LLDP_MIB_LOCAL_AND_REMOTE	0x2
+#define I40E_AQ_LLDP_BRIDGE_TYPE_MASK		0xC	/* bits 2-3 */
+#define I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT		0x2
+#define I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE	0x0
+#define I40E_AQ_LLDP_BRIDGE_TYPE_NON_TPMR	0x1
+#define I40E_AQ_LLDP_TX_SHIFT			0x4
+#define I40E_AQ_LLDP_TX_MASK			(0x03 << I40E_AQ_LLDP_TX_SHIFT)
+/* TX pause flags use I40E_AQ_LINK_TX_* above */
+	__le16	local_len;
+	__le16	remote_len;
+	u8	reserved2[2];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_get_mib);	/* 1+1+2+2+2+4+4 = 16 bytes */
+
+/* Configure LLDP MIB Change Event (direct 0x0A01)
+ * also used for the event (with type in the command field)
+ */
+struct i40e_aqc_lldp_update_mib {
+	u8	command;
+#define I40E_AQ_LLDP_MIB_UPDATE_ENABLE	0x0	/* note: enable is 0 */
+#define I40E_AQ_LLDP_MIB_UPDATE_DISABLE	0x1
+	u8	reserved[7];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_mib);	/* 1+7+4+4 = 16 bytes */
+
+/* Add LLDP TLV (indirect 0x0A02)
+ * Delete LLDP TLV (indirect 0x0A04)
+ */
+struct i40e_aqc_lldp_add_tlv {
+	u8	type; /* only nearest bridge and non-TPMR from 0x0A00 */
+	u8	reserved1[1];
+	__le16	len;
+	u8	reserved2[4];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_add_tlv);	/* 1+1+2+4+4+4 = 16 bytes */
+
+/* Update LLDP TLV (indirect 0x0A03) */
+struct i40e_aqc_lldp_update_tlv {
+	u8	type; /* only nearest bridge and non-TPMR from 0x0A00 */
+	u8	reserved;
+	__le16	old_len;
+	__le16	new_offset;
+	__le16	new_len;
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_tlv);	/* 1+1+2+2+2+4+4 = 16 bytes */
+
+/* Stop LLDP (direct 0x0A05) */
+struct i40e_aqc_lldp_stop {
+	u8	command;
+#define I40E_AQ_LLDP_AGENT_STOP		0x0
+#define I40E_AQ_LLDP_AGENT_SHUTDOWN	0x1
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop);	/* 1+15 = 16 bytes */
+
+/* Start LLDP (direct 0x0A06) */
+
+struct i40e_aqc_lldp_start {
+	u8	command;
+#define I40E_AQ_LLDP_AGENT_START	0x1
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start);	/* 1+15 = 16 bytes */
+
+/* Get CEE DCBX Oper Config (0x0A07)
+ * uses the generic descriptor struct
+ * returns below as indirect response
+ */
+
+#define I40E_AQC_CEE_APP_FCOE_SHIFT	0x0
+#define I40E_AQC_CEE_APP_FCOE_MASK	(0x7 << I40E_AQC_CEE_APP_FCOE_SHIFT)
+#define I40E_AQC_CEE_APP_ISCSI_SHIFT	0x3
+#define I40E_AQC_CEE_APP_ISCSI_MASK	(0x7 << I40E_AQC_CEE_APP_ISCSI_SHIFT)
+#define I40E_AQC_CEE_APP_FIP_SHIFT	0x8
+#define I40E_AQC_CEE_APP_FIP_MASK	(0x7 << I40E_AQC_CEE_APP_FIP_SHIFT)
+
+#define I40E_AQC_CEE_PG_STATUS_SHIFT	0x0
+#define I40E_AQC_CEE_PG_STATUS_MASK	(0x7 << I40E_AQC_CEE_PG_STATUS_SHIFT)
+#define I40E_AQC_CEE_PFC_STATUS_SHIFT	0x3
+#define I40E_AQC_CEE_PFC_STATUS_MASK	(0x7 << I40E_AQC_CEE_PFC_STATUS_SHIFT)
+#define I40E_AQC_CEE_APP_STATUS_SHIFT	0x8	/* same shift as FCOE_STATUS */
+#define I40E_AQC_CEE_APP_STATUS_MASK	(0x7 << I40E_AQC_CEE_APP_STATUS_SHIFT)
+#define I40E_AQC_CEE_FCOE_STATUS_SHIFT	0x8	/* same shift as APP_STATUS */
+#define I40E_AQC_CEE_FCOE_STATUS_MASK	(0x7 << I40E_AQC_CEE_FCOE_STATUS_SHIFT)
+#define I40E_AQC_CEE_ISCSI_STATUS_SHIFT	0xB
+#define I40E_AQC_CEE_ISCSI_STATUS_MASK	(0x7 << I40E_AQC_CEE_ISCSI_STATUS_SHIFT)
+#define I40E_AQC_CEE_FIP_STATUS_SHIFT	0x10	/* bit 16: needs a 32-bit field */
+#define I40E_AQC_CEE_FIP_STATUS_MASK	(0x7 << I40E_AQC_CEE_FIP_STATUS_SHIFT)
+
+/* struct i40e_aqc_get_cee_dcb_cfg_v1_resp was originally defined with
+ * word boundary layout issues, which the Linux compilers silently deal
+ * with by adding padding, making the actual struct larger than designed.
+ * However, the FW compiler for the NIC is less lenient and complains
+ * about the struct.  Hence, the struct defined here has an extra byte in
+ * fields reserved3 and reserved4 to directly acknowledge that padding,
+ * and the new length is used in the length check macro.
+ */
+struct i40e_aqc_get_cee_dcb_cfg_v1_resp {
+	u8	reserved1;
+	u8	oper_num_tc;
+	u8	oper_prio_tc[4];
+	u8	reserved2;
+	u8	oper_tc_bw[8];
+	u8	oper_pfc_en;
+	u8	reserved3[2];	/* includes the extra padding byte */
+	__le16	oper_app_prio;
+	u8	reserved4[2];	/* includes the extra padding byte */
+	__le16	tlv_status;	/* 16 bits wide in this v1 layout */
+};
+
+I40E_CHECK_STRUCT_LEN(0x18, i40e_aqc_get_cee_dcb_cfg_v1_resp);	/* 24 bytes */
+
+struct i40e_aqc_get_cee_dcb_cfg_resp {
+	u8	oper_num_tc;
+	u8	oper_prio_tc[4];
+	u8	oper_tc_bw[8];
+	u8	oper_pfc_en;
+	__le16	oper_app_prio;
+	__le32	tlv_status;	/* 32 bits wide, unlike the v1 response */
+	u8	reserved[12];
+};
+
+I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_cee_dcb_cfg_resp);	/* 1+4+8+1+2+4+12 */
+
+/*	Set Local LLDP MIB (indirect 0x0A08)
+ *	Used to replace the local MIB of a given LLDP agent. e.g. DCBx
+ */
+struct i40e_aqc_lldp_set_local_mib {
+#define SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT	0	/* note: no I40E_ prefix */
+#define SET_LOCAL_MIB_AC_TYPE_DCBX_MASK		(1 << SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT)
+	u8	type;
+	u8	reserved0;
+	__le16	length;
+	u8	reserved1[4];
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_set_local_mib);	/* 1+1+2+4+4+4 = 16 bytes */
+
+/*	Stop/Start LLDP Agent (direct 0x0A09)
+ *	Used for stopping/starting specific LLDP agent. e.g. DCBx
+ */
+struct i40e_aqc_lldp_stop_start_specific_agent {
+#define I40E_AQC_START_SPECIFIC_AGENT_SHIFT	0
+#define I40E_AQC_START_SPECIFIC_AGENT_MASK	(1 << I40E_AQC_START_SPECIFIC_AGENT_SHIFT)
+	u8	command;
+	u8	reserved[15];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop_start_specific_agent);	/* 1+15 bytes */
+
+/* Add Udp Tunnel command and completion (direct 0x0B00) */
+struct i40e_aqc_add_udp_tunnel {
+	__le16	udp_port;
+	u8	reserved0[3];
+	u8	protocol_type;
+#define I40E_AQC_TUNNEL_TYPE_VXLAN	0x00
+#define I40E_AQC_TUNNEL_TYPE_NGE	0x01
+#define I40E_AQC_TUNNEL_TYPE_TEREDO	0x10	/* 0x10 (16), not 0x02 */
+	u8	reserved1[10];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel);	/* 2+3+1+10 = 16 bytes */
+
+struct i40e_aqc_add_udp_tunnel_completion {
+	__le16 udp_port;
+	u8	filter_entry_index;
+	u8	multiple_pfs;
+#define I40E_AQC_SINGLE_PF		0x0
+#define I40E_AQC_MULTIPLE_PFS		0x1
+	u8	total_filters;
+	u8	reserved[11];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel_completion);	/* 2+1+1+1+11 = 16 */
+
+/* remove UDP Tunnel command (0x0B01) */
+struct i40e_aqc_remove_udp_tunnel {
+	u8	reserved[2];
+	u8	index; /* 0 to 15 */
+	u8	reserved2[13];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_udp_tunnel);	/* 2+1+13 = 16 bytes */
+
+struct i40e_aqc_del_udp_tunnel_completion {
+	__le16	udp_port;
+	u8	index; /* 0 to 15 */
+	u8	multiple_pfs;
+	u8	total_filters_used;
+	u8	reserved1[11];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_del_udp_tunnel_completion);	/* 2+1+1+1+11 = 16 */
+#ifdef X722_SUPPORT
+
+struct i40e_aqc_get_set_rss_key {
+#define I40E_AQC_SET_RSS_KEY_VSI_VALID		(0x1 << 15)	/* bit 15 */
+#define I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT	0
+#define I40E_AQC_SET_RSS_KEY_VSI_ID_MASK	(0x3FF << \
+					I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT)
+	__le16	vsi_id;
+	u8	reserved[6];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_key);	/* 2+6+4+4 = 16 bytes */
+
+struct i40e_aqc_get_set_rss_key_data {
+	u8 standard_rss_key[0x28];	/* 40 bytes */
+	u8 extended_hash_key[0xc];	/* 12 bytes */
+};
+
+I40E_CHECK_STRUCT_LEN(0x34, i40e_aqc_get_set_rss_key_data);	/* 40+12 = 52 */
+
+struct  i40e_aqc_get_set_rss_lut {
+#define I40E_AQC_SET_RSS_LUT_VSI_VALID		(0x1 << 15)	/* bit 15 */
+#define I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT	0
+#define I40E_AQC_SET_RSS_LUT_VSI_ID_MASK	(0x3FF << \
+					I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT)
+	__le16	vsi_id;
+#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT	0
+#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK	(0x1 << \
+					I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT)
+
+#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI	0
+#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF	1
+	__le16	flags;
+	u8	reserved[4];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_lut);	/* 2+2+4+4+4 = 16 bytes */
+#endif
+
+/* tunnel key structure 0x0B10 */
+
+struct i40e_aqc_tunnel_key_structure {
+	u8	key1_off;
+	u8	key2_off;
+	u8	key1_len;  /* 0 to 15 */
+	u8	key2_len;  /* 0 to 15 */
+	u8	flags;
+#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDE	0x01
+/* response flags */
+#define I40E_AQC_TUNNEL_KEY_STRUCT_SUCCESS	0x01
+#define I40E_AQC_TUNNEL_KEY_STRUCT_MODIFIED	0x02
+#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDDEN	0x03	/* == 0x01 | 0x02 */
+	u8	network_key_index;
+#define I40E_AQC_NETWORK_KEY_INDEX_VXLAN		0x0
+#define I40E_AQC_NETWORK_KEY_INDEX_NGE			0x1
+#define I40E_AQC_NETWORK_KEY_INDEX_FLEX_MAC_IN_UDP	0x2
+#define I40E_AQC_NETWORK_KEY_INDEX_GRE			0x3
+	u8	reserved[10];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_tunnel_key_structure);	/* 6 x 1 + 10 = 16 bytes */
+
+/* OEM mode commands (direct 0xFE0x) */
+struct i40e_aqc_oem_param_change {
+	__le32	param_type;
+#define I40E_AQ_OEM_PARAM_TYPE_PF_CTL	0
+#define I40E_AQ_OEM_PARAM_TYPE_BW_CTL	1
+#define I40E_AQ_OEM_PARAM_MAC		2	/* note: no _TYPE_ in this name */
+	__le32	param_value1;
+	__le16	param_value2;
+	u8	reserved[6];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_param_change);	/* 4+4+2+6 = 16 bytes */
+
+struct i40e_aqc_oem_state_change {
+	__le32	state;
+#define I40E_AQ_OEM_STATE_LINK_DOWN	0x0
+#define I40E_AQ_OEM_STATE_LINK_UP	0x1
+	u8	reserved[12];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_state_change);	/* 4+12 = 16 bytes */
+
+/* Initialize OCSD (0xFE02, direct) */
+struct i40e_aqc_opc_oem_ocsd_initialize {
+	u8 type_status;
+	u8 reserved1[3];
+	__le32 ocsd_memory_block_addr_high;
+	__le32 ocsd_memory_block_addr_low;
+	__le32 requested_update_interval;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocsd_initialize);	/* 1+3+4+4+4 = 16 */
+
+/* Initialize OCBB  (0xFE03, direct) */
+struct i40e_aqc_opc_oem_ocbb_initialize {
+	u8 type_status;
+	u8 reserved1[3];
+	__le32 ocbb_memory_block_addr_high;
+	__le32 ocbb_memory_block_addr_low;
+	u8 reserved2[4];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocbb_initialize);	/* 1+3+4+4+4 = 16 */
+
+/* debug commands */
+
+/* get device id (0xFF00) uses the generic structure */
+
+/* set test mode (0xFF01, internal) */
+
+struct i40e_acq_set_test_mode {	/* sic: tag is "acq", not "aqc" */
+	u8	mode;
+#define I40E_AQ_TEST_PARTIAL	0
+#define I40E_AQ_TEST_FULL	1
+#define I40E_AQ_TEST_NVM	2
+	u8	reserved[3];
+	u8	command;
+#define I40E_AQ_TEST_OPEN	0
+#define I40E_AQ_TEST_CLOSE	1
+#define I40E_AQ_TEST_INC	2
+	u8	reserved2[3];
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_acq_set_test_mode);	/* 1+3+1+3+4+4 = 16 bytes */
+
+/* Debug Read Register command (0xFF03)
+ * Debug Write Register command (0xFF04)
+ */
+struct i40e_aqc_debug_reg_read_write {
+	__le32 reserved;
+	__le32 address;
+	__le32 value_high;
+	__le32 value_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_reg_read_write);	/* 4 x 4 = 16 bytes */
+
+/* Scatter/gather Reg Read  (indirect 0xFF05)
+ * Scatter/gather Reg Write (indirect 0xFF06)
+ */
+
+/* i40e_aq_desc is used for the command */
+struct i40e_aqc_debug_reg_sg_element_data {	/* no length check follows */
+	__le32 address;
+	__le32 value;
+};
+
+/* Debug Modify register (direct 0xFF07) */
+struct i40e_aqc_debug_modify_reg {
+	__le32 address;
+	__le32 value;
+	__le32 clear_mask;
+	__le32 set_mask;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_reg);	/* 4 x 4 = 16 bytes */
+
+/* dump internal data (0xFF08, indirect); cluster IDs run 0 through 11 */
+
+#define I40E_AQ_CLUSTER_ID_AUX		0
+#define I40E_AQ_CLUSTER_ID_SWITCH_FLU	1
+#define I40E_AQ_CLUSTER_ID_TXSCHED	2
+#define I40E_AQ_CLUSTER_ID_HMC		3
+#define I40E_AQ_CLUSTER_ID_MAC0		4
+#define I40E_AQ_CLUSTER_ID_MAC1		5
+#define I40E_AQ_CLUSTER_ID_MAC2		6
+#define I40E_AQ_CLUSTER_ID_MAC3		7
+#define I40E_AQ_CLUSTER_ID_DCB		8
+#define I40E_AQ_CLUSTER_ID_EMP_MEM	9
+#define I40E_AQ_CLUSTER_ID_PKT_BUF	10
+#define I40E_AQ_CLUSTER_ID_ALTRAM	11
+
+struct i40e_aqc_debug_dump_internals {
+	u8	cluster_id;	/* presumably an I40E_AQ_CLUSTER_ID_* value */
+	u8	table_id;
+	__le16	data_size;
+	__le32	idx;
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_dump_internals);	/* 1+1+2+4+4+4 = 16 */
+
+struct i40e_aqc_debug_modify_internals {
+	u8	cluster_id;	/* presumably an I40E_AQ_CLUSTER_ID_* value */
+	u8	cluster_specific_params[7];
+	__le32	address_high;
+	__le32	address_low;
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_internals);	/* 1+7+4+4 = 16 bytes */
+
+#endif
diff --git a/usr/src/uts/common/io/i40e/core/i40e_alloc.h b/usr/src/uts/common/io/i40e/core/i40e_alloc.h
new file mode 100644
index 0000000000..4428287f83
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_alloc.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2014, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_alloc.h 283119 2015-05-19 18:35:18Z jhb $*/
+
+#ifndef _I40E_ALLOC_H_
+#define _I40E_ALLOC_H_
+
+struct i40e_hw;
+
+/* Memory allocation types */
+enum i40e_memory_type {
+	i40e_mem_arq_buf = 0,		/* ARQ indirect command buffer */
+	i40e_mem_asq_buf = 1,		/* NOTE(review): undocumented here */
+	i40e_mem_atq_buf = 2,		/* ATQ indirect command buffer */
+	i40e_mem_arq_ring = 3,		/* ARQ descriptor ring */
+	i40e_mem_atq_ring = 4,		/* ATQ descriptor ring */
+	i40e_mem_pd = 5,		/* Page Descriptor */
+	i40e_mem_bp = 6,		/* Backing Page - 4KB */
+	i40e_mem_bp_jumbo = 7,		/* Backing Page - > 4KB */
+	i40e_mem_reserved		/* implicitly 8, the next value */
+};
+
+/* Allocation prototypes; the *_mem structs are not declared in this header */
+enum i40e_status_code i40e_allocate_dma_mem(struct i40e_hw *hw,
+					    struct i40e_dma_mem *mem,
+					    enum i40e_memory_type type,
+					    u64 size, u32 alignment);
+enum i40e_status_code i40e_free_dma_mem(struct i40e_hw *hw,
+					struct i40e_dma_mem *mem);
+enum i40e_status_code i40e_allocate_virt_mem(struct i40e_hw *hw,
+					     struct i40e_virt_mem *mem,
+					     u32 size);
+enum i40e_status_code i40e_free_virt_mem(struct i40e_hw *hw,
+					 struct i40e_virt_mem *mem);
+
+#endif /* _I40E_ALLOC_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_common.c b/usr/src/uts/common/io/i40e/core/i40e_common.c
new file mode 100644
index 0000000000..c58eb9de1e
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_common.c
@@ -0,0 +1,5708 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_common.c 284049 2015-06-05 22:52:42Z jfv $*/
+
+#include "i40e_type.h"
+#include "i40e_adminq.h"
+#include "i40e_prototype.h"
+#include "i40e_virtchnl.h"
+
+
+/**
+ * i40e_set_mac_type - Sets MAC type
+ * @hw: pointer to the HW structure; hw->mac.type is written on Intel devices
+ *
+ * Maps hw->device_id to a MAC type when hw->vendor_id is Intel's; any other
+ * vendor returns I40E_ERR_DEVICE_NOT_SUPPORTED and leaves hw->mac.type alone.
+ **/
+enum i40e_status_code i40e_set_mac_type(struct i40e_hw *hw)
+{
+	enum i40e_status_code status = I40E_SUCCESS;
+
+	DEBUGFUNC("i40e_set_mac_type\n");
+
+	if (hw->vendor_id == I40E_INTEL_VENDOR_ID) {
+		switch (hw->device_id) {
+		case I40E_DEV_ID_SFP_XL710:
+		case I40E_DEV_ID_QEMU:
+		case I40E_DEV_ID_KX_A:
+		case I40E_DEV_ID_KX_B:
+		case I40E_DEV_ID_KX_C:
+		case I40E_DEV_ID_QSFP_A:
+		case I40E_DEV_ID_QSFP_B:
+		case I40E_DEV_ID_QSFP_C:
+		case I40E_DEV_ID_10G_BASE_T:
+		case I40E_DEV_ID_10G_BASE_T4:
+		case I40E_DEV_ID_20G_KR2:
+		case I40E_DEV_ID_20G_KR2_A:
+			hw->mac.type = I40E_MAC_XL710;
+			break;
+#ifdef X722_SUPPORT
+		case I40E_DEV_ID_SFP_X722:
+		case I40E_DEV_ID_1G_BASE_T_X722:
+		case I40E_DEV_ID_10G_BASE_T_X722:
+			hw->mac.type = I40E_MAC_X722;
+			break;
+#endif
+#ifdef X722_SUPPORT
+		case I40E_DEV_ID_X722_VF:
+		case I40E_DEV_ID_X722_VF_HV:
+			hw->mac.type = I40E_MAC_X722_VF;
+			break;
+#endif
+		case I40E_DEV_ID_VF:
+		case I40E_DEV_ID_VF_HV:
+			hw->mac.type = I40E_MAC_VF;
+			break;
+		default:	/* unlisted Intel device: generic type, still success */
+			hw->mac.type = I40E_MAC_GENERIC;
+			break;
+		}
+	} else {
+		status = I40E_ERR_DEVICE_NOT_SUPPORTED;
+	}
+
+	DEBUGOUT2("i40e_set_mac_type found mac: %d, returns: %d\n",
+		  hw->mac.type, status);
+	return status;
+}
+
+/**
+ * i40e_aq_str - convert AQ err code to a string
+ * @hw: pointer to the HW structure; hw->err_str holds the fallback text
+ * @aq_err: the AQ error code to convert; unlisted codes are printed as "%d"
+ **/
+char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err)
+{
+	switch (aq_err) {
+	case I40E_AQ_RC_OK:
+		return "OK";
+	case I40E_AQ_RC_EPERM:
+		return "I40E_AQ_RC_EPERM";
+	case I40E_AQ_RC_ENOENT:
+		return "I40E_AQ_RC_ENOENT";
+	case I40E_AQ_RC_ESRCH:
+		return "I40E_AQ_RC_ESRCH";
+	case I40E_AQ_RC_EINTR:
+		return "I40E_AQ_RC_EINTR";
+	case I40E_AQ_RC_EIO:
+		return "I40E_AQ_RC_EIO";
+	case I40E_AQ_RC_ENXIO:
+		return "I40E_AQ_RC_ENXIO";
+	case I40E_AQ_RC_E2BIG:
+		return "I40E_AQ_RC_E2BIG";
+	case I40E_AQ_RC_EAGAIN:
+		return "I40E_AQ_RC_EAGAIN";
+	case I40E_AQ_RC_ENOMEM:
+		return "I40E_AQ_RC_ENOMEM";
+	case I40E_AQ_RC_EACCES:
+		return "I40E_AQ_RC_EACCES";
+	case I40E_AQ_RC_EFAULT:
+		return "I40E_AQ_RC_EFAULT";
+	case I40E_AQ_RC_EBUSY:
+		return "I40E_AQ_RC_EBUSY";
+	case I40E_AQ_RC_EEXIST:
+		return "I40E_AQ_RC_EEXIST";
+	case I40E_AQ_RC_EINVAL:
+		return "I40E_AQ_RC_EINVAL";
+	case I40E_AQ_RC_ENOTTY:
+		return "I40E_AQ_RC_ENOTTY";
+	case I40E_AQ_RC_ENOSPC:
+		return "I40E_AQ_RC_ENOSPC";
+	case I40E_AQ_RC_ENOSYS:
+		return "I40E_AQ_RC_ENOSYS";
+	case I40E_AQ_RC_ERANGE:
+		return "I40E_AQ_RC_ERANGE";
+	case I40E_AQ_RC_EFLUSHED:
+		return "I40E_AQ_RC_EFLUSHED";
+	case I40E_AQ_RC_BAD_ADDR:
+		return "I40E_AQ_RC_BAD_ADDR";
+	case I40E_AQ_RC_EMODE:
+		return "I40E_AQ_RC_EMODE";
+	case I40E_AQ_RC_EFBIG:
+		return "I40E_AQ_RC_EFBIG";
+	}
+
+	snprintf(hw->err_str, sizeof(hw->err_str), "%d", aq_err);
+	return hw->err_str;	/* per-hw buffer, overwritten by the next fallback */
+}
+
+/**
+ * i40e_stat_str - convert status err code to a string
+ * @hw: pointer to the HW structure; hw->err_str holds the fallback text
+ * @stat_err: the status error code to convert; unlisted codes print as "%d"
+ **/
+char *i40e_stat_str(struct i40e_hw *hw, enum i40e_status_code stat_err)
+{
+	switch (stat_err) {
+	case I40E_SUCCESS:
+		return "OK";
+	case I40E_ERR_NVM:
+		return "I40E_ERR_NVM";
+	case I40E_ERR_NVM_CHECKSUM:
+		return "I40E_ERR_NVM_CHECKSUM";
+	case I40E_ERR_PHY:
+		return "I40E_ERR_PHY";
+	case I40E_ERR_CONFIG:
+		return "I40E_ERR_CONFIG";
+	case I40E_ERR_PARAM:
+		return "I40E_ERR_PARAM";
+	case I40E_ERR_MAC_TYPE:
+		return "I40E_ERR_MAC_TYPE";
+	case I40E_ERR_UNKNOWN_PHY:
+		return "I40E_ERR_UNKNOWN_PHY";
+	case I40E_ERR_LINK_SETUP:
+		return "I40E_ERR_LINK_SETUP";
+	case I40E_ERR_ADAPTER_STOPPED:
+		return "I40E_ERR_ADAPTER_STOPPED";
+	case I40E_ERR_INVALID_MAC_ADDR:
+		return "I40E_ERR_INVALID_MAC_ADDR";
+	case I40E_ERR_DEVICE_NOT_SUPPORTED:
+		return "I40E_ERR_DEVICE_NOT_SUPPORTED";
+	case I40E_ERR_MASTER_REQUESTS_PENDING:
+		return "I40E_ERR_MASTER_REQUESTS_PENDING";
+	case I40E_ERR_INVALID_LINK_SETTINGS:
+		return "I40E_ERR_INVALID_LINK_SETTINGS";
+	case I40E_ERR_AUTONEG_NOT_COMPLETE:
+		return "I40E_ERR_AUTONEG_NOT_COMPLETE";
+	case I40E_ERR_RESET_FAILED:
+		return "I40E_ERR_RESET_FAILED";
+	case I40E_ERR_SWFW_SYNC:
+		return "I40E_ERR_SWFW_SYNC";
+	case I40E_ERR_NO_AVAILABLE_VSI:
+		return "I40E_ERR_NO_AVAILABLE_VSI";
+	case I40E_ERR_NO_MEMORY:
+		return "I40E_ERR_NO_MEMORY";
+	case I40E_ERR_BAD_PTR:
+		return "I40E_ERR_BAD_PTR";
+	case I40E_ERR_RING_FULL:
+		return "I40E_ERR_RING_FULL";
+	case I40E_ERR_INVALID_PD_ID:
+		return "I40E_ERR_INVALID_PD_ID";
+	case I40E_ERR_INVALID_QP_ID:
+		return "I40E_ERR_INVALID_QP_ID";
+	case I40E_ERR_INVALID_CQ_ID:
+		return "I40E_ERR_INVALID_CQ_ID";
+	case I40E_ERR_INVALID_CEQ_ID:
+		return "I40E_ERR_INVALID_CEQ_ID";
+	case I40E_ERR_INVALID_AEQ_ID:
+		return "I40E_ERR_INVALID_AEQ_ID";
+	case I40E_ERR_INVALID_SIZE:
+		return "I40E_ERR_INVALID_SIZE";
+	case I40E_ERR_INVALID_ARP_INDEX:
+		return "I40E_ERR_INVALID_ARP_INDEX";
+	case I40E_ERR_INVALID_FPM_FUNC_ID:
+		return "I40E_ERR_INVALID_FPM_FUNC_ID";
+	case I40E_ERR_QP_INVALID_MSG_SIZE:
+		return "I40E_ERR_QP_INVALID_MSG_SIZE";
+	case I40E_ERR_QP_TOOMANY_WRS_POSTED:
+		return "I40E_ERR_QP_TOOMANY_WRS_POSTED";
+	case I40E_ERR_INVALID_FRAG_COUNT:
+		return "I40E_ERR_INVALID_FRAG_COUNT";
+	case I40E_ERR_QUEUE_EMPTY:
+		return "I40E_ERR_QUEUE_EMPTY";
+	case I40E_ERR_INVALID_ALIGNMENT:
+		return "I40E_ERR_INVALID_ALIGNMENT";
+	case I40E_ERR_FLUSHED_QUEUE:
+		return "I40E_ERR_FLUSHED_QUEUE";
+	case I40E_ERR_INVALID_PUSH_PAGE_INDEX:
+		return "I40E_ERR_INVALID_PUSH_PAGE_INDEX";
+	case I40E_ERR_INVALID_IMM_DATA_SIZE:
+		return "I40E_ERR_INVALID_IMM_DATA_SIZE";
+	case I40E_ERR_TIMEOUT:
+		return "I40E_ERR_TIMEOUT";
+	case I40E_ERR_OPCODE_MISMATCH:
+		return "I40E_ERR_OPCODE_MISMATCH";
+	case I40E_ERR_CQP_COMPL_ERROR:
+		return "I40E_ERR_CQP_COMPL_ERROR";
+	case I40E_ERR_INVALID_VF_ID:
+		return "I40E_ERR_INVALID_VF_ID";
+	case I40E_ERR_INVALID_HMCFN_ID:
+		return "I40E_ERR_INVALID_HMCFN_ID";
+	case I40E_ERR_BACKING_PAGE_ERROR:
+		return "I40E_ERR_BACKING_PAGE_ERROR";
+	case I40E_ERR_NO_PBLCHUNKS_AVAILABLE:
+		return "I40E_ERR_NO_PBLCHUNKS_AVAILABLE";
+	case I40E_ERR_INVALID_PBLE_INDEX:
+		return "I40E_ERR_INVALID_PBLE_INDEX";
+	case I40E_ERR_INVALID_SD_INDEX:
+		return "I40E_ERR_INVALID_SD_INDEX";
+	case I40E_ERR_INVALID_PAGE_DESC_INDEX:
+		return "I40E_ERR_INVALID_PAGE_DESC_INDEX";
+	case I40E_ERR_INVALID_SD_TYPE:
+		return "I40E_ERR_INVALID_SD_TYPE";
+	case I40E_ERR_MEMCPY_FAILED:
+		return "I40E_ERR_MEMCPY_FAILED";
+	case I40E_ERR_INVALID_HMC_OBJ_INDEX:
+		return "I40E_ERR_INVALID_HMC_OBJ_INDEX";
+	case I40E_ERR_INVALID_HMC_OBJ_COUNT:
+		return "I40E_ERR_INVALID_HMC_OBJ_COUNT";
+	case I40E_ERR_INVALID_SRQ_ARM_LIMIT:
+		return "I40E_ERR_INVALID_SRQ_ARM_LIMIT";
+	case I40E_ERR_SRQ_ENABLED:
+		return "I40E_ERR_SRQ_ENABLED";
+	case I40E_ERR_ADMIN_QUEUE_ERROR:
+		return "I40E_ERR_ADMIN_QUEUE_ERROR";
+	case I40E_ERR_ADMIN_QUEUE_TIMEOUT:
+		return "I40E_ERR_ADMIN_QUEUE_TIMEOUT";
+	case I40E_ERR_BUF_TOO_SHORT:
+		return "I40E_ERR_BUF_TOO_SHORT";
+	case I40E_ERR_ADMIN_QUEUE_FULL:
+		return "I40E_ERR_ADMIN_QUEUE_FULL";
+	case I40E_ERR_ADMIN_QUEUE_NO_WORK:
+		return "I40E_ERR_ADMIN_QUEUE_NO_WORK";
+	case I40E_ERR_BAD_IWARP_CQE:
+		return "I40E_ERR_BAD_IWARP_CQE";
+	case I40E_ERR_NVM_BLANK_MODE:
+		return "I40E_ERR_NVM_BLANK_MODE";
+	case I40E_ERR_NOT_IMPLEMENTED:
+		return "I40E_ERR_NOT_IMPLEMENTED";
+	case I40E_ERR_PE_DOORBELL_NOT_ENABLED:
+		return "I40E_ERR_PE_DOORBELL_NOT_ENABLED";
+	case I40E_ERR_DIAG_TEST_FAILED:
+		return "I40E_ERR_DIAG_TEST_FAILED";
+	case I40E_ERR_NOT_READY:
+		return "I40E_ERR_NOT_READY";
+	case I40E_NOT_SUPPORTED:
+		return "I40E_NOT_SUPPORTED";
+	case I40E_ERR_FIRMWARE_API_VERSION:
+		return "I40E_ERR_FIRMWARE_API_VERSION";
+	}
+
+	snprintf(hw->err_str, sizeof(hw->err_str), "%d", stat_err);
+	return hw->err_str;	/* per-hw buffer, overwritten by the next fallback */
+}
+
+/**
+ * i40e_debug_aq
+ * @hw: pointer to the HW structure; hw->debug_mask gates the output
+ * @mask: debug mask
+ * @desc: pointer to admin queue descriptor
+ * @buffer: pointer to command buffer
+ * @buf_len: max length of buffer
+ *
+ * Dumps debug log about adminq command with descriptor contents.
+ **/
+void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, void *desc,
+		   void *buffer, u16 buf_len)
+{
+	struct i40e_aq_desc *aq_desc = (struct i40e_aq_desc *)desc;
+	u8 *buf = (u8 *)buffer;
+	u16 len, i = 0;
+
+	if ((!(mask & hw->debug_mask)) || (desc == NULL))
+		return;
+	len = LE16_TO_CPU(aq_desc->datalen);	/* only after the NULL check */
+
+	i40e_debug(hw, mask,
+		   "AQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n",
+		   LE16_TO_CPU(aq_desc->opcode),
+		   LE16_TO_CPU(aq_desc->flags),
+		   LE16_TO_CPU(aq_desc->datalen),
+		   LE16_TO_CPU(aq_desc->retval));
+	i40e_debug(hw, mask, "\tcookie (h,l) 0x%08X 0x%08X\n",
+		   LE32_TO_CPU(aq_desc->cookie_high),
+		   LE32_TO_CPU(aq_desc->cookie_low));
+	i40e_debug(hw, mask, "\tparam (0,1)  0x%08X 0x%08X\n",
+		   LE32_TO_CPU(aq_desc->params.internal.param0),
+		   LE32_TO_CPU(aq_desc->params.internal.param1));
+	i40e_debug(hw, mask, "\taddr (h,l)   0x%08X 0x%08X\n",
+		   LE32_TO_CPU(aq_desc->params.external.addr_high),
+		   LE32_TO_CPU(aq_desc->params.external.addr_low));
+
+	if ((buffer != NULL) && (aq_desc->datalen != 0)) {
+		i40e_debug(hw, mask, "AQ CMD Buffer:\n");
+		if (buf_len < len)
+			len = buf_len;
+		/* write the full 16-byte chunks */
+		for (i = 0; i < (len - 16); i += 16)
+			i40e_debug(hw, mask,
+				   "\t0x%04X  %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n",
+				   i, buf[i], buf[i+1], buf[i+2], buf[i+3],
+				   buf[i+4], buf[i+5], buf[i+6], buf[i+7],
+				   buf[i+8], buf[i+9], buf[i+10], buf[i+11],
+				   buf[i+12], buf[i+13], buf[i+14], buf[i+15]);
+		/* write whatever's left over without overrunning the buffer */
+		if (i < len) {
+			char d_buf[80];
+			int j = 0;
+
+			memset(d_buf, 0, sizeof(d_buf));
+#ifdef I40E_ILLUMOS
+			/*
+			 * Sigh.
+			 *
+			 * The illumos DDI (inherited from OpenSolaris) says
+			 * sprintf() returns the pointer to its first
+			 * argument, NOT the length of bytes printed. A better
+			 * solution would be to have the kernel provide
+			 * something like real_sprintf() but for now, we
+			 * hack around it.
+			 */
+			(void) sprintf(d_buf, "\t0x%04X ", i);
+			j += strlen(d_buf);
+			/* Bounds-check at 77, because " XX" emits 4 chars. */
+			while (i < len && j < 77) {
+				(void) sprintf(&d_buf[j], " %02X", buf[i++]);
+				j += strlen(&d_buf[j]);
+			}
+#else
+			j += sprintf(d_buf, "\t0x%04X ", i);
+			while (i < len)
+				j += sprintf(&d_buf[j], " %02X", buf[i++]);
+#endif
+			i40e_debug(hw, mask, "%s\n", d_buf);
+		}
+	}
+}
+
+/**
+ * i40e_check_asq_alive
+ * @hw: pointer to the hw struct; hw->aq.asq.len is the register given to rd32()
+ *
+ * Returns TRUE if the queue-enable bit is set, FALSE otherwise or if len is 0.
+ **/
+bool i40e_check_asq_alive(struct i40e_hw *hw)
+{
+	if (!hw->aq.asq.len)	/* register offset not set: do not read it */
+		return FALSE;
+	if (i40e_is_vf(hw))
+		return !!(rd32(hw, hw->aq.asq.len) &
+			I40E_VF_ATQLEN1_ATQENABLE_MASK);
+	/* Not a VF: test the PF enable bit instead. */
+	return !!(rd32(hw, hw->aq.asq.len) &
+		I40E_PF_ATQLEN_ATQENABLE_MASK);
+}
+
+/**
+ * i40e_aq_queue_shutdown
+ * @hw: pointer to the hw struct
+ * @unloading: is the driver unloading itself
+ *
+ * Sends the queue-shutdown admin command to the firmware; when @unloading
+ * is set, the command also carries the driver-unloading flag.
+ **/
+enum i40e_status_code i40e_aq_queue_shutdown(struct i40e_hw *hw,
+					     bool unloading)
+{
+	struct i40e_aqc_queue_shutdown *shutdown_cmd;
+	struct i40e_aq_desc aq_desc;
+
+	i40e_fill_default_direct_cmd_desc(&aq_desc,
+					  i40e_aqc_opc_queue_shutdown);
+
+	/* The command parameters live in the descriptor's raw area. */
+	shutdown_cmd = (struct i40e_aqc_queue_shutdown *)&aq_desc.params.raw;
+	if (unloading)
+		shutdown_cmd->driver_unloading =
+			CPU_TO_LE32(I40E_AQ_DRIVER_UNLOADING);
+
+	return i40e_asq_send_command(hw, &aq_desc, NULL, 0, NULL);
+}
+#ifdef X722_SUPPORT
+
+/**
+ * i40e_aq_get_set_rss_lut
+ * @hw: pointer to the hardware structure
+ * @vsi_id: vsi fw index
+ * @pf_lut: for PF table set TRUE, for VSI table set FALSE
+ * @lut: pointer to the lut buffer provided by the caller
+ * @lut_size: size of the lut buffer
+ * @set: set TRUE to set the table, FALSE to get the table
+ *
+ * Internal get/set helper; addr_high carries the upper 32 bits of @lut.
+ **/
+static enum i40e_status_code i40e_aq_get_set_rss_lut(struct i40e_hw *hw,
+						     u16 vsi_id, bool pf_lut,
+						     u8 *lut, u16 lut_size,
+						     bool set)
+{
+	enum i40e_status_code status;
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_get_set_rss_lut *cmd_resp =
+		   (struct i40e_aqc_get_set_rss_lut *)&desc.params.raw;
+
+	if (set)
+		i40e_fill_default_direct_cmd_desc(&desc,
+						  i40e_aqc_opc_set_rss_lut);
+	else
+		i40e_fill_default_direct_cmd_desc(&desc,
+						  i40e_aqc_opc_get_rss_lut);
+
+	/* Indirect command */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD);
+
+	cmd_resp->vsi_id =
+			CPU_TO_LE16((u16)((vsi_id <<
+					  I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT) &
+					  I40E_AQC_SET_RSS_LUT_VSI_ID_MASK));
+	cmd_resp->vsi_id |= CPU_TO_LE16((u16)I40E_AQC_SET_RSS_LUT_VSI_VALID);
+
+	if (pf_lut)
+		cmd_resp->flags |= CPU_TO_LE16((u16)
+					((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF <<
+					I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) &
+					I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK));
+	else
+		cmd_resp->flags |= CPU_TO_LE16((u16)
+					((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI <<
+					I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) &
+					I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK));
+
+	cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_DWORD((u64)lut));
+	cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((u64)lut));
+
+	status = i40e_asq_send_command(hw, &desc, lut, lut_size, NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_rss_lut - get the RSS lookup table, PF or VSI type
+ * @hw: pointer to the hardware structure
+ * @vsi_id: vsi fw index
+ * @pf_lut: for PF table set TRUE, for VSI table set FALSE
+ * @lut: pointer to the lut buffer provided by the caller
+ * @lut_size: size of the lut buffer
+ **/
+enum i40e_status_code i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 vsi_id,
+					  bool pf_lut, u8 *lut, u16 lut_size)
+{
+	const bool set = FALSE;
+
+	/* Delegate to the shared worker in its "get" direction. */
+	return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, set);
+}
+
+/**
+ * i40e_aq_set_rss_lut - set the RSS lookup table, PF or VSI type
+ * @hw: pointer to the hardware structure
+ * @vsi_id: vsi fw index
+ * @pf_lut: for PF table set TRUE, for VSI table set FALSE
+ * @lut: pointer to the lut buffer provided by the caller
+ * @lut_size: size of the lut buffer
+ **/
+enum i40e_status_code i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 vsi_id,
+					  bool pf_lut, u8 *lut, u16 lut_size)
+{
+	const bool set = TRUE;
+
+	return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, set);
+}
+
+/**
+ * i40e_aq_get_set_rss_key - read or write the RSS key of a VSI
+ * @hw: pointer to the hw struct
+ * @vsi_id: vsi fw index
+ * @key: pointer to key info struct
+ * @set: TRUE issues the set opcode, FALSE the get opcode
+ *
+ * Internal worker shared by i40e_aq_get_rss_key() and
+ * i40e_aq_set_rss_key().  Returns the admin queue command status.
+ **/
+static enum i40e_status_code i40e_aq_get_set_rss_key(struct i40e_hw *hw,
+				      u16 vsi_id,
+				      struct i40e_aqc_get_set_rss_key_data *key,
+				      bool set)
+{
+	struct i40e_aqc_get_set_rss_key *params;
+	struct i40e_aq_desc desc;
+	u16 key_size = sizeof(*key);
+	u16 vsi_field;
+
+	/* Both directions share this code; only the opcode differs. */
+	i40e_fill_default_direct_cmd_desc(&desc, set ?
+					  i40e_aqc_opc_set_rss_key :
+					  i40e_aqc_opc_get_rss_key);
+	params = (struct i40e_aqc_get_set_rss_key *)&desc.params.raw;
+
+	/* Indirect command */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD);
+
+	/* Encode the VSI index and flag it as valid. */
+	vsi_field = (u16)((vsi_id << I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT) &
+			  I40E_AQC_SET_RSS_KEY_VSI_ID_MASK);
+	params->vsi_id = CPU_TO_LE16(vsi_field);
+	params->vsi_id |= CPU_TO_LE16((u16)I40E_AQC_SET_RSS_KEY_VSI_VALID);
+
+	/* Record the key buffer address in the descriptor. */
+	params->addr_high = CPU_TO_LE32(I40E_HI_WORD((u64)key));
+	params->addr_low = CPU_TO_LE32(I40E_LO_DWORD((u64)key));
+
+	/* The buffer length is the size of the key data structure. */
+	return i40e_asq_send_command(hw, &desc, key, key_size, NULL);
+}
+
+/**
+ * i40e_aq_get_rss_key - get the RSS key per VSI
+ * @hw: pointer to the hw struct
+ * @vsi_id: vsi fw index
+ * @key: pointer to key info struct
+ **/
+enum i40e_status_code i40e_aq_get_rss_key(struct i40e_hw *hw, u16 vsi_id,
+				struct i40e_aqc_get_set_rss_key_data *key)
+{
+	const bool set = FALSE;
+
+	return i40e_aq_get_set_rss_key(hw, vsi_id, key, set);
+}
+
+/**
+ * i40e_aq_set_rss_key - set the RSS key per VSI
+ * @hw: pointer to the hw struct
+ * @vsi_id: vsi fw index
+ * @key: pointer to key info struct
+ **/
+enum i40e_status_code i40e_aq_set_rss_key(struct i40e_hw *hw, u16 vsi_id,
+				struct i40e_aqc_get_set_rss_key_data *key)
+{
+	const bool set = TRUE;
+
+	/* Delegate to the shared worker in its "set" direction. */
+	return i40e_aq_get_set_rss_key(hw, vsi_id, key, set);
+}
+#endif /* X722_SUPPORT */
+
+/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the
+ * hardware to a bit-field that can be used by SW to more easily determine the
+ * packet type.
+ *
+ * Macros are used to shorten the table lines and make this table human
+ * readable.
+ *
+ * We store the PTYPE in the top byte of the bit field - this is just so that
+ * we can check that the table doesn't have a row missing, as the index into
+ * the table should be the PTYPE.
+ *
+ * Typical work flow:
+ *
+ * IF NOT i40e_ptype_lookup[ptype].known
+ * THEN
+ *      Packet is unknown
+ * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP
+ *      Use the rest of the fields to look at the tunnels, inner protocols, etc
+ * ELSE
+ *      Use the enum i40e_rx_l2_ptype to decode the packet type
+ * ENDIF
+ */
+
+/* Expands short column tokens into a full row; the 2nd field is set to 1 */
+#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\
+	{	PTYPE, \
+		1, \
+		I40E_RX_PTYPE_OUTER_##OUTER_IP, \
+		I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \
+		I40E_RX_PTYPE_##OUTER_FRAG, \
+		I40E_RX_PTYPE_TUNNEL_##T, \
+		I40E_RX_PTYPE_TUNNEL_END_##TE, \
+		I40E_RX_PTYPE_##TEF, \
+		I40E_RX_PTYPE_INNER_PROT_##I, \
+		I40E_RX_PTYPE_PAYLOAD_LAYER_##PL }
+/* Row for a PTYPE that is not decoded: every field after PTYPE is 0 */
+#define I40E_PTT_UNUSED_ENTRY(PTYPE) \
+		{ PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+
+/* Terse aliases so that the table rows below fit on one line */
+#define I40E_RX_PTYPE_NOF		I40E_RX_PTYPE_NOT_FRAG
+#define I40E_RX_PTYPE_FRG		I40E_RX_PTYPE_FRAG
+#define I40E_RX_PTYPE_INNER_PROT_TS	I40E_RX_PTYPE_INNER_PROT_TIMESYNC
+
+/* Lookup table indexed by the HW PTYPE; each row is its decoded bit field */
+struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = {
+	/* L2 Packet types */
+	I40E_PTT_UNUSED_ENTRY(0),
+	I40E_PTT(1,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
+	I40E_PTT(2,  L2, NONE, NOF, NONE, NONE, NOF, TS,   PAY2),
+	I40E_PTT(3,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
+	I40E_PTT_UNUSED_ENTRY(4),
+	I40E_PTT_UNUSED_ENTRY(5),
+	I40E_PTT(6,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
+	I40E_PTT(7,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
+	I40E_PTT_UNUSED_ENTRY(8),
+	I40E_PTT_UNUSED_ENTRY(9),
+	I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
+	I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),
+	I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
+
+	/* Non Tunneled IPv4 */
+	I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(25),
+	I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP,  PAY4),
+	I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4),
+	I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4),
+
+	/* IPv4 --> IPv4 */
+	I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(32),
+	I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv4 --> IPv6 */
+	I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(39),
+	I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv4 --> GRE/NAT */
+	I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
+
+	/* IPv4 --> GRE/NAT --> IPv4 */
+	I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(47),
+	I40E_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv4 --> GRE/NAT --> IPv6 */
+	I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(54),
+	I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv4 --> GRE/NAT --> MAC */
+	I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
+
+	/* IPv4 --> GRE/NAT --> MAC --> IPv4 */
+	I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(62),
+	I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv4 --> GRE/NAT -> MAC --> IPv6 */
+	I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(69),
+	I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv4 --> GRE/NAT --> MAC/VLAN */
+	I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
+
+	/* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */
+	I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(77),
+	I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */
+	I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(84),
+	I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
+
+	/* Non Tunneled IPv6 */
+	I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3),
+	I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(91),
+	I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP,  PAY4),
+	I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4),
+	I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4),
+
+	/* IPv6 --> IPv4 */
+	I40E_PTT(95,  IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(96,  IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(97,  IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(98),
+	I40E_PTT(99,  IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv6 --> IPv6 */
+	I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(105),
+	I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT */
+	I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
+
+	/* IPv6 --> GRE/NAT -> IPv4 */
+	I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(113),
+	I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT -> IPv6 */
+	I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(120),
+	I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT -> MAC */
+	I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
+
+	/* IPv6 --> GRE/NAT -> MAC -> IPv4 */
+	I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(128),
+	I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT -> MAC -> IPv6 */
+	I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(135),
+	I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT -> MAC/VLAN */
+	I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
+
+	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */
+	I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
+	I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
+	I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(143),
+	I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
+	I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
+	I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
+
+	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */
+	I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
+	I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
+	I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
+	I40E_PTT_UNUSED_ENTRY(150),
+	I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
+	I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
+	I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
+
+	/* PTYPEs 154-255: unused entries */
+	I40E_PTT_UNUSED_ENTRY(154),
+	I40E_PTT_UNUSED_ENTRY(155),
+	I40E_PTT_UNUSED_ENTRY(156),
+	I40E_PTT_UNUSED_ENTRY(157),
+	I40E_PTT_UNUSED_ENTRY(158),
+	I40E_PTT_UNUSED_ENTRY(159),
+
+	I40E_PTT_UNUSED_ENTRY(160),
+	I40E_PTT_UNUSED_ENTRY(161),
+	I40E_PTT_UNUSED_ENTRY(162),
+	I40E_PTT_UNUSED_ENTRY(163),
+	I40E_PTT_UNUSED_ENTRY(164),
+	I40E_PTT_UNUSED_ENTRY(165),
+	I40E_PTT_UNUSED_ENTRY(166),
+	I40E_PTT_UNUSED_ENTRY(167),
+	I40E_PTT_UNUSED_ENTRY(168),
+	I40E_PTT_UNUSED_ENTRY(169),
+
+	I40E_PTT_UNUSED_ENTRY(170),
+	I40E_PTT_UNUSED_ENTRY(171),
+	I40E_PTT_UNUSED_ENTRY(172),
+	I40E_PTT_UNUSED_ENTRY(173),
+	I40E_PTT_UNUSED_ENTRY(174),
+	I40E_PTT_UNUSED_ENTRY(175),
+	I40E_PTT_UNUSED_ENTRY(176),
+	I40E_PTT_UNUSED_ENTRY(177),
+	I40E_PTT_UNUSED_ENTRY(178),
+	I40E_PTT_UNUSED_ENTRY(179),
+
+	I40E_PTT_UNUSED_ENTRY(180),
+	I40E_PTT_UNUSED_ENTRY(181),
+	I40E_PTT_UNUSED_ENTRY(182),
+	I40E_PTT_UNUSED_ENTRY(183),
+	I40E_PTT_UNUSED_ENTRY(184),
+	I40E_PTT_UNUSED_ENTRY(185),
+	I40E_PTT_UNUSED_ENTRY(186),
+	I40E_PTT_UNUSED_ENTRY(187),
+	I40E_PTT_UNUSED_ENTRY(188),
+	I40E_PTT_UNUSED_ENTRY(189),
+
+	I40E_PTT_UNUSED_ENTRY(190),
+	I40E_PTT_UNUSED_ENTRY(191),
+	I40E_PTT_UNUSED_ENTRY(192),
+	I40E_PTT_UNUSED_ENTRY(193),
+	I40E_PTT_UNUSED_ENTRY(194),
+	I40E_PTT_UNUSED_ENTRY(195),
+	I40E_PTT_UNUSED_ENTRY(196),
+	I40E_PTT_UNUSED_ENTRY(197),
+	I40E_PTT_UNUSED_ENTRY(198),
+	I40E_PTT_UNUSED_ENTRY(199),
+
+	I40E_PTT_UNUSED_ENTRY(200),
+	I40E_PTT_UNUSED_ENTRY(201),
+	I40E_PTT_UNUSED_ENTRY(202),
+	I40E_PTT_UNUSED_ENTRY(203),
+	I40E_PTT_UNUSED_ENTRY(204),
+	I40E_PTT_UNUSED_ENTRY(205),
+	I40E_PTT_UNUSED_ENTRY(206),
+	I40E_PTT_UNUSED_ENTRY(207),
+	I40E_PTT_UNUSED_ENTRY(208),
+	I40E_PTT_UNUSED_ENTRY(209),
+
+	I40E_PTT_UNUSED_ENTRY(210),
+	I40E_PTT_UNUSED_ENTRY(211),
+	I40E_PTT_UNUSED_ENTRY(212),
+	I40E_PTT_UNUSED_ENTRY(213),
+	I40E_PTT_UNUSED_ENTRY(214),
+	I40E_PTT_UNUSED_ENTRY(215),
+	I40E_PTT_UNUSED_ENTRY(216),
+	I40E_PTT_UNUSED_ENTRY(217),
+	I40E_PTT_UNUSED_ENTRY(218),
+	I40E_PTT_UNUSED_ENTRY(219),
+
+	I40E_PTT_UNUSED_ENTRY(220),
+	I40E_PTT_UNUSED_ENTRY(221),
+	I40E_PTT_UNUSED_ENTRY(222),
+	I40E_PTT_UNUSED_ENTRY(223),
+	I40E_PTT_UNUSED_ENTRY(224),
+	I40E_PTT_UNUSED_ENTRY(225),
+	I40E_PTT_UNUSED_ENTRY(226),
+	I40E_PTT_UNUSED_ENTRY(227),
+	I40E_PTT_UNUSED_ENTRY(228),
+	I40E_PTT_UNUSED_ENTRY(229),
+
+	I40E_PTT_UNUSED_ENTRY(230),
+	I40E_PTT_UNUSED_ENTRY(231),
+	I40E_PTT_UNUSED_ENTRY(232),
+	I40E_PTT_UNUSED_ENTRY(233),
+	I40E_PTT_UNUSED_ENTRY(234),
+	I40E_PTT_UNUSED_ENTRY(235),
+	I40E_PTT_UNUSED_ENTRY(236),
+	I40E_PTT_UNUSED_ENTRY(237),
+	I40E_PTT_UNUSED_ENTRY(238),
+	I40E_PTT_UNUSED_ENTRY(239),
+
+	I40E_PTT_UNUSED_ENTRY(240),
+	I40E_PTT_UNUSED_ENTRY(241),
+	I40E_PTT_UNUSED_ENTRY(242),
+	I40E_PTT_UNUSED_ENTRY(243),
+	I40E_PTT_UNUSED_ENTRY(244),
+	I40E_PTT_UNUSED_ENTRY(245),
+	I40E_PTT_UNUSED_ENTRY(246),
+	I40E_PTT_UNUSED_ENTRY(247),
+	I40E_PTT_UNUSED_ENTRY(248),
+	I40E_PTT_UNUSED_ENTRY(249),
+
+	I40E_PTT_UNUSED_ENTRY(250),
+	I40E_PTT_UNUSED_ENTRY(251),
+	I40E_PTT_UNUSED_ENTRY(252),
+	I40E_PTT_UNUSED_ENTRY(253),
+	I40E_PTT_UNUSED_ENTRY(254),
+	I40E_PTT_UNUSED_ENTRY(255)
+};
+
+
+/**
+ * i40e_validate_mac_addr - Validate unicast MAC address
+ * @mac_addr: pointer to MAC address
+ *
+ * Rejects multicast (incl. broadcast) and all-zero addresses.
+ **/
+enum i40e_status_code i40e_validate_mac_addr(u8 *mac_addr)
+{
+	u8 any_bits = 0;
+	int i;
+
+	DEBUGFUNC("i40e_validate_mac_addr");
+
+	/* Broadcast addresses ARE multicast, so one test rejects both. */
+	if (I40E_IS_MULTICAST(mac_addr))
+		return I40E_ERR_INVALID_MAC_ADDR;
+
+	/* OR the six octets together; zero means the all-zero address. */
+	for (i = 0; i < 6; i++)
+		any_bits |= mac_addr[i];
+
+	return any_bits ? I40E_SUCCESS : I40E_ERR_INVALID_MAC_ADDR;
+}
+
+/**
+ * i40e_init_shared_code - Initialize the shared code
+ * @hw: pointer to hardware structure
+ *
+ * Assigns the MAC type, reads the port and PF numbers from device
+ * registers and inits the NVM.  This function must be called prior to
+ * any other function in the shared code.  The i40e_hw structure should
+ * be memset to 0 prior to calling this function.  The following fields
+ * in hw structure should be filled in prior to calling this function:
+ * hw_addr, back, device_id, vendor_id, subsystem_device_id,
+ * subsystem_vendor_id, and revision_id
+ **/
+enum i40e_status_code i40e_init_shared_code(struct i40e_hw *hw)
+{
+	u32 portnum_reg, capsup_reg, func_rid, ari;
+
+	DEBUGFUNC("i40e_init_shared_code");
+
+	i40e_set_mac_type(hw);
+
+	/* Any MAC type without a case below is rejected. */
+	switch (hw->mac.type) {
+	case I40E_MAC_XL710:
+#ifdef X722_SUPPORT
+	case I40E_MAC_X722:
+#endif
+		break;
+	default:
+		return I40E_ERR_DEVICE_NOT_SUPPORTED;
+	}
+
+	hw->phy.get_link_info = TRUE;
+
+	/* Determine port number and PF number */
+	portnum_reg = rd32(hw, I40E_PFGEN_PORTNUM);
+	hw->port = (u8)((portnum_reg & I40E_PFGEN_PORTNUM_PORT_NUM_MASK) >>
+			I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT);
+
+	capsup_reg = rd32(hw, I40E_GLPCI_CAPSUP);
+	ari = (capsup_reg & I40E_GLPCI_CAPSUP_ARI_EN_MASK) >>
+	      I40E_GLPCI_CAPSUP_ARI_EN_SHIFT;
+	func_rid = rd32(hw, I40E_PF_FUNC_RID);
+
+	/* PF id: low 8 bits of FUNC_RID when ARI is enabled, else low 3. */
+	hw->pf_id = (u8)(func_rid & (ari ? 0xff : 0x7));
+
+	return i40e_init_nvm(hw);
+}
+
+/**
+ * i40e_aq_mac_address_read - Retrieve the MAC addresses
+ * @hw: pointer to the hw struct
+ * @flags: receives command_flags from the descriptor, even on failure
+ * @addrs: the requestor's mac addr store
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+static enum i40e_status_code i40e_aq_mac_address_read(struct i40e_hw *hw,
+				   u16 *flags,
+				   struct i40e_aqc_mac_address_read_data *addrs,
+				   struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_mac_address_read *resp;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_mac_address_read);
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF);
+
+	ret = i40e_asq_send_command(hw, &desc, addrs,
+				    sizeof(*addrs), cmd_details);
+	resp = (struct i40e_aqc_mac_address_read *)&desc.params.raw;
+	*flags = LE16_TO_CPU(resp->command_flags);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_mac_address_write - Change the MAC addresses
+ * @hw: pointer to the hw struct
+ * @flags: indicates which MAC to be written
+ * @mac_addr: address to write (six octets are read)
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_mac_address_write(struct i40e_hw *hw,
+				    u16 flags, u8 *mac_addr,
+				    struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_mac_address_write *params;
+	struct i40e_aq_desc desc;
+	u16 sah;
+	u32 sal;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_mac_address_write);
+	params = (struct i40e_aqc_mac_address_write *)&desc.params.raw;
+	/* Octets 0-1 form the 16-bit mac_sah, octets 2-5 the 32-bit mac_sal. */
+	sah = (u16)mac_addr[0] << 8 | mac_addr[1];
+	sal = ((u32)mac_addr[2] << 24) | ((u32)mac_addr[3] << 16) |
+	      ((u32)mac_addr[4] << 8) | mac_addr[5];
+	params->command_flags = CPU_TO_LE16(flags);
+	params->mac_sah = CPU_TO_LE16(sah);
+	params->mac_sal = CPU_TO_LE32(sal);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_get_mac_addr - get MAC address
+ * @hw: pointer to the HW structure
+ * @mac_addr: pointer to MAC address
+ *
+ * Reads the PF LAN MAC address via i40e_aq_mac_address_read().
+ **/
+enum i40e_status_code i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
+{
+	struct i40e_aqc_mac_address_read_data macs;
+	enum i40e_status_code rc;
+	u16 valid = 0;
+
+	rc = i40e_aq_mac_address_read(hw, &valid, &macs, NULL);
+	/* Copy out only when the flags mark the LAN address as valid. */
+	if ((valid & I40E_AQC_LAN_ADDR_VALID) == 0)
+		return rc;
+	memcpy(mac_addr, &macs.pf_lan_mac, sizeof(macs.pf_lan_mac));
+	return rc;
+}
+
+/**
+ * i40e_get_port_mac_addr - get Port MAC address
+ * @hw: pointer to the HW structure
+ * @mac_addr: pointer to Port MAC address
+ *
+ * Reads the Port MAC address via i40e_aq_mac_address_read().
+ **/
+enum i40e_status_code i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
+{
+	struct i40e_aqc_mac_address_read_data macs;
+	enum i40e_status_code rc;
+	u16 valid = 0;
+
+	rc = i40e_aq_mac_address_read(hw, &valid, &macs, NULL);
+	if (rc)
+		return rc;
+
+	/* A successful command may still report no valid port address. */
+	if (!(valid & I40E_AQC_PORT_ADDR_VALID))
+		return I40E_ERR_INVALID_MAC_ADDR;
+
+	memcpy(mac_addr, &macs.port_mac, sizeof(macs.port_mac));
+	return rc;
+}
+
+/**
+ * i40e_pre_tx_queue_cfg - pre tx queue configure
+ * @hw: pointer to the HW structure
+ * @queue: target pf queue index
+ * @enable: state change request
+ *
+ * Handles hw requirement to indicate intention to enable
+ * or disable target queue: the queue's absolute index is written to
+ * GLLAN_TXPRE_QDIS together with the CLEAR_QDIS or SET_QDIS bit.
+ **/
+void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable)
+{
+	u32 abs_idx = hw->func_caps.base_queue + queue;
+	u32 block = abs_idx / 128;
+	u32 qindx = abs_idx % 128;
+	u32 qdis;
+
+	/*
+	 * One GLLAN_TXPRE_QDIS register is used per block of 128 queues:
+	 * the quotient picks the register, the remainder goes in QINDX.
+	 */
+	qdis = rd32(hw, I40E_GLLAN_TXPRE_QDIS(block));
+	qdis &= ~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK;
+	qdis |= (qindx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT);
+
+	/* TRUE requests CLEAR_QDIS, FALSE requests SET_QDIS. */
+	qdis |= enable ? I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK :
+			 I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK;
+
+	wr32(hw, I40E_GLLAN_TXPRE_QDIS(block), qdis);
+}
+
+/**
+ *  i40e_read_pba_string - Reads part number string from EEPROM
+ *  @hw: pointer to hardware structure
+ *  @pba_num: stores the part number string from the EEPROM
+ *  @pba_num_size: part number string buffer length
+ *
+ *  Returns I40E_ERR_PARAM for a bad PBA flag/size or a too-small buffer.
+ **/
+enum i40e_status_code i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num,
+					    u32 pba_num_size)
+{
+	enum i40e_status_code status = I40E_SUCCESS;
+	u16 pba_word = 0;
+	u16 pba_size = 0;
+	u16 pba_ptr = 0;
+	u16 i = 0;
+
+	status = i40e_read_nvm_word(hw, I40E_SR_PBA_FLAGS, &pba_word);
+	if ((status != I40E_SUCCESS) || (pba_word != 0xFAFA)) {
+		DEBUGOUT("Failed to read PBA flags or flag is invalid.\n");
+		/* A good read of a bad flag must not be reported as success */
+		return (status != I40E_SUCCESS) ? status : I40E_ERR_PARAM;
+	}
+
+	status = i40e_read_nvm_word(hw, I40E_SR_PBA_BLOCK_PTR, &pba_ptr);
+	if (status != I40E_SUCCESS) {
+		DEBUGOUT("Failed to read PBA Block pointer.\n");
+		return status;
+	}
+
+	status = i40e_read_nvm_word(hw, pba_ptr, &pba_size);
+	if (status != I40E_SUCCESS) {
+		DEBUGOUT("Failed to read PBA Block size.\n");
+		return status;
+	}
+
+	/* Subtract one to get PBA word count (PBA Size word is included in
+	 * total size).  A stored size of zero would wrap, so reject it too.
+	 */
+	if (pba_size == 0 || pba_num_size < (((u32)pba_size - 1) * 2) + 1) {
+		DEBUGOUT("Buffer to small for PBA data.\n");
+		return I40E_ERR_PARAM;
+	}
+	pba_size--;
+
+	for (i = 0; i < pba_size; i++) {
+		status = i40e_read_nvm_word(hw, (pba_ptr + 1) + i, &pba_word);
+		if (status != I40E_SUCCESS) {
+			DEBUGOUT1("Failed to read PBA Block word %d.\n", i);
+			return status;
+		}
+		pba_num[(i * 2)] = (pba_word >> 8) & 0xFF;
+		pba_num[(i * 2) + 1] = pba_word & 0xFF;
+	}
+	pba_num[(pba_size * 2)] = '\0';
+
+	return status;
+}
+
+/**
+ * i40e_get_media_type - Gets media type
+ * @hw: pointer to the hardware structure
+ *
+ * Maps hw->phy.link_info.phy_type onto a media class (fiber, BASE-T,
+ * direct attach or backplane).  SGMII, XAUI, XFI, XLAUI, XLPPI and
+ * every PHY type without a case of its own give
+ * I40E_MEDIA_TYPE_UNKNOWN.
+ **/
+static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw)
+{
+	/* Each group of PHY types returns its media class directly. */
+	switch (hw->phy.link_info.phy_type) {
+	case I40E_PHY_TYPE_10GBASE_SR:
+	case I40E_PHY_TYPE_10GBASE_LR:
+	case I40E_PHY_TYPE_1000BASE_SX:
+	case I40E_PHY_TYPE_1000BASE_LX:
+	case I40E_PHY_TYPE_40GBASE_SR4:
+	case I40E_PHY_TYPE_40GBASE_LR4:
+		return I40E_MEDIA_TYPE_FIBER;
+	case I40E_PHY_TYPE_100BASE_TX:
+	case I40E_PHY_TYPE_1000BASE_T:
+	case I40E_PHY_TYPE_10GBASE_T:
+		return I40E_MEDIA_TYPE_BASET;
+	case I40E_PHY_TYPE_10GBASE_CR1_CU:
+	case I40E_PHY_TYPE_40GBASE_CR4_CU:
+	case I40E_PHY_TYPE_10GBASE_CR1:
+	case I40E_PHY_TYPE_40GBASE_CR4:
+	case I40E_PHY_TYPE_10GBASE_SFPP_CU:
+	case I40E_PHY_TYPE_40GBASE_AOC:
+	case I40E_PHY_TYPE_10GBASE_AOC:
+		return I40E_MEDIA_TYPE_DA;
+	case I40E_PHY_TYPE_1000BASE_KX:
+	case I40E_PHY_TYPE_10GBASE_KX4:
+	case I40E_PHY_TYPE_10GBASE_KR:
+	case I40E_PHY_TYPE_40GBASE_KR4:
+	case I40E_PHY_TYPE_20GBASE_KR2:
+		return I40E_MEDIA_TYPE_BACKPLANE;
+	/* Listed explicitly, but handled like any unlisted PHY type. */
+	case I40E_PHY_TYPE_SGMII:
+	case I40E_PHY_TYPE_XAUI:
+	case I40E_PHY_TYPE_XFI:
+	case I40E_PHY_TYPE_XLAUI:
+	case I40E_PHY_TYPE_XLPPI:
+	default:
+		break;
+	}
+
+	return I40E_MEDIA_TYPE_UNKNOWN;
+}
+
+#define I40E_PF_RESET_WAIT_COUNT	200	/* poll-loop iteration limit */
+/**
+ * i40e_pf_reset - Reset the PF
+ * @hw: pointer to the hardware structure
+ *
+ * Waits for any global reset to finish and for firmware to report ready, then
+ * issues a PF software reset unless a global reset was seen in progress.
+ **/
+enum i40e_status_code i40e_pf_reset(struct i40e_hw *hw)
+{
+	u32 cnt = 0;
+	u32 cnt1 = 0;
+	u32 reg = 0;
+	u32 grst_del;
+
+	/* Poll for Global Reset steady state in case of recent GRST.
+	 * The grst delay value is in 100ms units, and we'll wait a
+	 * couple counts longer to be sure we don't just miss the end.
+	 */
+	grst_del = (rd32(hw, I40E_GLGEN_RSTCTL) &
+			I40E_GLGEN_RSTCTL_GRSTDEL_MASK) >>
+			I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT;
+	for (cnt = 0; cnt < grst_del + 10; cnt++) {
+		reg = rd32(hw, I40E_GLGEN_RSTAT);
+		if (!(reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK))
+			break;
+		i40e_msec_delay(100);
+	}
+	if (reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK) {
+		DEBUGOUT("Global reset polling failed to complete.\n");
+		return I40E_ERR_RESET_FAILED;
+	}
+
+	/* Wait for FW ready. NOTE(review): timeout test passes on either bit */
+	for (cnt1 = 0; cnt1 < I40E_PF_RESET_WAIT_COUNT; cnt1++) {
+		reg = rd32(hw, I40E_GLNVM_ULD);
+		reg &= (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK |
+			I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK);
+		if (reg == (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK |
+			    I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK)) {
+			DEBUGOUT1("Core and Global modules ready %d\n", cnt1);
+			break;
+		}
+		i40e_msec_delay(10);
+	}
+	if (!(reg & (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK |
+		     I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK))) {
+		DEBUGOUT("wait for FW Reset complete timedout\n");
+		DEBUGOUT1("I40E_GLNVM_ULD = 0x%x\n", reg);
+		return I40E_ERR_RESET_FAILED;
+	}
+
+	/* If there was a Global Reset in progress when we got here,
+	 * we don't need to do the PF Reset (cnt is the first loop's poll count)
+	 */
+	if (!cnt) {
+		reg = rd32(hw, I40E_PFGEN_CTRL);
+		wr32(hw, I40E_PFGEN_CTRL,
+		     (reg | I40E_PFGEN_CTRL_PFSWR_MASK));
+		for (cnt = 0; cnt < I40E_PF_RESET_WAIT_COUNT; cnt++) {
+			reg = rd32(hw, I40E_PFGEN_CTRL);
+			if (!(reg & I40E_PFGEN_CTRL_PFSWR_MASK))
+				break;
+			i40e_msec_delay(1);
+		}
+		if (reg & I40E_PFGEN_CTRL_PFSWR_MASK) {
+			DEBUGOUT("PF reset polling failed to complete.\n");
+			return I40E_ERR_RESET_FAILED;
+		}
+	}
+
+	i40e_clear_pxe_mode(hw);
+
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_clear_hw - clear out any left over hw state
+ * @hw: pointer to the hw struct
+ *
+ * Clear queues and interrupts, typically called at init time,
+ * but after the capabilities have been found so we know how many
+ * queues and msix vectors have been allocated.
+ **/
+void i40e_clear_hw(struct i40e_hw *hw)
+{
+	u32 num_queues, base_queue;
+	u32 num_pf_int;
+	u32 num_vf_int;
+	u32 num_vfs;
+	u32 i, j;
+	u32 val;
+	u32 eol = 0x7ff;
+
+	/* get number of interrupts, queues, and vfs */
+	val = rd32(hw, I40E_GLPCI_CNF2);
+	num_pf_int = (val & I40E_GLPCI_CNF2_MSI_X_PF_N_MASK) >>
+			I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT;
+	num_vf_int = (val & I40E_GLPCI_CNF2_MSI_X_VF_N_MASK) >>
+			I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT;
+
+	/* A LAST value below FIRST would wrap the unsigned count: use 0. */
+	val = rd32(hw, I40E_PFLAN_QALLOC);
+	base_queue = (val & I40E_PFLAN_QALLOC_FIRSTQ_MASK) >>
+			I40E_PFLAN_QALLOC_FIRSTQ_SHIFT;
+	j = (val & I40E_PFLAN_QALLOC_LASTQ_MASK) >>
+			I40E_PFLAN_QALLOC_LASTQ_SHIFT;
+	if ((val & I40E_PFLAN_QALLOC_VALID_MASK) && j >= base_queue)
+		num_queues = (j - base_queue) + 1;
+	else
+		num_queues = 0;
+
+	val = rd32(hw, I40E_PF_VT_PFALLOC);
+	i = (val & I40E_PF_VT_PFALLOC_FIRSTVF_MASK) >>
+			I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT;
+	j = (val & I40E_PF_VT_PFALLOC_LASTVF_MASK) >>
+			I40E_PF_VT_PFALLOC_LASTVF_SHIFT;
+	if ((val & I40E_PF_VT_PFALLOC_VALID_MASK) && j >= i)
+		num_vfs = (j - i) + 1;
+	else
+		num_vfs = 0;
+
+	/* stop all the interrupts; the loops below cover "count - 2" entries
+	 * and are written as "i + 2 < count" so that a count below 2 cannot
+	 * wrap the unsigned bound.
+	 */
+	wr32(hw, I40E_PFINT_ICR0_ENA, 0);
+	val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
+	for (i = 0; i + 2 < num_pf_int; i++)
+		wr32(hw, I40E_PFINT_DYN_CTLN(i), val);
+
+	/* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */
+	val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT;
+	wr32(hw, I40E_PFINT_LNKLST0, val);
+	for (i = 0; i + 2 < num_pf_int; i++)
+		wr32(hw, I40E_PFINT_LNKLSTN(i), val);
+	val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT;
+	for (i = 0; i < num_vfs; i++)
+		wr32(hw, I40E_VPINT_LNKLST0(i), val);
+	for (i = 0; i + 2 < num_vf_int; i++)
+		wr32(hw, I40E_VPINT_LNKLSTN(i), val);
+
+	/* warn the HW of the coming Tx disables */
+	for (i = 0; i < num_queues; i++) {
+		u32 abs_queue_idx = base_queue + i;
+		u32 reg_block = abs_queue_idx / 128;
+
+		abs_queue_idx %= 128;
+		val = rd32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block));
+		val &= ~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK;
+		val |= (abs_queue_idx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT);
+		val |= I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK;
+
+		wr32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block), val);
+	}
+	i40e_usec_delay(400);
+
+	/* stop all the queues */
+	for (i = 0; i < num_queues; i++) {
+		wr32(hw, I40E_QINT_TQCTL(i), 0);
+		wr32(hw, I40E_QTX_ENA(i), 0);
+		wr32(hw, I40E_QINT_RQCTL(i), 0);
+		wr32(hw, I40E_QRX_ENA(i), 0);
+	}
+
+	/* short wait for all queue disables to settle */
+	i40e_usec_delay(50);
+}
+
+/**
+ * i40e_clear_pxe_mode - leave PXE operations mode
+ * @hw: pointer to the hw struct
+ *
+ * Clears any PXE mode settings (including things like descriptor
+ * fetch/write-back mode) by issuing the clear-PXE admin queue command.
+ * Nothing is sent unless i40e_check_asq_alive() reports the admin send
+ * queue as alive, and the command's status is not propagated.
+ **/
+void i40e_clear_pxe_mode(struct i40e_hw *hw)
+{
+	if (!i40e_check_asq_alive(hw))
+		return;
+
+	i40e_aq_clear_pxe_mode(hw, NULL);
+}
+
+/**
+ * i40e_led_is_mine - check whether a GPIO-driven LED belongs to this port
+ * @hw: pointer to the hw struct
+ * @idx: index into GPIO registers
+ *
+ * returns: the GPIO_CTL register value when the LED capability is present
+ * for @idx, the pin is port specific and its port number equals hw->port;
+ * 0 in every other case.
+ */
+static u32 i40e_led_is_mine(struct i40e_hw *hw, int idx)
+{
+	u32 ctl;
+	u32 led_port;
+
+	/* no LED capability recorded for this GPIO index */
+	if (!hw->func_caps.led[idx])
+		return 0;
+
+	ctl = rd32(hw, I40E_GLGEN_GPIO_CTL(idx));
+
+	/* PRT_NUM_NA set means the LED is not port specific */
+	if (ctl & I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK)
+		return 0;
+
+	led_port = (ctl & I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK) >>
+		   I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT;
+
+	/* an LED wired to some other port is ignored */
+	return (led_port == hw->port) ? ctl : 0;
+}
+
+/* Values of the GPIO_CTL LED_MODE field that i40e_led_get() and
+ * i40e_led_set() compare against.
+ * NOTE(review): the hardware meaning of each value is presumably given by
+ * the datasheet - confirm there.
+ */
+#define I40E_COMBINED_ACTIVITY 0xA
+#define I40E_FILTER_ACTIVITY 0xE
+#define I40E_LINK_ACTIVITY 0xC
+#define I40E_MAC_ACTIVITY 0xD
+/* first GPIO index scanned by the LED helpers (LED0) */
+#define I40E_LED0 22
+
+/**
+ * i40e_led_get - return current on/off mode
+ * @hw: pointer to the hw struct
+ *
+ * The value returned is the 'mode' field as defined in the
+ * GPIO register definitions: 0x0 = off, 0xf = on, and other
+ * values are variations of possible behaviors relating to
+ * blink, link, and wire.  0 is also returned when no LED owned by
+ * this port qualifies.
+ **/
+u32 i40e_led_get(struct i40e_hw *hw)
+{
+	int pin;
+
+	/* as per the documentation GPIO 22-29 are the LED
+	 * GPIO pins named LED0..LED7
+	 */
+	for (pin = I40E_LED0; pin <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; pin++) {
+		u32 ctl = i40e_led_is_mine(hw, pin);
+		u32 led_mode;
+
+		if (!ctl)
+			continue;
+
+		led_mode = (ctl & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) >>
+			   I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT;
+
+		/* skip LEDs whose source mode is one of the activity modes */
+		if (led_mode == I40E_COMBINED_ACTIVITY ||
+		    led_mode == I40E_FILTER_ACTIVITY ||
+		    led_mode == I40E_MAC_ACTIVITY)
+			continue;
+
+		/* the first remaining LED decides the result */
+		return led_mode;
+	}
+
+	return 0;
+}
+
+/**
+ * i40e_led_set - set new on/off mode
+ * @hw: pointer to the hw struct
+ * @mode: 0=off, 0xf=on (else see manual for mode details)
+ * @blink: TRUE if the LED should blink when on, FALSE if steady
+ *
+ * if this function is used to turn on the blink it should
+ * be used to disable the blink when restoring the original state.
+ * Only the first LED owned by this port that is not in an activity
+ * source mode is reprogrammed.
+ **/
+void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink)
+{
+	int pin;
+
+	/* out-of-range modes are only reported; the mask below truncates */
+	if (mode & 0xfffffff0) {
+		DEBUGOUT1("invalid mode passed in %X\n", mode);
+	}
+
+	/* blinking is never requested together with link-activity mode */
+	if (mode == I40E_LINK_ACTIVITY)
+		blink = FALSE;
+
+	/* as per the documentation GPIO 22-29 are the LED
+	 * GPIO pins named LED0..LED7
+	 */
+	for (pin = I40E_LED0; pin <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; pin++) {
+		u32 ctl = i40e_led_is_mine(hw, pin);
+		u32 led_mode;
+
+		if (!ctl)
+			continue;
+
+		led_mode = (ctl & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) >>
+			   I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT;
+
+		/* skip LEDs whose source mode is one of the activity modes */
+		if (led_mode == I40E_COMBINED_ACTIVITY ||
+		    led_mode == I40E_FILTER_ACTIVITY ||
+		    led_mode == I40E_MAC_ACTIVITY)
+			continue;
+
+		/* swap in the new mode; the & doubles as a range check */
+		ctl &= ~I40E_GLGEN_GPIO_CTL_LED_MODE_MASK;
+		ctl |= (mode << I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT) &
+		       I40E_GLGEN_GPIO_CTL_LED_MODE_MASK;
+
+		if (blink)
+			ctl |= BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT);
+		else
+			ctl &= ~BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT);
+
+		wr32(hw, I40E_GLGEN_GPIO_CTL(pin), ctl);
+		return;
+	}
+}
+
+/* Admin command wrappers */
+
+/**
+ * i40e_aq_get_phy_capabilities
+ * @hw: pointer to the hw struct
+ * @qualified_modules: report Qualified Modules
+ * @report_init: report init capabilities (active are default)
+ * @abilities: structure for PHY capabilities to be filled
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns the various PHY abilities supported on the Port.
+ * I40E_ERR_PARAM is returned when @abilities is NULL.  If the admin
+ * queue's last status is I40E_AQ_RC_EIO after the command, the result is
+ * reported as I40E_ERR_UNKNOWN_PHY.
+ **/
+enum i40e_status_code i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
+			bool qualified_modules, bool report_init,
+			struct i40e_aq_get_phy_abilities_resp *abilities,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	u16 resp_len = sizeof(struct i40e_aq_get_phy_abilities_resp);
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+	u32 report = 0;
+
+	if (!abilities)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_phy_abilities);
+
+	/* the response comes back through an indirect buffer */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (resp_len > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	/* collect the optional report selectors before storing them */
+	if (qualified_modules)
+		report |= I40E_AQ_PHY_REPORT_QUALIFIED_MODULES;
+	if (report_init)
+		report |= I40E_AQ_PHY_REPORT_INITIAL_VALUES;
+	desc.params.external.param0 |= CPU_TO_LE32(report);
+
+	ret = i40e_asq_send_command(hw, &desc, abilities, resp_len,
+				    cmd_details);
+
+	if (hw->aq.asq_last_status == I40E_AQ_RC_EIO)
+		ret = I40E_ERR_UNKNOWN_PHY;
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_phy_config
+ * @hw: pointer to the hw struct
+ * @config: structure with PHY configuration to be set
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set the various PHY configuration parameters supported on the Port.
+ * One or more of the Set PHY config parameters may be ignored in an MFP
+ * mode as the PF may not have the privilege to set some of the PHY Config
+ * parameters. This status will be indicated by the command response.
+ * I40E_ERR_PARAM is returned when @config is NULL.
+ **/
+enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
+				struct i40e_aq_set_phy_config *config,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_set_phy_config *params;
+	struct i40e_aq_desc desc;
+
+	if (!config)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_config);
+
+	/* the configuration is carried inline in the descriptor params */
+	params = (struct i40e_aq_set_phy_config *)&desc.params.raw;
+	*params = *config;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_set_fc
+ * @hw: pointer to the hw struct
+ * @aq_failures: output bitmap of the admin queue steps that failed
+ *               (I40E_SET_FC_AQ_FAIL_GET / _SET / _UPDATE); must not be NULL
+ * @atomic_restart: TRUE to request an atomic link restart together with
+ *                  the new PHY configuration
+ *
+ * Set the requested flow control mode (hw->fc.requested_mode) using
+ * set_phy_config.  The returned status is that of the get-abilities
+ * call when it fails, otherwise that of the final link info update; a
+ * set_phy_config failure is reported only through @aq_failures.
+ **/
+enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
+				  bool atomic_restart)
+{
+	struct i40e_aq_get_phy_abilities_resp caps;
+	struct i40e_aq_set_phy_config cfg;
+	enum i40e_status_code ret;
+	u8 pause_bits;
+
+	/* translate the requested mode into PHY pause flags */
+	switch (hw->fc.requested_mode) {
+	case I40E_FC_FULL:
+		pause_bits = I40E_AQ_PHY_FLAG_PAUSE_TX |
+			     I40E_AQ_PHY_FLAG_PAUSE_RX;
+		break;
+	case I40E_FC_RX_PAUSE:
+		pause_bits = I40E_AQ_PHY_FLAG_PAUSE_RX;
+		break;
+	case I40E_FC_TX_PAUSE:
+		pause_bits = I40E_AQ_PHY_FLAG_PAUSE_TX;
+		break;
+	default:
+		pause_bits = 0x0;
+		break;
+	}
+
+	*aq_failures = 0x0;
+
+	/* Get the current phy config */
+	ret = i40e_aq_get_phy_capabilities(hw, FALSE, FALSE, &caps, NULL);
+	if (ret) {
+		*aq_failures |= I40E_SET_FC_AQ_FAIL_GET;
+		return ret;
+	}
+
+	memset(&cfg, 0, sizeof(cfg));
+	/* drop the old pause settings, then apply the requested ones */
+	cfg.abilities = caps.abilities &
+			~(I40E_AQ_PHY_FLAG_PAUSE_TX | I40E_AQ_PHY_FLAG_PAUSE_RX);
+	cfg.abilities |= pause_bits;
+
+	/* only reprogram the PHY when the abilities actually changed */
+	if (cfg.abilities != caps.abilities) {
+		/* Auto restart link so settings take effect */
+		if (atomic_restart)
+			cfg.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
+
+		/* carry every other setting over unchanged */
+		cfg.phy_type = caps.phy_type;
+		cfg.link_speed = caps.link_speed;
+		cfg.eee_capability = caps.eee_capability;
+		cfg.eeer = caps.eeer_val;
+		cfg.low_power_ctrl = caps.d3_lpan;
+
+		if (i40e_aq_set_phy_config(hw, &cfg, NULL))
+			*aq_failures |= I40E_SET_FC_AQ_FAIL_SET;
+	}
+
+	/* Update the link info */
+	ret = i40e_update_link_info(hw);
+	if (ret) {
+		/* Wait a little bit (on 40G cards it sometimes takes a really
+		 * long time for link to come back from the atomic reset)
+		 * and try once more
+		 */
+		i40e_msec_delay(1000);
+		ret = i40e_update_link_info(hw);
+	}
+	if (ret)
+		*aq_failures |= I40E_SET_FC_AQ_FAIL_UPDATE;
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_mac_config
+ * @hw: pointer to the hw struct
+ * @max_frame_size: Maximum Frame Size to be supported by the port
+ * @crc_en: Tell HW to append a CRC to outgoing frames
+ * @pacing: Pacing configurations
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Configure MAC settings for frame size, jumbo frame support and the
+ * addition of a CRC by the hardware.  A @max_frame_size of 0 is rejected
+ * with I40E_ERR_PARAM.
+ **/
+enum i40e_status_code i40e_aq_set_mac_config(struct i40e_hw *hw,
+				u16 max_frame_size,
+				bool crc_en, u16 pacing,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_set_mac_config *params;
+	struct i40e_aq_desc desc;
+
+	if (max_frame_size == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_mac_config);
+
+	params = (struct i40e_aq_set_mac_config *)&desc.params.raw;
+	params->max_frame_size = CPU_TO_LE16(max_frame_size);
+
+	/* low four bits of the pacing value go in at bit 3 */
+	params->params = ((u8)pacing & 0x0F) << 3;
+	if (crc_en)
+		params->params |= I40E_AQ_SET_MAC_CONFIG_CRC_EN;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_clear_pxe_mode
+ * @hw: pointer to the hw struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Tell the firmware that the driver is taking over from PXE.  The
+ * GLLAN_RCTL_0 register is written with 0x1 afterwards whether or not
+ * the admin queue command succeeded.
+ **/
+enum i40e_status_code i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_clear_pxe *params;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_clear_pxe_mode);
+
+	params = (struct i40e_aqc_clear_pxe *)&desc.params.raw;
+	params->rx_cnt = 0x2;
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	wr32(hw, I40E_GLLAN_RCTL_0, 0x1);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_link_restart_an
+ * @hw: pointer to the hw struct
+ * @enable_link: if TRUE: enable link, if FALSE: disable link
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Sets up the link and restarts the Auto-Negotiation over the link.
+ **/
+enum i40e_status_code i40e_aq_set_link_restart_an(struct i40e_hw *hw,
+		bool enable_link, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_link_restart_an *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_link_restart_an);
+
+	params = (struct i40e_aqc_set_link_restart_an *)&desc.params.raw;
+
+	/* always restart AN; the link-enable bit follows the request */
+	params->command = I40E_AQ_PHY_RESTART_AN;
+	if (!enable_link)
+		params->command &= ~I40E_AQ_PHY_LINK_ENABLE;
+	else
+		params->command |= I40E_AQ_PHY_LINK_ENABLE;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_get_link_info
+ * @hw: pointer to the hw struct
+ * @enable_lse: enable/disable LinkStatusEvent reporting
+ * @link: pointer to link status structure - optional
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns the link status of the adapter.  On success the previous
+ * contents of hw->phy.link_info are saved in hw->phy.link_info_old,
+ * hw->phy.link_info and hw->fc.current_mode are refreshed from the
+ * response, and hw->phy.get_link_info is cleared.  On failure nothing in
+ * @hw is touched by this function.
+ **/
+enum i40e_status_code i40e_aq_get_link_info(struct i40e_hw *hw,
+				bool enable_lse, struct i40e_link_status *link,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_link_status *cur = &hw->phy.link_info;
+	struct i40e_aqc_get_link_status *resp;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+	bool pause_tx, pause_rx;
+	u16 lse_flags;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_link_status);
+	resp = (struct i40e_aqc_get_link_status *)&desc.params.raw;
+
+	lse_flags = enable_lse ? I40E_AQ_LSE_ENABLE : I40E_AQ_LSE_DISABLE;
+	resp->command_flags = CPU_TO_LE16(lse_flags);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	/* keep a copy of the previous link state before overwriting it */
+	i40e_memcpy(&hw->phy.link_info_old, cur,
+		    sizeof(*cur), I40E_NONDMA_TO_NONDMA);
+
+	/* refresh the link state; the media type is derived right after
+	 * the PHY type has been stored
+	 */
+	cur->phy_type = (enum i40e_aq_phy_type)resp->phy_type;
+	hw->phy.media_type = i40e_get_media_type(hw);
+	cur->link_speed = (enum i40e_aq_link_speed)resp->link_speed;
+	cur->link_info = resp->link_info;
+	cur->an_info = resp->an_info;
+	cur->ext_info = resp->ext_info;
+	cur->loopback = resp->loopback;
+	cur->max_frame_size = LE16_TO_CPU(resp->max_frame_size);
+	cur->pacing = resp->config & I40E_AQ_CONFIG_PACING_MASK;
+
+	/* derive the active flow control mode from the pause bits */
+	pause_tx = !!(resp->an_info & I40E_AQ_LINK_PAUSE_TX);
+	pause_rx = !!(resp->an_info & I40E_AQ_LINK_PAUSE_RX);
+	if (pause_tx && pause_rx)
+		hw->fc.current_mode = I40E_FC_FULL;
+	else if (pause_tx)
+		hw->fc.current_mode = I40E_FC_TX_PAUSE;
+	else if (pause_rx)
+		hw->fc.current_mode = I40E_FC_RX_PAUSE;
+	else
+		hw->fc.current_mode = I40E_FC_NONE;
+
+	cur->crc_enable =
+		(resp->config & I40E_AQ_CONFIG_CRC_ENA) ? TRUE : FALSE;
+	cur->lse_enable =
+		(resp->command_flags & CPU_TO_LE16(I40E_AQ_LSE_ENABLE)) ?
+		TRUE : FALSE;
+
+	/* firmware older than 4.40 reporting PHY type 0xE is remapped */
+	if (cur->phy_type == 0xE &&
+	    (hw->aq.fw_maj_ver < 4 ||
+	     (hw->aq.fw_maj_ver == 4 && hw->aq.fw_min_ver < 40)))
+		cur->phy_type = I40E_PHY_TYPE_10GBASE_SFPP_CU;
+
+	/* hand a copy to the caller when one was asked for */
+	if (link)
+		i40e_memcpy(link, cur, sizeof(*cur),
+			    I40E_NONDMA_TO_NONDMA);
+
+	/* flag cleared so helper functions don't call AQ again */
+	hw->phy.get_link_info = FALSE;
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_phy_int_mask
+ * @hw: pointer to the hw struct
+ * @mask: interrupt mask to be set
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set link interrupt mask.
+ **/
+enum i40e_status_code i40e_aq_set_phy_int_mask(struct i40e_hw *hw,
+				u16 mask,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_phy_int_mask *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_int_mask);
+
+	params = (struct i40e_aqc_set_phy_int_mask *)&desc.params.raw;
+	params->event_mask = CPU_TO_LE16(mask);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_get_local_advt_reg
+ * @hw: pointer to the hw struct
+ * @advt_reg: local AN advertisement register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get the Local AN advertisement register value.  @advt_reg is written
+ * only when the command succeeds.
+ **/
+enum i40e_status_code i40e_aq_get_local_advt_reg(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_an_advt_reg *resp;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_local_advt_reg);
+	resp = (struct i40e_aqc_an_advt_reg *)&desc.params.raw;
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	/* reg1 supplies the bits above bit 31, reg0 the low 32 bits */
+	*advt_reg = ((u64)(LE16_TO_CPU(resp->local_an_reg1)) << 32) |
+		    LE32_TO_CPU(resp->local_an_reg0);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_local_advt_reg
+ * @hw: pointer to the hw struct
+ * @advt_reg: local AN advertisement register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set the Local AN advertisement register value.
+ **/
+enum i40e_status_code i40e_aq_set_local_advt_reg(struct i40e_hw *hw,
+				u64 advt_reg,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_an_advt_reg *cmd =
+		(struct i40e_aqc_an_advt_reg *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	/* use the SET opcode: building this descriptor with the GET opcode
+	 * would make the command read the register instead of writing it
+	 */
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_local_advt_reg);
+
+	/* low 32 bits go in reg0; reg1 is a 16-bit field taking the low
+	 * half of the upper dword
+	 */
+	cmd->local_an_reg0 = CPU_TO_LE32(I40E_LO_DWORD(advt_reg));
+	cmd->local_an_reg1 = CPU_TO_LE16(I40E_HI_DWORD(advt_reg));
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_partner_advt
+ * @hw: pointer to the hw struct
+ * @advt_reg: AN partner advertisement register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get the link partner AN advertisement register value.  @advt_reg is
+ * written only when the command succeeds.
+ **/
+enum i40e_status_code i40e_aq_get_partner_advt(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_an_advt_reg *resp;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_partner_advt);
+	resp = (struct i40e_aqc_an_advt_reg *)&desc.params.raw;
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	/* reg1 supplies the bits above bit 31, reg0 the low 32 bits */
+	*advt_reg = ((u64)(LE16_TO_CPU(resp->local_an_reg1)) << 32) |
+		    LE32_TO_CPU(resp->local_an_reg0);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_lb_modes
+ * @hw: pointer to the hw struct
+ * @lb_modes: loopback mode to be set
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Sets loopback modes.
+ **/
+enum i40e_status_code i40e_aq_set_lb_modes(struct i40e_hw *hw,
+				u16 lb_modes,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_lb_mode *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_lb_modes);
+
+	params = (struct i40e_aqc_set_lb_mode *)&desc.params.raw;
+	params->lb_mode = CPU_TO_LE16(lb_modes);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_phy_debug
+ * @hw: pointer to the hw struct
+ * @cmd_flags: debug command flags
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Reset the external PHY.
+ **/
+enum i40e_status_code i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_phy_debug *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_debug);
+
+	params = (struct i40e_aqc_set_phy_debug *)&desc.params.raw;
+	params->command_flags = cmd_flags;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_add_vsi
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Add a VSI context to the hardware.  vsi_ctx->info is sent as the
+ * command buffer; on success the seid, vsi_number and the
+ * allocated/unallocated VSI counts in @vsi_ctx are filled from the
+ * completion.
+ **/
+enum i40e_status_code i40e_aq_add_vsi(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_get_update_vsi_completion *done;
+	struct i40e_aqc_add_get_update_vsi *req;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_add_vsi);
+
+	/* request and completion are two views of the same params area */
+	req = (struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	done = (struct i40e_aqc_add_get_update_vsi_completion *)
+		&desc.params.raw;
+
+	req->uplink_seid = CPU_TO_LE16(vsi_ctx->uplink_seid);
+	req->connection_type = vsi_ctx->connection_type;
+	req->vf_id = vsi_ctx->vf_num;
+	req->vsi_flags = CPU_TO_LE16(vsi_ctx->flags);
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+
+	ret = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				    sizeof(vsi_ctx->info), cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	vsi_ctx->seid = LE16_TO_CPU(done->seid);
+	vsi_ctx->vsi_number = LE16_TO_CPU(done->vsi_number);
+	vsi_ctx->vsis_allocated = LE16_TO_CPU(done->vsi_used);
+	vsi_ctx->vsis_unallocated = LE16_TO_CPU(done->vsi_free);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_default_vsi
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Sends a set-VSI-promiscuous-modes command with the
+ * I40E_AQC_SET_VSI_DEFAULT flag both set and marked valid for @seid.
+ **/
+enum i40e_status_code i40e_aq_set_default_vsi(struct i40e_hw *hw,
+				u16 seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)
+		&desc.params.raw;
+	params->seid = CPU_TO_LE16(seid);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_DEFAULT);
+	params->promiscuous_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_DEFAULT);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_vsi_unicast_promiscuous
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set: set unicast promiscuous enable/disable
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Only the unicast promiscuous bit is marked valid in the command.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
+				u16 seid, bool set,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	u16 promisc = set ? I40E_AQC_SET_VSI_PROMISC_UNICAST : 0;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	params->promiscuous_flags = CPU_TO_LE16(promisc);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
+	params->seid = CPU_TO_LE16(seid);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_vsi_multicast_promiscuous
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set: set multicast promiscuous enable/disable
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Only the multicast promiscuous bit is marked valid in the command.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw,
+				u16 seid, bool set, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	u16 promisc = set ? I40E_AQC_SET_VSI_PROMISC_MULTICAST : 0;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	params->promiscuous_flags = CPU_TO_LE16(promisc);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_MULTICAST);
+	params->seid = CPU_TO_LE16(seid);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_vsi_mc_promisc_on_vlan
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @enable: set MAC L2 layer multicast promiscuous enable/disable for a given VLAN
+ * @vid: The VLAN tag filter - capture any multicast packet with this VLAN tag
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * @vid is sent with I40E_AQC_SET_VSI_VLAN_VALID or'ed in.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	u16 promisc = enable ? I40E_AQC_SET_VSI_PROMISC_MULTICAST : 0;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	params->promiscuous_flags = CPU_TO_LE16(promisc);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_MULTICAST);
+	params->seid = CPU_TO_LE16(seid);
+	params->vlan_tag = CPU_TO_LE16(vid | I40E_AQC_SET_VSI_VLAN_VALID);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_vsi_uc_promisc_on_vlan
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN
+ * @vid: The VLAN tag filter - capture any unicast packet with this VLAN tag
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * @vid is sent with I40E_AQC_SET_VSI_VLAN_VALID or'ed in.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	u16 promisc = enable ? I40E_AQC_SET_VSI_PROMISC_UNICAST : 0;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	params->promiscuous_flags = CPU_TO_LE16(promisc);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
+	params->seid = CPU_TO_LE16(seid);
+	params->vlan_tag = CPU_TO_LE16(vid | I40E_AQC_SET_VSI_VLAN_VALID);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_vsi_broadcast
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set_filter: TRUE to set filter, FALSE to clear filter
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set or clear the broadcast promiscuous flag (filter) for a given VSI.
+ * Only the broadcast bit is marked valid in the command.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
+				u16 seid, bool set_filter,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_set_vsi_promiscuous_modes *params;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	params = (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	params->seid = CPU_TO_LE16(seid);
+	params->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+
+	/* flip only the broadcast bit on top of the default descriptor */
+	if (!set_filter)
+		params->promiscuous_flags
+			    &= CPU_TO_LE16(~I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+	else
+		params->promiscuous_flags
+			    |= CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_get_vsi_params - get VSI configuration info
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Queries the VSI identified by vsi_ctx->seid.  The response buffer is
+ * written into vsi_ctx->info; on success the seid, vsi_number and the
+ * allocated/unallocated VSI counts in @vsi_ctx are refreshed from the
+ * completion.
+ **/
+enum i40e_status_code i40e_aq_get_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_get_update_vsi *cmd =
+		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp =
+		(struct i40e_aqc_add_get_update_vsi_completion *)
+		&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_vsi_parameters);
+
+	cmd->uplink_seid = CPU_TO_LE16(vsi_ctx->seid);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+
+	/* forward the caller's command details instead of dropping them,
+	 * matching the add/update VSI wrappers
+	 */
+	status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				    sizeof(vsi_ctx->info), cmd_details);
+
+	if (status != I40E_SUCCESS)
+		goto aq_get_vsi_params_exit;
+
+	vsi_ctx->seid = LE16_TO_CPU(resp->seid);
+	vsi_ctx->vsi_number = LE16_TO_CPU(resp->vsi_number);
+	vsi_ctx->vsis_allocated = LE16_TO_CPU(resp->vsi_used);
+	vsi_ctx->vsis_unallocated = LE16_TO_CPU(resp->vsi_free);
+
+aq_get_vsi_params_exit:
+	return status;
+}
+
+/**
+ * i40e_aq_update_vsi_params
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Update a VSI context.  The VSI is selected by vsi_ctx->seid and
+ * vsi_ctx->info is sent as the command buffer.
+ **/
+enum i40e_status_code i40e_aq_update_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_get_update_vsi *req;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_update_vsi_parameters);
+
+	req = (struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	req->uplink_seid = CPU_TO_LE16(vsi_ctx->seid);
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+
+	return i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				     sizeof(vsi_ctx->info), cmd_details);
+}
+
+/**
+ * i40e_aq_get_switch_config
+ * @hw: pointer to the hardware structure
+ * @buf: pointer to the result buffer
+ * @buf_size: length of input buffer
+ * @start_seid: seid to start for the report, 0 == beginning
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Fill the buf with switch configuration returned from AdminQ command.
+ * @start_seid is rewritten from the descriptor after the command, whether
+ * or not the command succeeded.
+ **/
+enum i40e_status_code i40e_aq_get_switch_config(struct i40e_hw *hw,
+				struct i40e_aqc_get_switch_config_resp *buf,
+				u16 buf_size, u16 *start_seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_switch_seid *seid_params;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_switch_config);
+
+	seid_params = (struct i40e_aqc_switch_seid *)&desc.params.raw;
+	seid_params->seid = CPU_TO_LE16(*start_seid);
+
+	/* the result comes back through an indirect buffer */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (buf_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	ret = i40e_asq_send_command(hw, &desc, buf, buf_size, cmd_details);
+
+	*start_seid = LE16_TO_CPU(seid_params->seid);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_get_firmware_version
+ * @hw: pointer to the hw struct
+ * @fw_major_version: firmware major version
+ * @fw_minor_version: firmware minor version
+ * @fw_build: firmware build number
+ * @api_major_version: major queue version
+ * @api_minor_version: minor queue version
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get the firmware version from the admin queue commands.  Every output
+ * pointer is optional (NULL is skipped) and outputs are written only when
+ * the command succeeds.
+ **/
+enum i40e_status_code i40e_aq_get_firmware_version(struct i40e_hw *hw,
+				u16 *fw_major_version, u16 *fw_minor_version,
+				u32 *fw_build,
+				u16 *api_major_version, u16 *api_minor_version,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_get_version *ver;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version);
+	ver = (struct i40e_aqc_get_version *)&desc.params.raw;
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	if (fw_major_version != NULL)
+		*fw_major_version = LE16_TO_CPU(ver->fw_major);
+	if (fw_minor_version != NULL)
+		*fw_minor_version = LE16_TO_CPU(ver->fw_minor);
+	if (fw_build != NULL)
+		*fw_build = LE32_TO_CPU(ver->fw_build);
+	if (api_major_version != NULL)
+		*api_major_version = LE16_TO_CPU(ver->api_major);
+	if (api_minor_version != NULL)
+		*api_minor_version = LE16_TO_CPU(ver->api_minor);
+
+	/* A workaround to fix the API version in SW: it needs all four
+	 * version outputs, and applies only to a reported API 1.1
+	 */
+	if (!api_major_version || !api_minor_version ||
+	    !fw_major_version || !fw_minor_version)
+		return ret;
+
+	if (*api_major_version != 1 || *api_minor_version != 1)
+		return ret;
+
+	/* firmware 4.2 and later is reported as API minor version 2 */
+	if (*fw_major_version > 4 ||
+	    (*fw_major_version == 4 && *fw_minor_version >= 2))
+		*api_minor_version = 2;
+
+	return ret;
+}
+
+/**
+ * i40e_aq_send_driver_version - report the driver version to the firmware
+ * @hw: pointer to the hw struct
+ * @dv: driver version numbers and string; must not be NULL
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns I40E_ERR_PARAM if @dv is NULL, else the AdminQ send status.
+ **/
+enum i40e_status_code i40e_aq_send_driver_version(struct i40e_hw *hw,
+				struct i40e_driver_version *dv,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_driver_version *ver;
+	struct i40e_aq_desc desc;
+	u16 str_len;
+
+	if (dv == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_driver_version);
+	/* The driver string is attached as the command buffer. */
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD);
+
+	ver = (struct i40e_aqc_driver_version *)&desc.params.raw;
+	ver->driver_major_ver = dv->major_version;
+	ver->driver_minor_ver = dv->minor_version;
+	ver->driver_build_ver = dv->build_version;
+	ver->driver_subbuild_ver = dv->subbuild_version;
+
+	/* Send the string up to its NUL or first byte of 0x80 and above. */
+	for (str_len = 0; str_len < sizeof(dv->driver_string); str_len++) {
+		if (!dv->driver_string[str_len] ||
+		    dv->driver_string[str_len] >= 0x80)
+			break;
+	}
+
+	return i40e_asq_send_command(hw, &desc, dv->driver_string,
+				     str_len, cmd_details);
+}
+
+/**
+ * i40e_get_link_status - get status of the HW network link
+ * @hw: pointer to the hw struct
+ * @link_up: pointer to bool (TRUE/FALSE = linkup/linkdown)
+ *
+ * Updates the link info first when hw->phy.get_link_info is set.  *link_up
+ * is always written, but is invalid if the returned status != I40E_SUCCESS.
+ *
+ * Side effect: LinkStatusEvent reporting becomes enabled
+ **/
+enum i40e_status_code i40e_get_link_status(struct i40e_hw *hw, bool *link_up)
+{
+	enum i40e_status_code ret = I40E_SUCCESS;
+
+	/* Refresh the link info first when hw->phy.get_link_info is set. */
+	if (hw->phy.get_link_info)
+		ret = i40e_update_link_info(hw);
+
+	if (ret != I40E_SUCCESS)
+		i40e_debug(hw, I40E_DEBUG_LINK, "get link failed: status %d\n",
+			   ret);
+
+	/* Reported from the stored link info even when the refresh failed. */
+	*link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
+	return ret;
+}
+
+/**
+ * i40e_update_link_info - update status of the HW network link
+ * @hw: pointer to the hw struct; its phy.link_info.module_type is refreshed
+ **/
+enum i40e_status_code i40e_update_link_info(struct i40e_hw *hw)
+{
+	struct i40e_aq_get_phy_abilities_resp abilities;
+	enum i40e_status_code status;
+
+	status = i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
+	if (status)
+		return status;
+
+	status = i40e_aq_get_phy_capabilities(hw, FALSE, FALSE, &abilities,
+					      NULL);
+	if (status)
+		return status;
+
+	memcpy(hw->phy.link_info.module_type, &abilities.module_type,
+		sizeof(hw->phy.link_info.module_type));
+
+	return status;
+}
+
+
+/**
+ * i40e_get_link_speed - report the current link speed
+ * @hw: pointer to the hw struct
+ *
+ * Returns the link speed, or I40E_LINK_SPEED_UNKNOWN on a failed query.
+ **/
+enum i40e_aq_link_speed i40e_get_link_speed(struct i40e_hw *hw)
+{
+	/*
+	 * When hw->phy.get_link_info is set the link info is fetched again
+	 * before the speed is read; a failed fetch yields
+	 * I40E_LINK_SPEED_UNKNOWN.
+	 */
+	if (hw->phy.get_link_info) {
+		enum i40e_status_code ret;
+
+		ret = i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
+		if (ret != I40E_SUCCESS)
+			return I40E_LINK_SPEED_UNKNOWN;
+	}
+
+	return hw->phy.link_info.link_speed;
+}
+
+/**
+ * i40e_aq_add_veb - Insert a VEB between the VSI and the MAC
+ * @hw: pointer to the hw struct
+ * @uplink_seid: the MAC or other gizmo SEID
+ * @downlink_seid: the VSI SEID
+ * @enabled_tc: bitmap of TCs to be enabled
+ * @default_port: TRUE for default port VSI, FALSE for control port
+ * @enable_l2_filtering: TRUE to add L2 filter table rules to regular
+ *                       forwarding rules for cloud support
+ * @veb_seid: pointer to where to put the resulting VEB SEID, may be NULL
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This asks the FW to add a VEB between the uplink and downlink
+ * elements.  If both SEIDs are 0, this will be a floating VEB; passing
+ * only one of them as 0 is rejected with I40E_ERR_PARAM.
+ **/
+enum i40e_status_code i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
+				u16 downlink_seid, u8 enabled_tc,
+				bool default_port, bool enable_l2_filtering,
+				u16 *veb_seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_veb_completion *done;
+	struct i40e_aqc_add_veb *req;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+	u16 flags;
+
+	/* A floating VEB has neither SEID; any other VEB needs both. */
+	if ((uplink_seid == 0) != (downlink_seid == 0))
+		return I40E_ERR_PARAM;
+
+	flags = default_port ? I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT :
+			       I40E_AQC_ADD_VEB_PORT_TYPE_DATA;
+	if (uplink_seid == 0)
+		flags |= I40E_AQC_ADD_VEB_FLOATING;
+	if (enable_l2_filtering)
+		flags |= I40E_AQC_ADD_VEB_ENABLE_L2_FILTER;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_veb);
+
+	/* Request and completion both overlay the descriptor parameters. */
+	req = (struct i40e_aqc_add_veb *)&desc.params.raw;
+	done = (struct i40e_aqc_add_veb_completion *)&desc.params.raw;
+
+	req->uplink_seid = CPU_TO_LE16(uplink_seid);
+	req->downlink_seid = CPU_TO_LE16(downlink_seid);
+	req->enable_tcs = enabled_tc;
+	req->veb_flags = CPU_TO_LE16(flags);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (!ret && veb_seid)
+		*veb_seid = LE16_TO_CPU(done->veb_seid);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_get_veb_parameters - Retrieve VEB parameters
+ * @hw: pointer to the hw struct
+ * @veb_seid: the SEID of the VEB to query; 0 is rejected
+ * @switch_id: the uplink switch id
+ * @floating: set to TRUE if the VEB is floating
+ * @statistic_index: index of the stats counter block for this VEB
+ * @vebs_used: number of VEB's used by function
+ * @vebs_free: total VEB's not reserved by any function
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This retrieves the parameters for a particular VEB, specified by
+ * @veb_seid, and returns them through the non-NULL output pointers.
+ **/
+enum i40e_status_code i40e_aq_get_veb_parameters(struct i40e_hw *hw,
+				u16 veb_seid, u16 *switch_id,
+				bool *floating, u16 *statistic_index,
+				u16 *vebs_used, u16 *vebs_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_get_veb_parameters_completion *veb;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	if (veb_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_veb_parameters);
+
+	/* The same descriptor area carries the query and its completion. */
+	veb = (struct i40e_aqc_get_veb_parameters_completion *)
+	      &desc.params.raw;
+	veb->seid = CPU_TO_LE16(veb_seid);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret)
+		return ret;
+
+	/* Every output pointer is optional. */
+	if (switch_id)
+		*switch_id = LE16_TO_CPU(veb->switch_id);
+	if (statistic_index)
+		*statistic_index = LE16_TO_CPU(veb->statistic_index);
+	if (vebs_used)
+		*vebs_used = LE16_TO_CPU(veb->vebs_used);
+	if (vebs_free)
+		*vebs_free = LE16_TO_CPU(veb->vebs_free);
+	if (floating) {
+		u16 veb_flags = LE16_TO_CPU(veb->veb_flags);
+
+		*floating = (veb_flags & I40E_AQC_ADD_VEB_FLOATING) ?
+			    TRUE : FALSE;
+	}
+
+	return ret;
+}
+
+/**
+ * i40e_aq_add_macvlan - add MAC/VLAN filters to a VSI
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the mac address
+ * @mv_list: list of macvlans to be added
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Add MAC/VLAN addresses to the HW filtering.  Returns I40E_ERR_PARAM
+ * when @hw or @mv_list is NULL or @count is 0; the buffer size is
+ * @count * sizeof(*@mv_list) held in a u16.
+ **/
+enum i40e_status_code i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_add_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_macvlan *req;
+	struct i40e_aq_desc desc;
+	u16 list_bytes;
+
+	if (!hw || !mv_list || count == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_macvlan);
+
+	/* Only the first SEID slot is used; it carries the valid bit. */
+	req = (struct i40e_aqc_macvlan *)&desc.params.raw;
+	req->num_addresses = CPU_TO_LE16(count);
+	req->seid[0] = CPU_TO_LE16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
+	req->seid[1] = 0;
+	req->seid[2] = 0;
+
+	/* The element list is attached as the command buffer. */
+	list_bytes = count * sizeof(*mv_list);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (list_bytes > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, mv_list, list_bytes,
+				     cmd_details);
+}
+
+/**
+ * i40e_aq_remove_macvlan - remove MAC/VLAN filters from a VSI
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the mac address
+ * @mv_list: list of macvlans to be removed
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Remove MAC/VLAN addresses from the HW filtering.  Returns
+ * I40E_ERR_PARAM when @hw or @mv_list is NULL or @count is 0; the buffer
+ * size is @count * sizeof(*@mv_list) held in a u16.
+ **/
+enum i40e_status_code i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_remove_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_macvlan *req;
+	struct i40e_aq_desc desc;
+	u16 list_bytes;
+
+	if (!hw || !mv_list || count == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan);
+
+	/* Only the first SEID slot is used; it carries the valid bit. */
+	req = (struct i40e_aqc_macvlan *)&desc.params.raw;
+	req->num_addresses = CPU_TO_LE16(count);
+	req->seid[0] = CPU_TO_LE16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
+	req->seid[1] = 0;
+	req->seid[2] = 0;
+
+	/* The element list is attached as the command buffer. */
+	list_bytes = count * sizeof(*mv_list);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (list_bytes > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, mv_list, list_bytes,
+				     cmd_details);
+}
+
+/**
+ * i40e_aq_add_vlan - Add VLAN ids to the HW filtering
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the vlan filters
+ * @v_list: list of vlan filters to be added
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns I40E_ERR_PARAM when @hw or @v_list is NULL or @count is 0.
+ **/
+enum i40e_status_code i40e_aq_add_vlan(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_macvlan *req;
+	struct i40e_aq_desc desc;
+	u16 list_bytes;
+
+	if (!hw || !v_list || count == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_vlan);
+
+	/* The VLAN commands reuse the macvlan parameter layout. */
+	req = (struct i40e_aqc_macvlan *)&desc.params.raw;
+	req->num_addresses = CPU_TO_LE16(count);
+	req->seid[0] = CPU_TO_LE16(seid | I40E_AQC_MACVLAN_CMD_SEID_VALID);
+	req->seid[1] = 0;
+	req->seid[2] = 0;
+
+	/* The filter list is attached as the command buffer. */
+	list_bytes = count * sizeof(*v_list);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (list_bytes > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, v_list, list_bytes,
+				     cmd_details);
+}
+
+/**
+ * i40e_aq_remove_vlan - Remove VLANs from the HW filtering
+ * @hw: pointer to the hw struct
+ * @seid: VSI for the vlan filters
+ * @v_list: list of vlan filters to be removed
+ * @count: length of the list
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns I40E_ERR_PARAM when @hw or @v_list is NULL or @count is 0.
+ **/
+enum i40e_status_code i40e_aq_remove_vlan(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_macvlan *req;
+	struct i40e_aq_desc desc;
+	u16 list_bytes;
+
+	if (!hw || !v_list || count == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_vlan);
+
+	/* The VLAN commands reuse the macvlan parameter layout. */
+	req = (struct i40e_aqc_macvlan *)&desc.params.raw;
+	req->num_addresses = CPU_TO_LE16(count);
+	req->seid[0] = CPU_TO_LE16(seid | I40E_AQC_MACVLAN_CMD_SEID_VALID);
+	req->seid[1] = 0;
+	req->seid[2] = 0;
+
+	/* The filter list is attached as the command buffer. */
+	list_bytes = count * sizeof(*v_list);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (list_bytes > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, v_list, list_bytes,
+				     cmd_details);
+}
+
+/**
+ * i40e_aq_send_msg_to_vf - send a message to a VF via the AdminQ
+ * @hw: pointer to the hardware structure
+ * @vfid: vf id to send msg
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length; buffer flags and datalen are set only if non-zero
+ * @cmd_details: pointer to command details
+ *
+ * @v_opcode and @v_retval are placed in the descriptor cookie words.
+ **/
+enum i40e_status_code i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
+				u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_pf_vf_message *vf_msg;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_vf);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_SI);
+	desc.cookie_high = CPU_TO_LE32(v_opcode);
+	desc.cookie_low = CPU_TO_LE32(v_retval);
+
+	vf_msg = (struct i40e_aqc_pf_vf_message *)&desc.params.raw;
+	vf_msg->id = CPU_TO_LE32(vfid);
+
+	if (msglen != 0) {
+		desc.datalen = CPU_TO_LE16(msglen);
+		desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF |
+						I40E_AQ_FLAG_RD));
+		if (msglen > I40E_AQ_LARGE_BUF)
+			desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	}
+
+	return i40e_asq_send_command(hw, &desc, msg, msglen, cmd_details);
+}
+
+/**
+ * i40e_aq_debug_read_register - read a register through the AdminQ
+ * @hw: pointer to the hw struct
+ * @reg_addr: register address
+ * @reg_val: receives the 64-bit register value on success; not NULL
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * The value is assembled from the value_high/value_low response words.
+ **/
+enum i40e_status_code i40e_aq_debug_read_register(struct i40e_hw *hw,
+				u32 reg_addr, u64 *reg_val,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_debug_reg_read_write *reg;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+	u64 hi, lo;
+
+	if (reg_val == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_read_reg);
+	reg = (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw;
+	reg->address = CPU_TO_LE32(reg_addr);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret != I40E_SUCCESS)
+		return ret;
+
+	hi = (u64)LE32_TO_CPU(reg->value_high);
+	lo = (u64)LE32_TO_CPU(reg->value_low);
+	*reg_val = (hi << 32) | lo;
+	return ret;
+}
+
+/**
+ * i40e_aq_debug_write_register - write a register through the AdminQ
+ * @hw: pointer to the hw struct
+ * @reg_addr: register address
+ * @reg_val: register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Write to a register using the admin queue commands.  Bits 63:32 of
+ * @reg_val go out in value_high and bits 31:0 in value_low.  Returns the
+ * status of the AdminQ send.
+ **/
+enum i40e_status_code i40e_aq_debug_write_register(struct i40e_hw *hw,
+				u32 reg_addr, u64 reg_val,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_debug_reg_read_write *reg;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_write_reg);
+
+	/* The 64-bit value is split into two 32-bit command words. */
+	reg = (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw;
+	reg->address = CPU_TO_LE32(reg_addr);
+	reg->value_low = CPU_TO_LE32((u32)(reg_val & 0xFFFFFFFF));
+	reg->value_high = CPU_TO_LE32((u32)(reg_val >> 32));
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_get_hmc_resource_profile - query the HMC profile of the device
+ * @hw: pointer to the hw struct
+ * @profile: receives the masked pm_profile field of the response
+ * @pe_vf_enabled_count: receives the masked pe_vf_enabled field
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Both outputs are written even when the returned status is a failure.
+ **/
+enum i40e_status_code i40e_aq_get_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile *profile,
+				u8 *pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_get_set_hmc_resource_profile *hmc;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_query_hmc_resource_profile);
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	hmc = (struct i40e_aq_get_set_hmc_resource_profile *)&desc.params.raw;
+	*profile = (enum i40e_aq_hmc_profile)(hmc->pm_profile &
+		   I40E_AQ_GET_HMC_RESOURCE_PROFILE_PM_MASK);
+	*pe_vf_enabled_count = hmc->pe_vf_enabled &
+			       I40E_AQ_GET_HMC_RESOURCE_PROFILE_COUNT_MASK;
+
+	return ret;
+}
+
+/**
+ * i40e_aq_set_hmc_resource_profile - set the HMC profile of the device
+ * @hw: pointer to the hw struct
+ * @profile: type of profile the HMC is to be set as
+ * @pe_vf_enabled_count: the number of PE enabled VFs the system has
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * @profile is narrowed to a u8 and sent in pm_profile;
+ * @pe_vf_enabled_count is sent unchanged in pe_vf_enabled.
+ *
+ * Returns the status of the AdminQ send.
+ **/
+enum i40e_status_code i40e_aq_set_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile profile,
+				u8 pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_get_set_hmc_resource_profile *hmc;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_hmc_resource_profile);
+
+	hmc = (struct i40e_aq_get_set_hmc_resource_profile *)&desc.params.raw;
+	hmc->pe_vf_enabled = pe_vf_enabled_count;
+	hmc->pm_profile = (u8)profile;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_request_resource - request ownership of a common resource
+ * @hw: pointer to the hw struct
+ * @resource: resource id
+ * @access: access type
+ * @sdp_number: resource number
+ * @timeout: receives the completion's timeout on success or EBUSY
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * requests common resource using the admin queue commands
+ **/
+enum i40e_status_code i40e_aq_request_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				enum i40e_aq_resource_access_type access,
+				u8 sdp_number, u64 *timeout,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_request_resource *res;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	DEBUGFUNC("i40e_aq_request_resource");
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_request_resource);
+
+	res = (struct i40e_aqc_request_resource *)&desc.params.raw;
+	res->resource_id = CPU_TO_LE16(resource);
+	res->access_type = CPU_TO_LE16(access);
+	res->resource_number = CPU_TO_LE32(sdp_number);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	/*
+	 * On success the completion's Timeout field is the maximum time in
+	 * ms the driver may hold the resource.  On an EBUSY completion it is
+	 * the maximum time the current owner has to free it.  No other
+	 * outcome touches *timeout.
+	 */
+	if (ret != I40E_SUCCESS && hw->aq.asq_last_status != I40E_AQ_RC_EBUSY)
+		return ret;
+	*timeout = LE32_TO_CPU(res->timeout);
+	return ret;
+}
+
+/**
+ * i40e_aq_release_resource - give back a common resource
+ * @hw: pointer to the hw struct
+ * @resource: resource id
+ * @sdp_number: resource number
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * release common resource using the admin queue commands
+ *
+ * Returns the status of the AdminQ send.
+ **/
+enum i40e_status_code i40e_aq_release_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				u8 sdp_number,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_request_resource *res;
+	struct i40e_aq_desc desc;
+
+	DEBUGFUNC("i40e_aq_release_resource");
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_release_resource);
+
+	/* Release reuses the request-resource parameter layout. */
+	res = (struct i40e_aqc_request_resource *)&desc.params.raw;
+	res->resource_number = CPU_TO_LE32(sdp_number);
+	res->resource_id = CPU_TO_LE16(resource);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_read_nvm - read a section of the NVM
+ * @hw: pointer to the hw struct
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: byte offset from the module beginning
+ * @length: length of the section to be read (in bytes from the offset)
+ * @data: command buffer (size [bytes] = length)
+ * @last_command: tells if this is the last command in a series
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Read the NVM using the admin queue commands.
+ *
+ * Returns I40E_ERR_PARAM, without issuing a command, when any of the top
+ * eight bits of @offset is set; otherwise returns the status of the
+ * AdminQ send.
+ **/
+enum i40e_status_code i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_nvm_update *nvm;
+	struct i40e_aq_desc desc;
+
+	DEBUGFUNC("i40e_aq_read_nvm");
+
+	/* Only the low three bytes of the offset may be non-zero. */
+	if ((offset & 0xFF000000) != 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_read);
+
+	nvm = (struct i40e_aqc_nvm_update *)&desc.params.raw;
+	nvm->module_pointer = module_pointer;
+	nvm->offset = CPU_TO_LE32(offset);
+	nvm->length = CPU_TO_LE16(length);
+
+	/* Flag the final command of a series. */
+	if (last_command)
+		nvm->command_flags |= I40E_AQ_NVM_LAST_CMD;
+
+	/* @data is attached as the command buffer; flag it large if needed. */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (length > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, data, length, cmd_details);
+}
+
+/**
+ * i40e_aq_read_nvm_config - read an nvm config block
+ * @hw: pointer to the hw struct
+ * @cmd_flags: NVM access admin command bits
+ * @field_id: field or feature id
+ * @data: buffer for result
+ * @buf_size: buffer size
+ * @element_count: if not NULL, receives the element count on success
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_read_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, u32 field_id, void *data,
+				u16 buf_size, u16 *element_count,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_nvm_config_read *cfg;
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+	u16 id_msw = 0;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_config_read);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF));
+	if (buf_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	/* The upper id word is sent only when a masked flag bit is set. */
+	if (cmd_flags & I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_MASK)
+		id_msw = (u16)(field_id >> 16);
+
+	cfg = (struct i40e_aqc_nvm_config_read *)&desc.params.raw;
+	cfg->cmd_flags = CPU_TO_LE16(cmd_flags);
+	cfg->element_id = CPU_TO_LE16((u16)(0xffff & field_id));
+	cfg->element_id_msw = CPU_TO_LE16(id_msw);
+
+	ret = i40e_asq_send_command(hw, &desc, data, buf_size, cmd_details);
+	if (!ret && element_count)
+		*element_count = LE16_TO_CPU(cfg->element_count);
+	return ret;
+}
+
+/**
+ * i40e_aq_write_nvm_config - write an nvm config block
+ * @hw: pointer to the hw struct
+ * @cmd_flags: NVM access admin command bits
+ * @data: buffer holding the data passed with the command
+ * @buf_size: buffer size
+ * @element_count: count of elements to be written
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_write_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, void *data, u16 buf_size,
+				u16 element_count,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_nvm_config_write *cfg;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_config_write);
+
+	cfg = (struct i40e_aqc_nvm_config_write *)&desc.params.raw;
+	cfg->cmd_flags = CPU_TO_LE16(cmd_flags);
+	cfg->element_count = CPU_TO_LE16(element_count);
+
+	/* @data is attached as the command buffer. */
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (buf_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	return i40e_asq_send_command(hw, &desc, data, buf_size, cmd_details);
+}
+
+/**
+ * i40e_aq_oem_post_update - triggers an OEM specific flow after update
+ * @hw: pointer to the hw struct
+ * @buff: not referenced by this function
+ * @buff_size: not referenced by this function
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_oem_post_update(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	enum i40e_status_code ret;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_oem_post_update);
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (ret && LE16_TO_CPU(desc.retval) == I40E_AQ_RC_ESRCH)
+		return I40E_ERR_NOT_IMPLEMENTED;
+	return ret;
+}
+
+/**
+ * i40e_aq_erase_nvm - erase a section of the NVM
+ * @hw: pointer to the hw struct
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: offset in the module (expressed in 4 KB from module's beginning)
+ * @length: length of the section to be erased (expressed in 4 KB)
+ * @last_command: tells if this is the last command in a series
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Erase the NVM sector using the admin queue commands.  No buffer is
+ * attached to the command.
+ *
+ * Returns I40E_ERR_PARAM, without issuing a command, when any of the top
+ * eight bits of @offset is set; otherwise returns the status of the
+ * AdminQ send.
+ **/
+enum i40e_status_code i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, bool last_command,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_nvm_update *nvm;
+	struct i40e_aq_desc desc;
+
+	DEBUGFUNC("i40e_aq_erase_nvm");
+
+	/* Only the low three bytes of the offset may be non-zero. */
+	if ((offset & 0xFF000000) != 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_erase);
+
+	nvm = (struct i40e_aqc_nvm_update *)&desc.params.raw;
+	nvm->module_pointer = module_pointer;
+	nvm->offset = CPU_TO_LE32(offset);
+	nvm->length = CPU_TO_LE16(length);
+
+	/* Flag the final command of a series. */
+	if (last_command)
+		nvm->command_flags |= I40E_AQ_NVM_LAST_CMD;
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+#define I40E_DEV_FUNC_CAP_SWITCH_MODE	0x01	/* capability record ids (cap->id) */
+#define I40E_DEV_FUNC_CAP_MGMT_MODE	0x02
+#define I40E_DEV_FUNC_CAP_NPAR		0x03
+#define I40E_DEV_FUNC_CAP_OS2BMC	0x04
+#define I40E_DEV_FUNC_CAP_VALID_FUNC	0x05
+#define I40E_DEV_FUNC_CAP_SRIOV_1_1	0x12
+#define I40E_DEV_FUNC_CAP_VF		0x13
+#define I40E_DEV_FUNC_CAP_VMDQ		0x14
+#define I40E_DEV_FUNC_CAP_802_1_QBG	0x15
+#define I40E_DEV_FUNC_CAP_802_1_QBH	0x16
+#define I40E_DEV_FUNC_CAP_VSI		0x17
+#define I40E_DEV_FUNC_CAP_DCB		0x18
+#define I40E_DEV_FUNC_CAP_FCOE		0x21
+#define I40E_DEV_FUNC_CAP_ISCSI		0x22
+#define I40E_DEV_FUNC_CAP_RSS		0x40
+#define I40E_DEV_FUNC_CAP_RX_QUEUES	0x41
+#define I40E_DEV_FUNC_CAP_TX_QUEUES	0x42
+#define I40E_DEV_FUNC_CAP_MSIX		0x43
+#define I40E_DEV_FUNC_CAP_MSIX_VF	0x44
+#define I40E_DEV_FUNC_CAP_FLOW_DIRECTOR	0x45
+#define I40E_DEV_FUNC_CAP_IEEE_1588	0x46
+#define I40E_DEV_FUNC_CAP_FLEX10	0xF1	/* 0xF1/0xF2 sit out of numeric order */
+#define I40E_DEV_FUNC_CAP_CEM		0xF2
+#define I40E_DEV_FUNC_CAP_IWARP		0x51
+#define I40E_DEV_FUNC_CAP_LED		0x61
+#define I40E_DEV_FUNC_CAP_SDP		0x62
+#define I40E_DEV_FUNC_CAP_MDIO		0x63
+#define I40E_DEV_FUNC_CAP_WR_CSR_PROT	0x64
+
+/**
+ * i40e_parse_discover_capabilities
+ * @hw: pointer to the hw struct
+ * @buff: pointer to a buffer containing device/function capability records
+ * @cap_count: number of capability records in the list
+ * @list_type_opc: type of capabilities list to parse
+ *
+ * Parse the device/function capabilities list.
+ **/
+static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
+				     u32 cap_count,
+				     enum i40e_admin_queue_opc list_type_opc)
+{
+	struct i40e_aqc_list_capabilities_element_resp *cap;
+	u32 valid_functions, num_functions;
+	u32 number, logical_id, phys_id;
+	struct i40e_hw_capabilities *p;
+	u8 major_rev;
+	u32 i = 0;
+	u16 id;
+
+	cap = (struct i40e_aqc_list_capabilities_element_resp *) buff;
+
+	if (list_type_opc == i40e_aqc_opc_list_dev_capabilities)
+		p = (struct i40e_hw_capabilities *)&hw->dev_caps;
+	else if (list_type_opc == i40e_aqc_opc_list_func_capabilities)
+		p = (struct i40e_hw_capabilities *)&hw->func_caps;
+	else
+		return;
+
+	for (i = 0; i < cap_count; i++, cap++) {
+		id = LE16_TO_CPU(cap->id);
+		number = LE32_TO_CPU(cap->number);
+		logical_id = LE32_TO_CPU(cap->logical_id);
+		phys_id = LE32_TO_CPU(cap->phys_id);
+		major_rev = cap->major_rev;
+
+		switch (id) {
+		case I40E_DEV_FUNC_CAP_SWITCH_MODE:
+			p->switch_mode = number;
+			break;
+		case I40E_DEV_FUNC_CAP_MGMT_MODE:
+			p->management_mode = number;
+			break;
+		case I40E_DEV_FUNC_CAP_NPAR:
+			p->npar_enable = number;
+			break;
+		case I40E_DEV_FUNC_CAP_OS2BMC:
+			p->os2bmc = number;
+			break;
+		case I40E_DEV_FUNC_CAP_VALID_FUNC:
+			p->valid_functions = number;
+			break;
+		case I40E_DEV_FUNC_CAP_SRIOV_1_1:
+			if (number == 1)
+				p->sr_iov_1_1 = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_VF:
+			p->num_vfs = number;
+			p->vf_base_id = logical_id;
+			break;
+		case I40E_DEV_FUNC_CAP_VMDQ:
+			if (number == 1)
+				p->vmdq = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_802_1_QBG:
+			if (number == 1)
+				p->evb_802_1_qbg = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_802_1_QBH:
+			if (number == 1)
+				p->evb_802_1_qbh = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_VSI:
+			p->num_vsis = number;
+			break;
+		case I40E_DEV_FUNC_CAP_DCB:
+			if (number == 1) {
+				p->dcb = TRUE;
+				p->enabled_tcmap = logical_id;
+				p->maxtc = phys_id;
+			}
+			break;
+		case I40E_DEV_FUNC_CAP_FCOE:
+			if (number == 1)
+				p->fcoe = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_ISCSI:
+			if (number == 1)
+				p->iscsi = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_RSS:
+			p->rss = TRUE;
+			p->rss_table_size = number;
+			p->rss_table_entry_width = logical_id;
+			break;
+		case I40E_DEV_FUNC_CAP_RX_QUEUES:
+			p->num_rx_qp = number;
+			p->base_queue = phys_id;
+			break;
+		case I40E_DEV_FUNC_CAP_TX_QUEUES:
+			p->num_tx_qp = number;
+			p->base_queue = phys_id;
+			break;
+		case I40E_DEV_FUNC_CAP_MSIX:
+			p->num_msix_vectors = number;
+			break;
+		case I40E_DEV_FUNC_CAP_MSIX_VF:
+			p->num_msix_vectors_vf = number;
+			break;
+		case I40E_DEV_FUNC_CAP_FLEX10:
+			if (major_rev == 1) {
+				if (number == 1) {
+					p->flex10_enable = TRUE;
+					p->flex10_capable = TRUE;
+				}
+			} else {
+				/* Capability revision >= 2 */
+				if (number & 1)
+					p->flex10_enable = TRUE;
+				if (number & 2)
+					p->flex10_capable = TRUE;
+			}
+			p->flex10_mode = logical_id;
+			p->flex10_status = phys_id;
+			break;
+		case I40E_DEV_FUNC_CAP_CEM:
+			if (number == 1)
+				p->mgmt_cem = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_IWARP:
+			if (number == 1)
+				p->iwarp = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_LED:
+			if (phys_id < I40E_HW_CAP_MAX_GPIO)
+				p->led[phys_id] = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_SDP:
+			if (phys_id < I40E_HW_CAP_MAX_GPIO)
+				p->sdp[phys_id] = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_MDIO:
+			if (number == 1) {
+				p->mdio_port_num = phys_id;
+				p->mdio_port_mode = logical_id;
+			}
+			break;
+		case I40E_DEV_FUNC_CAP_IEEE_1588:
+			if (number == 1)
+				p->ieee_1588 = TRUE;
+			break;
+		case I40E_DEV_FUNC_CAP_FLOW_DIRECTOR:
+			p->fd = TRUE;
+			p->fd_filters_guaranteed = number;
+			p->fd_filters_best_effort = logical_id;
+			break;
+		case I40E_DEV_FUNC_CAP_WR_CSR_PROT:
+			p->wr_csr_prot = (u64)number;
+			p->wr_csr_prot |= (u64)logical_id << 32;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (p->fcoe)
+		i40e_debug(hw, I40E_DEBUG_ALL, "device is FCoE capable\n");
+
+	/* Always disable FCoE if compiled without the I40E_FCOE_ENA flag */
+	p->fcoe = FALSE;
+
+	/* count the enabled ports (aka the "not disabled" ports) */
+	hw->num_ports = 0;
+	for (i = 0; i < 4; i++) {
+		u32 port_cfg_reg = I40E_PRTGEN_CNF + (4 * i);
+		u64 port_cfg = 0;
+
+		/* use AQ read to get the physical register offset instead
+		 * of the port relative offset
+		 */
+		i40e_aq_debug_read_register(hw, port_cfg_reg, &port_cfg, NULL);
+		if (!(port_cfg & I40E_PRTGEN_CNF_PORT_DIS_MASK))
+			hw->num_ports++;
+	}
+
+	valid_functions = p->valid_functions;
+	num_functions = 0;
+	while (valid_functions) {
+		if (valid_functions & 1)
+			num_functions++;
+		valid_functions >>= 1;
+	}
+
+	/* partition id is 1-based, and functions are evenly spread
+	 * across the ports as partitions
+	 */
+	hw->partition_id = (hw->pf_id / hw->num_ports) + 1;
+	hw->num_partitions = num_functions / hw->num_ports;
+
+	/* additional HW specific goodies that might
+	 * someday be HW version specific
+	 */
+	p->rx_buf_chain_len = I40E_MAX_CHAINED_RX_BUFFERS;
+}
+
+/**
+ * i40e_aq_discover_capabilities - discover function or device capabilities
+ * @hw: pointer to the hw struct
+ * @buff: a virtual buffer to hold the capabilities
+ * @buff_size: Size of the virtual buffer
+ * @data_size: Size of the returned data, or buff size needed if AQ err==ENOMEM
+ * @list_type_opc: list_func_capabilities or list_dev_capabilities opcode only
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get the device capabilities descriptions from the firmware and parse them.
+ **/
+enum i40e_status_code i40e_aq_discover_capabilities(struct i40e_hw *hw,
+				void *buff, u16 buff_size, u16 *data_size,
+				enum i40e_admin_queue_opc list_type_opc,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_list_capabilites *cmd;
+	struct i40e_aq_desc desc;
+	enum i40e_status_code status = I40E_SUCCESS;
+
+	cmd = (struct i40e_aqc_list_capabilites *)&desc.params.raw;
+
+	if (list_type_opc != i40e_aqc_opc_list_func_capabilities &&
+		list_type_opc != i40e_aqc_opc_list_dev_capabilities) {
+		status = I40E_ERR_PARAM;
+		goto exit;
+	}
+
+	i40e_fill_default_direct_cmd_desc(&desc, list_type_opc);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	*data_size = LE16_TO_CPU(desc.datalen);
+
+	if (status)
+		goto exit;
+
+	i40e_parse_discover_capabilities(hw, buff, LE32_TO_CPU(cmd->count),
+					 list_type_opc);
+
+exit:
+	return status;
+}
+
+/**
+ * i40e_aq_update_nvm - write a section of the NVM
+ * @hw: pointer to the hw struct
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: byte offset from the module beginning; top byte must be zero
+ * @length: length of the section to be written (in bytes from the offset)
+ * @data: command buffer (size [bytes] = length)
+ * @last_command: tells if this is the last command in a series
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Update the NVM using the admin queue commands
+ **/
+enum i40e_status_code i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_nvm_update *cmd =
+		(struct i40e_aqc_nvm_update *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	DEBUGFUNC("i40e_aq_update_nvm");
+
+	/* The highest byte of offset must be zero; reject it otherwise. */
+	if (offset & 0xFF000000) {
+		status = I40E_ERR_PARAM;
+		goto i40e_aq_update_nvm_exit;
+	}
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update);
+
+	/* If this is the last command in a series, set the proper flag. */
+	if (last_command)
+		cmd->command_flags |= I40E_AQ_NVM_LAST_CMD;
+	cmd->module_pointer = module_pointer;
+	cmd->offset = CPU_TO_LE32(offset);
+	cmd->length = CPU_TO_LE16(length);
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (length > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	status = i40e_asq_send_command(hw, &desc, data, length, cmd_details);
+
+i40e_aq_update_nvm_exit:
+	return status;
+}
+
+/**
+ * i40e_aq_get_lldp_mib - read the LLDP MIB
+ * @hw: pointer to the hw struct
+ * @bridge_type: type of bridge requested
+ * @mib_type: Local, Remote or both Local and Remote MIBs
+ * @buff: pointer to a user supplied buffer to store the MIB block
+ * @buff_size: size of the buffer (in bytes); must be non-zero
+ * @local_len: length of the returned Local LLDP MIB (may be NULL)
+ * @remote_len: length of the returned Remote LLDP MIB (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Requests the complete LLDP MIB (entire packet).
+ **/
+enum i40e_status_code i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
+				u8 mib_type, void *buff, u16 buff_size,
+				u16 *local_len, u16 *remote_len,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_get_mib *cmd =
+		(struct i40e_aqc_lldp_get_mib *)&desc.params.raw;
+	struct i40e_aqc_lldp_get_mib *resp =
+		(struct i40e_aqc_lldp_get_mib *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_get_mib);
+	/* Indirect Command; the BUF flag is OR-ed in again further down */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+
+	cmd->type = mib_type & I40E_AQ_LLDP_MIB_TYPE_MASK;
+	cmd->type |= ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) &
+		       I40E_AQ_LLDP_BRIDGE_TYPE_MASK);
+
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	if (!status) {
+		if (local_len != NULL)
+			*local_len = LE16_TO_CPU(resp->local_len);
+		if (remote_len != NULL)
+			*remote_len = LE16_TO_CPU(resp->remote_len);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_lldp_mib - Set the LLDP MIB
+ * @hw: pointer to the hw struct
+ * @mib_type: Local, Remote or both Local and Remote MIBs
+ * @buff: pointer to a user supplied buffer holding the MIB block; not NULL
+ * @buff_size: size of the buffer (in bytes); must be non-zero
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set the LLDP MIB; address_high/address_low carry the 32-bit address halves.
+ **/
+enum i40e_status_code i40e_aq_set_lldp_mib(struct i40e_hw *hw,
+				u8 mib_type, void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_set_local_mib *cmd =
+		(struct i40e_aqc_lldp_set_local_mib *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_lldp_set_local_mib);
+	/* Indirect Command */
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	cmd->type = mib_type;
+	cmd->length = CPU_TO_LE16(buff_size);
+	cmd->address_high = CPU_TO_LE32(I40E_HI_DWORD((uintptr_t)buff));
+	cmd->address_low =  CPU_TO_LE32(I40E_LO_DWORD((uintptr_t)buff));
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	return status;
+}
+
+/**
+ * i40e_aq_cfg_lldp_mib_change_event - configure LLDP MIB change events
+ * @hw: pointer to the hw struct
+ * @enable_update: TRUE to enable event posting, FALSE to disable it
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Enable or Disable posting of an event on ARQ when LLDP MIB
+ * associated with the interface changes
+ **/
+enum i40e_status_code i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
+				bool enable_update,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_update_mib *cmd =
+		(struct i40e_aqc_lldp_update_mib *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_mib);
+
+	if (!enable_update)
+		cmd->command |= I40E_AQ_LLDP_MIB_UPDATE_DISABLE;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_lldp_tlv - add a TLV to the LLDP Local MIB
+ * @hw: pointer to the hw struct
+ * @bridge_type: type of bridge
+ * @buff: buffer with TLV to add; must not be NULL
+ * @buff_size: length of the buffer; must be non-zero
+ * @tlv_len: length of the TLV to be added; must be non-zero
+ * @mib_len: length of the LLDP MIB returned in response (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Add the specified TLV to LLDP Local MIB for the given bridge type,
+ * it is responsibility of the caller to make sure that the TLV is not
+ * already present in the LLDPDU.
+ * In return firmware will write the complete LLDP MIB with the newly
+ * added TLV in the response buffer.
+ **/
+enum i40e_status_code i40e_aq_add_lldp_tlv(struct i40e_hw *hw, u8 bridge_type,
+				void *buff, u16 buff_size, u16 tlv_len,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_add_tlv *cmd =
+		(struct i40e_aqc_lldp_add_tlv *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff || tlv_len == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_add_tlv);
+
+	/* Indirect Command */
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) &
+		      I40E_AQ_LLDP_BRIDGE_TYPE_MASK);
+	cmd->len = CPU_TO_LE16(tlv_len);
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	if (!status) {
+		if (mib_len != NULL)
+			*mib_len = LE16_TO_CPU(desc.datalen);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_update_lldp_tlv - update a TLV in the LLDP Local MIB
+ * @hw: pointer to the hw struct
+ * @bridge_type: type of bridge
+ * @buff: buffer with TLV to update; must not be NULL
+ * @buff_size: size of the buffer holding original and updated TLVs
+ * @old_len: Length of the Original TLV; must be non-zero
+ * @new_len: Length of the Updated TLV; must be non-zero
+ * @offset: offset of the updated TLV in the buff; must be non-zero
+ * @mib_len: length of the returned LLDP MIB (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Update the specified TLV to the LLDP Local MIB for the given bridge type.
+ * Firmware will place the complete LLDP MIB in response buffer with the
+ * updated TLV.
+ **/
+enum i40e_status_code i40e_aq_update_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 old_len, u16 new_len, u16 offset,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_update_tlv *cmd =
+		(struct i40e_aqc_lldp_update_tlv *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff || offset == 0 ||
+	    old_len == 0 || new_len == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_tlv);
+
+	/* Indirect Command */
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) &
+		      I40E_AQ_LLDP_BRIDGE_TYPE_MASK);
+	cmd->old_len = CPU_TO_LE16(old_len);
+	cmd->new_offset = CPU_TO_LE16(offset);
+	cmd->new_len = CPU_TO_LE16(new_len);
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	if (!status) {
+		if (mib_len != NULL)
+			*mib_len = LE16_TO_CPU(desc.datalen);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_delete_lldp_tlv - delete a TLV from the LLDP Local MIB
+ * @hw: pointer to the hw struct
+ * @bridge_type: type of bridge
+ * @buff: pointer to a user supplied buffer that has the TLV
+ * @buff_size: length of the buffer; must be non-zero
+ * @tlv_len: length of the TLV to be deleted
+ * @mib_len: length of the returned LLDP MIB (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Delete the specified TLV from LLDP Local MIB for the given bridge type.
+ * The firmware places the entire LLDP MIB in the response buffer.
+ **/
+enum i40e_status_code i40e_aq_delete_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 tlv_len, u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_add_tlv *cmd =
+		(struct i40e_aqc_lldp_add_tlv *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_delete_tlv);
+
+	/* Indirect Command */
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(buff_size);
+	cmd->len = CPU_TO_LE16(tlv_len);
+	cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) &
+		      I40E_AQ_LLDP_BRIDGE_TYPE_MASK);
+
+	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	if (!status) {
+		if (mib_len != NULL)
+			*mib_len = LE16_TO_CPU(desc.datalen);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_stop_lldp - stop or shut down the embedded LLDP Agent
+ * @hw: pointer to the hw struct
+ * @shutdown_agent: TRUE sets the agent shutdown bit in the stop command
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Stop or Shutdown the embedded LLDP Agent
+ **/
+enum i40e_status_code i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_stop *cmd =
+		(struct i40e_aqc_lldp_stop *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_stop);
+
+	if (shutdown_agent)
+		cmd->command |= I40E_AQ_LLDP_AGENT_SHUTDOWN;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_start_lldp - start the embedded LLDP Agent
+ * @hw: pointer to the hw struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Start the embedded LLDP Agent on all ports.
+ **/
+enum i40e_status_code i40e_aq_start_lldp(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_start *cmd =
+		(struct i40e_aqc_lldp_start *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_start);
+
+	cmd->command = I40E_AQ_LLDP_AGENT_START;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_cee_dcb_config - read the CEE DCBX operational configuration
+ * @hw: pointer to the hw struct
+ * @buff: response buffer that stores CEE operational configuration
+ * @buff_size: size of the buffer passed; must be non-zero
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get CEE DCBX mode operational configuration from firmware
+ **/
+enum i40e_status_code i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	enum i40e_status_code status;
+
+	if (buff_size == 0 || !buff)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_cee_dcb_cfg);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	status = i40e_asq_send_command(hw, &desc, (void *)buff, buff_size,
+				       cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_start_stop_dcbx - Start/Stop DCBx service in FW
+ * @hw: pointer to the hw struct
+ * @start_agent: True if DCBx Agent needs to be Started
+ *				False if DCBx Agent needs to be Stopped
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Start/Stop the embedded DCBx Agent
+ **/
+enum i40e_status_code i40e_aq_start_stop_dcbx(struct i40e_hw *hw,
+				bool start_agent,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_lldp_stop_start_specific_agent *cmd =
+		(struct i40e_aqc_lldp_stop_start_specific_agent *)
+				&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_lldp_stop_start_spec_agent);
+
+	if (start_agent)
+		cmd->command = I40E_AQC_START_SPECIFIC_AGENT_MASK;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_udp_tunnel
+ * @hw: pointer to the hw struct
+ * @udp_port: the UDP port to add
+ * @protocol_index: protocol index type
+ * @filter_index: pointer to filter index; if non-NULL it receives the
+ *                index from the command completion on success
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
+				u16 udp_port, u8 protocol_index,
+				u8 *filter_index,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_udp_tunnel *cmd =
+		(struct i40e_aqc_add_udp_tunnel *)&desc.params.raw;
+	struct i40e_aqc_del_udp_tunnel_completion *resp =
+		(struct i40e_aqc_del_udp_tunnel_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_udp_tunnel);
+
+	cmd->udp_port = CPU_TO_LE16(udp_port);
+	cmd->protocol_type = protocol_index;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status && filter_index)
+		*filter_index = resp->index;
+
+	return status;
+}
+
+/**
+ * i40e_aq_del_udp_tunnel
+ * @hw: pointer to the hw struct
+ * @index: filter index, sent as the index field of the delete command
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_remove_udp_tunnel *cmd =
+		(struct i40e_aqc_remove_udp_tunnel *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_del_udp_tunnel);
+
+	cmd->index = index;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_switch_resource_alloc (0x0204)
+ * @hw: pointer to the hw struct
+ * @num_entries: pointer to u8 to store the number of resource entries returned
+ * @buf: pointer to a user supplied buffer.  This buffer must be large enough
+ *        to store the resource information for all resource types.  Each
+ *        resource type is a i40e_aqc_switch_resource_alloc_data structure.
+ * @count: number of elements in buf (the byte length is count * sizeof(*buf))
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Query the resources allocated to a function.
+ **/
+enum i40e_status_code i40e_aq_get_switch_resource_alloc(struct i40e_hw *hw,
+			u8 *num_entries,
+			struct i40e_aqc_switch_resource_alloc_element_resp *buf,
+			u16 count,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_get_switch_resource_alloc *cmd_resp =
+		(struct i40e_aqc_get_switch_resource_alloc *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 length = count * sizeof(*buf);
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_get_switch_resource_alloc);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (length > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	status = i40e_asq_send_command(hw, &desc, buf, length, cmd_details);
+
+	if (!status && num_entries)
+		*num_entries = cmd_resp->num_entries;
+
+	return status;
+}
+
+/**
+ * i40e_aq_delete_element - Delete switch element
+ * @hw: pointer to the hw struct
+ * @seid: the SEID to delete from the switch; must be non-zero
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This deletes a switch element from the switch.
+ **/
+enum i40e_status_code i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_switch_seid *cmd =
+		(struct i40e_aqc_switch_seid *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_delete_element);
+
+	cmd->seid = CPU_TO_LE16(seid);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_pvirt - Instantiate a Port Virtualizer on a port
+ * @hw: pointer to the hw struct
+ * @flags: component flags
+ * @mac_seid: uplink seid (MAC SEID)
+ * @vsi_seid: connected vsi seid; must be non-zero
+ * @ret_seid: seid of the created pv component (may be NULL)
+ *
+ * This instantiates an i40e port virtualizer with specified flags.
+ * Depending on specified flags the port virtualizer can act as a
+ * 802.1Qbr port virtualizer or a 802.1Qbg S-component.
+ **/
+enum i40e_status_code i40e_aq_add_pvirt(struct i40e_hw *hw, u16 flags,
+				       u16 mac_seid, u16 vsi_seid,
+				       u16 *ret_seid)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_update_pv *cmd =
+		(struct i40e_aqc_add_update_pv *)&desc.params.raw;
+	struct i40e_aqc_add_update_pv_completion *resp =
+		(struct i40e_aqc_add_update_pv_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (vsi_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_pv);
+	cmd->command_flags = CPU_TO_LE16(flags);
+	cmd->uplink_seid = CPU_TO_LE16(mac_seid);
+	cmd->connected_seid = CPU_TO_LE16(vsi_seid);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+	if (!status && ret_seid)
+		*ret_seid = LE16_TO_CPU(resp->pv_seid);
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_tag - Add an S/E-tag
+ * @hw: pointer to the hw struct
+ * @direct_to_queue: should s-tag direct flow to a specific queue
+ * @vsi_seid: VSI SEID to use this tag; must be non-zero
+ * @tag: value of the tag
+ * @queue_num: queue number, only valid if direct_to_queue is TRUE
+ * @tags_used: return value, number of tags in use by this PF (may be NULL)
+ * @tags_free: return value, number of unallocated tags (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This associates an S- or E-tag to a VSI in the switch complex.  It returns
+ * the number of tags allocated by the PF, and the number of unallocated
+ * tags available.
+ **/
+enum i40e_status_code i40e_aq_add_tag(struct i40e_hw *hw, bool direct_to_queue,
+				u16 vsi_seid, u16 tag, u16 queue_num,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_tag *cmd =
+		(struct i40e_aqc_add_tag *)&desc.params.raw;
+	struct i40e_aqc_add_remove_tag_completion *resp =
+		(struct i40e_aqc_add_remove_tag_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (vsi_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_tag);
+
+	cmd->seid = CPU_TO_LE16(vsi_seid);
+	cmd->tag = CPU_TO_LE16(tag);
+	if (direct_to_queue) {
+		cmd->flags = CPU_TO_LE16(I40E_AQC_ADD_TAG_FLAG_TO_QUEUE);
+		cmd->queue_number = CPU_TO_LE16(queue_num);
+	}
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		if (tags_used != NULL)
+			*tags_used = LE16_TO_CPU(resp->tags_used);
+		if (tags_free != NULL)
+			*tags_free = LE16_TO_CPU(resp->tags_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_remove_tag - Remove an S- or E-tag
+ * @hw: pointer to the hw struct
+ * @vsi_seid: VSI SEID this tag is associated with; must be non-zero
+ * @tag: value of the S-tag to delete
+ * @tags_used: return value, number of tags in use by this PF (may be NULL)
+ * @tags_free: return value, number of unallocated tags (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This deletes an S- or E-tag from a VSI in the switch complex.  It returns
+ * the number of tags allocated by the PF, and the number of unallocated
+ * tags available.
+ **/
+enum i40e_status_code i40e_aq_remove_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 tag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_remove_tag *cmd =
+		(struct i40e_aqc_remove_tag *)&desc.params.raw;
+	struct i40e_aqc_add_remove_tag_completion *resp =
+		(struct i40e_aqc_add_remove_tag_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (vsi_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_tag);
+
+	cmd->seid = CPU_TO_LE16(vsi_seid);
+	cmd->tag = CPU_TO_LE16(tag);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		if (tags_used != NULL)
+			*tags_used = LE16_TO_CPU(resp->tags_used);
+		if (tags_free != NULL)
+			*tags_free = LE16_TO_CPU(resp->tags_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_mcast_etag - Add a multicast E-tag
+ * @hw: pointer to the hw struct
+ * @pv_seid: Port Virtualizer of this SEID to associate E-tag with (non-zero)
+ * @etag: value of E-tag to add
+ * @num_tags_in_buf: number of unicast E-tags in indirect buffer (non-zero)
+ * @buf: address of indirect buffer; must not be NULL
+ * @tags_used: return value, number of E-tags in use by this port
+ * @tags_free: return value, number of unallocated M-tags
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This associates a multicast E-tag to a port virtualizer.  It will return
+ * the number of tags allocated by the PF, and the number of unallocated
+ * tags available.
+ *
+ * The indirect buffer pointed to by buf is a list of 2-byte E-tags,
+ * num_tags_in_buf long.
+ **/
+enum i40e_status_code i40e_aq_add_mcast_etag(struct i40e_hw *hw, u16 pv_seid,
+				u16 etag, u8 num_tags_in_buf, void *buf,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_remove_mcast_etag *cmd =
+		(struct i40e_aqc_add_remove_mcast_etag *)&desc.params.raw;
+	struct i40e_aqc_add_remove_mcast_etag_completion *resp =
+	   (struct i40e_aqc_add_remove_mcast_etag_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 length = sizeof(u16) * num_tags_in_buf;
+
+	if ((pv_seid == 0) || (buf == NULL) || (num_tags_in_buf == 0))
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_add_multicast_etag);
+
+	cmd->pv_seid = CPU_TO_LE16(pv_seid);
+	cmd->etag = CPU_TO_LE16(etag);
+	cmd->num_unicast_etags = num_tags_in_buf;
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	if (length > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	status = i40e_asq_send_command(hw, &desc, buf, length, cmd_details);
+
+	if (!status) {
+		if (tags_used != NULL)
+			*tags_used = LE16_TO_CPU(resp->mcast_etags_used);
+		if (tags_free != NULL)
+			*tags_free = LE16_TO_CPU(resp->mcast_etags_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_remove_mcast_etag - Remove a multicast E-tag
+ * @hw: pointer to the hw struct
+ * @pv_seid: Port Virtualizer SEID this M-tag is associated with (non-zero)
+ * @etag: value of the E-tag to remove
+ * @tags_used: return value, number of tags in use by this port (may be NULL)
+ * @tags_free: return value, number of unallocated tags (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This deletes an E-tag from the port virtualizer.  It will return
+ * the number of tags allocated by the port, and the number of unallocated
+ * tags available.
+ **/
+enum i40e_status_code i40e_aq_remove_mcast_etag(struct i40e_hw *hw, u16 pv_seid,
+				u16 etag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_remove_mcast_etag *cmd =
+		(struct i40e_aqc_add_remove_mcast_etag *)&desc.params.raw;
+	struct i40e_aqc_add_remove_mcast_etag_completion *resp =
+	   (struct i40e_aqc_add_remove_mcast_etag_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+
+	if (pv_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_remove_multicast_etag);
+
+	cmd->pv_seid = CPU_TO_LE16(pv_seid);
+	cmd->etag = CPU_TO_LE16(etag);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		if (tags_used != NULL)
+			*tags_used = LE16_TO_CPU(resp->mcast_etags_used);
+		if (tags_free != NULL)
+			*tags_free = LE16_TO_CPU(resp->mcast_etags_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_update_tag - Update an S/E-tag
+ * @hw: pointer to the hw struct
+ * @vsi_seid: VSI SEID using this S-tag; must be non-zero
+ * @old_tag: old tag value
+ * @new_tag: new tag value
+ * @tags_used: return value, number of tags in use by this PF (may be NULL)
+ * @tags_free: return value, number of unallocated tags (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This updates the value of the tag currently attached to this VSI
+ * in the switch complex.  It will return the number of tags allocated
+ * by the PF, and the number of unallocated tags available.
+ **/
+enum i40e_status_code i40e_aq_update_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 old_tag, u16 new_tag, u16 *tags_used,
+				u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_update_tag *cmd =
+		(struct i40e_aqc_update_tag *)&desc.params.raw;
+	struct i40e_aqc_update_tag_completion *resp =
+		(struct i40e_aqc_update_tag_completion *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (vsi_seid == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_update_tag);
+
+	cmd->seid = CPU_TO_LE16(vsi_seid);
+	cmd->old_tag = CPU_TO_LE16(old_tag);
+	cmd->new_tag = CPU_TO_LE16(new_tag);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		if (tags_used != NULL)
+			*tags_used = LE16_TO_CPU(resp->tags_used);
+		if (tags_free != NULL)
+			*tags_free = LE16_TO_CPU(resp->tags_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_dcb_ignore_pfc - Ignore PFC for given TCs
+ * @hw: pointer to the hw struct
+ * @tcmap: TC map for request/release any ignore PFC condition
+ * @request: TRUE to request, FALSE to release the ignore PFC condition
+ * @tcmap_ret: return TCs for which PFC is currently ignored (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This sends out request/release to ignore PFC condition for a TC.
+ * It will return the TCs for which PFC is currently ignored.
+ **/
+enum i40e_status_code i40e_aq_dcb_ignore_pfc(struct i40e_hw *hw, u8 tcmap,
+				bool request, u8 *tcmap_ret,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_pfc_ignore *cmd_resp =
+		(struct i40e_aqc_pfc_ignore *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_dcb_ignore_pfc);
+
+	if (request)
+		cmd_resp->command_flags = I40E_AQC_PFC_IGNORE_SET;
+
+	cmd_resp->tc_bitmap = tcmap;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (!status) {
+		if (tcmap_ret != NULL)
+			*tcmap_ret = cmd_resp->tc_bitmap;
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_dcb_updated - DCB Updated Command
+ * @hw: pointer to the hw struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * When LLDP is handled in PF this command is used by the PF
+ * to notify EMP that a DCB setting is modified.
+ * When LLDP is handled in EMP this command is used by the PF
+ * to notify EMP whenever one of the following parameters gets
+ * modified:
+ *   - PFCLinkDelayAllowance in PRTDCB_GENC.PFCLDA
+ *   - PCIRTT in PRTDCB_GENC.PCIRTT
+ *   - Maximum Frame Size for non-FCoE TCs set by PRTDCB_TDPUC.MAX_TXFRAME.
+ * EMP will return when the shared RPB settings have been
+ * recomputed and modified. The retval field in the descriptor
+ * will be set to 0 when RPB is modified.
+ **/
+enum i40e_status_code i40e_aq_dcb_updated(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc dcb_desc;
+
+	/*
+	 * Nothing is set on the descriptor after the default fill; it is
+	 * sent without a buffer and the send status is the result.
+	 */
+	i40e_fill_default_direct_cmd_desc(&dcb_desc, i40e_aqc_opc_dcb_updated);
+	return i40e_asq_send_command(hw, &dcb_desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_add_statistics - Add a statistics block to a VLAN in a switch.
+ * @hw: pointer to the hw struct
+ * @seid: defines the SEID of the switch for which the stats are requested
+ * @vlan_id: the VLAN ID for which the statistics are requested
+ * @stat_index: index of the statistics counters block assigned to this VLAN
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * XL710 supports 128 smonVlanStats counters. This command is used to
+ * allocate a set of smonVlanStats counters to a specific VLAN in a specific
+ * switch.
+ **/
+enum i40e_status_code i40e_aq_add_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 *stat_index,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_remove_statistics *stats_cmd;
+	enum i40e_status_code ret_code;
+	struct i40e_aq_desc desc;
+
+	if (!seid || !stat_index)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_statistics);
+
+	stats_cmd = (struct i40e_aqc_add_remove_statistics *)&desc.params.raw;
+	stats_cmd->seid = CPU_TO_LE16(seid);
+	stats_cmd->vlan = CPU_TO_LE16(vlan_id);
+
+	ret_code = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	/* stat_index was checked above, so only the status gates the copy */
+	if (!ret_code)
+		*stat_index = LE16_TO_CPU(stats_cmd->stat_index);
+
+	return ret_code;
+}
+
+/**
+ * i40e_aq_remove_statistics - Remove statistics block from a VLAN in a switch.
+ * @hw: pointer to the hw struct
+ * @seid: defines the SEID of the switch for which the stats are requested
+ * @vlan_id: the VLAN ID for which the statistics are requested
+ * @stat_index: index of the statistics counters block assigned to this VLAN
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * XL710 supports 128 smonVlanStats counters. This command is used to
+ * deallocate a set of smonVlanStats counters from a specific VLAN in a
+ * specific switch.
+ **/
+enum i40e_status_code i40e_aq_remove_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 stat_index,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_remove_statistics *stats_cmd;
+	struct i40e_aq_desc desc;
+
+	/* A zero SEID is refused before any command is sent */
+	if (!seid)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_remove_statistics);
+
+	/* The request carries the SEID, the VLAN and the block index */
+	stats_cmd = (struct i40e_aqc_add_remove_statistics *)&desc.params.raw;
+	stats_cmd->stat_index = CPU_TO_LE16(stat_index);
+	stats_cmd->vlan = CPU_TO_LE16(vlan_id);
+	stats_cmd->seid = CPU_TO_LE16(seid);
+
+	/* Nothing is read back; the send status is the result */
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_set_port_parameters - set physical port parameters.
+ * @hw: pointer to the hw struct
+ * @bad_frame_vsi: defines the VSI to which bad frames are forwarded
+ * @save_bad_pac: if set packets with errors are forwarded to the bad frames VSI
+ * @pad_short_pac: if set transmit packets smaller than 60 bytes are padded
+ * @double_vlan: if set double VLAN is enabled
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_set_port_parameters(struct i40e_hw *hw,
+				u16 bad_frame_vsi, bool save_bad_pac,
+				bool pad_short_pac, bool double_vlan,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_port_parameters *port_cmd =
+		(struct i40e_aqc_set_port_parameters *)&desc.params.raw;
+	u16 flag_bits;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_port_parameters);
+
+	/* Fold the three boolean options into one flags word */
+	flag_bits = 0;
+	if (double_vlan)
+		flag_bits |= I40E_AQ_SET_P_PARAMS_DOUBLE_VLAN_ENA;
+	if (pad_short_pac)
+		flag_bits |= I40E_AQ_SET_P_PARAMS_PAD_SHORT_PACKETS;
+	if (save_bad_pac)
+		flag_bits |= I40E_AQ_SET_P_PARAMS_SAVE_BAD_PACKETS;
+
+	port_cmd->command_flags = CPU_TO_LE16(flag_bits);
+	port_cmd->bad_frame_vsi = CPU_TO_LE16(bad_frame_vsi);
+
+	/* No buffer accompanies the command; its status is returned */
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_tx_sched_cmd - generic Tx scheduler AQ command handler
+ * @hw: pointer to the hw struct
+ * @seid: seid for the physical port/switching component/vsi
+ * @buff: Indirect buffer to hold data parameters and response
+ * @buff_size: Indirect buffer size
+ * @opcode: Tx scheduler AQ command opcode
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Generic command handler for Tx scheduler AQ commands
+ **/
+static enum i40e_status_code i40e_aq_tx_sched_cmd(struct i40e_hw *hw, u16 seid,
+				void *buff, u16 buff_size,
+				 enum i40e_admin_queue_opc opcode,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_tx_sched_ind *sched_cmd;
+	struct i40e_aq_desc desc;
+	u16 aq_flags = (u16)I40E_AQ_FLAG_BUF;
+
+	/*
+	 * Only the opcodes listed here are accepted.  The first group
+	 * additionally gets the RD flag; the query opcodes do not.
+	 */
+	switch (opcode) {
+	case i40e_aqc_opc_configure_vsi_ets_sla_bw_limit:
+	case i40e_aqc_opc_configure_vsi_tc_bw:
+	case i40e_aqc_opc_enable_switching_comp_ets:
+	case i40e_aqc_opc_modify_switching_comp_ets:
+	case i40e_aqc_opc_disable_switching_comp_ets:
+	case i40e_aqc_opc_configure_switching_comp_ets_bw_limit:
+	case i40e_aqc_opc_configure_switching_comp_bw_config:
+		aq_flags |= (u16)I40E_AQ_FLAG_RD;
+		break;
+	case i40e_aqc_opc_query_vsi_bw_config:
+	case i40e_aqc_opc_query_vsi_ets_sla_config:
+	case i40e_aqc_opc_query_switching_comp_ets_config:
+	case i40e_aqc_opc_query_port_ets_config:
+	case i40e_aqc_opc_query_switching_comp_bw_config:
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	/* Buffers above I40E_AQ_LARGE_BUF need the LB flag as well */
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		aq_flags |= (u16)I40E_AQ_FLAG_LB;
+
+	i40e_fill_default_direct_cmd_desc(&desc, opcode);
+
+	/* Indirect command: flags, buffer length, then the target SEID */
+	desc.flags |= CPU_TO_LE16(aq_flags);
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	sched_cmd = (struct i40e_aqc_tx_sched_ind *)&desc.params.raw;
+	sched_cmd->vsi_seid = CPU_TO_LE16(seid);
+
+	/* The caller's buffer and size are passed straight through */
+	return i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+}
+
+/**
+ * i40e_aq_config_vsi_bw_limit - Configure VSI BW Limit
+ * @hw: pointer to the hw struct
+ * @seid: VSI seid
+ * @credit: BW limit credits (0 = disabled)
+ * @max_credit: Max BW limit credits
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_credit,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_configure_vsi_bw_limit *limit_cmd;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_configure_vsi_bw_limit);
+
+	limit_cmd = (struct i40e_aqc_configure_vsi_bw_limit *)&desc.params.raw;
+
+	/* seid and credit are byte-order converted; max_credit is not */
+	limit_cmd->max_credit = max_credit;
+	limit_cmd->credit = CPU_TO_LE16(credit);
+	limit_cmd->vsi_seid = CPU_TO_LE16(seid);
+
+	/* Direct command, no buffer; hand back the send status */
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_config_switch_comp_bw_limit - Configure Switching component BW Limit
+ * @hw: pointer to the hw struct
+ * @seid: switching component seid
+ * @credit: BW limit credits (0 = disabled)
+ * @max_bw: Max BW limit credits
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_config_switch_comp_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_bw,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_configure_switching_comp_bw_limit *limit_cmd;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_configure_switching_comp_bw_limit);
+
+	limit_cmd = (struct i40e_aqc_configure_switching_comp_bw_limit *)
+		&desc.params.raw;
+
+	/* seid and credit are byte-order converted; max_bw is not */
+	limit_cmd->max_bw = max_bw;
+	limit_cmd->credit = CPU_TO_LE16(credit);
+	limit_cmd->seid = CPU_TO_LE16(seid);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
+ * i40e_aq_config_vsi_ets_sla_bw_limit - Config VSI BW Limit per TC
+ * @hw: pointer to the hw struct
+ * @seid: VSI seid
+ * @bw_data: Buffer holding enabled TCs, per TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_config_vsi_ets_sla_bw_limit(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_ets_sla_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	/* bw_data is handed over whole as the command's buffer */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_configure_vsi_ets_sla_bw_limit, cmd_details);
+}
+
+/**
+ * i40e_aq_config_vsi_tc_bw - Config VSI BW Allocation per TC
+ * @hw: pointer to the hw struct
+ * @seid: VSI seid
+ * @bw_data: Buffer holding enabled TCs, relative TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_tc_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	/* One bw_data structure is the entire buffer for this opcode */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_configure_vsi_tc_bw, cmd_details);
+}
+
+/**
+ * i40e_aq_config_switch_comp_ets_bw_limit - Config Switch comp BW Limit per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer holding enabled TCs, per TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_config_switch_comp_ets_bw_limit(
+	struct i40e_hw *hw, u16 seid,
+	struct i40e_aqc_configure_switching_comp_ets_bw_limit_data *bw_data,
+	struct i40e_asq_cmd_details *cmd_details)
+{
+	/* bw_data is handed over whole as the command's buffer */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_configure_switching_comp_ets_bw_limit, cmd_details);
+}
+
+/**
+ * i40e_aq_query_vsi_bw_config - Query VSI BW configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI
+ * @bw_data: Buffer to hold VSI BW configuration
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_query_vsi_bw_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_bw_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	/* The buffer given to the command is exactly one bw_data struct */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_query_vsi_bw_config, cmd_details);
+}
+
+/**
+ * i40e_aq_query_vsi_ets_sla_config - Query VSI BW configuration per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI
+ * @bw_data: Buffer to hold VSI BW configuration per TC
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	/* The buffer given to the command is exactly one bw_data struct */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_query_vsi_ets_sla_config, cmd_details);
+}
+
+/**
+ * i40e_aq_query_switch_comp_ets_config - Query Switch comp BW config per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer to hold switching component's per TC BW config
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details)
+{
+	/* The buffer given to the command is exactly one bw_data struct */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_query_switching_comp_ets_config, cmd_details);
+}
+
+/**
+ * i40e_aq_query_port_ets_config - Query Physical Port ETS configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI or switching component connected to Physical Port
+ * @bw_data: Buffer to hold current ETS configuration for the Physical Port
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_query_port_ets_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_port_ets_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	/* The buffer given to the command is exactly one bw_data struct */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_query_port_ets_config, cmd_details);
+}
+
+/**
+ * i40e_aq_query_switch_comp_bw_config - Query Switch comp BW configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer to hold switching component's BW configuration
+ * @cmd_details: pointer to command details structure or NULL
+ **/
+enum i40e_status_code i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details)
+{
+	/* The buffer given to the command is exactly one bw_data struct */
+	return i40e_aq_tx_sched_cmd(hw, seid, bw_data, sizeof(*bw_data),
+	    i40e_aqc_opc_query_switching_comp_bw_config, cmd_details);
+}
+
+/**
+ * i40e_validate_filter_settings
+ * @hw: pointer to the hardware structure
+ * @settings: Filter control settings
+ *
+ * Check and validate the filter control settings passed.
+ * The function checks for the valid filter/context sizes being
+ * passed for FCoE and PE.
+ *
+ * Returns I40E_SUCCESS if the values passed are valid and within
+ * range else returns an error.
+ **/
+static enum i40e_status_code i40e_validate_filter_settings(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings)
+{
+	u32 fcoe_cntx_size, fcoe_filt_size;
+	u32 fcoe_fmax;
+	u32 val;
+
+	/* Validate FCoE settings passed */
+	switch (settings->fcoe_filt_num) {
+	case I40E_HASH_FILTER_SIZE_1K:
+	case I40E_HASH_FILTER_SIZE_2K:
+	case I40E_HASH_FILTER_SIZE_4K:
+	case I40E_HASH_FILTER_SIZE_8K:
+	case I40E_HASH_FILTER_SIZE_16K:
+	case I40E_HASH_FILTER_SIZE_32K:
+		/* The setting's numeric value is used as the shift count */
+		fcoe_filt_size = I40E_HASH_FILTER_BASE_SIZE;
+		fcoe_filt_size <<= (u32)settings->fcoe_filt_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	switch (settings->fcoe_cntx_num) {
+	case I40E_DMA_CNTX_SIZE_512:
+	case I40E_DMA_CNTX_SIZE_1K:
+	case I40E_DMA_CNTX_SIZE_2K:
+	case I40E_DMA_CNTX_SIZE_4K:
+		/* Same scheme, starting from the DMA context base size */
+		fcoe_cntx_size = I40E_DMA_CNTX_BASE_SIZE;
+		fcoe_cntx_size <<= (u32)settings->fcoe_cntx_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	/*
+	 * Validate PE settings passed.  Only membership in the accepted
+	 * sets is checked here: no PE size is compared against anything,
+	 * so none is computed.
+	 */
+	switch (settings->pe_filt_num) {
+	case I40E_HASH_FILTER_SIZE_1K:
+	case I40E_HASH_FILTER_SIZE_2K:
+	case I40E_HASH_FILTER_SIZE_4K:
+	case I40E_HASH_FILTER_SIZE_8K:
+	case I40E_HASH_FILTER_SIZE_16K:
+	case I40E_HASH_FILTER_SIZE_32K:
+	case I40E_HASH_FILTER_SIZE_64K:
+	case I40E_HASH_FILTER_SIZE_128K:
+	case I40E_HASH_FILTER_SIZE_256K:
+	case I40E_HASH_FILTER_SIZE_512K:
+	case I40E_HASH_FILTER_SIZE_1M:
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	switch (settings->pe_cntx_num) {
+	case I40E_DMA_CNTX_SIZE_512:
+	case I40E_DMA_CNTX_SIZE_1K:
+	case I40E_DMA_CNTX_SIZE_2K:
+	case I40E_DMA_CNTX_SIZE_4K:
+	case I40E_DMA_CNTX_SIZE_8K:
+	case I40E_DMA_CNTX_SIZE_16K:
+	case I40E_DMA_CNTX_SIZE_32K:
+	case I40E_DMA_CNTX_SIZE_64K:
+	case I40E_DMA_CNTX_SIZE_128K:
+	case I40E_DMA_CNTX_SIZE_256K:
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	/* FCHSIZE + FCDSIZE should not be greater than PMFCOEFMAX */
+	val = rd32(hw, I40E_GLHMC_FCOEFMAX);
+	fcoe_fmax = (val & I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK)
+		     >> I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT;
+	if (fcoe_filt_size + fcoe_cntx_size >  fcoe_fmax)
+		return I40E_ERR_INVALID_SIZE;
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_set_filter_control
+ * @hw: pointer to the hardware structure
+ * @settings: Filter control settings
+ *
+ * Set the Queue Filters for PE/FCoE and enable filters required
+ * for a single PF. It is expected that these settings are programmed
+ * at the driver initialization time.
+ **/
+enum i40e_status_code i40e_set_filter_control(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings)
+{
+	enum i40e_status_code ret_code;
+	u32 lut_bit;
+	u32 reg;
+
+	/* settings is dereferenced below, so it must be present */
+	if (settings == NULL)
+		return I40E_ERR_PARAM;
+
+	/* Reject invalid sizes before PFQF_CTL_0 is read or written */
+	ret_code = i40e_validate_filter_settings(hw, settings);
+	if (ret_code)
+		return ret_code;
+
+	/* Read-modify-write of the PF Queue Filter control register */
+	reg = rd32(hw, I40E_PFQF_CTL_0);
+
+	/* PE hash buckets */
+	reg &= ~I40E_PFQF_CTL_0_PEHSIZE_MASK;
+	reg |= I40E_PFQF_CTL_0_PEHSIZE_MASK &
+		((u32)settings->pe_filt_num << I40E_PFQF_CTL_0_PEHSIZE_SHIFT);
+	/* PE contexts */
+	reg &= ~I40E_PFQF_CTL_0_PEDSIZE_MASK;
+	reg |= I40E_PFQF_CTL_0_PEDSIZE_MASK &
+		((u32)settings->pe_cntx_num << I40E_PFQF_CTL_0_PEDSIZE_SHIFT);
+
+	/* FCoE hash buckets */
+	reg &= ~I40E_PFQF_CTL_0_PFFCHSIZE_MASK;
+	reg |= I40E_PFQF_CTL_0_PFFCHSIZE_MASK &
+		((u32)settings->fcoe_filt_num <<
+			I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT);
+	/* FCoE DDP contexts */
+	reg &= ~I40E_PFQF_CTL_0_PFFCDSIZE_MASK;
+	reg |= I40E_PFQF_CTL_0_PFFCDSIZE_MASK &
+		((u32)settings->fcoe_cntx_num <<
+			I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT);
+
+	/* Hash LUT size field: 1 for I40E_HASH_LUT_SIZE_512, otherwise 0 */
+	lut_bit = (settings->hash_lut_size == I40E_HASH_LUT_SIZE_512) ? 1 : 0;
+	reg &= ~I40E_PFQF_CTL_0_HASHLUTSIZE_MASK;
+	reg |= I40E_PFQF_CTL_0_HASHLUTSIZE_MASK &
+		(lut_bit << I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT);
+
+	/* Enable bits are only ever set here, never cleared */
+	if (settings->enable_macvlan)
+		reg |= I40E_PFQF_CTL_0_MACVLAN_ENA_MASK;
+	if (settings->enable_ethtype)
+		reg |= I40E_PFQF_CTL_0_ETYPE_ENA_MASK;
+	if (settings->enable_fdir)
+		reg |= I40E_PFQF_CTL_0_FD_ENA_MASK;
+
+	wr32(hw, I40E_PFQF_CTL_0, reg);
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_aq_add_rem_control_packet_filter - Add or Remove Control Packet Filter
+ * @hw: pointer to the hw struct
+ * @mac_addr: MAC address to use in the filter
+ * @ethtype: Ethertype to use in the filter
+ * @flags: Flags that needs to be applied to the filter
+ * @vsi_seid: seid of the control VSI
+ * @queue: VSI queue number to send the packet to
+ * @is_add: Add control packet filter if True else remove
+ * @stats: Structure to hold information on control filter counts
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This command will Add or Remove control packet filter for a control VSI.
+ * In return it will update the total number of perfect filter count in
+ * the stats member.
+ **/
+enum i40e_status_code i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
+				u8 *mac_addr, u16 ethtype, u16 flags,
+				u16 vsi_seid, u16 queue, bool is_add,
+				struct i40e_control_filter_stats *stats,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_remove_control_packet_filter_completion *done;
+	struct i40e_aqc_add_remove_control_packet_filter *filter;
+	enum i40e_status_code ret_code;
+	struct i40e_aq_desc desc;
+
+	if (!vsi_seid)
+		return I40E_ERR_PARAM;
+
+	filter = (struct i40e_aqc_add_remove_control_packet_filter *)
+		&desc.params.raw;
+	done = (struct i40e_aqc_add_remove_control_packet_filter_completion *)
+		&desc.params.raw;
+
+	/* The queue field is written for the add opcode only */
+	if (!is_add) {
+		i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_remove_control_packet_filter);
+	} else {
+		i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_add_control_packet_filter);
+		filter->queue = CPU_TO_LE16(queue);
+	}
+
+	if (mac_addr != NULL)
+		i40e_memcpy(filter->mac, mac_addr, I40E_ETH_LENGTH_OF_ADDRESS,
+			    I40E_NONDMA_TO_NONDMA);
+	filter->seid = CPU_TO_LE16(vsi_seid);
+	filter->flags = CPU_TO_LE16(flags);
+	filter->etype = CPU_TO_LE16(ethtype);
+
+	ret_code = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (!ret_code && stats != NULL) {
+		stats->etype_free = LE16_TO_CPU(done->etype_free);
+		stats->mac_etype_free = LE16_TO_CPU(done->mac_etype_free);
+		stats->etype_used = LE16_TO_CPU(done->etype_used);
+		stats->mac_etype_used = LE16_TO_CPU(done->mac_etype_used);
+	}
+
+	return ret_code;
+}
+
+/**
+ * i40e_aq_add_cloud_filters
+ * @hw: pointer to the hardware structure
+ * @seid: VSI seid to add cloud filters to
+ * @filters: Buffer which contains the filters to be added
+ * @filter_count: number of filters contained in the buffer
+ *
+ * Set the cloud filters for a given VSI.  The contents of the
+ * i40e_aqc_add_remove_cloud_filters_element_data are filled
+ * in by the caller of the function.
+ *
+ **/
+enum i40e_status_code i40e_aq_add_cloud_filters(struct i40e_hw *hw,
+	u16 seid,
+	struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+	u8 filter_count)
+{
+	struct i40e_aqc_add_remove_cloud_filters *req;
+	struct i40e_aq_desc desc;
+	u16 bytes;
+
+	/* Buffer length is filter_count elements of *filters */
+	bytes = filter_count * sizeof(*filters);
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_add_cloud_filters);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.datalen = CPU_TO_LE16(bytes);
+
+	req = (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	req->seid = CPU_TO_LE16(seid);
+	req->num_filters = filter_count;
+
+	/* No command details are supplied for this request */
+	return i40e_asq_send_command(hw, &desc, filters, bytes, NULL);
+}
+
+/**
+ * i40e_aq_remove_cloud_filters
+ * @hw: pointer to the hardware structure
+ * @seid: VSI seid to remove cloud filters from
+ * @filters: Buffer which contains the filters to be removed
+ * @filter_count: number of filters contained in the buffer
+ *
+ * Remove the cloud filters for a given VSI.  The contents of the
+ * i40e_aqc_add_remove_cloud_filters_element_data are filled
+ * in by the caller of the function.
+ *
+ **/
+enum i40e_status_code i40e_aq_remove_cloud_filters(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count)
+{
+	struct i40e_aqc_add_remove_cloud_filters *req;
+	struct i40e_aq_desc desc;
+	u16 bytes;
+
+	/* Buffer length is filter_count elements of *filters */
+	bytes = filter_count * sizeof(*filters);
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_remove_cloud_filters);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.datalen = CPU_TO_LE16(bytes);
+
+	req = (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	req->seid = CPU_TO_LE16(seid);
+	req->num_filters = filter_count;
+
+	/* No command details are supplied for this request */
+	return i40e_asq_send_command(hw, &desc, filters, bytes, NULL);
+}
+
+/**
+ * i40e_aq_alternate_write
+ * @hw: pointer to the hardware structure
+ * @reg_addr0: address of first dword to be written
+ * @reg_val0: value to be written under 'reg_addr0'
+ * @reg_addr1: address of second dword to be written
+ * @reg_val1: value to be written under 'reg_addr1'
+ *
+ * Write one or two dwords to alternate structure. Fields are indicated
+ * by 'reg_addr0' and 'reg_addr1' register numbers.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_write(struct i40e_hw *hw,
+				u32 reg_addr0, u32 reg_val0,
+				u32 reg_addr1, u32 reg_val1)
+{
+	struct i40e_aqc_alternate_write *alt_cmd;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_write);
+
+	/* Each address is written next to the value supplied for it */
+	alt_cmd = (struct i40e_aqc_alternate_write *)&desc.params.raw;
+	alt_cmd->address0 = CPU_TO_LE32(reg_addr0);
+	alt_cmd->data0 = CPU_TO_LE32(reg_val0);
+	alt_cmd->address1 = CPU_TO_LE32(reg_addr1);
+	alt_cmd->data1 = CPU_TO_LE32(reg_val1);
+
+	/* Direct command: no buffer and no command details */
+	return i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+}
+
+/**
+ * i40e_aq_alternate_write_indirect
+ * @hw: pointer to the hardware structure
+ * @addr: address of a first register to be modified
+ * @dw_count: number of alternate structure fields to write
+ * @buffer: pointer to the command buffer
+ *
+ * Write 'dw_count' dwords from 'buffer' to alternate structure
+ * starting at 'addr'.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_write_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_alternate_ind_write *cmd_resp =
+		(struct i40e_aqc_alternate_ind_write *)&desc.params.raw;
+
+	if (buffer == NULL)
+		return I40E_ERR_PARAM;
+
+	/* Indirect command */
+	i40e_fill_default_direct_cmd_desc(&desc,
+					 i40e_aqc_opc_alternate_write_indirect);
+
+	/* RD and BUF are always set; LB once dw_count exceeds LARGE_BUF/4 */
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_RD);
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF);
+	if (dw_count > (I40E_AQ_LARGE_BUF/4))
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	cmd_resp->address = CPU_TO_LE32(addr);
+	cmd_resp->length = CPU_TO_LE32(dw_count);
+	/* High dword, exactly as i40e_aq_alternate_read_indirect does */
+	cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_DWORD((uintptr_t)buffer));
+	cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((uintptr_t)buffer));
+
+	/* The byte length handed to the send is 4 * dw_count */
+	return i40e_asq_send_command(hw, &desc, buffer,
+				     I40E_LO_DWORD(4*dw_count), NULL);
+}
+
+/**
+ * i40e_aq_alternate_read
+ * @hw: pointer to the hardware structure
+ * @reg_addr0: address of first dword to be read
+ * @reg_val0: pointer for data read from 'reg_addr0'
+ * @reg_addr1: address of second dword to be read
+ * @reg_val1: pointer for data read from 'reg_addr1'
+ *
+ * Read one or two dwords from alternate structure. Fields are indicated
+ * by 'reg_addr0' and 'reg_addr1' register numbers. If 'reg_val1' pointer
+ * is not passed then only register at 'reg_addr0' is read.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_read(struct i40e_hw *hw,
+				u32 reg_addr0, u32 *reg_val0,
+				u32 reg_addr1, u32 *reg_val1)
+{
+	struct i40e_aqc_alternate_write *alt;
+	enum i40e_status_code ret_code;
+	struct i40e_aq_desc desc;
+
+	/* reg_val0 is mandatory; reg_val1 may be omitted */
+	if (!reg_val0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_read);
+	alt = (struct i40e_aqc_alternate_write *)&desc.params.raw;
+	alt->address1 = CPU_TO_LE32(reg_addr1);
+	alt->address0 = CPU_TO_LE32(reg_addr0);
+
+	ret_code = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+	if (ret_code != I40E_SUCCESS)
+		return ret_code;
+
+	*reg_val0 = LE32_TO_CPU(alt->data0);
+	if (reg_val1)
+		*reg_val1 = LE32_TO_CPU(alt->data1);
+
+	return ret_code;
+}
+
+/**
+ * i40e_aq_alternate_read_indirect
+ * @hw: pointer to the hardware structure
+ * @addr: address of the alternate structure field
+ * @dw_count: number of alternate structure fields to read
+ * @buffer: pointer to the command buffer
+ *
+ * Read 'dw_count' dwords from alternate structure starting at 'addr' and
+ * place them in 'buffer'. The buffer should be allocated by caller.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_read_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer)
+{
+	struct i40e_aqc_alternate_ind_write *ind_cmd;
+	uintptr_t buf_addr = (uintptr_t)buffer;
+	struct i40e_aq_desc desc;
+
+	/* A buffer must be supplied */
+	if (!buffer)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+		i40e_aqc_opc_alternate_read_indirect);
+
+	/* Indirect command: BUF and RD always, LB for large transfers */
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF);
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_RD);
+	if ((I40E_AQ_LARGE_BUF/4) < dw_count)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	ind_cmd = (struct i40e_aqc_alternate_ind_write *)&desc.params.raw;
+	ind_cmd->addr_low = CPU_TO_LE32(I40E_LO_DWORD(buf_addr));
+	ind_cmd->addr_high = CPU_TO_LE32(I40E_HI_DWORD(buf_addr));
+	ind_cmd->length = CPU_TO_LE32(dw_count);
+	ind_cmd->address = CPU_TO_LE32(addr);
+
+	/* The byte length handed to the send is 4 * dw_count */
+	return i40e_asq_send_command(hw, &desc, buffer,
+				     I40E_LO_DWORD(4*dw_count), NULL);
+}
+
+/**
+ *  i40e_aq_alternate_clear
+ *  @hw: pointer to the HW structure.
+ *
+ *  Clear the alternate structures of the port from which the function
+ *  is called.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_clear(struct i40e_hw *hw)
+{
+	struct i40e_aq_desc clear_desc;
+
+	/*
+	 * Nothing is set on the descriptor after the default fill, and
+	 * neither a buffer nor command details are passed along.
+	 */
+	i40e_fill_default_direct_cmd_desc(&clear_desc,
+					  i40e_aqc_opc_alternate_clear_port);
+	return i40e_asq_send_command(hw, &clear_desc, NULL, 0, NULL);
+}
+
+/**
+ *  i40e_aq_alternate_write_done
+ *  @hw: pointer to the HW structure.
+ *  @bios_mode: indicates whether the command is executed by UEFI or legacy BIOS
+ *  @reset_needed: indicates the SW should trigger GLOBAL reset
+ *
+ *  Indicates to the FW that alternate structures have been changed.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_write_done(struct i40e_hw *hw,
+		u8 bios_mode, bool *reset_needed)
+{
+	struct i40e_aqc_alternate_write_done *done_cmd;
+	enum i40e_status_code ret_code;
+	struct i40e_aq_desc desc;
+
+	if (!reset_needed)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_alternate_write_done);
+	done_cmd = (struct i40e_aqc_alternate_write_done *)&desc.params.raw;
+	done_cmd->cmd_flags = CPU_TO_LE16(bios_mode);
+
+	ret_code = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+	/* On success the same flags word is tested for the reset bit */
+	if (!ret_code)
+		*reset_needed = ((LE16_TO_CPU(done_cmd->cmd_flags) &
+				 I40E_AQ_ALTERNATE_RESET_NEEDED) != 0);
+
+	return ret_code;
+}
+
+/**
+ *  i40e_aq_set_oem_mode
+ *  @hw: pointer to the HW structure.
+ *  @oem_mode: the OEM mode to be used
+ *
+ *  Sets the device to a specific operating mode. Currently the only supported
+ *  mode is no_clp, which causes FW to refrain from using Alternate RAM.
+ *
+ **/
+enum i40e_status_code i40e_aq_set_oem_mode(struct i40e_hw *hw,
+		u8 oem_mode)
+{
+	struct i40e_aqc_alternate_write_done *mode_cmd;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_alternate_set_mode);
+
+	/*
+	 * The write-done parameter layout is reused to carry the mode.
+	 */
+	mode_cmd = (struct i40e_aqc_alternate_write_done *)&desc.params.raw;
+	mode_cmd->cmd_flags = CPU_TO_LE16(oem_mode);
+
+	return i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+}
+
+/**
+ * i40e_aq_resume_port_tx
+ * @hw: pointer to the hardware structure
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Resume port's Tx traffic
+ **/
+enum i40e_status_code i40e_aq_resume_port_tx(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc resume_desc;
+
+	i40e_fill_default_direct_cmd_desc(&resume_desc,
+					  i40e_aqc_opc_resume_port_tx);
+
+	/* Nothing beyond the default fill is set before sending */
+	return i40e_asq_send_command(hw, &resume_desc, NULL, 0,
+				     cmd_details);
+}
+
+/**
+ * i40e_set_pci_config_data - store PCI bus info
+ * @hw: pointer to hardware structure
+ * @link_status: the link status word from PCI config space
+ *
+ * Stores the PCI bus info (speed, width, type) within the i40e_hw structure
+ **/
+void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status)
+{
+	u16 width_bits;
+	u16 speed_bits;
+
+	/* The bus type is always recorded as PCI Express */
+	hw->bus.type = i40e_bus_type_pci_express;
+
+	/* Isolate the width and speed fields of the status word */
+	width_bits = link_status & I40E_PCI_LINK_WIDTH;
+	speed_bits = link_status & I40E_PCI_LINK_SPEED;
+
+	/*
+	 * Link width: each recognised encoding has its own enum value,
+	 * anything else is recorded as unknown.
+	 */
+	if (width_bits == I40E_PCI_LINK_WIDTH_1)
+		hw->bus.width = i40e_bus_width_pcie_x1;
+	else if (width_bits == I40E_PCI_LINK_WIDTH_2)
+		hw->bus.width = i40e_bus_width_pcie_x2;
+	else if (width_bits == I40E_PCI_LINK_WIDTH_4)
+		hw->bus.width = i40e_bus_width_pcie_x4;
+	else if (width_bits == I40E_PCI_LINK_WIDTH_8)
+		hw->bus.width = i40e_bus_width_pcie_x8;
+	else
+		hw->bus.width = i40e_bus_width_unknown;
+
+	/* Link speed, decoded the same way */
+	if (speed_bits == I40E_PCI_LINK_SPEED_2500)
+		hw->bus.speed = i40e_bus_speed_2500;
+	else if (speed_bits == I40E_PCI_LINK_SPEED_5000)
+		hw->bus.speed = i40e_bus_speed_5000;
+	else if (speed_bits == I40E_PCI_LINK_SPEED_8000)
+		hw->bus.speed = i40e_bus_speed_8000;
+	else
+		hw->bus.speed = i40e_bus_speed_unknown;
+}
+
+/**
+ * i40e_aq_debug_dump
+ * @hw: pointer to the hardware structure
+ * @cluster_id: specific cluster to dump
+ * @table_id: table id within cluster
+ * @start_index: index of line in the block to read
+ * @buff_size: dump buffer size
+ * @buff: dump buffer
+ * @ret_buff_size: actual buffer size returned
+ * @ret_next_table: next block to read
+ * @ret_next_index: next index to read
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Dump internal FW/HW data for debug purposes.
+ **/
+enum i40e_status_code i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id,
+				u8 table_id, u32 start_index, u16 buff_size,
+				void *buff, u16 *ret_buff_size,
+				u8 *ret_next_table, u32 *ret_next_index,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_debug_dump_internals *dump;
+	struct i40e_aq_desc desc;
+	enum i40e_status_code rc;
+
+	/* A dump needs somewhere to land: reject a missing or empty buffer */
+	if (!buff || buff_size == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_debug_dump_internals);
+	/* Request and response share the same params area of the descriptor */
+	dump = (struct i40e_aqc_debug_dump_internals *)&desc.params.raw;
+	dump->cluster_id = cluster_id;
+	dump->table_id = table_id;
+	dump->idx = CPU_TO_LE32(start_index);
+
+	/* Indirect command: flag the attached buffer and its size class */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (buff_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(buff_size);
+
+	rc = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
+	if (rc)
+		return rc;
+	/* Each output is optional; fill in only those the caller asked for */
+	if (ret_buff_size != NULL)
+		*ret_buff_size = LE16_TO_CPU(desc.datalen);
+	if (ret_next_table != NULL)
+		*ret_next_table = dump->table_id;
+	if (ret_next_index != NULL)
+		*ret_next_index = LE32_TO_CPU(dump->idx);
+
+	return rc;
+}
+
+/**
+ * i40e_read_bw_from_alt_ram
+ * @hw: pointer to the hardware structure
+ * @max_bw: pointer for max_bw read
+ * @min_bw: pointer for min_bw read
+ * @min_valid: pointer for bool that is TRUE if min_bw is a valid value
+ * @max_valid: pointer for bool that is TRUE if max_bw is a valid value
+ *
+ * Read bw from the alternate ram for the given pf. If the read fails, both
+ * valid flags are set to FALSE and the bw values are not examined.
+ **/
+enum i40e_status_code i40e_read_bw_from_alt_ram(struct i40e_hw *hw,
+					u32 *max_bw, u32 *min_bw,
+					bool *min_valid, bool *max_valid)
+{
+	enum i40e_status_code status;
+	u32 max_bw_addr, min_bw_addr;
+
+	/* Calculate the address of the min/max bw registers */
+	max_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET +
+		      I40E_ALT_STRUCT_MAX_BW_OFFSET +
+		      (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id);
+	min_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET +
+		      I40E_ALT_STRUCT_MIN_BW_OFFSET +
+		      (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id);
+
+	/* Read the bandwidths from alt ram */
+	status = i40e_aq_alternate_read(hw, max_bw_addr, max_bw,
+					min_bw_addr, min_bw);
+	if (status != I40E_SUCCESS) {
+		/* Outputs may not have been written; report nothing valid */
+		*min_valid = FALSE;
+		*max_valid = FALSE;
+		return status;
+	}
+
+	*min_valid = (*min_bw & I40E_ALT_BW_VALID_MASK) ? TRUE : FALSE;
+	*max_valid = (*max_bw & I40E_ALT_BW_VALID_MASK) ? TRUE : FALSE;
+
+	return status;
+}
+
+/**
+ * i40e_aq_configure_partition_bw
+ * @hw: pointer to the hardware structure
+ * @bw_data: Buffer holding valid pfs and bw limits
+ * @cmd_details: pointer to command details
+ *
+ * Configure partitions guaranteed/max bw
+ **/
+enum i40e_status_code i40e_aq_configure_partition_bw(struct i40e_hw *hw,
+			struct i40e_aqc_configure_partition_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	u16 payload_len = sizeof(*bw_data);
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_configure_partition_bw);
+
+	/*
+	 * Indirect command: flag the attached buffer, and add the large
+	 * buffer flag only when the payload exceeds I40E_AQ_LARGE_BUF.
+	 */
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD);
+	if (payload_len > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	desc.datalen = CPU_TO_LE16(payload_len);
+
+	/* The send status is passed back to the caller unchanged */
+	return i40e_asq_send_command(hw, &desc, bw_data, payload_len,
+				     cmd_details);
+}
+
+/**
+ * i40e_aq_send_msg_to_pf
+ * @hw: pointer to the hardware structure
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ * @cmd_details: pointer to command details
+ *
+ * Send message to PF driver using admin queue. By default, this message
+ * is sent asynchronously, i.e. i40e_asq_send_command() does not wait for
+ * completion before returning.
+ **/
+enum i40e_status_code i40e_aq_send_msg_to_pf(struct i40e_hw *hw,
+				enum i40e_virtchnl_ops v_opcode,
+				enum i40e_status_code v_retval,
+				u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_asq_cmd_details dflt;
+	struct i40e_aq_desc desc;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_pf);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_SI);
+	desc.cookie_high = CPU_TO_LE32(v_opcode);
+	desc.cookie_low = CPU_TO_LE32(v_retval);
+
+	/* Only a non-empty message turns this into a buffered command */
+	if (msglen != 0) {
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD);
+		if (msglen > I40E_AQ_LARGE_BUF)
+			desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+		desc.datalen = CPU_TO_LE16(msglen);
+	}
+	/* Callers that pass no details get an asynchronous send by default */
+	if (cmd_details == NULL) {
+		i40e_memset(&dflt, 0, sizeof(dflt), I40E_NONDMA_MEM);
+		dflt.async = TRUE;
+		cmd_details = &dflt;
+	}
+	return i40e_asq_send_command(hw, &desc, msg, msglen, cmd_details);
+}
+
+/**
+ * i40e_vf_parse_hw_config
+ * @hw: pointer to the hardware structure
+ * @msg: pointer to the virtual channel VF resource structure
+ *
+ * Given a VF resource message from the PF, populate the hw struct
+ * with appropriate information.
+ **/
+void i40e_vf_parse_hw_config(struct i40e_hw *hw,
+			     struct i40e_virtchnl_vf_resource *msg)
+{
+	struct i40e_virtchnl_vsi_resource *vsi;
+	int idx;
+
+	/* Resource counts are copied over unchanged */
+	hw->dev_caps.num_vsis = msg->num_vsis;
+	hw->dev_caps.num_rx_qp = msg->num_queue_pairs;
+	hw->dev_caps.num_tx_qp = msg->num_queue_pairs;
+	hw->dev_caps.num_msix_vectors_vf = msg->max_vectors;
+
+	/* Capability flags are derived from the offload bitmask */
+	hw->dev_caps.dcb = msg->vf_offload_flags &
+			   I40E_VIRTCHNL_VF_OFFLOAD_L2;
+	hw->dev_caps.fcoe = (msg->vf_offload_flags &
+			     I40E_VIRTCHNL_VF_OFFLOAD_FCOE) ? 1 : 0;
+	hw->dev_caps.iwarp = (msg->vf_offload_flags &
+			      I40E_VIRTCHNL_VF_OFFLOAD_IWARP) ? 1 : 0;
+
+	/* Adopt the default MAC of each SR-IOV VSI; the last one seen wins */
+	vsi = &msg->vsi_res[0];
+	for (idx = 0; idx < msg->num_vsis; idx++, vsi++) {
+		if (vsi->vsi_type != I40E_VSI_SRIOV)
+			continue;
+		i40e_memcpy(hw->mac.perm_addr, vsi->default_mac_addr,
+			    I40E_ETH_LENGTH_OF_ADDRESS, I40E_NONDMA_TO_NONDMA);
+		i40e_memcpy(hw->mac.addr, vsi->default_mac_addr,
+			    I40E_ETH_LENGTH_OF_ADDRESS, I40E_NONDMA_TO_NONDMA);
+	}
+}
+
+/**
+ * i40e_vf_reset
+ * @hw: pointer to the hardware structure
+ *
+ * Send a VF_RESET message to the PF and return the status of that send. No
+ * response is awaited, as none will be forthcoming. Immediately afterwards
+ * the admin queue should be shut down and (optionally) reinitialized.
+ **/
+enum i40e_status_code i40e_vf_reset(struct i40e_hw *hw)
+{
+	return i40e_aq_send_msg_to_pf(hw, I40E_VIRTCHNL_OP_RESET_VF,
+				      I40E_SUCCESS, NULL, 0, NULL);
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_devids.h b/usr/src/uts/common/io/i40e/core/i40e_devids.h
new file mode 100644
index 0000000000..5b927bed9f
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_devids.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_devids.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_DEVIDS_H_
+#define _I40E_DEVIDS_H_
+
+/* Vendor ID */
+#define I40E_INTEL_VENDOR_ID		0x8086
+
+/* Device IDs */
+#define I40E_DEV_ID_SFP_XL710		0x1572
+#define I40E_DEV_ID_QEMU		0x1574
+#define I40E_DEV_ID_KX_A		0x157F
+#define I40E_DEV_ID_KX_B		0x1580
+#define I40E_DEV_ID_KX_C		0x1581
+#define I40E_DEV_ID_QSFP_A		0x1583
+#define I40E_DEV_ID_QSFP_B		0x1584
+#define I40E_DEV_ID_QSFP_C		0x1585
+#define I40E_DEV_ID_10G_BASE_T		0x1586
+#define I40E_DEV_ID_20G_KR2		0x1587
+#define I40E_DEV_ID_20G_KR2_A		0x1588
+#define I40E_DEV_ID_10G_BASE_T4		0x1589
+#define I40E_DEV_ID_VF			0x154C
+#define I40E_DEV_ID_VF_HV		0x1571
+#ifdef X722_SUPPORT
+#define I40E_DEV_ID_SFP_X722		0x37D0
+#define I40E_DEV_ID_1G_BASE_T_X722	0x37D1
+#define I40E_DEV_ID_10G_BASE_T_X722	0x37D2
+#define I40E_DEV_ID_X722_VF		0x37CD
+#define I40E_DEV_ID_X722_VF_HV		0x37D9
+#endif /* X722_SUPPORT */
+/* NOTE: (d) is evaluated up to three times; pass a side-effect-free value */
+#define i40e_is_40G_device(d)		((d) == I40E_DEV_ID_QSFP_A  || \
+					 (d) == I40E_DEV_ID_QSFP_B  || \
+					 (d) == I40E_DEV_ID_QSFP_C)
+
+#endif /* _I40E_DEVIDS_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_hmc.c b/usr/src/uts/common/io/i40e/core/i40e_hmc.c
new file mode 100644
index 0000000000..3f0e6e8d5b
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_hmc.c
@@ -0,0 +1,373 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_hmc.c 284049 2015-06-05 22:52:42Z jfv $*/
+
+#include "i40e_osdep.h"
+#include "i40e_register.h"
+#include "i40e_status.h"
+#include "i40e_alloc.h"
+#include "i40e_hmc.h"
+#ifndef I40E_NO_TYPE_HEADER
+#include "i40e_type.h"
+#endif
+
+/**
+ * i40e_add_sd_table_entry - Adds a segment descriptor to the table
+ * @hw: pointer to our hw struct
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @sd_index: segment descriptor index to manipulate
+ * @type: what type of segment descriptor we're manipulating
+ * @direct_mode_sz: size to alloc in direct mode
+ *
+ * Allocates only when the entry is not yet valid; does not set valid itself.
+ **/
+enum i40e_status_code i40e_add_sd_table_entry(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 sd_index,
+					      enum i40e_sd_entry_type type,
+					      u64 direct_mode_sz)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_hmc_sd_entry *sd_entry;
+	enum i40e_memory_type mem_type;
+	struct i40e_dma_mem mem;
+	u64 alloc_len;
+
+	if (NULL == hmc_info->sd_table.sd_entry) {
+		DEBUGOUT("i40e_add_sd_table_entry: bad sd_entry\n");
+		return I40E_ERR_BAD_PTR;
+	}
+
+	if (sd_index >= hmc_info->sd_table.sd_cnt) {
+		DEBUGOUT("i40e_add_sd_table_entry: bad sd_index\n");
+		return I40E_ERR_INVALID_SD_INDEX;
+	}
+
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_index];
+	if (sd_entry->valid)
+		goto count_bp;
+
+	/* a paged entry needs a pd page; a direct one its backing page */
+	if (I40E_SD_TYPE_PAGED == type) {
+		mem_type = i40e_mem_pd;
+		alloc_len = I40E_HMC_PAGED_BP_SIZE;
+	} else {
+		mem_type = i40e_mem_bp_jumbo;
+		alloc_len = direct_mode_sz;
+	}
+	ret_code = i40e_allocate_dma_mem(hw, &mem, mem_type, alloc_len,
+					 I40E_HMC_PD_BP_BUF_ALIGNMENT);
+	if (ret_code)
+		return ret_code;
+
+	if (I40E_SD_TYPE_PAGED == type) {
+		/* software bookkeeping array, one slot per page descriptor */
+		ret_code = i40e_allocate_virt_mem(hw,
+				&sd_entry->u.pd_table.pd_entry_virt_mem,
+				sizeof(struct i40e_hmc_pd_entry) * 512);
+		if (ret_code) {
+			/* undo the DMA allocation made above */
+			i40e_free_dma_mem(hw, &mem);
+			return ret_code;
+		}
+		sd_entry->u.pd_table.pd_entry =
+			(struct i40e_hmc_pd_entry *)
+			sd_entry->u.pd_table.pd_entry_virt_mem.va;
+		i40e_memcpy(&sd_entry->u.pd_table.pd_page_addr,
+			    &mem, sizeof(struct i40e_dma_mem),
+			    I40E_NONDMA_TO_NONDMA);
+	} else {
+		i40e_memcpy(&sd_entry->u.bp.addr,
+			    &mem, sizeof(struct i40e_dma_mem),
+			    I40E_NONDMA_TO_NONDMA);
+		sd_entry->u.bp.sd_pd_index = sd_index;
+	}
+
+	/* record the entry type and count the new sd entry */
+	sd_entry->entry_type = type;
+	I40E_INC_SD_REFCNT(&hmc_info->sd_table);
+
+count_bp:
+	/* Increment backing page reference count */
+	if (I40E_SD_TYPE_DIRECT == sd_entry->entry_type)
+		I40E_INC_BP_REFCNT(&sd_entry->u.bp);
+
+	return ret_code;
+}
+
+/**
+ * i40e_add_pd_table_entry - Adds page descriptor to the specified table
+ * @hw: pointer to our HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @pd_index: which page descriptor index to manipulate
+ * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one.
+ *
+ * This function:
+ *	1. Initializes the pd entry
+ *	2. Adds pd_entry in the pd_table
+ *	3. Mark the entry valid in i40e_hmc_pd_entry structure
+ *	4. Increments the pd_entry's backing page ref count
+ * assumptions:
+ *	1. The memory for pd should be pinned down, physically contiguous and
+ *	   aligned on 4K boundary and zeroed memory.
+ *	2. It should be 4K in size.
+ **/
+enum i40e_status_code i40e_add_pd_table_entry(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 pd_index,
+					      struct i40e_dma_mem *rsrc_pg)
+{
+	u32 sd_idx = pd_index / I40E_HMC_PD_CNT_IN_SD;
+	u32 rel_pd_idx = pd_index % I40E_HMC_PD_CNT_IN_SD;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_hmc_sd_entry *sd_entry;
+	struct i40e_hmc_pd_table *pd_table;
+	struct i40e_hmc_pd_entry *pd_entry;
+	struct i40e_dma_mem new_page;
+	struct i40e_dma_mem *page;
+	u64 page_desc;
+	u64 *pd_slot;
+
+	if (sd_idx >= hmc_info->sd_table.sd_cnt) {
+		DEBUGOUT("i40e_add_pd_table_entry: bad pd_index\n");
+		return I40E_ERR_INVALID_PAGE_DESC_INDEX;
+	}
+
+	/* nothing to add unless the sd is paged; this is not an error */
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_idx];
+	if (I40E_SD_TYPE_PAGED != sd_entry->entry_type)
+		return ret_code;
+
+	pd_table = &sd_entry->u.pd_table;
+	pd_entry = &pd_table->pd_entry[rel_pd_idx];
+	if (pd_entry->valid) {
+		/* already populated: only take another reference */
+		I40E_INC_BP_REFCNT(&pd_entry->bp);
+		return ret_code;
+	}
+
+	if (rsrc_pg) {
+		page = rsrc_pg;
+		pd_entry->rsrc_pg = TRUE;
+	} else {
+		/* allocate a 4K backing page */
+		page = &new_page;
+		ret_code = i40e_allocate_dma_mem(hw, page, i40e_mem_bp,
+						I40E_HMC_PAGED_BP_SIZE,
+						I40E_HMC_PD_BP_BUF_ALIGNMENT);
+		if (ret_code)
+			return ret_code;
+		pd_entry->rsrc_pg = FALSE;
+	}
+
+	i40e_memcpy(&pd_entry->bp.addr, page,
+		    sizeof(struct i40e_dma_mem), I40E_NONDMA_TO_NONDMA);
+	pd_entry->bp.sd_pd_index = pd_index;
+	pd_entry->bp.entry_type = I40E_SD_TYPE_PAGED;
+
+	/* Build the descriptor (page address plus valid bit) and store it */
+	page_desc = page->pa | 0x1;
+	pd_slot = (u64 *)pd_table->pd_page_addr.va + rel_pd_idx;
+	i40e_memcpy(pd_slot, &page_desc, sizeof(u64), I40E_NONDMA_TO_DMA);
+
+	pd_entry->sd_index = sd_idx;
+	pd_entry->valid = TRUE;
+	I40E_INC_PD_REFCNT(pd_table);
+	I40E_INC_BP_REFCNT(&pd_entry->bp);
+
+	return ret_code;
+}
+
+/**
+ * i40e_remove_pd_bp - remove a backing page from a page descriptor
+ * @hw: pointer to our HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ *
+ * This function:
+ *	1. Decrements the ref count for the pd_entry; the remaining steps run
+ *	   only when that count drops to zero.
+ *	2. Marks the entry in the pd table invalid and clears its descriptor.
+ *	3. Writes I40E_PFHMC_PDINV to invalidate the pd cache in the hardware.
+ *	4. Frees the backing page unless it was supplied by the caller.
+ * assumptions:
+ *	1. Caller can deallocate the memory used by backing storage after this
+ *	   function returns.
+ **/
+enum i40e_status_code i40e_remove_pd_bp(struct i40e_hw *hw,
+					struct i40e_hmc_info *hmc_info,
+					u32 idx)
+{
+	u32 sd_idx = idx / I40E_HMC_PD_CNT_IN_SD;
+	u32 rel_pd_idx = idx % I40E_HMC_PD_CNT_IN_SD;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_hmc_sd_entry *sd_entry;
+	struct i40e_hmc_pd_table *pd_table;
+	struct i40e_hmc_pd_entry *pd_entry;
+	u64 *pd_slot;
+
+	if (sd_idx >= hmc_info->sd_table.sd_cnt) {
+		DEBUGOUT("i40e_remove_pd_bp: bad idx\n");
+		return I40E_ERR_INVALID_PAGE_DESC_INDEX;
+	}
+
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_idx];
+	if (I40E_SD_TYPE_PAGED != sd_entry->entry_type) {
+		DEBUGOUT("i40e_remove_pd_bp: wrong sd_entry type\n");
+		return I40E_ERR_INVALID_SD_TYPE;
+	}
+
+	/* get the entry and decrease its ref counter */
+	pd_table = &sd_entry->u.pd_table;
+	pd_entry = &pd_table->pd_entry[rel_pd_idx];
+	I40E_DEC_BP_REFCNT(&pd_entry->bp);
+	/* still referenced elsewhere: leave the page in place */
+	if (pd_entry->bp.ref_cnt)
+		return ret_code;
+
+	/* mark the entry invalid and clear its descriptor in the pd page */
+	pd_entry->valid = FALSE;
+	I40E_DEC_PD_REFCNT(pd_table);
+	pd_slot = (u64 *)pd_table->pd_page_addr.va + rel_pd_idx;
+	i40e_memset(pd_slot, 0, sizeof(u64), I40E_DMA_MEM);
+	I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, idx);
+
+	/* free the backing page unless the caller supplied it */
+	if (!pd_entry->rsrc_pg) {
+		ret_code = i40e_free_dma_mem(hw, &(pd_entry->bp.addr));
+		if (I40E_SUCCESS != ret_code)
+			return ret_code;
+	}
+	/* last page descriptor gone: release the bookkeeping array too */
+	if (!pd_table->ref_cnt)
+		i40e_free_virt_mem(hw, &pd_table->pd_entry_virt_mem);
+
+	return ret_code;
+}
+
+/**
+ * i40e_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ **/
+enum i40e_status_code i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info,
+					     u32 idx)
+{
+	struct i40e_hmc_sd_entry *sd_entry = &hmc_info->sd_table.sd_entry[idx];
+
+	/* Drop this caller's reference on the backing page */
+	I40E_DEC_BP_REFCNT(&sd_entry->u.bp);
+
+	/* Other references remain: the entry must stay in place for now */
+	if (sd_entry->u.bp.ref_cnt != 0)
+		return I40E_ERR_NOT_READY;
+
+	/* Last reference gone: release the sd table's count for this entry */
+	I40E_DEC_SD_REFCNT(&hmc_info->sd_table);
+
+	/* mark the entry invalid */
+	sd_entry->valid = FALSE;
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_remove_sd_bp_new - Removes a backing page from a segment descriptor
+ * @hw: pointer to our hw struct
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ * @is_pf: used to distinguish between VF and PF
+ **/
+enum i40e_status_code i40e_remove_sd_bp_new(struct i40e_hw *hw,
+					    struct i40e_hmc_info *hmc_info,
+					    u32 idx, bool is_pf)
+{
+	struct i40e_dma_mem *bp_mem;
+
+	if (!is_pf)
+		return I40E_NOT_SUPPORTED;
+
+	/* Invalidate the sd entry in hardware, then free its backing page */
+	bp_mem = &hmc_info->sd_table.sd_entry[idx].u.bp.addr;
+	I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_DIRECT);
+
+	return i40e_free_dma_mem(hw, bp_mem);
+}
+
+/**
+ * i40e_prep_remove_pd_page - Prepares to remove a PD page from sd entry.
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: segment descriptor index to find the relevant page descriptor
+ **/
+enum i40e_status_code i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info,
+					       u32 idx)
+{
+	struct i40e_hmc_sd_entry *sd_entry;
+	u32 pd_refs;
+
+	sd_entry = &hmc_info->sd_table.sd_entry[idx];
+	pd_refs = sd_entry->u.pd_table.ref_cnt;
+
+	/* page descriptors are still in use: the pd page cannot go yet */
+	if (pd_refs != 0)
+		return I40E_ERR_NOT_READY;
+
+	/* mark the entry invalid */
+	sd_entry->valid = FALSE;
+
+	I40E_DEC_SD_REFCNT(&hmc_info->sd_table);
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_remove_pd_page_new - Removes a PD page from sd entry.
+ * @hw: pointer to our hw struct
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: segment descriptor index to find the relevant page descriptor
+ * @is_pf: used to distinguish between VF and PF
+ **/
+enum i40e_status_code i40e_remove_pd_page_new(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 idx, bool is_pf)
+{
+	struct i40e_dma_mem *pd_page;
+
+	if (!is_pf)
+		return I40E_NOT_SUPPORTED;
+
+	/* Invalidate the sd entry in hardware, then free its pd page */
+	pd_page = &hmc_info->sd_table.sd_entry[idx].u.pd_table.pd_page_addr;
+	I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_PAGED);
+	return i40e_free_dma_mem(hw, pd_page);
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_hmc.h b/usr/src/uts/common/io/i40e/core/i40e_hmc.h
new file mode 100644
index 0000000000..d6e1f93421
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_hmc.h
@@ -0,0 +1,246 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_hmc.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_HMC_H_
+#define _I40E_HMC_H_
+
+#define I40E_HMC_MAX_BP_COUNT 512
+
+/* forward-declare the HW struct for the compiler */
+struct i40e_hw;
+
+#define I40E_HMC_INFO_SIGNATURE		0x484D5347 /* HMSG */
+#define I40E_HMC_PD_CNT_IN_SD		512
+#define I40E_HMC_DIRECT_BP_SIZE		0x200000 /* 2M */
+#define I40E_HMC_PAGED_BP_SIZE		4096
+#define I40E_HMC_PD_BP_BUF_ALIGNMENT	4096
+#define I40E_FIRST_VF_FPM_ID		16
+
+struct i40e_hmc_obj_info {
+	u64 base;	/* base addr in FPM */
+	u32 max_cnt;	/* max count available for this hmc func */
+	u32 cnt;	/* count of objects driver actually wants to create */
+	u64 size;	/* size in bytes of one object */
+};
+
+enum i40e_sd_entry_type {
+	I40E_SD_TYPE_INVALID = 0,
+	I40E_SD_TYPE_PAGED   = 1,
+	I40E_SD_TYPE_DIRECT  = 2
+};
+
+struct i40e_hmc_bp {
+	enum i40e_sd_entry_type entry_type;
+	struct i40e_dma_mem addr; /* populate to be used by hw */
+	u32 sd_pd_index; /* sd or pd index this backing page was added at */
+	u32 ref_cnt; /* adjusted via I40E_INC/DEC_BP_REFCNT */
+};
+
+struct i40e_hmc_pd_entry {
+	struct i40e_hmc_bp bp;
+	u32 sd_index; /* index of the owning segment descriptor */
+	bool rsrc_pg; /* TRUE if page was caller supplied; not freed on remove */
+	bool valid; /* TRUE while a backing page is attached */
+};
+
+struct i40e_hmc_pd_table {
+	struct i40e_dma_mem pd_page_addr; /* populate to be used by hw */
+	struct i40e_hmc_pd_entry  *pd_entry; /* [512] for sw book keeping */
+	struct i40e_virt_mem pd_entry_virt_mem; /* virt mem for pd_entry */
+
+	u32 ref_cnt;
+	u32 sd_index;
+};
+
+struct i40e_hmc_sd_entry {
+	enum i40e_sd_entry_type entry_type;
+	bool valid;
+
+	union {
+		struct i40e_hmc_pd_table pd_table;
+		struct i40e_hmc_bp bp;
+	} u;
+};
+
+struct i40e_hmc_sd_table {
+	struct i40e_virt_mem addr; /* used to track sd_entry allocations */
+	u32 sd_cnt;
+	u32 ref_cnt;
+	struct i40e_hmc_sd_entry *sd_entry; /* (sd_cnt*512) entries max */
+};
+
+struct i40e_hmc_info {
+	u32 signature;
+	/* equals to pci func num for PF and dynamically allocated for VFs */
+	u8 hmc_fn_id;
+	u16 first_sd_index; /* index of the first available SD */
+
+	/* hmc objects */
+	struct i40e_hmc_obj_info *hmc_obj;
+	struct i40e_virt_mem hmc_obj_virt_mem;
+	struct i40e_hmc_sd_table sd_table;
+};
+
+#define I40E_INC_SD_REFCNT(sd_table)	((sd_table)->ref_cnt++)
+#define I40E_INC_PD_REFCNT(pd_table)	((pd_table)->ref_cnt++)
+#define I40E_INC_BP_REFCNT(bp)		((bp)->ref_cnt++)
+
+#define I40E_DEC_SD_REFCNT(sd_table)	((sd_table)->ref_cnt--)
+#define I40E_DEC_PD_REFCNT(pd_table)	((pd_table)->ref_cnt--)
+#define I40E_DEC_BP_REFCNT(bp)		((bp)->ref_cnt--)
+
+/**
+ * I40E_SET_PF_SD_ENTRY - marks the sd entry as valid in the hardware
+ * @hw: pointer to our hw struct
+ * @pa: physical address (passed by value, not as a pointer)
+ * @sd_index: segment descriptor index
+ * @type: if sd entry is direct or paged
+ **/
+#define I40E_SET_PF_SD_ENTRY(hw, pa, sd_index, type)			\
+{									\
+	u32 val1, val2, val3;						\
+	val1 = (u32)(I40E_HI_DWORD(pa));				\
+	val2 = (u32)(pa) | (I40E_HMC_MAX_BP_COUNT <<			\
+		 I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) |		\
+		((((type) == I40E_SD_TYPE_PAGED) ? 0 : 1) <<		\
+		I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) |			\
+		BIT(I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT);		\
+	val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT);	\
+	wr32((hw), I40E_PFHMC_SDDATAHIGH, val1);			\
+	wr32((hw), I40E_PFHMC_SDDATALOW, val2);				\
+	wr32((hw), I40E_PFHMC_SDCMD, val3);				\
+}
+
+/**
+ * I40E_CLEAR_PF_SD_ENTRY - marks the sd entry as invalid in the hardware
+ * @hw: pointer to our hw struct
+ * @sd_index: segment descriptor index
+ * @type: if sd entry is direct or paged
+ **/
+#define I40E_CLEAR_PF_SD_ENTRY(hw, sd_index, type)			\
+{									\
+	u32 val2, val3;							\
+	val2 = (I40E_HMC_MAX_BP_COUNT <<				\
+		I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) |		\
+		((((type) == I40E_SD_TYPE_PAGED) ? 0 : 1) <<		\
+		I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT);			\
+	val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT);	\
+	wr32((hw), I40E_PFHMC_SDDATAHIGH, 0);				\
+	wr32((hw), I40E_PFHMC_SDDATALOW, val2);				\
+	wr32((hw), I40E_PFHMC_SDCMD, val3);				\
+}
+
+/**
+ * I40E_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware
+ * @hw: pointer to our hw struct
+ * @sd_idx: segment descriptor index
+ * @pd_idx: page descriptor index
+ **/
+#define I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx)			\
+	wr32((hw), I40E_PFHMC_PDINV,					\
+	    (((sd_idx) << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) |		\
+	     ((pd_idx) << I40E_PFHMC_PDINV_PMPDIDX_SHIFT)))
+
+/**
+ * I40E_FIND_SD_INDEX_LIMIT - finds segment descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @type: type of HMC resources we're searching
+ * @index: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @sd_idx: pointer to return index of the segment descriptor in question
+ * @sd_limit: pointer to return the index one past the last segment descriptor
+ *
+ * This function calculates the segment descriptor index and index limit
+ * for the resource defined by i40e_hmc_rsrc_type.
+ **/
+#define I40E_FIND_SD_INDEX_LIMIT(hmc_info, type, index, cnt, sd_idx, sd_limit)\
+{									\
+	u64 fpm_addr, fpm_limit;					\
+	fpm_addr = (hmc_info)->hmc_obj[(type)].base +			\
+		   (hmc_info)->hmc_obj[(type)].size * (index);		\
+	fpm_limit = fpm_addr + (hmc_info)->hmc_obj[(type)].size * (cnt);\
+	*(sd_idx) = (u32)(fpm_addr / I40E_HMC_DIRECT_BP_SIZE);		\
+	*(sd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_DIRECT_BP_SIZE);	\
+	/* add one more to the limit to correct our range */		\
+	*(sd_limit) += 1;						\
+}
+
+/**
+ * I40E_FIND_PD_INDEX_LIMIT - finds page descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @type: HMC resource type we're examining
+ * @idx: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @pd_index: pointer to return page descriptor index
+ * @pd_limit: pointer to return the index one past the last page descriptor
+ *
+ * Calculates the page descriptor index and index limit for the resource
+ * defined by i40e_hmc_rsrc_type.
+ **/
+#define I40E_FIND_PD_INDEX_LIMIT(hmc_info, type, idx, cnt, pd_index, pd_limit)\
+{									\
+	u64 fpm_adr, fpm_limit;						\
+	fpm_adr = (hmc_info)->hmc_obj[(type)].base +			\
+		  (hmc_info)->hmc_obj[(type)].size * (idx);		\
+	fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt);	\
+	*(pd_index) = (u32)(fpm_adr / I40E_HMC_PAGED_BP_SIZE);		\
+	*(pd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_PAGED_BP_SIZE);	\
+	/* add one more to the limit to correct our range */		\
+	*(pd_limit) += 1;						\
+}
+enum i40e_status_code i40e_add_sd_table_entry(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 sd_index,
+					      enum i40e_sd_entry_type type,
+					      u64 direct_mode_sz);
+
+enum i40e_status_code i40e_add_pd_table_entry(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 pd_index,
+					      struct i40e_dma_mem *rsrc_pg);
+enum i40e_status_code i40e_remove_pd_bp(struct i40e_hw *hw,
+					struct i40e_hmc_info *hmc_info,
+					u32 idx);
+enum i40e_status_code i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info,
+					     u32 idx);
+enum i40e_status_code i40e_remove_sd_bp_new(struct i40e_hw *hw,
+					    struct i40e_hmc_info *hmc_info,
+					    u32 idx, bool is_pf);
+enum i40e_status_code i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info,
+					       u32 idx);
+enum i40e_status_code i40e_remove_pd_page_new(struct i40e_hw *hw,
+					      struct i40e_hmc_info *hmc_info,
+					      u32 idx, bool is_pf);
+
+#endif /* _I40E_HMC_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c
new file mode 100644
index 0000000000..2b2fa4f8f9
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c
@@ -0,0 +1,1412 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_lan_hmc.c 284049 2015-06-05 22:52:42Z jfv $*/
+
+#include "i40e_osdep.h"
+#include "i40e_register.h"
+#include "i40e_type.h"
+#include "i40e_hmc.h"
+#include "i40e_lan_hmc.h"
+#include "i40e_prototype.h"
+
+/* lan specific interface functions */
+
+/**
+ * i40e_align_l2obj_base - aligns base object pointer to 512 bytes
+ * @offset: base address offset needing alignment
+ *
+ * Rounds @offset up to the next I40E_HMC_L2OBJ_BASE_ALIGNMENT boundary.
+ **/
+static u64 i40e_align_l2obj_base(u64 offset)
+{
+	u64 excess = offset % I40E_HMC_L2OBJ_BASE_ALIGNMENT;
+
+	/* already on a boundary: hand the offset back untouched */
+	if (excess == 0)
+		return offset;
+
+	return offset + (I40E_HMC_L2OBJ_BASE_ALIGNMENT - excess);
+}
+
+/**
+ * i40e_calculate_l2fpm_size - calculates layer 2 FPM memory size
+ * @txq_num: number of Tx queues needing backing context
+ * @rxq_num: number of Rx queues needing backing context
+ * @fcoe_cntx_num: amount of FCoE stateful contexts needing backing context
+ * @fcoe_filt_num: number of FCoE filters needing backing context
+ *
+ * Calculates the maximum amount of memory for the function required, based
+ * on the number of resources it must provide context for.
+ **/
+u64 i40e_calculate_l2fpm_size(u32 txq_num, u32 rxq_num,
+			      u32 fcoe_cntx_num, u32 fcoe_filt_num)
+{
+	u64 fpm_size;
+
+	/* widen each count first so the products cannot wrap at 32 bits */
+	fpm_size = (u64)txq_num * I40E_HMC_OBJ_SIZE_TXQ;
+	fpm_size = i40e_align_l2obj_base(fpm_size);
+
+	fpm_size += (u64)rxq_num * I40E_HMC_OBJ_SIZE_RXQ;
+	fpm_size = i40e_align_l2obj_base(fpm_size);
+
+	fpm_size += (u64)fcoe_cntx_num * I40E_HMC_OBJ_SIZE_FCOE_CNTX;
+	fpm_size = i40e_align_l2obj_base(fpm_size);
+
+	fpm_size += (u64)fcoe_filt_num * I40E_HMC_OBJ_SIZE_FCOE_FILT;
+	fpm_size = i40e_align_l2obj_base(fpm_size);
+	return fpm_size;
+}
+
+/**
+ * i40e_init_lan_hmc - initialize i40e_hmc_info struct
+ * @hw: pointer to the HW structure
+ * @txq_num: number of Tx queues needing backing context
+ * @rxq_num: number of Rx queues needing backing context
+ * @fcoe_cntx_num: amount of FCoE stateful contexts needing backing context
+ * @fcoe_filt_num: number of FCoE filters needing backing context
+ *
+ * This function will be called once per physical function initialization.
+ * It fills out the i40e_hmc_obj_info array for LAN objects from the driver's
+ * input and from HMC information loaded from NVRAM.  If it fails after that
+ * array is allocated, the array is freed and hw->hmc.hmc_obj reset to NULL.
+ *
+ * Assumptions:
+ *   - HMC Resource Profile has been selected before calling this function.
+ **/
+enum i40e_status_code i40e_init_lan_hmc(struct i40e_hw *hw, u32 txq_num,
+					u32 rxq_num, u32 fcoe_cntx_num,
+					u32 fcoe_filt_num)
+{
+	struct i40e_hmc_obj_info *obj, *full_obj;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u64 l2fpm_size;
+	u32 size_exp;
+
+	hw->hmc.signature = I40E_HMC_INFO_SIGNATURE;
+	hw->hmc.hmc_fn_id = hw->pf_id;
+
+	/* allocate memory for hmc_obj */
+	ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem,
+			sizeof(struct i40e_hmc_obj_info) * I40E_HMC_LAN_MAX);
+	if (ret_code)
+		return ret_code;	/* nothing to undo yet */
+	hw->hmc.hmc_obj = (struct i40e_hmc_obj_info *)
+			  hw->hmc.hmc_obj_virt_mem.va;
+
+	/* The full object will be used to create the LAN HMC SD */
+	full_obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_FULL];
+	full_obj->max_cnt = 0;
+	full_obj->cnt = 0;
+	full_obj->base = 0;
+	full_obj->size = 0;
+
+	/* Tx queue context information */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX];
+	obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX);
+	obj->cnt = txq_num;
+	obj->base = 0;
+	size_exp = rd32(hw, I40E_GLHMC_LANTXOBJSZ);
+	obj->size = BIT_ULL(size_exp);
+
+	/* validate values requested by driver don't exceed HMC capacity */
+	if (txq_num > obj->max_cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT3("i40e_init_lan_hmc: Tx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n",
+			  txq_num, obj->max_cnt, ret_code);
+		goto free_hmc_out;
+	}
+
+	/* aggregate values into the full LAN object for later */
+	full_obj->max_cnt += obj->max_cnt;
+	full_obj->cnt += obj->cnt;
+
+	/* Rx queue context information */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX];
+	obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX);
+	obj->cnt = rxq_num;
+	obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_TX].base +
+		    (hw->hmc.hmc_obj[I40E_HMC_LAN_TX].cnt *
+		     hw->hmc.hmc_obj[I40E_HMC_LAN_TX].size);
+	obj->base = i40e_align_l2obj_base(obj->base);
+	size_exp = rd32(hw, I40E_GLHMC_LANRXOBJSZ);
+	obj->size = BIT_ULL(size_exp);
+
+	/* validate values requested by driver don't exceed HMC capacity */
+	if (rxq_num > obj->max_cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT3("i40e_init_lan_hmc: Rx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n",
+			  rxq_num, obj->max_cnt, ret_code);
+		goto free_hmc_out;
+	}
+
+	/* aggregate values into the full LAN object for later */
+	full_obj->max_cnt += obj->max_cnt;
+	full_obj->cnt += obj->cnt;
+
+	/* FCoE context information */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX];
+	obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEMAX);
+	obj->cnt = fcoe_cntx_num;
+	obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_RX].base +
+		    (hw->hmc.hmc_obj[I40E_HMC_LAN_RX].cnt *
+		     hw->hmc.hmc_obj[I40E_HMC_LAN_RX].size);
+	obj->base = i40e_align_l2obj_base(obj->base);
+	size_exp = rd32(hw, I40E_GLHMC_FCOEDDPOBJSZ);
+	obj->size = BIT_ULL(size_exp);
+
+	/* validate values requested by driver don't exceed HMC capacity */
+	if (fcoe_cntx_num > obj->max_cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT3("i40e_init_lan_hmc: FCoE context: asks for 0x%x but max allowed is 0x%x, returns error %d\n",
+			  fcoe_cntx_num, obj->max_cnt, ret_code);
+		goto free_hmc_out;
+	}
+
+	/* aggregate values into the full LAN object for later */
+	full_obj->max_cnt += obj->max_cnt;
+	full_obj->cnt += obj->cnt;
+
+	/* FCoE filter information */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT];
+	obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEFMAX);
+	obj->cnt = fcoe_filt_num;
+	obj->base = hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].base +
+		    (hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].cnt *
+		     hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].size);
+	obj->base = i40e_align_l2obj_base(obj->base);
+	size_exp = rd32(hw, I40E_GLHMC_FCOEFOBJSZ);
+	obj->size = BIT_ULL(size_exp);
+
+	/* validate values requested by driver don't exceed HMC capacity */
+	if (fcoe_filt_num > obj->max_cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT3("i40e_init_lan_hmc: FCoE filter: asks for 0x%x but max allowed is 0x%x, returns error %d\n",
+			  fcoe_filt_num, obj->max_cnt, ret_code);
+		goto free_hmc_out;
+	}
+
+	/* aggregate values into the full LAN object for later */
+	full_obj->max_cnt += obj->max_cnt;
+	full_obj->cnt += obj->cnt;
+
+	hw->hmc.first_sd_index = 0;
+	hw->hmc.sd_table.ref_cnt = 0;
+	l2fpm_size = i40e_calculate_l2fpm_size(txq_num, rxq_num, fcoe_cntx_num,
+					       fcoe_filt_num);
+	if (NULL == hw->hmc.sd_table.sd_entry) {
+		/* round up in 64 bits; narrow to u32 only after dividing */
+		hw->hmc.sd_table.sd_cnt = (u32)((l2fpm_size +
+			I40E_HMC_DIRECT_BP_SIZE - 1) / I40E_HMC_DIRECT_BP_SIZE);
+		/* allocate the sd_entry members in the sd_table */
+		ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.sd_table.addr,
+		    sizeof(struct i40e_hmc_sd_entry) * hw->hmc.sd_table.sd_cnt);
+		if (ret_code)
+			goto free_hmc_out;
+		hw->hmc.sd_table.sd_entry =
+			(struct i40e_hmc_sd_entry *)hw->hmc.sd_table.addr.va;
+	}
+	full_obj->size = l2fpm_size;	/* kept in the full object for later */
+	return ret_code;
+free_hmc_out:
+	i40e_free_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem);
+	hw->hmc.hmc_obj_virt_mem.va = NULL;
+	hw->hmc.hmc_obj = NULL;
+	return ret_code;
+}
+
+/**
+ * i40e_remove_pd_page - Remove a page from the page descriptor table
+ * @hw: pointer to the HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: segment descriptor index to find the relevant page descriptor
+ *
+ * This function:
+ *	1. Marks the entry in pd table (for paged address mode) invalid
+ *	2. write to register PMPDINV to invalidate the backing page in FV cache
+ *	3. Decrement the ref count for  pd_entry
+ * assumptions:
+ *	1. caller can deallocate the memory used by pd after this function
+ *	   returns.
+ **/
+static enum i40e_status_code i40e_remove_pd_page(struct i40e_hw *hw,
+						 struct i40e_hmc_info *hmc_info,
+						 u32 idx)
+{
+	/* prep did not succeed: skip the removal and still report success */
+	if (i40e_prep_remove_pd_page(hmc_info, idx) != I40E_SUCCESS)
+		return I40E_SUCCESS;
+
+	/* TRUE is passed for the is_pf argument */
+	return i40e_remove_pd_page_new(hw, hmc_info, idx, TRUE);
+}
+
+/**
+ * i40e_remove_sd_bp - remove a backing page from a segment descriptor
+ * @hw: pointer to our HW structure
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @idx: the page index
+ *
+ * This function:
+ *	1. Marks the entry in sd table (for direct address mode) invalid
+ *	2. write to register PMSDCMD, PMSDDATALOW(PMSDDATALOW.PMSDVALID set
+ *	   to 0) and PMSDDATAHIGH to invalidate the sd page
+ *	3. Decrement the ref count for the sd_entry
+ * assumptions:
+ *	1. caller can deallocate the memory used by backing storage after this
+ *	   function returns.
+ **/
+static enum i40e_status_code i40e_remove_sd_bp(struct i40e_hw *hw,
+					       struct i40e_hmc_info *hmc_info,
+					       u32 idx)
+{
+	/* prep did not succeed: skip the removal and still report success */
+	if (i40e_prep_remove_sd_bp(hmc_info, idx) != I40E_SUCCESS)
+		return I40E_SUCCESS;
+
+	/* TRUE is passed for the is_pf argument */
+	return i40e_remove_sd_bp_new(hw, hmc_info, idx, TRUE);
+}
+
+/**
+ * i40e_create_lan_hmc_object - allocate backing store for hmc objects
+ * @hw: pointer to the HW structure
+ * @info: pointer to i40e_hmc_lan_create_obj_info struct
+ *
+ * Allocates memory for PDs and backing pages and populates the sd and pd
+ * entries.  NOTE(review): a PD failure is rolled back but the SD loop goes on.
+ **/
+enum i40e_status_code i40e_create_lan_hmc_object(struct i40e_hw *hw,
+				struct i40e_hmc_lan_create_obj_info *info)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_hmc_sd_entry *sd_entry;
+	u32 pd_idx1 = 0, pd_lmt1 = 0;
+	u32 pd_idx = 0, pd_lmt = 0;
+	bool pd_error = FALSE;
+	u32 sd_idx, sd_lmt;
+	u64 sd_size;
+	u32 i, j;
+
+	if (NULL == info) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_create_lan_hmc_object: bad info ptr\n");
+		goto exit;
+	}
+	if (NULL == info->hmc_info) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_create_lan_hmc_object: bad hmc_info ptr\n");
+		goto exit;
+	}
+	if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_create_lan_hmc_object: bad signature\n");
+		goto exit;
+	}
+
+	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX;
+		DEBUGOUT1("i40e_create_lan_hmc_object: returns error %d\n",
+			  ret_code);
+		goto exit;
+	}
+	if ((info->start_idx + info->count) >
+	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT1("i40e_create_lan_hmc_object: returns error %d\n",
+			  ret_code);
+		goto exit;
+	}
+
+	/* find sd index and limit */
+	I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type,
+				 info->start_idx, info->count,
+				 &sd_idx, &sd_lmt);
+	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
+	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
+			ret_code = I40E_ERR_INVALID_SD_INDEX;
+			goto exit;
+	}
+	/* find pd index and limit */
+	I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type,
+				 info->start_idx, info->count, &pd_idx,
+				 &pd_lmt);
+
+	/* This is to cover for cases where you may not want to have an SD with
+	 * the full 2M memory but something smaller. By not filling out any
+	 * size, the function will default the SD size to be 2M.
+	 */
+	if (info->direct_mode_sz == 0)
+		sd_size = I40E_HMC_DIRECT_BP_SIZE;
+	else
+		sd_size = info->direct_mode_sz;
+
+	/* check if all the sds are valid. If not, allocate a page and
+	 * initialize it.
+	 */
+	for (j = sd_idx; j < sd_lmt; j++) {
+		/* update the sd table entry */
+		ret_code = i40e_add_sd_table_entry(hw, info->hmc_info, j,
+						   info->entry_type,
+						   sd_size);
+		if (I40E_SUCCESS != ret_code)
+			goto exit_sd_error;
+		sd_entry = &info->hmc_info->sd_table.sd_entry[j];
+		if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) {
+			/* check if all the pds in this sd are valid. If not,
+			 * allocate a page and initialize it.
+			 */
+
+			/* find pd_idx and pd_lmt in this sd */
+			pd_idx1 = max(pd_idx, (j * I40E_HMC_MAX_BP_COUNT));
+			pd_lmt1 = min(pd_lmt,
+				      ((j + 1) * I40E_HMC_MAX_BP_COUNT));
+			for (i = pd_idx1; i < pd_lmt1; i++) {
+				/* update the pd table entry */
+				ret_code = i40e_add_pd_table_entry(hw,
+								info->hmc_info,
+								i, NULL);
+				if (I40E_SUCCESS != ret_code) {
+					pd_error = TRUE;
+					break;
+				}
+			}
+			if (pd_error) {
+				/* remove backing pages i - 1 down to pd_idx1 */
+				while (i && (i > pd_idx1)) {
+					i40e_remove_pd_bp(hw, info->hmc_info,
+							  (i - 1));
+					i--;
+				}
+			}
+		}
+		if (!sd_entry->valid) {
+			sd_entry->valid = TRUE;
+			switch (sd_entry->entry_type) {
+			case I40E_SD_TYPE_PAGED:
+				I40E_SET_PF_SD_ENTRY(hw,
+					sd_entry->u.pd_table.pd_page_addr.pa,
+					j, sd_entry->entry_type);
+				break;
+			case I40E_SD_TYPE_DIRECT:
+				I40E_SET_PF_SD_ENTRY(hw, sd_entry->u.bp.addr.pa,
+						     j, sd_entry->entry_type);
+				break;
+			default:
+				ret_code = I40E_ERR_INVALID_SD_TYPE;
+				goto exit;
+			}
+		}
+	}
+	goto exit;
+
+exit_sd_error:
+	/* cleanup for sd entries from j - 1 down to sd_idx */
+	while (j && (j > sd_idx)) {
+		sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1];
+		switch (sd_entry->entry_type) {
+		case I40E_SD_TYPE_PAGED:
+			pd_idx1 = max(pd_idx,
+				      ((j - 1) * I40E_HMC_MAX_BP_COUNT));
+			pd_lmt1 = min(pd_lmt, (j * I40E_HMC_MAX_BP_COUNT));
+			for (i = pd_idx1; i < pd_lmt1; i++)
+				i40e_remove_pd_bp(hw, info->hmc_info, i);
+			i40e_remove_pd_page(hw, info->hmc_info, (j - 1));
+			break;
+		case I40E_SD_TYPE_DIRECT:
+			i40e_remove_sd_bp(hw, info->hmc_info, (j - 1));
+			break;
+		default:
+			ret_code = I40E_ERR_INVALID_SD_TYPE;
+			break;
+		}
+		j--;
+	}
+exit:
+	return ret_code;
+}
+
+/**
+ * i40e_configure_lan_hmc - prepare the HMC backing store
+ * @hw: pointer to the hw structure
+ * @model: the model for the layout of the SD/PD tables
+ *
+ * - This function will be called once per physical function initialization.
+ * - This function will be called after i40e_init_lan_hmc() and before
+ *   any LAN/FCoE HMC objects can be created.
+ **/
+enum i40e_status_code i40e_configure_lan_hmc(struct i40e_hw *hw,
+					     enum i40e_hmc_model model)
+{
+	struct i40e_hmc_lan_create_obj_info info;
+	u8 hmc_fn_id = hw->hmc.hmc_fn_id;
+	struct i40e_hmc_obj_info *obj;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+
+	/* Initialize part of the create object info struct */
+	info.hmc_info = &hw->hmc;
+	info.rsrc_type = I40E_HMC_LAN_FULL;
+	info.start_idx = 0;
+	info.direct_mode_sz = hw->hmc.hmc_obj[I40E_HMC_LAN_FULL].size;
+
+	/* Build the SD entry for the LAN objects */
+	switch (model) {
+	case I40E_HMC_MODEL_DIRECT_PREFERRED:
+	case I40E_HMC_MODEL_DIRECT_ONLY:
+		info.entry_type = I40E_SD_TYPE_DIRECT;
+		/* Make one big object, a single SD */
+		info.count = 1;
+		ret_code = i40e_create_lan_hmc_object(hw, &info);
+		if (ret_code == I40E_SUCCESS)
+			break;
+		/* only the "preferred" model may fall back to paged SDs */
+		if (model != I40E_HMC_MODEL_DIRECT_PREFERRED)
+			goto configure_lan_hmc_out;
+		goto try_type_paged;
+	case I40E_HMC_MODEL_PAGED_ONLY:
+try_type_paged:
+		info.entry_type = I40E_SD_TYPE_PAGED;
+		/* Make one big object in the PD table */
+		info.count = 1;
+		ret_code = i40e_create_lan_hmc_object(hw, &info);
+		if (ret_code != I40E_SUCCESS)
+			goto configure_lan_hmc_out;
+		break;
+	default:
+		/* unsupported type: log the model value we were given */
+		ret_code = I40E_ERR_INVALID_SD_TYPE;
+		DEBUGOUT1("i40e_configure_lan_hmc: Unknown SD type: %d\n",
+			  model);
+		goto configure_lan_hmc_out;
+	}
+
+	/* Configure and program the FPM registers so objects can be created */
+
+	/* Tx contexts */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX];
+	wr32(hw, I40E_GLHMC_LANTXBASE(hmc_fn_id),
+	     (u32)((obj->base & I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK) / 512));
+	wr32(hw, I40E_GLHMC_LANTXCNT(hmc_fn_id), obj->cnt);
+
+	/* Rx contexts */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX];
+	wr32(hw, I40E_GLHMC_LANRXBASE(hmc_fn_id),
+	     (u32)((obj->base & I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK) / 512));
+	wr32(hw, I40E_GLHMC_LANRXCNT(hmc_fn_id), obj->cnt);
+
+	/* FCoE contexts */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX];
+	wr32(hw, I40E_GLHMC_FCOEDDPBASE(hmc_fn_id),
+	 (u32)((obj->base & I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK) / 512));
+	wr32(hw, I40E_GLHMC_FCOEDDPCNT(hmc_fn_id), obj->cnt);
+
+	/* FCoE filters */
+	obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT];
+	wr32(hw, I40E_GLHMC_FCOEFBASE(hmc_fn_id),
+	     (u32)((obj->base & I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK) / 512));
+	wr32(hw, I40E_GLHMC_FCOEFCNT(hmc_fn_id), obj->cnt);
+
+configure_lan_hmc_out:
+	return ret_code;
+}
+
+/**
+ * i40e_delete_lan_hmc_object - remove hmc objects
+ * @hw: pointer to the HW structure
+ * @info: pointer to i40e_hmc_lan_delete_obj_info struct
+ *
+ * This will de-populate the SDs and PDs, freeing the memory for PDs and
+ * backing storage.  The SD range is validated before any entry is touched.
+ * After this function returns, the caller should deallocate the memory it
+ * allocated for book-keeping information about PDs and backing storage.
+ **/
+enum i40e_status_code i40e_delete_lan_hmc_object(struct i40e_hw *hw,
+				struct i40e_hmc_lan_delete_obj_info *info)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_hmc_pd_table *pd_table;
+	u32 pd_idx, pd_lmt, rel_pd_idx;
+	u32 sd_idx, sd_lmt, cur_sd;
+	u32 i, j;
+
+	if (NULL == info) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_delete_hmc_object: bad info ptr\n");
+		goto exit;
+	}
+	if (NULL == info->hmc_info) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_delete_hmc_object: bad info->hmc_info ptr\n");
+		goto exit;
+	}
+	if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_delete_hmc_object: bad hmc_info->signature\n");
+		goto exit;
+	}
+
+	if (NULL == info->hmc_info->sd_table.sd_entry) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_delete_hmc_object: bad sd_entry\n");
+		goto exit;
+	}
+
+	if (NULL == info->hmc_info->hmc_obj) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_delete_hmc_object: bad hmc_info->hmc_obj\n");
+		goto exit;
+	}
+	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX;
+		DEBUGOUT1("i40e_delete_hmc_object: returns error %d\n",
+			  ret_code);
+		goto exit;
+	}
+
+	if ((info->start_idx + info->count) >
+	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT;
+		DEBUGOUT1("i40e_delete_hmc_object: returns error %d\n",
+			  ret_code);
+		goto exit;
+	}
+
+	/* find and validate the sd range before sd_entry[] is indexed */
+	I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type,
+				 info->start_idx, info->count,
+				 &sd_idx, &sd_lmt);
+	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
+	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
+		ret_code = I40E_ERR_INVALID_SD_INDEX;
+		goto exit;
+	}
+
+	I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type,
+				 info->start_idx, info->count, &pd_idx,
+				 &pd_lmt);
+
+	for (j = pd_idx; j < pd_lmt; j++) {
+		cur_sd = j / I40E_HMC_PD_CNT_IN_SD;
+
+		if (I40E_SD_TYPE_PAGED !=
+		    info->hmc_info->sd_table.sd_entry[cur_sd].entry_type)
+			continue;
+
+		rel_pd_idx = j % I40E_HMC_PD_CNT_IN_SD;
+
+		pd_table =
+			&info->hmc_info->sd_table.sd_entry[cur_sd].u.pd_table;
+		if (pd_table->pd_entry[rel_pd_idx].valid) {
+			ret_code = i40e_remove_pd_bp(hw, info->hmc_info, j);
+			if (I40E_SUCCESS != ret_code)
+				goto exit;
+		}
+	}
+
+	for (i = sd_idx; i < sd_lmt; i++) {
+		if (!info->hmc_info->sd_table.sd_entry[i].valid)
+			continue;
+		switch (info->hmc_info->sd_table.sd_entry[i].entry_type) {
+		case I40E_SD_TYPE_DIRECT:
+			ret_code = i40e_remove_sd_bp(hw, info->hmc_info, i);
+			if (I40E_SUCCESS != ret_code)
+				goto exit;
+			break;
+		case I40E_SD_TYPE_PAGED:
+			ret_code = i40e_remove_pd_page(hw, info->hmc_info, i);
+			if (I40E_SUCCESS != ret_code)
+				goto exit;
+			break;
+		default:
+			break;
+		}
+	}
+exit:
+	return ret_code;
+}
+
+/**
+ * i40e_shutdown_lan_hmc - Remove HMC backing store, free allocated memory
+ * @hw: pointer to the hw structure
+ *
+ * This must be called by drivers as they are shutting down and being
+ * removed from the OS.
+ **/
+enum i40e_status_code i40e_shutdown_lan_hmc(struct i40e_hw *hw)
+{
+	struct i40e_hmc_info *hmc = &hw->hmc;
+	struct i40e_hmc_lan_delete_obj_info del_info;
+	enum i40e_status_code status;
+
+	/* describe the single full-LAN object and delete it */
+	del_info.hmc_info = hmc;
+	del_info.rsrc_type = I40E_HMC_LAN_FULL;
+	del_info.start_idx = 0;
+	del_info.count = 1;
+	status = i40e_delete_lan_hmc_object(hw, &del_info);
+
+	/* the SD table is released whatever the delete returned */
+	i40e_free_virt_mem(hw, &hmc->sd_table.addr);
+	hmc->sd_table.sd_entry = NULL;
+	hmc->sd_table.sd_cnt = 0;
+
+	/* likewise the memory used for hmc_obj */
+	i40e_free_virt_mem(hw, &hmc->hmc_obj_virt_mem);
+	hmc->hmc_obj = NULL;
+
+	return status;
+}
+
+#define I40E_HMC_STORE(_struct, _ele)		\
+	offsetof(struct _struct, _ele),		\
+	FIELD_SIZEOF(struct _struct, _ele)
+
+struct i40e_context_ele {
+	u16 offset;	/* byte offset of the field in the host struct */
+	u16 size_of;	/* FIELD_SIZEOF() of that host struct field */
+	u16 width;	/* number of bits the field occupies in the context */
+	u16 lsb;	/* bit position of the field's LSB in the context */
+};
+
+/* LAN Tx Queue Context: one entry per field, closed by an all-zero entry */
+static struct i40e_context_ele i40e_hmc_txq_ce_info[] = {
+					     /* Field      Width    LSB */
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, head),           13,      0 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, new_context),     1,     30 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, base),           57,     32 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, fc_ena),          1,     89 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, timesync_ena),    1,     90 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, fd_ena),          1,     91 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, alt_vlan_ena),    1,     92 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, cpuid),           8,     96 },
+/* line 1: the LSBs below are offset by 128 bits */
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, thead_wb),       13,  0 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_ena),     1, 32 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, qlen),           13, 33 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, tphrdesc_ena),    1, 46 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, tphrpacket_ena),  1, 47 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, tphwdesc_ena),    1, 48 + 128 },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_addr),   64, 64 + 128 },
+/* line 7: the LSBs below are offset by 7 * 128 bits */
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, crc),            32,  0 + (7 * 128) },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist),        10, 84 + (7 * 128) },
+	{I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist_act),     1, 94 + (7 * 128) },
+	{ 0 }
+};
+
+/* LAN Rx Queue Context: one entry per field, closed by an all-zero entry */
+static struct i40e_context_ele i40e_hmc_rxq_ce_info[] = {
+					 /* Field      Width    LSB */
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, head),        13,	0   },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, cpuid),        8,	13  },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, base),        57,	32  },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, qlen),        13,	89  },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, dbuff),        7,	102 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, hbuff),        5,	109 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, dtype),        2,	114 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, dsize),        1,	116 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, crcstrip),     1,	117 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, fc_ena),       1,	118 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, l2tsel),       1,	119 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_0),     4,	120 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_1),     2,	124 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, showiv),       1,	127 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, rxmax),       14,	174 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, tphrdesc_ena), 1,	193 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, tphwdesc_ena), 1,	194 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, tphdata_ena),  1,	195 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, tphhead_ena),  1,	196 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, lrxqthresh),   3,	198 },
+	{ I40E_HMC_STORE(i40e_hmc_obj_rxq, prefena),      1,	201 },
+	{ 0 }
+};
+
+/**
+ * i40e_write_byte - replace HMC context byte
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: a description of the struct to be read from
+ * @src: the struct to be read from
+ *
+ * Takes the one-byte field at @ce_info->offset inside @src, trims it to
+ * @ce_info->width bits and merges it into the context byte that holds bit
+ * @ce_info->lsb.  All other bits of that context byte are preserved.
+ **/
+static void i40e_write_byte(u8 *hmc_bits,
+			    struct i40e_context_ele *ce_info,
+			    u8 *src)
+{
+	u8 field_mask, new_bits, cur;
+	u8 *field, *ctx;
+	u16 bit_off;
+
+	/* where the value comes from and where it goes */
+	field = src + ce_info->offset;
+	ctx = hmc_bits + (ce_info->lsb / 8);
+	bit_off = ce_info->lsb % 8;
+
+	/* keep only the low 'width' bits of the source field */
+	field_mask = BIT(ce_info->width) - 1;
+	new_bits = *field;
+	new_bits &= field_mask;
+
+	/* move both the value and its mask to the field's bit position */
+	field_mask <<= bit_off;
+	new_bits <<= bit_off;
+
+	/* fetch the context byte currently holding the field ... */
+	i40e_memcpy(&cur, ctx, sizeof(cur), I40E_DMA_TO_NONDMA);
+
+	/* ... swap in the new field bits and store it back */
+	cur = (cur & ~field_mask) | new_bits;
+	i40e_memcpy(ctx, &cur, sizeof(cur), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_write_word - replace HMC context word
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: a description of the struct to be read from
+ * @src: the struct to be read from
+ *
+ * Takes the two-byte field at @ce_info->offset inside @src, trims it to
+ * @ce_info->width bits and merges it into the 16-bit context word that
+ * starts at byte @ce_info->lsb / 8.  The word's other bits are preserved.
+ **/
+static void i40e_write_word(u8 *hmc_bits,
+			    struct i40e_context_ele *ce_info,
+			    u8 *src)
+{
+	u16 field_mask, new_bits;
+	u8 *field, *ctx;
+	u16 bit_off;
+	__le16 cur;
+
+	/* where the value comes from and where it goes */
+	field = src + ce_info->offset;
+	ctx = hmc_bits + (ce_info->lsb / 8);
+	bit_off = ce_info->lsb % 8;
+
+	/* mask and shift in host order; the byte swap happens only when the
+	 * result is merged into the little endian context below
+	 */
+	field_mask = BIT(ce_info->width) - 1;
+	new_bits = *(u16 *)field;
+	new_bits &= field_mask;
+
+	/* move both the value and its mask to the field's bit position */
+	field_mask <<= bit_off;
+	new_bits <<= bit_off;
+
+	/* fetch the context word currently holding the field ... */
+	i40e_memcpy(&cur, ctx, sizeof(cur), I40E_DMA_TO_NONDMA);
+
+	/* ... swap in the new field bits and store it back */
+	cur &= ~(CPU_TO_LE16(field_mask));
+	cur |= CPU_TO_LE16(new_bits);
+	i40e_memcpy(ctx, &cur, sizeof(cur), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_write_dword - replace HMC context dword
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: a description of the struct to be read from
+ * @src: the struct to be read from
+ *
+ * Takes the four-byte field at @ce_info->offset inside @src, trims it to
+ * @ce_info->width bits and merges it into the 32-bit context dword that
+ * starts at byte @ce_info->lsb / 8.  The dword's other bits are preserved.
+ **/
+static void i40e_write_dword(u8 *hmc_bits,
+			     struct i40e_context_ele *ce_info,
+			     u8 *src)
+{
+	u32 field_mask, new_bits;
+	u8 *field, *ctx;
+	u16 bit_off;
+	__le32 cur;
+
+	/* where the value comes from and where it goes */
+	field = src + ce_info->offset;
+	ctx = hmc_bits + (ce_info->lsb / 8);
+	bit_off = ce_info->lsb % 8;
+
+	/* a field that is exactly 32 bits wide cannot get its mask from a
+	 * shift: on an x86 machine the SHL count is masked to 5 bits, so the
+	 * shift would do nothing; use an all-ones mask for that case
+	 */
+	if (ce_info->width >= 32)
+		field_mask = ~(u32)0;
+	else
+		field_mask = BIT(ce_info->width) - 1;
+
+	/* mask and shift in host order; the byte swap happens only when the
+	 * result is merged into the little endian context below
+	 */
+	new_bits = *(u32 *)field;
+	new_bits &= field_mask;
+
+	/* move both the value and its mask to the field's bit position */
+	field_mask <<= bit_off;
+	new_bits <<= bit_off;
+
+	/* fetch the context dword currently holding the field ... */
+	i40e_memcpy(&cur, ctx, sizeof(cur), I40E_DMA_TO_NONDMA);
+
+	/* ... swap in the new field bits and store it back */
+	cur &= ~(CPU_TO_LE32(field_mask));
+	cur |= CPU_TO_LE32(new_bits);
+	i40e_memcpy(ctx, &cur, sizeof(cur), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_write_qword - replace HMC context qword
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: a description of the struct to be read from
+ * @src: the struct to be read from
+ *
+ * Takes the eight-byte field at @ce_info->offset inside @src, trims it to
+ * @ce_info->width bits and merges it into the 64-bit context qword that
+ * starts at byte @ce_info->lsb / 8.  The qword's other bits are preserved.
+ **/
+static void i40e_write_qword(u8 *hmc_bits,
+			     struct i40e_context_ele *ce_info,
+			     u8 *src)
+{
+	u64 field_mask, new_bits;
+	u8 *field, *ctx;
+	u16 bit_off;
+	__le64 cur;
+
+	/* where the value comes from and where it goes */
+	field = src + ce_info->offset;
+	ctx = hmc_bits + (ce_info->lsb / 8);
+	bit_off = ce_info->lsb % 8;
+
+	/* a field that is exactly 64 bits wide cannot get its mask from a
+	 * shift: on an x86 machine the SHL count is masked to 6 bits, so the
+	 * shift would do nothing; use an all-ones mask for that case
+	 */
+	if (ce_info->width >= 64)
+		field_mask = ~(u64)0;
+	else
+		field_mask = BIT_ULL(ce_info->width) - 1;
+
+	/* mask and shift in host order; the byte swap happens only when the
+	 * result is merged into the little endian context below
+	 */
+	new_bits = *(u64 *)field;
+	new_bits &= field_mask;
+
+	/* move both the value and its mask to the field's bit position */
+	field_mask <<= bit_off;
+	new_bits <<= bit_off;
+
+	/* fetch the context qword currently holding the field ... */
+	i40e_memcpy(&cur, ctx, sizeof(cur), I40E_DMA_TO_NONDMA);
+
+	/* ... swap in the new field bits and store it back */
+	cur &= ~(CPU_TO_LE64(field_mask));
+	cur |= CPU_TO_LE64(new_bits);
+	i40e_memcpy(ctx, &cur, sizeof(cur), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_read_byte - read HMC context byte into struct
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: a description of the struct to be filled
+ * @dest: the struct to be filled
+ **/
+static void i40e_read_byte(u8 *hmc_bits,
+			   struct i40e_context_ele *ce_info,
+			   u8 *dest)
+{
+	u8 dest_byte, mask;
+	u8 *src, *target;
+	u16 shift_width;
+
+	/* prepare the bits and mask */
+	shift_width = ce_info->lsb % 8;
+	mask = BIT(ce_info->width) - 1;
+
+	/* shift to correct alignment */
+	mask <<= shift_width;
+
+	/* get the current bits from the src bit string */
+	src = hmc_bits + (ce_info->lsb / 8);
+
+	i40e_memcpy(&dest_byte, src, sizeof(dest_byte), I40E_DMA_TO_NONDMA);
+
+	/* keep only the field's own bits, then move them down to bit 0 */
+	dest_byte &= mask;
+	dest_byte >>= shift_width;
+
+	/* get the address from the struct field */
+	target = dest + ce_info->offset;
+
+	/* put it back in the struct */
+	i40e_memcpy(target, &dest_byte, sizeof(dest_byte), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_read_word - read HMC context word into struct
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: field descriptor (lsb/width in the context, offset in @dest)
+ * @dest: the struct to be filled
+ **/
+static void i40e_read_word(u8 *hmc_bits,
+			   struct i40e_context_ele *ce_info,
+			   u8 *dest)
+{
+	u16 dest_word, mask;
+	u8 *src, *target;
+	u16 shift_width;
+	__le16 src_word;
+
+	/* prepare the bits and mask */
+	shift_width = ce_info->lsb % 8;
+	mask = BIT(ce_info->width) - 1;
+
+	/* shift to correct alignment */
+	mask <<= shift_width;
+
+	/* get the current bits from the src bit string */
+	src = hmc_bits + (ce_info->lsb / 8);
+
+	i40e_memcpy(&src_word, src, sizeof(src_word), I40E_DMA_TO_NONDMA);
+
+	/* the data in the memory is stored as little endian, so convert the
+	 * mask to little endian and keep only the bits of this field
+	 */
+	src_word &= CPU_TO_LE16(mask);
+
+	/* get the data back into host order before shifting */
+	dest_word = LE16_TO_CPU(src_word);
+
+	dest_word >>= shift_width;
+
+	/* get the address from the struct field */
+	target = dest + ce_info->offset;
+
+	/* put it back in the struct */
+	i40e_memcpy(target, &dest_word, sizeof(dest_word), I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_read_dword - read HMC context dword into struct
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: field descriptor (lsb/width in the context, offset in @dest)
+ * @dest: the struct to be filled
+ **/
+static void i40e_read_dword(u8 *hmc_bits,
+			    struct i40e_context_ele *ce_info,
+			    u8 *dest)
+{
+	u32 dest_dword, mask;
+	u8 *src, *target;
+	u16 shift_width;
+	__le32 src_dword;
+
+	/* prepare the bits and mask */
+	shift_width = ce_info->lsb % 8;
+
+	/* if the field width is exactly 32 on an x86 machine, then the shift
+	 * operation will not work because the SHL instructions count is masked
+	 * to 5 bits so the shift will do nothing
+	 */
+	if (ce_info->width < 32)
+		mask = BIT(ce_info->width) - 1;
+	else
+		mask = ~(u32)0;
+
+	/* shift to correct alignment */
+	mask <<= shift_width;
+
+	/* get the current bits from the src bit string */
+	src = hmc_bits + (ce_info->lsb / 8);
+
+	i40e_memcpy(&src_dword, src, sizeof(src_dword), I40E_DMA_TO_NONDMA);
+
+	/* the data in the memory is stored as little endian, so convert the
+	 * mask to little endian and keep only the bits of this field
+	 */
+	src_dword &= CPU_TO_LE32(mask);
+
+	/* get the data back into host order before shifting */
+	dest_dword = LE32_TO_CPU(src_dword);
+
+	dest_dword >>= shift_width;
+
+	/* get the address from the struct field */
+	target = dest + ce_info->offset;
+
+	/* put it back in the struct */
+	i40e_memcpy(target, &dest_dword, sizeof(dest_dword),
+		    I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_read_qword - read HMC context qword into struct
+ * @hmc_bits: pointer to the HMC memory
+ * @ce_info: field descriptor (lsb/width in the context, offset in @dest)
+ * @dest: the struct to be filled
+ **/
+static void i40e_read_qword(u8 *hmc_bits,
+			    struct i40e_context_ele *ce_info,
+			    u8 *dest)
+{
+	u64 dest_qword, mask;
+	u8 *src, *target;
+	u16 shift_width;
+	__le64 src_qword;
+
+	/* prepare the bits and mask */
+	shift_width = ce_info->lsb % 8;
+
+	/* if the field width is exactly 64 on an x86 machine, then the shift
+	 * operation will not work because the SHL instructions count is masked
+	 * to 6 bits so the shift will do nothing
+	 */
+	if (ce_info->width < 64)
+		mask = BIT_ULL(ce_info->width) - 1;
+	else
+		mask = ~(u64)0;
+
+	/* shift to correct alignment */
+	mask <<= shift_width;
+
+	/* get the current bits from the src bit string */
+	src = hmc_bits + (ce_info->lsb / 8);
+
+	i40e_memcpy(&src_qword, src, sizeof(src_qword), I40E_DMA_TO_NONDMA);
+
+	/* the data in the memory is stored as little endian, so convert the
+	 * mask to little endian and keep only the bits of this field
+	 */
+	src_qword &= CPU_TO_LE64(mask);
+
+	/* get the data back into host order before shifting */
+	dest_qword = LE64_TO_CPU(src_qword);
+
+	dest_qword >>= shift_width;
+
+	/* get the address from the struct field */
+	target = dest + ce_info->offset;
+
+	/* put it back in the struct */
+	i40e_memcpy(target, &dest_qword, sizeof(dest_qword),
+		    I40E_NONDMA_TO_DMA);
+}
+
+/**
+ * i40e_get_hmc_context - extract HMC context bits
+ * @context_bytes: pointer to the context bit array
+ * @ce_info: a description of the struct to be filled
+ * @dest: the struct to be filled
+ *
+ * Copies every field listed in @ce_info out of @context_bytes and into the
+ * matching member of @dest.  The table ends at the first entry whose width
+ * is zero.  This function has no failure path and always returns
+ * I40E_SUCCESS.
+ **/
+static enum i40e_status_code i40e_get_hmc_context(u8 *context_bytes,
+					struct i40e_context_ele *ce_info,
+					u8 *dest)
+{
+	struct i40e_context_ele *ele;
+
+	/* walk the table up to its terminating entry (width == 0) */
+	for (ele = ce_info; ele->width != 0; ele++) {
+		/* pick the reader that matches the size of the struct
+		 * member; any other size_of value is silently skipped
+		 */
+		if (ele->size_of == 1)
+			i40e_read_byte(context_bytes, ele, dest);
+		else if (ele->size_of == 2)
+			i40e_read_word(context_bytes, ele, dest);
+		else if (ele->size_of == 4)
+			i40e_read_dword(context_bytes, ele, dest);
+		else if (ele->size_of == 8)
+			i40e_read_qword(context_bytes, ele, dest);
+	}
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_clear_hmc_context - zero out the HMC context bits
+ * @hw:       the hardware struct
+ * @context_bytes: pointer to the context bit array (DMA memory)
+ * @hmc_type: the type of HMC resource; selects the object size to clear
+ **/
+static enum i40e_status_code i40e_clear_hmc_context(struct i40e_hw *hw,
+					u8 *context_bytes,
+					enum i40e_hmc_lan_rsrc_type hmc_type)
+{
+	/* size recorded for one object of this resource type */
+	u32 ctx_len = (u32)hw->hmc.hmc_obj[hmc_type].size;
+
+	i40e_memset(context_bytes, 0, ctx_len, I40E_DMA_MEM);
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_set_hmc_context - replace HMC context bits
+ * @context_bytes: pointer to the context bit array
+ * @ce_info:  a description of the struct that supplies the values
+ * @dest:     the struct to read the values from (it is not modified here
+ *            by this function itself, despite the parameter name)
+ *
+ * Writes every field listed in @ce_info from the struct at @dest into its
+ * bit position in @context_bytes.  The table ends at the first entry whose
+ * width is zero; an entry whose size_of is not 1, 2, 4 or 8 is skipped.
+ * This function has no failure path and always returns I40E_SUCCESS.
+ **/
+static enum i40e_status_code i40e_set_hmc_context(u8 *context_bytes,
+					struct i40e_context_ele *ce_info,
+					u8 *dest)
+{
+	struct i40e_context_ele *ele;
+
+	/* walk the table up to its terminating entry (width == 0) */
+	for (ele = ce_info; ele->width != 0; ele++) {
+		/* we have to deal with each element of the HMC using the
+		 * correct size so that we are correct regardless of the
+		 * endianness of the machine
+		 */
+		if (ele->size_of == 1)
+			i40e_write_byte(context_bytes, ele, dest);
+		else if (ele->size_of == 2)
+			i40e_write_word(context_bytes, ele, dest);
+		else if (ele->size_of == 4)
+			i40e_write_dword(context_bytes, ele, dest);
+		else if (ele->size_of == 8)
+			i40e_write_qword(context_bytes, ele, dest);
+	}
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_hmc_get_object_va - retrieves an object's virtual address
+ * @hw: pointer to the hw structure
+ * @object_base: out: receives the object's virtual address (u8 *)
+ * @rsrc_type: the hmc resource type
+ * @obj_idx: hmc object index
+ *
+ * Finds the backing page (paged or direct SD entry) holding the object and
+ * stores page VA + offset in @object_base.  Used for LAN Queue contexts.
+ **/
+static
+enum i40e_status_code i40e_hmc_get_object_va(struct i40e_hw *hw,
+					u8 **object_base,
+					enum i40e_hmc_lan_rsrc_type rsrc_type,
+					u32 obj_idx)
+{
+	u32 obj_offset_in_sd, obj_offset_in_pd;
+	struct i40e_hmc_info     *hmc_info = &hw->hmc;
+	struct i40e_hmc_sd_entry *sd_entry;
+	struct i40e_hmc_pd_entry *pd_entry;
+	u32 pd_idx, pd_lmt, rel_pd_idx;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u64 obj_offset_in_fpm;
+	u32 sd_idx, sd_lmt;
+
+	if (NULL == hmc_info) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info ptr\n");
+		goto exit;
+	}
+	if (NULL == hmc_info->hmc_obj) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info->hmc_obj ptr\n");
+		goto exit;
+	}
+	if (NULL == object_base) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_hmc_get_object_va: bad object_base ptr\n");
+		goto exit;
+	}
+	if (I40E_HMC_INFO_SIGNATURE != hmc_info->signature) {
+		ret_code = I40E_ERR_BAD_PTR;
+		DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info->signature\n");
+		goto exit;
+	}
+	if (obj_idx >= hmc_info->hmc_obj[rsrc_type].cnt) {
+		ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX;
+		DEBUGOUT1("i40e_hmc_get_object_va: returns error %d\n",
+			  ret_code);
+		goto exit;
+	}
+	/* find sd index and limit */
+	I40E_FIND_SD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1,
+				 &sd_idx, &sd_lmt);
+
+	sd_entry = &hmc_info->sd_table.sd_entry[sd_idx];
+	obj_offset_in_fpm = hmc_info->hmc_obj[rsrc_type].base +
+			    hmc_info->hmc_obj[rsrc_type].size * obj_idx;
+
+	if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) {
+		I40E_FIND_PD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1,
+					 &pd_idx, &pd_lmt);
+		rel_pd_idx = pd_idx % I40E_HMC_PD_CNT_IN_SD;
+		pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx];
+		obj_offset_in_pd = (u32)(obj_offset_in_fpm %
+					 I40E_HMC_PAGED_BP_SIZE);
+		*object_base = (u8 *)pd_entry->bp.addr.va + obj_offset_in_pd;
+	} else {
+		obj_offset_in_sd = (u32)(obj_offset_in_fpm %
+					 I40E_HMC_DIRECT_BP_SIZE);
+		*object_base = (u8 *)sd_entry->u.bp.addr.va + obj_offset_in_sd;
+	}
+exit:
+	return ret_code;
+}
+
+/**
+ * i40e_get_lan_tx_queue_context - unpack a Tx queue's HMC context into @s
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ * @s:     the struct to be filled
+ **/
+enum i40e_status_code i40e_get_lan_tx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_txq *s)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_TX, queue);
+	if (status >= 0)
+		status = i40e_get_hmc_context(ctx_va, i40e_hmc_txq_ce_info,
+					      (u8 *)s);
+
+	return status;
+}
+
+/**
+ * i40e_clear_lan_tx_queue_context - zero the HMC context of a Tx queue
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ **/
+enum i40e_status_code i40e_clear_lan_tx_queue_context(struct i40e_hw *hw,
+						      u16 queue)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_TX, queue);
+	if (status >= 0)
+		status = i40e_clear_hmc_context(hw, ctx_va, I40E_HMC_LAN_TX);
+
+	return status;
+}
+
+/**
+ * i40e_set_lan_tx_queue_context - pack @s into a Tx queue's HMC context
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ * @s:     the struct holding the values to write
+ **/
+enum i40e_status_code i40e_set_lan_tx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_txq *s)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_TX, queue);
+	if (status >= 0)
+		status = i40e_set_hmc_context(ctx_va, i40e_hmc_txq_ce_info,
+					      (u8 *)s);
+
+	return status;
+}
+
+/**
+ * i40e_get_lan_rx_queue_context - unpack an Rx queue's HMC context into @s
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ * @s:     the struct to be filled
+ **/
+enum i40e_status_code i40e_get_lan_rx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_rxq *s)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_RX, queue);
+	if (status >= 0)
+		status = i40e_get_hmc_context(ctx_va, i40e_hmc_rxq_ce_info,
+					      (u8 *)s);
+
+	return status;
+}
+
+/**
+ * i40e_clear_lan_rx_queue_context - zero the HMC context of an Rx queue
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ **/
+enum i40e_status_code i40e_clear_lan_rx_queue_context(struct i40e_hw *hw,
+						      u16 queue)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_RX, queue);
+	if (status >= 0)
+		status = i40e_clear_hmc_context(hw, ctx_va, I40E_HMC_LAN_RX);
+
+	return status;
+}
+
+/**
+ * i40e_set_lan_rx_queue_context - pack @s into an Rx queue's HMC context
+ * @hw:    the hardware struct
+ * @queue: the queue we care about (used as the HMC object index)
+ * @s:     the struct holding the values to write
+ **/
+enum i40e_status_code i40e_set_lan_rx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_rxq *s)
+{
+	u8 *ctx_va;
+	enum i40e_status_code status;
+
+	status = i40e_hmc_get_object_va(hw, &ctx_va, I40E_HMC_LAN_RX, queue);
+	if (status >= 0)
+		status = i40e_set_hmc_context(ctx_va, i40e_hmc_rxq_ce_info,
+					      (u8 *)s);
+
+	return status;
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h
new file mode 100644
index 0000000000..2a575264ab
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h
@@ -0,0 +1,201 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2014, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_lan_hmc.h 283119 2015-05-19 18:35:18Z jhb $*/
+
+#ifndef _I40E_LAN_HMC_H_
+#define _I40E_LAN_HMC_H_
+
+/* forward-declare the HW struct for the compiler */
+struct i40e_hw;
+
+/* HMC element context information */
+
+/* Rx queue context data
+ *
+ * Some members are wider than the hardware field they hold.  The context
+ * code shifts each value by up to seven bits (lsb % 8) in a variable of
+ * the member's size, so a member that is too narrow would lose its top bits
+ * when the field starts partway into a byte and crosses into the next one.
+ */
+struct i40e_hmc_obj_rxq {
+	u16 head;
+	u16 cpuid; /* bigger than needed, see above for reason */
+	u64 base;
+	u16 qlen;
+#define I40E_RXQ_CTX_DBUFF_SHIFT 7
+	u16 dbuff; /* bigger than needed, see above for reason */
+#define I40E_RXQ_CTX_HBUFF_SHIFT 6
+	u16 hbuff; /* bigger than needed, see above for reason */
+	u8  dtype;
+	u8  dsize;
+	u8  crcstrip;
+	u8  fc_ena;
+	u8  l2tsel;
+	u8  hsplit_0;	/* see enum i40e_hmc_obj_rx_hsplit_0 */
+	u8  hsplit_1;
+	u8  showiv;
+	u32 rxmax; /* bigger than needed, see above for reason */
+	u8  tphrdesc_ena;
+	u8  tphwdesc_ena;
+	u8  tphdata_ena;
+	u8  tphhead_ena;
+	u16 lrxqthresh; /* bigger than needed, see above for reason */
+	u8  prefena;	/* NOTE: normally must be set to 1 at init */
+};
+
+/* Tx queue context data
+ *
+ * Some members are wider than the hardware field they hold.  The context
+ * code shifts each value by up to seven bits (lsb % 8) in a variable of
+ * the member's size, so a member that is too narrow would lose its top bits
+ * when the field starts partway into a byte and crosses into the next one.
+ */
+struct i40e_hmc_obj_txq {
+	u16 head;
+	u8  new_context;
+	u64 base;
+	u8  fc_ena;
+	u8  timesync_ena;
+	u8  fd_ena;
+	u8  alt_vlan_ena;
+	u16 thead_wb;
+	u8  cpuid;
+	u8  head_wb_ena;
+	u16 qlen;
+	u8  tphrdesc_ena;
+	u8  tphrpacket_ena;
+	u8  tphwdesc_ena;
+	u64 head_wb_addr;
+	u32 crc;
+	u16 rdylist;
+	u8  rdylist_act;
+};
+
+/* values for the hsplit_0 member of struct i40e_hmc_obj_rxq (Rx HMC context) */
+enum i40e_hmc_obj_rx_hsplit_0 {
+	I40E_HMC_OBJ_RX_HSPLIT_0_NO_SPLIT      = 0,
+	I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_L2      = 1,	/* each non-zero value */
+	I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_IP      = 2,	/* is a distinct single */
+	I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_TCP_UDP = 4,	/* bit; presumably they */
+	I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_SCTP    = 8,	/* may be OR-ed: confirm */
+};
+
+/* fcoe_cntx and fcoe_filt are for debugging purpose only */
+struct i40e_hmc_obj_fcoe_cntx {
+	u32 rsv[32];	/* reserved words; no individual fields are defined */
+};
+
+struct i40e_hmc_obj_fcoe_filt {
+	u32 rsv[8];	/* reserved words; no individual fields are defined */
+};
+
+/* Context sizes for LAN objects; each value is log2 of the size in its name */
+enum i40e_hmc_lan_object_size {
+	I40E_HMC_LAN_OBJ_SZ_8   = 0x3,
+	I40E_HMC_LAN_OBJ_SZ_16  = 0x4,
+	I40E_HMC_LAN_OBJ_SZ_32  = 0x5,
+	I40E_HMC_LAN_OBJ_SZ_64  = 0x6,
+	I40E_HMC_LAN_OBJ_SZ_128 = 0x7,
+	I40E_HMC_LAN_OBJ_SZ_256 = 0x8,
+	I40E_HMC_LAN_OBJ_SZ_512 = 0x9,
+};
+
+#define I40E_HMC_L2OBJ_BASE_ALIGNMENT 512
+#define I40E_HMC_OBJ_SIZE_TXQ         128
+#define I40E_HMC_OBJ_SIZE_RXQ         32
+#define I40E_HMC_OBJ_SIZE_FCOE_CNTX   64
+#define I40E_HMC_OBJ_SIZE_FCOE_FILT   64
+
+enum i40e_hmc_lan_rsrc_type {	/* used as an index into hmc.hmc_obj[] */
+	I40E_HMC_LAN_FULL  = 0,
+	I40E_HMC_LAN_TX    = 1,	/* Tx queue contexts */
+	I40E_HMC_LAN_RX    = 2,	/* Rx queue contexts */
+	I40E_HMC_FCOE_CTX  = 3,
+	I40E_HMC_FCOE_FILT = 4,
+	I40E_HMC_LAN_MAX   = 5	/* count of the resource types above */
+};
+
+enum i40e_hmc_model {	/* argument type of i40e_configure_lan_hmc() */
+	I40E_HMC_MODEL_DIRECT_PREFERRED = 0,
+	I40E_HMC_MODEL_DIRECT_ONLY      = 1,
+	I40E_HMC_MODEL_PAGED_ONLY       = 2,
+	I40E_HMC_MODEL_UNKNOWN,	/* implicitly 3 */
+};
+
+struct i40e_hmc_lan_create_obj_info {	/* for i40e_create_lan_hmc_object() */
+	struct i40e_hmc_info *hmc_info;
+	u32 rsrc_type;	/* presumably an enum i40e_hmc_lan_rsrc_type: confirm */
+	u32 start_idx;	/* presumably first object index affected: confirm */
+	u32 count;	/* presumably number of objects from start_idx: confirm */
+	enum i40e_sd_entry_type entry_type;
+	u64 direct_mode_sz;
+};
+
+struct i40e_hmc_lan_delete_obj_info {	/* for i40e_delete_lan_hmc_object() */
+	struct i40e_hmc_info *hmc_info;
+	u32 rsrc_type;	/* presumably an enum i40e_hmc_lan_rsrc_type: confirm */
+	u32 start_idx;	/* presumably first object index affected: confirm */
+	u32 count;	/* presumably number of objects from start_idx: confirm */
+};
+
+enum i40e_status_code i40e_init_lan_hmc(struct i40e_hw *hw, u32 txq_num,
+					u32 rxq_num, u32 fcoe_cntx_num,
+					u32 fcoe_filt_num);
+enum i40e_status_code i40e_configure_lan_hmc(struct i40e_hw *hw,
+					     enum i40e_hmc_model model);
+enum i40e_status_code i40e_shutdown_lan_hmc(struct i40e_hw *hw);
+
+u64 i40e_calculate_l2fpm_size(u32 txq_num, u32 rxq_num,
+			      u32 fcoe_cntx_num, u32 fcoe_filt_num);
+enum i40e_status_code i40e_get_lan_tx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_txq *s);
+enum i40e_status_code i40e_clear_lan_tx_queue_context(struct i40e_hw *hw,
+						      u16 queue);
+enum i40e_status_code i40e_set_lan_tx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_txq *s);
+enum i40e_status_code i40e_get_lan_rx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_rxq *s);
+enum i40e_status_code i40e_clear_lan_rx_queue_context(struct i40e_hw *hw,
+						      u16 queue);
+enum i40e_status_code i40e_set_lan_rx_queue_context(struct i40e_hw *hw,
+						    u16 queue,
+						    struct i40e_hmc_obj_rxq *s);
+enum i40e_status_code i40e_create_lan_hmc_object(struct i40e_hw *hw,
+				struct i40e_hmc_lan_create_obj_info *info);
+enum i40e_status_code i40e_delete_lan_hmc_object(struct i40e_hw *hw,
+				struct i40e_hmc_lan_delete_obj_info *info);
+
+#endif /* _I40E_LAN_HMC_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_nvm.c b/usr/src/uts/common/io/i40e/core/i40e_nvm.c
new file mode 100644
index 0000000000..04d61bb969
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_nvm.c
@@ -0,0 +1,712 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_nvm.c 284049 2015-06-05 22:52:42Z jfv $*/
+
+#include "i40e_prototype.h"
+
+enum i40e_status_code i40e_read_nvm_word_srctl(struct i40e_hw *hw, u16 offset,
+					       u16 *data);
+enum i40e_status_code i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset,
+					    u16 *data);
+enum i40e_status_code i40e_read_nvm_buffer_srctl(struct i40e_hw *hw, u16 offset,
+						 u16 *words, u16 *data);
+enum i40e_status_code i40e_read_nvm_buffer_aq(struct i40e_hw *hw, u16 offset,
+					      u16 *words, u16 *data);
+enum i40e_status_code i40e_read_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
+				       u32 offset, u16 words, void *data,
+				       bool last_command);
+
+/**
+ * i40e_init_nvm - Initialize the NVM info structure
+ * @hw: pointer to the HW structure
+ *
+ * Fills hw->nvm (Shadow RAM size, timeout, blank-mode flag) from GLNVM_GENS
+ * and GLNVM_FLA. Should be called once per NVM initialization, e.g. inside
+ * i40e_init_shared_code(). The NVM term is used here (& in all methods in
+ * this file) as an equivalent of the FLASH part mapped into the SR.
+ * We are accessing FLASH always thru the Shadow RAM.
+ **/
+enum i40e_status_code i40e_init_nvm(struct i40e_hw *hw)
+{
+	struct i40e_nvm_info *nvm = &hw->nvm;
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u32 fla, gens;
+	u8 sr_size;
+
+	DEBUGFUNC("i40e_init_nvm");
+
+	/* The SR size is stored regardless of the nvm programming mode
+	 * as the blank mode may be used in the factory line.
+	 */
+	gens = rd32(hw, I40E_GLNVM_GENS);
+	sr_size = ((gens & I40E_GLNVM_GENS_SR_SIZE_MASK) >>
+			   I40E_GLNVM_GENS_SR_SIZE_SHIFT);
+	/* Switching to words (the register field is log2 of the size in KB) */
+	nvm->sr_size = BIT(sr_size) * I40E_SR_WORDS_IN_1KB;
+
+	/* Check if we are in the normal or blank NVM programming mode */
+	fla = rd32(hw, I40E_GLNVM_FLA);
+	if (fla & I40E_GLNVM_FLA_LOCKED_MASK) { /* Normal programming mode */
+		/* Max NVM timeout */
+		nvm->timeout = I40E_MAX_NVM_TIMEOUT;
+		nvm->blank_nvm_mode = FALSE;
+	} else { /* Blank programming mode */
+		nvm->blank_nvm_mode = TRUE;
+		ret_code = I40E_ERR_NVM_BLANK_MODE;
+		i40e_debug(hw, I40E_DEBUG_NVM, "NVM init error: unsupported blank mode.\n");
+	}
+
+	return ret_code;
+}
+
+/**
+ * i40e_acquire_nvm - Generic request for acquiring the NVM ownership
+ * @hw: pointer to the HW structure
+ * @access: NVM access type (read or write)
+ *
+ * Requests NVM ownership for the given access type via the Admin Queue,
+ * retrying every 10 ms while the request fails and time_left is non-zero.
+ **/
+enum i40e_status_code i40e_acquire_nvm(struct i40e_hw *hw,
+				       enum i40e_aq_resource_access_type access)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u64 gtime, timeout;
+	u64 time_left = 0;
+
+	DEBUGFUNC("i40e_acquire_nvm");
+
+	if (hw->nvm.blank_nvm_mode)
+		goto i40e_i40e_acquire_nvm_exit;	/* returns I40E_SUCCESS */
+
+	ret_code = i40e_aq_request_resource(hw, I40E_NVM_RESOURCE_ID, access,
+					    0, &time_left, NULL);
+	/* Reading the Global Device Timer */
+	gtime = rd32(hw, I40E_GLVFGEN_TIMER);
+
+	/* Store the timeout (time_left converted to timer units, from now) */
+	hw->nvm.hw_semaphore_timeout = I40E_MS_TO_GTIME(time_left) + gtime;
+
+	if (ret_code)
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM acquire type %d failed time_left=%llu ret=%d aq_err=%d\n",
+			   access, time_left, ret_code, hw->aq.asq_last_status);
+
+	if (ret_code && time_left) {
+		/* Poll until the current NVM owner times out or we succeed */
+		timeout = I40E_MS_TO_GTIME(I40E_MAX_NVM_TIMEOUT) + gtime;
+		while ((gtime < timeout) && time_left) {
+			i40e_msec_delay(10);
+			gtime = rd32(hw, I40E_GLVFGEN_TIMER);
+			ret_code = i40e_aq_request_resource(hw,
+							I40E_NVM_RESOURCE_ID,
+							access, 0, &time_left,
+							NULL);
+			if (ret_code == I40E_SUCCESS) {
+				hw->nvm.hw_semaphore_timeout =
+					    I40E_MS_TO_GTIME(time_left) + gtime;
+				break;
+			}
+		}
+		if (ret_code != I40E_SUCCESS) {
+			hw->nvm.hw_semaphore_timeout = 0;
+			i40e_debug(hw, I40E_DEBUG_NVM,
+				   "NVM acquire timed out, wait %llu ms before trying again. status=%d aq_err=%d\n",
+				   time_left, ret_code, hw->aq.asq_last_status);
+		}
+	}
+
+i40e_i40e_acquire_nvm_exit:
+	return ret_code;
+}
+
+/**
+ * i40e_release_nvm - Generic request for releasing the NVM ownership
+ * @hw: pointer to the HW structure
+ *
+ * Releases the NVM resource via the AdminQ; the final status is not returned.
+ **/
+void i40e_release_nvm(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u32 total_delay = 0;
+
+	DEBUGFUNC("i40e_release_nvm");
+
+	if (hw->nvm.blank_nvm_mode)
+		return;
+
+	ret_code = i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL);
+
+	/* there are some rare cases when trying to release the resource
+	 * results in an admin Q timeout, so retry once per millisecond, at
+	 * most hw->aq.asq_cmd_timeout times
+	 */
+	while ((ret_code == I40E_ERR_ADMIN_QUEUE_TIMEOUT) &&
+	       (total_delay < hw->aq.asq_cmd_timeout)) {
+			i40e_msec_delay(1);
+			ret_code = i40e_aq_release_resource(hw,
+						I40E_NVM_RESOURCE_ID, 0, NULL);
+			total_delay++;
+	}
+}
+
+/**
+ * i40e_poll_sr_srctl_done_bit - Polls the GLNVM_SRCTL done bit
+ * @hw: pointer to the HW structure
+ *
+ * Returns I40E_SUCCESS as soon as the done bit reads as set, or
+ * I40E_ERR_TIMEOUT after I40E_SRRD_SRCTL_ATTEMPTS unsuccessful reads.
+ **/
+static enum i40e_status_code i40e_poll_sr_srctl_done_bit(struct i40e_hw *hw)
+{
+	u32 attempt;
+
+	DEBUGFUNC("i40e_poll_sr_srctl_done_bit");
+
+	/* Sample the register, pausing 5 us after every read that finds
+	 * the done bit still clear
+	 */
+	for (attempt = 0; attempt < I40E_SRRD_SRCTL_ATTEMPTS; attempt++) {
+		if (rd32(hw, I40E_GLNVM_SRCTL) & I40E_GLNVM_SRCTL_DONE_MASK)
+			return I40E_SUCCESS;
+		i40e_usec_delay(5);
+	}
+
+	/* every attempt saw the done bit clear */
+	i40e_debug(hw, I40E_DEBUG_NVM, "Done bit in GLNVM_SRCTL not set");
+	return I40E_ERR_TIMEOUT;
+}
+
+/**
+ * i40e_read_nvm_word - Reads Shadow RAM
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF)
+ * @data: word read from the Shadow RAM
+ *
+ * Reads one 16-bit word: via AQ on X722 (if X722_SUPPORT), else GLNVM_SRCTL.
+ **/
+enum i40e_status_code i40e_read_nvm_word(struct i40e_hw *hw, u16 offset,
+					 u16 *data)
+{
+#ifdef X722_SUPPORT
+	if (hw->mac.type == I40E_MAC_X722)
+		return i40e_read_nvm_word_aq(hw, offset, data);
+#endif
+	return i40e_read_nvm_word_srctl(hw, offset, data);
+}
+
+/**
+ * i40e_read_nvm_word_srctl - Reads Shadow RAM via SRCTL register
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read; must be below nvm.sr_size
+ * @data: word read from the Shadow RAM (written only on success)
+ *
+ * Reads one 16 bit word from the Shadow RAM using the GLNVM_SRCTL register.
+ **/
+enum i40e_status_code i40e_read_nvm_word_srctl(struct i40e_hw *hw, u16 offset,
+					       u16 *data)
+{
+	enum i40e_status_code ret_code = I40E_ERR_TIMEOUT;
+	u32 sr_reg;
+
+	DEBUGFUNC("i40e_read_nvm_word_srctl");
+
+	if (offset >= hw->nvm.sr_size) {
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM read error: Offset %d beyond Shadow RAM limit %d\n",
+			   offset, hw->nvm.sr_size);
+		ret_code = I40E_ERR_PARAM;
+		goto read_nvm_exit;
+	}
+
+	/* Poll the done bit before starting a new read */
+	ret_code = i40e_poll_sr_srctl_done_bit(hw);
+	if (ret_code == I40E_SUCCESS) {
+		/* Write the address and start reading */
+		sr_reg = ((u32)offset << I40E_GLNVM_SRCTL_ADDR_SHIFT) |
+			 BIT(I40E_GLNVM_SRCTL_START_SHIFT);
+		wr32(hw, I40E_GLNVM_SRCTL, sr_reg);
+
+		/* Poll I40E_GLNVM_SRCTL until the done bit is set */
+		ret_code = i40e_poll_sr_srctl_done_bit(hw);
+		if (ret_code == I40E_SUCCESS) {
+			sr_reg = rd32(hw, I40E_GLNVM_SRDATA);
+			*data = (u16)((sr_reg &
+				       I40E_GLNVM_SRDATA_RDDATA_MASK)
+				    >> I40E_GLNVM_SRDATA_RDDATA_SHIFT);
+		}
+	}
+	if (ret_code != I40E_SUCCESS)
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM read error: Couldn't access Shadow RAM address: 0x%x\n",
+			   offset);
+
+read_nvm_exit:
+	return ret_code;
+}
+
+/**
+ * i40e_read_nvm_word_aq - Reads Shadow RAM via AQ
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF)
+ * @data: word read from the Shadow RAM, in host byte order (success only)
+ *
+ * Reads one 16 bit word from the Shadow RAM using i40e_read_nvm_aq().
+ **/
+enum i40e_status_code i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset,
+					    u16 *data)
+{
+	enum i40e_status_code ret_code;
+
+	DEBUGFUNC("i40e_read_nvm_word_aq");
+
+	ret_code = i40e_read_nvm_aq(hw, 0x0, offset, 1, data, TRUE);
+	if (ret_code == I40E_SUCCESS)	/* don't touch *data on failure */
+		*data = LE16_TO_CPU(*(__le16 *)data);
+	return ret_code;
+}
+
+/**
+ * i40e_read_nvm_buffer - Reads Shadow RAM buffer
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF).
+ * @words: (in) number of words to read; (out) number of words actually read
+ * @data: words read from the Shadow RAM
+ *
+ * Reads 16 bit words (data buffer) from the SR, via the AdminQ on X722 when
+ * X722_SUPPORT is defined and via GLNVM_SRCTL otherwise.  This function
+ * itself does not take or release NVM ownership.
+ **/
+enum i40e_status_code i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
+					   u16 *words, u16 *data)
+{
+#ifdef X722_SUPPORT
+	if (hw->mac.type == I40E_MAC_X722)
+		return i40e_read_nvm_buffer_aq(hw, offset, words, data);
+#endif
+	return i40e_read_nvm_buffer_srctl(hw, offset, words, data);
+}
+
+/**
+ * i40e_read_nvm_buffer_srctl - Reads Shadow RAM buffer via SRCTL register
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF).
+ * @words: (in) number of words to read; (out) number of words actually read
+ * @data: words read from the Shadow RAM
+ *
+ * Reads 16 bit words (data buffer) from the SR one at a time with
+ * i40e_read_nvm_word_srctl() and stops at the first word that fails.
+ * This function itself does not take or release NVM ownership.
+ **/
+enum i40e_status_code i40e_read_nvm_buffer_srctl(struct i40e_hw *hw, u16 offset,
+						 u16 *words, u16 *data)
+{
+	enum i40e_status_code status = I40E_SUCCESS;
+	u16 done = 0;
+
+	DEBUGFUNC("i40e_read_nvm_buffer_srctl");
+
+	/* Read word by word; stop at the first word that fails */
+	while (done < *words) {
+		status = i40e_read_nvm_word_srctl(hw, (u16)(offset + done),
+						  data + done);
+		if (status != I40E_SUCCESS)
+			break;
+		done++;
+	}
+
+	/* Report how many words were read successfully */
+	*words = done;
+	return status;
+}
+
+/**
+ * i40e_read_nvm_buffer_aq - Reads Shadow RAM buffer via AQ
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF).
+ * @words: (in) number of words to read; (out) number of words actually read
+ * @data: words read from the Shadow RAM
+ *
+ * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_aq()
+ * method, one sector-bounded chunk at a time.  If a chunk fails, the words
+ * already read are left unconverted (still little endian) in @data.
+ **/
+enum i40e_status_code i40e_read_nvm_buffer_aq(struct i40e_hw *hw, u16 offset,
+					      u16 *words, u16 *data)
+{
+	enum i40e_status_code ret_code;
+	u16 read_size = *words;
+	bool last_cmd = FALSE;
+	u16 words_read = 0;
+	u16 i = 0;
+
+	DEBUGFUNC("i40e_read_nvm_buffer_aq");
+
+	do {
+		/* Calculate number of words we should read in this step.
+		 * FVL AQ do not allow to read more than one page at a time or
+		 * to cross page boundaries.
+		 */
+		if (offset % I40E_SR_SECTOR_SIZE_IN_WORDS)
+			read_size = min(*words,
+					(u16)(I40E_SR_SECTOR_SIZE_IN_WORDS -
+				      (offset % I40E_SR_SECTOR_SIZE_IN_WORDS)));
+		else
+			read_size = min((*words - words_read),
+					I40E_SR_SECTOR_SIZE_IN_WORDS);
+
+		/* Check if this is last command, if so set proper flag */
+		if ((words_read + read_size) >= *words)
+			last_cmd = TRUE;
+
+		ret_code = i40e_read_nvm_aq(hw, 0x0, offset, read_size,
+					    data + words_read, last_cmd);
+		if (ret_code != I40E_SUCCESS)
+			goto read_nvm_buffer_aq_exit;
+
+		/* Increment counter for words already read and move offset to
+		 * new read location
+		 */
+		words_read += read_size;
+		offset += read_size;
+	} while (words_read < *words);
+
+	for (i = 0; i < *words; i++)	/* convert in place to host order */
+		data[i] = LE16_TO_CPU(((__le16 *)data)[i]);
+
+read_nvm_buffer_aq_exit:
+	*words = words_read;
+	return ret_code;
+}
+
+/**
+ * i40e_read_nvm_aq - Read Shadow RAM.
+ * @hw: pointer to the HW structure.
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: offset in words from module start
+ * @words: number of words to read
+ * @data: buffer that receives the words read from the Shadow RAM
+ * @last_command: tells the AdminQ that this is the last command
+ *
+ * Reads a 16 bit words buffer from the Shadow RAM using the admin command.
+ **/
+enum i40e_status_code i40e_read_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
+				       u32 offset, u16 words, void *data,
+				       bool last_command)
+{
+	enum i40e_status_code ret_code = I40E_ERR_NVM;
+	struct i40e_asq_cmd_details cmd_details;
+
+	DEBUGFUNC("i40e_read_nvm_aq");
+
+	memset(&cmd_details, 0, sizeof(cmd_details));
+	cmd_details.wb_desc = &hw->nvm_wb_desc;
+
+	/* Here we are checking the SR limit only for the flat memory model.
+	 * We cannot do it for the module-based model, as we did not acquire
+	 * the NVM resource yet (we cannot get the module pointer value).
+	 * Firmware will check the module-based model.
+	 */
+	if ((offset + words) > hw->nvm.sr_size)
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM read error: offset %d beyond Shadow RAM limit %d\n",
+			   (offset + words), hw->nvm.sr_size);
+	else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS)
+		/* We can read only up to 4KB (one sector), in one AQ read */
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM read fail error: tried to read %d words, limit is %d.\n",
+			   words, I40E_SR_SECTOR_SIZE_IN_WORDS);
+	else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS)
+		 != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS))
+		/* A single read cannot spread over two sectors */
+		i40e_debug(hw, I40E_DEBUG_NVM,
+			   "NVM read error: cannot spread over two sectors in a single read offset=%d words=%d\n",
+			   offset, words);
+	else
+		ret_code = i40e_aq_read_nvm(hw, module_pointer,
+					    2 * offset,  /*bytes*/
+					    2 * words,   /*bytes*/
+					    data, last_command, &cmd_details);
+
+	return ret_code;
+}
+
+/**
+ * i40e_write_nvm_aq - Writes Shadow RAM.
+ * @hw: pointer to the HW structure.
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: offset in words from module start
+ * @words: number of words to write
+ * @data: buffer with words to write to the Shadow RAM
+ * @last_command: tells the AdminQ that this is the last command
+ *
+ * Range-checks the request, then writes it with i40e_aq_update_nvm().
+ **/
+enum i40e_status_code i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
+					u32 offset, u16 words, void *data,
+					bool last_command)
+{
+	struct i40e_asq_cmd_details cmd_details;
+
+	DEBUGFUNC("i40e_write_nvm_aq");
+
+	memset(&cmd_details, 0, sizeof(cmd_details));
+	cmd_details.wb_desc = &hw->nvm_wb_desc;
+
+	/* Only the flat memory model can be limit-checked here; for the
+	 * module-based model the firmware performs the check.
+	 */
+	if ((offset + words) > hw->nvm.sr_size) {
+		DEBUGOUT("NVM write error: offset beyond Shadow RAM limit.\n");
+		return I40E_ERR_NVM;
+	}
+	/* At most one sector (4KB) can be written by one AQ write */
+	if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) {
+		DEBUGOUT("NVM write fail error: cannot write more than 4KB in a single write.\n");
+		return I40E_ERR_NVM;
+	}
+	/* First and last word must live in the same sector */
+	if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS)
+	    != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) {
+		DEBUGOUT("NVM write error: cannot spread over two sectors in a single write.\n");
+		return I40E_ERR_NVM;
+	}
+
+	return i40e_aq_update_nvm(hw, module_pointer,
+				  2 * offset, 2 * words,	/* in bytes */
+				  data, last_command, &cmd_details);
+}
+
+/**
+ * i40e_write_nvm_word - Writes Shadow RAM word
+ * @hw: pointer to the HW structure
+ * @offset: offset of the Shadow RAM word to write
+ * @data: word to write; converted to little endian in place
+ *
+ * Converts the caller's word and passes it to i40e_write_nvm_aq() with module
+ * pointer 0x00 (SR treated as flat memory) and last_command FALSE. NVM
+ * ownership has to be acquired and released by the caller, and the update
+ * checksum function should be called to commit SR to NVM.
+ **/
+enum i40e_status_code i40e_write_nvm_word(struct i40e_hw *hw, u32 offset,
+					  void *data)
+{
+	__le16 *le_word = (__le16 *)data;
+
+	DEBUGFUNC("i40e_write_nvm_word");
+
+	*le_word = CPU_TO_LE16(*(u16 *)data);
+	return i40e_write_nvm_aq(hw, 0x00, offset, 1, data, FALSE);
+}
+
+/**
+ * i40e_write_nvm_buffer - Writes Shadow RAM buffer
+ * @hw: pointer to the HW structure
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: offset of the Shadow RAM buffer to write
+ * @words: number of words to write
+ * @data: words to write; converted to little endian in place
+ *
+ * Converts the buffer and writes it with a single i40e_write_nvm_aq() call
+ * (last_command FALSE). NVM ownership must be acquired before calling this
+ * function and released on ARQ completion event reception by caller. To
+ * commit SR to NVM update checksum function should be called.
+ **/
+enum i40e_status_code i40e_write_nvm_buffer(struct i40e_hw *hw,
+					    u8 module_pointer, u32 offset,
+					    u16 words, void *data)
+{
+	u16 *cursor = (u16 *)data;
+	u16 left;
+
+	DEBUGFUNC("i40e_write_nvm_buffer");
+
+	/* Walk the caller's buffer, swapping each word where it sits */
+	for (left = words; left != 0; left--, cursor++)
+		*(__le16 *)cursor = CPU_TO_LE16(*cursor);
+
+	/* One AQ write is enough: modules mirrored in the Shadow RAM
+	 * are always smaller than 4K.
+	 */
+	return i40e_write_nvm_aq(hw, module_pointer, offset, words,
+				 data, FALSE);
+}
+
+/**
+ * i40e_calc_nvm_checksum - Calculates and returns the checksum
+ * @hw: pointer to hardware structure
+ * @checksum: (out) calculated checksum; written only on success
+ *
+ * Calculates the SW checksum over the whole shadow RAM except the checksum
+ * word and the VPD and PCIe ALT Auto-load modules. The structure and size of
+ * VPD is customer specific and unknown, so its maximum possible size is
+ * skipped. A failed NVM read is reported as I40E_ERR_NVM_CHECKSUM.
+ **/
+enum i40e_status_code i40e_calc_nvm_checksum(struct i40e_hw *hw, u16 *checksum)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_virt_mem vmem;
+	u16 pcie_alt_module = 0;
+	u16 checksum_local = 0;
+	u16 vpd_module = 0;
+	u16 *data;
+	u16 i = 0;
+
+	DEBUGFUNC("i40e_calc_nvm_checksum");
+
+	ret_code = i40e_allocate_virt_mem(hw, &vmem,
+				    I40E_SR_SECTOR_SIZE_IN_WORDS * sizeof(u16));
+	if (ret_code)
+		return ret_code;	/* vmem is not valid to free */
+	data = (u16 *)vmem.va;
+
+	/* read pointer to VPD area */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_VPD_PTR, &vpd_module);
+	if (ret_code != I40E_SUCCESS) {
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+		goto i40e_calc_nvm_checksum_exit;
+	}
+
+	/* read pointer to PCIe Alt Auto-load module */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_PCIE_ALT_AUTO_LOAD_PTR,
+				      &pcie_alt_module);
+	if (ret_code != I40E_SUCCESS) {
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+		goto i40e_calc_nvm_checksum_exit;
+	}
+
+	/* Calculate SW checksum that covers the whole 64kB shadow RAM
+	 * except the VPD and PCIe ALT Auto-load modules
+	 */
+	for (i = 0; i < hw->nvm.sr_size; i++) {
+		/* Read SR page */
+		if ((i % I40E_SR_SECTOR_SIZE_IN_WORDS) == 0) {
+			u16 words = I40E_SR_SECTOR_SIZE_IN_WORDS;
+
+			ret_code = i40e_read_nvm_buffer(hw, i, &words, data);
+			if (ret_code != I40E_SUCCESS) {
+				ret_code = I40E_ERR_NVM_CHECKSUM;
+				goto i40e_calc_nvm_checksum_exit;
+			}
+		}
+
+		/* Skip Checksum word */
+		if (i == I40E_SR_SW_CHECKSUM_WORD)
+			continue;
+		/* Skip VPD module (convert byte size to word count) */
+		if ((i >= (u32)vpd_module) &&
+		    (i < ((u32)vpd_module +
+		     (I40E_SR_VPD_MODULE_MAX_SIZE / 2)))) {
+			continue;
+		}
+		/* Skip PCIe ALT module (convert byte size to word count) */
+		if ((i >= (u32)pcie_alt_module) &&
+		    (i < ((u32)pcie_alt_module +
+		     (I40E_SR_PCIE_ALT_MODULE_MAX_SIZE / 2)))) {
+			continue;
+		}
+
+		checksum_local += data[i % I40E_SR_SECTOR_SIZE_IN_WORDS];
+	}
+
+	*checksum = (u16)I40E_SR_SW_CHECKSUM_BASE - checksum_local;
+
+i40e_calc_nvm_checksum_exit:
+	i40e_free_virt_mem(hw, &vmem);
+	return ret_code;
+}
+
+/**
+ * i40e_update_nvm_checksum - Updates the NVM checksum
+ * @hw: pointer to hardware structure
+ *
+ * Recalculates the SW checksum and writes it to the checksum word with the
+ * last-command flag set, which commits SR to NVM. NVM ownership must be
+ * acquired by the caller and released on ARQ completion event reception.
+ **/
+enum i40e_status_code i40e_update_nvm_checksum(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u16 checksum;
+	__le16 le_sum;
+
+	DEBUGFUNC("i40e_update_nvm_checksum");
+
+	ret_code = i40e_calc_nvm_checksum(hw, &checksum);
+	if (ret_code != I40E_SUCCESS)
+		return ret_code;	/* checksum was not produced */
+
+	le_sum = CPU_TO_LE16(checksum);
+	return i40e_write_nvm_aq(hw, 0x00, I40E_SR_SW_CHECKSUM_WORD,
+				 1, &le_sum, TRUE);
+}
+
+/**
+ * i40e_validate_nvm_checksum - Validate EEPROM checksum
+ * @hw: pointer to hardware structure
+ * @checksum: (out) calculated checksum; may be NULL if the caller has no use
+ *
+ * Calculates the SW checksum and compares it with the checksum word read from
+ * the Shadow RAM; a mismatch or a failed read is I40E_ERR_NVM_CHECKSUM.
+ **/
+enum i40e_status_code i40e_validate_nvm_checksum(struct i40e_hw *hw,
+						 u16 *checksum)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u16 checksum_sr = 0;
+	u16 checksum_local = 0;
+
+	DEBUGFUNC("i40e_validate_nvm_checksum");
+
+	ret_code = i40e_calc_nvm_checksum(hw, &checksum_local);
+	if (ret_code != I40E_SUCCESS)
+		goto i40e_validate_nvm_checksum_exit;
+
+	/* Fetch the checksum stored in the Shadow RAM. A failed read must
+	 * not be able to pass validation by leaving checksum_sr at its
+	 * initial value, so it is reported as a checksum error below.
+	 */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_SW_CHECKSUM_WORD,
+				      &checksum_sr);
+
+	/* Stored and calculated checksums have to agree */
+	if ((ret_code != I40E_SUCCESS) || (checksum_local != checksum_sr))
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+
+	/* If the user cares, return the calculated checksum */
+	if (checksum)
+		*checksum = checksum_local;
+
+i40e_validate_nvm_checksum_exit:
+	return ret_code;
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_prototype.h b/usr/src/uts/common/io/i40e/core/i40e_prototype.h
new file mode 100644
index 0000000000..6f1cfc3afe
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_prototype.h
@@ -0,0 +1,478 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_prototype.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_PROTOTYPE_H_
+#define _I40E_PROTOTYPE_H_
+
+#include "i40e_type.h"
+#include "i40e_alloc.h"
+#include "i40e_virtchnl.h"
+
+/* Prototypes for shared code functions that are not in
+ * the standard function pointer structures.  Most of them
+ * are declared here because they are needed before init
+ * has happened, where they assist in the early SW and FW
+ * setup.
+ */
+
+/* adminq functions */
+enum i40e_status_code i40e_init_adminq(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_adminq(struct i40e_hw *hw);
+enum i40e_status_code i40e_init_asq(struct i40e_hw *hw);
+enum i40e_status_code i40e_init_arq(struct i40e_hw *hw);
+enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw);
+enum i40e_status_code i40e_alloc_adminq_arq_ring(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_asq(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_arq(struct i40e_hw *hw);
+u16 i40e_clean_asq(struct i40e_hw *hw);
+void i40e_free_adminq_asq(struct i40e_hw *hw);
+void i40e_free_adminq_arq(struct i40e_hw *hw);
+enum i40e_status_code i40e_validate_mac_addr(u8 *mac_addr);
+void i40e_adminq_init_ring_data(struct i40e_hw *hw);
+enum i40e_status_code i40e_clean_arq_element(struct i40e_hw *hw,
+					     struct i40e_arq_event_info *e,
+					     u16 *events_pending);
+enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw,
+				struct i40e_aq_desc *desc,
+				void *buff, /* can be NULL */
+				u16  buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+bool i40e_asq_done(struct i40e_hw *hw);
+
+/* debug function for adminq */
+void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask,
+		   void *desc, void *buffer, u16 buf_len);
+
+void i40e_idle_aq(struct i40e_hw *hw);
+void i40e_resume_aq(struct i40e_hw *hw);
+bool i40e_check_asq_alive(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_queue_shutdown(struct i40e_hw *hw, bool unloading);
+#ifdef X722_SUPPORT
+
+enum i40e_status_code i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 seid,
+					  bool pf_lut, u8 *lut, u16 lut_size);
+enum i40e_status_code i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 seid,
+					  bool pf_lut, u8 *lut, u16 lut_size);
+enum i40e_status_code i40e_aq_get_rss_key(struct i40e_hw *hw,
+				     u16 seid,
+				     struct i40e_aqc_get_set_rss_key_data *key);
+enum i40e_status_code i40e_aq_set_rss_key(struct i40e_hw *hw,
+				     u16 seid,
+				     struct i40e_aqc_get_set_rss_key_data *key);
+#endif
+char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err);
+char *i40e_stat_str(struct i40e_hw *hw, enum i40e_status_code stat_err);
+
+
+u32 i40e_led_get(struct i40e_hw *hw);
+void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink);
+
+/* admin send queue commands */
+
+enum i40e_status_code i40e_aq_get_firmware_version(struct i40e_hw *hw,
+				u16 *fw_major_version, u16 *fw_minor_version,
+				u32 *fw_build,
+				u16 *api_major_version, u16 *api_minor_version,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_write_register(struct i40e_hw *hw,
+				u32 reg_addr, u64 reg_val,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_read_register(struct i40e_hw *hw,
+				u32  reg_addr, u64 *reg_val,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_default_vsi(struct i40e_hw *hw, u16 vsi_id,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
+			bool qualified_modules, bool report_init,
+			struct i40e_aq_get_phy_abilities_resp *abilities,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
+				struct i40e_aq_set_phy_config *config,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
+				  bool atomic_reset);
+enum i40e_status_code i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_mac_config(struct i40e_hw *hw,
+				u16 max_frame_size, bool crc_en, u16 pacing,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_local_advt_reg(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_partner_advt(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_lb_modes(struct i40e_hw *hw, u16 lb_modes,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_link_restart_an(struct i40e_hw *hw,
+		bool enable_link, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_link_info(struct i40e_hw *hw,
+				bool enable_lse, struct i40e_link_status *link,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_local_advt_reg(struct i40e_hw *hw,
+				u64 advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_send_driver_version(struct i40e_hw *hw,
+				struct i40e_driver_version *dv,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_vsi(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
+				u16 vsi_id, bool set_filter,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
+		u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw,
+		u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
+				u16 downlink_seid, u8 enabled_tc,
+				bool default_port, bool enable_l2_filtering,
+				u16 *pveb_seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_veb_parameters(struct i40e_hw *hw,
+				u16 veb_seid, u16 *switch_id, bool *floating,
+				u16 *statistic_index, u16 *vebs_used,
+				u16 *vebs_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_macvlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_remove_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_vlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_vlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
+				u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_switch_config(struct i40e_hw *hw,
+				struct i40e_aqc_get_switch_config_resp *buf,
+				u16 buf_size, u16 *start_seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_request_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				enum i40e_aq_resource_access_type access,
+				u8 sdp_number, u64 *timeout,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_release_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				u8 sdp_number,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_read_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, u32 field_id, void *data,
+				u16 buf_size, u16 *element_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_write_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, void *data, u16 buf_size,
+				u16 element_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_oem_post_update(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_discover_capabilities(struct i40e_hw *hw,
+				void *buff, u16 buff_size, u16 *data_size,
+				enum i40e_admin_queue_opc list_type_opc,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
+				u8 mib_type, void *buff, u16 buff_size,
+				u16 *local_len, u16 *remote_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_lldp_mib(struct i40e_hw *hw,
+				u8 mib_type, void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
+				bool enable_update,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_lldp_tlv(struct i40e_hw *hw, u8 bridge_type,
+				void *buff, u16 buff_size, u16 tlv_len,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 old_len, u16 new_len, u16 offset,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_delete_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 tlv_len, u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_start_lldp(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_start_stop_dcbx(struct i40e_hw *hw,
+				bool start_agent,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
+				u16 udp_port, u8 protocol_index,
+				u8 *filter_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_switch_resource_alloc(struct i40e_hw *hw,
+			u8 *num_entries,
+			struct i40e_aqc_switch_resource_alloc_element_resp *buf,
+			u16 count,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_pvirt(struct i40e_hw *hw, u16 flags,
+				       u16 mac_seid, u16 vsi_seid,
+				       u16 *ret_seid);
+enum i40e_status_code i40e_aq_add_tag(struct i40e_hw *hw, bool direct_to_queue,
+				u16 vsi_seid, u16 tag, u16 queue_num,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 tag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_mcast_etag(struct i40e_hw *hw, u16 pe_seid,
+				u16 etag, u8 num_tags_in_buf, void *buf,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_mcast_etag(struct i40e_hw *hw, u16 pe_seid,
+				u16 etag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 old_tag, u16 new_tag, u16 *tags_used,
+				u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 *stat_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 stat_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_port_parameters(struct i40e_hw *hw,
+				u16 bad_frame_vsi, bool save_bad_pac,
+				bool pad_short_pac, bool double_vlan,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_mac_address_write(struct i40e_hw *hw,
+				    u16 flags, u8 *mac_addr,
+				    struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_credit,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_dcb_ignore_pfc(struct i40e_hw *hw,
+				u8 tcmap, bool request, u8 *tcmap_ret,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile *profile,
+				u8 *pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_switch_comp_ets_bw_limit(
+	struct i40e_hw *hw, u16 seid,
+	struct i40e_aqc_configure_switching_comp_ets_bw_limit_data *bw_data,
+	struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_ets_sla_bw_limit(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_ets_sla_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_dcb_updated(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile profile,
+				u8 pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_switch_comp_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_bw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_configure_vsi_tc_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_vsi_bw_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_bw_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_port_ets_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_port_ets_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_resume_port_tx(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_read_lldp_cfg(struct i40e_hw *hw,
+					struct i40e_lldp_variables *lldp_cfg);
+enum i40e_status_code i40e_aq_add_cloud_filters(struct i40e_hw *hw,
+		u16 vsi,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count);
+
+enum i40e_status_code i40e_aq_remove_cloud_filters(struct i40e_hw *hw,
+		u16 vsi,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count);
+
+enum i40e_status_code i40e_aq_alternate_read(struct i40e_hw *hw,
+				u32 reg_addr0, u32 *reg_val0,
+				u32 reg_addr1, u32 *reg_val1);
+enum i40e_status_code i40e_aq_alternate_read_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer);
+enum i40e_status_code i40e_aq_alternate_write(struct i40e_hw *hw,
+				u32 reg_addr0, u32 reg_val0,
+				u32 reg_addr1, u32 reg_val1);
+enum i40e_status_code i40e_aq_alternate_write_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer);
+enum i40e_status_code i40e_aq_alternate_clear(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_alternate_write_done(struct i40e_hw *hw,
+				u8 bios_mode, bool *reset_needed);
+enum i40e_status_code i40e_aq_set_oem_mode(struct i40e_hw *hw,
+				u8 oem_mode);
+
+/* i40e_common */
+enum i40e_status_code i40e_init_shared_code(struct i40e_hw *hw);
+enum i40e_status_code i40e_pf_reset(struct i40e_hw *hw);
+void i40e_clear_hw(struct i40e_hw *hw);
+void i40e_clear_pxe_mode(struct i40e_hw *hw);
+enum i40e_status_code i40e_get_link_status(struct i40e_hw *hw, bool *link_up);
+enum i40e_status_code i40e_update_link_info(struct i40e_hw *hw);
+enum i40e_status_code i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
+enum i40e_status_code i40e_read_bw_from_alt_ram(struct i40e_hw *hw,
+		u32 *max_bw, u32 *min_bw, bool *min_valid, bool *max_valid);
+enum i40e_status_code i40e_aq_configure_partition_bw(struct i40e_hw *hw,
+			struct i40e_aqc_configure_partition_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
+enum i40e_status_code i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num,
+					    u32 pba_num_size);
+void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable);
+enum i40e_aq_link_speed i40e_get_link_speed(struct i40e_hw *hw);
+/* prototypes for functions used for NVM access */
+enum i40e_status_code i40e_init_nvm(struct i40e_hw *hw);
+enum i40e_status_code i40e_acquire_nvm(struct i40e_hw *hw,
+				      enum i40e_aq_resource_access_type access);
+void i40e_release_nvm(struct i40e_hw *hw);
+enum i40e_status_code i40e_read_nvm_word(struct i40e_hw *hw, u16 offset,
+					 u16 *data);
+enum i40e_status_code i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
+					   u16 *words, u16 *data);
+enum i40e_status_code i40e_write_nvm_aq(struct i40e_hw *hw, u8 module,
+					u32 offset, u16 words, void *data,
+					bool last_command);
+enum i40e_status_code i40e_write_nvm_word(struct i40e_hw *hw, u32 offset,
+					  void *data);
+enum i40e_status_code i40e_write_nvm_buffer(struct i40e_hw *hw, u8 module,
+					    u32 offset, u16 words, void *data);
+enum i40e_status_code i40e_calc_nvm_checksum(struct i40e_hw *hw, u16 *checksum);
+enum i40e_status_code i40e_update_nvm_checksum(struct i40e_hw *hw);
+enum i40e_status_code i40e_validate_nvm_checksum(struct i40e_hw *hw,
+						 u16 *checksum);
+enum i40e_status_code i40e_nvmupd_command(struct i40e_hw *hw,
+					  struct i40e_nvm_access *cmd,
+					  u8 *bytes, int *perrno);
+void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status);
+
+enum i40e_status_code i40e_set_mac_type(struct i40e_hw *hw);
+
+extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[];
+/* Return a copy of the i40e_ptype_lookup[] entry selected by ptype. */
+static INLINE struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype)
+{
+	return *(i40e_ptype_lookup + ptype);
+}
+
+/* prototypes for functions used for SW spinlocks */
+void i40e_init_spinlock(struct i40e_spinlock *sp);
+void i40e_acquire_spinlock(struct i40e_spinlock *sp);
+void i40e_release_spinlock(struct i40e_spinlock *sp);
+void i40e_destroy_spinlock(struct i40e_spinlock *sp);
+
+/* i40e_common for VF drivers */
+void i40e_vf_parse_hw_config(struct i40e_hw *hw,
+			     struct i40e_virtchnl_vf_resource *msg);
+enum i40e_status_code i40e_vf_reset(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_send_msg_to_pf(struct i40e_hw *hw,
+				enum i40e_virtchnl_ops v_opcode,
+				enum i40e_status_code v_retval,
+				u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_set_filter_control(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings);
+enum i40e_status_code i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
+				u8 *mac_addr, u16 ethtype, u16 flags,
+				u16 vsi_seid, u16 queue, bool is_add,
+				struct i40e_control_filter_stats *stats,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id,
+				u8 table_id, u32 start_index, u16 buff_size,
+				void *buff, u16 *ret_buff_size,
+				u8 *ret_next_table, u32 *ret_next_index,
+				struct i40e_asq_cmd_details *cmd_details);
+#endif /* _I40E_PROTOTYPE_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_register.h b/usr/src/uts/common/io/i40e/core/i40e_register.h
new file mode 100644
index 0000000000..ff4b8a54f2
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_register.h
@@ -0,0 +1,5317 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_register.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_REGISTER_H_
+#define _I40E_REGISTER_H_
+
+
+#define I40E_GL_ARQBAH              0x000801C0 /* Reset: EMPR */
+#define I40E_GL_ARQBAH_ARQBAH_SHIFT 0
+#define I40E_GL_ARQBAH_ARQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAH_ARQBAH_SHIFT)
+#define I40E_GL_ARQBAL              0x000800C0 /* Reset: EMPR */
+#define I40E_GL_ARQBAL_ARQBAL_SHIFT 0
+#define I40E_GL_ARQBAL_ARQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAL_ARQBAL_SHIFT)
+#define I40E_GL_ARQH            0x000803C0 /* Reset: EMPR */
+#define I40E_GL_ARQH_ARQH_SHIFT 0
+#define I40E_GL_ARQH_ARQH_MASK  I40E_MASK(0x3FF, I40E_GL_ARQH_ARQH_SHIFT)
+#define I40E_GL_ARQT            0x000804C0 /* Reset: EMPR */
+#define I40E_GL_ARQT_ARQT_SHIFT 0
+#define I40E_GL_ARQT_ARQT_MASK  I40E_MASK(0x3FF, I40E_GL_ARQT_ARQT_SHIFT)
+#define I40E_GL_ATQBAH              0x00080140 /* Reset: EMPR */
+#define I40E_GL_ATQBAH_ATQBAH_SHIFT 0
+#define I40E_GL_ATQBAH_ATQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAH_ATQBAH_SHIFT)
+#define I40E_GL_ATQBAL              0x00080040 /* Reset: EMPR */
+#define I40E_GL_ATQBAL_ATQBAL_SHIFT 0
+#define I40E_GL_ATQBAL_ATQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAL_ATQBAL_SHIFT)
+#define I40E_GL_ATQH            0x00080340 /* Reset: EMPR */
+#define I40E_GL_ATQH_ATQH_SHIFT 0
+#define I40E_GL_ATQH_ATQH_MASK  I40E_MASK(0x3FF, I40E_GL_ATQH_ATQH_SHIFT)
+#define I40E_GL_ATQLEN                 0x00080240 /* Reset: EMPR */
+#define I40E_GL_ATQLEN_ATQLEN_SHIFT    0
+#define I40E_GL_ATQLEN_ATQLEN_MASK     I40E_MASK(0x3FF, I40E_GL_ATQLEN_ATQLEN_SHIFT)
+#define I40E_GL_ATQLEN_ATQVFE_SHIFT    28
+#define I40E_GL_ATQLEN_ATQVFE_MASK     I40E_MASK(0x1, I40E_GL_ATQLEN_ATQVFE_SHIFT)
+#define I40E_GL_ATQLEN_ATQOVFL_SHIFT   29
+#define I40E_GL_ATQLEN_ATQOVFL_MASK    I40E_MASK(0x1, I40E_GL_ATQLEN_ATQOVFL_SHIFT)
+#define I40E_GL_ATQLEN_ATQCRIT_SHIFT   30
+#define I40E_GL_ATQLEN_ATQCRIT_MASK    I40E_MASK(0x1, I40E_GL_ATQLEN_ATQCRIT_SHIFT)
+#define I40E_GL_ATQLEN_ATQENABLE_SHIFT 31
+#define I40E_GL_ATQLEN_ATQENABLE_MASK  I40E_MASK(0x1, I40E_GL_ATQLEN_ATQENABLE_SHIFT)
+#define I40E_GL_ATQT            0x00080440 /* Reset: EMPR */
+#define I40E_GL_ATQT_ATQT_SHIFT 0
+#define I40E_GL_ATQT_ATQT_MASK  I40E_MASK(0x3FF, I40E_GL_ATQT_ATQT_SHIFT)
+#define I40E_PF_ARQBAH              0x00080180 /* Reset: EMPR */
+#define I40E_PF_ARQBAH_ARQBAH_SHIFT 0
+#define I40E_PF_ARQBAH_ARQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAH_ARQBAH_SHIFT)
+#define I40E_PF_ARQBAL              0x00080080 /* Reset: EMPR */
+#define I40E_PF_ARQBAL_ARQBAL_SHIFT 0
+#define I40E_PF_ARQBAL_ARQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAL_ARQBAL_SHIFT)
+#define I40E_PF_ARQH            0x00080380 /* Reset: EMPR */
+#define I40E_PF_ARQH_ARQH_SHIFT 0
+#define I40E_PF_ARQH_ARQH_MASK  I40E_MASK(0x3FF, I40E_PF_ARQH_ARQH_SHIFT)
+#define I40E_PF_ARQLEN                 0x00080280 /* Reset: EMPR */
+#define I40E_PF_ARQLEN_ARQLEN_SHIFT    0
+#define I40E_PF_ARQLEN_ARQLEN_MASK     I40E_MASK(0x3FF, I40E_PF_ARQLEN_ARQLEN_SHIFT)
+#define I40E_PF_ARQLEN_ARQVFE_SHIFT    28
+#define I40E_PF_ARQLEN_ARQVFE_MASK     I40E_MASK(0x1, I40E_PF_ARQLEN_ARQVFE_SHIFT)
+#define I40E_PF_ARQLEN_ARQOVFL_SHIFT   29
+#define I40E_PF_ARQLEN_ARQOVFL_MASK    I40E_MASK(0x1, I40E_PF_ARQLEN_ARQOVFL_SHIFT)
+#define I40E_PF_ARQLEN_ARQCRIT_SHIFT   30
+#define I40E_PF_ARQLEN_ARQCRIT_MASK    I40E_MASK(0x1, I40E_PF_ARQLEN_ARQCRIT_SHIFT)
+#define I40E_PF_ARQLEN_ARQENABLE_SHIFT 31
+#define I40E_PF_ARQLEN_ARQENABLE_MASK  I40E_MASK(0x1, I40E_PF_ARQLEN_ARQENABLE_SHIFT)
+#define I40E_PF_ARQT            0x00080480 /* Reset: EMPR */
+#define I40E_PF_ARQT_ARQT_SHIFT 0
+#define I40E_PF_ARQT_ARQT_MASK  I40E_MASK(0x3FF, I40E_PF_ARQT_ARQT_SHIFT)
+#define I40E_PF_ATQBAH              0x00080100 /* Reset: EMPR */
+#define I40E_PF_ATQBAH_ATQBAH_SHIFT 0
+#define I40E_PF_ATQBAH_ATQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAH_ATQBAH_SHIFT)
+#define I40E_PF_ATQBAL              0x00080000 /* Reset: EMPR */
+#define I40E_PF_ATQBAL_ATQBAL_SHIFT 0
+#define I40E_PF_ATQBAL_ATQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAL_ATQBAL_SHIFT)
+#define I40E_PF_ATQH            0x00080300 /* Reset: EMPR */
+#define I40E_PF_ATQH_ATQH_SHIFT 0
+#define I40E_PF_ATQH_ATQH_MASK  I40E_MASK(0x3FF, I40E_PF_ATQH_ATQH_SHIFT)
+#define I40E_PF_ATQLEN                 0x00080200 /* Reset: EMPR */
+#define I40E_PF_ATQLEN_ATQLEN_SHIFT    0
+#define I40E_PF_ATQLEN_ATQLEN_MASK     I40E_MASK(0x3FF, I40E_PF_ATQLEN_ATQLEN_SHIFT)
+#define I40E_PF_ATQLEN_ATQVFE_SHIFT    28
+#define I40E_PF_ATQLEN_ATQVFE_MASK     I40E_MASK(0x1, I40E_PF_ATQLEN_ATQVFE_SHIFT)
+#define I40E_PF_ATQLEN_ATQOVFL_SHIFT   29
+#define I40E_PF_ATQLEN_ATQOVFL_MASK    I40E_MASK(0x1, I40E_PF_ATQLEN_ATQOVFL_SHIFT)
+#define I40E_PF_ATQLEN_ATQCRIT_SHIFT   30
+#define I40E_PF_ATQLEN_ATQCRIT_MASK    I40E_MASK(0x1, I40E_PF_ATQLEN_ATQCRIT_SHIFT)
+#define I40E_PF_ATQLEN_ATQENABLE_SHIFT 31
+#define I40E_PF_ATQLEN_ATQENABLE_MASK  I40E_MASK(0x1, I40E_PF_ATQLEN_ATQENABLE_SHIFT)
+#define I40E_PF_ATQT            0x00080400 /* Reset: EMPR */
+#define I40E_PF_ATQT_ATQT_SHIFT 0
+#define I40E_PF_ATQT_ATQT_MASK  I40E_MASK(0x3FF, I40E_PF_ATQT_ATQT_SHIFT)
+#define I40E_VF_ARQBAH(_VF)         (0x00081400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ARQBAH_MAX_INDEX    127
+#define I40E_VF_ARQBAH_ARQBAH_SHIFT 0
+#define I40E_VF_ARQBAH_ARQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAH_ARQBAH_SHIFT)
+#define I40E_VF_ARQBAL(_VF)         (0x00080C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ARQBAL_MAX_INDEX    127
+#define I40E_VF_ARQBAL_ARQBAL_SHIFT 0
+#define I40E_VF_ARQBAL_ARQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL_ARQBAL_SHIFT)
+#define I40E_VF_ARQH(_VF)       (0x00082400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ARQH_MAX_INDEX  127
+#define I40E_VF_ARQH_ARQH_SHIFT 0
+#define I40E_VF_ARQH_ARQH_MASK  I40E_MASK(0x3FF, I40E_VF_ARQH_ARQH_SHIFT)
+#define I40E_VF_ARQLEN(_VF)            (0x00081C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ARQLEN_MAX_INDEX       127
+#define I40E_VF_ARQLEN_ARQLEN_SHIFT    0
+#define I40E_VF_ARQLEN_ARQLEN_MASK     I40E_MASK(0x3FF, I40E_VF_ARQLEN_ARQLEN_SHIFT)
+#define I40E_VF_ARQLEN_ARQVFE_SHIFT    28
+#define I40E_VF_ARQLEN_ARQVFE_MASK     I40E_MASK(0x1, I40E_VF_ARQLEN_ARQVFE_SHIFT)
+#define I40E_VF_ARQLEN_ARQOVFL_SHIFT   29
+#define I40E_VF_ARQLEN_ARQOVFL_MASK    I40E_MASK(0x1, I40E_VF_ARQLEN_ARQOVFL_SHIFT)
+#define I40E_VF_ARQLEN_ARQCRIT_SHIFT   30
+#define I40E_VF_ARQLEN_ARQCRIT_MASK    I40E_MASK(0x1, I40E_VF_ARQLEN_ARQCRIT_SHIFT)
+#define I40E_VF_ARQLEN_ARQENABLE_SHIFT 31
+#define I40E_VF_ARQLEN_ARQENABLE_MASK  I40E_MASK(0x1, I40E_VF_ARQLEN_ARQENABLE_SHIFT)
+#define I40E_VF_ARQT(_VF)       (0x00082C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ARQT_MAX_INDEX  127
+#define I40E_VF_ARQT_ARQT_SHIFT 0
+#define I40E_VF_ARQT_ARQT_MASK  I40E_MASK(0x3FF, I40E_VF_ARQT_ARQT_SHIFT)
+#define I40E_VF_ATQBAH(_VF)         (0x00081000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ATQBAH_MAX_INDEX    127
+#define I40E_VF_ATQBAH_ATQBAH_SHIFT 0
+#define I40E_VF_ATQBAH_ATQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH_ATQBAH_SHIFT)
+#define I40E_VF_ATQBAL(_VF)         (0x00080800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ATQBAL_MAX_INDEX    127
+#define I40E_VF_ATQBAL_ATQBAL_SHIFT 0
+#define I40E_VF_ATQBAL_ATQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAL_ATQBAL_SHIFT)
+#define I40E_VF_ATQH(_VF)       (0x00082000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ATQH_MAX_INDEX  127
+#define I40E_VF_ATQH_ATQH_SHIFT 0
+#define I40E_VF_ATQH_ATQH_MASK  I40E_MASK(0x3FF, I40E_VF_ATQH_ATQH_SHIFT)
+#define I40E_VF_ATQLEN(_VF)            (0x00081800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ATQLEN_MAX_INDEX       127
+#define I40E_VF_ATQLEN_ATQLEN_SHIFT    0
+#define I40E_VF_ATQLEN_ATQLEN_MASK     I40E_MASK(0x3FF, I40E_VF_ATQLEN_ATQLEN_SHIFT)
+#define I40E_VF_ATQLEN_ATQVFE_SHIFT    28
+#define I40E_VF_ATQLEN_ATQVFE_MASK     I40E_MASK(0x1, I40E_VF_ATQLEN_ATQVFE_SHIFT)
+#define I40E_VF_ATQLEN_ATQOVFL_SHIFT   29
+#define I40E_VF_ATQLEN_ATQOVFL_MASK    I40E_MASK(0x1, I40E_VF_ATQLEN_ATQOVFL_SHIFT)
+#define I40E_VF_ATQLEN_ATQCRIT_SHIFT   30
+#define I40E_VF_ATQLEN_ATQCRIT_MASK    I40E_MASK(0x1, I40E_VF_ATQLEN_ATQCRIT_SHIFT)
+#define I40E_VF_ATQLEN_ATQENABLE_SHIFT 31
+#define I40E_VF_ATQLEN_ATQENABLE_MASK  I40E_MASK(0x1, I40E_VF_ATQLEN_ATQENABLE_SHIFT)
+#define I40E_VF_ATQT(_VF)       (0x00082800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_VF_ATQT_MAX_INDEX  127
+#define I40E_VF_ATQT_ATQT_SHIFT 0
+#define I40E_VF_ATQT_ATQT_MASK  I40E_MASK(0x3FF, I40E_VF_ATQT_ATQT_SHIFT)
+#define I40E_PRT_L2TAGSEN              0x001C0B20 /* Reset: CORER */
+#define I40E_PRT_L2TAGSEN_ENABLE_SHIFT 0
+#define I40E_PRT_L2TAGSEN_ENABLE_MASK  I40E_MASK(0xFF, I40E_PRT_L2TAGSEN_ENABLE_SHIFT)
+#define I40E_PFCM_LAN_ERRDATA                  0x0010C080 /* Reset: PFR */
+#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT 0
+#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_MASK  I40E_MASK(0xF, I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT)
+#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT     4
+#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_MASK      I40E_MASK(0x7, I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT)
+#define I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT      8
+#define I40E_PFCM_LAN_ERRDATA_Q_NUM_MASK       I40E_MASK(0xFFF, I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT)
+#define I40E_PFCM_LAN_ERRINFO                     0x0010C000 /* Reset: PFR */
+#define I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT   0
+#define I40E_PFCM_LAN_ERRINFO_ERROR_VALID_MASK    I40E_MASK(0x1, I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT)
+#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT    4
+#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_MASK     I40E_MASK(0x7, I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT)
+#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT 8
+#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT)
+#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT 16
+#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT)
+#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT 24
+#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT)
+#define I40E_PFCM_LANCTXCTL                  0x0010C300 /* Reset: CORER */
+#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT  0
+#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_MASK   I40E_MASK(0xFFF, I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT)
+#define I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT   12
+#define I40E_PFCM_LANCTXCTL_SUB_LINE_MASK    I40E_MASK(0x7, I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT)
+#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT 15
+#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_MASK  I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT)
+#define I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT    17
+#define I40E_PFCM_LANCTXCTL_OP_CODE_MASK     I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT)
+#define I40E_PFCM_LANCTXDATA(_i)        (0x0010C100 + ((_i) * 128)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_PFCM_LANCTXDATA_MAX_INDEX  3
+#define I40E_PFCM_LANCTXDATA_DATA_SHIFT 0
+#define I40E_PFCM_LANCTXDATA_DATA_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFCM_LANCTXDATA_DATA_SHIFT)
+#define I40E_PFCM_LANCTXSTAT                0x0010C380 /* Reset: CORER */
+#define I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT 0
+#define I40E_PFCM_LANCTXSTAT_CTX_DONE_MASK  I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT)
+#define I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT 1
+#define I40E_PFCM_LANCTXSTAT_CTX_MISS_MASK  I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT)
+#define I40E_VFCM_PE_ERRDATA1(_VF)             (0x00138800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFCM_PE_ERRDATA1_MAX_INDEX        127
+#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT 0
+#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_MASK  I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT)
+#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT     4
+#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_MASK      I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT)
+#define I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT      8
+#define I40E_VFCM_PE_ERRDATA1_Q_NUM_MASK       I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT)
+#define I40E_VFCM_PE_ERRINFO1(_VF)                (0x00138400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFCM_PE_ERRINFO1_MAX_INDEX           127
+#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT   0
+#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_MASK    I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT)
+#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT    4
+#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_MASK     I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT)
+#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT 8
+#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT)
+#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT 16
+#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT)
+#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT 24
+#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT)
+#define I40E_GLDCB_GENC              0x00083044 /* Reset: CORER */
+#define I40E_GLDCB_GENC_PCIRTT_SHIFT 0
+#define I40E_GLDCB_GENC_PCIRTT_MASK  I40E_MASK(0xFFFF, I40E_GLDCB_GENC_PCIRTT_SHIFT)
+#define I40E_GLDCB_RUPTI                     0x00122618 /* Reset: CORER */
+#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT 0
+#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT)
+#define I40E_PRTDCB_FCCFG            0x001E4640 /* Reset: GLOBR */
+#define I40E_PRTDCB_FCCFG_TFCE_SHIFT 3
+#define I40E_PRTDCB_FCCFG_TFCE_MASK  I40E_MASK(0x3, I40E_PRTDCB_FCCFG_TFCE_SHIFT)
+#define I40E_PRTDCB_FCRTV                     0x001E4600 /* Reset: GLOBR */
+#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT 0
+#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_MASK  I40E_MASK(0xFFFF, I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT)
+#define I40E_PRTDCB_FCTTVN(_i)             (0x001E4580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: GLOBR */
+#define I40E_PRTDCB_FCTTVN_MAX_INDEX       3
+#define I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT    0
+#define I40E_PRTDCB_FCTTVN_TTV_2N_MASK     I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT)
+#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT 16
+#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_MASK  I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT)
+#define I40E_PRTDCB_GENC                    0x00083000 /* Reset: CORER */
+#define I40E_PRTDCB_GENC_RESERVED_1_SHIFT   0
+#define I40E_PRTDCB_GENC_RESERVED_1_MASK    I40E_MASK(0x3, I40E_PRTDCB_GENC_RESERVED_1_SHIFT)
+#define I40E_PRTDCB_GENC_NUMTC_SHIFT        2
+#define I40E_PRTDCB_GENC_NUMTC_MASK         I40E_MASK(0xF, I40E_PRTDCB_GENC_NUMTC_SHIFT)
+#define I40E_PRTDCB_GENC_FCOEUP_SHIFT       6
+#define I40E_PRTDCB_GENC_FCOEUP_MASK        I40E_MASK(0x7, I40E_PRTDCB_GENC_FCOEUP_SHIFT)
+#define I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT 9
+#define I40E_PRTDCB_GENC_FCOEUP_VALID_MASK  I40E_MASK(0x1, I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT)
+#define I40E_PRTDCB_GENC_PFCLDA_SHIFT       16
+#define I40E_PRTDCB_GENC_PFCLDA_MASK        I40E_MASK(0xFFFF, I40E_PRTDCB_GENC_PFCLDA_SHIFT)
+#define I40E_PRTDCB_GENS                   0x00083020 /* Reset: CORER */
+#define I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT 0
+#define I40E_PRTDCB_GENS_DCBX_STATUS_MASK  I40E_MASK(0x7, I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT)
+#define I40E_PRTDCB_MFLCN             0x001E2400 /* Reset: GLOBR */
+#define I40E_PRTDCB_MFLCN_PMCF_SHIFT  0
+#define I40E_PRTDCB_MFLCN_PMCF_MASK   I40E_MASK(0x1, I40E_PRTDCB_MFLCN_PMCF_SHIFT)
+#define I40E_PRTDCB_MFLCN_DPF_SHIFT   1
+#define I40E_PRTDCB_MFLCN_DPF_MASK    I40E_MASK(0x1, I40E_PRTDCB_MFLCN_DPF_SHIFT)
+#define I40E_PRTDCB_MFLCN_RPFCM_SHIFT 2
+#define I40E_PRTDCB_MFLCN_RPFCM_MASK  I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RPFCM_SHIFT)
+#define I40E_PRTDCB_MFLCN_RFCE_SHIFT  3
+#define I40E_PRTDCB_MFLCN_RFCE_MASK   I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RFCE_SHIFT)
+#define I40E_PRTDCB_MFLCN_RPFCE_SHIFT 4
+#define I40E_PRTDCB_MFLCN_RPFCE_MASK  I40E_MASK(0xFF, I40E_PRTDCB_MFLCN_RPFCE_SHIFT)
+#define I40E_PRTDCB_RETSC                    0x001223E0 /* Reset: CORER */
+#define I40E_PRTDCB_RETSC_ETS_MODE_SHIFT     0
+#define I40E_PRTDCB_RETSC_ETS_MODE_MASK      I40E_MASK(0x1, I40E_PRTDCB_RETSC_ETS_MODE_SHIFT)
+#define I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT 1
+#define I40E_PRTDCB_RETSC_NON_ETS_MODE_MASK  I40E_MASK(0x1, I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT)
+#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT  2
+#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_MASK   I40E_MASK(0xF, I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT)
+#define I40E_PRTDCB_RETSC_LLTC_SHIFT         8
+#define I40E_PRTDCB_RETSC_LLTC_MASK          I40E_MASK(0xFF, I40E_PRTDCB_RETSC_LLTC_SHIFT)
+#define I40E_PRTDCB_RETSTCC(_i)               (0x00122180 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTDCB_RETSTCC_MAX_INDEX         7
+#define I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT     0
+#define I40E_PRTDCB_RETSTCC_BWSHARE_MASK      I40E_MASK(0x7F, I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT)
+#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT 30
+#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_MASK  I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT)
+#define I40E_PRTDCB_RETSTCC_ETSTC_SHIFT       31
+#define I40E_PRTDCB_RETSTCC_ETSTC_MASK        I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_ETSTC_SHIFT)
+#define I40E_PRTDCB_RPPMC                    0x001223A0 /* Reset: CORER */
+#define I40E_PRTDCB_RPPMC_LANRPPM_SHIFT      0
+#define I40E_PRTDCB_RPPMC_LANRPPM_MASK       I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_LANRPPM_SHIFT)
+#define I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT     8
+#define I40E_PRTDCB_RPPMC_RDMARPPM_MASK      I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT)
+#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT 16
+#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_MASK  I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT)
+#define I40E_PRTDCB_RUP                0x001C0B00 /* Reset: CORER */
+#define I40E_PRTDCB_RUP_NOVLANUP_SHIFT 0
+#define I40E_PRTDCB_RUP_NOVLANUP_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP_NOVLANUP_SHIFT)
+#define I40E_PRTDCB_RUP2TC             0x001C09A0 /* Reset: CORER */
+#define I40E_PRTDCB_RUP2TC_UP0TC_SHIFT 0
+#define I40E_PRTDCB_RUP2TC_UP0TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP0TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP1TC_SHIFT 3
+#define I40E_PRTDCB_RUP2TC_UP1TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP1TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP2TC_SHIFT 6
+#define I40E_PRTDCB_RUP2TC_UP2TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP2TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP3TC_SHIFT 9
+#define I40E_PRTDCB_RUP2TC_UP3TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP3TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP4TC_SHIFT 12
+#define I40E_PRTDCB_RUP2TC_UP4TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP4TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP5TC_SHIFT 15
+#define I40E_PRTDCB_RUP2TC_UP5TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP5TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP6TC_SHIFT 18
+#define I40E_PRTDCB_RUP2TC_UP6TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP6TC_SHIFT)
+#define I40E_PRTDCB_RUP2TC_UP7TC_SHIFT 21
+#define I40E_PRTDCB_RUP2TC_UP7TC_MASK  I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP7TC_SHIFT)
+#define I40E_PRTDCB_RUPTQ(_i)          (0x00122400 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTDCB_RUPTQ_MAX_INDEX    7
+#define I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT 0
+#define I40E_PRTDCB_RUPTQ_RXQNUM_MASK  I40E_MASK(0x3FFF, I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT)
+#define I40E_PRTDCB_TC2PFC              0x001C0980 /* Reset: CORER */
+#define I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT 0
+#define I40E_PRTDCB_TC2PFC_TC2PFC_MASK  I40E_MASK(0xFF, I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT)
+#define I40E_PRTDCB_TCMSTC(_i)        (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTDCB_TCMSTC_MAX_INDEX  7
+#define I40E_PRTDCB_TCMSTC_MSTC_SHIFT 0
+#define I40E_PRTDCB_TCMSTC_MSTC_MASK  I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_MSTC_SHIFT)
+#define I40E_PRTDCB_TCPMC                 0x000A21A0 /* Reset: CORER */
+#define I40E_PRTDCB_TCPMC_CPM_SHIFT       0
+#define I40E_PRTDCB_TCPMC_CPM_MASK        I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_CPM_SHIFT)
+#define I40E_PRTDCB_TCPMC_LLTC_SHIFT      13
+#define I40E_PRTDCB_TCPMC_LLTC_MASK       I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_LLTC_SHIFT)
+#define I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT 30
+#define I40E_PRTDCB_TCPMC_TCPM_MODE_MASK  I40E_MASK(0x1, I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT)
+#define I40E_PRTDCB_TCWSTC(_i)        (0x000A2040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTDCB_TCWSTC_MAX_INDEX  7
+#define I40E_PRTDCB_TCWSTC_MSTC_SHIFT 0
+#define I40E_PRTDCB_TCWSTC_MSTC_MASK  I40E_MASK(0xFFFFF, I40E_PRTDCB_TCWSTC_MSTC_SHIFT)
+#define I40E_PRTDCB_TDPMC                 0x000A0180 /* Reset: CORER */
+#define I40E_PRTDCB_TDPMC_DPM_SHIFT       0
+#define I40E_PRTDCB_TDPMC_DPM_MASK        I40E_MASK(0xFF, I40E_PRTDCB_TDPMC_DPM_SHIFT)
+#define I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT 30
+#define I40E_PRTDCB_TDPMC_TCPM_MODE_MASK  I40E_MASK(0x1, I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT)
+#define I40E_PRTDCB_TETSC_TCB                             0x000AE060 /* Reset: CORER */
+#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT 0
+#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_MASK  I40E_MASK(0x1, I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT)
+#define I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT                  8
+#define I40E_PRTDCB_TETSC_TCB_LLTC_MASK                   I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT)
+#define I40E_PRTDCB_TETSC_TPB                             0x00098060 /* Reset: CORER */
+#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT 0
+#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_MASK  I40E_MASK(0x1, I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT)
+#define I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT                  8
+#define I40E_PRTDCB_TETSC_TPB_LLTC_MASK                   I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT)
+#define I40E_PRTDCB_TFCS              0x001E4560 /* Reset: GLOBR */
+#define I40E_PRTDCB_TFCS_TXOFF_SHIFT  0
+#define I40E_PRTDCB_TFCS_TXOFF_MASK   I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF0_SHIFT 8
+#define I40E_PRTDCB_TFCS_TXOFF0_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF0_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF1_SHIFT 9
+#define I40E_PRTDCB_TFCS_TXOFF1_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF1_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF2_SHIFT 10
+#define I40E_PRTDCB_TFCS_TXOFF2_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF2_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF3_SHIFT 11
+#define I40E_PRTDCB_TFCS_TXOFF3_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF3_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF4_SHIFT 12
+#define I40E_PRTDCB_TFCS_TXOFF4_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF4_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF5_SHIFT 13
+#define I40E_PRTDCB_TFCS_TXOFF5_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF5_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF6_SHIFT 14
+#define I40E_PRTDCB_TFCS_TXOFF6_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF6_SHIFT)
+#define I40E_PRTDCB_TFCS_TXOFF7_SHIFT 15
+#define I40E_PRTDCB_TFCS_TXOFF7_MASK  I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF7_SHIFT)
+#define I40E_PRTDCB_TPFCTS(_i)            (0x001E4660 + ((_i) * 32)) /* _i=0...7 */ /* Reset: GLOBR */
+#define I40E_PRTDCB_TPFCTS_MAX_INDEX      7
+#define I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT 0
+#define I40E_PRTDCB_TPFCTS_PFCTIMER_MASK  I40E_MASK(0x3FFF, I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT)
+#define I40E_GLFCOE_RCTL                0x00269B94 /* Reset: CORER */
+#define I40E_GLFCOE_RCTL_FCOEVER_SHIFT  0
+#define I40E_GLFCOE_RCTL_FCOEVER_MASK   I40E_MASK(0xF, I40E_GLFCOE_RCTL_FCOEVER_SHIFT)
+#define I40E_GLFCOE_RCTL_SAVBAD_SHIFT   4
+#define I40E_GLFCOE_RCTL_SAVBAD_MASK    I40E_MASK(0x1, I40E_GLFCOE_RCTL_SAVBAD_SHIFT)
+#define I40E_GLFCOE_RCTL_ICRC_SHIFT     5
+#define I40E_GLFCOE_RCTL_ICRC_MASK      I40E_MASK(0x1, I40E_GLFCOE_RCTL_ICRC_SHIFT)
+#define I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT 16
+#define I40E_GLFCOE_RCTL_MAX_SIZE_MASK  I40E_MASK(0x3FFF, I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT)
+#define I40E_GL_FWSTS             0x00083048 /* Reset: POR */
+#define I40E_GL_FWSTS_FWS0B_SHIFT 0
+#define I40E_GL_FWSTS_FWS0B_MASK  I40E_MASK(0xFF, I40E_GL_FWSTS_FWS0B_SHIFT)
+#define I40E_GL_FWSTS_FWRI_SHIFT  9
+#define I40E_GL_FWSTS_FWRI_MASK   I40E_MASK(0x1, I40E_GL_FWSTS_FWRI_SHIFT)
+#define I40E_GL_FWSTS_FWS1B_SHIFT 16
+#define I40E_GL_FWSTS_FWS1B_MASK  I40E_MASK(0xFF, I40E_GL_FWSTS_FWS1B_SHIFT)
+#define I40E_GLGEN_CLKSTAT                    0x000B8184 /* Reset: POR */
+#define I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT      0
+#define I40E_GLGEN_CLKSTAT_CLKMODE_MASK       I40E_MASK(0x1, I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT)
+#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT  4
+#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_MASK   I40E_MASK(0x3, I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT)
+#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT 8
+#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_MASK  I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT)
+#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT 12
+#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_MASK  I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT)
+#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT 16
+#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_MASK  I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT)
+#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT 20
+#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_MASK  I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT)
+#define I40E_GLGEN_GPIO_CTL(_i)                (0x00088100 + ((_i) * 4)) /* _i=0...29 */ /* Reset: POR */
+#define I40E_GLGEN_GPIO_CTL_MAX_INDEX          29
+#define I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT      0
+#define I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK       I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT   3
+#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK    I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT      4
+#define I40E_GLGEN_GPIO_CTL_PIN_DIR_MASK       I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT      5
+#define I40E_GLGEN_GPIO_CTL_TRI_CTL_MASK       I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT      6
+#define I40E_GLGEN_GPIO_CTL_OUT_CTL_MASK       I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT     7
+#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_MASK      I40E_MASK(0x7, I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT    10
+#define I40E_GLGEN_GPIO_CTL_LED_INVRT_MASK     I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT    11
+#define I40E_GLGEN_GPIO_CTL_LED_BLINK_MASK     I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT     12
+#define I40E_GLGEN_GPIO_CTL_LED_MODE_MASK      I40E_MASK(0x1F, I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT     17
+#define I40E_GLGEN_GPIO_CTL_INT_MODE_MASK      I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT  19
+#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_MASK   I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT 20
+#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_MASK  I40E_MASK(0x3F, I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT)
+#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT  26
+#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_MASK   I40E_MASK(0xF, I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT)
+#define I40E_GLGEN_GPIO_SET                 0x00088184 /* Reset: POR */
+#define I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT 0
+#define I40E_GLGEN_GPIO_SET_GPIO_INDX_MASK  I40E_MASK(0x1F, I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT)
+#define I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT  5
+#define I40E_GLGEN_GPIO_SET_SDP_DATA_MASK   I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT)
+#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT 6
+#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_MASK  I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT)
+#define I40E_GLGEN_GPIO_STAT                  0x0008817C /* Reset: POR */
+#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT 0
+#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_MASK  I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT)
+#define I40E_GLGEN_GPIO_TRANSIT                       0x00088180 /* Reset: POR */
+#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT 0
+#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_MASK  I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT)
+#define I40E_GLGEN_I2CCMD(_i)          (0x000881E0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_I2CCMD_MAX_INDEX    3
+#define I40E_GLGEN_I2CCMD_DATA_SHIFT   0
+#define I40E_GLGEN_I2CCMD_DATA_MASK    I40E_MASK(0xFFFF, I40E_GLGEN_I2CCMD_DATA_SHIFT)
+#define I40E_GLGEN_I2CCMD_REGADD_SHIFT 16
+#define I40E_GLGEN_I2CCMD_REGADD_MASK  I40E_MASK(0xFF, I40E_GLGEN_I2CCMD_REGADD_SHIFT)
+#define I40E_GLGEN_I2CCMD_PHYADD_SHIFT 24
+#define I40E_GLGEN_I2CCMD_PHYADD_MASK  I40E_MASK(0x7, I40E_GLGEN_I2CCMD_PHYADD_SHIFT)
+#define I40E_GLGEN_I2CCMD_OP_SHIFT     27
+#define I40E_GLGEN_I2CCMD_OP_MASK      I40E_MASK(0x1, I40E_GLGEN_I2CCMD_OP_SHIFT)
+#define I40E_GLGEN_I2CCMD_RESET_SHIFT  28
+#define I40E_GLGEN_I2CCMD_RESET_MASK   I40E_MASK(0x1, I40E_GLGEN_I2CCMD_RESET_SHIFT)
+#define I40E_GLGEN_I2CCMD_R_SHIFT      29
+#define I40E_GLGEN_I2CCMD_R_MASK       I40E_MASK(0x1, I40E_GLGEN_I2CCMD_R_SHIFT)
+#define I40E_GLGEN_I2CCMD_E_SHIFT      31
+#define I40E_GLGEN_I2CCMD_E_MASK       I40E_MASK(0x1, I40E_GLGEN_I2CCMD_E_SHIFT)
+#define I40E_GLGEN_I2CPARAMS(_i)                   (0x000881AC + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_I2CPARAMS_MAX_INDEX             3
+#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT      0
+#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_MASK       I40E_MASK(0x1F, I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT       5
+#define I40E_GLGEN_I2CPARAMS_READ_TIME_MASK        I40E_MASK(0x7, I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT        8
+#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK         I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_CLK_SHIFT             9
+#define I40E_GLGEN_I2CPARAMS_CLK_MASK              I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT        10
+#define I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK         I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT       11
+#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK        I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT         12
+#define I40E_GLGEN_I2CPARAMS_DATA_IN_MASK          I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT        13
+#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_MASK         I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT          14
+#define I40E_GLGEN_I2CPARAMS_CLK_IN_MASK           I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT 15
+#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_MASK  I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT)
+#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT  31
+#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_MASK   I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT)
+#define I40E_GLGEN_LED_CTL                          0x00088178 /* Reset: POR */
+#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT  0
+#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_MASK   I40E_MASK(0x1, I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT)
+#define I40E_GLGEN_MDIO_CTRL(_i)                (0x000881D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_MDIO_CTRL_MAX_INDEX          3
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT 0
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_MASK  I40E_MASK(0x1FFFF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT)
+#define I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT      17
+#define I40E_GLGEN_MDIO_CTRL_CONTMDC_MASK       I40E_MASK(0x1, I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT)
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT 18
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_MASK  I40E_MASK(0x7FF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT)
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT 29
+#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_MASK  I40E_MASK(0x7, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL(_i)                (0x000881C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_MDIO_I2C_SEL_MAX_INDEX          3
+#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT 0
+#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_MASK  I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT 1
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_MASK  I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT 5
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_MASK  I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT 10
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_MASK  I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT 15
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_MASK  I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT 20
+#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_MASK  I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT 25
+#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_MASK  I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT)
+#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT 31
+#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_MASK  I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT)
+#define I40E_GLGEN_MSCA(_i)               (0x0008818C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_MSCA_MAX_INDEX         3
+#define I40E_GLGEN_MSCA_MDIADD_SHIFT      0
+#define I40E_GLGEN_MSCA_MDIADD_MASK       I40E_MASK(0xFFFF, I40E_GLGEN_MSCA_MDIADD_SHIFT)
+#define I40E_GLGEN_MSCA_DEVADD_SHIFT      16
+#define I40E_GLGEN_MSCA_DEVADD_MASK       I40E_MASK(0x1F, I40E_GLGEN_MSCA_DEVADD_SHIFT)
+#define I40E_GLGEN_MSCA_PHYADD_SHIFT      21
+#define I40E_GLGEN_MSCA_PHYADD_MASK       I40E_MASK(0x1F, I40E_GLGEN_MSCA_PHYADD_SHIFT)
+#define I40E_GLGEN_MSCA_OPCODE_SHIFT      26
+#define I40E_GLGEN_MSCA_OPCODE_MASK       I40E_MASK(0x3, I40E_GLGEN_MSCA_OPCODE_SHIFT)
+#define I40E_GLGEN_MSCA_STCODE_SHIFT      28
+#define I40E_GLGEN_MSCA_STCODE_MASK       I40E_MASK(0x3, I40E_GLGEN_MSCA_STCODE_SHIFT)
+#define I40E_GLGEN_MSCA_MDICMD_SHIFT      30
+#define I40E_GLGEN_MSCA_MDICMD_MASK       I40E_MASK(0x1, I40E_GLGEN_MSCA_MDICMD_SHIFT)
+#define I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT 31
+#define I40E_GLGEN_MSCA_MDIINPROGEN_MASK  I40E_MASK(0x1, I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT)
+#define I40E_GLGEN_MSRWD(_i)             (0x0008819C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_GLGEN_MSRWD_MAX_INDEX       3
+#define I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT 0
+#define I40E_GLGEN_MSRWD_MDIWRDATA_MASK  I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT)
+#define I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT 16
+#define I40E_GLGEN_MSRWD_MDIRDDATA_MASK  I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT)
+#define I40E_GLGEN_PCIFCNCNT                0x001C0AB4 /* Reset: PCIR */
+#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT 0
+#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_MASK  I40E_MASK(0x1F, I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT)
+#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT 16
+#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_MASK  I40E_MASK(0xFF, I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT)
+#define I40E_GLGEN_RSTAT                   0x000B8188 /* Reset: POR */
+#define I40E_GLGEN_RSTAT_DEVSTATE_SHIFT    0
+#define I40E_GLGEN_RSTAT_DEVSTATE_MASK     I40E_MASK(0x3, I40E_GLGEN_RSTAT_DEVSTATE_SHIFT)
+#define I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT  2
+#define I40E_GLGEN_RSTAT_RESET_TYPE_MASK   I40E_MASK(0x3, I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT)
+#define I40E_GLGEN_RSTAT_CORERCNT_SHIFT    4
+#define I40E_GLGEN_RSTAT_CORERCNT_MASK     I40E_MASK(0x3, I40E_GLGEN_RSTAT_CORERCNT_SHIFT)
+#define I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT    6
+#define I40E_GLGEN_RSTAT_GLOBRCNT_MASK     I40E_MASK(0x3, I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT)
+#define I40E_GLGEN_RSTAT_EMPRCNT_SHIFT     8
+#define I40E_GLGEN_RSTAT_EMPRCNT_MASK      I40E_MASK(0x3, I40E_GLGEN_RSTAT_EMPRCNT_SHIFT)
+#define I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT 10
+#define I40E_GLGEN_RSTAT_TIME_TO_RST_MASK  I40E_MASK(0x3F, I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT)
+#define I40E_GLGEN_RSTCTL                   0x000B8180 /* Reset: POR */
+#define I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT     0
+#define I40E_GLGEN_RSTCTL_GRSTDEL_MASK      I40E_MASK(0x3F, I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT)
+#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT 8
+#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_MASK  I40E_MASK(0x1, I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT)
+#define I40E_GLGEN_RTRIG              0x000B8190 /* Reset: CORER */
+#define I40E_GLGEN_RTRIG_CORER_SHIFT  0
+#define I40E_GLGEN_RTRIG_CORER_MASK   I40E_MASK(0x1, I40E_GLGEN_RTRIG_CORER_SHIFT)
+#define I40E_GLGEN_RTRIG_GLOBR_SHIFT  1
+#define I40E_GLGEN_RTRIG_GLOBR_MASK   I40E_MASK(0x1, I40E_GLGEN_RTRIG_GLOBR_SHIFT)
+#define I40E_GLGEN_RTRIG_EMPFWR_SHIFT 2
+#define I40E_GLGEN_RTRIG_EMPFWR_MASK  I40E_MASK(0x1, I40E_GLGEN_RTRIG_EMPFWR_SHIFT)
+#define I40E_GLGEN_STAT               0x000B612C /* Reset: POR */
+#define I40E_GLGEN_STAT_HWRSVD0_SHIFT 0
+#define I40E_GLGEN_STAT_HWRSVD0_MASK  I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD0_SHIFT)
+#define I40E_GLGEN_STAT_DCBEN_SHIFT   2
+#define I40E_GLGEN_STAT_DCBEN_MASK    I40E_MASK(0x1, I40E_GLGEN_STAT_DCBEN_SHIFT)
+#define I40E_GLGEN_STAT_VTEN_SHIFT    3
+#define I40E_GLGEN_STAT_VTEN_MASK     I40E_MASK(0x1, I40E_GLGEN_STAT_VTEN_SHIFT)
+#define I40E_GLGEN_STAT_FCOEN_SHIFT   4
+#define I40E_GLGEN_STAT_FCOEN_MASK    I40E_MASK(0x1, I40E_GLGEN_STAT_FCOEN_SHIFT)
+#define I40E_GLGEN_STAT_EVBEN_SHIFT   5
+#define I40E_GLGEN_STAT_EVBEN_MASK    I40E_MASK(0x1, I40E_GLGEN_STAT_EVBEN_SHIFT)
+#define I40E_GLGEN_STAT_HWRSVD1_SHIFT 6
+#define I40E_GLGEN_STAT_HWRSVD1_MASK  I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD1_SHIFT)
+#define I40E_GLGEN_VFLRSTAT(_i)         (0x00092600 + ((_i) * 4)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLGEN_VFLRSTAT_MAX_INDEX   3
+#define I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT 0
+#define I40E_GLGEN_VFLRSTAT_VFLRE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT)
+#define I40E_GLVFGEN_TIMER             0x000881BC /* Reset: CORER */
+#define I40E_GLVFGEN_TIMER_GTIME_SHIFT 0
+#define I40E_GLVFGEN_TIMER_GTIME_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVFGEN_TIMER_GTIME_SHIFT)
+#define I40E_PFGEN_CTRL             0x00092400 /* Reset: PFR */
+#define I40E_PFGEN_CTRL_PFSWR_SHIFT 0
+#define I40E_PFGEN_CTRL_PFSWR_MASK  I40E_MASK(0x1, I40E_PFGEN_CTRL_PFSWR_SHIFT)
+#define I40E_PFGEN_DRUN               0x00092500 /* Reset: CORER */
+#define I40E_PFGEN_DRUN_DRVUNLD_SHIFT 0
+#define I40E_PFGEN_DRUN_DRVUNLD_MASK  I40E_MASK(0x1, I40E_PFGEN_DRUN_DRVUNLD_SHIFT)
+#define I40E_PFGEN_PORTNUM                0x001C0480 /* Reset: CORER */
+#define I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT 0
+#define I40E_PFGEN_PORTNUM_PORT_NUM_MASK  I40E_MASK(0x3, I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT)
+#define I40E_PFGEN_STATE                  0x00088000 /* Reset: CORER */
+#define I40E_PFGEN_STATE_RESERVED_0_SHIFT 0
+#define I40E_PFGEN_STATE_RESERVED_0_MASK  I40E_MASK(0x1, I40E_PFGEN_STATE_RESERVED_0_SHIFT)
+#define I40E_PFGEN_STATE_PFFCEN_SHIFT     1
+#define I40E_PFGEN_STATE_PFFCEN_MASK      I40E_MASK(0x1, I40E_PFGEN_STATE_PFFCEN_SHIFT)
+#define I40E_PFGEN_STATE_PFLINKEN_SHIFT   2
+#define I40E_PFGEN_STATE_PFLINKEN_MASK    I40E_MASK(0x1, I40E_PFGEN_STATE_PFLINKEN_SHIFT)
+#define I40E_PFGEN_STATE_PFSCEN_SHIFT     3
+#define I40E_PFGEN_STATE_PFSCEN_MASK      I40E_MASK(0x1, I40E_PFGEN_STATE_PFSCEN_SHIFT)
+#define I40E_PRTGEN_CNF                      0x000B8120 /* Reset: POR */
+#define I40E_PRTGEN_CNF_PORT_DIS_SHIFT       0
+#define I40E_PRTGEN_CNF_PORT_DIS_MASK        I40E_MASK(0x1, I40E_PRTGEN_CNF_PORT_DIS_SHIFT)
+#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT 1
+#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_MASK  I40E_MASK(0x1, I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT)
+#define I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT   2
+#define I40E_PRTGEN_CNF_EMP_PORT_DIS_MASK    I40E_MASK(0x1, I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT)
+#define I40E_PRTGEN_CNF2                          0x000B8160 /* Reset: POR */
+#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT 0
+#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_MASK  I40E_MASK(0x1, I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT)
+#define I40E_PRTGEN_STATUS                   0x000B8100 /* Reset: POR */
+#define I40E_PRTGEN_STATUS_PORT_VALID_SHIFT  0
+#define I40E_PRTGEN_STATUS_PORT_VALID_MASK   I40E_MASK(0x1, I40E_PRTGEN_STATUS_PORT_VALID_SHIFT)
+#define I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT 1
+#define I40E_PRTGEN_STATUS_PORT_ACTIVE_MASK  I40E_MASK(0x1, I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT)
+#define I40E_VFGEN_RSTAT1(_VF)            (0x00074400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFGEN_RSTAT1_MAX_INDEX       127
+#define I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT 0
+#define I40E_VFGEN_RSTAT1_VFR_STATE_MASK  I40E_MASK(0x3, I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT)
+#define I40E_VPGEN_VFRSTAT(_VF)       (0x00091C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VPGEN_VFRSTAT_MAX_INDEX  127
+#define I40E_VPGEN_VFRSTAT_VFRD_SHIFT 0
+#define I40E_VPGEN_VFRSTAT_VFRD_MASK  I40E_MASK(0x1, I40E_VPGEN_VFRSTAT_VFRD_SHIFT)
+#define I40E_VPGEN_VFRTRIG(_VF)        (0x00091800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VPGEN_VFRTRIG_MAX_INDEX   127
+#define I40E_VPGEN_VFRTRIG_VFSWR_SHIFT 0
+#define I40E_VPGEN_VFRTRIG_VFSWR_MASK  I40E_MASK(0x1, I40E_VPGEN_VFRTRIG_VFSWR_SHIFT)
+#define I40E_VSIGEN_RSTAT(_VSI)      (0x00090800 + ((_VSI) * 4)) /* _VSI=0...383 */ /* Reset: CORER */
+#define I40E_VSIGEN_RSTAT_MAX_INDEX  383
+#define I40E_VSIGEN_RSTAT_VMRD_SHIFT 0
+#define I40E_VSIGEN_RSTAT_VMRD_MASK  I40E_MASK(0x1, I40E_VSIGEN_RSTAT_VMRD_SHIFT)
+#define I40E_VSIGEN_RTRIG(_VSI)       (0x00090000 + ((_VSI) * 4)) /* _VSI=0...383 */ /* Reset: CORER */
+#define I40E_VSIGEN_RTRIG_MAX_INDEX   383
+#define I40E_VSIGEN_RTRIG_VMSWR_SHIFT 0
+#define I40E_VSIGEN_RTRIG_VMSWR_MASK  I40E_MASK(0x1, I40E_VSIGEN_RTRIG_VMSWR_SHIFT)
+#define I40E_GLHMC_FCOEDDPBASE(_i)                  (0x000C6600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FCOEDDPBASE_MAX_INDEX            15
+#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT 0
+#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT)
+#define I40E_GLHMC_FCOEDDPCNT(_i)                 (0x000C6700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FCOEDDPCNT_MAX_INDEX           15
+#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT 0
+#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_MASK  I40E_MASK(0xFFFFF, I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT)
+#define I40E_GLHMC_FCOEDDPOBJSZ                      0x000C2010 /* Reset: CORER */
+#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT 0
+#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT)
+#define I40E_GLHMC_FCOEFBASE(_i)                (0x000C6800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FCOEFBASE_MAX_INDEX          15
+#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT 0
+#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT)
+#define I40E_GLHMC_FCOEFCNT(_i)               (0x000C6900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FCOEFCNT_MAX_INDEX         15
+#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT 0
+#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_MASK  I40E_MASK(0x7FFFFF, I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT)
+#define I40E_GLHMC_FCOEFMAX                  0x000C20D0 /* Reset: CORER */
+#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT 0
+#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK  I40E_MASK(0xFFFF, I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT)
+#define I40E_GLHMC_FCOEFOBJSZ                    0x000C2018 /* Reset: CORER */
+#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT 0
+#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT)
+#define I40E_GLHMC_FCOEMAX                 0x000C2014 /* Reset: CORER */
+#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT 0
+#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_MASK  I40E_MASK(0x1FFF, I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT)
+#define I40E_GLHMC_FSIAVBASE(_i)                (0x000C5600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FSIAVBASE_MAX_INDEX          15
+#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT 0
+#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT)
+#define I40E_GLHMC_FSIAVCNT(_i)               (0x000C5700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FSIAVCNT_MAX_INDEX         15
+#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT 0
+#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT)
+#define I40E_GLHMC_FSIAVCNT_RSVD_SHIFT        29
+#define I40E_GLHMC_FSIAVCNT_RSVD_MASK         I40E_MASK(0x7, I40E_GLHMC_FSIAVCNT_RSVD_SHIFT)
+#define I40E_GLHMC_FSIAVMAX                  0x000C2068 /* Reset: CORER */
+#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT 0
+#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_MASK  I40E_MASK(0x1FFFF, I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT)
+#define I40E_GLHMC_FSIAVOBJSZ                    0x000C2064 /* Reset: CORER */
+#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT 0
+#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT)
+#define I40E_GLHMC_FSIMCBASE(_i)                (0x000C6000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FSIMCBASE_MAX_INDEX          15
+#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT 0
+#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT)
+#define I40E_GLHMC_FSIMCCNT(_i)              (0x000C6100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_FSIMCCNT_MAX_INDEX        15
+#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT 0
+#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT)
+#define I40E_GLHMC_FSIMCMAX                  0x000C2060 /* Reset: CORER */
+#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT 0
+#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_MASK  I40E_MASK(0x3FFF, I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT)
+#define I40E_GLHMC_FSIMCOBJSZ                    0x000C205c /* Reset: CORER */
+#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT 0
+#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT)
+#define I40E_GLHMC_LANQMAX                 0x000C2008 /* Reset: CORER */
+#define I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT 0
+#define I40E_GLHMC_LANQMAX_PMLANQMAX_MASK  I40E_MASK(0x7FF, I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT)
+#define I40E_GLHMC_LANRXBASE(_i)                (0x000C6400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_LANRXBASE_MAX_INDEX          15
+#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT 0
+#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT)
+#define I40E_GLHMC_LANRXCNT(_i)               (0x000C6500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_LANRXCNT_MAX_INDEX         15
+#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT 0
+#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_MASK  I40E_MASK(0x7FF, I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT)
+#define I40E_GLHMC_LANRXOBJSZ                    0x000C200c /* Reset: CORER */
+#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT 0
+#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT)
+#define I40E_GLHMC_LANTXBASE(_i)                (0x000C6200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_LANTXBASE_MAX_INDEX          15
+#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT 0
+#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT)
+#define I40E_GLHMC_LANTXBASE_RSVD_SHIFT         24
+#define I40E_GLHMC_LANTXBASE_RSVD_MASK          I40E_MASK(0xFF, I40E_GLHMC_LANTXBASE_RSVD_SHIFT)
+#define I40E_GLHMC_LANTXCNT(_i)               (0x000C6300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_LANTXCNT_MAX_INDEX         15
+#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT 0
+#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_MASK  I40E_MASK(0x7FF, I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT)
+#define I40E_GLHMC_LANTXOBJSZ                    0x000C2004 /* Reset: CORER */
+#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT 0
+#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT)
+#define I40E_GLHMC_PFASSIGN(_i)                 (0x000C0c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PFASSIGN_MAX_INDEX           15
+#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT 0
+#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_MASK  I40E_MASK(0xF, I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT)
+#define I40E_GLHMC_SDPART(_i)            (0x000C0800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_SDPART_MAX_INDEX      15
+#define I40E_GLHMC_SDPART_PMSDBASE_SHIFT 0
+#define I40E_GLHMC_SDPART_PMSDBASE_MASK  I40E_MASK(0xFFF, I40E_GLHMC_SDPART_PMSDBASE_SHIFT)
+#define I40E_GLHMC_SDPART_PMSDSIZE_SHIFT 16
+#define I40E_GLHMC_SDPART_PMSDSIZE_MASK  I40E_MASK(0x1FFF, I40E_GLHMC_SDPART_PMSDSIZE_SHIFT)
+#define I40E_PFHMC_ERRORDATA                      0x000C0500 /* Reset: PFR */
+#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT 0
+#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_MASK  I40E_MASK(0x3FFFFFFF, I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT)
+#define I40E_PFHMC_ERRORINFO                       0x000C0400 /* Reset: PFR */
+#define I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT       0
+#define I40E_PFHMC_ERRORINFO_PMF_INDEX_MASK        I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT)
+#define I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT        7
+#define I40E_PFHMC_ERRORINFO_PMF_ISVF_MASK         I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT)
+#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT  8
+#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_MASK   I40E_MASK(0xF, I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT)
+#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT 16
+#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_MASK  I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT)
+#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT  31
+#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_MASK   I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT)
+#define I40E_PFHMC_PDINV               0x000C0300 /* Reset: PFR */
+#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0
+#define I40E_PFHMC_PDINV_PMSDIDX_MASK  I40E_MASK(0xFFF, I40E_PFHMC_PDINV_PMSDIDX_SHIFT)
+#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16
+#define I40E_PFHMC_PDINV_PMPDIDX_MASK  I40E_MASK(0x1FF, I40E_PFHMC_PDINV_PMPDIDX_SHIFT)
+#define I40E_PFHMC_SDCMD               0x000C0000 /* Reset: PFR */
+#define I40E_PFHMC_SDCMD_PMSDIDX_SHIFT 0
+#define I40E_PFHMC_SDCMD_PMSDIDX_MASK  I40E_MASK(0xFFF, I40E_PFHMC_SDCMD_PMSDIDX_SHIFT)
+#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT  31
+#define I40E_PFHMC_SDCMD_PMSDWR_MASK   I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDWR_SHIFT)
+#define I40E_PFHMC_SDDATAHIGH                    0x000C0200 /* Reset: PFR */
+#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT 0
+#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT)
+#define I40E_PFHMC_SDDATALOW                   0x000C0100 /* Reset: PFR */
+#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT   0
+#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK    I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT    1
+#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK     I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2
+#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK  I40E_MASK(0x3FF, I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT 12
+#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_MASK  I40E_MASK(0xFFFFF, I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT)
+#define I40E_GL_GP_FUSE(_i)              (0x0009400C + ((_i) * 4)) /* _i=0...28 */ /* Reset: POR */
+#define I40E_GL_GP_FUSE_MAX_INDEX        28
+#define I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT 0
+#define I40E_GL_GP_FUSE_GL_GP_FUSE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT)
+#define I40E_GL_UFUSE                        0x00094008 /* Reset: POR */
+#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT 1
+#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_MASK  I40E_MASK(0x1, I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT)
+#define I40E_GL_UFUSE_NIC_ID_SHIFT           2
+#define I40E_GL_UFUSE_NIC_ID_MASK            I40E_MASK(0x1, I40E_GL_UFUSE_NIC_ID_SHIFT)
+#define I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT      10
+#define I40E_GL_UFUSE_ULT_LOCKOUT_MASK       I40E_MASK(0x1, I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT)
+#define I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT      11
+#define I40E_GL_UFUSE_CLS_LOCKOUT_MASK       I40E_MASK(0x1, I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT)
+#define I40E_EMPINT_GPIO_ENA                  0x00088188 /* Reset: POR */
+#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT  0
+#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT  1
+#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT  2
+#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT  3
+#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT  4
+#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT  5
+#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT  6
+#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT  7
+#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT  8
+#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT  9
+#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_MASK   I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT 10
+#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT 11
+#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT 12
+#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT 13
+#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT 14
+#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT 15
+#define I40E_EMPINT_GPIO_ENA_GPIO15_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT 16
+#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT 17
+#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT 18
+#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT 19
+#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT 20
+#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT 21
+#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT 22
+#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT 23
+#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT 24
+#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT 25
+#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT 26
+#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT 27
+#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT 28
+#define I40E_EMPINT_GPIO_ENA_GPIO28_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT)
+#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT 29
+#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_MASK  I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT)
+#define I40E_PFGEN_PORTMDIO_NUM                       0x0003F100 /* Reset: CORER */
+#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT        0
+#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_MASK         I40E_MASK(0x3, I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT)
+#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT 4
+#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_MASK  I40E_MASK(0x1, I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT)
+#define I40E_PFINT_AEQCTL                  0x00038700 /* Reset: CORER */
+#define I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT  0
+#define I40E_PFINT_AEQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT)
+#define I40E_PFINT_AEQCTL_ITR_INDX_SHIFT   11
+#define I40E_PFINT_AEQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_PFINT_AEQCTL_ITR_INDX_SHIFT)
+#define I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_PFINT_AEQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT)
+#define I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_PFINT_AEQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT)
+#define I40E_PFINT_AEQCTL_INTEVENT_SHIFT   31
+#define I40E_PFINT_AEQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_PFINT_AEQCTL_INTEVENT_SHIFT)
+#define I40E_PFINT_CEQCTL(_INTPF)          (0x00036800 + ((_INTPF) * 4)) /* _INTPF=0...511 */ /* Reset: CORER */
+#define I40E_PFINT_CEQCTL_MAX_INDEX        511
+#define I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT  0
+#define I40E_PFINT_CEQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT)
+#define I40E_PFINT_CEQCTL_ITR_INDX_SHIFT   11
+#define I40E_PFINT_CEQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_PFINT_CEQCTL_ITR_INDX_SHIFT)
+#define I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_PFINT_CEQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT)
+#define I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT 16
+#define I40E_PFINT_CEQCTL_NEXTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT)
+#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT 27
+#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_MASK  I40E_MASK(0x3, I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT)
+#define I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_PFINT_CEQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT)
+#define I40E_PFINT_CEQCTL_INTEVENT_SHIFT   31
+#define I40E_PFINT_CEQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_PFINT_CEQCTL_INTEVENT_SHIFT)
+#define I40E_GLINT_CTL				0x0003F800 /* Reset: CORER */
+#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT	0
+#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_MASK	I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT)
+#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT	1
+#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_MASK	I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT)
+#define I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT	2
+#define I40E_GLINT_CTL_DIS_AUTOMASK_N_MASK	I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT)
+#define I40E_PFINT_DYN_CTL0                       0x00038480 /* Reset: PFR */
+#define I40E_PFINT_DYN_CTL0_INTENA_SHIFT          0
+#define I40E_PFINT_DYN_CTL0_INTENA_MASK           I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_SHIFT)
+#define I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT        1
+#define I40E_PFINT_DYN_CTL0_CLEARPBA_MASK         I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT)
+#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT      2
+#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT)
+#define I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT        3
+#define I40E_PFINT_DYN_CTL0_ITR_INDX_MASK         I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT)
+#define I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT        5
+#define I40E_PFINT_DYN_CTL0_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT)
+#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT     25
+#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT)
+#define I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT      31
+#define I40E_PFINT_DYN_CTL0_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT)
+#define I40E_PFINT_DYN_CTLN(_INTPF)               (0x00034800 + ((_INTPF) * 4)) /* _INTPF=0...511 */ /* Reset: PFR */
+#define I40E_PFINT_DYN_CTLN_MAX_INDEX             511
+#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT          0
+#define I40E_PFINT_DYN_CTLN_INTENA_MASK           I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_SHIFT)
+#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT        1
+#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK         I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT)
+#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT      2
+#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT)
+#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT        3
+#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK         I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT)
+#define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT        5
+#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT)
+#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT     25
+#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT)
+#define I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT      31
+#define I40E_PFINT_DYN_CTLN_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT)
+#define I40E_PFINT_GPIO_ENA                  0x00088080 /* Reset: CORER */
+#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT  0
+#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT  1
+#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT  2
+#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT  3
+#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT  4
+#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT  5
+#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT  6
+#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT  7
+#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT  8
+#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT  9
+#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_MASK   I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT 10
+#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT 11
+#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT 12
+#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT 13
+#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT 14
+#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT 15
+#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT 16
+#define I40E_PFINT_GPIO_ENA_GPIO16_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT 17
+#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT 18
+#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT 19
+#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT 20
+#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT 21
+#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT 22
+#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT 23
+#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT 24
+#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT 25
+#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT 26
+#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT 27
+#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT 28
+#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT)
+#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT 29
+#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT)
+#define I40E_PFINT_ICR0                        0x00038780 /* Reset: CORER */
+#define I40E_PFINT_ICR0_INTEVENT_SHIFT         0
+#define I40E_PFINT_ICR0_INTEVENT_MASK          I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_0_SHIFT          1
+#define I40E_PFINT_ICR0_QUEUE_0_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_0_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_1_SHIFT          2
+#define I40E_PFINT_ICR0_QUEUE_1_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_1_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_2_SHIFT          3
+#define I40E_PFINT_ICR0_QUEUE_2_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_2_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_3_SHIFT          4
+#define I40E_PFINT_ICR0_QUEUE_3_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_3_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_4_SHIFT          5
+#define I40E_PFINT_ICR0_QUEUE_4_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_4_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_5_SHIFT          6
+#define I40E_PFINT_ICR0_QUEUE_5_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_5_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_6_SHIFT          7
+#define I40E_PFINT_ICR0_QUEUE_6_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_6_SHIFT)
+#define I40E_PFINT_ICR0_QUEUE_7_SHIFT          8
+#define I40E_PFINT_ICR0_QUEUE_7_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_7_SHIFT)
+#define I40E_PFINT_ICR0_ECC_ERR_SHIFT          16
+#define I40E_PFINT_ICR0_ECC_ERR_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_ECC_ERR_SHIFT)
+#define I40E_PFINT_ICR0_MAL_DETECT_SHIFT       19
+#define I40E_PFINT_ICR0_MAL_DETECT_MASK        I40E_MASK(0x1, I40E_PFINT_ICR0_MAL_DETECT_SHIFT)
+#define I40E_PFINT_ICR0_GRST_SHIFT             20
+#define I40E_PFINT_ICR0_GRST_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_GRST_SHIFT)
+#define I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT    21
+#define I40E_PFINT_ICR0_PCI_EXCEPTION_MASK     I40E_MASK(0x1, I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT)
+#define I40E_PFINT_ICR0_GPIO_SHIFT             22
+#define I40E_PFINT_ICR0_GPIO_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_GPIO_SHIFT)
+#define I40E_PFINT_ICR0_TIMESYNC_SHIFT         23
+#define I40E_PFINT_ICR0_TIMESYNC_MASK          I40E_MASK(0x1, I40E_PFINT_ICR0_TIMESYNC_SHIFT)
+#define I40E_PFINT_ICR0_STORM_DETECT_SHIFT     24
+#define I40E_PFINT_ICR0_STORM_DETECT_MASK      I40E_MASK(0x1, I40E_PFINT_ICR0_STORM_DETECT_SHIFT)
+#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT)
+#define I40E_PFINT_ICR0_HMC_ERR_SHIFT          26
+#define I40E_PFINT_ICR0_HMC_ERR_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_HMC_ERR_SHIFT)
+#define I40E_PFINT_ICR0_PE_CRITERR_SHIFT       28
+#define I40E_PFINT_ICR0_PE_CRITERR_MASK        I40E_MASK(0x1, I40E_PFINT_ICR0_PE_CRITERR_SHIFT)
+#define I40E_PFINT_ICR0_VFLR_SHIFT             29
+#define I40E_PFINT_ICR0_VFLR_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_VFLR_SHIFT)
+#define I40E_PFINT_ICR0_ADMINQ_SHIFT           30
+#define I40E_PFINT_ICR0_ADMINQ_MASK            I40E_MASK(0x1, I40E_PFINT_ICR0_ADMINQ_SHIFT)
+#define I40E_PFINT_ICR0_SWINT_SHIFT            31
+#define I40E_PFINT_ICR0_SWINT_MASK             I40E_MASK(0x1, I40E_PFINT_ICR0_SWINT_SHIFT)
+#define I40E_PFINT_ICR0_ENA                        0x00038800 /* Reset: CORER */
+#define I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT          16
+#define I40E_PFINT_ICR0_ENA_ECC_ERR_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT)
+#define I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT       19
+#define I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK        I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT)
+#define I40E_PFINT_ICR0_ENA_GRST_SHIFT             20
+#define I40E_PFINT_ICR0_ENA_GRST_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GRST_SHIFT)
+#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT    21
+#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_MASK     I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT)
+#define I40E_PFINT_ICR0_ENA_GPIO_SHIFT             22
+#define I40E_PFINT_ICR0_ENA_GPIO_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GPIO_SHIFT)
+#define I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT         23
+#define I40E_PFINT_ICR0_ENA_TIMESYNC_MASK          I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT)
+#define I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT     24
+#define I40E_PFINT_ICR0_ENA_STORM_DETECT_MASK      I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT)
+#define I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT)
+#define I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT          26
+#define I40E_PFINT_ICR0_ENA_HMC_ERR_MASK           I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT)
+#define I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT       28
+#define I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK        I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT)
+#define I40E_PFINT_ICR0_ENA_VFLR_SHIFT             29
+#define I40E_PFINT_ICR0_ENA_VFLR_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_VFLR_SHIFT)
+#define I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT           30
+#define I40E_PFINT_ICR0_ENA_ADMINQ_MASK            I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT)
+#define I40E_PFINT_ICR0_ENA_RSVD_SHIFT             31
+#define I40E_PFINT_ICR0_ENA_RSVD_MASK              I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_RSVD_SHIFT)
+#define I40E_PFINT_ITR0(_i)            (0x00038000 + ((_i) * 128)) /* _i=0...2 */ /* Reset: PFR */
+#define I40E_PFINT_ITR0_MAX_INDEX      2
+#define I40E_PFINT_ITR0_INTERVAL_SHIFT 0
+#define I40E_PFINT_ITR0_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_PFINT_ITR0_INTERVAL_SHIFT)
+#define I40E_PFINT_ITRN(_i, _INTPF)     (0x00030000 + ((_i) * 2048 + (_INTPF) * 4)) /* _i=0...2, _INTPF=0...511 */ /* Reset: PFR */
+#define I40E_PFINT_ITRN_MAX_INDEX      2
+#define I40E_PFINT_ITRN_INTERVAL_SHIFT 0
+#define I40E_PFINT_ITRN_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_PFINT_ITRN_INTERVAL_SHIFT)
+#define I40E_PFINT_LNKLST0                   0x00038500 /* Reset: PFR */
+#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT 0
+#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT)
+#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11
+#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_MASK  I40E_MASK(0x3, I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT)
+#define I40E_PFINT_LNKLSTN(_INTPF)           (0x00035000 + ((_INTPF) * 4)) /* _INTPF=0...511 */ /* Reset: PFR */
+#define I40E_PFINT_LNKLSTN_MAX_INDEX         511
+#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0
+#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT)
+#define I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11
+#define I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_MASK  I40E_MASK(0x3, I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT)
+#define I40E_PFINT_RATE0                 0x00038580 /* Reset: PFR */
+#define I40E_PFINT_RATE0_INTERVAL_SHIFT  0
+#define I40E_PFINT_RATE0_INTERVAL_MASK   I40E_MASK(0x3F, I40E_PFINT_RATE0_INTERVAL_SHIFT)
+#define I40E_PFINT_RATE0_INTRL_ENA_SHIFT 6
+#define I40E_PFINT_RATE0_INTRL_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_RATE0_INTRL_ENA_SHIFT)
+#define I40E_PFINT_RATEN(_INTPF)         (0x00035800 + ((_INTPF) * 4)) /* _INTPF=0...511 */ /* Reset: PFR */
+#define I40E_PFINT_RATEN_MAX_INDEX       511
+#define I40E_PFINT_RATEN_INTERVAL_SHIFT  0
+#define I40E_PFINT_RATEN_INTERVAL_MASK   I40E_MASK(0x3F, I40E_PFINT_RATEN_INTERVAL_SHIFT)
+#define I40E_PFINT_RATEN_INTRL_ENA_SHIFT 6
+#define I40E_PFINT_RATEN_INTRL_ENA_MASK  I40E_MASK(0x1, I40E_PFINT_RATEN_INTRL_ENA_SHIFT)
+#define I40E_PFINT_STAT_CTL0                      0x00038400 /* Reset: CORER */
+#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2
+#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_MASK  I40E_MASK(0x3, I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT)
+#define I40E_QINT_RQCTL(_Q)              (0x0003A000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: CORER */
+#define I40E_QINT_RQCTL_MAX_INDEX        1535
+#define I40E_QINT_RQCTL_MSIX_INDX_SHIFT  0
+#define I40E_QINT_RQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_QINT_RQCTL_MSIX_INDX_SHIFT)
+#define I40E_QINT_RQCTL_ITR_INDX_SHIFT   11
+#define I40E_QINT_RQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_QINT_RQCTL_ITR_INDX_SHIFT)
+#define I40E_QINT_RQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_QINT_RQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_QINT_RQCTL_MSIX0_INDX_SHIFT)
+#define I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT 16
+#define I40E_QINT_RQCTL_NEXTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT)
+#define I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT 27
+#define I40E_QINT_RQCTL_NEXTQ_TYPE_MASK  I40E_MASK(0x3, I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT)
+#define I40E_QINT_RQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_QINT_RQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_QINT_RQCTL_CAUSE_ENA_SHIFT)
+#define I40E_QINT_RQCTL_INTEVENT_SHIFT   31
+#define I40E_QINT_RQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_QINT_RQCTL_INTEVENT_SHIFT)
+#define I40E_QINT_TQCTL(_Q)              (0x0003C000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: CORER */
+#define I40E_QINT_TQCTL_MAX_INDEX        1535
+#define I40E_QINT_TQCTL_MSIX_INDX_SHIFT  0
+#define I40E_QINT_TQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_QINT_TQCTL_MSIX_INDX_SHIFT)
+#define I40E_QINT_TQCTL_ITR_INDX_SHIFT   11
+#define I40E_QINT_TQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_QINT_TQCTL_ITR_INDX_SHIFT)
+#define I40E_QINT_TQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_QINT_TQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_QINT_TQCTL_MSIX0_INDX_SHIFT)
+#define I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT 16
+#define I40E_QINT_TQCTL_NEXTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT)
+#define I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT 27
+#define I40E_QINT_TQCTL_NEXTQ_TYPE_MASK  I40E_MASK(0x3, I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT)
+#define I40E_QINT_TQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_QINT_TQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_QINT_TQCTL_CAUSE_ENA_SHIFT)
+#define I40E_QINT_TQCTL_INTEVENT_SHIFT   31
+#define I40E_QINT_TQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_QINT_TQCTL_INTEVENT_SHIFT)
+#define I40E_VFINT_DYN_CTL0(_VF)                  (0x0002A400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFINT_DYN_CTL0_MAX_INDEX             127
+#define I40E_VFINT_DYN_CTL0_INTENA_SHIFT          0
+#define I40E_VFINT_DYN_CTL0_INTENA_MASK           I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_SHIFT)
+#define I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT        1
+#define I40E_VFINT_DYN_CTL0_CLEARPBA_MASK         I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT)
+#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT      2
+#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT)
+#define I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT        3
+#define I40E_VFINT_DYN_CTL0_ITR_INDX_MASK         I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT        5
+#define I40E_VFINT_DYN_CTL0_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT)
+#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT     25
+#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT      31
+#define I40E_VFINT_DYN_CTL0_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT)
+#define I40E_VFINT_DYN_CTLN(_INTVF)               (0x00024800 + ((_INTVF) * 4)) /* _INTVF=0...511 */ /* Reset: VFR */
+#define I40E_VFINT_DYN_CTLN_MAX_INDEX             511
+#define I40E_VFINT_DYN_CTLN_INTENA_SHIFT          0
+#define I40E_VFINT_DYN_CTLN_INTENA_MASK           I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_SHIFT)
+#define I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT        1
+#define I40E_VFINT_DYN_CTLN_CLEARPBA_MASK         I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT)
+#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT      2
+#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT)
+#define I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT        3
+#define I40E_VFINT_DYN_CTLN_ITR_INDX_MASK         I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT        5
+#define I40E_VFINT_DYN_CTLN_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT)
+#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT     25
+#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT      31
+#define I40E_VFINT_DYN_CTLN_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT)
+#define I40E_VFINT_ICR0(_VF)                   (0x0002BC00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFINT_ICR0_MAX_INDEX              127
+#define I40E_VFINT_ICR0_INTEVENT_SHIFT         0
+#define I40E_VFINT_ICR0_INTEVENT_MASK          I40E_MASK(0x1, I40E_VFINT_ICR0_INTEVENT_SHIFT)
+#define I40E_VFINT_ICR0_QUEUE_0_SHIFT          1
+#define I40E_VFINT_ICR0_QUEUE_0_MASK           I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_0_SHIFT)
+#define I40E_VFINT_ICR0_QUEUE_1_SHIFT          2
+#define I40E_VFINT_ICR0_QUEUE_1_MASK           I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_1_SHIFT)
+#define I40E_VFINT_ICR0_QUEUE_2_SHIFT          3
+#define I40E_VFINT_ICR0_QUEUE_2_MASK           I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_2_SHIFT)
+#define I40E_VFINT_ICR0_QUEUE_3_SHIFT          4
+#define I40E_VFINT_ICR0_QUEUE_3_MASK           I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_3_SHIFT)
+#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT)
+#define I40E_VFINT_ICR0_ADMINQ_SHIFT           30
+#define I40E_VFINT_ICR0_ADMINQ_MASK            I40E_MASK(0x1, I40E_VFINT_ICR0_ADMINQ_SHIFT)
+#define I40E_VFINT_ICR0_SWINT_SHIFT            31
+#define I40E_VFINT_ICR0_SWINT_MASK             I40E_MASK(0x1, I40E_VFINT_ICR0_SWINT_SHIFT)
+#define I40E_VFINT_ICR0_ENA(_VF)                   (0x0002C000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFINT_ICR0_ENA_MAX_INDEX              127
+#define I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT)
+#define I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT           30
+#define I40E_VFINT_ICR0_ENA_ADMINQ_MASK            I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT)
+#define I40E_VFINT_ICR0_ENA_RSVD_SHIFT             31
+#define I40E_VFINT_ICR0_ENA_RSVD_MASK              I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_RSVD_SHIFT)
+#define I40E_VFINT_ITR0(_i, _VF)        (0x00028000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...2, _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFINT_ITR0_MAX_INDEX      2
+#define I40E_VFINT_ITR0_INTERVAL_SHIFT 0
+#define I40E_VFINT_ITR0_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_VFINT_ITR0_INTERVAL_SHIFT)
+#define I40E_VFINT_ITRN(_i, _INTVF)     (0x00020000 + ((_i) * 2048 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...511 */ /* Reset: VFR */
+#define I40E_VFINT_ITRN_MAX_INDEX      2
+#define I40E_VFINT_ITRN_INTERVAL_SHIFT 0
+#define I40E_VFINT_ITRN_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_VFINT_ITRN_INTERVAL_SHIFT)
+#define I40E_VFINT_STAT_CTL0(_VF)                 (0x0002A000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFINT_STAT_CTL0_MAX_INDEX            127
+#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2
+#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_MASK  I40E_MASK(0x3, I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT)
+#define I40E_VPINT_AEQCTL(_VF)             (0x0002B800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VPINT_AEQCTL_MAX_INDEX        127
+#define I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT  0
+#define I40E_VPINT_AEQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT)
+#define I40E_VPINT_AEQCTL_ITR_INDX_SHIFT   11
+#define I40E_VPINT_AEQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_VPINT_AEQCTL_ITR_INDX_SHIFT)
+#define I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_VPINT_AEQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT)
+#define I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_VPINT_AEQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT)
+#define I40E_VPINT_AEQCTL_INTEVENT_SHIFT   31
+#define I40E_VPINT_AEQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_VPINT_AEQCTL_INTEVENT_SHIFT)
+#define I40E_VPINT_CEQCTL(_INTVF)          (0x00026800 + ((_INTVF) * 4)) /* _INTVF=0...511 */ /* Reset: CORER */
+#define I40E_VPINT_CEQCTL_MAX_INDEX        511
+#define I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT  0
+#define I40E_VPINT_CEQCTL_MSIX_INDX_MASK   I40E_MASK(0xFF, I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT)
+#define I40E_VPINT_CEQCTL_ITR_INDX_SHIFT   11
+#define I40E_VPINT_CEQCTL_ITR_INDX_MASK    I40E_MASK(0x3, I40E_VPINT_CEQCTL_ITR_INDX_SHIFT)
+#define I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT 13
+#define I40E_VPINT_CEQCTL_MSIX0_INDX_MASK  I40E_MASK(0x7, I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT)
+#define I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT 16
+#define I40E_VPINT_CEQCTL_NEXTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT)
+#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT 27
+#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_MASK  I40E_MASK(0x3, I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT)
+#define I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT  30
+#define I40E_VPINT_CEQCTL_CAUSE_ENA_MASK   I40E_MASK(0x1, I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT)
+#define I40E_VPINT_CEQCTL_INTEVENT_SHIFT   31
+#define I40E_VPINT_CEQCTL_INTEVENT_MASK    I40E_MASK(0x1, I40E_VPINT_CEQCTL_INTEVENT_SHIFT)
+#define I40E_VPINT_LNKLST0(_VF)              (0x0002A800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPINT_LNKLST0_MAX_INDEX         127
+#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT 0
+#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT)
+#define I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11
+#define I40E_VPINT_LNKLST0_FIRSTQ_TYPE_MASK  I40E_MASK(0x3, I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT)
+#define I40E_VPINT_LNKLSTN(_INTVF)           (0x00025000 + ((_INTVF) * 4)) /* _INTVF=0...511 */ /* Reset: VFR */
+#define I40E_VPINT_LNKLSTN_MAX_INDEX         511
+#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0
+#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK  I40E_MASK(0x7FF, I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT)
+#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11
+#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK  I40E_MASK(0x3, I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT)
+#define I40E_VPINT_RATE0(_VF)            (0x0002AC00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPINT_RATE0_MAX_INDEX       127
+#define I40E_VPINT_RATE0_INTERVAL_SHIFT  0
+#define I40E_VPINT_RATE0_INTERVAL_MASK   I40E_MASK(0x3F, I40E_VPINT_RATE0_INTERVAL_SHIFT)
+#define I40E_VPINT_RATE0_INTRL_ENA_SHIFT 6
+#define I40E_VPINT_RATE0_INTRL_ENA_MASK  I40E_MASK(0x1, I40E_VPINT_RATE0_INTRL_ENA_SHIFT)
+#define I40E_VPINT_RATEN(_INTVF)         (0x00025800 + ((_INTVF) * 4)) /* _INTVF=0...511 */ /* Reset: VFR */
+#define I40E_VPINT_RATEN_MAX_INDEX       511
+#define I40E_VPINT_RATEN_INTERVAL_SHIFT  0
+#define I40E_VPINT_RATEN_INTERVAL_MASK   I40E_MASK(0x3F, I40E_VPINT_RATEN_INTERVAL_SHIFT)
+#define I40E_VPINT_RATEN_INTRL_ENA_SHIFT 6
+#define I40E_VPINT_RATEN_INTRL_ENA_MASK  I40E_MASK(0x1, I40E_VPINT_RATEN_INTRL_ENA_SHIFT)
+#define I40E_GL_RDPU_CNTRL                 0x00051060 /* Reset: CORER */
+#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT 0
+#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_MASK  I40E_MASK(0x1, I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT)
+#define I40E_GL_RDPU_CNTRL_ECO_SHIFT       1
+#define I40E_GL_RDPU_CNTRL_ECO_MASK        I40E_MASK(0x7FFFFFFF, I40E_GL_RDPU_CNTRL_ECO_SHIFT)
+#define I40E_GLLAN_RCTL_0                0x0012A500 /* Reset: CORER */
+#define I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT 0
+#define I40E_GLLAN_RCTL_0_PXE_MODE_MASK  I40E_MASK(0x1, I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT)
+#define I40E_GLLAN_TSOMSK_F               0x000442D8 /* Reset: CORER */
+#define I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT 0
+#define I40E_GLLAN_TSOMSK_F_TCPMSKF_MASK  I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT)
+#define I40E_GLLAN_TSOMSK_L               0x000442E0 /* Reset: CORER */
+#define I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT 0
+#define I40E_GLLAN_TSOMSK_L_TCPMSKL_MASK  I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT)
+#define I40E_GLLAN_TSOMSK_M               0x000442DC /* Reset: CORER */
+#define I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT 0
+#define I40E_GLLAN_TSOMSK_M_TCPMSKM_MASK  I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT)
+#define I40E_GLLAN_TXPRE_QDIS(_i)              (0x000e6500 + ((_i) * 4)) /* _i=0...11 */ /* Reset: CORER */
+#define I40E_GLLAN_TXPRE_QDIS_MAX_INDEX        11
+#define I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT      0
+#define I40E_GLLAN_TXPRE_QDIS_QINDX_MASK       I40E_MASK(0x7FF, I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT)
+#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT  16
+#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_MASK   I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT)
+#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT   30
+#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK    I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT)
+#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT 31
+#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK  I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT)
+#define I40E_PFLAN_QALLOC              0x001C0400 /* Reset: CORER */
+#define I40E_PFLAN_QALLOC_FIRSTQ_SHIFT 0
+#define I40E_PFLAN_QALLOC_FIRSTQ_MASK  I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_FIRSTQ_SHIFT)
+#define I40E_PFLAN_QALLOC_LASTQ_SHIFT  16
+#define I40E_PFLAN_QALLOC_LASTQ_MASK   I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_LASTQ_SHIFT)
+#define I40E_PFLAN_QALLOC_VALID_SHIFT  31
+#define I40E_PFLAN_QALLOC_VALID_MASK   I40E_MASK(0x1, I40E_PFLAN_QALLOC_VALID_SHIFT)
+#define I40E_QRX_ENA(_Q)             (0x00120000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: PFR */
+#define I40E_QRX_ENA_MAX_INDEX       1535
+#define I40E_QRX_ENA_QENA_REQ_SHIFT  0
+#define I40E_QRX_ENA_QENA_REQ_MASK   I40E_MASK(0x1, I40E_QRX_ENA_QENA_REQ_SHIFT)
+#define I40E_QRX_ENA_FAST_QDIS_SHIFT 1
+#define I40E_QRX_ENA_FAST_QDIS_MASK  I40E_MASK(0x1, I40E_QRX_ENA_FAST_QDIS_SHIFT)
+#define I40E_QRX_ENA_QENA_STAT_SHIFT 2
+#define I40E_QRX_ENA_QENA_STAT_MASK  I40E_MASK(0x1, I40E_QRX_ENA_QENA_STAT_SHIFT)
+#define I40E_QRX_TAIL(_Q)        (0x00128000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: CORER */
+#define I40E_QRX_TAIL_MAX_INDEX  1535
+#define I40E_QRX_TAIL_TAIL_SHIFT 0
+#define I40E_QRX_TAIL_TAIL_MASK  I40E_MASK(0x1FFF, I40E_QRX_TAIL_TAIL_SHIFT)
+#define I40E_QTX_CTL(_Q)             (0x00104000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: CORER */
+#define I40E_QTX_CTL_MAX_INDEX       1535
+#define I40E_QTX_CTL_PFVF_Q_SHIFT    0
+#define I40E_QTX_CTL_PFVF_Q_MASK     I40E_MASK(0x3, I40E_QTX_CTL_PFVF_Q_SHIFT)
+#define I40E_QTX_CTL_PF_INDX_SHIFT   2
+#define I40E_QTX_CTL_PF_INDX_MASK    I40E_MASK(0xF, I40E_QTX_CTL_PF_INDX_SHIFT)
+#define I40E_QTX_CTL_VFVM_INDX_SHIFT 7
+#define I40E_QTX_CTL_VFVM_INDX_MASK  I40E_MASK(0x1FF, I40E_QTX_CTL_VFVM_INDX_SHIFT)
+#define I40E_QTX_ENA(_Q)             (0x00100000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: PFR */
+#define I40E_QTX_ENA_MAX_INDEX       1535
+#define I40E_QTX_ENA_QENA_REQ_SHIFT  0
+#define I40E_QTX_ENA_QENA_REQ_MASK   I40E_MASK(0x1, I40E_QTX_ENA_QENA_REQ_SHIFT)
+#define I40E_QTX_ENA_FAST_QDIS_SHIFT 1
+#define I40E_QTX_ENA_FAST_QDIS_MASK  I40E_MASK(0x1, I40E_QTX_ENA_FAST_QDIS_SHIFT)
+#define I40E_QTX_ENA_QENA_STAT_SHIFT 2
+#define I40E_QTX_ENA_QENA_STAT_MASK  I40E_MASK(0x1, I40E_QTX_ENA_QENA_STAT_SHIFT)
+#define I40E_QTX_HEAD(_Q)              (0x000E4000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: CORER */
+#define I40E_QTX_HEAD_MAX_INDEX        1535
+#define I40E_QTX_HEAD_HEAD_SHIFT       0
+#define I40E_QTX_HEAD_HEAD_MASK        I40E_MASK(0x1FFF, I40E_QTX_HEAD_HEAD_SHIFT)
+#define I40E_QTX_HEAD_RS_PENDING_SHIFT 16
+#define I40E_QTX_HEAD_RS_PENDING_MASK  I40E_MASK(0x1, I40E_QTX_HEAD_RS_PENDING_SHIFT)
+#define I40E_QTX_TAIL(_Q)        (0x00108000 + ((_Q) * 4)) /* _Q=0...1535 */ /* Reset: PFR */
+#define I40E_QTX_TAIL_MAX_INDEX  1535
+#define I40E_QTX_TAIL_TAIL_SHIFT 0
+#define I40E_QTX_TAIL_TAIL_MASK  I40E_MASK(0x1FFF, I40E_QTX_TAIL_TAIL_SHIFT)
+#define I40E_VPLAN_MAPENA(_VF)           (0x00074000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPLAN_MAPENA_MAX_INDEX      127
+#define I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT 0
+#define I40E_VPLAN_MAPENA_TXRX_ENA_MASK  I40E_MASK(0x1, I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT)
+#define I40E_VPLAN_QTABLE(_i, _VF)      (0x00070000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPLAN_QTABLE_MAX_INDEX    15
+#define I40E_VPLAN_QTABLE_QINDEX_SHIFT 0
+#define I40E_VPLAN_QTABLE_QINDEX_MASK  I40E_MASK(0x7FF, I40E_VPLAN_QTABLE_QINDEX_SHIFT)
+#define I40E_VSILAN_QBASE(_VSI)               (0x0020C800 + ((_VSI) * 4)) /* _VSI=0...383 */ /* Reset: PFR */
+#define I40E_VSILAN_QBASE_MAX_INDEX           383
+#define I40E_VSILAN_QBASE_VSIBASE_SHIFT       0
+#define I40E_VSILAN_QBASE_VSIBASE_MASK        I40E_MASK(0x7FF, I40E_VSILAN_QBASE_VSIBASE_SHIFT)
+#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT 11
+#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK  I40E_MASK(0x1, I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT)
+#define I40E_VSILAN_QTABLE(_i, _VSI)       (0x00200000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...7, _VSI=0...383 */ /* Reset: PFR */
+#define I40E_VSILAN_QTABLE_MAX_INDEX      7
+#define I40E_VSILAN_QTABLE_QINDEX_0_SHIFT 0
+#define I40E_VSILAN_QTABLE_QINDEX_0_MASK  I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_0_SHIFT)
+#define I40E_VSILAN_QTABLE_QINDEX_1_SHIFT 16
+#define I40E_VSILAN_QTABLE_QINDEX_1_MASK  I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_1_SHIFT)
+#define I40E_PRTGL_SAH              0x001E2140 /* Reset: GLOBR */
+#define I40E_PRTGL_SAH_FC_SAH_SHIFT 0
+#define I40E_PRTGL_SAH_FC_SAH_MASK  I40E_MASK(0xFFFF, I40E_PRTGL_SAH_FC_SAH_SHIFT)
+#define I40E_PRTGL_SAH_MFS_SHIFT    16
+#define I40E_PRTGL_SAH_MFS_MASK     I40E_MASK(0xFFFF, I40E_PRTGL_SAH_MFS_SHIFT)
+#define I40E_PRTGL_SAL              0x001E2120 /* Reset: GLOBR */
+#define I40E_PRTGL_SAL_FC_SAL_SHIFT 0
+#define I40E_PRTGL_SAL_FC_SAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTGL_SAL_FC_SAL_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP                              0x001E30E0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_MASK  I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP                              0x001E3260 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_MASK  I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP                              0x001E32E0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_MASK  I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL                                   0x001E3360 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_MASK  I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1                                        0x001E3110 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2                                        0x001E3120 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE                                0x001E30C0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_MASK  I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1                                  0x001E3140 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2                                  0x001E3150 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE                                0x001E30D0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_MASK  I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(_i)                            (0x001E3370 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX                      8
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(_i)                                   (0x001E3400 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MAX_INDEX                             8
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1                            0x001E34B0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT)
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2                            0x001E34C0 /* Reset: GLOBR */
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT 0
+#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A                     0x0008C480 /* Reset: GLOBR */
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT 0
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT 2
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT 4
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT 6
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT 8
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT 10
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT 12
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT 14
+#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B                     0x0008C484 /* Reset: GLOBR */
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT 0
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT 2
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT 4
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT 6
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT 8
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT 10
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT 12
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT)
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT 14
+#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_MASK  I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT)
+#define I40E_GL_FWRESETCNT                  0x00083100 /* Reset: POR */
+#define I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT 0
+#define I40E_GL_FWRESETCNT_FWRESETCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT)
+#define I40E_GL_MNG_FWSM                              0x000B6134 /* Reset: POR */
+#define I40E_GL_MNG_FWSM_FW_MODES_SHIFT               0
+#define I40E_GL_MNG_FWSM_FW_MODES_MASK                I40E_MASK(0x3, I40E_GL_MNG_FWSM_FW_MODES_SHIFT)
+#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT         10
+#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_MASK          I40E_MASK(0x1, I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT)
+#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT       11
+#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_MASK        I40E_MASK(0xF, I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT)
+#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT        15
+#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_MASK         I40E_MASK(0x1, I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT)
+#define I40E_GL_MNG_FWSM_RESET_CNT_SHIFT              16
+#define I40E_GL_MNG_FWSM_RESET_CNT_MASK               I40E_MASK(0x7, I40E_GL_MNG_FWSM_RESET_CNT_SHIFT)
+#define I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT            19
+#define I40E_GL_MNG_FWSM_EXT_ERR_IND_MASK             I40E_MASK(0x3F, I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT)
+#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT 26
+#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_MASK  I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT)
+#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT 27
+#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_MASK  I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT)
+#define I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT 28
+#define I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_MASK  I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT)
+#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT 29
+#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_MASK  I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT)
+#define I40E_GL_MNG_HWARB_CTRL                   0x000B6130 /* Reset: POR */
+#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT 0
+#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_MASK  I40E_MASK(0x1, I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT)
+#define I40E_PRT_MNG_FTFT_DATA(_i)         (0x000852A0 + ((_i) * 32)) /* _i=0...31 */ /* Reset: POR */
+#define I40E_PRT_MNG_FTFT_DATA_MAX_INDEX   31
+#define I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT 0
+#define I40E_PRT_MNG_FTFT_DATA_DWORD_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT)
+#define I40E_PRT_MNG_FTFT_LENGTH              0x00085260 /* Reset: POR */
+#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT 0
+#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_MASK  I40E_MASK(0xFF, I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT)
+#define I40E_PRT_MNG_FTFT_MASK(_i)        (0x00085160 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */
+#define I40E_PRT_MNG_FTFT_MASK_MAX_INDEX  7
+#define I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT 0
+#define I40E_PRT_MNG_FTFT_MASK_MASK_MASK  I40E_MASK(0xFFFF, I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT)
+#define I40E_PRT_MNG_MANC                            0x00256A20 /* Reset: POR */
+#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT 0
+#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT)
+#define I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT         1
+#define I40E_PRT_MNG_MANC_NCSI_DISCARD_MASK          I40E_MASK(0x1, I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT)
+#define I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT           17
+#define I40E_PRT_MNG_MANC_RCV_TCO_EN_MASK            I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT)
+#define I40E_PRT_MNG_MANC_RCV_ALL_SHIFT              19
+#define I40E_PRT_MNG_MANC_RCV_ALL_MASK               I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_ALL_SHIFT)
+#define I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT       25
+#define I40E_PRT_MNG_MANC_FIXED_NET_TYPE_MASK        I40E_MASK(0x1, I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT)
+#define I40E_PRT_MNG_MANC_NET_TYPE_SHIFT             26
+#define I40E_PRT_MNG_MANC_NET_TYPE_MASK              I40E_MASK(0x1, I40E_PRT_MNG_MANC_NET_TYPE_SHIFT)
+#define I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT            28
+#define I40E_PRT_MNG_MANC_EN_BMC2OS_MASK             I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT)
+#define I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT           29
+#define I40E_PRT_MNG_MANC_EN_BMC2NET_MASK            I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT)
+#define I40E_PRT_MNG_MAVTV(_i)       (0x00255900 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */
+#define I40E_PRT_MNG_MAVTV_MAX_INDEX 7
+#define I40E_PRT_MNG_MAVTV_VID_SHIFT 0
+#define I40E_PRT_MNG_MAVTV_VID_MASK  I40E_MASK(0xFFF, I40E_PRT_MNG_MAVTV_VID_SHIFT)
+#define I40E_PRT_MNG_MDEF(_i)                             (0x00255D00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */
+#define I40E_PRT_MNG_MDEF_MAX_INDEX                       7
+#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT             0
+#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_MASK              I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT             4
+#define I40E_PRT_MNG_MDEF_BROADCAST_AND_MASK              I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT                  5
+#define I40E_PRT_MNG_MDEF_VLAN_AND_MASK                   I40E_MASK(0xFF, I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT          13
+#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_MASK           I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT          17
+#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_MASK           I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT              21
+#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_MASK               I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT              25
+#define I40E_PRT_MNG_MDEF_BROADCAST_OR_MASK               I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT             26
+#define I40E_PRT_MNG_MDEF_MULTICAST_AND_MASK              I40E_MASK(0x1, I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT            27
+#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_MASK             I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT           28
+#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_MASK            I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT 29
+#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT             30
+#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_MASK              I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT             31
+#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_MASK              I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT(_i)                             (0x00255F00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */
+#define I40E_PRT_MNG_MDEF_EXT_MAX_INDEX                       7
+#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT          0
+#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_MASK           I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT           4
+#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_MASK            I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT              8
+#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_MASK               I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT                  24
+#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_MASK                   I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT 25
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT 26
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT 27
+#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT                   28
+#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_MASK                    I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT                       29
+#define I40E_PRT_MNG_MDEF_EXT_MLD_MASK                        I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT  30
+#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_MASK   I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT)
+#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT     31
+#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_MASK      I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT)
+#define I40E_PRT_MNG_MDEFVSI(_i)                (0x00256580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_PRT_MNG_MDEFVSI_MAX_INDEX          3
+#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT   0
+#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_MASK    I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT)
+#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT 16
+#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_MASK  I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT)
+#define I40E_PRT_MNG_METF(_i)            (0x00256780 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_PRT_MNG_METF_MAX_INDEX      3
+#define I40E_PRT_MNG_METF_ETYPE_SHIFT    0
+#define I40E_PRT_MNG_METF_ETYPE_MASK     I40E_MASK(0xFFFF, I40E_PRT_MNG_METF_ETYPE_SHIFT)
+#define I40E_PRT_MNG_METF_POLARITY_SHIFT 30
+#define I40E_PRT_MNG_METF_POLARITY_MASK  I40E_MASK(0x1, I40E_PRT_MNG_METF_POLARITY_SHIFT)
+#define I40E_PRT_MNG_MFUTP(_i)                      (0x00254E00 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */
+#define I40E_PRT_MNG_MFUTP_MAX_INDEX                15
+#define I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT            0
+#define I40E_PRT_MNG_MFUTP_MFUTP_N_MASK             I40E_MASK(0xFFFF, I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT)
+#define I40E_PRT_MNG_MFUTP_UDP_SHIFT                16
+#define I40E_PRT_MNG_MFUTP_UDP_MASK                 I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_UDP_SHIFT)
+#define I40E_PRT_MNG_MFUTP_TCP_SHIFT                17
+#define I40E_PRT_MNG_MFUTP_TCP_MASK                 I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_TCP_SHIFT)
+#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT 18
+#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT)
+#define I40E_PRT_MNG_MIPAF4(_i)         (0x00256280 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_PRT_MNG_MIPAF4_MAX_INDEX   3
+#define I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT 0
+#define I40E_PRT_MNG_MIPAF4_MIPAF_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT)
+#define I40E_PRT_MNG_MIPAF6(_i)         (0x00254200 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */
+#define I40E_PRT_MNG_MIPAF6_MAX_INDEX   15
+#define I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT 0
+#define I40E_PRT_MNG_MIPAF6_MIPAF_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT)
+#define I40E_PRT_MNG_MMAH(_i)        (0x00256380 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_PRT_MNG_MMAH_MAX_INDEX  3
+#define I40E_PRT_MNG_MMAH_MMAH_SHIFT 0
+#define I40E_PRT_MNG_MMAH_MMAH_MASK  I40E_MASK(0xFFFF, I40E_PRT_MNG_MMAH_MMAH_SHIFT)
+#define I40E_PRT_MNG_MMAL(_i)        (0x00256480 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */
+#define I40E_PRT_MNG_MMAL_MAX_INDEX  3
+#define I40E_PRT_MNG_MMAL_MMAL_SHIFT 0
+#define I40E_PRT_MNG_MMAL_MMAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MMAL_MMAL_SHIFT)
+#define I40E_PRT_MNG_MNGONLY                                  0x00256A60 /* Reset: POR */
+#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT 0
+#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_MASK  I40E_MASK(0xFF, I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT)
+#define I40E_PRT_MNG_MSFM                    0x00256AA0 /* Reset: POR */
+#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT 0
+#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT)
+#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT 1
+#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT)
+#define I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT 2
+#define I40E_PRT_MNG_MSFM_PORT_298_UDP_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT)
+#define I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT 3
+#define I40E_PRT_MNG_MSFM_PORT_298_TCP_MASK  I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT)
+#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT  4
+#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_MASK   I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT)
+#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT  5
+#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_MASK   I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT)
+#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT  6
+#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_MASK   I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT)
+#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT  7
+#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_MASK   I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT)
+#define I40E_MSIX_PBA(_i)          (0x00001000 + ((_i) * 4)) /* _i=0...5 */ /* Reset: FLR */
+#define I40E_MSIX_PBA_MAX_INDEX    5
+#define I40E_MSIX_PBA_PENBIT_SHIFT 0
+#define I40E_MSIX_PBA_PENBIT_MASK  I40E_MASK(0xFFFFFFFF, I40E_MSIX_PBA_PENBIT_SHIFT)
+#define I40E_MSIX_TADD(_i)              (0x00000000 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */
+#define I40E_MSIX_TADD_MAX_INDEX        128
+#define I40E_MSIX_TADD_MSIXTADD10_SHIFT 0
+#define I40E_MSIX_TADD_MSIXTADD10_MASK  I40E_MASK(0x3, I40E_MSIX_TADD_MSIXTADD10_SHIFT)
+#define I40E_MSIX_TADD_MSIXTADD_SHIFT   2
+#define I40E_MSIX_TADD_MSIXTADD_MASK    I40E_MASK(0x3FFFFFFF, I40E_MSIX_TADD_MSIXTADD_SHIFT)
+#define I40E_MSIX_TMSG(_i)            (0x00000008 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */
+#define I40E_MSIX_TMSG_MAX_INDEX      128
+#define I40E_MSIX_TMSG_MSIXTMSG_SHIFT 0
+#define I40E_MSIX_TMSG_MSIXTMSG_MASK  I40E_MASK(0xFFFFFFFF, I40E_MSIX_TMSG_MSIXTMSG_SHIFT)
+#define I40E_MSIX_TUADD(_i)             (0x00000004 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */
+#define I40E_MSIX_TUADD_MAX_INDEX       128
+#define I40E_MSIX_TUADD_MSIXTUADD_SHIFT 0
+#define I40E_MSIX_TUADD_MSIXTUADD_MASK  I40E_MASK(0xFFFFFFFF, I40E_MSIX_TUADD_MSIXTUADD_SHIFT)
+#define I40E_MSIX_TVCTRL(_i)        (0x0000000C + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */
+#define I40E_MSIX_TVCTRL_MAX_INDEX  128
+#define I40E_MSIX_TVCTRL_MASK_SHIFT 0
+#define I40E_MSIX_TVCTRL_MASK_MASK  I40E_MASK(0x1, I40E_MSIX_TVCTRL_MASK_SHIFT)
+#define I40E_VFMSIX_PBA1(_i)          (0x00002000 + ((_i) * 4)) /* _i=0...19 */ /* Reset: VFLR */
+#define I40E_VFMSIX_PBA1_MAX_INDEX    19
+#define I40E_VFMSIX_PBA1_PENBIT_SHIFT 0
+#define I40E_VFMSIX_PBA1_PENBIT_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA1_PENBIT_SHIFT)
+#define I40E_VFMSIX_TADD1(_i)              (0x00002100 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TADD1_MAX_INDEX        639
+#define I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT 0
+#define I40E_VFMSIX_TADD1_MSIXTADD10_MASK  I40E_MASK(0x3, I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT)
+#define I40E_VFMSIX_TADD1_MSIXTADD_SHIFT   2
+#define I40E_VFMSIX_TADD1_MSIXTADD_MASK    I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD1_MSIXTADD_SHIFT)
+#define I40E_VFMSIX_TMSG1(_i)            (0x00002108 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TMSG1_MAX_INDEX      639
+#define I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT 0
+#define I40E_VFMSIX_TMSG1_MSIXTMSG_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT)
+#define I40E_VFMSIX_TUADD1(_i)             (0x00002104 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TUADD1_MAX_INDEX       639
+#define I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT 0
+#define I40E_VFMSIX_TUADD1_MSIXTUADD_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT)
+#define I40E_VFMSIX_TVCTRL1(_i)        (0x0000210C + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TVCTRL1_MAX_INDEX  639
+#define I40E_VFMSIX_TVCTRL1_MASK_SHIFT 0
+#define I40E_VFMSIX_TVCTRL1_MASK_MASK  I40E_MASK(0x1, I40E_VFMSIX_TVCTRL1_MASK_SHIFT)
+#define I40E_GLNVM_FLA                0x000B6108 /* Reset: POR */
+#define I40E_GLNVM_FLA_FL_SCK_SHIFT   0
+#define I40E_GLNVM_FLA_FL_SCK_MASK    I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SCK_SHIFT)
+#define I40E_GLNVM_FLA_FL_CE_SHIFT    1
+#define I40E_GLNVM_FLA_FL_CE_MASK     I40E_MASK(0x1, I40E_GLNVM_FLA_FL_CE_SHIFT)
+#define I40E_GLNVM_FLA_FL_SI_SHIFT    2
+#define I40E_GLNVM_FLA_FL_SI_MASK     I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SI_SHIFT)
+#define I40E_GLNVM_FLA_FL_SO_SHIFT    3
+#define I40E_GLNVM_FLA_FL_SO_MASK     I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SO_SHIFT)
+#define I40E_GLNVM_FLA_FL_REQ_SHIFT   4
+#define I40E_GLNVM_FLA_FL_REQ_MASK    I40E_MASK(0x1, I40E_GLNVM_FLA_FL_REQ_SHIFT)
+#define I40E_GLNVM_FLA_FL_GNT_SHIFT   5
+#define I40E_GLNVM_FLA_FL_GNT_MASK    I40E_MASK(0x1, I40E_GLNVM_FLA_FL_GNT_SHIFT)
+#define I40E_GLNVM_FLA_LOCKED_SHIFT   6
+#define I40E_GLNVM_FLA_LOCKED_MASK    I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT)
+#define I40E_GLNVM_FLA_FL_SADDR_SHIFT 18
+#define I40E_GLNVM_FLA_FL_SADDR_MASK  I40E_MASK(0x7FF, I40E_GLNVM_FLA_FL_SADDR_SHIFT)
+#define I40E_GLNVM_FLA_FL_BUSY_SHIFT  30
+#define I40E_GLNVM_FLA_FL_BUSY_MASK   I40E_MASK(0x1, I40E_GLNVM_FLA_FL_BUSY_SHIFT)
+#define I40E_GLNVM_FLA_FL_DER_SHIFT   31
+#define I40E_GLNVM_FLA_FL_DER_MASK    I40E_MASK(0x1, I40E_GLNVM_FLA_FL_DER_SHIFT)
+#define I40E_GLNVM_FLASHID                  0x000B6104 /* Reset: POR */
+#define I40E_GLNVM_FLASHID_FLASHID_SHIFT    0
+#define I40E_GLNVM_FLASHID_FLASHID_MASK     I40E_MASK(0xFFFFFF, I40E_GLNVM_FLASHID_FLASHID_SHIFT)
+#define I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT 31
+#define I40E_GLNVM_FLASHID_FLEEP_PERF_MASK  I40E_MASK(0x1, I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT)
+#define I40E_GLNVM_GENS                  0x000B6100 /* Reset: POR */
+#define I40E_GLNVM_GENS_NVM_PRES_SHIFT   0
+#define I40E_GLNVM_GENS_NVM_PRES_MASK    I40E_MASK(0x1, I40E_GLNVM_GENS_NVM_PRES_SHIFT)
+#define I40E_GLNVM_GENS_SR_SIZE_SHIFT    5
+#define I40E_GLNVM_GENS_SR_SIZE_MASK     I40E_MASK(0x7, I40E_GLNVM_GENS_SR_SIZE_SHIFT)
+#define I40E_GLNVM_GENS_BANK1VAL_SHIFT   8
+#define I40E_GLNVM_GENS_BANK1VAL_MASK    I40E_MASK(0x1, I40E_GLNVM_GENS_BANK1VAL_SHIFT)
+#define I40E_GLNVM_GENS_ALT_PRST_SHIFT   23
+#define I40E_GLNVM_GENS_ALT_PRST_MASK    I40E_MASK(0x1, I40E_GLNVM_GENS_ALT_PRST_SHIFT)
+#define I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT 25
+#define I40E_GLNVM_GENS_FL_AUTO_RD_MASK  I40E_MASK(0x1, I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT)
+#define I40E_GLNVM_PROTCSR(_i)              (0x000B6010 + ((_i) * 4)) /* _i=0...59 */ /* Reset: POR */
+#define I40E_GLNVM_PROTCSR_MAX_INDEX        59
+#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT 0
+#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_MASK  I40E_MASK(0xFFFFFF, I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT)
+#define I40E_GLNVM_SRCTL              0x000B6110 /* Reset: POR */
+#define I40E_GLNVM_SRCTL_SRBUSY_SHIFT 0
+#define I40E_GLNVM_SRCTL_SRBUSY_MASK  I40E_MASK(0x1, I40E_GLNVM_SRCTL_SRBUSY_SHIFT)
+#define I40E_GLNVM_SRCTL_ADDR_SHIFT   14
+#define I40E_GLNVM_SRCTL_ADDR_MASK    I40E_MASK(0x7FFF, I40E_GLNVM_SRCTL_ADDR_SHIFT)
+#define I40E_GLNVM_SRCTL_WRITE_SHIFT  29
+#define I40E_GLNVM_SRCTL_WRITE_MASK   I40E_MASK(0x1, I40E_GLNVM_SRCTL_WRITE_SHIFT)
+#define I40E_GLNVM_SRCTL_START_SHIFT  30
+#define I40E_GLNVM_SRCTL_START_MASK   I40E_MASK(0x1, I40E_GLNVM_SRCTL_START_SHIFT)
+#define I40E_GLNVM_SRCTL_DONE_SHIFT   31
+#define I40E_GLNVM_SRCTL_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_SRCTL_DONE_SHIFT)
+#define I40E_GLNVM_SRDATA              0x000B6114 /* Reset: POR */
+#define I40E_GLNVM_SRDATA_WRDATA_SHIFT 0
+#define I40E_GLNVM_SRDATA_WRDATA_MASK  I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_WRDATA_SHIFT)
+#define I40E_GLNVM_SRDATA_RDDATA_SHIFT 16
+#define I40E_GLNVM_SRDATA_RDDATA_MASK  I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_RDDATA_SHIFT)
+#define I40E_GLNVM_ULD                          0x000B6008 /* Reset: POR */
+#define I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT     0
+#define I40E_GLNVM_ULD_CONF_PCIR_DONE_MASK      I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT   1
+#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT      2
+#define I40E_GLNVM_ULD_CONF_LCB_DONE_MASK       I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT     3
+#define I40E_GLNVM_ULD_CONF_CORE_DONE_MASK      I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT   4
+#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT      5
+#define I40E_GLNVM_ULD_CONF_POR_DONE_MASK       I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT 6
+#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_MASK  I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT  7
+#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_MASK   I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT      8
+#define I40E_GLNVM_ULD_CONF_EMP_DONE_MASK       I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT)
+#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT   9
+#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT)
+#define I40E_GLPCI_BYTCTH                        0x0009C484 /* Reset: PCIR */
+#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT 0
+#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT)
+#define I40E_GLPCI_BYTCTL                        0x0009C488 /* Reset: PCIR */
+#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT 0
+#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT)
+#define I40E_GLPCI_CAPCTRL              0x000BE4A4 /* Reset: PCIR */
+#define I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT 0
+#define I40E_GLPCI_CAPCTRL_VPD_EN_MASK  I40E_MASK(0x1, I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP                      0x000BE4A8 /* Reset: PCIR */
+#define I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT       0
+#define I40E_GLPCI_CAPSUP_PCIE_VER_MASK        I40E_MASK(0x1, I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT)
+#define I40E_GLPCI_CAPSUP_LTR_EN_SHIFT         2
+#define I40E_GLPCI_CAPSUP_LTR_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LTR_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_TPH_EN_SHIFT         3
+#define I40E_GLPCI_CAPSUP_TPH_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_TPH_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_ARI_EN_SHIFT         4
+#define I40E_GLPCI_CAPSUP_ARI_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ARI_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_IOV_EN_SHIFT         5
+#define I40E_GLPCI_CAPSUP_IOV_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IOV_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_ACS_EN_SHIFT         6
+#define I40E_GLPCI_CAPSUP_ACS_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ACS_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_SEC_EN_SHIFT         7
+#define I40E_GLPCI_CAPSUP_SEC_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_SEC_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT    16
+#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_MASK     I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT    17
+#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_MASK     I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_IDO_EN_SHIFT         18
+#define I40E_GLPCI_CAPSUP_IDO_EN_MASK          I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IDO_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT       19
+#define I40E_GLPCI_CAPSUP_MSI_MASK_MASK        I40E_MASK(0x1, I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT)
+#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT    20
+#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_MASK     I40E_MASK(0x1, I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT)
+#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT 30
+#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_MASK  I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT)
+#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT    31
+#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_MASK     I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT)
+#define I40E_GLPCI_CNF                   0x000BE4C0 /* Reset: POR */
+#define I40E_GLPCI_CNF_FLEX10_SHIFT      1
+#define I40E_GLPCI_CNF_FLEX10_MASK       I40E_MASK(0x1, I40E_GLPCI_CNF_FLEX10_SHIFT)
+#define I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT 2
+#define I40E_GLPCI_CNF_WAKE_PIN_EN_MASK  I40E_MASK(0x1, I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT)
+#define I40E_GLPCI_CNF2                      0x000BE494 /* Reset: PCIR */
+#define I40E_GLPCI_CNF2_RO_DIS_SHIFT         0
+#define I40E_GLPCI_CNF2_RO_DIS_MASK          I40E_MASK(0x1, I40E_GLPCI_CNF2_RO_DIS_SHIFT)
+#define I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT 1
+#define I40E_GLPCI_CNF2_CACHELINE_SIZE_MASK  I40E_MASK(0x1, I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT)
+#define I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT     2
+#define I40E_GLPCI_CNF2_MSI_X_PF_N_MASK      I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT)
+#define I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT     13
+#define I40E_GLPCI_CNF2_MSI_X_VF_N_MASK      I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT)
+#define I40E_GLPCI_DREVID                     0x0009C480 /* Reset: PCIR */
+#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0
+#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK  I40E_MASK(0xFF, I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT)
+#define I40E_GLPCI_GSCL_1                        0x0009C48C /* Reset: PCIR */
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT   0
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_MASK    I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT   1
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_MASK    I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT   2
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_MASK    I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT   3
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_MASK    I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT)
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT     4
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_MASK      I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT)
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT     5
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_MASK      I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT)
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT     6
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_MASK      I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT)
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT     7
+#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_MASK      I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT)
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT 8
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_MASK  I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT)
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT 9
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_MASK  I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT)
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT  14
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_MASK   I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT)
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT  15
+#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_MASK   I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT    28
+#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_MASK     I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT  29
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_MASK   I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT   30
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_MASK    I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT)
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT  31
+#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_MASK   I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT)
+#define I40E_GLPCI_GSCL_2                       0x0009C490 /* Reset: PCIR */
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT 0
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_MASK  I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT)
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT 8
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_MASK  I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT)
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT 16
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_MASK  I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT)
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT 24
+#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_MASK  I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT)
+#define I40E_GLPCI_GSCL_5_8(_i)                   (0x0009C494 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */
+#define I40E_GLPCI_GSCL_5_8_MAX_INDEX             3
+#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT 0
+#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT)
+#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT     16
+#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_MASK      I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT)
+#define I40E_GLPCI_GSCN_0_3(_i)                 (0x0009C4A4 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */
+#define I40E_GLPCI_GSCN_0_3_MAX_INDEX           3
+#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT 0
+#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT)
+#define I40E_GLPCI_LBARCTRL                    0x000BE484 /* Reset: POR */
+#define I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT      0
+#define I40E_GLPCI_LBARCTRL_PREFBAR_MASK       I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT)
+#define I40E_GLPCI_LBARCTRL_BAR32_SHIFT        1
+#define I40E_GLPCI_LBARCTRL_BAR32_MASK         I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_BAR32_SHIFT)
+#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT 3
+#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_MASK  I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT)
+#define I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT       4
+#define I40E_GLPCI_LBARCTRL_RSVD_4_MASK        I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT)
+#define I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT      6
+#define I40E_GLPCI_LBARCTRL_FL_SIZE_MASK       I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT)
+#define I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT      10
+#define I40E_GLPCI_LBARCTRL_RSVD_10_MASK       I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT)
+#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT   11
+#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_MASK    I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT)
+#define I40E_GLPCI_LINKCAP                          0x000BE4AC /* Reset: PCIR */
+#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT 0
+#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_MASK  I40E_MASK(0x3F, I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT)
+#define I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT        6
+#define I40E_GLPCI_LINKCAP_MAX_PAYLOAD_MASK         I40E_MASK(0x7, I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT)
+#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT     9
+#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_MASK      I40E_MASK(0xF, I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT)
+#define I40E_GLPCI_PCIERR                    0x000BE4FC /* Reset: PCIR */
+#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT 0
+#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT)
+#define I40E_GLPCI_PKTCT                        0x0009C4BC /* Reset: PCIR */
+#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT 0
+#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT)
+#define I40E_GLPCI_PM_MUX_NPQ                        0x0009C4F4 /* Reset: PCIR */
+#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT 0
+#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_MASK  I40E_MASK(0x7, I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT)
+#define I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT    16
+#define I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_MASK     I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT)
+#define I40E_GLPCI_PM_MUX_PFB                      0x0009C4F0 /* Reset: PCIR */
+#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT   0
+#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_MASK    I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT)
+#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT 16
+#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_MASK  I40E_MASK(0x7, I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT)
+#define I40E_GLPCI_PMSUP                    0x000BE4B0 /* Reset: PCIR */
+#define I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT     0
+#define I40E_GLPCI_PMSUP_ASPM_SUP_MASK      I40E_MASK(0x3, I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT)
+#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT 2
+#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_MASK  I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT)
+#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT  5
+#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_MASK   I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT)
+#define I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT  8
+#define I40E_GLPCI_PMSUP_L0S_ACC_LAT_MASK   I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT)
+#define I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT   11
+#define I40E_GLPCI_PMSUP_L1_ACC_LAT_MASK    I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT)
+#define I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT     14
+#define I40E_GLPCI_PMSUP_SLOT_CLK_MASK      I40E_MASK(0x1, I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT)
+#define I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT     15
+#define I40E_GLPCI_PMSUP_OBFF_SUP_MASK      I40E_MASK(0x3, I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT)
+#define I40E_GLPCI_PQ_MAX_USED_SPC                                0x0009C4EC /* Reset: PCIR */
+#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT 0
+#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_MASK  I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT)
+#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT 8
+#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_MASK  I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT)
+#define I40E_GLPCI_PWRDATA                  0x000BE490 /* Reset: PCIR */
+#define I40E_GLPCI_PWRDATA_D0_POWER_SHIFT   0
+#define I40E_GLPCI_PWRDATA_D0_POWER_MASK    I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D0_POWER_SHIFT)
+#define I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT 8
+#define I40E_GLPCI_PWRDATA_COMM_POWER_MASK  I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT)
+#define I40E_GLPCI_PWRDATA_D3_POWER_SHIFT   16
+#define I40E_GLPCI_PWRDATA_D3_POWER_MASK    I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D3_POWER_SHIFT)
+#define I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT 24
+#define I40E_GLPCI_PWRDATA_DATA_SCALE_MASK  I40E_MASK(0x3, I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT)
+#define I40E_GLPCI_REVID                 0x000BE4B4 /* Reset: PCIR */
+#define I40E_GLPCI_REVID_NVM_REVID_SHIFT 0
+#define I40E_GLPCI_REVID_NVM_REVID_MASK  I40E_MASK(0xFF, I40E_GLPCI_REVID_NVM_REVID_SHIFT)
+#define I40E_GLPCI_SERH                 0x000BE49C /* Reset: PCIR */
+#define I40E_GLPCI_SERH_SER_NUM_H_SHIFT 0
+#define I40E_GLPCI_SERH_SER_NUM_H_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_SERH_SER_NUM_H_SHIFT)
+#define I40E_GLPCI_SERL                 0x000BE498 /* Reset: PCIR */
+#define I40E_GLPCI_SERL_SER_NUM_L_SHIFT 0
+#define I40E_GLPCI_SERL_SER_NUM_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SERL_SER_NUM_L_SHIFT)
+#define I40E_GLPCI_SPARE_BITS_0                  0x0009C4F8 /* Reset: PCIR */
+#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT 0
+#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT)
+#define I40E_GLPCI_SPARE_BITS_1                  0x0009C4FC /* Reset: PCIR */
+#define I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT 0
+#define I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT)
+#define I40E_GLPCI_SUBVENID                  0x000BE48C /* Reset: PCIR */
+#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT 0
+#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT)
+#define I40E_GLPCI_UPADD               0x000BE4F8 /* Reset: PCIR */
+#define I40E_GLPCI_UPADD_ADDRESS_SHIFT 1
+#define I40E_GLPCI_UPADD_ADDRESS_MASK  I40E_MASK(0x7FFFFFFF, I40E_GLPCI_UPADD_ADDRESS_SHIFT) /* NOTE(review): 0x7FFFFFFF shifted left by 1 does not fit a 32-bit signed int; confirm I40E_MASK (defined elsewhere) converts to an unsigned type before shifting */
+#define I40E_GLPCI_VENDORID                0x000BE518 /* Reset: PCIR */
+#define I40E_GLPCI_VENDORID_VENDORID_SHIFT 0
+#define I40E_GLPCI_VENDORID_VENDORID_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_VENDORID_VENDORID_SHIFT)
+#define I40E_GLPCI_VFSUP                   0x000BE4B8 /* Reset: PCIR */
+#define I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT 0
+#define I40E_GLPCI_VFSUP_VF_PREFETCH_MASK  I40E_MASK(0x1, I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT)
+#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT 1
+#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_MASK  I40E_MASK(0x1, I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT)
+#define I40E_GLTPH_CTRL                         0x000BE480 /* Reset: PCIR */
+#define I40E_GLTPH_CTRL_DESC_PH_SHIFT           9
+#define I40E_GLTPH_CTRL_DESC_PH_MASK            I40E_MASK(0x3, I40E_GLTPH_CTRL_DESC_PH_SHIFT)
+#define I40E_GLTPH_CTRL_DATA_PH_SHIFT           11
+#define I40E_GLTPH_CTRL_DATA_PH_MASK            I40E_MASK(0x3, I40E_GLTPH_CTRL_DATA_PH_SHIFT)
+#define I40E_PF_FUNC_RID                       0x0009C000 /* Reset: PCIR */
+#define I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT 0
+#define I40E_PF_FUNC_RID_FUNCTION_NUMBER_MASK  I40E_MASK(0x7, I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT)
+#define I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT   3
+#define I40E_PF_FUNC_RID_DEVICE_NUMBER_MASK    I40E_MASK(0x1F, I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT)
+#define I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT      8
+#define I40E_PF_FUNC_RID_BUS_NUMBER_MASK       I40E_MASK(0xFF, I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT)
+#define I40E_PF_PCI_CIAA               0x0009C080 /* Reset: FLR */
+#define I40E_PF_PCI_CIAA_ADDRESS_SHIFT 0
+#define I40E_PF_PCI_CIAA_ADDRESS_MASK  I40E_MASK(0xFFF, I40E_PF_PCI_CIAA_ADDRESS_SHIFT)
+#define I40E_PF_PCI_CIAA_VF_NUM_SHIFT  12
+#define I40E_PF_PCI_CIAA_VF_NUM_MASK   I40E_MASK(0x7F, I40E_PF_PCI_CIAA_VF_NUM_SHIFT)
+#define I40E_PF_PCI_CIAD            0x0009C100 /* Reset: FLR */
+#define I40E_PF_PCI_CIAD_DATA_SHIFT 0
+#define I40E_PF_PCI_CIAD_DATA_MASK  I40E_MASK(0xFFFFFFFF, I40E_PF_PCI_CIAD_DATA_SHIFT)
+#define I40E_PFPCI_CLASS                     0x000BE400 /* Reset: PCIR */
+#define I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT 0
+#define I40E_PFPCI_CLASS_STORAGE_CLASS_MASK  I40E_MASK(0x1, I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT)
+#define I40E_PFPCI_CLASS_RESERVED_1_SHIFT    1
+#define I40E_PFPCI_CLASS_RESERVED_1_MASK     I40E_MASK(0x1, I40E_PFPCI_CLASS_RESERVED_1_SHIFT)
+#define I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT     2
+#define I40E_PFPCI_CLASS_PF_IS_LAN_MASK      I40E_MASK(0x1, I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT)
+#define I40E_PFPCI_CNF                 0x000BE000 /* Reset: PCIR */
+#define I40E_PFPCI_CNF_MSI_EN_SHIFT    2
+#define I40E_PFPCI_CNF_MSI_EN_MASK     I40E_MASK(0x1, I40E_PFPCI_CNF_MSI_EN_SHIFT)
+#define I40E_PFPCI_CNF_EXROM_DIS_SHIFT 3
+#define I40E_PFPCI_CNF_EXROM_DIS_MASK  I40E_MASK(0x1, I40E_PFPCI_CNF_EXROM_DIS_SHIFT)
+#define I40E_PFPCI_CNF_IO_BAR_SHIFT    4
+#define I40E_PFPCI_CNF_IO_BAR_MASK     I40E_MASK(0x1, I40E_PFPCI_CNF_IO_BAR_SHIFT)
+#define I40E_PFPCI_CNF_INT_PIN_SHIFT   5
+#define I40E_PFPCI_CNF_INT_PIN_MASK    I40E_MASK(0x3, I40E_PFPCI_CNF_INT_PIN_SHIFT)
+#define I40E_PFPCI_DEVID                 0x000BE080 /* Reset: PCIR */
+#define I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT 0
+#define I40E_PFPCI_DEVID_PF_DEV_ID_MASK  I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT)
+#define I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT 16
+#define I40E_PFPCI_DEVID_VF_DEV_ID_MASK  I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT)
+#define I40E_PFPCI_FACTPS                        0x0009C180 /* Reset: FLR */
+#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT 0
+#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_MASK  I40E_MASK(0x3, I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT)
+#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT      3
+#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_MASK       I40E_MASK(0x1, I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT)
+#define I40E_PFPCI_FUNC                            0x000BE200 /* Reset: POR */
+#define I40E_PFPCI_FUNC_FUNC_DIS_SHIFT             0
+#define I40E_PFPCI_FUNC_FUNC_DIS_MASK              I40E_MASK(0x1, I40E_PFPCI_FUNC_FUNC_DIS_SHIFT)
+#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT       1
+#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_MASK        I40E_MASK(0x1, I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT)
+#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT 2
+#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_MASK  I40E_MASK(0x1, I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT)
+#define I40E_PFPCI_FUNC2                    0x000BE180 /* Reset: PCIR */
+#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT 0
+#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_MASK  I40E_MASK(0x1, I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT)
+#define I40E_PFPCI_ICAUSE                      0x0009C200 /* Reset: PFR */
+#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT 0
+#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT)
+#define I40E_PFPCI_IENA                   0x0009C280 /* Reset: PFR */
+#define I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT 0
+#define I40E_PFPCI_IENA_PCIE_ERR_EN_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT)
+#define I40E_PFPCI_PF_FLUSH_DONE                  0x0009C800 /* Reset: PCIR */
+#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT 0
+#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_MASK  I40E_MASK(0x1, I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT)
+#define I40E_PFPCI_PM              0x000BE300 /* Reset: POR */
+#define I40E_PFPCI_PM_PME_EN_SHIFT 0
+#define I40E_PFPCI_PM_PME_EN_MASK  I40E_MASK(0x1, I40E_PFPCI_PM_PME_EN_SHIFT)
+#define I40E_PFPCI_STATUS1                  0x000BE280 /* Reset: POR */
+#define I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT 0
+#define I40E_PFPCI_STATUS1_FUNC_VALID_MASK  I40E_MASK(0x1, I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT)
+#define I40E_PFPCI_SUBSYSID                    0x000BE100 /* Reset: PCIR */
+#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT 0
+#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_MASK  I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT)
+#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT 16
+#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_MASK  I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT)
+#define I40E_PFPCI_VF_FLUSH_DONE                  0x0000E400 /* Reset: PCIR */
+#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT 0
+#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_MASK  I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT)
+#define I40E_PFPCI_VF_FLUSH_DONE1(_VF)             (0x0009C600 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: PCIR */
+#define I40E_PFPCI_VF_FLUSH_DONE1_MAX_INDEX        127
+#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT 0
+#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_MASK  I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT)
+#define I40E_PFPCI_VM_FLUSH_DONE                  0x0009C880 /* Reset: PCIR */
+#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT 0
+#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_MASK  I40E_MASK(0x1, I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT)
+#define I40E_PFPCI_VMINDEX               0x0009C300 /* Reset: PCIR */
+#define I40E_PFPCI_VMINDEX_VMINDEX_SHIFT 0
+#define I40E_PFPCI_VMINDEX_VMINDEX_MASK  I40E_MASK(0x1FF, I40E_PFPCI_VMINDEX_VMINDEX_SHIFT)
+#define I40E_PFPCI_VMPEND               0x0009C380 /* Reset: PCIR */
+#define I40E_PFPCI_VMPEND_PENDING_SHIFT 0
+#define I40E_PFPCI_VMPEND_PENDING_MASK  I40E_MASK(0x1, I40E_PFPCI_VMPEND_PENDING_SHIFT)
+#define I40E_PRTPM_EEE_STAT                     0x001E4320 /* Reset: GLOBR */
+#define I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT       29
+#define I40E_PRTPM_EEE_STAT_EEE_NEG_MASK        I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT)
+#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT 30
+#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_MASK  I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT)
+#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT 31
+#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_MASK  I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT)
+#define I40E_PRTPM_EEEC                     0x001E4380 /* Reset: GLOBR */
+#define I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT   16
+#define I40E_PRTPM_EEEC_TW_WAKE_MIN_MASK    I40E_MASK(0x3F, I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT)
+#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT 24
+#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_MASK  I40E_MASK(0x3, I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT)
+#define I40E_PRTPM_EEEC_TEEE_DLY_SHIFT      26
+#define I40E_PRTPM_EEEC_TEEE_DLY_MASK       I40E_MASK(0x3F, I40E_PRTPM_EEEC_TEEE_DLY_SHIFT)
+#define I40E_PRTPM_EEEFWD                          0x001E4400 /* Reset: GLOBR */
+#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT 31
+#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_MASK  I40E_MASK(0x1, I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT)
+#define I40E_PRTPM_EEER                 0x001E4360 /* Reset: GLOBR */
+#define I40E_PRTPM_EEER_TW_SYSTEM_SHIFT 0
+#define I40E_PRTPM_EEER_TW_SYSTEM_MASK  I40E_MASK(0xFFFF, I40E_PRTPM_EEER_TW_SYSTEM_SHIFT)
+#define I40E_PRTPM_EEER_TX_LPI_EN_SHIFT 16
+#define I40E_PRTPM_EEER_TX_LPI_EN_MASK  I40E_MASK(0x1, I40E_PRTPM_EEER_TX_LPI_EN_SHIFT)
+#define I40E_PRTPM_EEETXC              0x001E43E0 /* Reset: GLOBR */
+#define I40E_PRTPM_EEETXC_TW_PHY_SHIFT 0
+#define I40E_PRTPM_EEETXC_TW_PHY_MASK  I40E_MASK(0xFFFF, I40E_PRTPM_EEETXC_TW_PHY_SHIFT)
+#define I40E_PRTPM_GC                     0x000B8140 /* Reset: POR */
+#define I40E_PRTPM_GC_EMP_LINK_ON_SHIFT   0
+#define I40E_PRTPM_GC_EMP_LINK_ON_MASK    I40E_MASK(0x1, I40E_PRTPM_GC_EMP_LINK_ON_SHIFT)
+#define I40E_PRTPM_GC_MNG_VETO_SHIFT      1
+#define I40E_PRTPM_GC_MNG_VETO_MASK       I40E_MASK(0x1, I40E_PRTPM_GC_MNG_VETO_SHIFT)
+#define I40E_PRTPM_GC_RATD_SHIFT          2
+#define I40E_PRTPM_GC_RATD_MASK           I40E_MASK(0x1, I40E_PRTPM_GC_RATD_SHIFT)
+#define I40E_PRTPM_GC_LCDMP_SHIFT         3
+#define I40E_PRTPM_GC_LCDMP_MASK          I40E_MASK(0x1, I40E_PRTPM_GC_LCDMP_SHIFT)
+#define I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT 31
+#define I40E_PRTPM_GC_LPLU_ASSERTED_MASK  I40E_MASK(0x1, I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT)
+#define I40E_PRTPM_RLPIC              0x001E43A0 /* Reset: GLOBR */
+#define I40E_PRTPM_RLPIC_ERLPIC_SHIFT 0
+#define I40E_PRTPM_RLPIC_ERLPIC_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTPM_RLPIC_ERLPIC_SHIFT)
+#define I40E_PRTPM_TLPIC              0x001E43C0 /* Reset: GLOBR */
+#define I40E_PRTPM_TLPIC_ETLPIC_SHIFT 0
+#define I40E_PRTPM_TLPIC_ETLPIC_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTPM_TLPIC_ETLPIC_SHIFT)
+#define I40E_GLRPB_DPSS               0x000AC828 /* Reset: CORER */
+#define I40E_GLRPB_DPSS_DPS_TCN_SHIFT 0
+#define I40E_GLRPB_DPSS_DPS_TCN_MASK  I40E_MASK(0xFFFFF, I40E_GLRPB_DPSS_DPS_TCN_SHIFT)
+#define I40E_GLRPB_GHW           0x000AC830 /* Reset: CORER */
+#define I40E_GLRPB_GHW_GHW_SHIFT 0
+#define I40E_GLRPB_GHW_GHW_MASK  I40E_MASK(0xFFFFF, I40E_GLRPB_GHW_GHW_SHIFT)
+#define I40E_GLRPB_GLW           0x000AC834 /* Reset: CORER */
+#define I40E_GLRPB_GLW_GLW_SHIFT 0
+#define I40E_GLRPB_GLW_GLW_MASK  I40E_MASK(0xFFFFF, I40E_GLRPB_GLW_GLW_SHIFT)
+#define I40E_GLRPB_PHW           0x000AC844 /* Reset: CORER */
+#define I40E_GLRPB_PHW_PHW_SHIFT 0
+#define I40E_GLRPB_PHW_PHW_MASK  I40E_MASK(0xFFFFF, I40E_GLRPB_PHW_PHW_SHIFT)
+#define I40E_GLRPB_PLW           0x000AC848 /* Reset: CORER */
+#define I40E_GLRPB_PLW_PLW_SHIFT 0
+#define I40E_GLRPB_PLW_PLW_MASK  I40E_MASK(0xFFFFF, I40E_GLRPB_PLW_PLW_SHIFT)
+#define I40E_PRTRPB_DHW(_i)           (0x000AC100 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTRPB_DHW_MAX_INDEX     7
+#define I40E_PRTRPB_DHW_DHW_TCN_SHIFT 0
+#define I40E_PRTRPB_DHW_DHW_TCN_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_DHW_DHW_TCN_SHIFT)
+#define I40E_PRTRPB_DLW(_i)           (0x000AC220 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTRPB_DLW_MAX_INDEX     7
+#define I40E_PRTRPB_DLW_DLW_TCN_SHIFT 0
+#define I40E_PRTRPB_DLW_DLW_TCN_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_DLW_DLW_TCN_SHIFT)
+#define I40E_PRTRPB_DPS(_i)           (0x000AC320 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTRPB_DPS_MAX_INDEX     7
+#define I40E_PRTRPB_DPS_DPS_TCN_SHIFT 0
+#define I40E_PRTRPB_DPS_DPS_TCN_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_DPS_DPS_TCN_SHIFT)
+#define I40E_PRTRPB_SHT(_i)           (0x000AC480 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTRPB_SHT_MAX_INDEX     7
+#define I40E_PRTRPB_SHT_SHT_TCN_SHIFT 0
+#define I40E_PRTRPB_SHT_SHT_TCN_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_SHT_SHT_TCN_SHIFT)
+#define I40E_PRTRPB_SHW           0x000AC580 /* Reset: CORER */
+#define I40E_PRTRPB_SHW_SHW_SHIFT 0
+#define I40E_PRTRPB_SHW_SHW_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_SHW_SHW_SHIFT)
+#define I40E_PRTRPB_SLT(_i)           (0x000AC5A0 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTRPB_SLT_MAX_INDEX     7
+#define I40E_PRTRPB_SLT_SLT_TCN_SHIFT 0
+#define I40E_PRTRPB_SLT_SLT_TCN_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_SLT_SLT_TCN_SHIFT)
+#define I40E_PRTRPB_SLW           0x000AC6A0 /* Reset: CORER */
+#define I40E_PRTRPB_SLW_SLW_SHIFT 0
+#define I40E_PRTRPB_SLW_SLW_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_SLW_SLW_SHIFT)
+#define I40E_PRTRPB_SPS           0x000AC7C0 /* Reset: CORER */
+#define I40E_PRTRPB_SPS_SPS_SHIFT 0
+#define I40E_PRTRPB_SPS_SPS_MASK  I40E_MASK(0xFFFFF, I40E_PRTRPB_SPS_SPS_SHIFT)
+#define I40E_GLQF_CTL                      0x00269BA4 /* Reset: CORER */
+#define I40E_GLQF_CTL_HTOEP_SHIFT          1
+#define I40E_GLQF_CTL_HTOEP_MASK           I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_SHIFT)
+#define I40E_GLQF_CTL_HTOEP_FCOE_SHIFT     2
+#define I40E_GLQF_CTL_HTOEP_FCOE_MASK      I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_FCOE_SHIFT)
+#define I40E_GLQF_CTL_PCNT_ALLOC_SHIFT     3
+#define I40E_GLQF_CTL_PCNT_ALLOC_MASK      I40E_MASK(0x7, I40E_GLQF_CTL_PCNT_ALLOC_SHIFT)
+#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT 6
+#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_MASK  I40E_MASK(0x1, I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT)
+#define I40E_GLQF_CTL_RSVD_SHIFT           7
+#define I40E_GLQF_CTL_RSVD_MASK            I40E_MASK(0x1, I40E_GLQF_CTL_RSVD_SHIFT)
+#define I40E_GLQF_CTL_MAXPEBLEN_SHIFT      8
+#define I40E_GLQF_CTL_MAXPEBLEN_MASK       I40E_MASK(0x7, I40E_GLQF_CTL_MAXPEBLEN_SHIFT)
+#define I40E_GLQF_CTL_MAXFCBLEN_SHIFT      11
+#define I40E_GLQF_CTL_MAXFCBLEN_MASK       I40E_MASK(0x7, I40E_GLQF_CTL_MAXFCBLEN_SHIFT)
+#define I40E_GLQF_CTL_MAXFDBLEN_SHIFT      14
+#define I40E_GLQF_CTL_MAXFDBLEN_MASK       I40E_MASK(0x7, I40E_GLQF_CTL_MAXFDBLEN_SHIFT)
+#define I40E_GLQF_CTL_FDBEST_SHIFT         17
+#define I40E_GLQF_CTL_FDBEST_MASK          I40E_MASK(0xFF, I40E_GLQF_CTL_FDBEST_SHIFT)
+#define I40E_GLQF_CTL_PROGPRIO_SHIFT       25
+#define I40E_GLQF_CTL_PROGPRIO_MASK        I40E_MASK(0x1, I40E_GLQF_CTL_PROGPRIO_SHIFT)
+#define I40E_GLQF_CTL_INVALPRIO_SHIFT      26
+#define I40E_GLQF_CTL_INVALPRIO_MASK       I40E_MASK(0x1, I40E_GLQF_CTL_INVALPRIO_SHIFT)
+#define I40E_GLQF_CTL_IGNORE_IP_SHIFT      27
+#define I40E_GLQF_CTL_IGNORE_IP_MASK       I40E_MASK(0x1, I40E_GLQF_CTL_IGNORE_IP_SHIFT)
+#define I40E_GLQF_FDCNT_0                   0x00269BAC /* Reset: CORER */
+#define I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT 0
+#define I40E_GLQF_FDCNT_0_GUARANT_CNT_MASK  I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT)
+#define I40E_GLQF_FDCNT_0_BESTCNT_SHIFT     13
+#define I40E_GLQF_FDCNT_0_BESTCNT_MASK      I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_BESTCNT_SHIFT)
+#define I40E_GLQF_HKEY(_i)         (0x00270140 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */
+#define I40E_GLQF_HKEY_MAX_INDEX   12 /* highest valid _i; registers are 4 bytes apart */
+#define I40E_GLQF_HKEY_KEY_0_SHIFT 0 /* KEY_0..KEY_3: four 8-bit fields at bit offsets 0, 8, 16 and 24 */
+#define I40E_GLQF_HKEY_KEY_0_MASK  I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_0_SHIFT)
+#define I40E_GLQF_HKEY_KEY_1_SHIFT 8
+#define I40E_GLQF_HKEY_KEY_1_MASK  I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_1_SHIFT)
+#define I40E_GLQF_HKEY_KEY_2_SHIFT 16
+#define I40E_GLQF_HKEY_KEY_2_MASK  I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_2_SHIFT)
+#define I40E_GLQF_HKEY_KEY_3_SHIFT 24
+#define I40E_GLQF_HKEY_KEY_3_MASK  I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_3_SHIFT) /* NOTE(review): 0xFF at shift 24 reaches bit 31; confirm I40E_MASK yields an unsigned 32-bit value */
+#define I40E_GLQF_HSYM(_i)            (0x00269D00 + ((_i) * 4)) /* _i=0...63 */ /* Reset: CORER */
+#define I40E_GLQF_HSYM_MAX_INDEX      63
+#define I40E_GLQF_HSYM_SYMH_ENA_SHIFT 0
+#define I40E_GLQF_HSYM_SYMH_ENA_MASK  I40E_MASK(0x1, I40E_GLQF_HSYM_SYMH_ENA_SHIFT)
+#define I40E_GLQF_PCNT(_i)        (0x00266800 + ((_i) * 4)) /* _i=0...511 */ /* Reset: CORER */
+#define I40E_GLQF_PCNT_MAX_INDEX  511
+#define I40E_GLQF_PCNT_PCNT_SHIFT 0
+#define I40E_GLQF_PCNT_PCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLQF_PCNT_PCNT_SHIFT)
+#define I40E_GLQF_SWAP(_i, _j)          (0x00267E00 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */
+#define I40E_GLQF_SWAP_MAX_INDEX       1 /* bounds _i only; _j has its own range (0...63) with no macro for it here */
+#define I40E_GLQF_SWAP_OFF0_SRC0_SHIFT 0 /* fields below: two groups of (6-bit SRC0, 6-bit SRC1, 4-bit FLEN), one per 16-bit half */
+#define I40E_GLQF_SWAP_OFF0_SRC0_MASK  I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC0_SHIFT)
+#define I40E_GLQF_SWAP_OFF0_SRC1_SHIFT 6
+#define I40E_GLQF_SWAP_OFF0_SRC1_MASK  I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC1_SHIFT)
+#define I40E_GLQF_SWAP_FLEN0_SHIFT     12
+#define I40E_GLQF_SWAP_FLEN0_MASK      I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN0_SHIFT)
+#define I40E_GLQF_SWAP_OFF1_SRC0_SHIFT 16
+#define I40E_GLQF_SWAP_OFF1_SRC0_MASK  I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC0_SHIFT)
+#define I40E_GLQF_SWAP_OFF1_SRC1_SHIFT 22
+#define I40E_GLQF_SWAP_OFF1_SRC1_MASK  I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC1_SHIFT)
+#define I40E_GLQF_SWAP_FLEN1_SHIFT     28
+#define I40E_GLQF_SWAP_FLEN1_MASK      I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN1_SHIFT) /* NOTE(review): 0xF at shift 28 reaches bit 31; confirm I40E_MASK yields an unsigned 32-bit value */
+#define I40E_PFQF_CTL_0                   0x001C0AC0 /* Reset: CORER */
+#define I40E_PFQF_CTL_0_PEHSIZE_SHIFT     0
+#define I40E_PFQF_CTL_0_PEHSIZE_MASK      I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEHSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_PEDSIZE_SHIFT     5
+#define I40E_PFQF_CTL_0_PEDSIZE_MASK      I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEDSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT   10
+#define I40E_PFQF_CTL_0_PFFCHSIZE_MASK    I40E_MASK(0xF, I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT   14
+#define I40E_PFQF_CTL_0_PFFCDSIZE_MASK    I40E_MASK(0x3, I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT 16
+#define I40E_PFQF_CTL_0_HASHLUTSIZE_MASK  I40E_MASK(0x1, I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_FD_ENA_SHIFT      17
+#define I40E_PFQF_CTL_0_FD_ENA_MASK       I40E_MASK(0x1, I40E_PFQF_CTL_0_FD_ENA_SHIFT)
+#define I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT   18
+#define I40E_PFQF_CTL_0_ETYPE_ENA_MASK    I40E_MASK(0x1, I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT)
+#define I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT 19
+#define I40E_PFQF_CTL_0_MACVLAN_ENA_MASK  I40E_MASK(0x1, I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT)
+#define I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT   20
+#define I40E_PFQF_CTL_0_VFFCHSIZE_MASK    I40E_MASK(0xF, I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT)
+#define I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT   24
+#define I40E_PFQF_CTL_0_VFFCDSIZE_MASK    I40E_MASK(0x3, I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT)
+#define I40E_PFQF_CTL_1                    0x00245D80 /* Reset: CORER */
+#define I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT 0
+#define I40E_PFQF_CTL_1_CLEARFDTABLE_MASK  I40E_MASK(0x1, I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT)
+#define I40E_PFQF_FDALLOC               0x00246280 /* Reset: CORER */
+#define I40E_PFQF_FDALLOC_FDALLOC_SHIFT 0
+#define I40E_PFQF_FDALLOC_FDALLOC_MASK  I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDALLOC_SHIFT)
+#define I40E_PFQF_FDALLOC_FDBEST_SHIFT  8
+#define I40E_PFQF_FDALLOC_FDBEST_MASK   I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDBEST_SHIFT)
+#define I40E_PFQF_FDSTAT                   0x00246380 /* Reset: CORER */
+#define I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT 0
+#define I40E_PFQF_FDSTAT_GUARANT_CNT_MASK  I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT)
+#define I40E_PFQF_FDSTAT_BEST_CNT_SHIFT    16
+#define I40E_PFQF_FDSTAT_BEST_CNT_MASK     I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_BEST_CNT_SHIFT)
+#define I40E_PFQF_HENA(_i)             (0x00245900 + ((_i) * 128)) /* _i=0...1 */ /* Reset: CORER */
+#define I40E_PFQF_HENA_MAX_INDEX       1
+#define I40E_PFQF_HENA_PTYPE_ENA_SHIFT 0
+#define I40E_PFQF_HENA_PTYPE_ENA_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFQF_HENA_PTYPE_ENA_SHIFT)
+#define I40E_PFQF_HKEY(_i)         (0x00244800 + ((_i) * 128)) /* _i=0...12 */ /* Reset: CORER */
+#define I40E_PFQF_HKEY_MAX_INDEX   12 /* highest valid _i; registers are 128 bytes apart */
+#define I40E_PFQF_HKEY_KEY_0_SHIFT 0 /* KEY_0..KEY_3: four 8-bit fields at bit offsets 0, 8, 16 and 24 */
+#define I40E_PFQF_HKEY_KEY_0_MASK  I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_0_SHIFT)
+#define I40E_PFQF_HKEY_KEY_1_SHIFT 8
+#define I40E_PFQF_HKEY_KEY_1_MASK  I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_1_SHIFT)
+#define I40E_PFQF_HKEY_KEY_2_SHIFT 16
+#define I40E_PFQF_HKEY_KEY_2_MASK  I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_2_SHIFT)
+#define I40E_PFQF_HKEY_KEY_3_SHIFT 24
+#define I40E_PFQF_HKEY_KEY_3_MASK  I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_3_SHIFT) /* NOTE(review): 0xFF at shift 24 reaches bit 31; confirm I40E_MASK yields an unsigned 32-bit value */
+#define I40E_PFQF_HLUT(_i)        (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_PFQF_HLUT_MAX_INDEX  127
+#define I40E_PFQF_HLUT_LUT0_SHIFT 0
+#define I40E_PFQF_HLUT_LUT0_MASK  I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT0_SHIFT)
+#define I40E_PFQF_HLUT_LUT1_SHIFT 8
+#define I40E_PFQF_HLUT_LUT1_MASK  I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT1_SHIFT)
+#define I40E_PFQF_HLUT_LUT2_SHIFT 16
+#define I40E_PFQF_HLUT_LUT2_MASK  I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT2_SHIFT)
+#define I40E_PFQF_HLUT_LUT3_SHIFT 24
+#define I40E_PFQF_HLUT_LUT3_MASK  I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT3_SHIFT)
+#define I40E_PRTQF_CTL_0                0x00256E60 /* Reset: CORER */
+#define I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT 0
+#define I40E_PRTQF_CTL_0_HSYM_ENA_MASK  I40E_MASK(0x1, I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT)
+#define I40E_PRTQF_FD_FLXINSET(_i)         (0x00253800 + ((_i) * 32)) /* _i=0...63 */ /* Reset: CORER */
+#define I40E_PRTQF_FD_FLXINSET_MAX_INDEX   63
+#define I40E_PRTQF_FD_FLXINSET_INSET_SHIFT 0
+#define I40E_PRTQF_FD_FLXINSET_INSET_MASK  I40E_MASK(0xFF, I40E_PRTQF_FD_FLXINSET_INSET_SHIFT)
+#define I40E_PRTQF_FD_MSK(_i, _j)       (0x00252000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, _j=0...1 */ /* Reset: CORER */
+#define I40E_PRTQF_FD_MSK_MAX_INDEX    63 /* bounds _i only; _j has its own range (0...1) with no macro for it here */
+#define I40E_PRTQF_FD_MSK_MASK_SHIFT   0 /* 16-bit field in bits 0-15 */
+#define I40E_PRTQF_FD_MSK_MASK_MASK    I40E_MASK(0xFFFF, I40E_PRTQF_FD_MSK_MASK_SHIFT)
+#define I40E_PRTQF_FD_MSK_OFFSET_SHIFT 16 /* 6-bit field in bits 16-21 */
+#define I40E_PRTQF_FD_MSK_OFFSET_MASK  I40E_MASK(0x3F, I40E_PRTQF_FD_MSK_OFFSET_SHIFT)
+#define I40E_PRTQF_FLX_PIT(_i)              (0x00255200 + ((_i) * 32)) /* _i=0...8 */ /* Reset: CORER */
+#define I40E_PRTQF_FLX_PIT_MAX_INDEX        8
+#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT 0
+#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK  I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT)
+#define I40E_PRTQF_FLX_PIT_FSIZE_SHIFT      5
+#define I40E_PRTQF_FLX_PIT_FSIZE_MASK       I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_FSIZE_SHIFT)
+#define I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT   10
+#define I40E_PRTQF_FLX_PIT_DEST_OFF_MASK    I40E_MASK(0x3F, I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT)
+#define I40E_VFQF_HENA1(_i, _VF)         (0x00230800 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...1, _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFQF_HENA1_MAX_INDEX       1 /* bounds _i only (1024-byte stride); _VF ranges 0...127 at a 4-byte stride */
+#define I40E_VFQF_HENA1_PTYPE_ENA_SHIFT 0 /* field spans the whole 32-bit register */
+#define I40E_VFQF_HENA1_PTYPE_ENA_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA1_PTYPE_ENA_SHIFT)
+#define I40E_VFQF_HKEY1(_i, _VF)     (0x00228000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...12, _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFQF_HKEY1_MAX_INDEX   12 /* bounds _i only (1024-byte stride); _VF ranges 0...127 at a 4-byte stride */
+#define I40E_VFQF_HKEY1_KEY_0_SHIFT 0 /* KEY_0..KEY_3: four 8-bit fields at bit offsets 0, 8, 16 and 24 */
+#define I40E_VFQF_HKEY1_KEY_0_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_0_SHIFT)
+#define I40E_VFQF_HKEY1_KEY_1_SHIFT 8
+#define I40E_VFQF_HKEY1_KEY_1_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_1_SHIFT)
+#define I40E_VFQF_HKEY1_KEY_2_SHIFT 16
+#define I40E_VFQF_HKEY1_KEY_2_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_2_SHIFT)
+#define I40E_VFQF_HKEY1_KEY_3_SHIFT 24
+#define I40E_VFQF_HKEY1_KEY_3_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_3_SHIFT) /* NOTE(review): 0xFF at shift 24 reaches bit 31; confirm I40E_MASK yields an unsigned 32-bit value */
+#define I40E_VFQF_HLUT1(_i, _VF)    (0x00220000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFQF_HLUT1_MAX_INDEX  15 /* bounds _i only (1024-byte stride); _VF ranges 0...127 at a 4-byte stride */
+#define I40E_VFQF_HLUT1_LUT0_SHIFT 0 /* LUT0..LUT3: 4-bit fields placed at 8-bit intervals; upper nibble of each byte has no field here */
+#define I40E_VFQF_HLUT1_LUT0_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT0_SHIFT)
+#define I40E_VFQF_HLUT1_LUT1_SHIFT 8
+#define I40E_VFQF_HLUT1_LUT1_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT1_SHIFT)
+#define I40E_VFQF_HLUT1_LUT2_SHIFT 16
+#define I40E_VFQF_HLUT1_LUT2_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT2_SHIFT)
+#define I40E_VFQF_HLUT1_LUT3_SHIFT 24
+#define I40E_VFQF_HLUT1_LUT3_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT3_SHIFT)
+#define I40E_VFQF_HREGION1(_i, _VF)              (0x0022E000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...7, _VF=0...127 */ /* Reset: CORER */
+#define I40E_VFQF_HREGION1_MAX_INDEX            7 /* bounds _i only (1024-byte stride); _VF ranges 0...127 at a 4-byte stride */
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT 0 /* layout: eight 4-bit groups, each a 1-bit OVERRIDE_ENA_n followed by a 3-bit REGION_n */
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_0_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_0_SHIFT       1
+#define I40E_VFQF_HREGION1_REGION_0_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_0_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT 4
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_1_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_1_SHIFT       5
+#define I40E_VFQF_HREGION1_REGION_1_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_1_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT 8
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_2_SHIFT       9
+#define I40E_VFQF_HREGION1_REGION_2_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_2_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT 12
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_3_SHIFT       13
+#define I40E_VFQF_HREGION1_REGION_3_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_3_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT 16
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_4_SHIFT       17
+#define I40E_VFQF_HREGION1_REGION_4_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_4_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT 20
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_5_SHIFT       21
+#define I40E_VFQF_HREGION1_REGION_5_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_5_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT 24
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_6_SHIFT       25
+#define I40E_VFQF_HREGION1_REGION_6_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_6_SHIFT)
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT 28
+#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT)
+#define I40E_VFQF_HREGION1_REGION_7_SHIFT       29
+#define I40E_VFQF_HREGION1_REGION_7_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_7_SHIFT) /* NOTE(review): 0x7 at shift 29 reaches bit 31; confirm I40E_MASK yields an unsigned 32-bit value */
+#define I40E_VPQF_CTL(_VF)          (0x001C0000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPQF_CTL_MAX_INDEX     127 /* highest valid _VF; registers are 4 bytes apart */
+#define I40E_VPQF_CTL_PEHSIZE_SHIFT 0 /* 5-bit field, bits 0-4 */
+#define I40E_VPQF_CTL_PEHSIZE_MASK  I40E_MASK(0x1F, I40E_VPQF_CTL_PEHSIZE_SHIFT)
+#define I40E_VPQF_CTL_PEDSIZE_SHIFT 5 /* 5-bit field, bits 5-9 */
+#define I40E_VPQF_CTL_PEDSIZE_MASK  I40E_MASK(0x1F, I40E_VPQF_CTL_PEDSIZE_SHIFT)
+#define I40E_VPQF_CTL_FCHSIZE_SHIFT 10 /* 4-bit field, bits 10-13 */
+#define I40E_VPQF_CTL_FCHSIZE_MASK  I40E_MASK(0xF, I40E_VPQF_CTL_FCHSIZE_SHIFT)
+#define I40E_VPQF_CTL_FCDSIZE_SHIFT 14 /* 2-bit field, bits 14-15 */
+#define I40E_VPQF_CTL_FCDSIZE_MASK  I40E_MASK(0x3, I40E_VPQF_CTL_FCDSIZE_SHIFT)
+#define I40E_VSIQF_CTL(_VSI)             (0x0020D800 + ((_VSI) * 4)) /* _VSI=0...383 */ /* Reset: PFR */
+#define I40E_VSIQF_CTL_MAX_INDEX         383 /* highest valid _VSI; registers are 4 bytes apart */
+#define I40E_VSIQF_CTL_FCOE_ENA_SHIFT    0 /* six single-bit enable flags occupy bits 0-5 */
+#define I40E_VSIQF_CTL_FCOE_ENA_MASK     I40E_MASK(0x1, I40E_VSIQF_CTL_FCOE_ENA_SHIFT)
+#define I40E_VSIQF_CTL_PETCP_ENA_SHIFT   1
+#define I40E_VSIQF_CTL_PETCP_ENA_MASK    I40E_MASK(0x1, I40E_VSIQF_CTL_PETCP_ENA_SHIFT)
+#define I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT  2
+#define I40E_VSIQF_CTL_PEUUDP_ENA_MASK   I40E_MASK(0x1, I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT)
+#define I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT  3
+#define I40E_VSIQF_CTL_PEMUDP_ENA_MASK   I40E_MASK(0x1, I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT)
+#define I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT 4
+#define I40E_VSIQF_CTL_PEUFRAG_ENA_MASK  I40E_MASK(0x1, I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT)
+#define I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT 5
+#define I40E_VSIQF_CTL_PEMFRAG_ENA_MASK  I40E_MASK(0x1, I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT)
+#define I40E_VSIQF_TCREGION(_i, _VSI)         (0x00206000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...3, _VSI=0...383 */ /* Reset: PFR */
+#define I40E_VSIQF_TCREGION_MAX_INDEX        3 /* bounds _i only (2048-byte stride); _VSI ranges 0...383 at a 4-byte stride */
+#define I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT  0 /* low half: 9-bit offset (bits 0-8) then 3-bit size (bits 9-11) */
+#define I40E_VSIQF_TCREGION_TC_OFFSET_MASK   I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT)
+#define I40E_VSIQF_TCREGION_TC_SIZE_SHIFT    9
+#define I40E_VSIQF_TCREGION_TC_SIZE_MASK     I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE_SHIFT)
+#define I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT 16 /* high half: same offset/size pair, at bits 16-24 and 25-27 */
+#define I40E_VSIQF_TCREGION_TC_OFFSET2_MASK  I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT)
+#define I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT   25
+#define I40E_VSIQF_TCREGION_TC_SIZE2_MASK    I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT)
+#define I40E_GL_FCOECRC(_i)           (0x00314d80 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOECRC_MAX_INDEX     143
+#define I40E_GL_FCOECRC_FCOECRC_SHIFT 0
+#define I40E_GL_FCOECRC_FCOECRC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOECRC_FCOECRC_SHIFT)
+#define I40E_GL_FCOEDDPC(_i)            (0x00314480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDDPC_MAX_INDEX      143
+#define I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT 0
+#define I40E_GL_FCOEDDPC_FCOEDDPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT)
+#define I40E_GL_FCOEDIFEC(_i)             (0x00318480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDIFEC_MAX_INDEX       143 /* highest valid _i; registers are 8 bytes apart */
+#define I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT 0 /* NOTE(review): field is spelled FCOEDIFRC while the register is FCOEDIFEC; presumably inherited from the vendor register list - confirm before renaming */
+#define I40E_GL_FCOEDIFEC_FCOEDIFRC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT)
+#define I40E_GL_FCOEDIFTCL(_i)             (0x00354000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDIFTCL_MAX_INDEX       143
+#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT 0
+#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT)
+#define I40E_GL_FCOEDIXEC(_i)             (0x0034c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDIXEC_MAX_INDEX       143
+#define I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT 0
+#define I40E_GL_FCOEDIXEC_FCOEDIXEC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT)
+#define I40E_GL_FCOEDIXVC(_i)             (0x00350000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDIXVC_MAX_INDEX       143
+#define I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT 0
+#define I40E_GL_FCOEDIXVC_FCOEDIXVC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT)
+#define I40E_GL_FCOEDWRCH(_i)             (0x00320004 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDWRCH_MAX_INDEX       143
+#define I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT 0
+#define I40E_GL_FCOEDWRCH_FCOEDWRCH_MASK  I40E_MASK(0xFFFF, I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT)
+#define I40E_GL_FCOEDWRCL(_i)             (0x00320000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDWRCL_MAX_INDEX       143
+#define I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT 0
+#define I40E_GL_FCOEDWRCL_FCOEDWRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT)
+#define I40E_GL_FCOEDWTCH(_i)             (0x00348084 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDWTCH_MAX_INDEX       143
+#define I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT 0
+#define I40E_GL_FCOEDWTCH_FCOEDWTCH_MASK  I40E_MASK(0xFFFF, I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT)
+#define I40E_GL_FCOEDWTCL(_i)             (0x00348080 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEDWTCL_MAX_INDEX       143
+#define I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT 0
+#define I40E_GL_FCOEDWTCL_FCOEDWTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT)
+#define I40E_GL_FCOELAST(_i)            (0x00314000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOELAST_MAX_INDEX      143
+#define I40E_GL_FCOELAST_FCOELAST_SHIFT 0
+#define I40E_GL_FCOELAST_FCOELAST_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOELAST_FCOELAST_SHIFT)
+#define I40E_GL_FCOEPRC(_i)           (0x00315200 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEPRC_MAX_INDEX     143
+#define I40E_GL_FCOEPRC_FCOEPRC_SHIFT 0
+#define I40E_GL_FCOEPRC_FCOEPRC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPRC_FCOEPRC_SHIFT)
+#define I40E_GL_FCOEPTC(_i)           (0x00344C00 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOEPTC_MAX_INDEX     143
+#define I40E_GL_FCOEPTC_FCOEPTC_SHIFT 0
+#define I40E_GL_FCOEPTC_FCOEPTC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPTC_FCOEPTC_SHIFT)
+#define I40E_GL_FCOERPDC(_i)            (0x00324000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_FCOERPDC_MAX_INDEX      143
+#define I40E_GL_FCOERPDC_FCOERPDC_SHIFT 0
+#define I40E_GL_FCOERPDC_FCOERPDC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_FCOERPDC_FCOERPDC_SHIFT)
+#define I40E_GL_RXERR1_L(_i)             (0x00318000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_RXERR1_L_MAX_INDEX       143 /* highest valid _i; registers are 8 bytes apart */
+#define I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT 0 /* NOTE(review): field name FCOEDIFRC does not echo the register name RXERR1_L; presumably a legacy name - confirm against the datasheet */
+#define I40E_GL_RXERR1_L_FCOEDIFRC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT)
+#define I40E_GL_RXERR2_L(_i)             (0x0031c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */
+#define I40E_GL_RXERR2_L_MAX_INDEX       143
+#define I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT 0
+#define I40E_GL_RXERR2_L_FCOEDIXAC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT)
+#define I40E_GLPRT_BPRCH(_i)         (0x003005E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_BPRCH_MAX_INDEX   3
+#define I40E_GLPRT_BPRCH_BPRCH_SHIFT 0
+#define I40E_GLPRT_BPRCH_BPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_BPRCH_BPRCH_SHIFT)
+#define I40E_GLPRT_BPRCL(_i)         (0x003005E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_BPRCL_MAX_INDEX   3
+#define I40E_GLPRT_BPRCL_BPRCL_SHIFT 0
+#define I40E_GLPRT_BPRCL_BPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_BPRCL_BPRCL_SHIFT)
+#define I40E_GLPRT_BPTCH(_i)         (0x00300A04 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_BPTCH_MAX_INDEX   3
+#define I40E_GLPRT_BPTCH_BPTCH_SHIFT 0
+#define I40E_GLPRT_BPTCH_BPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_BPTCH_BPTCH_SHIFT)
+#define I40E_GLPRT_BPTCL(_i)         (0x00300A00 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_BPTCL_MAX_INDEX   3
+#define I40E_GLPRT_BPTCL_BPTCL_SHIFT 0
+#define I40E_GLPRT_BPTCL_BPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_BPTCL_BPTCL_SHIFT)
+#define I40E_GLPRT_CRCERRS(_i)           (0x00300080 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_CRCERRS_MAX_INDEX     3
+#define I40E_GLPRT_CRCERRS_CRCERRS_SHIFT 0
+#define I40E_GLPRT_CRCERRS_CRCERRS_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_CRCERRS_CRCERRS_SHIFT)
+#define I40E_GLPRT_GORCH(_i)         (0x00300004 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_GORCH_MAX_INDEX   3
+#define I40E_GLPRT_GORCH_GORCH_SHIFT 0
+#define I40E_GLPRT_GORCH_GORCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_GORCH_GORCH_SHIFT)
+#define I40E_GLPRT_GORCL(_i)         (0x00300000 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_GORCL_MAX_INDEX   3
+#define I40E_GLPRT_GORCL_GORCL_SHIFT 0
+#define I40E_GLPRT_GORCL_GORCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GORCL_GORCL_SHIFT)
+#define I40E_GLPRT_GOTCH(_i)         (0x00300684 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_GOTCH_MAX_INDEX   3
+#define I40E_GLPRT_GOTCH_GOTCH_SHIFT 0
+#define I40E_GLPRT_GOTCH_GOTCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_GOTCH_GOTCH_SHIFT)
+#define I40E_GLPRT_GOTCL(_i)         (0x00300680 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_GOTCL_MAX_INDEX   3
+#define I40E_GLPRT_GOTCL_GOTCL_SHIFT 0
+#define I40E_GLPRT_GOTCL_GOTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GOTCL_GOTCL_SHIFT)
+#define I40E_GLPRT_ILLERRC(_i)           (0x003000E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_ILLERRC_MAX_INDEX     3
+#define I40E_GLPRT_ILLERRC_ILLERRC_SHIFT 0
+#define I40E_GLPRT_ILLERRC_ILLERRC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ILLERRC_ILLERRC_SHIFT)
+#define I40E_GLPRT_LDPC(_i)        (0x00300620 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_LDPC_MAX_INDEX  3
+#define I40E_GLPRT_LDPC_LDPC_SHIFT 0
+#define I40E_GLPRT_LDPC_LDPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LDPC_LDPC_SHIFT)
+#define I40E_GLPRT_LXOFFRXC(_i)              (0x00300160 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_LXOFFRXC_MAX_INDEX        3
+#define I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT 0
+#define I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT)
+#define I40E_GLPRT_LXOFFTXC(_i)            (0x003009A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_LXOFFTXC_MAX_INDEX      3
+#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT 0
+#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT)
+#define I40E_GLPRT_LXONRXC(_i)             (0x00300140 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_LXONRXC_MAX_INDEX       3
+#define I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT 0
+#define I40E_GLPRT_LXONRXC_LXONRXCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT)
+#define I40E_GLPRT_LXONTXC(_i)           (0x00300980 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_LXONTXC_MAX_INDEX     3
+#define I40E_GLPRT_LXONTXC_LXONTXC_SHIFT 0
+#define I40E_GLPRT_LXONTXC_LXONTXC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONTXC_LXONTXC_SHIFT)
+#define I40E_GLPRT_MLFC(_i)        (0x00300020 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MLFC_MAX_INDEX  3
+#define I40E_GLPRT_MLFC_MLFC_SHIFT 0
+#define I40E_GLPRT_MLFC_MLFC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MLFC_MLFC_SHIFT)
+#define I40E_GLPRT_MPRCH(_i)         (0x003005C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MPRCH_MAX_INDEX   3
+#define I40E_GLPRT_MPRCH_MPRCH_SHIFT 0
+#define I40E_GLPRT_MPRCH_MPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_MPRCH_MPRCH_SHIFT)
+#define I40E_GLPRT_MPRCL(_i)         (0x003005C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MPRCL_MAX_INDEX   3
+#define I40E_GLPRT_MPRCL_MPRCL_SHIFT 0
+#define I40E_GLPRT_MPRCL_MPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPRCL_MPRCL_SHIFT)
+#define I40E_GLPRT_MPTCH(_i)         (0x003009E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MPTCH_MAX_INDEX   3
+#define I40E_GLPRT_MPTCH_MPTCH_SHIFT 0
+#define I40E_GLPRT_MPTCH_MPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_MPTCH_MPTCH_SHIFT)
+#define I40E_GLPRT_MPTCL(_i)         (0x003009E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MPTCL_MAX_INDEX   3
+#define I40E_GLPRT_MPTCL_MPTCL_SHIFT 0
+#define I40E_GLPRT_MPTCL_MPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPTCL_MPTCL_SHIFT)
+#define I40E_GLPRT_MRFC(_i)        (0x00300040 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_MRFC_MAX_INDEX  3
+#define I40E_GLPRT_MRFC_MRFC_SHIFT 0
+#define I40E_GLPRT_MRFC_MRFC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MRFC_MRFC_SHIFT)
+#define I40E_GLPRT_PRC1023H(_i)            (0x00300504 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC1023H_MAX_INDEX      3
+#define I40E_GLPRT_PRC1023H_PRC1023H_SHIFT 0
+#define I40E_GLPRT_PRC1023H_PRC1023H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC1023H_PRC1023H_SHIFT)
+#define I40E_GLPRT_PRC1023L(_i)            (0x00300500 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC1023L_MAX_INDEX      3
+#define I40E_GLPRT_PRC1023L_PRC1023L_SHIFT 0
+#define I40E_GLPRT_PRC1023L_PRC1023L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1023L_PRC1023L_SHIFT)
+#define I40E_GLPRT_PRC127H(_i)           (0x003004A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC127H_MAX_INDEX     3
+#define I40E_GLPRT_PRC127H_PRC127H_SHIFT 0
+#define I40E_GLPRT_PRC127H_PRC127H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC127H_PRC127H_SHIFT)
+#define I40E_GLPRT_PRC127L(_i)           (0x003004A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC127L_MAX_INDEX     3
+#define I40E_GLPRT_PRC127L_PRC127L_SHIFT 0
+#define I40E_GLPRT_PRC127L_PRC127L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC127L_PRC127L_SHIFT)
+#define I40E_GLPRT_PRC1522H(_i)            (0x00300524 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC1522H_MAX_INDEX      3
+#define I40E_GLPRT_PRC1522H_PRC1522H_SHIFT 0
+#define I40E_GLPRT_PRC1522H_PRC1522H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC1522H_PRC1522H_SHIFT)
+#define I40E_GLPRT_PRC1522L(_i)            (0x00300520 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC1522L_MAX_INDEX      3
+#define I40E_GLPRT_PRC1522L_PRC1522L_SHIFT 0
+#define I40E_GLPRT_PRC1522L_PRC1522L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1522L_PRC1522L_SHIFT)
+#define I40E_GLPRT_PRC255H(_i)              (0x003004C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC255H_MAX_INDEX        3 /* highest valid _i; registers are 8 bytes apart */
+#define I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT 0 /* NOTE(review): field carries a PRT prefix (PRTPRC255H), unlike the neighbouring PRCnnnH registers whose field repeats the register name */
+#define I40E_GLPRT_PRC255H_PRTPRC255H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT)
+#define I40E_GLPRT_PRC255L(_i)           (0x003004C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC255L_MAX_INDEX     3
+#define I40E_GLPRT_PRC255L_PRC255L_SHIFT 0
+#define I40E_GLPRT_PRC255L_PRC255L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC255L_PRC255L_SHIFT)
+#define I40E_GLPRT_PRC511H(_i)           (0x003004E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC511H_MAX_INDEX     3
+#define I40E_GLPRT_PRC511H_PRC511H_SHIFT 0
+#define I40E_GLPRT_PRC511H_PRC511H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC511H_PRC511H_SHIFT)
+#define I40E_GLPRT_PRC511L(_i)           (0x003004E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC511L_MAX_INDEX     3
+#define I40E_GLPRT_PRC511L_PRC511L_SHIFT 0
+#define I40E_GLPRT_PRC511L_PRC511L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC511L_PRC511L_SHIFT)
+#define I40E_GLPRT_PRC64H(_i)          (0x00300484 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC64H_MAX_INDEX    3
+#define I40E_GLPRT_PRC64H_PRC64H_SHIFT 0
+#define I40E_GLPRT_PRC64H_PRC64H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC64H_PRC64H_SHIFT)
+#define I40E_GLPRT_PRC64L(_i)          (0x00300480 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC64L_MAX_INDEX    3
+#define I40E_GLPRT_PRC64L_PRC64L_SHIFT 0
+#define I40E_GLPRT_PRC64L_PRC64L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC64L_PRC64L_SHIFT)
+#define I40E_GLPRT_PRC9522H(_i)            (0x00300544 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC9522H_MAX_INDEX      3 /* highest valid _i; registers are 8 bytes apart */
+#define I40E_GLPRT_PRC9522H_PRC1522H_SHIFT 0 /* NOTE(review): field is named PRC1522H although the register is PRC9522H; distinct from I40E_GLPRT_PRC1522H_* - confirm against the datasheet */
+#define I40E_GLPRT_PRC9522H_PRC1522H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PRC9522H_PRC1522H_SHIFT)
+#define I40E_GLPRT_PRC9522L(_i)            (0x00300540 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PRC9522L_MAX_INDEX      3 /* highest valid _i; registers are 8 bytes apart */
+#define I40E_GLPRT_PRC9522L_PRC1522L_SHIFT 0 /* NOTE(review): field is named PRC1522L although the register is PRC9522L; distinct from I40E_GLPRT_PRC1522L_* - confirm against the datasheet */
+#define I40E_GLPRT_PRC9522L_PRC1522L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC9522L_PRC1522L_SHIFT)
+#define I40E_GLPRT_PTC1023H(_i)            (0x00300724 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC1023H_MAX_INDEX      3
+#define I40E_GLPRT_PTC1023H_PTC1023H_SHIFT 0
+#define I40E_GLPRT_PTC1023H_PTC1023H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC1023H_PTC1023H_SHIFT)
+#define I40E_GLPRT_PTC1023L(_i)            (0x00300720 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC1023L_MAX_INDEX      3
+#define I40E_GLPRT_PTC1023L_PTC1023L_SHIFT 0
+#define I40E_GLPRT_PTC1023L_PTC1023L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC1023L_PTC1023L_SHIFT)
+#define I40E_GLPRT_PTC127H(_i)           (0x003006C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ /* H register: 16-bit field located 4 bytes above the matching ...L register (32-bit field); the same H/L layout repeats for every PTC pair in this group */
+#define I40E_GLPRT_PTC127H_MAX_INDEX     3
+#define I40E_GLPRT_PTC127H_PTC127H_SHIFT 0
+#define I40E_GLPRT_PTC127H_PTC127H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC127H_PTC127H_SHIFT)
+#define I40E_GLPRT_PTC127L(_i)           (0x003006C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC127L_MAX_INDEX     3
+#define I40E_GLPRT_PTC127L_PTC127L_SHIFT 0
+#define I40E_GLPRT_PTC127L_PTC127L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC127L_PTC127L_SHIFT)
+#define I40E_GLPRT_PTC1522H(_i)            (0x00300744 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC1522H_MAX_INDEX      3
+#define I40E_GLPRT_PTC1522H_PTC1522H_SHIFT 0
+#define I40E_GLPRT_PTC1522H_PTC1522H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC1522H_PTC1522H_SHIFT)
+#define I40E_GLPRT_PTC1522L(_i)            (0x00300740 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC1522L_MAX_INDEX      3
+#define I40E_GLPRT_PTC1522L_PTC1522L_SHIFT 0
+#define I40E_GLPRT_PTC1522L_PTC1522L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC1522L_PTC1522L_SHIFT)
+#define I40E_GLPRT_PTC255H(_i)           (0x003006E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC255H_MAX_INDEX     3
+#define I40E_GLPRT_PTC255H_PTC255H_SHIFT 0
+#define I40E_GLPRT_PTC255H_PTC255H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC255H_PTC255H_SHIFT)
+#define I40E_GLPRT_PTC255L(_i)           (0x003006E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC255L_MAX_INDEX     3
+#define I40E_GLPRT_PTC255L_PTC255L_SHIFT 0
+#define I40E_GLPRT_PTC255L_PTC255L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC255L_PTC255L_SHIFT)
+#define I40E_GLPRT_PTC511H(_i)           (0x00300704 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC511H_MAX_INDEX     3
+#define I40E_GLPRT_PTC511H_PTC511H_SHIFT 0
+#define I40E_GLPRT_PTC511H_PTC511H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC511H_PTC511H_SHIFT)
+#define I40E_GLPRT_PTC511L(_i)           (0x00300700 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC511L_MAX_INDEX     3
+#define I40E_GLPRT_PTC511L_PTC511L_SHIFT 0
+#define I40E_GLPRT_PTC511L_PTC511L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC511L_PTC511L_SHIFT)
+#define I40E_GLPRT_PTC64H(_i)          (0x003006A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC64H_MAX_INDEX    3
+#define I40E_GLPRT_PTC64H_PTC64H_SHIFT 0
+#define I40E_GLPRT_PTC64H_PTC64H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC64H_PTC64H_SHIFT)
+#define I40E_GLPRT_PTC64L(_i)          (0x003006A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC64L_MAX_INDEX    3
+#define I40E_GLPRT_PTC64L_PTC64L_SHIFT 0
+#define I40E_GLPRT_PTC64L_PTC64L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC64L_PTC64L_SHIFT)
+#define I40E_GLPRT_PTC9522H(_i)            (0x00300764 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC9522H_MAX_INDEX      3
+#define I40E_GLPRT_PTC9522H_PTC9522H_SHIFT 0
+#define I40E_GLPRT_PTC9522H_PTC9522H_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_PTC9522H_PTC9522H_SHIFT)
+#define I40E_GLPRT_PTC9522L(_i)            (0x00300760 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_PTC9522L_MAX_INDEX      3
+#define I40E_GLPRT_PTC9522L_PTC9522L_SHIFT 0
+#define I40E_GLPRT_PTC9522L_PTC9522L_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC9522L_PTC9522L_SHIFT)
+#define I40E_GLPRT_PXOFFRXC(_i, _j)             (0x00300280 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */
+#define I40E_GLPRT_PXOFFRXC_MAX_INDEX          3 /* matches the _i range only; _j runs 0...7 per the macro comment */
+#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT 0
+#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT)
+#define I40E_GLPRT_PXOFFTXC(_i, _j)             (0x00300880 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */
+#define I40E_GLPRT_PXOFFTXC_MAX_INDEX          3 /* matches the _i range only; _j runs 0...7 per the macro comment */
+#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT 0
+#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT)
+#define I40E_GLPRT_PXONRXC(_i, _j)            (0x00300180 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */
+#define I40E_GLPRT_PXONRXC_MAX_INDEX         3 /* matches the _i range only; _j runs 0...7 per the macro comment */
+#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT 0
+#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT)
+#define I40E_GLPRT_PXONTXC(_i, _j)          (0x00300780 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */
+#define I40E_GLPRT_PXONTXC_MAX_INDEX       3 /* matches the _i range only; _j runs 0...7 per the macro comment */
+#define I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT 0
+#define I40E_GLPRT_PXONTXC_PRPXONTXC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT)
+#define I40E_GLPRT_RDPC(_i)        (0x00300600 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RDPC_MAX_INDEX  3
+#define I40E_GLPRT_RDPC_RDPC_SHIFT 0
+#define I40E_GLPRT_RDPC_RDPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RDPC_RDPC_SHIFT)
+#define I40E_GLPRT_RFC(_i)       (0x00300560 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RFC_MAX_INDEX 3
+#define I40E_GLPRT_RFC_RFC_SHIFT 0
+#define I40E_GLPRT_RFC_RFC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RFC_RFC_SHIFT)
+#define I40E_GLPRT_RJC(_i)       (0x00300580 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RJC_MAX_INDEX 3
+#define I40E_GLPRT_RJC_RJC_SHIFT 0
+#define I40E_GLPRT_RJC_RJC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RJC_RJC_SHIFT)
+#define I40E_GLPRT_RLEC(_i)        (0x003000A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RLEC_MAX_INDEX  3
+#define I40E_GLPRT_RLEC_RLEC_SHIFT 0
+#define I40E_GLPRT_RLEC_RLEC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RLEC_RLEC_SHIFT)
+#define I40E_GLPRT_ROC(_i)       (0x00300120 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_ROC_MAX_INDEX 3
+#define I40E_GLPRT_ROC_ROC_SHIFT 0
+#define I40E_GLPRT_ROC_ROC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ROC_ROC_SHIFT)
+#define I40E_GLPRT_RUC(_i)       (0x00300100 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RUC_MAX_INDEX 3
+#define I40E_GLPRT_RUC_RUC_SHIFT 0
+#define I40E_GLPRT_RUC_RUC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUC_RUC_SHIFT)
+#define I40E_GLPRT_RUPP(_i)        (0x00300660 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_RUPP_MAX_INDEX  3
+#define I40E_GLPRT_RUPP_RUPP_SHIFT 0
+#define I40E_GLPRT_RUPP_RUPP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUPP_RUPP_SHIFT)
+#define I40E_GLPRT_RXON2OFFCNT(_i, _j)              (0x00300380 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */
+#define I40E_GLPRT_RXON2OFFCNT_MAX_INDEX           3 /* matches the _i range only; _j runs 0...7 per the macro comment */
+#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT 0
+#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT)
+#define I40E_GLPRT_TDOLD(_i)               (0x00300A20 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_TDOLD_MAX_INDEX         3
+#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT 0
+#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT)
+#define I40E_GLPRT_UPRCH(_i)         (0x003005A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_UPRCH_MAX_INDEX   3
+#define I40E_GLPRT_UPRCH_UPRCH_SHIFT 0
+#define I40E_GLPRT_UPRCH_UPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_UPRCH_UPRCH_SHIFT)
+#define I40E_GLPRT_UPRCL(_i)         (0x003005A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_UPRCL_MAX_INDEX   3
+#define I40E_GLPRT_UPRCL_UPRCL_SHIFT 0
+#define I40E_GLPRT_UPRCL_UPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPRCL_UPRCL_SHIFT)
+#define I40E_GLPRT_UPTCH(_i)         (0x003009C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_UPTCH_MAX_INDEX   3
+#define I40E_GLPRT_UPTCH_UPTCH_SHIFT 0
+#define I40E_GLPRT_UPTCH_UPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLPRT_UPTCH_UPTCH_SHIFT)
+#define I40E_GLPRT_UPTCL(_i)          (0x003009C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_GLPRT_UPTCL_MAX_INDEX    3
+#define I40E_GLPRT_UPTCL_VUPTCH_SHIFT 0 /* NOTE(review): field is named VUPTCH although this is the UPTCL register with a 32-bit mask; identifier left unchanged */
+#define I40E_GLPRT_UPTCL_VUPTCH_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPTCL_VUPTCH_SHIFT)
+#define I40E_GLSW_BPRCH(_i)         (0x00370104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ /* H register: 16-bit field located 4 bytes above the matching ...L register (32-bit field); the same H/L layout repeats for every GLSW H/L pair in this group */
+#define I40E_GLSW_BPRCH_MAX_INDEX   15
+#define I40E_GLSW_BPRCH_BPRCH_SHIFT 0
+#define I40E_GLSW_BPRCH_BPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_BPRCH_BPRCH_SHIFT)
+#define I40E_GLSW_BPRCL(_i)         (0x00370100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_BPRCL_MAX_INDEX   15
+#define I40E_GLSW_BPRCL_BPRCL_SHIFT 0
+#define I40E_GLSW_BPRCL_BPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPRCL_BPRCL_SHIFT)
+#define I40E_GLSW_BPTCH(_i)         (0x00340104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_BPTCH_MAX_INDEX   15
+#define I40E_GLSW_BPTCH_BPTCH_SHIFT 0
+#define I40E_GLSW_BPTCH_BPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_BPTCH_BPTCH_SHIFT)
+#define I40E_GLSW_BPTCL(_i)         (0x00340100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_BPTCL_MAX_INDEX   15
+#define I40E_GLSW_BPTCL_BPTCL_SHIFT 0
+#define I40E_GLSW_BPTCL_BPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPTCL_BPTCL_SHIFT)
+#define I40E_GLSW_GORCH(_i)         (0x0035C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_GORCH_MAX_INDEX   15
+#define I40E_GLSW_GORCH_GORCH_SHIFT 0
+#define I40E_GLSW_GORCH_GORCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_GORCH_GORCH_SHIFT)
+#define I40E_GLSW_GORCL(_i)         (0x0035c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_GORCL_MAX_INDEX   15
+#define I40E_GLSW_GORCL_GORCL_SHIFT 0
+#define I40E_GLSW_GORCL_GORCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_GORCL_GORCL_SHIFT)
+#define I40E_GLSW_GOTCH(_i)         (0x0032C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_GOTCH_MAX_INDEX   15
+#define I40E_GLSW_GOTCH_GOTCH_SHIFT 0
+#define I40E_GLSW_GOTCH_GOTCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_GOTCH_GOTCH_SHIFT)
+#define I40E_GLSW_GOTCL(_i)         (0x0032c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_GOTCL_MAX_INDEX   15
+#define I40E_GLSW_GOTCL_GOTCL_SHIFT 0
+#define I40E_GLSW_GOTCL_GOTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_GOTCL_GOTCL_SHIFT)
+#define I40E_GLSW_MPRCH(_i)         (0x00370084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_MPRCH_MAX_INDEX   15
+#define I40E_GLSW_MPRCH_MPRCH_SHIFT 0
+#define I40E_GLSW_MPRCH_MPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_MPRCH_MPRCH_SHIFT)
+#define I40E_GLSW_MPRCL(_i)         (0x00370080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_MPRCL_MAX_INDEX   15
+#define I40E_GLSW_MPRCL_MPRCL_SHIFT 0
+#define I40E_GLSW_MPRCL_MPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPRCL_MPRCL_SHIFT)
+#define I40E_GLSW_MPTCH(_i)         (0x00340084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_MPTCH_MAX_INDEX   15
+#define I40E_GLSW_MPTCH_MPTCH_SHIFT 0
+#define I40E_GLSW_MPTCH_MPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_MPTCH_MPTCH_SHIFT)
+#define I40E_GLSW_MPTCL(_i)         (0x00340080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_MPTCL_MAX_INDEX   15
+#define I40E_GLSW_MPTCL_MPTCL_SHIFT 0
+#define I40E_GLSW_MPTCL_MPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPTCL_MPTCL_SHIFT)
+#define I40E_GLSW_RUPP(_i)        (0x00370180 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_RUPP_MAX_INDEX  15
+#define I40E_GLSW_RUPP_RUPP_SHIFT 0
+#define I40E_GLSW_RUPP_RUPP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_RUPP_RUPP_SHIFT)
+#define I40E_GLSW_TDPC(_i)        (0x00348000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_TDPC_MAX_INDEX  15
+#define I40E_GLSW_TDPC_TDPC_SHIFT 0
+#define I40E_GLSW_TDPC_TDPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_TDPC_TDPC_SHIFT)
+#define I40E_GLSW_UPRCH(_i)         (0x00370004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_UPRCH_MAX_INDEX   15
+#define I40E_GLSW_UPRCH_UPRCH_SHIFT 0
+#define I40E_GLSW_UPRCH_UPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_UPRCH_UPRCH_SHIFT)
+#define I40E_GLSW_UPRCL(_i)         (0x00370000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_UPRCL_MAX_INDEX   15
+#define I40E_GLSW_UPRCL_UPRCL_SHIFT 0
+#define I40E_GLSW_UPRCL_UPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPRCL_UPRCL_SHIFT)
+#define I40E_GLSW_UPTCH(_i)         (0x00340004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_UPTCH_MAX_INDEX   15
+#define I40E_GLSW_UPTCH_UPTCH_SHIFT 0
+#define I40E_GLSW_UPTCH_UPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLSW_UPTCH_UPTCH_SHIFT)
+#define I40E_GLSW_UPTCL(_i)         (0x00340000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLSW_UPTCL_MAX_INDEX   15
+#define I40E_GLSW_UPTCL_UPTCL_SHIFT 0
+#define I40E_GLSW_UPTCL_UPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPTCL_UPTCL_SHIFT)
+#define I40E_GLV_BPRCH(_i)         (0x0036D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ /* H register: 16-bit field located 4 bytes above the matching ...L register (32-bit field); the same H/L layout repeats for every GLV H/L pair in this group */
+#define I40E_GLV_BPRCH_MAX_INDEX   383
+#define I40E_GLV_BPRCH_BPRCH_SHIFT 0
+#define I40E_GLV_BPRCH_BPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_BPRCH_BPRCH_SHIFT)
+#define I40E_GLV_BPRCL(_i)         (0x0036d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_BPRCL_MAX_INDEX   383
+#define I40E_GLV_BPRCL_BPRCL_SHIFT 0
+#define I40E_GLV_BPRCL_BPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_BPRCL_BPRCL_SHIFT)
+#define I40E_GLV_BPTCH(_i)         (0x0033D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_BPTCH_MAX_INDEX   383
+#define I40E_GLV_BPTCH_BPTCH_SHIFT 0
+#define I40E_GLV_BPTCH_BPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_BPTCH_BPTCH_SHIFT)
+#define I40E_GLV_BPTCL(_i)         (0x0033d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_BPTCL_MAX_INDEX   383
+#define I40E_GLV_BPTCL_BPTCL_SHIFT 0
+#define I40E_GLV_BPTCL_BPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_BPTCL_BPTCL_SHIFT)
+#define I40E_GLV_GORCH(_i)         (0x00358004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_GORCH_MAX_INDEX   383
+#define I40E_GLV_GORCH_GORCH_SHIFT 0
+#define I40E_GLV_GORCH_GORCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_GORCH_GORCH_SHIFT)
+#define I40E_GLV_GORCL(_i)         (0x00358000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_GORCL_MAX_INDEX   383
+#define I40E_GLV_GORCL_GORCL_SHIFT 0
+#define I40E_GLV_GORCL_GORCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_GORCL_GORCL_SHIFT)
+#define I40E_GLV_GOTCH(_i)         (0x00328004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_GOTCH_MAX_INDEX   383
+#define I40E_GLV_GOTCH_GOTCH_SHIFT 0
+#define I40E_GLV_GOTCH_GOTCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_GOTCH_GOTCH_SHIFT)
+#define I40E_GLV_GOTCL(_i)         (0x00328000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_GOTCL_MAX_INDEX   383
+#define I40E_GLV_GOTCL_GOTCL_SHIFT 0
+#define I40E_GLV_GOTCL_GOTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_GOTCL_GOTCL_SHIFT)
+#define I40E_GLV_MPRCH(_i)         (0x0036CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_MPRCH_MAX_INDEX   383
+#define I40E_GLV_MPRCH_MPRCH_SHIFT 0
+#define I40E_GLV_MPRCH_MPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_MPRCH_MPRCH_SHIFT)
+#define I40E_GLV_MPRCL(_i)         (0x0036cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_MPRCL_MAX_INDEX   383
+#define I40E_GLV_MPRCL_MPRCL_SHIFT 0
+#define I40E_GLV_MPRCL_MPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_MPRCL_MPRCL_SHIFT)
+#define I40E_GLV_MPTCH(_i)         (0x0033CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_MPTCH_MAX_INDEX   383
+#define I40E_GLV_MPTCH_MPTCH_SHIFT 0
+#define I40E_GLV_MPTCH_MPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_MPTCH_MPTCH_SHIFT)
+#define I40E_GLV_MPTCL(_i)         (0x0033cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_MPTCL_MAX_INDEX   383
+#define I40E_GLV_MPTCL_MPTCL_SHIFT 0
+#define I40E_GLV_MPTCL_MPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_MPTCL_MPTCL_SHIFT)
+#define I40E_GLV_RDPC(_i)        (0x00310000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_RDPC_MAX_INDEX  383
+#define I40E_GLV_RDPC_RDPC_SHIFT 0
+#define I40E_GLV_RDPC_RDPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_RDPC_RDPC_SHIFT)
+#define I40E_GLV_RUPP(_i)        (0x0036E400 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_RUPP_MAX_INDEX  383
+#define I40E_GLV_RUPP_RUPP_SHIFT 0
+#define I40E_GLV_RUPP_RUPP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT)
+#define I40E_GLV_TEPC(_VSI)      (0x00344000 + ((_VSI) * 4)) /* _VSI=0...383; note the 4-byte stride, unlike the 8-byte stride of the other GLV registers here */ /* Reset: CORER */
+#define I40E_GLV_TEPC_MAX_INDEX  383
+#define I40E_GLV_TEPC_TEPC_SHIFT 0
+#define I40E_GLV_TEPC_TEPC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT)
+#define I40E_GLV_UPRCH(_i)         (0x0036C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_UPRCH_MAX_INDEX   383
+#define I40E_GLV_UPRCH_UPRCH_SHIFT 0
+#define I40E_GLV_UPRCH_UPRCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_UPRCH_UPRCH_SHIFT)
+#define I40E_GLV_UPRCL(_i)         (0x0036c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_UPRCL_MAX_INDEX   383
+#define I40E_GLV_UPRCL_UPRCL_SHIFT 0
+#define I40E_GLV_UPRCL_UPRCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_UPRCL_UPRCL_SHIFT)
+#define I40E_GLV_UPTCH(_i)            (0x0033C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_UPTCH_MAX_INDEX      383
+#define I40E_GLV_UPTCH_GLVUPTCH_SHIFT 0
+#define I40E_GLV_UPTCH_GLVUPTCH_MASK  I40E_MASK(0xFFFF, I40E_GLV_UPTCH_GLVUPTCH_SHIFT)
+#define I40E_GLV_UPTCL(_i)         (0x0033c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_UPTCL_MAX_INDEX   383
+#define I40E_GLV_UPTCL_UPTCL_SHIFT 0
+#define I40E_GLV_UPTCL_UPTCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLV_UPTCL_UPTCL_SHIFT)
+#define I40E_GLVEBTC_RBCH(_i, _j)      (0x00364004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_RBCH_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_RBCH_TCBCH_SHIFT 0
+#define I40E_GLVEBTC_RBCH_TCBCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBTC_RBCH_TCBCH_SHIFT)
+#define I40E_GLVEBTC_RBCL(_i, _j)      (0x00364000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_RBCL_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_RBCL_TCBCL_SHIFT 0
+#define I40E_GLVEBTC_RBCL_TCBCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RBCL_TCBCL_SHIFT)
+#define I40E_GLVEBTC_RPCH(_i, _j)      (0x00368004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_RPCH_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_RPCH_TCPCH_SHIFT 0
+#define I40E_GLVEBTC_RPCH_TCPCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBTC_RPCH_TCPCH_SHIFT)
+#define I40E_GLVEBTC_RPCL(_i, _j)      (0x00368000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_RPCL_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_RPCL_TCPCL_SHIFT 0
+#define I40E_GLVEBTC_RPCL_TCPCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RPCL_TCPCL_SHIFT)
+#define I40E_GLVEBTC_TBCH(_i, _j)      (0x00334004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_TBCH_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_TBCH_TCBCH_SHIFT 0
+#define I40E_GLVEBTC_TBCH_TCBCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBTC_TBCH_TCBCH_SHIFT)
+#define I40E_GLVEBTC_TBCL(_i, _j)      (0x00334000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_TBCL_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_TBCL_TCBCL_SHIFT 0
+#define I40E_GLVEBTC_TBCL_TCBCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TBCL_TCBCL_SHIFT)
+#define I40E_GLVEBTC_TPCH(_i, _j)      (0x00338004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_TPCH_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_TPCH_TCPCH_SHIFT 0
+#define I40E_GLVEBTC_TPCH_TCPCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBTC_TPCH_TCPCH_SHIFT)
+#define I40E_GLVEBTC_TPCL(_i, _j)      (0x00338000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */
+#define I40E_GLVEBTC_TPCL_MAX_INDEX   7 /* matches the _i range only; _j runs 0...15 per the macro comment */
+#define I40E_GLVEBTC_TPCL_TCPCL_SHIFT 0
+#define I40E_GLVEBTC_TPCL_TCPCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TPCL_TCPCL_SHIFT)
+#define I40E_GLVEBVL_BPCH(_i)          (0x00374804 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ /* H register: 16-bit field located 4 bytes above the matching ...L register (32-bit field); the same H/L layout repeats for every GLVEBVL pair in this group */
+#define I40E_GLVEBVL_BPCH_MAX_INDEX    127
+#define I40E_GLVEBVL_BPCH_VLBPCH_SHIFT 0
+#define I40E_GLVEBVL_BPCH_VLBPCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBVL_BPCH_VLBPCH_SHIFT)
+#define I40E_GLVEBVL_BPCL(_i)          (0x00374800 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_BPCL_MAX_INDEX    127
+#define I40E_GLVEBVL_BPCL_VLBPCL_SHIFT 0
+#define I40E_GLVEBVL_BPCL_VLBPCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_BPCL_VLBPCL_SHIFT)
+#define I40E_GLVEBVL_GORCH(_i)         (0x00360004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_GORCH_MAX_INDEX   127
+#define I40E_GLVEBVL_GORCH_VLBCH_SHIFT 0
+#define I40E_GLVEBVL_GORCH_VLBCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBVL_GORCH_VLBCH_SHIFT)
+#define I40E_GLVEBVL_GORCL(_i)         (0x00360000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_GORCL_MAX_INDEX   127
+#define I40E_GLVEBVL_GORCL_VLBCL_SHIFT 0
+#define I40E_GLVEBVL_GORCL_VLBCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_GORCL_VLBCL_SHIFT)
+#define I40E_GLVEBVL_GOTCH(_i)         (0x00330004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_GOTCH_MAX_INDEX   127
+#define I40E_GLVEBVL_GOTCH_VLBCH_SHIFT 0
+#define I40E_GLVEBVL_GOTCH_VLBCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBVL_GOTCH_VLBCH_SHIFT)
+#define I40E_GLVEBVL_GOTCL(_i)         (0x00330000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_GOTCL_MAX_INDEX   127
+#define I40E_GLVEBVL_GOTCL_VLBCL_SHIFT 0
+#define I40E_GLVEBVL_GOTCL_VLBCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_GOTCL_VLBCL_SHIFT)
+#define I40E_GLVEBVL_MPCH(_i)          (0x00374404 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_MPCH_MAX_INDEX    127
+#define I40E_GLVEBVL_MPCH_VLMPCH_SHIFT 0
+#define I40E_GLVEBVL_MPCH_VLMPCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBVL_MPCH_VLMPCH_SHIFT)
+#define I40E_GLVEBVL_MPCL(_i)          (0x00374400 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_MPCL_MAX_INDEX    127
+#define I40E_GLVEBVL_MPCL_VLMPCL_SHIFT 0
+#define I40E_GLVEBVL_MPCL_VLMPCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_MPCL_VLMPCL_SHIFT)
+#define I40E_GLVEBVL_UPCH(_i)          (0x00374004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_UPCH_MAX_INDEX    127
+#define I40E_GLVEBVL_UPCH_VLUPCH_SHIFT 0
+#define I40E_GLVEBVL_UPCH_VLUPCH_MASK  I40E_MASK(0xFFFF, I40E_GLVEBVL_UPCH_VLUPCH_SHIFT)
+#define I40E_GLVEBVL_UPCL(_i)          (0x00374000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_GLVEBVL_UPCL_MAX_INDEX    127
+#define I40E_GLVEBVL_UPCL_VLUPCL_SHIFT 0
+#define I40E_GLVEBVL_UPCL_VLUPCL_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_UPCL_VLUPCL_SHIFT)
+#define I40E_GL_MTG_FLU_MSK_H                 0x00269F4C /* Reset: CORER */
+#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT 0
+#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_MASK  I40E_MASK(0xFFFF, I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT)
+#define I40E_GL_SWR_DEF_ACT(_i)              (0x00270200 + ((_i) * 4)) /* _i=0...35 */ /* Reset: CORER */
+#define I40E_GL_SWR_DEF_ACT_MAX_INDEX        35
+#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT 0
+#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT)
+#define I40E_GL_SWR_DEF_ACT_EN(_i)                     (0x0026CFB8 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ /* bitmap spans two consecutive 32-bit registers; presumably one enable bit per GL_SWR_DEF_ACT entry - TODO confirm against the datasheet */
+#define I40E_GL_SWR_DEF_ACT_EN_MAX_INDEX               1
+#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT 0
+#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT)
+#define I40E_PRTTSYN_ADJ               0x001E4280 /* Reset: GLOBR */
+#define I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT 0
+#define I40E_PRTTSYN_ADJ_TSYNADJ_MASK  I40E_MASK(0x7FFFFFFF, I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT)
+#define I40E_PRTTSYN_ADJ_SIGN_SHIFT    31
+#define I40E_PRTTSYN_ADJ_SIGN_MASK     I40E_MASK(0x1, I40E_PRTTSYN_ADJ_SIGN_SHIFT)
+#define I40E_PRTTSYN_AUX_0(_i)           (0x001E42A0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_AUX_0_MAX_INDEX     1
+#define I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT 0
+#define I40E_PRTTSYN_AUX_0_OUT_ENA_MASK  I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT)
+#define I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT  1
+#define I40E_PRTTSYN_AUX_0_OUTMOD_MASK   I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT)
+#define I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT  3
+#define I40E_PRTTSYN_AUX_0_OUTLVL_MASK   I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT)
+#define I40E_PRTTSYN_AUX_0_PULSEW_SHIFT  8
+#define I40E_PRTTSYN_AUX_0_PULSEW_MASK   I40E_MASK(0xF, I40E_PRTTSYN_AUX_0_PULSEW_SHIFT)
+#define I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT 16
+#define I40E_PRTTSYN_AUX_0_EVNTLVL_MASK  I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT)
+#define I40E_PRTTSYN_AUX_1(_i)               (0x001E42E0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_AUX_1_MAX_INDEX         1
+#define I40E_PRTTSYN_AUX_1_INSTNT_SHIFT      0
+#define I40E_PRTTSYN_AUX_1_INSTNT_MASK       I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_INSTNT_SHIFT)
+#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT 1
+#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_MASK  I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT)
+#define I40E_PRTTSYN_CLKO(_i)            (0x001E4240 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_CLKO_MAX_INDEX      1
+#define I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT 0
+#define I40E_PRTTSYN_CLKO_TSYNCLKO_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT)
+#define I40E_PRTTSYN_CTL0                       0x001E4200 /* Reset: GLOBR */
+#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT 0
+#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_MASK  I40E_MASK(0x1, I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT)
+#define I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT  1
+#define I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK   I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT)
+#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT   2
+#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_MASK    I40E_MASK(0x1, I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT)
+#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT     3
+#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_MASK      I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT)
+#define I40E_PRTTSYN_CTL0_PF_ID_SHIFT           8
+#define I40E_PRTTSYN_CTL0_PF_ID_MASK            I40E_MASK(0xF, I40E_PRTTSYN_CTL0_PF_ID_SHIFT)
+#define I40E_PRTTSYN_CTL0_TSYNACT_SHIFT         12
+#define I40E_PRTTSYN_CTL0_TSYNACT_MASK          I40E_MASK(0x3, I40E_PRTTSYN_CTL0_TSYNACT_SHIFT)
+#define I40E_PRTTSYN_CTL0_TSYNENA_SHIFT         31
+#define I40E_PRTTSYN_CTL0_TSYNENA_MASK          I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TSYNENA_SHIFT)
+#define I40E_PRTTSYN_CTL1                   0x00085020 /* Reset: CORER */
+#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT 0
+#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK  I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT)
+#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT 8
+#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_MASK  I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT)
+#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT 16
+#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_MASK  I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT)
+#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT 20
+#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_MASK  I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT)
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT    24
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_MASK     I40E_MASK(0x3, I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
+#define I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT     26
+#define I40E_PRTTSYN_CTL1_UDP_ENA_MASK      I40E_MASK(0x3, I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT)
+#define I40E_PRTTSYN_CTL1_TSYNENA_SHIFT     31
+#define I40E_PRTTSYN_CTL1_TSYNENA_MASK      I40E_MASK(0x1, I40E_PRTTSYN_CTL1_TSYNENA_SHIFT)
+#define I40E_PRTTSYN_EVNT_H(_i)              (0x001E40C0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_EVNT_H_MAX_INDEX        1
+#define I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT 0
+#define I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT)
+#define I40E_PRTTSYN_EVNT_L(_i)              (0x001E4080 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_EVNT_L_MAX_INDEX        1
+#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT 0
+#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT)
+#define I40E_PRTTSYN_INC_H                 0x001E4060 /* Reset: GLOBR */
+#define I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT 0
+#define I40E_PRTTSYN_INC_H_TSYNINC_H_MASK  I40E_MASK(0x3F, I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT)
+#define I40E_PRTTSYN_INC_L                 0x001E4040 /* Reset: GLOBR */
+#define I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT 0
+#define I40E_PRTTSYN_INC_L_TSYNINC_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT)
+#define I40E_PRTTSYN_RXTIME_H(_i)            (0x00085040 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_PRTTSYN_RXTIME_H_MAX_INDEX      3
+#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT 0 /* NOTE(review): "RXTIEM" looks like a transposition of "RXTIME"; identifier left unchanged */
+#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT)
+#define I40E_PRTTSYN_RXTIME_L(_i)            (0x000850C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */
+#define I40E_PRTTSYN_RXTIME_L_MAX_INDEX      3
+#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT 0 /* NOTE(review): "RXTIEM" looks like a transposition of "RXTIME"; identifier left unchanged */
+#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT)
+#define I40E_PRTTSYN_STAT_0              0x001E4220 /* Reset: GLOBR */
+#define I40E_PRTTSYN_STAT_0_EVENT0_SHIFT 0
+#define I40E_PRTTSYN_STAT_0_EVENT0_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT0_SHIFT)
+#define I40E_PRTTSYN_STAT_0_EVENT1_SHIFT 1
+#define I40E_PRTTSYN_STAT_0_EVENT1_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT1_SHIFT)
+#define I40E_PRTTSYN_STAT_0_TGT0_SHIFT   2
+#define I40E_PRTTSYN_STAT_0_TGT0_MASK    I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT0_SHIFT)
+#define I40E_PRTTSYN_STAT_0_TGT1_SHIFT   3
+#define I40E_PRTTSYN_STAT_0_TGT1_MASK    I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT1_SHIFT)
+#define I40E_PRTTSYN_STAT_0_TXTIME_SHIFT 4
+#define I40E_PRTTSYN_STAT_0_TXTIME_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TXTIME_SHIFT)
+#define I40E_PRTTSYN_STAT_1            0x00085140 /* Reset: CORER */
+#define I40E_PRTTSYN_STAT_1_RXT0_SHIFT 0
+#define I40E_PRTTSYN_STAT_1_RXT0_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT0_SHIFT)
+#define I40E_PRTTSYN_STAT_1_RXT1_SHIFT 1
+#define I40E_PRTTSYN_STAT_1_RXT1_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT1_SHIFT)
+#define I40E_PRTTSYN_STAT_1_RXT2_SHIFT 2
+#define I40E_PRTTSYN_STAT_1_RXT2_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT2_SHIFT)
+#define I40E_PRTTSYN_STAT_1_RXT3_SHIFT 3
+#define I40E_PRTTSYN_STAT_1_RXT3_MASK  I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT3_SHIFT)
+#define I40E_PRTTSYN_TGT_H(_i)              (0x001E4180 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_TGT_H_MAX_INDEX        1
+#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT 0
+#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT)
+#define I40E_PRTTSYN_TGT_L(_i)              (0x001E4140 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */
+#define I40E_PRTTSYN_TGT_L_MAX_INDEX        1
+#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT 0
+#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT)
+#define I40E_PRTTSYN_TIME_H                  0x001E4120 /* Reset: GLOBR */
+#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT 0
+#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT)
+#define I40E_PRTTSYN_TIME_L                  0x001E4100 /* Reset: GLOBR */
+#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT 0
+#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT)
+#define I40E_PRTTSYN_TXTIME_H                0x001E41E0 /* Reset: GLOBR */
+#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT 0 /* NOTE(review): "TXTIEM" looks like a transposition of "TXTIME"; identifier left unchanged */
+#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT)
+#define I40E_PRTTSYN_TXTIME_L                0x001E41C0 /* Reset: GLOBR */
+#define I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT 0 /* NOTE(review): "TXTIEM" looks like a transposition of "TXTIME"; identifier left unchanged */
+#define I40E_PRTTSYN_TXTIME_L_TXTIEM_L_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT)
+#define I40E_GL_MDET_RX                0x0012A510 /* Reset: CORER */
+#define I40E_GL_MDET_RX_FUNCTION_SHIFT 0
+#define I40E_GL_MDET_RX_FUNCTION_MASK  I40E_MASK(0xFF, I40E_GL_MDET_RX_FUNCTION_SHIFT)
+#define I40E_GL_MDET_RX_EVENT_SHIFT    8
+#define I40E_GL_MDET_RX_EVENT_MASK     I40E_MASK(0x1FF, I40E_GL_MDET_RX_EVENT_SHIFT)
+#define I40E_GL_MDET_RX_QUEUE_SHIFT    17
+#define I40E_GL_MDET_RX_QUEUE_MASK     I40E_MASK(0x3FFF, I40E_GL_MDET_RX_QUEUE_SHIFT)
+#define I40E_GL_MDET_RX_VALID_SHIFT    31
+#define I40E_GL_MDET_RX_VALID_MASK     I40E_MASK(0x1, I40E_GL_MDET_RX_VALID_SHIFT)
+#define I40E_GL_MDET_TX              0x000E6480 /* Reset: CORER */
+#define I40E_GL_MDET_TX_QUEUE_SHIFT  0
+#define I40E_GL_MDET_TX_QUEUE_MASK   I40E_MASK(0xFFF, I40E_GL_MDET_TX_QUEUE_SHIFT)
+#define I40E_GL_MDET_TX_VF_NUM_SHIFT 12
+#define I40E_GL_MDET_TX_VF_NUM_MASK  I40E_MASK(0x1FF, I40E_GL_MDET_TX_VF_NUM_SHIFT)
+#define I40E_GL_MDET_TX_PF_NUM_SHIFT 21
+#define I40E_GL_MDET_TX_PF_NUM_MASK  I40E_MASK(0xF, I40E_GL_MDET_TX_PF_NUM_SHIFT)
+#define I40E_GL_MDET_TX_EVENT_SHIFT  25
+#define I40E_GL_MDET_TX_EVENT_MASK   I40E_MASK(0x1F, I40E_GL_MDET_TX_EVENT_SHIFT)
+#define I40E_GL_MDET_TX_VALID_SHIFT  31
+#define I40E_GL_MDET_TX_VALID_MASK   I40E_MASK(0x1, I40E_GL_MDET_TX_VALID_SHIFT)
+#define I40E_PF_MDET_RX             0x0012A400 /* Reset: CORER */
+#define I40E_PF_MDET_RX_VALID_SHIFT 0
+#define I40E_PF_MDET_RX_VALID_MASK  I40E_MASK(0x1, I40E_PF_MDET_RX_VALID_SHIFT)
+#define I40E_PF_MDET_TX             0x000E6400 /* Reset: CORER */
+#define I40E_PF_MDET_TX_VALID_SHIFT 0
+#define I40E_PF_MDET_TX_VALID_MASK  I40E_MASK(0x1, I40E_PF_MDET_TX_VALID_SHIFT)
+#define I40E_PF_VT_PFALLOC               0x001C0500 /* Reset: CORER */
+#define I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT 0
+#define I40E_PF_VT_PFALLOC_FIRSTVF_MASK  I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT)
+#define I40E_PF_VT_PFALLOC_LASTVF_SHIFT  8
+#define I40E_PF_VT_PFALLOC_LASTVF_MASK   I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_LASTVF_SHIFT)
+#define I40E_PF_VT_PFALLOC_VALID_SHIFT   31
+#define I40E_PF_VT_PFALLOC_VALID_MASK    I40E_MASK(0x1, I40E_PF_VT_PFALLOC_VALID_SHIFT)
+#define I40E_VP_MDET_RX(_VF)        (0x0012A000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VP_MDET_RX_MAX_INDEX   127
+#define I40E_VP_MDET_RX_VALID_SHIFT 0
+#define I40E_VP_MDET_RX_VALID_MASK  I40E_MASK(0x1, I40E_VP_MDET_RX_VALID_SHIFT)
+#define I40E_VP_MDET_TX(_VF)        (0x000E6000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: CORER */
+#define I40E_VP_MDET_TX_MAX_INDEX   127
+#define I40E_VP_MDET_TX_VALID_SHIFT 0
+#define I40E_VP_MDET_TX_VALID_MASK  I40E_MASK(0x1, I40E_VP_MDET_TX_VALID_SHIFT)
+#define I40E_GLPM_WUMC                    0x0006C800 /* Reset: POR */
+#define I40E_GLPM_WUMC_NOTCO_SHIFT        0
+#define I40E_GLPM_WUMC_NOTCO_MASK         I40E_MASK(0x1, I40E_GLPM_WUMC_NOTCO_SHIFT)
+#define I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT 1
+#define I40E_GLPM_WUMC_SRST_PIN_VAL_MASK  I40E_MASK(0x1, I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT)
+#define I40E_GLPM_WUMC_ROL_MODE_SHIFT     2
+#define I40E_GLPM_WUMC_ROL_MODE_MASK      I40E_MASK(0x1, I40E_GLPM_WUMC_ROL_MODE_SHIFT)
+#define I40E_GLPM_WUMC_RESERVED_4_SHIFT   3
+#define I40E_GLPM_WUMC_RESERVED_4_MASK    I40E_MASK(0x1FFF, I40E_GLPM_WUMC_RESERVED_4_SHIFT)
+#define I40E_GLPM_WUMC_MNG_WU_PF_SHIFT    16
+#define I40E_GLPM_WUMC_MNG_WU_PF_MASK     I40E_MASK(0xFFFF, I40E_GLPM_WUMC_MNG_WU_PF_SHIFT)
+#define I40E_PFPM_APM            0x000B8080 /* Reset: POR */
+#define I40E_PFPM_APM_APME_SHIFT 0
+#define I40E_PFPM_APM_APME_MASK  I40E_MASK(0x1, I40E_PFPM_APM_APME_SHIFT)
+#define I40E_PFPM_FHFT_LENGTH(_i)          (0x0006A000 + ((_i) * 128)) /* _i=0...7 */ /* Reset: POR */
+#define I40E_PFPM_FHFT_LENGTH_MAX_INDEX    7
+#define I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT 0
+#define I40E_PFPM_FHFT_LENGTH_LENGTH_MASK  I40E_MASK(0xFF, I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT)
+#define I40E_PFPM_WUC                 0x0006B200 /* Reset: POR */
+#define I40E_PFPM_WUC_EN_APM_D0_SHIFT 5
+#define I40E_PFPM_WUC_EN_APM_D0_MASK  I40E_MASK(0x1, I40E_PFPM_WUC_EN_APM_D0_SHIFT)
+#define I40E_PFPM_WUFC                 0x0006B400 /* Reset: POR */
+#define I40E_PFPM_WUFC_LNKC_SHIFT      0
+#define I40E_PFPM_WUFC_LNKC_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_LNKC_SHIFT)
+#define I40E_PFPM_WUFC_MAG_SHIFT       1
+#define I40E_PFPM_WUFC_MAG_MASK        I40E_MASK(0x1, I40E_PFPM_WUFC_MAG_SHIFT)
+#define I40E_PFPM_WUFC_MNG_SHIFT       3
+#define I40E_PFPM_WUFC_MNG_MASK        I40E_MASK(0x1, I40E_PFPM_WUFC_MNG_SHIFT)
+#define I40E_PFPM_WUFC_FLX0_ACT_SHIFT  4
+#define I40E_PFPM_WUFC_FLX0_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX1_ACT_SHIFT  5
+#define I40E_PFPM_WUFC_FLX1_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX2_ACT_SHIFT  6
+#define I40E_PFPM_WUFC_FLX2_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX3_ACT_SHIFT  7
+#define I40E_PFPM_WUFC_FLX3_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX4_ACT_SHIFT  8
+#define I40E_PFPM_WUFC_FLX4_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX5_ACT_SHIFT  9
+#define I40E_PFPM_WUFC_FLX5_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX5_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX6_ACT_SHIFT  10
+#define I40E_PFPM_WUFC_FLX6_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX7_ACT_SHIFT  11
+#define I40E_PFPM_WUFC_FLX7_ACT_MASK   I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_ACT_SHIFT)
+#define I40E_PFPM_WUFC_FLX0_SHIFT      16
+#define I40E_PFPM_WUFC_FLX0_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_SHIFT)
+#define I40E_PFPM_WUFC_FLX1_SHIFT      17
+#define I40E_PFPM_WUFC_FLX1_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_SHIFT)
+#define I40E_PFPM_WUFC_FLX2_SHIFT      18
+#define I40E_PFPM_WUFC_FLX2_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_SHIFT)
+#define I40E_PFPM_WUFC_FLX3_SHIFT      19
+#define I40E_PFPM_WUFC_FLX3_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_SHIFT)
+#define I40E_PFPM_WUFC_FLX4_SHIFT      20
+#define I40E_PFPM_WUFC_FLX4_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_SHIFT)
+#define I40E_PFPM_WUFC_FLX5_SHIFT      21
+#define I40E_PFPM_WUFC_FLX5_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX5_SHIFT)
+#define I40E_PFPM_WUFC_FLX6_SHIFT      22
+#define I40E_PFPM_WUFC_FLX6_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_SHIFT)
+#define I40E_PFPM_WUFC_FLX7_SHIFT      23
+#define I40E_PFPM_WUFC_FLX7_MASK       I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_SHIFT)
+#define I40E_PFPM_WUFC_FW_RST_WK_SHIFT 31
+#define I40E_PFPM_WUFC_FW_RST_WK_MASK  I40E_MASK(0x1, I40E_PFPM_WUFC_FW_RST_WK_SHIFT)
+#define I40E_PFPM_WUS                  0x0006B600 /* Reset: POR */
+#define I40E_PFPM_WUS_LNKC_SHIFT       0
+#define I40E_PFPM_WUS_LNKC_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_LNKC_SHIFT)
+#define I40E_PFPM_WUS_MAG_SHIFT        1
+#define I40E_PFPM_WUS_MAG_MASK         I40E_MASK(0x1, I40E_PFPM_WUS_MAG_SHIFT)
+#define I40E_PFPM_WUS_PME_STATUS_SHIFT 2
+#define I40E_PFPM_WUS_PME_STATUS_MASK  I40E_MASK(0x1, I40E_PFPM_WUS_PME_STATUS_SHIFT)
+#define I40E_PFPM_WUS_MNG_SHIFT        3
+#define I40E_PFPM_WUS_MNG_MASK         I40E_MASK(0x1, I40E_PFPM_WUS_MNG_SHIFT)
+#define I40E_PFPM_WUS_FLX0_SHIFT       16
+#define I40E_PFPM_WUS_FLX0_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX0_SHIFT)
+#define I40E_PFPM_WUS_FLX1_SHIFT       17
+#define I40E_PFPM_WUS_FLX1_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX1_SHIFT)
+#define I40E_PFPM_WUS_FLX2_SHIFT       18
+#define I40E_PFPM_WUS_FLX2_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX2_SHIFT)
+#define I40E_PFPM_WUS_FLX3_SHIFT       19
+#define I40E_PFPM_WUS_FLX3_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX3_SHIFT)
+#define I40E_PFPM_WUS_FLX4_SHIFT       20
+#define I40E_PFPM_WUS_FLX4_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX4_SHIFT)
+#define I40E_PFPM_WUS_FLX5_SHIFT       21
+#define I40E_PFPM_WUS_FLX5_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX5_SHIFT)
+#define I40E_PFPM_WUS_FLX6_SHIFT       22
+#define I40E_PFPM_WUS_FLX6_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX6_SHIFT)
+#define I40E_PFPM_WUS_FLX7_SHIFT       23
+#define I40E_PFPM_WUS_FLX7_MASK        I40E_MASK(0x1, I40E_PFPM_WUS_FLX7_SHIFT)
+#define I40E_PFPM_WUS_FW_RST_WK_SHIFT  31
+#define I40E_PFPM_WUS_FW_RST_WK_MASK   I40E_MASK(0x1, I40E_PFPM_WUS_FW_RST_WK_SHIFT)
+#define I40E_PRTPM_FHFHR                 0x0006C000 /* Reset: POR */
+#define I40E_PRTPM_FHFHR_UNICAST_SHIFT   0
+#define I40E_PRTPM_FHFHR_UNICAST_MASK    I40E_MASK(0x1, I40E_PRTPM_FHFHR_UNICAST_SHIFT)
+#define I40E_PRTPM_FHFHR_MULTICAST_SHIFT 1
+#define I40E_PRTPM_FHFHR_MULTICAST_MASK  I40E_MASK(0x1, I40E_PRTPM_FHFHR_MULTICAST_SHIFT)
+#define I40E_PRTPM_SAH(_i)             (0x001E44C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */
+#define I40E_PRTPM_SAH_MAX_INDEX       3
+#define I40E_PRTPM_SAH_PFPM_SAH_SHIFT  0
+#define I40E_PRTPM_SAH_PFPM_SAH_MASK   I40E_MASK(0xFFFF, I40E_PRTPM_SAH_PFPM_SAH_SHIFT)
+#define I40E_PRTPM_SAH_PF_NUM_SHIFT    26
+#define I40E_PRTPM_SAH_PF_NUM_MASK     I40E_MASK(0xF, I40E_PRTPM_SAH_PF_NUM_SHIFT)
+#define I40E_PRTPM_SAH_MC_MAG_EN_SHIFT 30
+#define I40E_PRTPM_SAH_MC_MAG_EN_MASK  I40E_MASK(0x1, I40E_PRTPM_SAH_MC_MAG_EN_SHIFT)
+#define I40E_PRTPM_SAH_AV_SHIFT        31
+#define I40E_PRTPM_SAH_AV_MASK         I40E_MASK(0x1, I40E_PRTPM_SAH_AV_SHIFT)
+#define I40E_PRTPM_SAL(_i)            (0x001E4440 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */
+#define I40E_PRTPM_SAL_MAX_INDEX      3
+#define I40E_PRTPM_SAL_PFPM_SAL_SHIFT 0
+#define I40E_PRTPM_SAL_PFPM_SAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_PRTPM_SAL_PFPM_SAL_SHIFT)
+#define I40E_VF_ARQBAH1              0x00006000 /* Reset: EMPR */
+#define I40E_VF_ARQBAH1_ARQBAH_SHIFT 0
+#define I40E_VF_ARQBAH1_ARQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAH1_ARQBAH_SHIFT)
+#define I40E_VF_ARQBAL1              0x00006C00 /* Reset: EMPR */
+#define I40E_VF_ARQBAL1_ARQBAL_SHIFT 0
+#define I40E_VF_ARQBAL1_ARQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL1_ARQBAL_SHIFT)
+#define I40E_VF_ARQH1            0x00007400 /* Reset: EMPR */
+#define I40E_VF_ARQH1_ARQH_SHIFT 0
+#define I40E_VF_ARQH1_ARQH_MASK  I40E_MASK(0x3FF, I40E_VF_ARQH1_ARQH_SHIFT)
+#define I40E_VF_ARQLEN1                 0x00008000 /* Reset: EMPR */
+#define I40E_VF_ARQLEN1_ARQLEN_SHIFT    0
+#define I40E_VF_ARQLEN1_ARQLEN_MASK     I40E_MASK(0x3FF, I40E_VF_ARQLEN1_ARQLEN_SHIFT)
+#define I40E_VF_ARQLEN1_ARQVFE_SHIFT    28
+#define I40E_VF_ARQLEN1_ARQVFE_MASK     I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQVFE_SHIFT)
+#define I40E_VF_ARQLEN1_ARQOVFL_SHIFT   29
+#define I40E_VF_ARQLEN1_ARQOVFL_MASK    I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQOVFL_SHIFT)
+#define I40E_VF_ARQLEN1_ARQCRIT_SHIFT   30
+#define I40E_VF_ARQLEN1_ARQCRIT_MASK    I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQCRIT_SHIFT)
+#define I40E_VF_ARQLEN1_ARQENABLE_SHIFT 31
+#define I40E_VF_ARQLEN1_ARQENABLE_MASK  I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQENABLE_SHIFT)
+#define I40E_VF_ARQT1            0x00007000 /* Reset: EMPR */
+#define I40E_VF_ARQT1_ARQT_SHIFT 0
+#define I40E_VF_ARQT1_ARQT_MASK  I40E_MASK(0x3FF, I40E_VF_ARQT1_ARQT_SHIFT)
+#define I40E_VF_ATQBAH1              0x00007800 /* Reset: EMPR */
+#define I40E_VF_ATQBAH1_ATQBAH_SHIFT 0
+#define I40E_VF_ATQBAH1_ATQBAH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH1_ATQBAH_SHIFT)
+#define I40E_VF_ATQBAL1              0x00007C00 /* Reset: EMPR */
+#define I40E_VF_ATQBAL1_ATQBAL_SHIFT 0
+#define I40E_VF_ATQBAL1_ATQBAL_MASK  I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAL1_ATQBAL_SHIFT)
+#define I40E_VF_ATQH1            0x00006400 /* Reset: EMPR */
+#define I40E_VF_ATQH1_ATQH_SHIFT 0
+#define I40E_VF_ATQH1_ATQH_MASK  I40E_MASK(0x3FF, I40E_VF_ATQH1_ATQH_SHIFT)
+#define I40E_VF_ATQLEN1                 0x00006800 /* Reset: EMPR */
+#define I40E_VF_ATQLEN1_ATQLEN_SHIFT    0
+#define I40E_VF_ATQLEN1_ATQLEN_MASK     I40E_MASK(0x3FF, I40E_VF_ATQLEN1_ATQLEN_SHIFT)
+#define I40E_VF_ATQLEN1_ATQVFE_SHIFT    28
+#define I40E_VF_ATQLEN1_ATQVFE_MASK     I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQVFE_SHIFT)
+#define I40E_VF_ATQLEN1_ATQOVFL_SHIFT   29
+#define I40E_VF_ATQLEN1_ATQOVFL_MASK    I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQOVFL_SHIFT)
+#define I40E_VF_ATQLEN1_ATQCRIT_SHIFT   30
+#define I40E_VF_ATQLEN1_ATQCRIT_MASK    I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQCRIT_SHIFT)
+#define I40E_VF_ATQLEN1_ATQENABLE_SHIFT 31
+#define I40E_VF_ATQLEN1_ATQENABLE_MASK  I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQENABLE_SHIFT)
+#define I40E_VF_ATQT1            0x00008400 /* Reset: EMPR */
+#define I40E_VF_ATQT1_ATQT_SHIFT 0
+#define I40E_VF_ATQT1_ATQT_MASK  I40E_MASK(0x3FF, I40E_VF_ATQT1_ATQT_SHIFT)
+#define I40E_VFGEN_RSTAT                 0x00008800 /* Reset: VFR */
+#define I40E_VFGEN_RSTAT_VFR_STATE_SHIFT 0
+#define I40E_VFGEN_RSTAT_VFR_STATE_MASK  I40E_MASK(0x3, I40E_VFGEN_RSTAT_VFR_STATE_SHIFT)
+#define I40E_VFINT_DYN_CTL01                       0x00005C00 /* Reset: VFR */
+#define I40E_VFINT_DYN_CTL01_INTENA_SHIFT          0
+#define I40E_VFINT_DYN_CTL01_INTENA_MASK           I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_SHIFT)
+#define I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT        1
+#define I40E_VFINT_DYN_CTL01_CLEARPBA_MASK         I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT)
+#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT      2
+#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT)
+#define I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT        3
+#define I40E_VFINT_DYN_CTL01_ITR_INDX_MASK         I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT        5
+#define I40E_VFINT_DYN_CTL01_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT)
+#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT     25
+#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT      31
+#define I40E_VFINT_DYN_CTL01_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT)
+#define I40E_VFINT_DYN_CTLN1(_INTVF)               (0x00003800 + ((_INTVF) * 4)) /* _INTVF=0...15 */ /* Reset: VFR */
+#define I40E_VFINT_DYN_CTLN1_MAX_INDEX             15
+#define I40E_VFINT_DYN_CTLN1_INTENA_SHIFT          0
+#define I40E_VFINT_DYN_CTLN1_INTENA_MASK           I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT        1
+#define I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK         I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT      2
+#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT        3
+#define I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK         I40E_MASK(0x3, I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT        5
+#define I40E_VFINT_DYN_CTLN1_INTERVAL_MASK         I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT 24
+#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK  I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT     25
+#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_MASK      I40E_MASK(0x3, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT      31
+#define I40E_VFINT_DYN_CTLN1_INTENA_MSK_MASK       I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT)
+#define I40E_VFINT_ICR0_ENA1                        0x00005000 /* Reset: CORER */
+#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT)
+#define I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT           30
+#define I40E_VFINT_ICR0_ENA1_ADMINQ_MASK            I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT)
+#define I40E_VFINT_ICR0_ENA1_RSVD_SHIFT             31
+#define I40E_VFINT_ICR0_ENA1_RSVD_MASK              I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_RSVD_SHIFT)
+#define I40E_VFINT_ICR01                        0x00004800 /* Reset: CORER */
+#define I40E_VFINT_ICR01_INTEVENT_SHIFT         0
+#define I40E_VFINT_ICR01_INTEVENT_MASK          I40E_MASK(0x1, I40E_VFINT_ICR01_INTEVENT_SHIFT)
+#define I40E_VFINT_ICR01_QUEUE_0_SHIFT          1
+#define I40E_VFINT_ICR01_QUEUE_0_MASK           I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_0_SHIFT)
+#define I40E_VFINT_ICR01_QUEUE_1_SHIFT          2
+#define I40E_VFINT_ICR01_QUEUE_1_MASK           I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_1_SHIFT)
+#define I40E_VFINT_ICR01_QUEUE_2_SHIFT          3
+#define I40E_VFINT_ICR01_QUEUE_2_MASK           I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_2_SHIFT)
+#define I40E_VFINT_ICR01_QUEUE_3_SHIFT          4
+#define I40E_VFINT_ICR01_QUEUE_3_MASK           I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_3_SHIFT)
+#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT 25
+#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_MASK  I40E_MASK(0x1, I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT)
+#define I40E_VFINT_ICR01_ADMINQ_SHIFT           30
+#define I40E_VFINT_ICR01_ADMINQ_MASK            I40E_MASK(0x1, I40E_VFINT_ICR01_ADMINQ_SHIFT)
+#define I40E_VFINT_ICR01_SWINT_SHIFT            31
+#define I40E_VFINT_ICR01_SWINT_MASK             I40E_MASK(0x1, I40E_VFINT_ICR01_SWINT_SHIFT)
+#define I40E_VFINT_ITR01(_i)            (0x00004C00 + ((_i) * 4)) /* _i=0...2 */ /* Reset: VFR */
+#define I40E_VFINT_ITR01_MAX_INDEX      2
+#define I40E_VFINT_ITR01_INTERVAL_SHIFT 0
+#define I40E_VFINT_ITR01_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_VFINT_ITR01_INTERVAL_SHIFT)
+#define I40E_VFINT_ITRN1(_i, _INTVF)     (0x00002800 + ((_i) * 64 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...15 */ /* Reset: VFR */
+#define I40E_VFINT_ITRN1_MAX_INDEX      2 /* matches the _i range (0...2); this group defines no separate bound macro for _INTVF */
+#define I40E_VFINT_ITRN1_INTERVAL_SHIFT 0
+#define I40E_VFINT_ITRN1_INTERVAL_MASK  I40E_MASK(0xFFF, I40E_VFINT_ITRN1_INTERVAL_SHIFT)
+#define I40E_VFINT_STAT_CTL01                      0x00005400 /* Reset: CORER */
+#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT 2
+#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_MASK  I40E_MASK(0x3, I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT)
+#define I40E_QRX_TAIL1(_Q)        (0x00002000 + ((_Q) * 4)) /* _Q=0...15 */ /* Reset: CORER */
+#define I40E_QRX_TAIL1_MAX_INDEX  15
+#define I40E_QRX_TAIL1_TAIL_SHIFT 0
+#define I40E_QRX_TAIL1_TAIL_MASK  I40E_MASK(0x1FFF, I40E_QRX_TAIL1_TAIL_SHIFT)
+#define I40E_QTX_TAIL1(_Q)        (0x00000000 + ((_Q) * 4)) /* _Q=0...15 */ /* Reset: PFR */
+#define I40E_QTX_TAIL1_MAX_INDEX  15
+#define I40E_QTX_TAIL1_TAIL_SHIFT 0
+#define I40E_QTX_TAIL1_TAIL_MASK  I40E_MASK(0x1FFF, I40E_QTX_TAIL1_TAIL_SHIFT)
+#define I40E_VFMSIX_PBA              0x00002000 /* Reset: VFLR */ /* NOTE(review): same numeric offset as I40E_QRX_TAIL1(0); presumably relative to a different BAR - confirm */
+#define I40E_VFMSIX_PBA_PENBIT_SHIFT 0
+#define I40E_VFMSIX_PBA_PENBIT_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA_PENBIT_SHIFT)
+#define I40E_VFMSIX_TADD(_i)              (0x00000000 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ /* NOTE(review): base offset equals I40E_QTX_TAIL1(0); presumably relative to a different BAR - confirm */
+#define I40E_VFMSIX_TADD_MAX_INDEX        16
+#define I40E_VFMSIX_TADD_MSIXTADD10_SHIFT 0
+#define I40E_VFMSIX_TADD_MSIXTADD10_MASK  I40E_MASK(0x3, I40E_VFMSIX_TADD_MSIXTADD10_SHIFT)
+#define I40E_VFMSIX_TADD_MSIXTADD_SHIFT   2
+#define I40E_VFMSIX_TADD_MSIXTADD_MASK    I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD_MSIXTADD_SHIFT)
+#define I40E_VFMSIX_TMSG(_i)            (0x00000008 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TMSG_MAX_INDEX      16
+#define I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT 0
+#define I40E_VFMSIX_TMSG_MSIXTMSG_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT)
+#define I40E_VFMSIX_TUADD(_i)             (0x00000004 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TUADD_MAX_INDEX       16
+#define I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT 0
+#define I40E_VFMSIX_TUADD_MSIXTUADD_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT)
+#define I40E_VFMSIX_TVCTRL(_i)        (0x0000000C + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */
+#define I40E_VFMSIX_TVCTRL_MAX_INDEX  16
+#define I40E_VFMSIX_TVCTRL_MASK_SHIFT 0
+#define I40E_VFMSIX_TVCTRL_MASK_MASK  I40E_MASK(0x1, I40E_VFMSIX_TVCTRL_MASK_SHIFT)
+#define I40E_VFCM_PE_ERRDATA                  0x0000DC00 /* Reset: VFR */
+#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0
+#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_MASK  I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT)
+#define I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT     4
+#define I40E_VFCM_PE_ERRDATA_Q_TYPE_MASK      I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT)
+#define I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT      8
+#define I40E_VFCM_PE_ERRDATA_Q_NUM_MASK       I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT)
+#define I40E_VFCM_PE_ERRINFO                     0x0000D800 /* Reset: VFR */
+#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT   0
+#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_MASK    I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT)
+#define I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT    4
+#define I40E_VFCM_PE_ERRINFO_ERROR_INST_MASK     I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT)
+#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8
+#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT)
+#define I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16
+#define I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT)
+#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24
+#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT)
+#define I40E_VFQF_HENA(_i)             (0x0000C400 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */
+#define I40E_VFQF_HENA_MAX_INDEX       1
+#define I40E_VFQF_HENA_PTYPE_ENA_SHIFT 0
+#define I40E_VFQF_HENA_PTYPE_ENA_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA_PTYPE_ENA_SHIFT)
+#define I40E_VFQF_HKEY(_i)         (0x0000CC00 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */
+#define I40E_VFQF_HKEY_MAX_INDEX   12
+#define I40E_VFQF_HKEY_KEY_0_SHIFT 0
+#define I40E_VFQF_HKEY_KEY_0_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_0_SHIFT)
+#define I40E_VFQF_HKEY_KEY_1_SHIFT 8
+#define I40E_VFQF_HKEY_KEY_1_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_1_SHIFT)
+#define I40E_VFQF_HKEY_KEY_2_SHIFT 16
+#define I40E_VFQF_HKEY_KEY_2_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_2_SHIFT)
+#define I40E_VFQF_HKEY_KEY_3_SHIFT 24
+#define I40E_VFQF_HKEY_KEY_3_MASK  I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_3_SHIFT)
+#define I40E_VFQF_HLUT(_i)        (0x0000D000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_VFQF_HLUT_MAX_INDEX  15
+#define I40E_VFQF_HLUT_LUT0_SHIFT 0
+#define I40E_VFQF_HLUT_LUT0_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT_LUT0_SHIFT)
+#define I40E_VFQF_HLUT_LUT1_SHIFT 8
+#define I40E_VFQF_HLUT_LUT1_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT_LUT1_SHIFT)
+#define I40E_VFQF_HLUT_LUT2_SHIFT 16
+#define I40E_VFQF_HLUT_LUT2_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT_LUT2_SHIFT)
+#define I40E_VFQF_HLUT_LUT3_SHIFT 24
+#define I40E_VFQF_HLUT_LUT3_MASK  I40E_MASK(0xF, I40E_VFQF_HLUT_LUT3_SHIFT)
+#define I40E_VFQF_HREGION(_i)                  (0x0000D400 + ((_i) * 4)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_VFQF_HREGION_MAX_INDEX            7
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT)
+#define I40E_VFQF_HREGION_REGION_0_SHIFT       1
+#define I40E_VFQF_HREGION_REGION_0_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_0_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT)
+#define I40E_VFQF_HREGION_REGION_1_SHIFT       5
+#define I40E_VFQF_HREGION_REGION_1_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_1_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT)
+#define I40E_VFQF_HREGION_REGION_2_SHIFT       9
+#define I40E_VFQF_HREGION_REGION_2_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_2_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT)
+#define I40E_VFQF_HREGION_REGION_3_SHIFT       13
+#define I40E_VFQF_HREGION_REGION_3_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_3_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT)
+#define I40E_VFQF_HREGION_REGION_4_SHIFT       17
+#define I40E_VFQF_HREGION_REGION_4_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_4_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT)
+#define I40E_VFQF_HREGION_REGION_5_SHIFT       21
+#define I40E_VFQF_HREGION_REGION_5_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_5_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT)
+#define I40E_VFQF_HREGION_REGION_6_SHIFT       25
+#define I40E_VFQF_HREGION_REGION_6_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_6_SHIFT)
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28
+#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_MASK  I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT)
+#define I40E_VFQF_HREGION_REGION_7_SHIFT       29
+#define I40E_VFQF_HREGION_REGION_7_MASK        I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_7_SHIFT)
+#ifdef X722_SUPPORT
+
+#define I40E_MNGSB_FDCRC               0x000B7050 /* Reset: POR */
+#define I40E_MNGSB_FDCRC_CRC_RES_SHIFT 0
+#define I40E_MNGSB_FDCRC_CRC_RES_MASK  I40E_MASK(0xFF, I40E_MNGSB_FDCRC_CRC_RES_SHIFT)
+#define I40E_MNGSB_FDCS                   0x000B7040 /* Reset: POR */
+#define I40E_MNGSB_FDCS_CRC_CONT_SHIFT    2
+#define I40E_MNGSB_FDCS_CRC_CONT_MASK     I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_CONT_SHIFT)
+#define I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT 3
+#define I40E_MNGSB_FDCS_CRC_SEED_EN_MASK  I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT)
+#define I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT  4
+#define I40E_MNGSB_FDCS_CRC_WR_INH_MASK   I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT)
+#define I40E_MNGSB_FDCS_CRC_SEED_SHIFT    8
+#define I40E_MNGSB_FDCS_CRC_SEED_MASK     I40E_MASK(0xFF, I40E_MNGSB_FDCS_CRC_SEED_SHIFT)
+#define I40E_MNGSB_FDS                0x000B7048 /* Reset: POR */
+#define I40E_MNGSB_FDS_START_BC_SHIFT 0
+#define I40E_MNGSB_FDS_START_BC_MASK  I40E_MASK(0xFFF, I40E_MNGSB_FDS_START_BC_SHIFT)
+#define I40E_MNGSB_FDS_LAST_BC_SHIFT  16
+#define I40E_MNGSB_FDS_LAST_BC_MASK   I40E_MASK(0xFFF, I40E_MNGSB_FDS_LAST_BC_SHIFT)
+
+#define I40E_GL_VF_CTRL_RX(_VF)           (0x00083600 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_GL_VF_CTRL_RX_MAX_INDEX      127
+#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT 0
+#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_MASK  I40E_MASK(0x1, I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT)
+#define I40E_GL_VF_CTRL_TX(_VF)           (0x00083400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: EMPR */
+#define I40E_GL_VF_CTRL_TX_MAX_INDEX      127
+#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT 0
+#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_MASK  I40E_MASK(0x1, I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT)
+
+#define I40E_GLCM_LAN_CACHESIZE                 0x0010C4D8 /* Reset: CORER */
+#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFFF, I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT      12
+#define I40E_GLCM_LAN_CACHESIZE_SETS_MASK       I40E_MASK(0xF, I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT)
+#define I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT      16
+#define I40E_GLCM_LAN_CACHESIZE_WAYS_MASK       I40E_MASK(0x3FF, I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT)
+#define I40E_GLCM_PE_CACHESIZE                 0x00138FE4 /* Reset: CORER */
+#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFFF, I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLCM_PE_CACHESIZE_SETS_SHIFT      12
+#define I40E_GLCM_PE_CACHESIZE_SETS_MASK       I40E_MASK(0xF, I40E_GLCM_PE_CACHESIZE_SETS_SHIFT)
+#define I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT      16
+#define I40E_GLCM_PE_CACHESIZE_WAYS_MASK       I40E_MASK(0x1FF, I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT)
+#define I40E_PFCM_PE_ERRDATA                  0x00138D00 /* Reset: PFR */
+#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0
+#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_MASK  I40E_MASK(0xF, I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT)
+#define I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT     4
+#define I40E_PFCM_PE_ERRDATA_Q_TYPE_MASK      I40E_MASK(0x7, I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT)
+#define I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT      8
+#define I40E_PFCM_PE_ERRDATA_Q_NUM_MASK       I40E_MASK(0x3FFFF, I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT)
+#define I40E_PFCM_PE_ERRINFO                     0x00138C80 /* Reset: PFR */
+#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT   0
+#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_MASK    I40E_MASK(0x1, I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT)
+#define I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT    4
+#define I40E_PFCM_PE_ERRINFO_ERROR_INST_MASK     I40E_MASK(0x7, I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT)
+#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8
+#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT)
+#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16
+#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT)
+#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24
+#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK  I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT)
+
+#define I40E_PRTDCB_TFMSTC(_i)        (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */
+#define I40E_PRTDCB_TFMSTC_MAX_INDEX  7
+#define I40E_PRTDCB_TFMSTC_MSTC_SHIFT 0
+#define I40E_PRTDCB_TFMSTC_MSTC_MASK  I40E_MASK(0xFFFFF, I40E_PRTDCB_TFMSTC_MSTC_SHIFT)
+#define I40E_GL_FWSTS_FWROWD_SHIFT 8 /* NOTE(review): no I40E_GL_FWSTS offset is defined next to these field macros; presumably it is defined elsewhere in this header - confirm */
+#define I40E_GL_FWSTS_FWROWD_MASK  I40E_MASK(0x1, I40E_GL_FWSTS_FWROWD_SHIFT)
+#define I40E_GLFOC_CACHESIZE                 0x000AA0DC /* Reset: CORER */
+#define I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLFOC_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFF, I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLFOC_CACHESIZE_SETS_SHIFT      8
+#define I40E_GLFOC_CACHESIZE_SETS_MASK       I40E_MASK(0xFFF, I40E_GLFOC_CACHESIZE_SETS_SHIFT)
+#define I40E_GLFOC_CACHESIZE_WAYS_SHIFT      20
+#define I40E_GLFOC_CACHESIZE_WAYS_MASK       I40E_MASK(0xF, I40E_GLFOC_CACHESIZE_WAYS_SHIFT)
+#define I40E_GLHMC_APBVTINUSEBASE(_i)                   (0x000C4a00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_APBVTINUSEBASE_MAX_INDEX             15
+#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0
+#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT)
+#define I40E_GLHMC_CEQPART(_i)             (0x001312C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_CEQPART_MAX_INDEX       15
+#define I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT 0
+#define I40E_GLHMC_CEQPART_PMCEQBASE_MASK  I40E_MASK(0xFF, I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT)
+#define I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT 16
+#define I40E_GLHMC_CEQPART_PMCEQSIZE_MASK  I40E_MASK(0x1FF, I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT)
+#define I40E_GLHMC_DBCQMAX                     0x000C20F0 /* Reset: CORER */
+#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT 0
+#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_MASK  I40E_MASK(0x3FFFF, I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT)
+#define I40E_GLHMC_DBCQPART(_i)              (0x00131240 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_DBCQPART_MAX_INDEX        15
+#define I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT 0
+#define I40E_GLHMC_DBCQPART_PMDBCQBASE_MASK  I40E_MASK(0x3FFF, I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT)
+#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT 16
+#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_MASK  I40E_MASK(0x7FFF, I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT)
+#define I40E_GLHMC_DBQPMAX                     0x000C20EC /* Reset: CORER */
+#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT 0
+#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_MASK  I40E_MASK(0x7FFFF, I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT)
+#define I40E_GLHMC_DBQPPART(_i)              (0x00138D80 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_DBQPPART_MAX_INDEX        15
+#define I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT 0
+#define I40E_GLHMC_DBQPPART_PMDBQPBASE_MASK  I40E_MASK(0x3FFF, I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT)
+#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT 16
+#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_MASK  I40E_MASK(0x7FFF, I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT)
+#define I40E_GLHMC_PEARPBASE(_i)                (0x000C4800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ /* PE object group: ...BASE(_i)/...CNT(_i) are indexed 0...15 at a 4-byte stride; ...MAX, ...OBJSZ and PEPFFIRSTSD are unindexed */
+#define I40E_GLHMC_PEARPBASE_MAX_INDEX          15
+#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT 0
+#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT)
+#define I40E_GLHMC_PEARPCNT(_i)               (0x000C4900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEARPCNT_MAX_INDEX         15
+#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT 0
+#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT)
+#define I40E_GLHMC_PEARPMAX                  0x000C2038 /* Reset: CORER */
+#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT 0
+#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_MASK  I40E_MASK(0x1FFFF, I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT)
+#define I40E_GLHMC_PEARPOBJSZ                    0x000C2034 /* Reset: CORER */
+#define I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT 0
+#define I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_MASK  I40E_MASK(0x7, I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT)
+#define I40E_GLHMC_PECQBASE(_i)               (0x000C4200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PECQBASE_MAX_INDEX         15
+#define I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT 0
+#define I40E_GLHMC_PECQBASE_FPMPECQBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT)
+#define I40E_GLHMC_PECQCNT(_i)              (0x000C4300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PECQCNT_MAX_INDEX        15
+#define I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT 0
+#define I40E_GLHMC_PECQCNT_FPMPECQCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT)
+#define I40E_GLHMC_PECQOBJSZ                   0x000C2020 /* Reset: CORER */
+#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT 0
+#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT)
+#define I40E_GLHMC_PEHTCNT(_i)              (0x000C4700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEHTCNT_MAX_INDEX        15
+#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT 0
+#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT)
+#define I40E_GLHMC_PEHTEBASE(_i)                (0x000C4600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEHTEBASE_MAX_INDEX          15
+#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT 0
+#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT)
+#define I40E_GLHMC_PEHTEOBJSZ                    0x000C202c /* Reset: CORER */
+#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT 0
+#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT)
+#define I40E_GLHMC_PEHTMAX                 0x000C2030 /* Reset: CORER */
+#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT 0
+#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_MASK  I40E_MASK(0x1FFFFF, I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT)
+#define I40E_GLHMC_PEMRBASE(_i)               (0x000C4c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEMRBASE_MAX_INDEX         15
+#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT 0
+#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT)
+#define I40E_GLHMC_PEMRCNT(_i)             (0x000C4d00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEMRCNT_MAX_INDEX       15
+#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT 0 /* note: this field is named FPMPEMRSZ, not FPMPEMRCNT as the other *CNT registers would suggest */
+#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT)
+#define I40E_GLHMC_PEMRMAX                 0x000C2040 /* Reset: CORER */
+#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT 0
+#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_MASK  I40E_MASK(0x7FFFFF, I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT)
+#define I40E_GLHMC_PEMROBJSZ                   0x000C203c /* Reset: CORER */
+#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT 0
+#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT)
+#define I40E_GLHMC_PEPBLBASE(_i)                (0x000C5800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEPBLBASE_MAX_INDEX          15
+#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT 0
+#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT)
+#define I40E_GLHMC_PEPBLCNT(_i)               (0x000C5900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEPBLCNT_MAX_INDEX         15
+#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT 0
+#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT)
+#define I40E_GLHMC_PEPBLMAX                  0x000C206c /* Reset: CORER */
+#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT 0
+#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT)
+#define I40E_GLHMC_PEPFFIRSTSD                         0x000C20E4 /* Reset: CORER */
+#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT 0
+#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_MASK  I40E_MASK(0xFFF, I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT)
+#define I40E_GLHMC_PEQ1BASE(_i)               (0x000C5200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ /* Q1, QP and SRQ object group: ...BASE/...CNT/...FLBASE are indexed 0...15 at a 4-byte stride; ...MAX and ...OBJSZ are unindexed */
+#define I40E_GLHMC_PEQ1BASE_MAX_INDEX         15
+#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT 0
+#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT)
+#define I40E_GLHMC_PEQ1CNT(_i)              (0x000C5300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEQ1CNT_MAX_INDEX        15
+#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT 0
+#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT)
+#define I40E_GLHMC_PEQ1FLBASE(_i)                 (0x000C5400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEQ1FLBASE_MAX_INDEX           15
+#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0
+#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT)
+#define I40E_GLHMC_PEQ1FLMAX                   0x000C2058 /* Reset: CORER */
+#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT 0
+#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_MASK  I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT)
+#define I40E_GLHMC_PEQ1MAX                 0x000C2054 /* Reset: CORER */
+#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT 0
+#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_MASK  I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT)
+#define I40E_GLHMC_PEQ1OBJSZ                   0x000C2050 /* Reset: CORER */
+#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT 0
+#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT)
+#define I40E_GLHMC_PEQPBASE(_i)               (0x000C4000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEQPBASE_MAX_INDEX         15
+#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT 0
+#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT)
+#define I40E_GLHMC_PEQPCNT(_i)              (0x000C4100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEQPCNT_MAX_INDEX        15
+#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT 0
+#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT)
+#define I40E_GLHMC_PEQPOBJSZ                   0x000C201c /* Reset: CORER */
+#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT 0
+#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT)
+#define I40E_GLHMC_PESRQBASE(_i)                (0x000C4400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PESRQBASE_MAX_INDEX          15
+#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT 0
+#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT)
+#define I40E_GLHMC_PESRQCNT(_i)               (0x000C4500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PESRQCNT_MAX_INDEX         15
+#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT 0
+#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT)
+#define I40E_GLHMC_PESRQMAX                  0x000C2028 /* Reset: CORER */
+#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT 0
+#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_MASK  I40E_MASK(0xFFFF, I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT)
+#define I40E_GLHMC_PESRQOBJSZ                    0x000C2024 /* Reset: CORER */
+#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT 0
+#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT)
+#define I40E_GLHMC_PETIMERBASE(_i)                  (0x000C5A00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ /* TIMER and XF object group, followed by the PFPESDPART partition register */
+#define I40E_GLHMC_PETIMERBASE_MAX_INDEX            15
+#define I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT 0
+#define I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT)
+#define I40E_GLHMC_PETIMERCNT(_i)                 (0x000C5B00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PETIMERCNT_MAX_INDEX           15
+#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT 0
+#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT)
+#define I40E_GLHMC_PETIMERMAX                    0x000C2084 /* Reset: CORER */
+#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT 0
+#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT)
+#define I40E_GLHMC_PETIMEROBJSZ                      0x000C2080 /* Reset: CORER */
+#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT 0
+#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT)
+#define I40E_GLHMC_PEXFBASE(_i)               (0x000C4e00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEXFBASE_MAX_INDEX         15
+#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT 0
+#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT)
+#define I40E_GLHMC_PEXFCNT(_i)              (0x000C4f00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEXFCNT_MAX_INDEX        15
+#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT 0
+#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT)
+#define I40E_GLHMC_PEXFFLBASE(_i)                 (0x000C5000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLHMC_PEXFFLBASE_MAX_INDEX           15
+#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT 0
+#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT)
+#define I40E_GLHMC_PEXFFLMAX                   0x000C204c /* Reset: CORER */
+#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT 0
+#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_MASK  I40E_MASK(0x1FFFFFF, I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT)
+#define I40E_GLHMC_PEXFMAX                 0x000C2048 /* Reset: CORER */
+#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT 0
+#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_MASK  I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT)
+#define I40E_GLHMC_PEXFOBJSZ                   0x000C2044 /* Reset: CORER */
+#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT 0
+#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_MASK  I40E_MASK(0xF, I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT)
+#define I40E_GLHMC_PFPESDPART(_i)            (0x000C0880 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ /* base field at shift 0, size field at shift 16 */
+#define I40E_GLHMC_PFPESDPART_MAX_INDEX      15
+#define I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT 0
+#define I40E_GLHMC_PFPESDPART_PMSDBASE_MASK  I40E_MASK(0xFFF, I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT)
+#define I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT 16
+#define I40E_GLHMC_PFPESDPART_PMSDSIZE_MASK  I40E_MASK(0x1FFF, I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT)
+#define I40E_GLHMC_VFAPBVTINUSEBASE(_i)                   (0x000Cca00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ /* VF-prefixed GLHMC group: every register through VFPDINV is indexed _i=0...31 at a 4-byte stride */
+#define I40E_GLHMC_VFAPBVTINUSEBASE_MAX_INDEX             31
+#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0
+#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT)
+#define I40E_GLHMC_VFCEQPART(_i)             (0x00132240 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFCEQPART_MAX_INDEX       31
+#define I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT 0
+#define I40E_GLHMC_VFCEQPART_PMCEQBASE_MASK  I40E_MASK(0xFF, I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT)
+#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT 16
+#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_MASK  I40E_MASK(0x1FF, I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT)
+#define I40E_GLHMC_VFDBCQPART(_i)              (0x00132140 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFDBCQPART_MAX_INDEX        31
+#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT 0
+#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_MASK  I40E_MASK(0x3FFF, I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT)
+#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT 16
+#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_MASK  I40E_MASK(0x7FFF, I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT)
+#define I40E_GLHMC_VFDBQPPART(_i)              (0x00138E00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFDBQPPART_MAX_INDEX        31
+#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT 0
+#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_MASK  I40E_MASK(0x3FFF, I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT)
+#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT 16
+#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_MASK  I40E_MASK(0x7FFF, I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT)
+#define I40E_GLHMC_VFFSIAVBASE(_i)                (0x000Cd600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFFSIAVBASE_MAX_INDEX          31
+#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT 0
+#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT)
+#define I40E_GLHMC_VFFSIAVCNT(_i)               (0x000Cd700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFFSIAVCNT_MAX_INDEX         31
+#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT 0
+#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT)
+#define I40E_GLHMC_VFPDINV(_i)               (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ /* three fields: PMSDIDX at shift 0, PMSDPARTSEL at shift 15, PMPDIDX at shift 16 */
+#define I40E_GLHMC_VFPDINV_MAX_INDEX         31
+#define I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT     0
+#define I40E_GLHMC_VFPDINV_PMSDIDX_MASK      I40E_MASK(0xFFF, I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT)
+#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT 15 /* NOTE(review): if I40E_MASK(m, s) is m << s, bits 12-14 carry no named field -- confirm */
+#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_MASK  I40E_MASK(0x1, I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT)
+#define I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT     16
+#define I40E_GLHMC_VFPDINV_PMPDIDX_MASK      I40E_MASK(0x1FF, I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT)
+#define I40E_GLHMC_VFPEARPBASE(_i)                (0x000Cc800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ /* VF-prefixed PE object BASE/CNT group: all indexed _i=0...31 at a 4-byte stride, one field at shift 0 each */
+#define I40E_GLHMC_VFPEARPBASE_MAX_INDEX          31
+#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT 0
+#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT)
+#define I40E_GLHMC_VFPEARPCNT(_i)               (0x000Cc900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEARPCNT_MAX_INDEX         31
+#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT 0
+#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT)
+#define I40E_GLHMC_VFPECQBASE(_i)               (0x000Cc200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPECQBASE_MAX_INDEX         31
+#define I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT 0
+#define I40E_GLHMC_VFPECQBASE_FPMPECQBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT)
+#define I40E_GLHMC_VFPECQCNT(_i)              (0x000Cc300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPECQCNT_MAX_INDEX        31
+#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT 0
+#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT)
+#define I40E_GLHMC_VFPEHTCNT(_i)              (0x000Cc700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEHTCNT_MAX_INDEX        31
+#define I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT 0
+#define I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT)
+#define I40E_GLHMC_VFPEHTEBASE(_i)                (0x000Cc600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEHTEBASE_MAX_INDEX          31
+#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT 0
+#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT)
+#define I40E_GLHMC_VFPEMRBASE(_i)               (0x000Ccc00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEMRBASE_MAX_INDEX         31
+#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT 0
+#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT)
+#define I40E_GLHMC_VFPEMRCNT(_i)             (0x000Ccd00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEMRCNT_MAX_INDEX       31
+#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT 0 /* note: this field is named FPMPEMRSZ, not FPMPEMRCNT as the other *CNT registers would suggest */
+#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT)
+#define I40E_GLHMC_VFPEPBLBASE(_i)                (0x000Cd800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEPBLBASE_MAX_INDEX          31
+#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT 0
+#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT)
+#define I40E_GLHMC_VFPEPBLCNT(_i)               (0x000Cd900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEPBLCNT_MAX_INDEX         31
+#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT 0
+#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT)
+#define I40E_GLHMC_VFPEQ1BASE(_i)               (0x000Cd200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEQ1BASE_MAX_INDEX         31
+#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT 0
+#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT)
+#define I40E_GLHMC_VFPEQ1CNT(_i)              (0x000Cd300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEQ1CNT_MAX_INDEX        31
+#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT 0
+#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT)
+#define I40E_GLHMC_VFPEQ1FLBASE(_i)                 (0x000Cd400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEQ1FLBASE_MAX_INDEX           31
+#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0
+#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT)
+#define I40E_GLHMC_VFPEQPBASE(_i)               (0x000Cc000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ /* VF-prefixed QP/SRQ/TIMER/XF BASE/CNT registers and VFSDPART: all indexed _i=0...31 at a 4-byte stride */
+#define I40E_GLHMC_VFPEQPBASE_MAX_INDEX         31
+#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT 0
+#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT)
+#define I40E_GLHMC_VFPEQPCNT(_i)              (0x000Cc100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEQPCNT_MAX_INDEX        31
+#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT 0
+#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT)
+#define I40E_GLHMC_VFPESRQBASE(_i)                (0x000Cc400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPESRQBASE_MAX_INDEX          31
+#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT 0
+#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT)
+#define I40E_GLHMC_VFPESRQCNT(_i)               (0x000Cc500 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPESRQCNT_MAX_INDEX         31
+#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT 0
+#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT)
+#define I40E_GLHMC_VFPETIMERBASE(_i)                  (0x000CDA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPETIMERBASE_MAX_INDEX            31
+#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT 0
+#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT)
+#define I40E_GLHMC_VFPETIMERCNT(_i)                 (0x000CDB00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPETIMERCNT_MAX_INDEX           31
+#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT 0
+#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT)
+#define I40E_GLHMC_VFPEXFBASE(_i)               (0x000Cce00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEXFBASE_MAX_INDEX         31
+#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT 0
+#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT)
+#define I40E_GLHMC_VFPEXFCNT(_i)              (0x000Ccf00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEXFCNT_MAX_INDEX        31
+#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT 0
+#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_MASK  I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT)
+#define I40E_GLHMC_VFPEXFFLBASE(_i)                 (0x000Cd000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLHMC_VFPEXFFLBASE_MAX_INDEX           31
+#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT 0
+#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_MASK  I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT)
+#define I40E_GLHMC_VFSDPART(_i)            (0x000C8800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ /* base field at shift 0, size field at shift 16 */
+#define I40E_GLHMC_VFSDPART_MAX_INDEX      31
+#define I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT 0
+#define I40E_GLHMC_VFSDPART_PMSDBASE_MASK  I40E_MASK(0xFFF, I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT)
+#define I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT 16
+#define I40E_GLHMC_VFSDPART_PMSDSIZE_MASK  I40E_MASK(0x1FFF, I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT)
+#define I40E_GLPBLOC_CACHESIZE                 0x000A80BC /* Reset: CORER */ /* GLPBLOC, GLPDOC and GLPEOC CACHESIZE share one layout: WORD_SIZE at shift 0, SETS at shift 8, WAYS at shift 20 */
+#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFF, I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLPBLOC_CACHESIZE_SETS_SHIFT      8
+#define I40E_GLPBLOC_CACHESIZE_SETS_MASK       I40E_MASK(0xFFF, I40E_GLPBLOC_CACHESIZE_SETS_SHIFT)
+#define I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT      20
+#define I40E_GLPBLOC_CACHESIZE_WAYS_MASK       I40E_MASK(0xF, I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT)
+#define I40E_GLPDOC_CACHESIZE                 0x000D0088 /* Reset: CORER */
+#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFF, I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLPDOC_CACHESIZE_SETS_SHIFT      8
+#define I40E_GLPDOC_CACHESIZE_SETS_MASK       I40E_MASK(0xFFF, I40E_GLPDOC_CACHESIZE_SETS_SHIFT)
+#define I40E_GLPDOC_CACHESIZE_WAYS_SHIFT      20
+#define I40E_GLPDOC_CACHESIZE_WAYS_MASK       I40E_MASK(0xF, I40E_GLPDOC_CACHESIZE_WAYS_SHIFT)
+#define I40E_GLPEOC_CACHESIZE                 0x000A60E8 /* Reset: CORER */
+#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT 0
+#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_MASK  I40E_MASK(0xFF, I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT)
+#define I40E_GLPEOC_CACHESIZE_SETS_SHIFT      8
+#define I40E_GLPEOC_CACHESIZE_SETS_MASK       I40E_MASK(0xFFF, I40E_GLPEOC_CACHESIZE_SETS_SHIFT)
+#define I40E_GLPEOC_CACHESIZE_WAYS_SHIFT      20
+#define I40E_GLPEOC_CACHESIZE_WAYS_MASK       I40E_MASK(0xF, I40E_GLPEOC_CACHESIZE_WAYS_SHIFT)
+#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15 /* field-only definition: no I40E_PFHMC_PDINV offset appears in this group; presumably defined elsewhere in the header */
+#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK  I40E_MASK(0x1, I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT)
+#define I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT 15 /* field-only definition: no I40E_PFHMC_SDCMD offset appears in this group; presumably defined elsewhere in the header */
+#define I40E_PFHMC_SDCMD_PMSDPARTSEL_MASK  I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT)
+#define I40E_GL_PPRS_SPARE                     0x000856E0 /* Reset: CORER */ /* the three *_SPARE registers in this group each define one field with mask 0xFFFFFFFF at shift 0 */
+#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT 0
+#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT)
+#define I40E_GL_TLAN_SPARE                     0x000E64E0 /* Reset: CORER */
+#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT 0
+#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT)
+#define I40E_GL_TUPM_SPARE                     0x000a2230 /* Reset: CORER */
+#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT 0
+#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG                                 0x000B81C0 /* Reset: POR */ /* fifteen fields, each with mask 0x1, at consecutive shifts 0...14 */
+#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT     0
+#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_MASK      I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT       1
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_MASK        I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT             2
+#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_MASK              I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT  3
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_MASK   I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT             4
+#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_MASK              I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT 5
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_MASK  I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT 6
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_MASK  I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT   7
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_MASK    I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT 8
+#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_MASK  I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT       9
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_MASK        I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT    10
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_MASK     I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT    11
+#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_MASK     I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT    12 /* NOTE(review): "PRIMERY" is presumably a misspelling of PRIMARY, but it is part of the identifier and is left unchanged */
+#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_MASK     I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT           13
+#define I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_MASK            I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT)
+#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT       14
+#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_MASK        I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT)
+#define I40E_GLGEN_MISC_SPARE                        0x000880E0 /* Reset: POR */ /* one field with mask 0xFFFFFFFF at shift 0 */
+#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT 0
+#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT)
+#define I40E_GL_UFUSE_SOC                   0x000BE550 /* Reset: POR */ /* fields: PORT_MODE at shift 0, NIC_ID at shift 2, SPARE_FUSES at shift 3 */
+#define I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT   0
+#define I40E_GL_UFUSE_SOC_PORT_MODE_MASK    I40E_MASK(0x3, I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT)
+#define I40E_GL_UFUSE_SOC_NIC_ID_SHIFT      2
+#define I40E_GL_UFUSE_SOC_NIC_ID_MASK       I40E_MASK(0x1, I40E_GL_UFUSE_SOC_NIC_ID_SHIFT)
+#define I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT 3
+#define I40E_GL_UFUSE_SOC_SPARE_FUSES_MASK  I40E_MASK(0x1FFF, I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT)
+#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT       30 /* WB_ON_ITR sits at shift 30 with mask 0x1 in all four DYN_CTL variants below; only the field is defined here, not the register offsets */
+#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT)
+#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT       30
+#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT)
+#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT       30
+#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT)
+#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT       30
+#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT)
+#define I40E_VPLAN_QBASE(_VF)               (0x00074800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VPLAN_QBASE_MAX_INDEX          127
+#define I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT     0
+#define I40E_VPLAN_QBASE_VFFIRSTQ_MASK      I40E_MASK(0x7FF, I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT)
+#define I40E_VPLAN_QBASE_VFNUMQ_SHIFT       11
+#define I40E_VPLAN_QBASE_VFNUMQ_MASK        I40E_MASK(0xFF, I40E_VPLAN_QBASE_VFNUMQ_SHIFT)
+#define I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT 31
+#define I40E_VPLAN_QBASE_VFQTABLE_ENA_MASK  I40E_MASK(0x1, I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT)
+#define I40E_PRTMAC_LINK_DOWN_COUNTER                         0x001E2440 /* Reset: GLOBR */ /* one field with mask 0xFFFF at shift 0 */
+#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT 0
+#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_MASK  I40E_MASK(0xFFFF, I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT)
+#define I40E_GLNVM_AL_REQ                        0x000B6164 /* Reset: POR */ /* six fields, each with mask 0x1, at consecutive shifts 0...5 */
+#define I40E_GLNVM_AL_REQ_POR_SHIFT              0
+#define I40E_GLNVM_AL_REQ_POR_MASK               I40E_MASK(0x1, I40E_GLNVM_AL_REQ_POR_SHIFT)
+#define I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT        1
+#define I40E_GLNVM_AL_REQ_PCIE_IMIB_MASK         I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT)
+#define I40E_GLNVM_AL_REQ_GLOBR_SHIFT            2
+#define I40E_GLNVM_AL_REQ_GLOBR_MASK             I40E_MASK(0x1, I40E_GLNVM_AL_REQ_GLOBR_SHIFT)
+#define I40E_GLNVM_AL_REQ_CORER_SHIFT            3
+#define I40E_GLNVM_AL_REQ_CORER_MASK             I40E_MASK(0x1, I40E_GLNVM_AL_REQ_CORER_SHIFT)
+#define I40E_GLNVM_AL_REQ_PE_SHIFT               4
+#define I40E_GLNVM_AL_REQ_PE_MASK                I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PE_SHIFT)
+#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT 5
+#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_MASK  I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT)
+#define I40E_GLNVM_ALTIMERS                   0x000B6140 /* Reset: POR */ /* two fields: PCI_ALTIMER at shift 0, GEN_ALTIMER at shift 12 */
+#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT 0
+#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_MASK  I40E_MASK(0xFFF, I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT)
+#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT 12
+#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_MASK  I40E_MASK(0xFFFFF, I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT)
+#define I40E_GLNVM_FLA              0x000B6108 /* Reset: POR */ /* only the LOCKED field (mask 0x1, shift 6) is defined in this group */
+#define I40E_GLNVM_FLA_LOCKED_SHIFT 6
+#define I40E_GLNVM_FLA_LOCKED_MASK  I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT)
+
+#define I40E_GLNVM_ULD                    0x000B6008 /* Reset: POR */ /* eight *_DONE fields, each with mask 0x1; shifts are not contiguous (0, 1, 3, 4, 5, 8, 9, 10) */
+#define I40E_GLNVM_ULD_PCIER_DONE_SHIFT   0
+#define I40E_GLNVM_ULD_PCIER_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_SHIFT)
+#define I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT 1
+#define I40E_GLNVM_ULD_PCIER_DONE_1_MASK  I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT)
+#define I40E_GLNVM_ULD_CORER_DONE_SHIFT   3 /* no field is defined at shift 2 in this group */
+#define I40E_GLNVM_ULD_CORER_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_CORER_DONE_SHIFT)
+#define I40E_GLNVM_ULD_GLOBR_DONE_SHIFT   4
+#define I40E_GLNVM_ULD_GLOBR_DONE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_GLOBR_DONE_SHIFT)
+#define I40E_GLNVM_ULD_POR_DONE_SHIFT     5
+#define I40E_GLNVM_ULD_POR_DONE_MASK      I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_SHIFT)
+#define I40E_GLNVM_ULD_POR_DONE_1_SHIFT   8 /* no fields are defined at shifts 6 and 7 in this group */
+#define I40E_GLNVM_ULD_POR_DONE_1_MASK    I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_1_SHIFT)
+#define I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT 9
+#define I40E_GLNVM_ULD_PCIER_DONE_2_MASK  I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT)
+#define I40E_GLNVM_ULD_PE_DONE_SHIFT      10
+#define I40E_GLNVM_ULD_PE_DONE_MASK       I40E_MASK(0x1, I40E_GLNVM_ULD_PE_DONE_SHIFT)
+#define I40E_GLNVM_ULT                      0x000B6154 /* Reset: POR */ /* every shift 0-10 is named, including the RESERVED_1..4 placeholders */
+#define I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT   0
+#define I40E_GLNVM_ULT_CONF_PCIR_AE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT)
+#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT 1
+#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_MASK  I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT)
+#define I40E_GLNVM_ULT_RESERVED_1_SHIFT     2
+#define I40E_GLNVM_ULT_RESERVED_1_MASK      I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_1_SHIFT)
+#define I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT   3
+#define I40E_GLNVM_ULT_CONF_CORE_AE_MASK    I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT)
+#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT 4
+#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_MASK  I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT)
+#define I40E_GLNVM_ULT_CONF_POR_AE_SHIFT    5
+#define I40E_GLNVM_ULT_CONF_POR_AE_MASK     I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_POR_AE_SHIFT)
+#define I40E_GLNVM_ULT_RESERVED_2_SHIFT     6
+#define I40E_GLNVM_ULT_RESERVED_2_MASK      I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_2_SHIFT)
+#define I40E_GLNVM_ULT_RESERVED_3_SHIFT     7
+#define I40E_GLNVM_ULT_RESERVED_3_MASK      I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_3_SHIFT)
+#define I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT    8
+#define I40E_GLNVM_ULT_CONF_EMP_AE_MASK     I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT)
+#define I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT 9
+#define I40E_GLNVM_ULT_CONF_PCIALT_AE_MASK  I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT)
+#define I40E_GLNVM_ULT_RESERVED_4_SHIFT     10
+#define I40E_GLNVM_ULT_RESERVED_4_MASK      I40E_MASK(0x3FFFFF, I40E_GLNVM_ULT_RESERVED_4_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT                           0x000B615C /* Reset: POR */ /* 17 *_MEM_INIT_DONE flags, each mask 0x1, at consecutive shifts 0-16 */
+#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT 0
+#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_MASK  I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT  1
+#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT   2
+#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT  3
+#define I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT  4
+#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT  5
+#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT  6
+#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT  7
+#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT   8
+#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT   9
+#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT   10
+#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT  11
+#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT   12
+#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT   13
+#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT   14
+#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT  15
+#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_MASK   I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT)
+#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT   16
+#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_MASK    I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT)
+#define I40E_MNGSB_DADD            0x000B7030 /* Reset: POR */ /* single field: ADDR, mask 0xFFFFFFFF */
+#define I40E_MNGSB_DADD_ADDR_SHIFT 0
+#define I40E_MNGSB_DADD_ADDR_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DADD_ADDR_SHIFT)
+#define I40E_MNGSB_DCNT                0x000B7034 /* Reset: POR */ /* single field: BYTE_CNT, mask 0xFFFFFFFF */
+#define I40E_MNGSB_DCNT_BYTE_CNT_SHIFT 0
+#define I40E_MNGSB_DCNT_BYTE_CNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DCNT_BYTE_CNT_SHIFT)
+#define I40E_MNGSB_MSGCTL                  0x000B7020 /* Reset: POR */ /* fields at shifts 0, 8, 26, 28, 30, 31 */
+#define I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT    0
+#define I40E_MNGSB_MSGCTL_HDR_DWS_MASK     I40E_MASK(0x3, I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT)
+#define I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT    8
+#define I40E_MNGSB_MSGCTL_EXP_RDW_MASK     I40E_MASK(0x1FF, I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT)
+#define I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT   26
+#define I40E_MNGSB_MSGCTL_MSG_MODE_MASK    I40E_MASK(0x3, I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT)
+#define I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT 28
+#define I40E_MNGSB_MSGCTL_TOKEN_MODE_MASK  I40E_MASK(0x3, I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT)
+#define I40E_MNGSB_MSGCTL_BARCLR_SHIFT     30
+#define I40E_MNGSB_MSGCTL_BARCLR_MASK      I40E_MASK(0x1, I40E_MNGSB_MSGCTL_BARCLR_SHIFT)
+#define I40E_MNGSB_MSGCTL_CMDV_SHIFT       31
+#define I40E_MNGSB_MSGCTL_CMDV_MASK        I40E_MASK(0x1, I40E_MNGSB_MSGCTL_CMDV_SHIFT)
+#define I40E_MNGSB_RDATA            0x000B7300 /* Reset: POR */
+#define I40E_MNGSB_RDATA_DATA_SHIFT 0
+#define I40E_MNGSB_RDATA_DATA_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_RDATA_DATA_SHIFT)
+#define I40E_MNGSB_RHDR0                   0x000B72FC /* Reset: POR */ /* fields at shifts 0, 8, 16, 24, 27, 31 */
+#define I40E_MNGSB_RHDR0_DESTINATION_SHIFT 0
+#define I40E_MNGSB_RHDR0_DESTINATION_MASK  I40E_MASK(0xFF, I40E_MNGSB_RHDR0_DESTINATION_SHIFT)
+#define I40E_MNGSB_RHDR0_SOURCE_SHIFT      8
+#define I40E_MNGSB_RHDR0_SOURCE_MASK       I40E_MASK(0xFF, I40E_MNGSB_RHDR0_SOURCE_SHIFT)
+#define I40E_MNGSB_RHDR0_OPCODE_SHIFT      16
+#define I40E_MNGSB_RHDR0_OPCODE_MASK       I40E_MASK(0xFF, I40E_MNGSB_RHDR0_OPCODE_SHIFT)
+#define I40E_MNGSB_RHDR0_TAG_SHIFT         24
+#define I40E_MNGSB_RHDR0_TAG_MASK          I40E_MASK(0x7, I40E_MNGSB_RHDR0_TAG_SHIFT)
+#define I40E_MNGSB_RHDR0_RESPONSE_SHIFT    27
+#define I40E_MNGSB_RHDR0_RESPONSE_MASK     I40E_MASK(0x7, I40E_MNGSB_RHDR0_RESPONSE_SHIFT)
+#define I40E_MNGSB_RHDR0_EH_SHIFT          31
+#define I40E_MNGSB_RHDR0_EH_MASK           I40E_MASK(0x1, I40E_MNGSB_RHDR0_EH_SHIFT)
+#define I40E_MNGSB_RSPCTL                      0x000B7024 /* Reset: POR */ /* fields at shifts 0, 26, 30, 31 */
+#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT 0
+#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_MASK  I40E_MASK(0x1FF, I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT)
+#define I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT       26
+#define I40E_MNGSB_RSPCTL_RSP_MODE_MASK        I40E_MASK(0x3, I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT)
+#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT    30
+#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_MASK     I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT)
+#define I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT        31
+#define I40E_MNGSB_RSPCTL_RSP_ERR_MASK         I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT)
+#define I40E_MNGSB_WDATA            0x000B7100 /* Reset: POR */
+#define I40E_MNGSB_WDATA_DATA_SHIFT 0
+#define I40E_MNGSB_WDATA_DATA_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WDATA_DATA_SHIFT)
+#define I40E_MNGSB_WHDR0                  0x000B70F4 /* Reset: POR */ /* fields at shifts 0, 12, 16, 24 */
+#define I40E_MNGSB_WHDR0_RAW_DEST_SHIFT   0
+#define I40E_MNGSB_WHDR0_RAW_DEST_MASK    I40E_MASK(0xFF, I40E_MNGSB_WHDR0_RAW_DEST_SHIFT)
+#define I40E_MNGSB_WHDR0_DEST_SEL_SHIFT   12
+#define I40E_MNGSB_WHDR0_DEST_SEL_MASK    I40E_MASK(0xF, I40E_MNGSB_WHDR0_DEST_SEL_SHIFT)
+#define I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT 16
+#define I40E_MNGSB_WHDR0_OPCODE_SEL_MASK  I40E_MASK(0xFF, I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT)
+#define I40E_MNGSB_WHDR0_TAG_SHIFT        24
+#define I40E_MNGSB_WHDR0_TAG_MASK         I40E_MASK(0x7F, I40E_MNGSB_WHDR0_TAG_SHIFT)
+#define I40E_MNGSB_WHDR1            0x000B70F8 /* Reset: POR */
+#define I40E_MNGSB_WHDR1_ADDR_SHIFT 0
+#define I40E_MNGSB_WHDR1_ADDR_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR1_ADDR_SHIFT)
+#define I40E_MNGSB_WHDR2              0x000B70FC /* Reset: POR */
+#define I40E_MNGSB_WHDR2_LENGTH_SHIFT 0
+#define I40E_MNGSB_WHDR2_LENGTH_MASK  I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR2_LENGTH_SHIFT)
+
+#define I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT       21 /* field macros only: no I40E_GLPCI_CAPSUP offset macro appears in this group */
+#define I40E_GLPCI_CAPSUP_WAKUP_EN_MASK        I40E_MASK(0x1, I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT)
+
+#define I40E_GLPCI_CUR_CLNT_COMMON                  0x0009CA18 /* Reset: PCIR */ /* DATA_LINES: mask 0xFFFF at shift 0; OSR: mask 0xFFFF at shift 16 */
+#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT)
+#define I40E_GLPCI_CUR_CLNT_PIPEMON                  0x0009CA20 /* Reset: PCIR */ /* DATA_LINES only: no OSR field is defined, unlike the other CUR_* registers in this group */
+#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_MNG_ALWD                  0x0009c514 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_MNG_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_MNG_RSVD                  0x0009c594 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_MNG_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_PMAT_ALWD                  0x0009c510 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_PMAT_RSVD                  0x0009c590 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_RLAN_ALWD                  0x0009c500 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_RLAN_RSVD                  0x0009c580 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_RXPE_ALWD                  0x0009c508 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_RXPE_RSVD                  0x0009c588 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TDPU_ALWD                  0x0009c518 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TDPU_RSVD                  0x0009c598 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TLAN_ALWD                  0x0009c504 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TLAN_RSVD                  0x0009c584 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TXPE_ALWD                  0x0009c50C /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_TXPE_RSVD                  0x0009c58c /* Reset: PCIR */
+#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT)
+#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON                  0x0009CA28 /* Reset: PCIR */
+#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT)
+#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT        16
+#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT)
+
+#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT    4 /* field macros only: no I40E_GLPCI_LBARCTRL offset macro appears in this group */
+#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK     I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT)
+#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT 10
+#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_MASK  I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT)
+#define I40E_GLPCI_NPQ_CFG                    0x0009CA00 /* Reset: PCIR */ /* five fields, at shifts 0, 1, 2, 6 and 16 */
+#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT    0
+#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_MASK     I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT)
+#define I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT     1
+#define I40E_GLPCI_NPQ_CFG_SMALL_TO_MASK      I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT)
+#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT   2
+#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_MASK    I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT)
+#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT    6
+#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_MASK     I40E_MASK(0x3FF, I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT)
+#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT 16
+#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_MASK  I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT)
+#define I40E_GLPCI_WATMK_CLNT_PIPEMON                  0x0009CA30 /* Reset: PCIR */ /* DATA_LINES only: no OSR field is defined, unlike the WATMK_*_ALWD registers below */
+#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_MNG_ALWD                  0x0009CB14 /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_PMAT_ALWD                  0x0009CB10 /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_RLAN_ALWD                  0x0009CB00 /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_RXPE_ALWD                  0x0009CB08 /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_TLAN_ALWD                  0x0009CB04 /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_TPDU_ALWD                  0x0009CB18 /* Reset: PCIR */ /* NOTE(review): spelled TPDU here but TDPU in I40E_GLPCI_CUR_TDPU_*; confirm against the datasheet before renaming */
+#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT)
+#define I40E_GLPCI_WATMK_TXPE_ALWD                  0x0009CB0c /* Reset: PCIR */
+#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT 0
+#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_MASK  I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT)
+#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT        16
+#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_MASK         I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT)
+#define I40E_GLPE_CPUSTATUS0                    0x0000D040 /* Reset: PE_CORER */ /* CPUSTATUS0..2 sit at consecutive 4-byte offsets, each with one 0xFFFFFFFF field */
+#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0
+#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT)
+#define I40E_GLPE_CPUSTATUS1                    0x0000D044 /* Reset: PE_CORER */
+#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0
+#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT)
+#define I40E_GLPE_CPUSTATUS2                    0x0000D048 /* Reset: PE_CORER */
+#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0
+#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT)
+#define I40E_GLPE_CPUTRIG0                   0x0000D060 /* Reset: PE_CORER */ /* fields at shifts 0, 17 and 18; no field defined here at shift 16 */
+#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT  0
+#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK   I40E_MASK(0xFFFF, I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT)
+#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17
+#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK  I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT)
+#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18
+#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK  I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT)
+#define I40E_GLPE_DUAL40_RUPM                     0x0000DA04 /* Reset: PE_CORER */
+#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0
+#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK  I40E_MASK(0x1, I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT)
+#define I40E_GLPE_PFAEQEDROPCNT(_i)               (0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX         15 /* largest valid _i for I40E_GLPE_PFAEQEDROPCNT(_i) */
+#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0
+#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT)
+#define I40E_GLPE_PFCEQEDROPCNT(_i)               (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX         15 /* largest valid _i for I40E_GLPE_PFCEQEDROPCNT(_i) */
+#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0
+#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT)
+#define I40E_GLPE_PFCQEDROPCNT(_i)              (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
+#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX        15 /* largest valid _i for I40E_GLPE_PFCQEDROPCNT(_i) */
+#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0
+#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT)
+#define I40E_GLPE_RUPM_CQPPOOL                0x0000DACC /* Reset: PE_CORER */ /* the four RUPM_*POOL registers in this group each define one 0xFF-mask field at shift 0 */
+#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0
+#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK  I40E_MASK(0xFF, I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT)
+#define I40E_GLPE_RUPM_FLRPOOL                0x0000DAC4 /* Reset: PE_CORER */
+#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0
+#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK  I40E_MASK(0xFF, I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL                   0x0000DA00 /* Reset: PE_CORER */ /* ALLOFFTH: mask 0xFF at shift 0; six 0x1 flags at shifts 26-31 */
+#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT    0
+#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK     I40E_MASK(0xFF, I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26
+#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK  I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27
+#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK  I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28
+#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK  I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29
+#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK  I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT    30
+#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK     I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT)
+#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT   31
+#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK    I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT)
+#define I40E_GLPE_RUPM_PTXPOOL                0x0000DAC8 /* Reset: PE_CORER */
+#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0
+#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK  I40E_MASK(0xFF, I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT)
+#define I40E_GLPE_RUPM_PUSHPOOL                 0x0000DAC0 /* Reset: PE_CORER */
+#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0
+#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK  I40E_MASK(0xFF, I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT)
+#define I40E_GLPE_RUPM_TXHOST_EN                 0x0000DA08 /* Reset: PE_CORER */
+#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0
+#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK  I40E_MASK(0x1, I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT)
+#define I40E_GLPE_VFAEQEDROPCNT(_i)               (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX         31 /* largest valid _i for I40E_GLPE_VFAEQEDROPCNT(_i) */
+#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0
+#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT)
+#define I40E_GLPE_VFCEQEDROPCNT(_i)               (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX         31 /* largest valid _i for I40E_GLPE_VFCEQEDROPCNT(_i) */
+#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0
+#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT)
+#define I40E_GLPE_VFCQEDROPCNT(_i)              (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX        31 /* largest valid _i for I40E_GLPE_VFCQEDROPCNT(_i) */
+#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0
+#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT)
+#define I40E_GLPE_VFFLMOBJCTRL(_i)                  (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX            31 /* largest valid _i for I40E_GLPE_VFFLMOBJCTRL(_i) */
+#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0
+#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK  I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT)
+#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT   8
+#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK    I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT)
+#define I40E_GLPE_VFFLMQ1ALLOCERR(_i)               (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX         31 /* largest valid _i for I40E_GLPE_VFFLMQ1ALLOCERR(_i) */
+#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0
+#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT)
+#define I40E_GLPE_VFFLMXMITALLOCERR(_i)               (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX         31 /* largest valid _i for I40E_GLPE_VFFLMXMITALLOCERR(_i) */
+#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0
+#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK  I40E_MASK(0xFFFF, I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT)
+#define I40E_GLPE_VFUDACTRL(_i)                    (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPE_VFUDACTRL_MAX_INDEX              31 /* largest valid _i for I40E_GLPE_VFUDACTRL(_i) */
+#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT  0
+#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT)
+#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT  1
+#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT)
+#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT  2
+#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT)
+#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT  3
+#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT)
+#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4
+#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK  I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT)
+#define I40E_GLPE_VFUDAUCFBQPN(_i)         (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX   31 /* largest valid _i for I40E_GLPE_VFUDAUCFBQPN(_i) */
+#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT   0
+#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK    I40E_MASK(0x3FFFF, I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT)
+#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31
+#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK  I40E_MASK(0x1, I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT)
+#define I40E_PFPE_AEQALLOC               0x00131180 /* Reset: PFR */
+#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0
+#define I40E_PFPE_AEQALLOC_AECOUNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPE_AEQALLOC_AECOUNT_SHIFT)
+#define I40E_PFPE_CCQPHIGH                  0x00008200 /* Reset: PFR */
+#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0
+#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT)
+#define I40E_PFPE_CCQPLOW                 0x00008180 /* Reset: PFR */
+#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0
+#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT)
+#define I40E_PFPE_CCQPSTATUS                   0x00008100 /* Reset: PFR */ /* fields at shifts 0, 4, 16 and 31 */
+#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT   0
+#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK    I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT)
+#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4
+#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK  I40E_MASK(0x7, I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT)
+#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16
+#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK  I40E_MASK(0x3F, I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT)
+#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT    31
+#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK     I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT)
+#define I40E_PFPE_CQACK              0x00131100 /* Reset: PFR */
+#define I40E_PFPE_CQACK_PECQID_SHIFT 0
+#define I40E_PFPE_CQACK_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_PFPE_CQACK_PECQID_SHIFT)
+#define I40E_PFPE_CQARM              0x00131080 /* Reset: PFR */ /* PECQID uses the same 0x1FFFF mask at shift 0 as I40E_PFPE_CQACK */
+#define I40E_PFPE_CQARM_PECQID_SHIFT 0
+#define I40E_PFPE_CQARM_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_PFPE_CQARM_PECQID_SHIFT)
+#define I40E_PFPE_CQPDB              0x00008000 /* Reset: PFR */
+#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0
+#define I40E_PFPE_CQPDB_WQHEAD_MASK  I40E_MASK(0x7FF, I40E_PFPE_CQPDB_WQHEAD_SHIFT)
+#define I40E_PFPE_CQPERRCODES                      0x00008880 /* Reset: PFR */
+#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0
+#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT)
+#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16
+#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT)
+#define I40E_PFPE_CQPTAIL                  0x00008080 /* Reset: PFR */ /* WQTAIL uses the same 0x7FF mask at shift 0 as I40E_PFPE_CQPDB_WQHEAD */
+#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT     0
+#define I40E_PFPE_CQPTAIL_WQTAIL_MASK      I40E_MASK(0x7FF, I40E_PFPE_CQPTAIL_WQTAIL_SHIFT)
+#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31
+#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK  I40E_MASK(0x1, I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT)
+#define I40E_PFPE_FLMQ1ALLOCERR                   0x00008980 /* Reset: PFR */
+#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0
+#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK  I40E_MASK(0xFFFF, I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT)
+#define I40E_PFPE_FLMXMITALLOCERR                   0x00008900 /* Reset: PFR */
+#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0
+#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK  I40E_MASK(0xFFFF, I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT)
+#define I40E_PFPE_IPCONFIG0                        0x00008280 /* Reset: PFR */
+#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT           0
+#define I40E_PFPE_IPCONFIG0_PEIPID_MASK            I40E_MASK(0xFFFF, I40E_PFPE_IPCONFIG0_PEIPID_SHIFT)
+#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16
+#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK  I40E_MASK(0x1, I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT)
+#define I40E_PFPE_MRTEIDXMASK                       0x00008600 /* Reset: PFR */
+#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0
+#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK  I40E_MASK(0x1F, I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT)
+#define I40E_PFPE_RCVUNEXPECTEDERROR                        0x00008680 /* Reset: PFR */
+#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0
+#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK  I40E_MASK(0xFFFFFF, I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT)
+#define I40E_PFPE_TCPNOWTIMER               0x00008580 /* Reset: PFR */
+#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0
+#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT)
+#define I40E_PFPE_UDACTRL                        0x00008700 /* Reset: PFR */
+#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT  0
+#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT)
+#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT  1
+#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT)
+#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT  2
+#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT)
+#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT  3
+#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_MASK   I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT)
+#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT 4
+#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_MASK  I40E_MASK(0x1, I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT)
+#define I40E_PFPE_UDAUCFBQPN             0x00008780 /* Reset: PFR */
+#define I40E_PFPE_UDAUCFBQPN_QPN_SHIFT   0
+#define I40E_PFPE_UDAUCFBQPN_QPN_MASK    I40E_MASK(0x3FFFF, I40E_PFPE_UDAUCFBQPN_QPN_SHIFT)
+#define I40E_PFPE_UDAUCFBQPN_VALID_SHIFT 31
+#define I40E_PFPE_UDAUCFBQPN_VALID_MASK  I40E_MASK(0x1, I40E_PFPE_UDAUCFBQPN_VALID_SHIFT)
+#define I40E_PFPE_WQEALLOC                      0x00138C00 /* Reset: PFR */
+#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT         0
+#define I40E_PFPE_WQEALLOC_PEQPID_MASK          I40E_MASK(0x3FFFF, I40E_PFPE_WQEALLOC_PEQPID_SHIFT)
+#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20
+#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK  I40E_MASK(0xFFF, I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT)
+#define I40E_PRTDCB_RLPMC              0x0001F140 /* Reset: PE_CORER */
+#define I40E_PRTDCB_RLPMC_TC2PFC_SHIFT 0
+#define I40E_PRTDCB_RLPMC_TC2PFC_MASK  I40E_MASK(0xFF, I40E_PRTDCB_RLPMC_TC2PFC_SHIFT)
+#define I40E_PRTDCB_TCMSTC_RLPM(_i)        (0x0001F040 + ((_i) * 32)) /* _i=0...7, offset stride 32 per index */ /* Reset: PE_CORER */
+#define I40E_PRTDCB_TCMSTC_RLPM_MAX_INDEX  7
+#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT 0
+#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_MASK  I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT)
+#define I40E_PRTDCB_TCPMC_RLPM                 0x0001F1A0 /* Reset: PE_CORER; fields: CPM (13-bit mask, shift 0), LLTC (8-bit mask, shift 13), TCPM_MODE (1-bit mask, shift 30) */
+#define I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT       0
+#define I40E_PRTDCB_TCPMC_RLPM_CPM_MASK        I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT)
+#define I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT      13
+#define I40E_PRTDCB_TCPMC_RLPM_LLTC_MASK       I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT)
+#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT 30
+#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_MASK  I40E_MASK(0x1, I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT)
+#define I40E_PRTE_RUPM_TCCNTR03                0x0000DAE0 /* Reset: PE_CORER */ /* NOTE(review): prefix is PRTE, unlike the I40E_PRTPE_RUPM_* siblings (cf. I40E_PRTPE_RUPM_TCCNTR47) - possibly a typo; name kept since callers may depend on it */
+#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT 0
+#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_MASK  I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT)
+#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT 8
+#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_MASK  I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT)
+#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT 16
+#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_MASK  I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT)
+#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT 24
+#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_MASK  I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_CNTR             0x0000DB20 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT 0
+#define I40E_PRTPE_RUPM_CNTR_COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_CTL                 0x0000DA40 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_CTL_LLTC_SHIFT      13
+#define I40E_PRTPE_RUPM_CTL_LLTC_MASK       I40E_MASK(0xFF, I40E_PRTPE_RUPM_CTL_LLTC_SHIFT)
+#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT 30
+#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_MASK  I40E_MASK(0x1, I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT)
+#define I40E_PRTPE_RUPM_PFCCTL              0x0000DA60 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT 0
+#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT)
+#define I40E_PRTPE_RUPM_PFCPC                 0x0000DA80 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT 0
+#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT)
+#define I40E_PRTPE_RUPM_PFCTCC                 0x0000DAA0 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT   0
+#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_MASK    I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT)
+#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT 16
+#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT)
+#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT 31
+#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_MASK  I40E_MASK(0x1, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT)
+#define I40E_PRTPE_RUPM_PTCTCCNTR47                0x0000DB60 /* Reset: PE_CORER */ /* NOTE(review): spelled PTCTCCNTR47 while the TC0-3 register is I40E_PRTPE_RUPM_PTXTCCNTR03 - confirm the intended spelling; name kept as-is */
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT 0
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT 8
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT 16
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT 24
+#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTXTCCNTR03                0x0000DB40 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT 0
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT 8
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT 16
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT 24
+#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_TCCNTR47                0x0000DB00 /* Reset: PE_CORER; four 8-bit TCnCOUNT fields at shifts 0/8/16/24 */
+#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT 0
+#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT 8
+#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT 16
+#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT 24
+#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT)
+#define I40E_PRTPE_RUPM_THRES                     0x0000DA20 /* Reset: PE_CORER */
+#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT 0
+#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT)
+#define I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT      8
+#define I40E_PRTPE_RUPM_THRES_MAXSPADS_MASK       I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT)
+#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT 16
+#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_MASK  I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT)
+#define I40E_VFPE_AEQALLOC(_VF)          (0x00130C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_AEQALLOC_MAX_INDEX     127
+#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0
+#define I40E_VFPE_AEQALLOC_AECOUNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC_AECOUNT_SHIFT)
+#define I40E_VFPE_CCQPHIGH(_VF)             (0x00001000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CCQPHIGH_MAX_INDEX        127
+#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0
+#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT)
+#define I40E_VFPE_CCQPLOW(_VF)            (0x00000C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CCQPLOW_MAX_INDEX       127
+#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0
+#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT)
+#define I40E_VFPE_CCQPSTATUS(_VF)              (0x00000800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CCQPSTATUS_MAX_INDEX         127
+#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT   0
+#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK    I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT)
+#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4
+#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK  I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT)
+#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16
+#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK  I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT)
+#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT    31
+#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK     I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT)
+#define I40E_VFPE_CQACK(_VF)         (0x00130800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CQACK_MAX_INDEX    127
+#define I40E_VFPE_CQACK_PECQID_SHIFT 0
+#define I40E_VFPE_CQACK_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_VFPE_CQACK_PECQID_SHIFT)
+#define I40E_VFPE_CQARM(_VF)         (0x00130400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CQARM_MAX_INDEX    127
+#define I40E_VFPE_CQARM_PECQID_SHIFT 0
+#define I40E_VFPE_CQARM_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_VFPE_CQARM_PECQID_SHIFT)
+#define I40E_VFPE_CQPDB(_VF)         (0x00000000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CQPDB_MAX_INDEX    127
+#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0
+#define I40E_VFPE_CQPDB_WQHEAD_MASK  I40E_MASK(0x7FF, I40E_VFPE_CQPDB_WQHEAD_SHIFT)
+#define I40E_VFPE_CQPERRCODES(_VF)                 (0x00001800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CQPERRCODES_MAX_INDEX            127
+#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0
+#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT)
+#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16
+#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT)
+#define I40E_VFPE_CQPTAIL(_VF)             (0x00000400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_CQPTAIL_MAX_INDEX        127
+#define I40E_VFPE_CQPTAIL_WQTAIL_SHIFT     0
+#define I40E_VFPE_CQPTAIL_WQTAIL_MASK      I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL_WQTAIL_SHIFT)
+#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31
+#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK  I40E_MASK(0x1, I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT)
+#define I40E_VFPE_IPCONFIG0(_VF)                   (0x00001400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_IPCONFIG0_MAX_INDEX              127
+#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT           0
+#define I40E_VFPE_IPCONFIG0_PEIPID_MASK            I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG0_PEIPID_SHIFT)
+#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16
+#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK  I40E_MASK(0x1, I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT)
+#define I40E_VFPE_MRTEIDXMASK(_VF)                  (0x00003000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX             127
+#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0
+#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK  I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT)
+#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF)                   (0x00003400 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX              127
+#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0
+#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK  I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT)
+#define I40E_VFPE_TCPNOWTIMER(_VF)          (0x00002C00 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX     127
+#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0
+#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT)
+#define I40E_VFPE_WQEALLOC(_VF)                 (0x00138000 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */
+#define I40E_VFPE_WQEALLOC_MAX_INDEX            127
+#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT         0
+#define I40E_VFPE_WQEALLOC_PEQPID_MASK          I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC_PEQPID_SHIFT)
+#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20
+#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK  I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT)
+#define I40E_GLPES_PFIP4RXDISCARD(_i)                (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX          15
+#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0
+#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT)
+#define I40E_GLPES_PFIP4RXFRAGSHI(_i)                (0x00010804 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4RXFRAGSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX          15
+#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0
+#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT)
+#define I40E_GLPES_PFIP4RXFRAGSLO(_i)                (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX          15
+#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0
+#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT)
+#define I40E_GLPES_PFIP4RXMCOCTSHI(_i)                 (0x00010A04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4RXMCOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP4RXMCOCTSLO(_i)                 (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP4RXMCPKTSHI(_i)                 (0x00010C04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4RXMCPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP4RXMCPKTSLO(_i)                 (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP4RXOCTSHI(_i)               (0x00010204 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4RXOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP4RXOCTSLO(_i)               (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP4RXPKTSHI(_i)               (0x00010404 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4RXPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP4RXPKTSLO(_i)               (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP4RXTRUNC(_i)              (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX        15
+#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0
+#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT)
+#define I40E_GLPES_PFIP4TXFRAGSHI(_i)                (0x00011E04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4TXFRAGSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX          15
+#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0
+#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT)
+#define I40E_GLPES_PFIP4TXFRAGSLO(_i)                (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX          15
+#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0
+#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT)
+#define I40E_GLPES_PFIP4TXMCOCTSHI(_i)                 (0x00012004 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4TXMCOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP4TXMCOCTSLO(_i)                 (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP4TXMCPKTSHI(_i)                 (0x00012204 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4TXMCPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP4TXMCPKTSLO(_i)                 (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP4TXNOROUTE(_i)                (0x00012E00 + ((_i) * 4)) /* _i=0...15; single register with a 24-bit field (mask 0xFFFFFF), no HI/LO pair */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX          15
+#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0
+#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT)
+#define I40E_GLPES_PFIP4TXOCTSHI(_i)               (0x00011A04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4TXOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP4TXOCTSLO(_i)               (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP4TXPKTSHI(_i)               (0x00011C04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP4TXPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP4TXPKTSLO(_i)               (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXDISCARD(_i)                (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX          15
+#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0
+#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT)
+#define I40E_GLPES_PFIP6RXFRAGSHI(_i)                (0x00011404 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6RXFRAGSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX          15
+#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0
+#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT)
+#define I40E_GLPES_PFIP6RXFRAGSLO(_i)                (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX          15
+#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0
+#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXMCOCTSHI(_i)                 (0x00011604 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6RXMCOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP6RXMCOCTSLO(_i)                 (0x00011600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXMCPKTSHI(_i)                 (0x00011804 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6RXMCPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP6RXMCPKTSLO(_i)                 (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXOCTSHI(_i)               (0x00010E04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6RXOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP6RXOCTSLO(_i)               (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXPKTSHI(_i)               (0x00011004 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6RXPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP6RXPKTSLO(_i)               (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP6RXTRUNC(_i)              (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX        15
+#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0
+#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT)
+#define I40E_GLPES_PFIP6TXFRAGSHI(_i)                (0x00012804 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6TXFRAGSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX          15
+#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0
+#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT)
+#define I40E_GLPES_PFIP6TXFRAGSLO(_i)                (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX          15
+#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0
+#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT)
+#define I40E_GLPES_PFIP6TXMCOCTSHI(_i)                 (0x00012A04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6TXMCOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP6TXMCOCTSLO(_i)                 (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP6TXMCPKTSHI(_i)                 (0x00012C04 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6TXMCPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX           15
+#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP6TXMCPKTSLO(_i)                 (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX           15
+#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT)
+#define I40E_GLPES_PFIP6TXNOROUTE(_i)                (0x00012F00 + ((_i) * 4)) /* _i=0...15; single register with a 24-bit field (mask 0xFFFFFF), no HI/LO pair */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX          15
+#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0
+#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT)
+#define I40E_GLPES_PFIP6TXOCTSHI(_i)               (0x00012404 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6TXOCTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT)
+#define I40E_GLPES_PFIP6TXOCTSLO(_i)               (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT)
+#define I40E_GLPES_PFIP6TXPKTSHI(_i)               (0x00012604 + ((_i) * 8)) /* _i=0...15; at I40E_GLPES_PFIP6TXPKTSLO(_i) + 4, presumably counter bits 47:32 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT)
+#define I40E_GLPES_PFIP6TXPKTSLO(_i)               (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT)
+#define I40E_GLPES_PFRDMARXRDSHI(_i)               (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ /* PFRDMARXRDS/SNDS/WRS pairs: HI at LO + 4, 16 indices at an 8-byte stride */
+#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX         15
+#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0
+#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT)
+#define I40E_GLPES_PFRDMARXRDSLO(_i)               (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX         15
+#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0
+#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT)
+#define I40E_GLPES_PFRDMARXSNDSHI(_i)                (0x00014004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX          15
+#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0
+#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT)
+#define I40E_GLPES_PFRDMARXSNDSLO(_i)                (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX          15
+#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0
+#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT)
+#define I40E_GLPES_PFRDMARXWRSHI(_i)               (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX         15
+#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0
+#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT)
+#define I40E_GLPES_PFRDMARXWRSLO(_i)               (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX         15
+#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0
+#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT)
+#define I40E_GLPES_PFRDMATXRDSHI(_i)               (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ /* PFRDMATXRDS/SNDS/WRS pairs: HI at LO + 4, 16 indices at an 8-byte stride */
+#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX         15
+#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 /* NOTE(review): every field name in this TX group is spelled RDMARX...; spelling left as-is so existing references keep resolving - confirm against the vendor register list */
+#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT)
+#define I40E_GLPES_PFRDMATXRDSLO(_i)               (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX         15
+#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0
+#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT)
+#define I40E_GLPES_PFRDMATXSNDSHI(_i)                (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX          15
+#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0
+#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT)
+#define I40E_GLPES_PFRDMATXSNDSLO(_i)                (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX          15
+#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0
+#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT)
+#define I40E_GLPES_PFRDMATXWRSHI(_i)               (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX         15
+#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0
+#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT)
+#define I40E_GLPES_PFRDMATXWRSLO(_i)               (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX         15
+#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0
+#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT)
+#define I40E_GLPES_PFRDMAVBNDHI(_i)              (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ /* PFRDMAVBND/PFRDMAVINV pairs: HI at LO + 4, 16 indices at an 8-byte stride */
+#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX        15
+#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0
+#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT) /* HI fields in this group take mask value 0xFFFFFFFF, same as their LO partners */
+#define I40E_GLPES_PFRDMAVBNDLO(_i)              (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX        15
+#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0
+#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT)
+#define I40E_GLPES_PFRDMAVINVHI(_i)              (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX        15
+#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0
+#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT)
+#define I40E_GLPES_PFRDMAVINVLO(_i)              (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX        15
+#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0
+#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT)
+#define I40E_GLPES_PFRXVLANERR(_i)             (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ /* four single (no HI/LO split) registers follow: 16 indices at a 4-byte stride */
+#define I40E_GLPES_PFRXVLANERR_MAX_INDEX       15
+#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0
+#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT) /* mask value 0xFFFFFF, as for the two TCPRX*ERR fields below; PFTCPRTXSEG uses 0xFFFFFFFF */
+#define I40E_GLPES_PFTCPRTXSEG(_i)             (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX       15
+#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0
+#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT)
+#define I40E_GLPES_PFTCPRXOPTERR(_i)               (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX         15
+#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0
+#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT)
+#define I40E_GLPES_PFTCPRXPROTOERR(_i)                 (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX           15
+#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0
+#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT)
+#define I40E_GLPES_PFTCPRXSEGSHI(_i)               (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ /* PFTCPRXSEGS/PFTCPTXSEG pairs: HI at LO + 4, 16 indices at an 8-byte stride; the RX names carry a trailing S (SEGSHI/SEGSLO), the TX names do not (SEGHI/SEGLO) */
+#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX         15
+#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0
+#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT)
+#define I40E_GLPES_PFTCPRXSEGSLO(_i)               (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX         15
+#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0
+#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT)
+#define I40E_GLPES_PFTCPTXSEGHI(_i)              (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX        15
+#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0
+#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT)
+#define I40E_GLPES_PFTCPTXSEGLO(_i)              (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX        15
+#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0
+#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT)
+#define I40E_GLPES_PFUDPRXPKTSHI(_i)               (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ /* PFUDPRXPKTS/PFUDPTXPKTS pairs: HI at LO + 4, 16 indices at an 8-byte stride */
+#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT)
+#define I40E_GLPES_PFUDPRXPKTSLO(_i)               (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT)
+#define I40E_GLPES_PFUDPTXPKTSHI(_i)               (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX         15
+#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0
+#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT)
+#define I40E_GLPES_PFUDPTXPKTSLO(_i)               (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
+#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX         15
+#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0
+#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT)
+#define I40E_GLPES_RDMARXMULTFPDUSHI                         0x0001E014 /* Reset: PE_CORER */ /* non-indexed RDMARX registers at fixed offsets; in the two HI/LO pairs HI is at LO + 4 */
+#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0
+#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT) /* HI fields in this group take mask value 0xFFFFFF */
+#define I40E_GLPES_RDMARXMULTFPDUSLO                         0x0001E010 /* Reset: PE_CORER */
+#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0
+#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT)
+#define I40E_GLPES_RDMARXOOODDPHI                      0x0001E01C /* Reset: PE_CORER */
+#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0
+#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT)
+#define I40E_GLPES_RDMARXOOODDPLO                      0x0001E018 /* Reset: PE_CORER */
+#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0
+#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT)
+#define I40E_GLPES_RDMARXOOONOMARK                     0x0001E004 /* Reset: PE_CORER */
+#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0 /* NOTE(review): field is spelled RDMAOOONOMARK (no RX) unlike its register name; kept as-is - confirm against the vendor register list */
+#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT)
+#define I40E_GLPES_RDMARXUNALIGN                     0x0001E000 /* Reset: PE_CORER */
+#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0 /* NOTE(review): field is spelled RDMRXAUNALIGN while the register is RDMARXUNALIGN; kept as-is - confirm against the vendor register list */
+#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT)
+#define I40E_GLPES_TCPRXFOURHOLEHI                       0x0001E044 /* Reset: PE_CORER */ /* non-indexed TCPRX HI/LO pairs at fixed offsets: HI at LO + 4, HI mask value 0xFFFFFF, LO mask value 0xFFFFFFFF */
+#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0
+#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT)
+#define I40E_GLPES_TCPRXFOURHOLELO                       0x0001E040 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0
+#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT)
+#define I40E_GLPES_TCPRXONEHOLEHI                      0x0001E02C /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0
+#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT)
+#define I40E_GLPES_TCPRXONEHOLELO                      0x0001E028 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0
+#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT)
+#define I40E_GLPES_TCPRXPUREACKHI                       0x0001E024 /* Reset: PE_CORER */ /* NOTE(review): naming is uneven in this pair - register TCPRXPUREACKHI vs TCPRXPUREACKSLO, fields TCPRXPUREACKSHI vs TCPRXPUREACKLO; kept as-is - confirm against the vendor register list */
+#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0
+#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT)
+#define I40E_GLPES_TCPRXPUREACKSLO                      0x0001E020 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0
+#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT)
+#define I40E_GLPES_TCPRXTHREEHOLEHI                        0x0001E03C /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0
+#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT)
+#define I40E_GLPES_TCPRXTHREEHOLELO                        0x0001E038 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0
+#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT)
+#define I40E_GLPES_TCPRXTWOHOLEHI                      0x0001E034 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0
+#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT)
+#define I40E_GLPES_TCPRXTWOHOLELO                      0x0001E030 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0
+#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT)
+#define I40E_GLPES_TCPTXRETRANSFASTHI                          0x0001E04C /* Reset: PE_CORER */ /* TCPTXRETRANSFAST/TCPTXTOUTSFAST/TCPTXTOUTS pairs at fixed offsets: HI at LO + 4, HI mask value 0xFFFFFF, LO mask value 0xFFFFFFFF */
+#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0
+#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT)
+#define I40E_GLPES_TCPTXRETRANSFASTLO                          0x0001E048 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0
+#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT)
+#define I40E_GLPES_TCPTXTOUTSFASTHI                        0x0001E054 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0
+#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT)
+#define I40E_GLPES_TCPTXTOUTSFASTLO                        0x0001E050 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0
+#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT)
+#define I40E_GLPES_TCPTXTOUTSHI                    0x0001E05C /* Reset: PE_CORER */
+#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0
+#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT)
+#define I40E_GLPES_TCPTXTOUTSLO                    0x0001E058 /* Reset: PE_CORER */
+#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0
+#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXDISCARD(_i)                (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ /* VFIP4RX* group, 32 indices (0...31): DISCARD and TRUNC are single registers at a 4-byte stride, the rest are HI/LO pairs (HI at LO + 4) at an 8-byte stride */
+#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX          31
+#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0
+#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT)
+#define I40E_GLPES_VFIP4RXFRAGSHI(_i)                (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX          31
+#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0
+#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) /* HI fields in this group take mask value 0xFFFF, LO fields 0xFFFFFFFF */
+#define I40E_GLPES_VFIP4RXFRAGSLO(_i)                (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX          31
+#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0
+#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXMCOCTSHI(_i)                 (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP4RXMCOCTSLO(_i)                 (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXMCPKTSHI(_i)                 (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP4RXMCPKTSLO(_i)                 (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXOCTSHI(_i)               (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP4RXOCTSLO(_i)               (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXPKTSHI(_i)               (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP4RXPKTSLO(_i)               (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP4RXTRUNC(_i)              (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX        31
+#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0
+#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT)
+#define I40E_GLPES_VFIP4TXFRAGSHI(_i)                (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* VFIP4TX* group, 32 indices (0...31): NOROUTE is a single register at a 4-byte stride, the rest are HI/LO pairs (HI at LO + 4) at an 8-byte stride */
+#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX          31
+#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0
+#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT)
+#define I40E_GLPES_VFIP4TXFRAGSLO(_i)                (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX          31
+#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0
+#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT)
+#define I40E_GLPES_VFIP4TXMCOCTSHI(_i)                 (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP4TXMCOCTSLO(_i)                 (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP4TXMCPKTSHI(_i)                 (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP4TXMCPKTSLO(_i)                 (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP4TXNOROUTE(_i)                (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX          31
+#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0
+#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) /* mask value 0xFFFFFF, narrower than the 0xFFFFFFFF used by the LO fields in this group */
+#define I40E_GLPES_VFIP4TXOCTSHI(_i)               (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP4TXOCTSLO(_i)               (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP4TXPKTSHI(_i)               (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP4TXPKTSLO(_i)               (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXDISCARD(_i)                (0x00019200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ /* VFIP6RX* group, 32 indices (0...31): DISCARD and TRUNC are single registers at a 4-byte stride, the rest are HI/LO pairs (HI at LO + 4) at an 8-byte stride */
+#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX          31
+#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0
+#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT)
+#define I40E_GLPES_VFIP6RXFRAGSHI(_i)                (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX          31
+#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0
+#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) /* HI fields in this group take mask value 0xFFFF, LO fields 0xFFFFFFFF */
+#define I40E_GLPES_VFIP6RXFRAGSLO(_i)                (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX          31
+#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0
+#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXMCOCTSHI(_i)                 (0x00019604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP6RXMCOCTSLO(_i)                 (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXMCPKTSHI(_i)                 (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP6RXMCPKTSLO(_i)                 (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXOCTSHI(_i)               (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP6RXOCTSLO(_i)               (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXPKTSHI(_i)               (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP6RXPKTSLO(_i)               (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP6RXTRUNC(_i)              (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX        31
+#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0
+#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT)
+#define I40E_GLPES_VFIP6TXFRAGSHI(_i)                (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* VFIP6TX* group, 32 indices (0...31): NOROUTE is a single register at a 4-byte stride, the rest are HI/LO pairs (HI at LO + 4) at an 8-byte stride */
+#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX          31
+#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0
+#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT)
+#define I40E_GLPES_VFIP6TXFRAGSLO(_i)                (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX          31
+#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0
+#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT)
+#define I40E_GLPES_VFIP6TXMCOCTSHI(_i)                 (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP6TXMCOCTSLO(_i)                 (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP6TXMCPKTSHI(_i)                 (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX           31
+#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP6TXMCPKTSLO(_i)                 (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX           31
+#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT)
+#define I40E_GLPES_VFIP6TXNOROUTE(_i)                (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX          31
+#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0
+#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) /* mask value 0xFFFFFF, narrower than the 0xFFFFFFFF used by the LO fields in this group */
+#define I40E_GLPES_VFIP6TXOCTSHI(_i)               (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT)
+#define I40E_GLPES_VFIP6TXOCTSLO(_i)               (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT)
+#define I40E_GLPES_VFIP6TXPKTSHI(_i)               (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT)
+#define I40E_GLPES_VFIP6TXPKTSLO(_i)               (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT)
+#define I40E_GLPES_VFRDMARXRDSHI(_i)               (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* VFRDMARXRDS/SNDS/WRS pairs: HI at LO + 4, 32 indices (0...31) at an 8-byte stride */
+#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX         31
+#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0
+#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT)
+#define I40E_GLPES_VFRDMARXRDSLO(_i)               (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX         31
+#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0
+#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT)
+#define I40E_GLPES_VFRDMARXSNDSHI(_i)                (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX          31
+#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0
+#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT)
+#define I40E_GLPES_VFRDMARXSNDSLO(_i)                (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX          31
+#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0
+#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT)
+#define I40E_GLPES_VFRDMARXWRSHI(_i)               (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX         31
+#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0
+#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT)
+#define I40E_GLPES_VFRDMARXWRSLO(_i)               (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX         31
+#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0
+#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT)
+#define I40E_GLPES_VFRDMATXRDSHI(_i)               (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* NOTE(review): the field macros of all six VFRDMATX* registers (RD/SND/WR, HI and LO) are spelled RDMARX..., not RDMATX...; the names are the interface, so confirm against the datasheet before renaming */
+#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX         31
+#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0
+#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT)
+#define I40E_GLPES_VFRDMATXRDSLO(_i)               (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX         31
+#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0
+#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT)
+#define I40E_GLPES_VFRDMATXSNDSHI(_i)                (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX          31
+#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0
+#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT)
+#define I40E_GLPES_VFRDMATXSNDSLO(_i)                (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX          31
+#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0
+#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT)
+#define I40E_GLPES_VFRDMATXWRSHI(_i)               (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX         31
+#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0
+#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT)
+#define I40E_GLPES_VFRDMATXWRSLO(_i)               (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX         31
+#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0
+#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT)
+#define I40E_GLPES_VFRDMAVBNDHI(_i)              (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* NOTE: the VFRDMAVBND and VFRDMAVINV pairs use a 0xFFFFFFFF mask in both halves; the TCP/UDP xxxHI registers further down use 0xFFFF */
+#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX        31
+#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0
+#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT)
+#define I40E_GLPES_VFRDMAVBNDLO(_i)              (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX        31
+#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0
+#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT)
+#define I40E_GLPES_VFRDMAVINVHI(_i)              (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX        31
+#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0
+#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT)
+#define I40E_GLPES_VFRDMAVINVLO(_i)              (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX        31
+#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0
+#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT)
+#define I40E_GLPES_VFRXVLANERR(_i)             (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ /* the four registers from here through VFTCPRXPROTOERR are not HI/LO pairs: 4-byte stride per instance */
+#define I40E_GLPES_VFRXVLANERR_MAX_INDEX       31
+#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0
+#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT)
+#define I40E_GLPES_VFTCPRTXSEG(_i)             (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX       31
+#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0
+#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT)
+#define I40E_GLPES_VFTCPRXOPTERR(_i)               (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX         31
+#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0
+#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT)
+#define I40E_GLPES_VFTCPRXPROTOERR(_i)                 (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX           31
+#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0
+#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK  I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT)
+#define I40E_GLPES_VFTCPRXSEGSHI(_i)               (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ /* HI/LO pairs resume here (8-byte stride, HI at the LO address + 4) through VFUDPTXPKTS */
+#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX         31
+#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0
+#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT)
+#define I40E_GLPES_VFTCPRXSEGSLO(_i)               (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX         31
+#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0
+#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT)
+#define I40E_GLPES_VFTCPTXSEGHI(_i)              (0x0001B404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX        31
+#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0
+#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT)
+#define I40E_GLPES_VFTCPTXSEGLO(_i)              (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX        31
+#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0
+#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT)
+#define I40E_GLPES_VFUDPRXPKTSHI(_i)               (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT)
+#define I40E_GLPES_VFUDPRXPKTSLO(_i)               (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT)
+#define I40E_GLPES_VFUDPTXPKTSHI(_i)               (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX         31
+#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0
+#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK  I40E_MASK(0xFFFF, I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT)
+#define I40E_GLPES_VFUDPTXPKTSLO(_i)               (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
+#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX         31
+#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0
+#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT)
+#define I40E_GLGEN_PME_TO                     0x000B81BC /* Reset: POR */ /* one single-bit field, PME_TO_FOR_PE (mask 0x1, shift 0) */
+#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT 0
+#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_MASK  I40E_MASK(0x1, I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT)
+#define I40E_GLQF_APBVT(_i)         (0x00260000 + ((_i) * 4)) /* _i=0...2047 */ /* Reset: CORER */ /* 2048 instances, 4-byte stride; the APBVT field mask spans all 32 bits */
+#define I40E_GLQF_APBVT_MAX_INDEX   2047
+#define I40E_GLQF_APBVT_APBVT_SHIFT 0
+#define I40E_GLQF_APBVT_APBVT_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLQF_APBVT_APBVT_SHIFT)
+#define I40E_GLQF_FD_PCTYPES(_i)             (0x00268000 + ((_i) * 4)) /* _i=0...63 */ /* Reset: POR */ /* 64 instances; FD_PCTYPE mask is 0x3F (6 bits) */
+#define I40E_GLQF_FD_PCTYPES_MAX_INDEX       63
+#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT 0
+#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_MASK  I40E_MASK(0x3F, I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT)
+#define I40E_GLQF_FDEVICTENA(_i)                   (0x00270384 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ /* 2 instances; the field mask spans all 32 bits */
+#define I40E_GLQF_FDEVICTENA_MAX_INDEX             1
+#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT 0
+#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_MASK  I40E_MASK(0xFFFFFFFF, I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT)
+#define I40E_GLQF_FDEVICTFLAG                0x00270280 /* Reset: CORER */ /* two byte-wide fields: TX_FLAGS (shift 0) and RX_FLAGS (shift 8) */
+#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT 0
+#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_MASK  I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT)
+#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT 8
+#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_MASK  I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT)
+#define I40E_PFQF_CTL_2               0x00270300 /* Reset: CORER */ /* two adjacent fields with 0x1F masks: PEHSIZE (shift 0) and PEDSIZE (shift 5) */
+#define I40E_PFQF_CTL_2_PEHSIZE_SHIFT 0
+#define I40E_PFQF_CTL_2_PEHSIZE_MASK  I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEHSIZE_SHIFT)
+#define I40E_PFQF_CTL_2_PEDSIZE_SHIFT 5
+#define I40E_PFQF_CTL_2_PEDSIZE_MASK  I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEDSIZE_SHIFT)
+/* Redefined for X722 family: I40E_X722_PFQF_HLUT has 128 instances, each with four LUTn fields (mask 0x7F) at shifts 0/8/16/24 */
+#define I40E_X722_PFQF_HLUT(_i)        (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */
+#define I40E_X722_PFQF_HLUT_MAX_INDEX  127
+#define I40E_X722_PFQF_HLUT_LUT0_SHIFT 0
+#define I40E_X722_PFQF_HLUT_LUT0_MASK  I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT0_SHIFT)
+#define I40E_X722_PFQF_HLUT_LUT1_SHIFT 8
+#define I40E_X722_PFQF_HLUT_LUT1_MASK  I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT1_SHIFT)
+#define I40E_X722_PFQF_HLUT_LUT2_SHIFT 16
+#define I40E_X722_PFQF_HLUT_LUT2_MASK  I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT2_SHIFT)
+#define I40E_X722_PFQF_HLUT_LUT3_SHIFT 24
+#define I40E_X722_PFQF_HLUT_LUT3_MASK  I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT3_SHIFT)
+#define I40E_PFQF_HREGION(_i)                  (0x00245400 + ((_i) * 128)) /* _i=0...7 */ /* Reset: CORER */ /* eight (OVERRIDE_ENA_n, REGION_n) slots per register, n=0...7: enable mask 0x1 at shift 4n, region mask 0x7 at shift 4n+1 */
+#define I40E_PFQF_HREGION_MAX_INDEX            7
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_0_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT)
+#define I40E_PFQF_HREGION_REGION_0_SHIFT       1
+#define I40E_PFQF_HREGION_REGION_0_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_0_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT)
+#define I40E_PFQF_HREGION_REGION_1_SHIFT       5
+#define I40E_PFQF_HREGION_REGION_1_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_1_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT)
+#define I40E_PFQF_HREGION_REGION_2_SHIFT       9
+#define I40E_PFQF_HREGION_REGION_2_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_2_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT)
+#define I40E_PFQF_HREGION_REGION_3_SHIFT       13
+#define I40E_PFQF_HREGION_REGION_3_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_3_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT)
+#define I40E_PFQF_HREGION_REGION_4_SHIFT       17
+#define I40E_PFQF_HREGION_REGION_4_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_4_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT)
+#define I40E_PFQF_HREGION_REGION_5_SHIFT       21
+#define I40E_PFQF_HREGION_REGION_5_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_5_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT)
+#define I40E_PFQF_HREGION_REGION_6_SHIFT       25
+#define I40E_PFQF_HREGION_REGION_6_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_6_SHIFT)
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28
+#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_MASK  I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT)
+#define I40E_PFQF_HREGION_REGION_7_SHIFT       29
+#define I40E_PFQF_HREGION_REGION_7_MASK        I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_7_SHIFT)
+#define I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT 8 /* field-only definition: no register address macro accompanies it in this group */
+#define I40E_VSIQF_CTL_RSS_LUT_TYPE_MASK  I40E_MASK(0x1, I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT)
+#define I40E_VSIQF_HKEY(_i, _VSI)    (0x002A0000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...12, _VSI=0...383 */ /* Reset: CORER */ /* two-index register: 2048 bytes per _i step, 4 bytes per _VSI step; four byte-wide KEY_n fields at shifts 0/8/16/24 */
+#define I40E_VSIQF_HKEY_MAX_INDEX   12 /* upper bound of _i only; the _VSI range appears only in the comment on the register macro */
+#define I40E_VSIQF_HKEY_KEY_0_SHIFT 0
+#define I40E_VSIQF_HKEY_KEY_0_MASK  I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_0_SHIFT)
+#define I40E_VSIQF_HKEY_KEY_1_SHIFT 8
+#define I40E_VSIQF_HKEY_KEY_1_MASK  I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_1_SHIFT)
+#define I40E_VSIQF_HKEY_KEY_2_SHIFT 16
+#define I40E_VSIQF_HKEY_KEY_2_MASK  I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_2_SHIFT)
+#define I40E_VSIQF_HKEY_KEY_3_SHIFT 24
+#define I40E_VSIQF_HKEY_KEY_3_MASK  I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_3_SHIFT)
+#define I40E_VSIQF_HLUT(_i, _VSI)   (0x00220000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...15, _VSI=0...383 */ /* Reset: CORER */ /* same two-index addressing as I40E_VSIQF_HKEY; four LUTn fields with mask 0xF at shifts 0/8/16/24 */
+#define I40E_VSIQF_HLUT_MAX_INDEX  15 /* upper bound of _i only */
+#define I40E_VSIQF_HLUT_LUT0_SHIFT 0
+#define I40E_VSIQF_HLUT_LUT0_MASK  I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT0_SHIFT)
+#define I40E_VSIQF_HLUT_LUT1_SHIFT 8
+#define I40E_VSIQF_HLUT_LUT1_MASK  I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT1_SHIFT)
+#define I40E_VSIQF_HLUT_LUT2_SHIFT 16
+#define I40E_VSIQF_HLUT_LUT2_MASK  I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT2_SHIFT)
+#define I40E_VSIQF_HLUT_LUT3_SHIFT 24
+#define I40E_VSIQF_HLUT_LUT3_MASK  I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT3_SHIFT)
+#define I40E_GLGEN_STAT_CLEAR                        0x00390004 /* Reset: CORER */ /* one single-bit field, GLGEN_STAT_CLEAR (mask 0x1, shift 0) */
+#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT 0
+#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_MASK  I40E_MASK(0x1, I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT)
+#define I40E_GLGEN_STAT_HALT                  0x00390000 /* Reset: CORER */ /* HALT_CELLS mask 0x3FFFFFFF covers 30 bits starting at shift 0 */
+#define I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT 0
+#define I40E_GLGEN_STAT_HALT_HALT_CELLS_MASK  I40E_MASK(0x3FFFFFFF, I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT)
+#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT       30 /* field-only definitions: WB_ON_ITR uses mask 0x1 at shift 30 for both CTL01 and CTLN1; no register address macro in this group */
+#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT)
+#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT       30
+#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_MASK        I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT)
+#define I40E_VFPE_AEQALLOC1               0x0000A400 /* Reset: VFR */ /* the VFPE_* registers from here through I40E_VFPE_WQEALLOC1 are plain constant addresses (no _i index), all tagged Reset: VFR */
+#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0
+#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT)
+#define I40E_VFPE_CCQPHIGH1                  0x00009800 /* Reset: VFR */
+#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0
+#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT)
+#define I40E_VFPE_CCQPLOW1                 0x0000AC00 /* Reset: VFR */
+#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0
+#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT)
+#define I40E_VFPE_CCQPSTATUS1                   0x0000B800 /* Reset: VFR */ /* four fields: CCQP_DONE (0x1, shift 0), HMC_PROFILE (0x7, shift 4), RDMA_EN_VFS (0x3F, shift 16), CCQP_ERR (0x1, shift 31) */
+#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT   0
+#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK    I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT)
+#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4
+#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK  I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT)
+#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16
+#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK  I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT)
+#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT    31
+#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK     I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT)
+#define I40E_VFPE_CQACK1              0x0000B000 /* Reset: VFR */ /* PECQID mask 0x1FFFF (17 bits), same as in I40E_VFPE_CQARM1 */
+#define I40E_VFPE_CQACK1_PECQID_SHIFT 0
+#define I40E_VFPE_CQACK1_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_VFPE_CQACK1_PECQID_SHIFT)
+#define I40E_VFPE_CQARM1              0x0000B400 /* Reset: VFR */
+#define I40E_VFPE_CQARM1_PECQID_SHIFT 0
+#define I40E_VFPE_CQARM1_PECQID_MASK  I40E_MASK(0x1FFFF, I40E_VFPE_CQARM1_PECQID_SHIFT)
+#define I40E_VFPE_CQPDB1              0x0000BC00 /* Reset: VFR */
+#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0
+#define I40E_VFPE_CQPDB1_WQHEAD_MASK  I40E_MASK(0x7FF, I40E_VFPE_CQPDB1_WQHEAD_SHIFT)
+#define I40E_VFPE_CQPERRCODES1                      0x00009C00 /* Reset: VFR */ /* CQP_MINOR_CODE: 0xFFFF at shift 0; CQP_MAJOR_CODE: 0xFFFF at shift 16 */
+#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0
+#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT)
+#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16
+#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK  I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT)
+#define I40E_VFPE_CQPTAIL1                  0x0000A000 /* Reset: VFR */ /* WQTAIL: 0x7FF at shift 0; CQP_OP_ERR: 0x1 at shift 31 */
+#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT     0
+#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK      I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT)
+#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31
+#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK  I40E_MASK(0x1, I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT)
+#define I40E_VFPE_IPCONFIG01                        0x00008C00 /* Reset: VFR */ /* PEIPID: 0xFFFF at shift 0; USEENTIREIDRANGE: 0x1 at shift 16 */
+#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT           0
+#define I40E_VFPE_IPCONFIG01_PEIPID_MASK            I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG01_PEIPID_SHIFT)
+#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16
+#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK  I40E_MASK(0x1, I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT)
+#define I40E_VFPE_MRTEIDXMASK1                       0x00009000 /* Reset: VFR */
+#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0
+#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK  I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT)
+#define I40E_VFPE_RCVUNEXPECTEDERROR1                        0x00009400 /* Reset: VFR */
+#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0
+#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK  I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT)
+#define I40E_VFPE_TCPNOWTIMER1               0x0000A800 /* Reset: VFR */
+#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0
+#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK  I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT)
+#define I40E_VFPE_WQEALLOC1                      0x0000C000 /* Reset: VFR */ /* PEQPID: 0x3FFFF (18 bits) at shift 0; WQE_DESC_INDEX: 0xFFF (12 bits) at shift 20 */
+#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT         0
+#define I40E_VFPE_WQEALLOC1_PEQPID_MASK          I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC1_PEQPID_SHIFT)
+#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20
+#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK  I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT)
+
+#endif /* X722_SUPPORT */
+#endif /* _I40E_REGISTER_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_status.h b/usr/src/uts/common/io/i40e/core/i40e_status.h
new file mode 100644
index 0000000000..1f27507970
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_status.h
@@ -0,0 +1,108 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2014, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_status.h 283119 2015-05-19 18:35:18Z jhb $*/
+
+#ifndef _I40E_STATUS_H_
+#define _I40E_STATUS_H_
+
+/* Error Codes: I40E_SUCCESS is 0; every other enumerator has its own distinct negative value, from -1 down to -65 */
+enum i40e_status_code {
+	I40E_SUCCESS				= 0,
+	I40E_ERR_NVM				= -1,
+	I40E_ERR_NVM_CHECKSUM			= -2,
+	I40E_ERR_PHY				= -3,
+	I40E_ERR_CONFIG				= -4,
+	I40E_ERR_PARAM				= -5,
+	I40E_ERR_MAC_TYPE			= -6,
+	I40E_ERR_UNKNOWN_PHY			= -7,
+	I40E_ERR_LINK_SETUP			= -8,
+	I40E_ERR_ADAPTER_STOPPED		= -9,
+	I40E_ERR_INVALID_MAC_ADDR		= -10,
+	I40E_ERR_DEVICE_NOT_SUPPORTED		= -11,
+	I40E_ERR_MASTER_REQUESTS_PENDING	= -12,
+	I40E_ERR_INVALID_LINK_SETTINGS		= -13,
+	I40E_ERR_AUTONEG_NOT_COMPLETE		= -14,
+	I40E_ERR_RESET_FAILED			= -15,
+	I40E_ERR_SWFW_SYNC			= -16,
+	I40E_ERR_NO_AVAILABLE_VSI		= -17,
+	I40E_ERR_NO_MEMORY			= -18,
+	I40E_ERR_BAD_PTR			= -19,
+	I40E_ERR_RING_FULL			= -20,
+	I40E_ERR_INVALID_PD_ID			= -21,
+	I40E_ERR_INVALID_QP_ID			= -22,
+	I40E_ERR_INVALID_CQ_ID			= -23,
+	I40E_ERR_INVALID_CEQ_ID			= -24,
+	I40E_ERR_INVALID_AEQ_ID			= -25,
+	I40E_ERR_INVALID_SIZE			= -26,
+	I40E_ERR_INVALID_ARP_INDEX		= -27,
+	I40E_ERR_INVALID_FPM_FUNC_ID		= -28,
+	I40E_ERR_QP_INVALID_MSG_SIZE		= -29,
+	I40E_ERR_QP_TOOMANY_WRS_POSTED		= -30,
+	I40E_ERR_INVALID_FRAG_COUNT		= -31,
+	I40E_ERR_QUEUE_EMPTY			= -32,
+	I40E_ERR_INVALID_ALIGNMENT		= -33,
+	I40E_ERR_FLUSHED_QUEUE			= -34,
+	I40E_ERR_INVALID_PUSH_PAGE_INDEX	= -35,
+	I40E_ERR_INVALID_IMM_DATA_SIZE		= -36,
+	I40E_ERR_TIMEOUT			= -37,
+	I40E_ERR_OPCODE_MISMATCH		= -38,
+	I40E_ERR_CQP_COMPL_ERROR		= -39,
+	I40E_ERR_INVALID_VF_ID			= -40,
+	I40E_ERR_INVALID_HMCFN_ID		= -41,
+	I40E_ERR_BACKING_PAGE_ERROR		= -42,
+	I40E_ERR_NO_PBLCHUNKS_AVAILABLE		= -43,
+	I40E_ERR_INVALID_PBLE_INDEX		= -44,
+	I40E_ERR_INVALID_SD_INDEX		= -45,
+	I40E_ERR_INVALID_PAGE_DESC_INDEX	= -46,
+	I40E_ERR_INVALID_SD_TYPE		= -47,
+	I40E_ERR_MEMCPY_FAILED			= -48,
+	I40E_ERR_INVALID_HMC_OBJ_INDEX		= -49,
+	I40E_ERR_INVALID_HMC_OBJ_COUNT		= -50,
+	I40E_ERR_INVALID_SRQ_ARM_LIMIT		= -51,
+	I40E_ERR_SRQ_ENABLED			= -52,
+	I40E_ERR_ADMIN_QUEUE_ERROR		= -53,
+	I40E_ERR_ADMIN_QUEUE_TIMEOUT		= -54,
+	I40E_ERR_BUF_TOO_SHORT			= -55,
+	I40E_ERR_ADMIN_QUEUE_FULL		= -56,
+	I40E_ERR_ADMIN_QUEUE_NO_WORK		= -57,
+	I40E_ERR_BAD_IWARP_CQE			= -58,
+	I40E_ERR_NVM_BLANK_MODE			= -59,
+	I40E_ERR_NOT_IMPLEMENTED		= -60,
+	I40E_ERR_PE_DOORBELL_NOT_ENABLED	= -61,
+	I40E_ERR_DIAG_TEST_FAILED		= -62,
+	I40E_ERR_NOT_READY			= -63,
+	I40E_NOT_SUPPORTED			= -64,	/* only negative code without the _ERR_ infix; distinct from I40E_ERR_DEVICE_NOT_SUPPORTED (-11) */
+	I40E_ERR_FIRMWARE_API_VERSION		= -65,
+};
+
+#endif /* _I40E_STATUS_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_type.h b/usr/src/uts/common/io/i40e/core/i40e_type.h
new file mode 100644
index 0000000000..b4a84993e9
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_type.h
@@ -0,0 +1,1581 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_type.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_TYPE_H_
+#define _I40E_TYPE_H_
+
+#include "i40e_status.h"
+#include "i40e_osdep.h"
+#include "i40e_register.h"
+#include "i40e_adminq.h"
+#include "i40e_hmc.h"
+#include "i40e_lan_hmc.h"
+#include "i40e_devids.h"
+
+#define UNREFERENCED_XPARAMETER
+
+#define BIT(a) (1UL << (a))
+#define BIT_ULL(a) (1ULL << (a))
+
+#ifndef I40E_MASK
+/* I40E_MASK is a macro used on 32 bit registers */
+#define I40E_MASK(mask, shift) (((uint32_t)(mask)) << ((uint32_t)(shift)))
+#endif
+
+#define I40E_MAX_PF			16
+#define I40E_MAX_PF_VSI			64
+#define I40E_MAX_PF_QP			128
+#define I40E_MAX_VSI_QP			16
+#define I40E_MAX_VF_VSI			3
+#define I40E_MAX_CHAINED_RX_BUFFERS	5
+#define I40E_MAX_PF_UDP_OFFLOAD_PORTS	16
+
+/* something less than 1 minute */
+#define I40E_HEARTBEAT_TIMEOUT		(HZ * 50)
+
+/* Max default timeout in ms */
+#define I40E_MAX_NVM_TIMEOUT		18000
+
+/* Check whether address is multicast (low bit of the first octet is set). */
+#define I40E_IS_MULTICAST(address) ((bool)(((u8 *)(address))[0] & ((u8)0x01)))
+
+/* Broadcast check. NOTE: only the first two octets are compared to 0xff. */
+#define I40E_IS_BROADCAST(address)	\
+	((((u8 *)(address))[0] == ((u8)0xff)) && \
+	(((u8 *)(address))[1] == ((u8)0xff)))
+
+/* Switch from ms to the 1usec global time (this is the GTIME resolution) */
+#define I40E_MS_TO_GTIME(time)		((time) * 1000)
+
+/* forward declaration */
+struct i40e_hw;
+typedef void (*I40E_ADMINQ_CALLBACK)(struct i40e_hw *, struct i40e_aq_desc *);
+
+#define I40E_ETH_LENGTH_OF_ADDRESS	6
+/* Data type manipulation macros. */
+#define I40E_HI_DWORD(x)	((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF))
+#define I40E_LO_DWORD(x)	((u32)((x) & 0xFFFFFFFF))
+
+#define I40E_HI_WORD(x)		((u16)(((x) >> 16) & 0xFFFF))
+#define I40E_LO_WORD(x)		((u16)((x) & 0xFFFF))
+
+#define I40E_HI_BYTE(x)		((u8)(((x) >> 8) & 0xFF))
+#define I40E_LO_BYTE(x)		((u8)((x) & 0xFF))
+
+/* Number of Transmit Descriptors must be a multiple of 8. */
+#define I40E_REQ_TX_DESCRIPTOR_MULTIPLE	8
+/* Number of Receive Descriptors must be a multiple of 32 if
+ * the number of descriptors is greater than 32.
+ */
+#define I40E_REQ_RX_DESCRIPTOR_MULTIPLE	32
+
+#define I40E_DESC_UNUSED(R)	\
+	((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
+	(R)->next_to_clean - (R)->next_to_use - 1)
+
+/* bitfields for Tx queue mapping in QTX_CTL */
+#define I40E_QTX_CTL_VF_QUEUE	0x0
+#define I40E_QTX_CTL_VM_QUEUE	0x1
+#define I40E_QTX_CTL_PF_QUEUE	0x2
+
+/* debug masks - set these bits in hw->debug_mask to control output */
+enum i40e_debug_mask {
+	I40E_DEBUG_INIT			= 0x00000001,
+	I40E_DEBUG_RELEASE		= 0x00000002,
+
+	I40E_DEBUG_LINK			= 0x00000010,
+	I40E_DEBUG_PHY			= 0x00000020,
+	I40E_DEBUG_HMC			= 0x00000040,
+	I40E_DEBUG_NVM			= 0x00000080,
+	I40E_DEBUG_LAN			= 0x00000100,
+	I40E_DEBUG_FLOW			= 0x00000200,
+	I40E_DEBUG_DCB			= 0x00000400,
+	I40E_DEBUG_DIAG			= 0x00000800,
+	I40E_DEBUG_FD			= 0x00001000,
+
+	I40E_DEBUG_AQ_MESSAGE		= 0x01000000,
+	I40E_DEBUG_AQ_DESCRIPTOR	= 0x02000000,
+	I40E_DEBUG_AQ_DESC_BUFFER	= 0x04000000,
+	I40E_DEBUG_AQ_COMMAND		= 0x06000000, /* == DESCRIPTOR | DESC_BUFFER */
+	I40E_DEBUG_AQ			= 0x0F000000, /* superset of the AQ_* bits */
+
+	/*
+	 * Explicit casts are required here: enum constants have type "int"
+	 * and these two values do not fit in a (32-bit) int.
+	 */
+	I40E_DEBUG_USER			= (int)0xF0000000,
+
+	I40E_DEBUG_ALL			= (int)0xFFFFFFFF
+};
+
+/* PCI Bus Info */
+#define I40E_PCI_LINK_STATUS		0xB2
+#define I40E_PCI_LINK_WIDTH		0x3F0
+#define I40E_PCI_LINK_WIDTH_1		0x10
+#define I40E_PCI_LINK_WIDTH_2		0x20
+#define I40E_PCI_LINK_WIDTH_4		0x40
+#define I40E_PCI_LINK_WIDTH_8		0x80
+#define I40E_PCI_LINK_SPEED		0xF
+#define I40E_PCI_LINK_SPEED_2500	0x1
+#define I40E_PCI_LINK_SPEED_5000	0x2
+#define I40E_PCI_LINK_SPEED_8000	0x3
+
+/* Memory types */
+enum i40e_memset_type {
+	I40E_NONDMA_MEM = 0,
+	I40E_DMA_MEM
+};
+
+/* Memcpy types */
+enum i40e_memcpy_type {
+	I40E_NONDMA_TO_NONDMA = 0,
+	I40E_NONDMA_TO_DMA,
+	I40E_DMA_TO_DMA,
+	I40E_DMA_TO_NONDMA
+};
+
+/* These are structs for managing the hardware information and the operations.
+ * The structures of function pointers are filled out at init time when we
+ * know for sure exactly which hardware we're working with.  This gives us the
+ * flexibility of using the same main driver code but adapting to slightly
+ * different hardware needs as new parts are developed.  For this architecture,
+ * the Firmware and AdminQ are intended to insulate the driver from most of the
+ * future changes, but these structures will also do part of the job.
+ */
+enum i40e_mac_type {
+	I40E_MAC_UNKNOWN = 0,
+	I40E_MAC_X710,
+	I40E_MAC_XL710,
+	I40E_MAC_VF,
+#ifdef X722_SUPPORT
+	I40E_MAC_X722,
+	I40E_MAC_X722_VF,
+#endif
+	I40E_MAC_GENERIC,
+};
+
+enum i40e_media_type {
+	I40E_MEDIA_TYPE_UNKNOWN = 0,
+	I40E_MEDIA_TYPE_FIBER,
+	I40E_MEDIA_TYPE_BASET,
+	I40E_MEDIA_TYPE_BACKPLANE,
+	I40E_MEDIA_TYPE_CX4,
+	I40E_MEDIA_TYPE_DA,
+	I40E_MEDIA_TYPE_VIRTUAL
+};
+
+enum i40e_fc_mode {
+	I40E_FC_NONE = 0,
+	I40E_FC_RX_PAUSE,
+	I40E_FC_TX_PAUSE,
+	I40E_FC_FULL,
+	I40E_FC_PFC,
+	I40E_FC_DEFAULT
+};
+
+enum i40e_set_fc_aq_failures {
+	I40E_SET_FC_AQ_FAIL_NONE = 0,
+	I40E_SET_FC_AQ_FAIL_GET = 1,
+	I40E_SET_FC_AQ_FAIL_SET = 2,
+	I40E_SET_FC_AQ_FAIL_UPDATE = 4,
+	I40E_SET_FC_AQ_FAIL_SET_UPDATE = 6
+};
+
+enum i40e_vsi_type {
+	I40E_VSI_MAIN	= 0,
+	I40E_VSI_VMDQ1	= 1,
+	I40E_VSI_VMDQ2	= 2,
+	I40E_VSI_CTRL	= 3,
+	I40E_VSI_FCOE	= 4,
+	I40E_VSI_MIRROR	= 5,
+	I40E_VSI_SRIOV	= 6,
+	I40E_VSI_FDIR	= 7,
+	I40E_VSI_TYPE_UNKNOWN
+};
+
+enum i40e_queue_type {
+	I40E_QUEUE_TYPE_RX = 0,
+	I40E_QUEUE_TYPE_TX,
+	I40E_QUEUE_TYPE_PE_CEQ,
+	I40E_QUEUE_TYPE_UNKNOWN
+};
+
+struct i40e_link_status {
+	enum i40e_aq_phy_type phy_type;
+	enum i40e_aq_link_speed link_speed;
+	u8 link_info;
+	u8 an_info;
+	u8 ext_info;
+	u8 loopback;
+	/* is Link Status Event notification to SW enabled */
+	bool lse_enable;
+	u16 max_frame_size;
+	bool crc_enable;
+	u8 pacing;
+	u8 requested_speeds;
+	u8 module_type[3];
+	/* 1st byte: module identifier */
+#define I40E_MODULE_TYPE_SFP		0x03
+#define I40E_MODULE_TYPE_QSFP		0x0D
+	/* 2nd byte: ethernet compliance codes for 10/40G */
+#define I40E_MODULE_TYPE_40G_ACTIVE	0x01
+#define I40E_MODULE_TYPE_40G_LR4	0x02
+#define I40E_MODULE_TYPE_40G_SR4	0x04
+#define I40E_MODULE_TYPE_40G_CR4	0x08
+#define I40E_MODULE_TYPE_10G_BASE_SR	0x10
+#define I40E_MODULE_TYPE_10G_BASE_LR	0x20
+#define I40E_MODULE_TYPE_10G_BASE_LRM	0x40
+#define I40E_MODULE_TYPE_10G_BASE_ER	0x80
+	/* 3rd byte: ethernet compliance codes for 1G */
+#define I40E_MODULE_TYPE_1000BASE_SX	0x01
+#define I40E_MODULE_TYPE_1000BASE_LX	0x02
+#define I40E_MODULE_TYPE_1000BASE_CX	0x04
+#define I40E_MODULE_TYPE_1000BASE_T	0x08
+};
+
+enum i40e_aq_capabilities_phy_type {
+	I40E_CAP_PHY_TYPE_SGMII			= BIT(I40E_PHY_TYPE_SGMII),
+	I40E_CAP_PHY_TYPE_1000BASE_KX		= BIT(I40E_PHY_TYPE_1000BASE_KX),
+	I40E_CAP_PHY_TYPE_10GBASE_KX4		= BIT(I40E_PHY_TYPE_10GBASE_KX4),
+	I40E_CAP_PHY_TYPE_10GBASE_KR		= BIT(I40E_PHY_TYPE_10GBASE_KR),
+	I40E_CAP_PHY_TYPE_40GBASE_KR4		= BIT(I40E_PHY_TYPE_40GBASE_KR4),
+	I40E_CAP_PHY_TYPE_XAUI			= BIT(I40E_PHY_TYPE_XAUI),
+	I40E_CAP_PHY_TYPE_XFI			= BIT(I40E_PHY_TYPE_XFI),
+	I40E_CAP_PHY_TYPE_SFI			= BIT(I40E_PHY_TYPE_SFI),
+	I40E_CAP_PHY_TYPE_XLAUI			= BIT(I40E_PHY_TYPE_XLAUI),
+	I40E_CAP_PHY_TYPE_XLPPI			= BIT(I40E_PHY_TYPE_XLPPI),
+	I40E_CAP_PHY_TYPE_40GBASE_CR4_CU	= BIT(I40E_PHY_TYPE_40GBASE_CR4_CU),
+	I40E_CAP_PHY_TYPE_10GBASE_CR1_CU	= BIT(I40E_PHY_TYPE_10GBASE_CR1_CU),
+	I40E_CAP_PHY_TYPE_10GBASE_AOC		= BIT(I40E_PHY_TYPE_10GBASE_AOC),
+	I40E_CAP_PHY_TYPE_40GBASE_AOC		= BIT(I40E_PHY_TYPE_40GBASE_AOC),
+	I40E_CAP_PHY_TYPE_100BASE_TX		= BIT(I40E_PHY_TYPE_100BASE_TX),
+	I40E_CAP_PHY_TYPE_1000BASE_T		= BIT(I40E_PHY_TYPE_1000BASE_T),
+	I40E_CAP_PHY_TYPE_10GBASE_T		= BIT(I40E_PHY_TYPE_10GBASE_T),
+	I40E_CAP_PHY_TYPE_10GBASE_SR		= BIT(I40E_PHY_TYPE_10GBASE_SR),
+	I40E_CAP_PHY_TYPE_10GBASE_LR		= BIT(I40E_PHY_TYPE_10GBASE_LR),
+	I40E_CAP_PHY_TYPE_10GBASE_SFPP_CU	= BIT(I40E_PHY_TYPE_10GBASE_SFPP_CU),
+	I40E_CAP_PHY_TYPE_10GBASE_CR1		= BIT(I40E_PHY_TYPE_10GBASE_CR1),
+	I40E_CAP_PHY_TYPE_40GBASE_CR4		= BIT(I40E_PHY_TYPE_40GBASE_CR4),
+	I40E_CAP_PHY_TYPE_40GBASE_SR4		= BIT(I40E_PHY_TYPE_40GBASE_SR4),
+	I40E_CAP_PHY_TYPE_40GBASE_LR4		= BIT(I40E_PHY_TYPE_40GBASE_LR4),
+	I40E_CAP_PHY_TYPE_1000BASE_SX		= BIT(I40E_PHY_TYPE_1000BASE_SX),
+	I40E_CAP_PHY_TYPE_1000BASE_LX		= BIT(I40E_PHY_TYPE_1000BASE_LX),
+	I40E_CAP_PHY_TYPE_1000BASE_T_OPTICAL	= BIT(I40E_PHY_TYPE_1000BASE_T_OPTICAL),
+	I40E_CAP_PHY_TYPE_20GBASE_KR2		= BIT(I40E_PHY_TYPE_20GBASE_KR2)
+};
+
+struct i40e_phy_info {
+	struct i40e_link_status link_info;
+	struct i40e_link_status link_info_old;
+	bool get_link_info;
+	enum i40e_media_type media_type;
+	/* all the phy types the NVM is capable of */
+	enum i40e_aq_capabilities_phy_type phy_types;
+};
+
+#define I40E_HW_CAP_MAX_GPIO			30
+#define I40E_HW_CAP_MDIO_PORT_MODE_MDIO		0
+#define I40E_HW_CAP_MDIO_PORT_MODE_I2C		1
+
+/* Capabilities of a PF or a VF or the whole device */
+struct i40e_hw_capabilities {
+	u32  switch_mode;
+#define I40E_NVM_IMAGE_TYPE_EVB		0x0
+#define I40E_NVM_IMAGE_TYPE_CLOUD	0x2
+#define I40E_NVM_IMAGE_TYPE_UDP_CLOUD	0x3
+
+	u32  management_mode;
+	u32  npar_enable;
+	u32  os2bmc;
+	u32  valid_functions;
+	bool sr_iov_1_1;
+	bool vmdq;
+	bool evb_802_1_qbg; /* Edge Virtual Bridging */
+	bool evb_802_1_qbh; /* Bridge Port Extension */
+	bool dcb;
+	bool fcoe;
+	bool iscsi; /* Indicates iSCSI enabled */
+	bool flex10_enable;
+	bool flex10_capable;
+	u32  flex10_mode;
+#define I40E_FLEX10_MODE_UNKNOWN	0x0
+#define I40E_FLEX10_MODE_DCC		0x1
+#define I40E_FLEX10_MODE_DCI		0x2
+
+	u32 flex10_status;
+#define I40E_FLEX10_STATUS_DCC_ERROR	0x1
+#define I40E_FLEX10_STATUS_VC_MODE	0x2
+
+	bool mgmt_cem;
+	bool ieee_1588;
+	bool iwarp;
+	bool fd;
+	u32 fd_filters_guaranteed;
+	u32 fd_filters_best_effort;
+	bool rss;
+	u32 rss_table_size;
+	u32 rss_table_entry_width;
+	bool led[I40E_HW_CAP_MAX_GPIO];
+	bool sdp[I40E_HW_CAP_MAX_GPIO];
+	u32 nvm_image_type;
+	u32 num_flow_director_filters;
+	u32 num_vfs;
+	u32 vf_base_id;
+	u32 num_vsis;
+	u32 num_rx_qp;
+	u32 num_tx_qp;
+	u32 base_queue;
+	u32 num_msix_vectors;
+	u32 num_msix_vectors_vf;
+	u32 led_pin_num;
+	u32 sdp_pin_num;
+	u32 mdio_port_num;
+	u32 mdio_port_mode;
+	u8 rx_buf_chain_len;
+	u32 enabled_tcmap;
+	u32 maxtc;
+	u64 wr_csr_prot;
+};
+
+struct i40e_mac_info {
+	enum i40e_mac_type type;
+	u8 addr[I40E_ETH_LENGTH_OF_ADDRESS];
+	u8 perm_addr[I40E_ETH_LENGTH_OF_ADDRESS];
+	u8 san_addr[I40E_ETH_LENGTH_OF_ADDRESS];
+	u8 port_addr[I40E_ETH_LENGTH_OF_ADDRESS];
+	u16 max_fcoeq;
+};
+
+enum i40e_aq_resources_ids {
+	I40E_NVM_RESOURCE_ID = 1
+};
+
+enum i40e_aq_resource_access_type {
+	I40E_RESOURCE_READ = 1,
+	I40E_RESOURCE_WRITE
+};
+
+struct i40e_nvm_info {
+	u64 hw_semaphore_timeout; /* usec global time (GTIME resolution) */
+	u32 timeout;              /* [ms] */
+	u16 sr_size;              /* Shadow RAM size in words */
+	bool blank_nvm_mode;      /* is NVM empty (no FW present)*/
+	u16 version;              /* NVM package version */
+	u32 eetrack;              /* NVM data version */
+	u32 oem_ver;              /* OEM version info */
+};
+
+/* definitions used in NVM update support */
+
+enum i40e_nvmupd_cmd {
+	I40E_NVMUPD_INVALID,
+	I40E_NVMUPD_READ_CON,
+	I40E_NVMUPD_READ_SNT,
+	I40E_NVMUPD_READ_LCB,
+	I40E_NVMUPD_READ_SA,
+	I40E_NVMUPD_WRITE_ERA,
+	I40E_NVMUPD_WRITE_CON,
+	I40E_NVMUPD_WRITE_SNT,
+	I40E_NVMUPD_WRITE_LCB,
+	I40E_NVMUPD_WRITE_SA,
+	I40E_NVMUPD_CSUM_CON,
+	I40E_NVMUPD_CSUM_SA,
+	I40E_NVMUPD_CSUM_LCB,
+	I40E_NVMUPD_STATUS,
+	I40E_NVMUPD_EXEC_AQ,
+	I40E_NVMUPD_GET_AQ_RESULT,
+};
+
+enum i40e_nvmupd_state {
+	I40E_NVMUPD_STATE_INIT,
+	I40E_NVMUPD_STATE_READING,
+	I40E_NVMUPD_STATE_WRITING,
+	I40E_NVMUPD_STATE_INIT_WAIT,
+	I40E_NVMUPD_STATE_WRITE_WAIT,
+};
+
+/* nvm_access definition and its masks/shifts need to be accessible to
+ * application, core driver, and shared code.  Where is the right file?
+ */
+#define I40E_NVM_READ	0xB
+#define I40E_NVM_WRITE	0xC
+
+#define I40E_NVM_MOD_PNT_MASK 0xFF
+
+#define I40E_NVM_TRANS_SHIFT	8
+#define I40E_NVM_TRANS_MASK	(0xf << I40E_NVM_TRANS_SHIFT)
+#define I40E_NVM_CON		0x0
+#define I40E_NVM_SNT		0x1
+#define I40E_NVM_LCB		0x2
+#define I40E_NVM_SA		(I40E_NVM_SNT | I40E_NVM_LCB)
+#define I40E_NVM_ERA		0x4
+#define I40E_NVM_CSUM		0x8
+#define I40E_NVM_EXEC		0xf
+
+#define I40E_NVM_ADAPT_SHIFT	16
+#define I40E_NVM_ADAPT_MASK	(0xffffULL << I40E_NVM_ADAPT_SHIFT)
+
+#define I40E_NVMUPD_MAX_DATA	4096
+#define I40E_NVMUPD_IFACE_TIMEOUT 2 /* seconds */
+
+struct i40e_nvm_access {
+	u32 command;
+	u32 config;
+	u32 offset;	/* in bytes */
+	u32 data_size;	/* in bytes */
+	u8 data[1];	/* NOTE(review): presumably start of a data_size-byte payload - confirm */
+};
+
+/* PCI bus types */
+enum i40e_bus_type {
+	i40e_bus_type_unknown = 0,
+	i40e_bus_type_pci,
+	i40e_bus_type_pcix,
+	i40e_bus_type_pci_express,
+	i40e_bus_type_reserved
+};
+
+/* PCI bus speeds */
+enum i40e_bus_speed {
+	i40e_bus_speed_unknown	= 0,
+	i40e_bus_speed_33	= 33,
+	i40e_bus_speed_66	= 66,
+	i40e_bus_speed_100	= 100,
+	i40e_bus_speed_120	= 120,
+	i40e_bus_speed_133	= 133,
+	i40e_bus_speed_2500	= 2500,
+	i40e_bus_speed_5000	= 5000,
+	i40e_bus_speed_8000	= 8000,
+	i40e_bus_speed_reserved
+};
+
+/* PCI bus widths */
+enum i40e_bus_width {
+	i40e_bus_width_unknown	= 0,
+	i40e_bus_width_pcie_x1	= 1,
+	i40e_bus_width_pcie_x2	= 2,
+	i40e_bus_width_pcie_x4	= 4,
+	i40e_bus_width_pcie_x8	= 8,
+	i40e_bus_width_32	= 32,
+	i40e_bus_width_64	= 64,
+	i40e_bus_width_reserved
+};
+
+/* Bus parameters */
+struct i40e_bus_info {
+	enum i40e_bus_speed speed;
+	enum i40e_bus_width width;
+	enum i40e_bus_type type;
+
+	u16 func;
+	u16 device;
+	u16 lan_id;
+};
+
+/* Flow control (FC) parameters */
+struct i40e_fc_info {
+	enum i40e_fc_mode current_mode; /* FC mode in effect */
+	enum i40e_fc_mode requested_mode; /* FC mode requested by caller */
+};
+
+#define I40E_MAX_TRAFFIC_CLASS		8
+#define I40E_MAX_USER_PRIORITY		8
+#define I40E_DCBX_MAX_APPS		32
+#define I40E_LLDPDU_SIZE		1500
+#define I40E_TLV_STATUS_OPER		0x1
+#define I40E_TLV_STATUS_SYNC		0x2
+#define I40E_TLV_STATUS_ERR		0x4
+#define I40E_CEE_OPER_MAX_APPS		3
+#define I40E_APP_PROTOID_FCOE		0x8906
+#define I40E_APP_PROTOID_ISCSI		0x0cbc
+#define I40E_APP_PROTOID_FIP		0x8914
+#define I40E_APP_SEL_ETHTYPE		0x1
+#define I40E_APP_SEL_TCPIP		0x2
+#define I40E_CEE_APP_SEL_ETHTYPE	0x0
+#define I40E_CEE_APP_SEL_TCPIP		0x1
+
+/* CEE or IEEE 802.1Qaz ETS Configuration data */
+struct i40e_dcb_ets_config {
+	u8 willing;
+	u8 cbs;
+	u8 maxtcs;
+	u8 prioritytable[I40E_MAX_TRAFFIC_CLASS];
+	u8 tcbwtable[I40E_MAX_TRAFFIC_CLASS];
+	u8 tsatable[I40E_MAX_TRAFFIC_CLASS];
+};
+
+/* CEE or IEEE 802.1Qaz PFC Configuration data */
+struct i40e_dcb_pfc_config {
+	u8 willing;
+	u8 mbc;
+	u8 pfccap;
+	u8 pfcenable;
+};
+
+/* CEE or IEEE 802.1Qaz Application Priority data */
+struct i40e_dcb_app_priority_table {
+	u8  priority;
+	u8  selector;
+	u16 protocolid;
+};
+
+struct i40e_dcbx_config {
+	u8  dcbx_mode;
+#define I40E_DCBX_MODE_CEE	0x1
+#define I40E_DCBX_MODE_IEEE	0x2
+	u32 numapps;
+	u32 tlv_status; /* CEE mode TLV status */
+	struct i40e_dcb_ets_config etscfg;
+	struct i40e_dcb_ets_config etsrec;
+	struct i40e_dcb_pfc_config pfc;
+	struct i40e_dcb_app_priority_table app[I40E_DCBX_MAX_APPS];
+};
+
+/* Port hardware description */
+struct i40e_hw {
+	u8 *hw_addr;
+	void *back;
+
+	/* subsystem structs */
+	struct i40e_phy_info phy;
+	struct i40e_mac_info mac;
+	struct i40e_bus_info bus;
+	struct i40e_nvm_info nvm;
+	struct i40e_fc_info fc;
+
+	/* pci info */
+	u16 device_id;
+	u16 vendor_id;
+	u16 subsystem_device_id;
+	u16 subsystem_vendor_id;
+	u8 revision_id;
+	u8 port;
+	bool adapter_stopped;
+
+	/* capabilities for entire device and PCI func */
+	struct i40e_hw_capabilities dev_caps;
+	struct i40e_hw_capabilities func_caps;
+
+	/* Flow Director shared filter space */
+	u16 fdir_shared_filter_count;
+
+	/* device profile info */
+	u8  pf_id;
+	u16 main_vsi_seid;
+
+	/* for multi-function MACs */
+	u16 partition_id;
+	u16 num_partitions;
+	u16 num_ports;
+
+	/* Closest numa node to the device */
+	u16 numa_node;
+
+	/* Admin Queue info */
+	struct i40e_adminq_info aq;
+
+	/* state of nvm update process */
+	enum i40e_nvmupd_state nvmupd_state;
+	struct i40e_aq_desc nvm_wb_desc;
+	struct i40e_virt_mem nvm_buff;
+
+	/* HMC info */
+	struct i40e_hmc_info hmc; /* HMC info struct */
+
+	/* LLDP/DCBX Status */
+	u16 dcbx_status;
+
+	/* DCBX info */
+	struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */
+	struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */
+	struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */
+
+	/* debug mask */
+	u32 debug_mask;
+	char err_str[16];
+};
+
+static INLINE bool i40e_is_vf(struct i40e_hw *hw)
+{
+	/* Report whether hw->mac.type is one of the VF MAC types. */
+	bool vf = (hw->mac.type == I40E_MAC_VF);
+#ifdef X722_SUPPORT
+	vf = vf || (hw->mac.type == I40E_MAC_X722_VF);
+#endif
+	return vf;
+}
+
+struct i40e_driver_version {
+	u8 major_version;
+	u8 minor_version;
+	u8 build_version;
+	u8 subbuild_version;
+	u8 driver_string[32];
+};
+
+/* RX Descriptors */
+union i40e_16byte_rx_desc {
+	struct {
+		__le64 pkt_addr; /* Packet buffer address */
+		__le64 hdr_addr; /* Header buffer address */
+	} read;
+	struct {
+		struct {
+			struct {
+				union {
+					__le16 mirroring_status;
+					__le16 fcoe_ctx_id;
+				} mirr_fcoe;
+				__le16 l2tag1;
+			} lo_dword;
+			union {
+				__le32 rss; /* RSS Hash */
+				__le32 fd_id; /* Flow director filter id */
+				__le32 fcoe_param; /* FCoE DDP Context id */
+			} hi_dword;
+		} qword0;
+		struct {
+			/* ext status/error/pktype/length */
+			__le64 status_error_len;
+		} qword1;
+	} wb;  /* writeback */
+};
+
+union i40e_32byte_rx_desc {
+	struct {
+		__le64  pkt_addr; /* Packet buffer address */
+		__le64  hdr_addr; /* Header buffer address */
+			/* bit 0 of hdr_buffer_addr is DD bit */
+		__le64  rsvd1;
+		__le64  rsvd2;
+	} read;
+	struct {
+		struct {
+			struct {
+				union {
+					__le16 mirroring_status;
+					__le16 fcoe_ctx_id;
+				} mirr_fcoe;
+				__le16 l2tag1;
+			} lo_dword;
+			union {
+				__le32 rss; /* RSS Hash */
+				__le32 fcoe_param; /* FCoE DDP Context id */
+				/* Flow director filter id in case of
+				 * Programming status desc WB
+				 */
+				__le32 fd_id;
+			} hi_dword;
+		} qword0;
+		struct {
+			/* status/error/pktype/length */
+			__le64 status_error_len;
+		} qword1;
+		struct {
+			__le16 ext_status; /* extended status */
+			__le16 rsvd;
+			__le16 l2tag2_1;
+			__le16 l2tag2_2;
+		} qword2;
+		struct {
+			union {
+				__le32 flex_bytes_lo;
+				__le32 pe_status;
+			} lo_dword;
+			union {
+				__le32 flex_bytes_hi;
+				__le32 fd_id;
+			} hi_dword;
+		} qword3;
+	} wb;  /* writeback */
+};
+
+#define I40E_RXD_QW0_MIRROR_STATUS_SHIFT	8
+#define I40E_RXD_QW0_MIRROR_STATUS_MASK	(0x3FUL << \
+					 I40E_RXD_QW0_MIRROR_STATUS_SHIFT)
+#define I40E_RXD_QW0_FCOEINDX_SHIFT	0
+#define I40E_RXD_QW0_FCOEINDX_MASK	(0xFFFUL << \
+					 I40E_RXD_QW0_FCOEINDX_SHIFT)
+
+enum i40e_rx_desc_status_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_DESC_STATUS_DD_SHIFT		= 0,
+	I40E_RX_DESC_STATUS_EOF_SHIFT		= 1,
+	I40E_RX_DESC_STATUS_L2TAG1P_SHIFT	= 2,
+	I40E_RX_DESC_STATUS_L3L4P_SHIFT		= 3,
+	I40E_RX_DESC_STATUS_CRCP_SHIFT		= 4,
+	I40E_RX_DESC_STATUS_TSYNINDX_SHIFT	= 5, /* 2 BITS */
+	I40E_RX_DESC_STATUS_TSYNVALID_SHIFT	= 7,
+#ifdef X722_SUPPORT
+	I40E_RX_DESC_STATUS_EXT_UDP_0_SHIFT	= 8,
+#else
+	I40E_RX_DESC_STATUS_RESERVED1_SHIFT	= 8,
+#endif
+
+	I40E_RX_DESC_STATUS_UMBCAST_SHIFT	= 9, /* 2 BITS */
+	I40E_RX_DESC_STATUS_FLM_SHIFT		= 11,
+	I40E_RX_DESC_STATUS_FLTSTAT_SHIFT	= 12, /* 2 BITS */
+	I40E_RX_DESC_STATUS_LPBK_SHIFT		= 14,
+	I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT	= 15,
+	I40E_RX_DESC_STATUS_RESERVED2_SHIFT	= 16, /* 2 BITS */
+#ifdef X722_SUPPORT
+	I40E_RX_DESC_STATUS_INT_UDP_0_SHIFT	= 18,
+#else
+	I40E_RX_DESC_STATUS_UDP_0_SHIFT		= 18,
+#endif
+	I40E_RX_DESC_STATUS_LAST /* this entry must be last!!! */
+};
+
+#define I40E_RXD_QW1_STATUS_SHIFT	0
+#define I40E_RXD_QW1_STATUS_MASK	((BIT(I40E_RX_DESC_STATUS_LAST) - 1) << \
+					 I40E_RXD_QW1_STATUS_SHIFT)
+
+#define I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT   I40E_RX_DESC_STATUS_TSYNINDX_SHIFT
+#define I40E_RXD_QW1_STATUS_TSYNINDX_MASK	(0x3UL << \
+					     I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT)
+
+#define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT  I40E_RX_DESC_STATUS_TSYNVALID_SHIFT
+#define I40E_RXD_QW1_STATUS_TSYNVALID_MASK   BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT)
+
+#define I40E_RXD_QW1_STATUS_UMBCAST_SHIFT	I40E_RX_DESC_STATUS_UMBCAST_SHIFT
+#define I40E_RXD_QW1_STATUS_UMBCAST_MASK	(0x3UL << \
+					 I40E_RXD_QW1_STATUS_UMBCAST_SHIFT) /* 2-bit field */
+
+enum i40e_rx_desc_fltstat_values {
+	I40E_RX_DESC_FLTSTAT_NO_DATA	= 0,
+	I40E_RX_DESC_FLTSTAT_RSV_FD_ID	= 1, /* 16byte desc? FD_ID : RSV */
+	I40E_RX_DESC_FLTSTAT_RSV	= 2,
+	I40E_RX_DESC_FLTSTAT_RSS_HASH	= 3,
+};
+
+#define I40E_RXD_PACKET_TYPE_UNICAST	0
+#define I40E_RXD_PACKET_TYPE_MULTICAST	1
+#define I40E_RXD_PACKET_TYPE_BROADCAST	2
+#define I40E_RXD_PACKET_TYPE_MIRRORED	3
+
+#define I40E_RXD_QW1_ERROR_SHIFT	19
+#define I40E_RXD_QW1_ERROR_MASK		(0xFFUL << I40E_RXD_QW1_ERROR_SHIFT)
+
+enum i40e_rx_desc_error_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_DESC_ERROR_RXE_SHIFT		= 0,
+	I40E_RX_DESC_ERROR_RECIPE_SHIFT		= 1,
+	I40E_RX_DESC_ERROR_HBO_SHIFT		= 2,
+	I40E_RX_DESC_ERROR_L3L4E_SHIFT		= 3, /* 3 BITS */
+	I40E_RX_DESC_ERROR_IPE_SHIFT		= 3,
+	I40E_RX_DESC_ERROR_L4E_SHIFT		= 4,
+	I40E_RX_DESC_ERROR_EIPE_SHIFT		= 5,
+	I40E_RX_DESC_ERROR_OVERSIZE_SHIFT	= 6,
+	I40E_RX_DESC_ERROR_PPRS_SHIFT		= 7
+};
+
+enum i40e_rx_desc_error_l3l4e_fcoe_masks {
+	I40E_RX_DESC_ERROR_L3L4E_NONE		= 0,
+	I40E_RX_DESC_ERROR_L3L4E_PROT		= 1,
+	I40E_RX_DESC_ERROR_L3L4E_FC		= 2,
+	I40E_RX_DESC_ERROR_L3L4E_DMAC_ERR	= 3,
+	I40E_RX_DESC_ERROR_L3L4E_DMAC_WARN	= 4
+};
+
+#define I40E_RXD_QW1_PTYPE_SHIFT	30
+#define I40E_RXD_QW1_PTYPE_MASK		(0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT)
+
+/* Packet type non-ip values */
+enum i40e_rx_l2_ptype {
+	I40E_RX_PTYPE_L2_RESERVED			= 0,
+	I40E_RX_PTYPE_L2_MAC_PAY2			= 1,
+	I40E_RX_PTYPE_L2_TIMESYNC_PAY2			= 2,
+	I40E_RX_PTYPE_L2_FIP_PAY2			= 3,
+	I40E_RX_PTYPE_L2_OUI_PAY2			= 4,
+	I40E_RX_PTYPE_L2_MACCNTRL_PAY2			= 5,
+	I40E_RX_PTYPE_L2_LLDP_PAY2			= 6,
+	I40E_RX_PTYPE_L2_ECP_PAY2			= 7,
+	I40E_RX_PTYPE_L2_EVB_PAY2			= 8,
+	I40E_RX_PTYPE_L2_QCN_PAY2			= 9,
+	I40E_RX_PTYPE_L2_EAPOL_PAY2			= 10,
+	I40E_RX_PTYPE_L2_ARP				= 11,
+	I40E_RX_PTYPE_L2_FCOE_PAY3			= 12,
+	I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3		= 13,
+	I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3		= 14,
+	I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3		= 15,
+	I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA		= 16,
+	I40E_RX_PTYPE_L2_FCOE_VFT_PAY3			= 17,
+	I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA		= 18,
+	I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY			= 19,
+	I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP			= 20,
+	I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER		= 21,
+	I40E_RX_PTYPE_GRENAT4_MAC_PAY3			= 58,
+	I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4	= 87,
+	I40E_RX_PTYPE_GRENAT6_MAC_PAY3			= 124,
+	I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4	= 153
+};
+
+struct i40e_rx_ptype_decoded {
+	u32 ptype:8;
+	u32 known:1;
+	u32 outer_ip:1;
+	u32 outer_ip_ver:1;
+	u32 outer_frag:1;
+	u32 tunnel_type:3;
+	u32 tunnel_end_prot:2;
+	u32 tunnel_end_frag:1;
+	u32 inner_prot:4;
+	u32 payload_layer:3;
+};
+
+enum i40e_rx_ptype_outer_ip {
+	I40E_RX_PTYPE_OUTER_L2	= 0,
+	I40E_RX_PTYPE_OUTER_IP	= 1
+};
+
+enum i40e_rx_ptype_outer_ip_ver {
+	I40E_RX_PTYPE_OUTER_NONE	= 0,
+	I40E_RX_PTYPE_OUTER_IPV4	= 0,
+	I40E_RX_PTYPE_OUTER_IPV6	= 1
+};
+
+enum i40e_rx_ptype_outer_fragmented {
+	I40E_RX_PTYPE_NOT_FRAG	= 0,
+	I40E_RX_PTYPE_FRAG	= 1
+};
+
+enum i40e_rx_ptype_tunnel_type {
+	I40E_RX_PTYPE_TUNNEL_NONE		= 0,
+	I40E_RX_PTYPE_TUNNEL_IP_IP		= 1,
+	I40E_RX_PTYPE_TUNNEL_IP_GRENAT		= 2,
+	I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC	= 3,
+	I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN	= 4,
+};
+
+enum i40e_rx_ptype_tunnel_end_prot {
+	I40E_RX_PTYPE_TUNNEL_END_NONE	= 0,
+	I40E_RX_PTYPE_TUNNEL_END_IPV4	= 1,
+	I40E_RX_PTYPE_TUNNEL_END_IPV6	= 2,
+};
+
+enum i40e_rx_ptype_inner_prot {
+	I40E_RX_PTYPE_INNER_PROT_NONE		= 0,
+	I40E_RX_PTYPE_INNER_PROT_UDP		= 1,
+	I40E_RX_PTYPE_INNER_PROT_TCP		= 2,
+	I40E_RX_PTYPE_INNER_PROT_SCTP		= 3,
+	I40E_RX_PTYPE_INNER_PROT_ICMP		= 4,
+	I40E_RX_PTYPE_INNER_PROT_TIMESYNC	= 5
+};
+
+enum i40e_rx_ptype_payload_layer {
+	I40E_RX_PTYPE_PAYLOAD_LAYER_NONE	= 0,
+	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2	= 1,
+	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3	= 2,
+	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4	= 3,
+};
+
+#define I40E_RX_PTYPE_BIT_MASK		0x0FFFFFFF
+#define I40E_RX_PTYPE_SHIFT		56
+
+#define I40E_RXD_QW1_LENGTH_PBUF_SHIFT	38
+#define I40E_RXD_QW1_LENGTH_PBUF_MASK	(0x3FFFULL << \
+					 I40E_RXD_QW1_LENGTH_PBUF_SHIFT)
+
+#define I40E_RXD_QW1_LENGTH_HBUF_SHIFT	52
+#define I40E_RXD_QW1_LENGTH_HBUF_MASK	(0x7FFULL << \
+					 I40E_RXD_QW1_LENGTH_HBUF_SHIFT)
+
+#define I40E_RXD_QW1_LENGTH_SPH_SHIFT	63
+#define I40E_RXD_QW1_LENGTH_SPH_MASK	BIT_ULL(I40E_RXD_QW1_LENGTH_SPH_SHIFT)
+
+#define I40E_RXD_QW1_NEXTP_SHIFT	38
+#define I40E_RXD_QW1_NEXTP_MASK		(0x1FFFULL << I40E_RXD_QW1_NEXTP_SHIFT)
+
+#define I40E_RXD_QW2_EXT_STATUS_SHIFT	0
+#define I40E_RXD_QW2_EXT_STATUS_MASK	(0xFFFFFUL << \
+					 I40E_RXD_QW2_EXT_STATUS_SHIFT)
+
+enum i40e_rx_desc_ext_status_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT	= 0,
+	I40E_RX_DESC_EXT_STATUS_L2TAG3P_SHIFT	= 1,
+	I40E_RX_DESC_EXT_STATUS_FLEXBL_SHIFT	= 2, /* 2 BITS */
+	I40E_RX_DESC_EXT_STATUS_FLEXBH_SHIFT	= 4, /* 2 BITS */
+	I40E_RX_DESC_EXT_STATUS_FDLONGB_SHIFT	= 9,
+	I40E_RX_DESC_EXT_STATUS_FCOELONGB_SHIFT	= 10,
+	I40E_RX_DESC_EXT_STATUS_PELONGB_SHIFT	= 11,
+};
+
+#define I40E_RXD_QW2_L2TAG2_SHIFT	0
+#define I40E_RXD_QW2_L2TAG2_MASK	(0xFFFFUL << I40E_RXD_QW2_L2TAG2_SHIFT)
+
+#define I40E_RXD_QW2_L2TAG3_SHIFT	16
+#define I40E_RXD_QW2_L2TAG3_MASK	(0xFFFFUL << I40E_RXD_QW2_L2TAG3_SHIFT)
+
+enum i40e_rx_desc_pe_status_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_DESC_PE_STATUS_QPID_SHIFT	= 0, /* 18 BITS */
+	I40E_RX_DESC_PE_STATUS_L4PORT_SHIFT	= 0, /* 16 BITS */
+	I40E_RX_DESC_PE_STATUS_IPINDEX_SHIFT	= 16, /* 8 BITS */
+	I40E_RX_DESC_PE_STATUS_QPIDHIT_SHIFT	= 24,
+	I40E_RX_DESC_PE_STATUS_APBVTHIT_SHIFT	= 25,
+	I40E_RX_DESC_PE_STATUS_PORTV_SHIFT	= 26,
+	I40E_RX_DESC_PE_STATUS_URG_SHIFT	= 27,
+	I40E_RX_DESC_PE_STATUS_IPFRAG_SHIFT	= 28,
+	I40E_RX_DESC_PE_STATUS_IPOPT_SHIFT	= 29
+};
+
+#define I40E_RX_PROG_STATUS_DESC_LENGTH_SHIFT		38
+#define I40E_RX_PROG_STATUS_DESC_LENGTH			0x2000000
+
+#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT	2
+#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK	(0x7UL << \
+				I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT)
+
+#define I40E_RX_PROG_STATUS_DESC_QW1_STATUS_SHIFT	0
+#define I40E_RX_PROG_STATUS_DESC_QW1_STATUS_MASK	(0x7FFFUL << \
+				I40E_RX_PROG_STATUS_DESC_QW1_STATUS_SHIFT)
+
+#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT	19
+#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK		(0x3FUL << \
+				I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT)
+
+enum i40e_rx_prog_status_desc_status_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_PROG_STATUS_DESC_DD_SHIFT	= 0,
+	I40E_RX_PROG_STATUS_DESC_PROG_ID_SHIFT	= 2 /* 3 BITS */
+};
+
+enum i40e_rx_prog_status_desc_prog_id_masks {
+	I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS	= 1,
+	I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS	= 2,
+	I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS	= 4,
+};
+
+enum i40e_rx_prog_status_desc_error_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT	= 0,
+	I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT	= 1,
+	I40E_RX_PROG_STATUS_DESC_FCOE_TBL_FULL_SHIFT	= 2,
+	I40E_RX_PROG_STATUS_DESC_FCOE_CONFLICT_SHIFT	= 3
+};
+
+#define I40E_TWO_BIT_MASK	0x3
+#define I40E_THREE_BIT_MASK	0x7
+#define I40E_FOUR_BIT_MASK	0xF
+#define I40E_EIGHTEEN_BIT_MASK	0x3FFFF
+
+/* TX Descriptor */
+struct i40e_tx_desc {
+	__le64 buffer_addr; /* Address of descriptor's data buf */
+	__le64 cmd_type_offset_bsz;
+};
+
+#define I40E_TXD_QW1_DTYPE_SHIFT	0
+#define I40E_TXD_QW1_DTYPE_MASK		(0xFUL << I40E_TXD_QW1_DTYPE_SHIFT)
+
+enum i40e_tx_desc_dtype_value {
+	I40E_TX_DESC_DTYPE_DATA		= 0x0,
+	I40E_TX_DESC_DTYPE_NOP		= 0x1, /* same as Context desc */
+	I40E_TX_DESC_DTYPE_CONTEXT	= 0x1,
+	I40E_TX_DESC_DTYPE_FCOE_CTX	= 0x2,
+	I40E_TX_DESC_DTYPE_FILTER_PROG	= 0x8,
+	I40E_TX_DESC_DTYPE_DDP_CTX	= 0x9,
+	I40E_TX_DESC_DTYPE_FLEX_DATA	= 0xB,
+	I40E_TX_DESC_DTYPE_FLEX_CTX_1	= 0xC,
+	I40E_TX_DESC_DTYPE_FLEX_CTX_2	= 0xD,
+	I40E_TX_DESC_DTYPE_DESC_DONE	= 0xF
+};
+
+#define I40E_TXD_QW1_CMD_SHIFT	4
+#define I40E_TXD_QW1_CMD_MASK	(0x3FFUL << I40E_TXD_QW1_CMD_SHIFT)
+
+enum i40e_tx_desc_cmd_bits {
+	I40E_TX_DESC_CMD_EOP			= 0x0001,
+	I40E_TX_DESC_CMD_RS			= 0x0002,
+	I40E_TX_DESC_CMD_ICRC			= 0x0004,
+	I40E_TX_DESC_CMD_IL2TAG1		= 0x0008,
+	I40E_TX_DESC_CMD_DUMMY			= 0x0010,
+	I40E_TX_DESC_CMD_IIPT_NONIP		= 0x0000, /* 2 BITS */
+	I40E_TX_DESC_CMD_IIPT_IPV6		= 0x0020, /* 2 BITS */
+	I40E_TX_DESC_CMD_IIPT_IPV4		= 0x0040, /* 2 BITS */
+	I40E_TX_DESC_CMD_IIPT_IPV4_CSUM		= 0x0060, /* 2 BITS */
+	I40E_TX_DESC_CMD_FCOET			= 0x0080,
+	I40E_TX_DESC_CMD_L4T_EOFT_UNK		= 0x0000, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_TCP		= 0x0100, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_SCTP		= 0x0200, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_UDP		= 0x0300, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_EOF_N		= 0x0000, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_EOF_T		= 0x0100, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_EOF_NI	= 0x0200, /* 2 BITS */
+	I40E_TX_DESC_CMD_L4T_EOFT_EOF_A		= 0x0300, /* 2 BITS */
+};
+
+#define I40E_TXD_QW1_OFFSET_SHIFT	16
+#define I40E_TXD_QW1_OFFSET_MASK	(0x3FFFFULL << \
+					 I40E_TXD_QW1_OFFSET_SHIFT)
+
+enum i40e_tx_desc_length_fields {
+	/* Note: These are predefined bit offsets */
+	I40E_TX_DESC_LENGTH_MACLEN_SHIFT	= 0, /* 7 BITS */
+	I40E_TX_DESC_LENGTH_IPLEN_SHIFT		= 7, /* 7 BITS */
+	I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT	= 14 /* 4 BITS */
+};
+
+#define I40E_TXD_QW1_MACLEN_MASK (0x7FUL << I40E_TX_DESC_LENGTH_MACLEN_SHIFT)
+#define I40E_TXD_QW1_IPLEN_MASK  (0x7FUL << I40E_TX_DESC_LENGTH_IPLEN_SHIFT)
+#define I40E_TXD_QW1_L4LEN_MASK  (0xFUL << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT)
+#define I40E_TXD_QW1_FCLEN_MASK  (0xFUL << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT)
+
+#define I40E_TXD_QW1_TX_BUF_SZ_SHIFT	34
+#define I40E_TXD_QW1_TX_BUF_SZ_MASK	(0x3FFFULL << \
+					 I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
+
+#define I40E_TXD_QW1_L2TAG1_SHIFT	48
+#define I40E_TXD_QW1_L2TAG1_MASK	(0xFFFFULL << I40E_TXD_QW1_L2TAG1_SHIFT)
+
+/* Context descriptors */
+struct i40e_tx_context_desc {
+	__le32 tunneling_params;
+	__le16 l2tag2;
+	__le16 rsvd;
+	__le64 type_cmd_tso_mss;
+};
+
+#define I40E_TXD_CTX_QW1_DTYPE_SHIFT	0
+#define I40E_TXD_CTX_QW1_DTYPE_MASK	(0xFUL << I40E_TXD_CTX_QW1_DTYPE_SHIFT)
+
+#define I40E_TXD_CTX_QW1_CMD_SHIFT	4
+#define I40E_TXD_CTX_QW1_CMD_MASK	(0xFFFFUL << I40E_TXD_CTX_QW1_CMD_SHIFT)
+
+enum i40e_tx_ctx_desc_cmd_bits {
+	I40E_TX_CTX_DESC_TSO		= 0x01,
+	I40E_TX_CTX_DESC_TSYN		= 0x02,
+	I40E_TX_CTX_DESC_IL2TAG2	= 0x04,
+	I40E_TX_CTX_DESC_IL2TAG2_IL2H	= 0x08,
+	I40E_TX_CTX_DESC_SWTCH_NOTAG	= 0x00,	/* 2-bit field, mask 0x30 */
+	I40E_TX_CTX_DESC_SWTCH_UPLINK	= 0x10,	/* 2-bit field, mask 0x30 */
+	I40E_TX_CTX_DESC_SWTCH_LOCAL	= 0x20,	/* 2-bit field, mask 0x30 */
+	I40E_TX_CTX_DESC_SWTCH_VSI	= 0x30,	/* 2-bit field, mask 0x30 */
+	I40E_TX_CTX_DESC_SWPE		= 0x40
+};
+
+#define I40E_TXD_CTX_QW1_TSO_LEN_SHIFT	30
+#define I40E_TXD_CTX_QW1_TSO_LEN_MASK	(0x3FFFFULL << \
+					 I40E_TXD_CTX_QW1_TSO_LEN_SHIFT)
+
+#define I40E_TXD_CTX_QW1_MSS_SHIFT	50
+#define I40E_TXD_CTX_QW1_MSS_MASK	(0x3FFFULL << \
+					 I40E_TXD_CTX_QW1_MSS_SHIFT)
+
+#define I40E_TXD_CTX_QW1_VSI_SHIFT	50
+#define I40E_TXD_CTX_QW1_VSI_MASK	(0x1FFULL << I40E_TXD_CTX_QW1_VSI_SHIFT)
+
+#define I40E_TXD_CTX_QW0_EXT_IP_SHIFT	0
+#define I40E_TXD_CTX_QW0_EXT_IP_MASK	(0x3ULL << \
+					 I40E_TXD_CTX_QW0_EXT_IP_SHIFT)
+
+enum i40e_tx_ctx_desc_eipt_offload {
+	I40E_TX_CTX_EXT_IP_NONE		= 0x0,
+	I40E_TX_CTX_EXT_IP_IPV6		= 0x1,
+	I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM	= 0x2,
+	I40E_TX_CTX_EXT_IP_IPV4		= 0x3
+};
+
+#define I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT	2
+#define I40E_TXD_CTX_QW0_EXT_IPLEN_MASK	(0x3FULL << \
+					 I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT)
+
+#define I40E_TXD_CTX_QW0_NATT_SHIFT	9
+#define I40E_TXD_CTX_QW0_NATT_MASK	(0x3ULL << I40E_TXD_CTX_QW0_NATT_SHIFT)
+
+#define I40E_TXD_CTX_UDP_TUNNELING	BIT_ULL(I40E_TXD_CTX_QW0_NATT_SHIFT)
+#define I40E_TXD_CTX_GRE_TUNNELING	(0x2ULL << I40E_TXD_CTX_QW0_NATT_SHIFT)
+
+#define I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT	11
+#define I40E_TXD_CTX_QW0_EIP_NOINC_MASK	BIT_ULL(I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT)
+
+#define I40E_TXD_CTX_EIP_NOINC_IPID_CONST	I40E_TXD_CTX_QW0_EIP_NOINC_MASK
+
+#define I40E_TXD_CTX_QW0_NATLEN_SHIFT	12
+#define I40E_TXD_CTX_QW0_NATLEN_MASK	(0X7FULL << \
+					 I40E_TXD_CTX_QW0_NATLEN_SHIFT)
+
+#define I40E_TXD_CTX_QW0_DECTTL_SHIFT	19
+#define I40E_TXD_CTX_QW0_DECTTL_MASK	(0xFULL << \
+					 I40E_TXD_CTX_QW0_DECTTL_SHIFT)
+
+#ifdef X722_SUPPORT
+#define I40E_TXD_CTX_QW0_L4T_CS_SHIFT	23
+#define I40E_TXD_CTX_QW0_L4T_CS_MASK	BIT_ULL(I40E_TXD_CTX_QW0_L4T_CS_SHIFT)
+#endif
+struct i40e_nop_desc {
+	__le64 rsvd;
+	__le64 dtype_cmd;
+};
+
+#define I40E_TXD_NOP_QW1_DTYPE_SHIFT	0
+#define I40E_TXD_NOP_QW1_DTYPE_MASK	(0xFUL << I40E_TXD_NOP_QW1_DTYPE_SHIFT)
+
+#define I40E_TXD_NOP_QW1_CMD_SHIFT	4
+#define I40E_TXD_NOP_QW1_CMD_MASK	(0x7FUL << I40E_TXD_NOP_QW1_CMD_SHIFT)
+
+enum i40e_tx_nop_desc_cmd_bits {
+	/* Note: These are predefined bit offsets */
+	I40E_TX_NOP_DESC_EOP_SHIFT	= 0,
+	I40E_TX_NOP_DESC_RS_SHIFT	= 1,
+	I40E_TX_NOP_DESC_RSV_SHIFT	= 2 /* 5 bits */
+};
+
+struct i40e_filter_program_desc {
+	__le32 qindex_flex_ptype_vsi;
+	__le32 rsvd;
+	__le32 dtype_cmd_cntindex;
+	__le32 fd_id;
+};
+#define I40E_TXD_FLTR_QW0_QINDEX_SHIFT	0
+#define I40E_TXD_FLTR_QW0_QINDEX_MASK	(0x7FFUL << \
+					 I40E_TXD_FLTR_QW0_QINDEX_SHIFT)
+#define I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT	11
+#define I40E_TXD_FLTR_QW0_FLEXOFF_MASK	(0x7UL << \
+					 I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT)
+#define I40E_TXD_FLTR_QW0_PCTYPE_SHIFT	17
+#define I40E_TXD_FLTR_QW0_PCTYPE_MASK	(0x3FUL << \
+					 I40E_TXD_FLTR_QW0_PCTYPE_SHIFT)
+
+/* Packet Classifier Types for filters */
+enum i40e_filter_pctype {
+#ifdef X722_SUPPORT
+	/* Note: Values 0-28 are reserved for future use.
+	 * Values 29, 30 and 32 are not supported on XL710 and X710.
+	 */
+	I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP	= 29,
+	I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP	= 30,
+#else
+	/* Note: Values 0-30 are reserved for future use */
+#endif
+	I40E_FILTER_PCTYPE_NONF_IPV4_UDP		= 31,
+#ifdef X722_SUPPORT
+	I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK	= 32,
+#else
+	/* Note: Value 32 is reserved for future use */
+#endif
+	I40E_FILTER_PCTYPE_NONF_IPV4_TCP		= 33,
+	I40E_FILTER_PCTYPE_NONF_IPV4_SCTP		= 34,
+	I40E_FILTER_PCTYPE_NONF_IPV4_OTHER		= 35,
+	I40E_FILTER_PCTYPE_FRAG_IPV4			= 36,
+#ifdef X722_SUPPORT
+	/* Note: Values 37-38 are reserved for future use.
+	 * Values 39, 40 and 42 are not supported on XL710 and X710.
+	 */
+	I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP	= 39,
+	I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP	= 40,
+#else
+	/* Note: Values 37-40 are reserved for future use */
+#endif
+	I40E_FILTER_PCTYPE_NONF_IPV6_UDP		= 41,
+#ifdef X722_SUPPORT
+	I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK	= 42,
+#endif
+	I40E_FILTER_PCTYPE_NONF_IPV6_TCP		= 43,
+	I40E_FILTER_PCTYPE_NONF_IPV6_SCTP		= 44,
+	I40E_FILTER_PCTYPE_NONF_IPV6_OTHER		= 45,
+	I40E_FILTER_PCTYPE_FRAG_IPV6			= 46,
+	/* Note: Value 47 is reserved for future use */
+	I40E_FILTER_PCTYPE_FCOE_OX			= 48,
+	I40E_FILTER_PCTYPE_FCOE_RX			= 49,
+	I40E_FILTER_PCTYPE_FCOE_OTHER			= 50,
+	/* Note: Values 51-62 are reserved for future use */
+	I40E_FILTER_PCTYPE_L2_PAYLOAD			= 63,
+};
+
+enum i40e_filter_program_desc_dest {
+	I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET		= 0x0,
+	I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX	= 0x1,
+	I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_OTHER	= 0x2,
+};
+
+enum i40e_filter_program_desc_fd_status {
+	I40E_FILTER_PROGRAM_DESC_FD_STATUS_NONE			= 0x0,
+	I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID		= 0x1,
+	I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID_4FLEX_BYTES	= 0x2,
+	I40E_FILTER_PROGRAM_DESC_FD_STATUS_8FLEX_BYTES		= 0x3,
+};
+
+#define I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT	23
+#define I40E_TXD_FLTR_QW0_DEST_VSI_MASK	BIT_ULL(I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT)
+
+#define I40E_TXD_FLTR_QW1_DTYPE_SHIFT	0
+#define I40E_TXD_FLTR_QW1_DTYPE_MASK	(0xFUL << I40E_TXD_FLTR_QW1_DTYPE_SHIFT)
+
+#define I40E_TXD_FLTR_QW1_CMD_SHIFT	4
+#define I40E_TXD_FLTR_QW1_CMD_MASK	(0xFFFFULL << \
+					 I40E_TXD_FLTR_QW1_CMD_SHIFT)
+
+#define I40E_TXD_FLTR_QW1_PCMD_SHIFT	(0x0ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT)
+#define I40E_TXD_FLTR_QW1_PCMD_MASK	(0x7ULL << I40E_TXD_FLTR_QW1_PCMD_SHIFT)
+
+enum i40e_filter_program_desc_pcmd {
+	I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE	= 0x1,
+	I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE		= 0x2,
+};
+
+#define I40E_TXD_FLTR_QW1_DEST_SHIFT	(0x3ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT)
+#define I40E_TXD_FLTR_QW1_DEST_MASK	(0x3ULL << I40E_TXD_FLTR_QW1_DEST_SHIFT)
+
+#define I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT	(0x7ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT)
+#define I40E_TXD_FLTR_QW1_CNT_ENA_MASK	BIT_ULL(I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT)
+
+#define I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT	(0x9ULL + \
+						 I40E_TXD_FLTR_QW1_CMD_SHIFT)
+#define I40E_TXD_FLTR_QW1_FD_STATUS_MASK (0x3ULL << \
+					  I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT)
+#ifdef X722_SUPPORT
+
+#define I40E_TXD_FLTR_QW1_ATR_SHIFT	(0xEULL + \
+					 I40E_TXD_FLTR_QW1_CMD_SHIFT)
+#define I40E_TXD_FLTR_QW1_ATR_MASK	BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT)
+#endif
+
+#define I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT 20
+#define I40E_TXD_FLTR_QW1_CNTINDEX_MASK	(0x1FFUL << \
+					 I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT)
+
+enum i40e_filter_type {
+	I40E_FLOW_DIRECTOR_FLTR = 0,
+	I40E_PE_QUAD_HASH_FLTR = 1,
+	I40E_ETHERTYPE_FLTR,	/* = 2 (implicit) */
+	I40E_FCOE_CTX_FLTR,	/* = 3 (implicit) */
+	I40E_MAC_VLAN_FLTR,	/* = 4 (implicit) */
+	I40E_HASH_FLTR		/* = 5 (implicit) */
+};
+
+struct i40e_vsi_context {
+	u16 seid;
+	u16 uplink_seid;
+	u16 vsi_number;
+	u16 vsis_allocated;
+	u16 vsis_unallocated;
+	u16 flags;
+	u8 pf_num;
+	u8 vf_num;
+	u8 connection_type;
+	struct i40e_aqc_vsi_properties_data info;
+};
+
+struct i40e_veb_context {
+	u16 seid;
+	u16 uplink_seid;
+	u16 veb_number;
+	u16 vebs_allocated;
+	u16 vebs_unallocated;
+	u16 flags;
+	struct i40e_aqc_get_veb_parameters_completion info;
+};
+
+/* Statistics collected by each port, VSI, VEB, and S-channel */
+struct i40e_eth_stats {
+	u64 rx_bytes;			/* gorc */
+	u64 rx_unicast;			/* uprc */
+	u64 rx_multicast;		/* mprc */
+	u64 rx_broadcast;		/* bprc */
+	u64 rx_discards;		/* rdpc */
+	u64 rx_unknown_protocol;	/* rupp */
+	u64 tx_bytes;			/* gotc */
+	u64 tx_unicast;			/* uptc */
+	u64 tx_multicast;		/* mptc */
+	u64 tx_broadcast;		/* bptc */
+	u64 tx_discards;		/* tdpc */
+	u64 tx_errors;			/* tepc */
+};
+
+/* Statistics collected per VEB per TC */
+struct i40e_veb_tc_stats {
+	u64 tc_rx_packets[I40E_MAX_TRAFFIC_CLASS];
+	u64 tc_rx_bytes[I40E_MAX_TRAFFIC_CLASS];
+	u64 tc_tx_packets[I40E_MAX_TRAFFIC_CLASS];
+	u64 tc_tx_bytes[I40E_MAX_TRAFFIC_CLASS];
+};
+
+/* Statistics collected by the MAC */
+struct i40e_hw_port_stats {
+	/* eth stats collected by the port */
+	struct i40e_eth_stats eth;
+
+	/* additional port specific stats */
+	u64 tx_dropped_link_down;	/* tdold */
+	u64 crc_errors;			/* crcerrs */
+	u64 illegal_bytes;		/* illerrc */
+	u64 error_bytes;		/* errbc */
+	u64 mac_local_faults;		/* mlfc */
+	u64 mac_remote_faults;		/* mrfc */
+	u64 rx_length_errors;		/* rlec */
+	u64 link_xon_rx;		/* lxonrxc */
+	u64 link_xoff_rx;		/* lxoffrxc */
+	u64 priority_xon_rx[8];		/* pxonrxc[8] */
+	u64 priority_xoff_rx[8];	/* pxoffrxc[8] */
+	u64 link_xon_tx;		/* lxontxc */
+	u64 link_xoff_tx;		/* lxofftxc */
+	u64 priority_xon_tx[8];		/* pxontxc[8] */
+	u64 priority_xoff_tx[8];	/* pxofftxc[8] */
+	u64 priority_xon_2_xoff[8];	/* rxon2offcnt[8] */
+	u64 rx_size_64;			/* prc64 */
+	u64 rx_size_127;		/* prc127 */
+	u64 rx_size_255;		/* prc255 */
+	u64 rx_size_511;		/* prc511 */
+	u64 rx_size_1023;		/* prc1023 */
+	u64 rx_size_1522;		/* prc1522 */
+	u64 rx_size_big;		/* prc9522 */
+	u64 rx_undersize;		/* ruc */
+	u64 rx_fragments;		/* rfc */
+	u64 rx_oversize;		/* roc */
+	u64 rx_jabber;			/* rjc */
+	u64 tx_size_64;			/* ptc64 */
+	u64 tx_size_127;		/* ptc127 */
+	u64 tx_size_255;		/* ptc255 */
+	u64 tx_size_511;		/* ptc511 */
+	u64 tx_size_1023;		/* ptc1023 */
+	u64 tx_size_1522;		/* ptc1522 */
+	u64 tx_size_big;		/* ptc9522 */
+	u64 mac_short_packet_dropped;	/* mspdc */
+	u64 checksum_error;		/* xec */
+	/* flow director stats */
+	u64 fd_atr_match;
+	u64 fd_sb_match;
+	u64 fd_atr_tunnel_match;
+	u32 fd_atr_status;
+	u32 fd_sb_status;
+	/* EEE LPI */
+	u32 tx_lpi_status;
+	u32 rx_lpi_status;
+	u64 tx_lpi_count;		/* etlpic */
+	u64 rx_lpi_count;		/* erlpic */
+};
+
+/* Checksum and Shadow RAM pointers */
+#define I40E_SR_NVM_CONTROL_WORD		0x00
+#define I40E_SR_PCIE_ANALOG_CONFIG_PTR		0x03
+#define I40E_SR_PHY_ANALOG_CONFIG_PTR		0x04
+#define I40E_SR_OPTION_ROM_PTR			0x05
+#define I40E_SR_RO_PCIR_REGS_AUTO_LOAD_PTR	0x06
+#define I40E_SR_AUTO_GENERATED_POINTERS_PTR	0x07
+#define I40E_SR_PCIR_REGS_AUTO_LOAD_PTR		0x08
+#define I40E_SR_EMP_GLOBAL_MODULE_PTR		0x09
+#define I40E_SR_RO_PCIE_LCB_PTR			0x0A
+#define I40E_SR_EMP_IMAGE_PTR			0x0B
+#define I40E_SR_PE_IMAGE_PTR			0x0C
+#define I40E_SR_CSR_PROTECTED_LIST_PTR		0x0D
+#define I40E_SR_MNG_CONFIG_PTR			0x0E
+#define I40E_SR_EMP_MODULE_PTR			0x0F
+#define I40E_SR_PBA_FLAGS			0x15
+#define I40E_SR_PBA_BLOCK_PTR			0x16
+#define I40E_SR_BOOT_CONFIG_PTR			0x17
+#define I40E_NVM_OEM_VER_OFF			0x83
+#define I40E_SR_NVM_DEV_STARTER_VERSION		0x18
+#define I40E_SR_NVM_WAKE_ON_LAN			0x19
+#define I40E_SR_ALTERNATE_SAN_MAC_ADDRESS_PTR	0x27
+#define I40E_SR_PERMANENT_SAN_MAC_ADDRESS_PTR	0x28
+#define I40E_SR_NVM_MAP_VERSION			0x29
+#define I40E_SR_NVM_IMAGE_VERSION		0x2A
+#define I40E_SR_NVM_STRUCTURE_VERSION		0x2B
+#define I40E_SR_NVM_EETRACK_LO			0x2D
+#define I40E_SR_NVM_EETRACK_HI			0x2E
+#define I40E_SR_VPD_PTR				0x2F
+#define I40E_SR_PXE_SETUP_PTR			0x30
+#define I40E_SR_PXE_CONFIG_CUST_OPTIONS_PTR	0x31
+#define I40E_SR_NVM_ORIGINAL_EETRACK_LO		0x34
+#define I40E_SR_NVM_ORIGINAL_EETRACK_HI		0x35
+#define I40E_SR_SW_ETHERNET_MAC_ADDRESS_PTR	0x37
+#define I40E_SR_POR_REGS_AUTO_LOAD_PTR		0x38
+#define I40E_SR_EMPR_REGS_AUTO_LOAD_PTR		0x3A
+#define I40E_SR_GLOBR_REGS_AUTO_LOAD_PTR	0x3B
+#define I40E_SR_CORER_REGS_AUTO_LOAD_PTR	0x3C
+#define I40E_SR_PCIE_ALT_AUTO_LOAD_PTR		0x3E
+#define I40E_SR_SW_CHECKSUM_WORD		0x3F
+#define I40E_SR_1ST_FREE_PROVISION_AREA_PTR	0x40
+#define I40E_SR_4TH_FREE_PROVISION_AREA_PTR	0x42
+#define I40E_SR_3RD_FREE_PROVISION_AREA_PTR	0x44
+#define I40E_SR_2ND_FREE_PROVISION_AREA_PTR	0x46
+#define I40E_SR_EMP_SR_SETTINGS_PTR		0x48
+#define I40E_SR_FEATURE_CONFIGURATION_PTR	0x49
+#define I40E_SR_CONFIGURATION_METADATA_PTR	0x4D
+#define I40E_SR_IMMEDIATE_VALUES_PTR		0x4E
+
+/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */
+#define I40E_SR_VPD_MODULE_MAX_SIZE		1024
+#define I40E_SR_PCIE_ALT_MODULE_MAX_SIZE	1024
+#define I40E_SR_CONTROL_WORD_1_SHIFT		0x06
+#define I40E_SR_CONTROL_WORD_1_MASK	(0x03 << I40E_SR_CONTROL_WORD_1_SHIFT)
+
+/* Shadow RAM related */
+#define I40E_SR_SECTOR_SIZE_IN_WORDS	0x800
+#define I40E_SR_BUF_ALIGNMENT		4096
+#define I40E_SR_WORDS_IN_1KB		512
+/* Checksum should be calculated such that after adding all the words,
+ * including the checksum word itself, the sum should be 0xBABA.
+ */
+#define I40E_SR_SW_CHECKSUM_BASE	0xBABA
+
+#define I40E_SRRD_SRCTL_ATTEMPTS	100000
+
+enum i40e_switch_element_types {
+	I40E_SWITCH_ELEMENT_TYPE_MAC	= 1,
+	I40E_SWITCH_ELEMENT_TYPE_PF	= 2,
+	I40E_SWITCH_ELEMENT_TYPE_VF	= 3,
+	I40E_SWITCH_ELEMENT_TYPE_EMP	= 4,
+	I40E_SWITCH_ELEMENT_TYPE_BMC	= 6,
+	I40E_SWITCH_ELEMENT_TYPE_PE	= 16,
+	I40E_SWITCH_ELEMENT_TYPE_VEB	= 17,
+	I40E_SWITCH_ELEMENT_TYPE_PA	= 18,
+	I40E_SWITCH_ELEMENT_TYPE_VSI	= 19,
+};
+
+/* Supported EtherType filters */
+enum i40e_ether_type_index {
+	I40E_ETHER_TYPE_1588		= 0,
+	I40E_ETHER_TYPE_FIP		= 1,
+	I40E_ETHER_TYPE_OUI_EXTENDED	= 2,
+	I40E_ETHER_TYPE_MAC_CONTROL	= 3,
+	I40E_ETHER_TYPE_LLDP		= 4,
+	I40E_ETHER_TYPE_EVB_PROTOCOL1	= 5,
+	I40E_ETHER_TYPE_EVB_PROTOCOL2	= 6,
+	I40E_ETHER_TYPE_QCN_CNM		= 7,
+	I40E_ETHER_TYPE_8021X		= 8,
+	I40E_ETHER_TYPE_ARP		= 9,
+	I40E_ETHER_TYPE_RSV1		= 10,
+	I40E_ETHER_TYPE_RSV2		= 11,
+};
+
+/* Filter context base size is 1K */
+#define I40E_HASH_FILTER_BASE_SIZE	1024
+/* Supported Hash filter values */
+enum i40e_hash_filter_size { /* value = log2(named size / 1K) */
+	I40E_HASH_FILTER_SIZE_1K	= 0,
+	I40E_HASH_FILTER_SIZE_2K	= 1,
+	I40E_HASH_FILTER_SIZE_4K	= 2,
+	I40E_HASH_FILTER_SIZE_8K	= 3,
+	I40E_HASH_FILTER_SIZE_16K	= 4,
+	I40E_HASH_FILTER_SIZE_32K	= 5,
+	I40E_HASH_FILTER_SIZE_64K	= 6,
+	I40E_HASH_FILTER_SIZE_128K	= 7,
+	I40E_HASH_FILTER_SIZE_256K	= 8,
+	I40E_HASH_FILTER_SIZE_512K	= 9,
+	I40E_HASH_FILTER_SIZE_1M	= 10,
+};
+
+/* DMA context base size is 0.5K */
+#define I40E_DMA_CNTX_BASE_SIZE		512
+/* Supported DMA context values */
+enum i40e_dma_cntx_size { /* value = log2(named size / 512) */
+	I40E_DMA_CNTX_SIZE_512		= 0,
+	I40E_DMA_CNTX_SIZE_1K		= 1,
+	I40E_DMA_CNTX_SIZE_2K		= 2,
+	I40E_DMA_CNTX_SIZE_4K		= 3,
+	I40E_DMA_CNTX_SIZE_8K		= 4,
+	I40E_DMA_CNTX_SIZE_16K		= 5,
+	I40E_DMA_CNTX_SIZE_32K		= 6,
+	I40E_DMA_CNTX_SIZE_64K		= 7,
+	I40E_DMA_CNTX_SIZE_128K		= 8,
+	I40E_DMA_CNTX_SIZE_256K		= 9,
+};
+
+/* Supported Hash look up table (LUT) sizes */
+enum i40e_hash_lut_size {
+	I40E_HASH_LUT_SIZE_128		= 0,
+	I40E_HASH_LUT_SIZE_512		= 1,
+};
+
+/* Structure to hold per-PF filter control settings */
+struct i40e_filter_control_settings {
+	/* number of PE Quad Hash filter buckets */
+	enum i40e_hash_filter_size pe_filt_num;
+	/* number of PE Quad Hash contexts */
+	enum i40e_dma_cntx_size pe_cntx_num;
+	/* number of FCoE filter buckets */
+	enum i40e_hash_filter_size fcoe_filt_num;
+	/* number of FCoE DDP contexts */
+	enum i40e_dma_cntx_size fcoe_cntx_num;
+	/* size of the Hash LUT */
+	enum i40e_hash_lut_size	hash_lut_size;
+	/* enable FDIR filters for PF and its VFs */
+	bool enable_fdir;
+	/* enable Ethertype filters for PF and its VFs */
+	bool enable_ethtype;
+	/* enable MAC/VLAN filters for PF and its VFs */
+	bool enable_macvlan;
+};
+
+/* Structure to hold device level control filter counts */
+struct i40e_control_filter_stats {
+	u16 mac_etype_used;   /* Used perfect match MAC/EtherType filters */
+	u16 etype_used;       /* Used perfect EtherType filters */
+	u16 mac_etype_free;   /* Un-used perfect match MAC/EtherType filters */
+	u16 etype_free;       /* Un-used perfect EtherType filters */
+};
+
+enum i40e_reset_type {
+	I40E_RESET_POR		= 0,
+	I40E_RESET_CORER	= 1,
+	I40E_RESET_GLOBR	= 2,
+	I40E_RESET_EMPR		= 3,
+};
+
+/* IEEE 802.1AB LLDP Agent Variables from NVM */
+#define I40E_NVM_LLDP_CFG_PTR		0xD
+struct i40e_lldp_variables {
+	u16 length;
+	u16 adminstatus;
+	u16 msgfasttx;
+	u16 msgtxinterval;
+	u16 txparams;
+	u16 timers;
+	u16 crc8;
+};
+
+/* Offsets into Alternate Ram */
+#define I40E_ALT_STRUCT_FIRST_PF_OFFSET		0   /* in dwords */
+#define I40E_ALT_STRUCT_DWORDS_PER_PF		64   /* in dwords */
+#define I40E_ALT_STRUCT_OUTER_VLAN_TAG_OFFSET	0xD  /* in dwords */
+#define I40E_ALT_STRUCT_USER_PRIORITY_OFFSET	0xC  /* in dwords */
+#define I40E_ALT_STRUCT_MIN_BW_OFFSET		0xE  /* in dwords */
+#define I40E_ALT_STRUCT_MAX_BW_OFFSET		0xF  /* in dwords */
+
+/* Alternate Ram Bandwidth Masks */
+#define I40E_ALT_BW_VALUE_MASK		0xFF
+#define I40E_ALT_BW_RELATIVE_MASK	0x40000000
+#define I40E_ALT_BW_VALID_MASK		0x80000000
+
+/* RSS Hash Table Size */
+#define I40E_PFQF_CTL_0_HASHLUTSIZE_512	0x00010000
+
+/* PBA length (and one with additional zero-padding byte), see Table 6-2. */
+#define	I40E_PBANUM_LENGTH	12
+#define	I40E_PBANUM_STRLEN	13
+
+#endif /* _I40E_TYPE_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h b/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h
new file mode 100644
index 0000000000..17b090f454
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h
@@ -0,0 +1,378 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation 
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without 
+  modification, are permitted provided that the following conditions are met:
+  
+   1. Redistributions of source code must retain the above copyright notice, 
+      this list of conditions and the following disclaimer.
+  
+   2. Redistributions in binary form must reproduce the above copyright 
+      notice, this list of conditions and the following disclaimer in the 
+      documentation and/or other materials provided with the distribution.
+  
+   3. Neither the name of the Intel Corporation nor the names of its 
+      contributors may be used to endorse or promote products derived from 
+      this software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_virtchnl.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_VIRTCHNL_H_
+#define _I40E_VIRTCHNL_H_
+
+#include "i40e_type.h"
+
+/* Description:
+ * This header file describes the VF-PF communication protocol used
+ * by the various i40e drivers.
+ *
+ * Admin queue buffer usage:
+ * desc->opcode is always i40e_aqc_opc_send_msg_to_pf
+ * flags, retval, datalen, and data addr are all used normally.
+ * Firmware copies the cookie fields when sending messages between the PF and
+ * VF, but uses all other fields internally. Due to this limitation, we
+ * must send all messages as "indirect", i.e. using an external buffer.
+ *
+ * All the vsi indexes are relative to the VF. Each VF can have a maximum of
+ * three VSIs. All the queue indexes are relative to the VSI.  Each VF can
+ * have a maximum of sixteen queues for all of its VSIs.
+ *
+ * The PF is required to return a status code in v_retval for all messages
+ * except RESET_VF, which does not require any response. The return value is of
+ * i40e_status_code type, defined in the i40e_type.h.
+ *
+ * In general, VF driver initialization should roughly follow the order of these
+ * opcodes. The VF driver must first validate the API version of the PF driver,
+ * then request a reset, then get resources, then configure queues and
+ * interrupts. After these operations are complete, the VF driver may start
+ * its queues, optionally add MAC and VLAN filters, and process traffic.
+ */
+
+/* Opcodes for VF-PF communication. These are placed in the v_opcode field
+ * of the virtchnl_msg structure.
+ */
+enum i40e_virtchnl_ops {
+/* The PF sends status change events to VFs using
+ * the I40E_VIRTCHNL_OP_EVENT opcode.
+ * VFs send requests to the PF using the other ops.
+ */
+	I40E_VIRTCHNL_OP_UNKNOWN = 0,
+	I40E_VIRTCHNL_OP_VERSION = 1, /* must ALWAYS be 1 */
+	I40E_VIRTCHNL_OP_RESET_VF = 2,
+	I40E_VIRTCHNL_OP_GET_VF_RESOURCES = 3,
+	I40E_VIRTCHNL_OP_CONFIG_TX_QUEUE = 4,
+	I40E_VIRTCHNL_OP_CONFIG_RX_QUEUE = 5,
+	I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES = 6,
+	I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP = 7,
+	I40E_VIRTCHNL_OP_ENABLE_QUEUES = 8,
+	I40E_VIRTCHNL_OP_DISABLE_QUEUES = 9,
+	I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS = 10,
+	I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS = 11,
+	I40E_VIRTCHNL_OP_ADD_VLAN = 12,
+	I40E_VIRTCHNL_OP_DEL_VLAN = 13,
+	I40E_VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14,
+	I40E_VIRTCHNL_OP_GET_STATS = 15,
+	I40E_VIRTCHNL_OP_FCOE = 16,
+	I40E_VIRTCHNL_OP_EVENT = 17,
+};
+
+/* Virtual channel message descriptor. This overlays the admin queue
+ * descriptor. All other data is passed in external buffers.
+ */
+
+struct i40e_virtchnl_msg {
+	u8 pad[8];			 /* AQ flags/opcode/len/retval fields */
+	enum i40e_virtchnl_ops v_opcode; /* avoid confusion with desc->opcode */
+	enum i40e_status_code v_retval;  /* ditto for desc->retval */
+	u32 vfid;			 /* used by PF when sending to VF */
+};
+
+/* Message descriptions and data structures.*/
+
+/* I40E_VIRTCHNL_OP_VERSION
+ * VF posts its version number to the PF. PF responds with its version number
+ * in the same format, along with a return code.
+ * Reply from PF has its major/minor versions also in param0 and param1.
+ * If there is a major version mismatch, then the VF cannot operate.
+ * If there is a minor version mismatch, then the VF can operate but should
+ * add a warning to the system log.
+ *
+ * This enum element MUST always be specified as == 1, regardless of other
+ * changes in the API. The PF must always respond to this message without
+ * error regardless of version mismatch.
+ */
+#define I40E_VIRTCHNL_VERSION_MAJOR		1
+#define I40E_VIRTCHNL_VERSION_MINOR		1
+#define I40E_VIRTCHNL_VERSION_MINOR_NO_VF_CAPS	0
+
+struct i40e_virtchnl_version_info {
+	u32 major;
+	u32 minor;
+};
+
+/* I40E_VIRTCHNL_OP_RESET_VF
+ * VF sends this request to PF with no parameters
+ * PF does NOT respond! VF driver must delay then poll VFGEN_RSTAT register
+ * until reset completion is indicated. The admin queue must be reinitialized
+ * after this operation.
+ *
+ * When reset is complete, PF must ensure that all queues in all VSIs associated
+ * with the VF are stopped, all queue configurations in the HMC are set to 0,
+ * and all MAC and VLAN filters (except the default MAC address) on all VSIs
+ * are cleared.
+ */
+
+/* I40E_VIRTCHNL_OP_GET_VF_RESOURCES
+ * Version 1.0 VF sends this request to PF with no parameters
+ * Version 1.1 VF sends this request to PF with u32 bitmap of its capabilities
+ * PF responds with an indirect message containing
+ * i40e_virtchnl_vf_resource and one or more
+ * i40e_virtchnl_vsi_resource structures.
+ */
+
+struct i40e_virtchnl_vsi_resource {
+	u16 vsi_id;
+	u16 num_queue_pairs;
+	enum i40e_vsi_type vsi_type;
+	u16 qset_handle;
+	u8 default_mac_addr[I40E_ETH_LENGTH_OF_ADDRESS];
+};
+/* VF offload flags */
+#define I40E_VIRTCHNL_VF_OFFLOAD_L2		0x00000001
+#define I40E_VIRTCHNL_VF_OFFLOAD_IWARP		0x00000002
+#define I40E_VIRTCHNL_VF_OFFLOAD_FCOE		0x00000004
+#define I40E_VIRTCHNL_VF_OFFLOAD_RSS_AQ		0x00000008
+#define I40E_VIRTCHNL_VF_OFFLOAD_RSS_REG	0x00000010
+#define I40E_VIRTCHNL_VF_OFFLOAD_VLAN		0x00010000
+#define I40E_VIRTCHNL_VF_OFFLOAD_RX_POLLING	0x00020000
+
+struct i40e_virtchnl_vf_resource {
+	u16 num_vsis;
+	u16 num_queue_pairs;
+	u16 max_vectors;
+	u16 max_mtu;
+
+	u32 vf_offload_flags;	/* presumably I40E_VIRTCHNL_VF_OFFLOAD_* bits */
+	u32 max_fcoe_contexts;
+	u32 max_fcoe_filters;
+
+	struct i40e_virtchnl_vsi_resource vsi_res[1]; /* first of one or more entries; presumably num_vsis total -- confirm */
+};
+
+/* I40E_VIRTCHNL_OP_CONFIG_TX_QUEUE
+ * VF sends this message to set up parameters for one TX queue.
+ * External data buffer contains one instance of i40e_virtchnl_txq_info.
+ * PF configures requested queue and returns a status code.
+ */
+
+/* Tx queue config info */
+struct i40e_virtchnl_txq_info {
+	u16 vsi_id;
+	u16 queue_id;
+	u16 ring_len;		/* number of descriptors, multiple of 8 */
+	u16 headwb_enabled;
+	u64 dma_ring_addr;
+	u64 dma_headwb_addr;
+};
+
+/* I40E_VIRTCHNL_OP_CONFIG_RX_QUEUE
+ * VF sends this message to set up parameters for one RX queue.
+ * External data buffer contains one instance of i40e_virtchnl_rxq_info.
+ * PF configures requested queue and returns a status code.
+ */
+
+/* Rx queue config info */
+struct i40e_virtchnl_rxq_info {
+	u16 vsi_id;
+	u16 queue_id;
+	u32 ring_len;		/* number of descriptors, multiple of 32 */
+	u16 hdr_size;
+	u16 splithdr_enabled;
+	u32 databuffer_size;
+	u32 max_pkt_size;
+	u64 dma_ring_addr;
+	enum i40e_hmc_obj_rx_hsplit_0 rx_split_pos;
+};
+
+/* I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES
+ * VF sends this message to set parameters for all active TX and RX queues
+ * associated with the specified VSI.
+ * PF configures queues and returns status.
+ * If the number of queues specified is greater than the number of queues
+ * associated with the VSI, an error is returned and no queues are configured.
+ */
+struct i40e_virtchnl_queue_pair_info {
+	/* NOTE: vsi_id and queue_id should be identical for both queues. */
+	struct i40e_virtchnl_txq_info txq;
+	struct i40e_virtchnl_rxq_info rxq;
+};
+
+struct i40e_virtchnl_vsi_queue_config_info {
+	u16 vsi_id;
+	u16 num_queue_pairs;
+	struct i40e_virtchnl_queue_pair_info qpair[1]; /* first entry; presumably num_queue_pairs total -- confirm */
+};
+
+/* I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP
+ * VF uses this message to map vectors to queues.
+ * The rxq_map and txq_map fields are bitmaps used to indicate which queues
+ * are to be associated with the specified vector.
+ * The "other" causes are always mapped to vector 0.
+ * PF configures interrupt mapping and returns status.
+ */
+struct i40e_virtchnl_vector_map {
+	u16 vsi_id;
+	u16 vector_id;
+	u16 rxq_map;
+	u16 txq_map;
+	u16 rxitr_idx;
+	u16 txitr_idx;
+};
+
+struct i40e_virtchnl_irq_map_info {
+	u16 num_vectors;
+	struct i40e_virtchnl_vector_map vecmap[1]; /* first entry; presumably num_vectors total -- confirm */
+};
+
+/* I40E_VIRTCHNL_OP_ENABLE_QUEUES
+ * I40E_VIRTCHNL_OP_DISABLE_QUEUES
+ * VF sends these messages to enable or disable TX/RX queue pairs.
+ * The queues fields are bitmaps indicating which queues to act upon.
+ * (Currently, we only support 16 queues per VF, but we make the field
+ * u32 to allow for expansion.)
+ * PF performs requested action and returns status.
+ */
+struct i40e_virtchnl_queue_select {
+	u16 vsi_id;
+	u16 pad;
+	u32 rx_queues;
+	u32 tx_queues;
+};
+
+/* I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS
+ * VF sends this message in order to add one or more unicast or multicast
+ * address filters for the specified VSI.
+ * PF adds the filters and returns status.
+ */
+
+/* I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS
+ * VF sends this message in order to remove one or more unicast or multicast
+ * filters for the specified VSI.
+ * PF removes the filters and returns status.
+ */
+
+struct i40e_virtchnl_ether_addr {
+	u8 addr[I40E_ETH_LENGTH_OF_ADDRESS];
+	u8 pad[2];
+};
+
+struct i40e_virtchnl_ether_addr_list {
+	u16 vsi_id;
+	u16 num_elements;
+	struct i40e_virtchnl_ether_addr list[1]; /* first of one or more entries; presumably num_elements total -- confirm */
+};
+
+/* I40E_VIRTCHNL_OP_ADD_VLAN
+ * VF sends this message to add one or more VLAN tag filters for receives.
+ * PF adds the filters and returns status.
+ * If a port VLAN is configured by the PF, this operation will return an
+ * error to the VF.
+ */
+
+/* I40E_VIRTCHNL_OP_DEL_VLAN
+ * VF sends this message to remove one or more VLAN tag filters for receives.
+ * PF removes the filters and returns status.
+ * If a port VLAN is configured by the PF, this operation will return an
+ * error to the VF.
+ */
+
+struct i40e_virtchnl_vlan_filter_list {
+	u16 vsi_id;
+	u16 num_elements;
+	u16 vlan_id[1]; /* first of one or more VLAN tags; presumably num_elements total -- confirm */
+};
+
+/* I40E_VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE
+ * VF sends VSI id and flags.
+ * PF returns status code in retval.
+ * Note: we assume that broadcast accept mode is always enabled.
+ */
+struct i40e_virtchnl_promisc_info {
+	u16 vsi_id;
+	u16 flags;	/* presumably I40E_FLAG_VF_*_PROMISC bits -- confirm */
+};
+
+#define I40E_FLAG_VF_UNICAST_PROMISC	0x00000001
+#define I40E_FLAG_VF_MULTICAST_PROMISC	0x00000002
+
+/* I40E_VIRTCHNL_OP_GET_STATS
+ * VF sends this message to request stats for the selected VSI. VF uses
+ * the i40e_virtchnl_queue_select struct to specify the VSI. The rx_queues
+ * and tx_queues fields are ignored by the PF.
+ *
+ * PF replies with struct i40e_eth_stats in an external buffer.
+ */
+
+/* I40E_VIRTCHNL_OP_EVENT
+ * PF sends this message to inform the VF driver of events that may affect it.
+ * No direct response is expected from the VF, though it may generate other
+ * messages in response to this one.
+ */
+enum i40e_virtchnl_event_codes {
+	I40E_VIRTCHNL_EVENT_UNKNOWN = 0,
+	I40E_VIRTCHNL_EVENT_LINK_CHANGE,	/* = 1 (implicit) */
+	I40E_VIRTCHNL_EVENT_RESET_IMPENDING,	/* = 2 (implicit) */
+	I40E_VIRTCHNL_EVENT_PF_DRIVER_CLOSE,	/* = 3 (implicit) */
+};
+#define I40E_PF_EVENT_SEVERITY_INFO		0
+#define I40E_PF_EVENT_SEVERITY_ATTENTION	1
+#define I40E_PF_EVENT_SEVERITY_ACTION_REQUIRED	2
+#define I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM	255
+
+struct i40e_virtchnl_pf_event {
+	enum i40e_virtchnl_event_codes event;
+	union {	/* only one member is defined: link_event */
+		struct {
+			enum i40e_aq_link_speed link_speed;
+			bool link_status;
+		} link_event;
+	} event_data;
+
+	int severity;	/* presumably one of I40E_PF_EVENT_SEVERITY_* -- confirm */
+};
+
+/* VF reset states - these are written into the RSTAT register:
+ * I40E_VFGEN_RSTAT1 on the PF
+ * I40E_VFGEN_RSTAT on the VF
+ * When the PF initiates a reset, it writes 0
+ * When the reset is complete, it writes 1
+ * When the PF detects that the VF has recovered, it writes 2
+ * VF checks this register periodically to determine if a reset has occurred,
+ * then polls it to know when the reset is complete.
+ * If either the PF or VF reads the register while the hardware
+ * is in a reset state, it will return DEADBEEF, which, when masked
+ * will result in 3.
+ */
+enum i40e_vfr_states {
+	I40E_VFR_INPROGRESS = 0,	/* written when the PF initiates a reset */
+	I40E_VFR_COMPLETED,		/* = 1: written when the reset is complete */
+	I40E_VFR_VFACTIVE,		/* = 2: written when the PF detects the VF has recovered */
+	I40E_VFR_UNKNOWN,		/* = 3: masked DEADBEEF read while hardware is in reset */
+};
+
+#endif /* _I40E_VIRTCHNL_H_ */
diff --git a/usr/src/uts/common/io/i40e/i40e.conf b/usr/src/uts/common/io/i40e/i40e.conf
new file mode 100644
index 0000000000..b4c3459931
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e.conf
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# Driver.conf file for Intel XL710 PCIe NIC Driver (i40e)
+# See i40e(7D) for valid options.
+#
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
new file mode 100644
index 0000000000..0af4c4c71f
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -0,0 +1,1097 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * For more information, please see the big theory statement in i40e_main.c.
+ */
+
+#include "i40e_sw.h"
+
+#define	I40E_PROP_RX_DMA_THRESH	"_rx_dma_threshold"
+#define	I40E_PROP_TX_DMA_THRESH	"_tx_dma_threshold"
+#define	I40E_PROP_RX_ITR	"_rx_intr_throttle"
+#define	I40E_PROP_TX_ITR	"_tx_intr_throttle"
+#define	I40E_PROP_OTHER_ITR	"_other_intr_throttle"
+
+char *i40e_priv_props[] = {
+	I40E_PROP_RX_DMA_THRESH,
+	I40E_PROP_TX_DMA_THRESH,
+	I40E_PROP_RX_ITR,
+	I40E_PROP_TX_ITR,
+	I40E_PROP_OTHER_ITR,
+	NULL
+};
+
+/*
+ * Remove a unicast address from the hardware filter and from i40e_uaddrs.
+ * Returns EINVAL (multicast), ECANCELED (suspended), ENOENT or EIO.
+ */
+static int
+i40e_group_remove_mac(void *arg, const uint8_t *mac_addr)
+{
+	i40e_t *i40e = arg;
+	struct i40e_aqc_remove_macvlan_element_data filt;
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	int ret, i, last;
+	i40e_uaddr_t *iua;
+
+	if (I40E_IS_MULTICAST(mac_addr))
+		return (EINVAL);
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if (i40e->i40e_state & I40E_SUSPENDED) {
+		ret = ECANCELED;
+		goto done;
+	}
+
+	for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) {
+		if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac,
+		    ETHERADDRL) == 0)
+			break;
+	}
+
+	if (i == i40e->i40e_resources.ifr_nmacfilt_used) {
+		ret = ENOENT;
+		goto done;
+	}
+
+	iua = &i40e->i40e_uaddrs[i];
+	ASSERT(i40e->i40e_resources.ifr_nmacfilt_used > 0);
+
+	bzero(&filt, sizeof (filt));
+	bcopy(mac_addr, filt.mac_addr, ETHERADDRL);
+	filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH |
+	    I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
+
+	if (i40e_aq_remove_macvlan(hw, iua->iua_vsi, &filt, 1, NULL) !=
+	    I40E_SUCCESS) {
+		i40e_error(i40e, "failed to remove mac address "
+		    "%02x:%02x:%02x:%02x:%02x:%02x from unicast filter: %d",
+		    mac_addr[0], mac_addr[1], mac_addr[2], mac_addr[3],
+		    mac_addr[4], mac_addr[5], filt.error_code);
+		ret = EIO;
+		goto done;
+	}
+
+	last = i40e->i40e_resources.ifr_nmacfilt_used - 1;
+	if (i != last)
+		bcopy(&i40e->i40e_uaddrs[last], iua, sizeof (i40e_uaddr_t));
+
+	/*
+	 * Set the multicast bit in the last one to indicate to ourselves that
+	 * it's invalid.
+	 */
+	bzero(&i40e->i40e_uaddrs[last], sizeof (i40e_uaddr_t));
+	i40e->i40e_uaddrs[last].iua_mac[0] = 0x01;
+	i40e->i40e_resources.ifr_nmacfilt_used--;
+	ret = 0;
+done:
+	mutex_exit(&i40e->i40e_general_lock);
+	return (ret);
+}
+
+/*
+ * Add a unicast address to the hardware filter and to i40e_uaddrs. Returns
+ * EINVAL (multicast), ECANCELED (suspended), ENOSPC (full), EEXIST or EIO.
+ */
+static int
+i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
+{
+	i40e_t *i40e = arg;
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	int i, ret;
+	i40e_uaddr_t *iua;
+	struct i40e_aqc_add_macvlan_element_data filt;
+
+	if (I40E_IS_MULTICAST(mac_addr))
+		return (EINVAL);
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if (i40e->i40e_state & I40E_SUSPENDED) {
+		ret = ECANCELED;
+		goto done;
+	}
+	if (i40e->i40e_resources.ifr_nmacfilt ==
+	    i40e->i40e_resources.ifr_nmacfilt_used) {
+		ret = ENOSPC;
+		goto done;
+	}
+
+	for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) {
+		if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac,
+		    ETHERADDRL) == 0) {
+			ret = EEXIST;
+			goto done;
+		}
+	}
+
+	/* i40e_vsi_id use will need refactoring for proper group support. */
+	bzero(&filt, sizeof (filt));
+	bcopy(mac_addr, filt.mac_addr, ETHERADDRL);
+	filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH |
+	    I40E_AQC_MACVLAN_ADD_IGNORE_VLAN;
+
+	if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1,
+	    NULL)) != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to add mac address "
+		    "%02x:%02x:%02x:%02x:%02x:%02x to unicast filter: %d",
+		    mac_addr[0], mac_addr[1], mac_addr[2], mac_addr[3],
+		    mac_addr[4], mac_addr[5], ret);
+		ret = EIO;
+		goto done;
+	}
+
+	iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used];
+	bcopy(mac_addr, iua->iua_mac, ETHERADDRL);
+	iua->iua_vsi = i40e->i40e_vsi_id;
+	i40e->i40e_resources.ifr_nmacfilt_used++;
+	ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <=
+	    i40e->i40e_resources.ifr_nmacfilt);
+	ret = 0;
+done:
+	mutex_exit(&i40e->i40e_general_lock);
+	return (ret);
+}
+
+/*
+ * Bring the device up on behalf of MAC. Fails with ECANCELED while suspended
+ * and with EIO when i40e_start() does not succeed.
+ */
+static int
+i40e_m_start(void *arg)
+{
+	i40e_t *i40e = arg;
+	int rc;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if ((i40e->i40e_state & I40E_SUSPENDED) != 0) {
+		rc = ECANCELED;
+	} else if (i40e_start(i40e, B_TRUE)) {
+		atomic_or_32(&i40e->i40e_state, I40E_STARTED);
+		rc = 0;
+	} else {
+		rc = EIO;
+	}
+	mutex_exit(&i40e->i40e_general_lock);
+
+	return (rc);
+}
+
+/*
+ * Stop the device on behalf of MAC; a suspended device is left untouched.
+ */
+static void
+i40e_m_stop(void *arg)
+{
+	i40e_t *i40e = arg;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if ((i40e->i40e_state & I40E_SUSPENDED) == 0) {
+		atomic_and_32(&i40e->i40e_state, ~I40E_STARTED);
+		i40e_stop(i40e, B_TRUE);
+	}
+	mutex_exit(&i40e->i40e_general_lock);
+}
+
+/*
+ * Enable and disable promiscuous mode as requested. We have to toggle both
+ * unicast and multicast. Note that multicast promiscuity may already be
+ * enabled because i40e_m_multicast may toggle it itself. See i40e_main.c for
+ * more information on this. Returns 0 on success, ECANCELED if the device is
+ * suspended, or EIO if the hardware rejects the request.
+ */
+static int
+i40e_m_promisc(void *arg, boolean_t on)
+{
+	i40e_t *i40e = arg;
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	int ret = 0, err = 0;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if (i40e->i40e_state & I40E_SUSPENDED) {
+		err = ECANCELED;
+		goto done;
+	}
+
+	ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id,
+	    on, NULL);
+	if (ret != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to %s unicast promiscuity on "
+		    "the default VSI: %d", on == B_TRUE ? "enable" : "disable",
+		    ret);
+		err = EIO;
+		goto done;
+	}
+
+	/*
+	 * If we have a non-zero mcast_promisc_count, then it has already been
+	 * enabled or we need to leave it that way and not touch it.
+	 */
+	if (i40e->i40e_mcast_promisc_count > 0) {
+		i40e->i40e_promisc_on = on;
+		goto done;
+	}
+
+	ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id,
+	    on, NULL);
+	if (ret != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to %s multicast promiscuity on "
+		    "the default VSI: %d", on == B_TRUE ? "enable" : "disable",
+		    ret);
+
+		/*
+		 * Try our best to put us back into a state that MAC expects us
+		 * to be in.
+		 */
+		ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id,
+		    !on, NULL);
+		if (ret != I40E_SUCCESS) {
+			i40e_error(i40e, "failed to %s unicast promiscuity on "
+			    "the default VSI after toggling multicast failed: "
+			    "%d", on == B_TRUE ? "disable" : "enable", ret);
+		}
+
+		err = EIO;
+		goto done;
+	} else {
+		i40e->i40e_promisc_on = on;
+	}
+
+done:
+	mutex_exit(&i40e->i40e_general_lock);
+	return (err);
+}
+
+/*
+ * See the big theory statement in i40e_main.c for multicast address management.
+ * Caller holds i40e_general_lock. Returns 0, or EIO if a hardware call fails.
+ */
+static int
+i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address)
+{
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	struct i40e_aqc_add_macvlan_element_data filt;
+	i40e_maddr_t *mc;
+	int ret;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	if (i40e->i40e_resources.ifr_nmcastfilt_used ==
+	    i40e->i40e_resources.ifr_nmcastfilt) {
+		if (i40e->i40e_mcast_promisc_count == 0 &&
+		    i40e->i40e_promisc_on == B_FALSE) {
+			ret = i40e_aq_set_vsi_multicast_promiscuous(hw,
+			    i40e->i40e_vsi_id, B_TRUE, NULL);
+			if (ret != I40E_SUCCESS) {
+				i40e_error(i40e, "failed to enable promiscuous "
+				    "mode on VSI %d: %d", i40e->i40e_vsi_id,
+				    ret);
+				return (EIO);
+			}
+		}
+		i40e->i40e_mcast_promisc_count++;
+		return (0);
+	}
+
+	mc = &i40e->i40e_maddrs[i40e->i40e_resources.ifr_nmcastfilt_used];
+	bzero(&filt, sizeof (filt));
+	bcopy(multicast_address, filt.mac_addr, ETHERADDRL);
+	filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH |
+	    I40E_AQC_MACVLAN_ADD_IGNORE_VLAN;
+
+	if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1,
+	    NULL)) != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to add mac address "
+		    "%02x:%02x:%02x:%02x:%02x:%02x to multicast filter: %d",
+		    multicast_address[0], multicast_address[1],
+		    multicast_address[2], multicast_address[3],
+		    multicast_address[4], multicast_address[5], ret);
+		return (EIO);
+	}
+
+	bcopy(multicast_address, mc->ima_mac, ETHERADDRL);
+	i40e->i40e_resources.ifr_nmcastfilt_used++;
+	return (0);
+}
+
+/*
+ * See the big theory statement in i40e_main.c for multicast address management.
+ * Caller holds i40e_general_lock. Returns 0, ENOENT, or EIO on failure.
+ */
+static int
+i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address)
+{
+	int i, ret;
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	for (i = 0; i < i40e->i40e_resources.ifr_nmcastfilt_used; i++) {
+		struct i40e_aqc_remove_macvlan_element_data filt;
+		int last;
+
+		if (bcmp(multicast_address, i40e->i40e_maddrs[i].ima_mac,
+		    ETHERADDRL) != 0) {
+			continue;
+		}
+
+		bzero(&filt, sizeof (filt));
+		bcopy(multicast_address, filt.mac_addr, ETHERADDRL);
+		filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH |
+		    I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
+
+		if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id,
+		    &filt, 1, NULL) != I40E_SUCCESS) {
+			i40e_error(i40e, "failed to remove mac address "
+			    "%02x:%02x:%02x:%02x:%02x:%02x from multicast "
+			    "filter: %d",
+			    multicast_address[0], multicast_address[1],
+			    multicast_address[2], multicast_address[3],
+			    multicast_address[4], multicast_address[5],
+			    filt.error_code);
+			return (EIO);
+		}
+
+		last = i40e->i40e_resources.ifr_nmcastfilt_used - 1;
+		if (i != last) {
+			bcopy(&i40e->i40e_maddrs[last], &i40e->i40e_maddrs[i],
+			    sizeof (i40e_maddr_t));
+		}
+		bzero(&i40e->i40e_maddrs[last], sizeof (i40e_maddr_t));
+
+		ASSERT(i40e->i40e_resources.ifr_nmcastfilt_used > 0);
+		i40e->i40e_resources.ifr_nmcastfilt_used--;
+		return (0);
+	}
+
+	if (i40e->i40e_mcast_promisc_count > 0) {
+		if (i40e->i40e_mcast_promisc_count == 1 &&
+		    i40e->i40e_promisc_on == B_FALSE) {
+			ret = i40e_aq_set_vsi_multicast_promiscuous(hw,
+			    i40e->i40e_vsi_id, B_FALSE, NULL);
+			if (ret != I40E_SUCCESS) {
+				i40e_error(i40e, "failed to disable "
+				    "promiscuous mode on VSI %d: %d",
+				    i40e->i40e_vsi_id, ret);
+				return (EIO);
+			}
+		}
+		i40e->i40e_mcast_promisc_count--;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Add or remove a multicast address under i40e_general_lock. Returns
+ * ECANCELED while suspended, otherwise the result of the add/remove helper.
+ */
+static int
+i40e_m_multicast(void *arg, boolean_t add, const uint8_t *multicast_address)
+{
+	i40e_t *i40e = arg;
+	int rc;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if ((i40e->i40e_state & I40E_SUSPENDED) != 0) {
+		rc = ECANCELED;
+	} else if (add == B_TRUE) {
+		rc = i40e_multicast_add(i40e, multicast_address);
+	} else {
+		rc = i40e_multicast_remove(i40e, multicast_address);
+	}
+	mutex_exit(&i40e->i40e_general_lock);
+
+	return (rc);
+}
+
+/* ARGSUSED */
+static void
+i40e_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+	/*
+	 * At this time, we don't support toggling i40e into loopback mode. It's
+	 * questionable how much value this has when there's no clear way to
+	 * toggle this behavior from a supported way in userland.
+	 */
+	miocnak(q, mp, 0, EINVAL);
+}
+
+static int
+i40e_ring_start(mac_ring_driver_t rh, uint64_t gen_num)
+{
+	i40e_trqpair_t *itrq = (i40e_trqpair_t *)rh;
+
+	/*
+	 * GLDv3 requires we keep track of a generation number, as it uses
+	 * that number to keep track of whether or not a ring is active.
+	 */
+	mutex_enter(&itrq->itrq_rx_lock);
+	itrq->itrq_rxgen = gen_num;
+	mutex_exit(&itrq->itrq_rx_lock);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+i40e_rx_ring_intr_enable(mac_intr_handle_t intrh)
+{
+	i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh;
+	i40e_t *i40e = itrq->itrq_i40e;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	ASSERT(i40e->i40e_intr_poll == B_TRUE);
+	i40e_intr_rx_queue_enable(i40e, itrq->itrq_index);
+	i40e->i40e_intr_poll = B_FALSE;
+	mutex_exit(&i40e->i40e_general_lock);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+i40e_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+	i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh;
+	i40e_t *i40e = itrq->itrq_i40e;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	i40e_intr_rx_queue_disable(i40e, itrq->itrq_index);
+	i40e->i40e_intr_poll = B_TRUE;
+	mutex_exit(&i40e->i40e_general_lock);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	i40e_t *i40e = arg;
+	mac_intr_t *mintr = &infop->mri_intr;
+	i40e_trqpair_t *itrq = &(i40e->i40e_trqpairs[ring_index]);
+
+	/*
+	 * Note the group index here is expected to be -1 due to the fact that
+	 * we're not actually grouping things tx-wise at this time.
+	 */
+	ASSERT(group_index == -1);
+	ASSERT(ring_index < i40e->i40e_num_trqpairs);
+
+	itrq->itrq_mactxring = rh;
+	infop->mri_driver = (mac_ring_driver_t)itrq;
+	infop->mri_start = NULL;
+	infop->mri_stop = NULL;
+	infop->mri_tx = i40e_ring_tx;
+	infop->mri_stat = i40e_tx_ring_stat;
+
+	/*
+	 * We only provide the DDI interrupt handle when MSI-X interrupts are
+	 * in use, to indicate that we actually support retargeting.
+	 */
+	if (i40e->i40e_intr_type & DDI_INTR_TYPE_MSIX) {
+		mintr->mi_ddi_handle =
+		    i40e->i40e_intr_handles[itrq->itrq_tx_intrvec];
+	}
+}
+
+/* ARGSUSED */
+static void
+i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	i40e_t *i40e = arg;
+	mac_intr_t *mintr = &infop->mri_intr;
+	i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index];
+
+	/*
+	 * We assert the group number and ring index to help sanity check
+	 * ourselves and mark that we'll need to rework this when we have
+	 * multiple groups.
+	 */
+	ASSERT3S(group_index, ==, 0);
+	ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs);
+
+	itrq->itrq_macrxring = rh;
+	infop->mri_driver = (mac_ring_driver_t)itrq;
+	infop->mri_start = i40e_ring_start;
+	infop->mri_stop = NULL;
+	infop->mri_poll = i40e_ring_rx_poll;
+	infop->mri_stat = i40e_rx_ring_stat;
+	mintr->mi_handle = (mac_intr_handle_t)itrq;
+	mintr->mi_enable = i40e_rx_ring_intr_enable;
+	mintr->mi_disable = i40e_rx_ring_intr_disable;
+
+	/*
+	 * We only provide the DDI interrupt handle when MSI-X interrupts are
+	 * in use, to indicate that we actually support retargeting.
+	 */
+	if (i40e->i40e_intr_type & DDI_INTR_TYPE_MSIX) {
+		mintr->mi_ddi_handle =
+		    i40e->i40e_intr_handles[itrq->itrq_rx_intrvec];
+	}
+}
+
+/* ARGSUSED */
+static void
+i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	i40e_t *i40e = arg;
+
+	if (rtype != MAC_RING_TYPE_RX)
+		return;
+
+	/*
+	 * Note, this is a simplified view of a group, given that we only have a
+	 * single group and a single ring at the moment. We'll want to expand
+	 * upon this as we leverage more hardware functionality.
+	 */
+	i40e->i40e_rx_group_handle = gh;
+	infop->mgi_driver = (mac_group_driver_t)i40e;
+	infop->mgi_start = NULL;
+	infop->mgi_stop = NULL;
+	infop->mgi_addmac = i40e_group_add_mac;
+	infop->mgi_remmac = i40e_group_remove_mac;
+
+	ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX);
+	infop->mgi_count = i40e->i40e_num_trqpairs;
+}
+
+static boolean_t
+i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+	i40e_t *i40e = arg;
+	mac_capab_rings_t *cap_rings;
+
+	switch (cap) {
+	case MAC_CAPAB_HCKSUM: {
+		uint32_t *txflags = cap_data;
+
+		*txflags = 0;
+		if (i40e->i40e_tx_hcksum_enable == B_TRUE)
+			*txflags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM;
+		break;
+	}
+
+	case MAC_CAPAB_RINGS:
+		cap_rings = cap_data;
+		cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+		switch (cap_rings->mr_type) {
+		case MAC_RING_TYPE_TX:
+			/*
+			 * Note, saying we have no groups, but some number of
+			 * rings indicates to MAC that it should create
+			 * pseudo-groups with one for each TX ring. This may not
+			 * be the long term behavior we want, but it'll work for
+			 * now.
+			 */
+			cap_rings->mr_gnum = 0;
+			cap_rings->mr_rnum = i40e->i40e_num_trqpairs;
+			cap_rings->mr_rget = i40e_fill_tx_ring;
+			cap_rings->mr_gget = NULL;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+			break;
+		case MAC_RING_TYPE_RX:
+			cap_rings->mr_rnum = i40e->i40e_num_trqpairs;
+			cap_rings->mr_rget = i40e_fill_rx_ring;
+			cap_rings->mr_gnum = I40E_GROUP_MAX;
+			cap_rings->mr_gget = i40e_fill_rx_group;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+			break;
+		default:
+			return (B_FALSE);
+		}
+		break;
+	default:
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Parse and apply a driver-private property. Returns 0, EINVAL for a malformed
+ * or out-of-range value, ENOTSUP for an unknown name, or ddi_strtol()'s error.
+ */
+/* ARGSUSED */
+static int
+i40e_m_setprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize,
+    const void *pr_val)
+{
+	int ret;
+	long val;
+	char *eptr;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+	if ((ret = ddi_strtol(pr_val, &eptr, 10, &val)) != 0)
+		return (ret);
+	if (*eptr != '\0')
+		return (EINVAL);
+
+	if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+		if (val < I40E_MIN_RX_DMA_THRESH ||
+		    val > I40E_MAX_RX_DMA_THRESH) {
+			return (EINVAL);
+		}
+		i40e->i40e_rx_dma_min = (uint32_t)val;
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+		if (val < I40E_MIN_TX_DMA_THRESH ||
+		    val > I40E_MAX_TX_DMA_THRESH) {
+			return (EINVAL);
+		}
+		i40e->i40e_tx_dma_min = (uint32_t)val;
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+		if (val < I40E_MIN_ITR || val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_rx_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr);
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+		if (val < I40E_MIN_ITR || val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_tx_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr);
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+		if (val < I40E_MIN_ITR || val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_other_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER,
+		    i40e->i40e_other_itr);
+		return (0);
+	}
+
+	return (ENOTSUP);
+}
+
+/* Render a private property as a decimal string; ENOTSUP/ERANGE on error. */
+static int
+i40e_m_getprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize,
+    void *pr_val)
+{
+	uint32_t val;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+	if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+		val = i40e->i40e_rx_dma_min;
+	} else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+		val = i40e->i40e_tx_dma_min;
+	} else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+		val = i40e->i40e_rx_itr;
+	} else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+		val = i40e->i40e_tx_itr;
+	} else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+		val = i40e->i40e_other_itr;
+	} else {
+		return (ENOTSUP);
+	}
+
+	if (snprintf(pr_val, pr_valsize, "%u", val) >= pr_valsize)
+		return (ERANGE);
+	return (0);
+}
+
+/*
+ * Annoyingly for private properties MAC seems to ignore default values that
+ * aren't strings. That means that we have to translate all of these uint32_t
+ * defaults into strings, so we size the buffer to be large enough to hold the
+ * decimal form of a uint32_t.
+ */
+/* ARGSUSED */
+static void
+i40e_m_propinfo_private(i40e_t *i40e, const char *pr_name,
+    mac_prop_info_handle_t prh)
+{
+	char buf[64];
+	uint32_t def;
+
+	if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		def = I40E_DEF_RX_DMA_THRESH;
+		mac_prop_info_set_range_uint32(prh,
+		    I40E_MIN_RX_DMA_THRESH,
+		    I40E_MAX_RX_DMA_THRESH);
+	} else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		def = I40E_DEF_TX_DMA_THRESH;
+		mac_prop_info_set_range_uint32(prh,
+		    I40E_MIN_TX_DMA_THRESH,
+		    I40E_MAX_TX_DMA_THRESH);
+	} else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		def = I40E_DEF_RX_ITR;
+		mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+	} else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		def = I40E_DEF_TX_ITR;
+		mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+	} else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+		def = I40E_DEF_OTHER_ITR;
+		mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+	} else {
+		return;
+	}
+
+	(void) snprintf(buf, sizeof (buf), "%u", def);
+	mac_prop_info_set_default_str(prh, buf);
+}
+
+static int
+i40e_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, const void *pr_val)
+{
+	uint32_t new_mtu;
+	i40e_t *i40e = arg;
+	int ret = 0;
+
+	mutex_enter(&i40e->i40e_general_lock);
+	if (i40e->i40e_state & I40E_SUSPENDED) {
+		mutex_exit(&i40e->i40e_general_lock);
+		return (ECANCELED);
+	}
+
+	switch (pr_num) {
+	/*
+	 * These properties are always read-only across every device.
+	 */
+	case MAC_PROP_DUPLEX:
+	case MAC_PROP_SPEED:
+	case MAC_PROP_STATUS:
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_ADV_10GFDX_CAP:
+	case MAC_PROP_ADV_40GFDX_CAP:
+		ret = ENOTSUP;
+		break;
+	/*
+	 * These are read-only at this time as we don't support configuring
+	 * auto-negotiation. See the theory statement in i40e_main.c.
+	 */
+	case MAC_PROP_EN_100FDX_CAP:
+	case MAC_PROP_EN_1000FDX_CAP:
+	case MAC_PROP_EN_10GFDX_CAP:
+	case MAC_PROP_EN_40GFDX_CAP:
+	case MAC_PROP_AUTONEG:
+	case MAC_PROP_FLOWCTRL:
+		ret = ENOTSUP;
+		break;
+
+	case MAC_PROP_MTU:
+		bcopy(pr_val, &new_mtu, sizeof (new_mtu));
+		if (new_mtu == i40e->i40e_sdu)
+			break;
+
+		if (new_mtu < I40E_MIN_MTU ||
+		    new_mtu > I40E_MAX_MTU) {
+			ret = EINVAL;
+			break;
+		}
+
+		if (i40e->i40e_state & I40E_STARTED) {
+			ret = EBUSY;
+			break;
+		}
+
+		ret = mac_maxsdu_update(i40e->i40e_mac_hdl, new_mtu);
+		if (ret == 0) {
+			i40e->i40e_sdu = new_mtu;
+			i40e_update_mtu(i40e);
+		}
+		break;
+
+	case MAC_PROP_PRIVATE:
+		ret = i40e_m_setprop_private(i40e, pr_name, pr_valsize, pr_val);
+		break;
+	default:
+		ret = ENOTSUP;
+		break;
+	}
+
+	mutex_exit(&i40e->i40e_general_lock);
+	return (ret);
+}
+
+static int
+i40e_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, void *pr_val)
+{
+	i40e_t *i40e = arg;
+	uint64_t speed;
+	int ret = 0;
+	uint8_t *u8;
+	link_flowctrl_t fctl;
+
+	mutex_enter(&i40e->i40e_general_lock);
+
+	switch (pr_num) {
+	case MAC_PROP_DUPLEX:
+		if (pr_valsize < sizeof (link_duplex_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		bcopy(&i40e->i40e_link_duplex, pr_val, sizeof (link_duplex_t));
+		break;
+	case MAC_PROP_SPEED:
+		if (pr_valsize < sizeof (uint64_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		speed = i40e->i40e_link_speed * 1000000ULL;
+		bcopy(&speed, pr_val, sizeof (speed));
+		break;
+	case MAC_PROP_STATUS:
+		if (pr_valsize < sizeof (link_state_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		bcopy(&i40e->i40e_link_state, pr_val, sizeof (link_state_t));
+		break;
+	case MAC_PROP_AUTONEG:
+		if (pr_valsize < sizeof (uint8_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		u8 = pr_val;
+		*u8 = 1;
+		break;
+	case MAC_PROP_FLOWCTRL:
+		/*
+		 * Because we don't currently support hardware flow control, we
+		 * just hardcode this to be none.
+		 */
+		if (pr_valsize < sizeof (link_flowctrl_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		fctl = LINK_FLOWCTRL_NONE;
+		bcopy(&fctl, pr_val, sizeof (link_flowctrl_t));
+		break;
+	case MAC_PROP_MTU:
+		if (pr_valsize < sizeof (uint32_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		bcopy(&i40e->i40e_sdu, pr_val, sizeof (uint32_t));
+		break;
+
+	/*
+	 * Because we don't let users control the speeds we may auto-negotiate
+	 * to, the values of the ADV_ and EN_ will always be the same.
+	 */
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_EN_100FDX_CAP:
+		if (pr_valsize < sizeof (uint8_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		u8 = pr_val;
+		*u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0;
+		break;
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_EN_1000FDX_CAP:
+		if (pr_valsize < sizeof (uint8_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		u8 = pr_val;
+		*u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0;
+		break;
+	case MAC_PROP_ADV_10GFDX_CAP:
+	case MAC_PROP_EN_10GFDX_CAP:
+		if (pr_valsize < sizeof (uint8_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		u8 = pr_val;
+		*u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0;
+		break;
+	case MAC_PROP_ADV_40GFDX_CAP:
+	case MAC_PROP_EN_40GFDX_CAP:
+		if (pr_valsize < sizeof (uint8_t)) {
+			ret = EOVERFLOW;
+			break;
+		}
+		u8 = pr_val;
+		*u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0;
+		break;
+	case MAC_PROP_PRIVATE:
+		ret = i40e_m_getprop_private(i40e, pr_name, pr_valsize, pr_val);
+		break;
+	default:
+		ret = ENOTSUP;
+		break;
+	}
+
+	mutex_exit(&i40e->i40e_general_lock);
+
+	return (ret);
+}
+
+static void
+i40e_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    mac_prop_info_handle_t prh)
+{
+	i40e_t *i40e = arg;
+
+	mutex_enter(&i40e->i40e_general_lock);
+
+	switch (pr_num) {
+	case MAC_PROP_DUPLEX:
+	case MAC_PROP_SPEED:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		break;
+	case MAC_PROP_FLOWCTRL:
+		/*
+		 * At the moment, the driver doesn't support flow control, hence
+		 * why this is set to read-only and none.
+		 */
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_link_flowctrl(prh,
+		    LINK_FLOWCTRL_NONE);
+		break;
+	case MAC_PROP_MTU:
+		mac_prop_info_set_range_uint32(prh, I40E_MIN_MTU, I40E_MAX_MTU);
+		break;
+
+	/*
+	 * We set the defaults for these based upon the phy's ability to
+	 * support the speeds. Note, auto-negotiation is required for fiber,
+	 * hence it is read-only and always enabled. When we have access to
+	 * copper phys we can revisit this.
+	 */
+	case MAC_PROP_AUTONEG:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh, 1);
+		break;
+	case MAC_PROP_ADV_100FDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0);
+		break;
+	case MAC_PROP_EN_100FDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0);
+		break;
+	case MAC_PROP_ADV_1000FDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0);
+		break;
+	case MAC_PROP_EN_1000FDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0);
+		break;
+	case MAC_PROP_ADV_10GFDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0);
+		break;
+	case MAC_PROP_EN_10GFDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0);
+		break;
+	case MAC_PROP_ADV_40GFDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0);
+		break;
+	case MAC_PROP_EN_40GFDX_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		mac_prop_info_set_default_uint8(prh,
+		    (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0);
+		break;
+	case MAC_PROP_PRIVATE:
+		i40e_m_propinfo_private(i40e, pr_name, prh);
+		break;
+	default:
+		break;
+	}
+
+	mutex_exit(&i40e->i40e_general_lock);
+}
+
+#define	I40E_M_CALLBACK_FLAGS \
+	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
+
+static mac_callbacks_t i40e_m_callbacks = {
+	I40E_M_CALLBACK_FLAGS,
+	i40e_m_stat,
+	i40e_m_start,
+	i40e_m_stop,
+	i40e_m_promisc,
+	i40e_m_multicast,
+	NULL,
+	NULL,
+	NULL,
+	i40e_m_ioctl,
+	i40e_m_getcapab,
+	NULL,
+	NULL,
+	i40e_m_setprop,
+	i40e_m_getprop,
+	i40e_m_propinfo
+};
+
+boolean_t
+i40e_register_mac(i40e_t *i40e)
+{
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	int status;
+	mac_register_t *mac = mac_alloc(MAC_VERSION);
+
+	if (mac == NULL)
+		return (B_FALSE);
+
+	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	mac->m_driver = i40e;
+	mac->m_dip = i40e->i40e_dip;
+	mac->m_src_addr = hw->mac.addr;
+	mac->m_callbacks = &i40e_m_callbacks;
+	mac->m_min_sdu = 0;
+	mac->m_max_sdu = i40e->i40e_sdu;
+	mac->m_margin = VLAN_TAGSZ;
+	mac->m_priv_props = i40e_priv_props;
+	mac->m_v12n = MAC_VIRT_LEVEL1;
+
+	status = mac_register(mac, &i40e->i40e_mac_hdl);
+	if (status != 0)
+		i40e_error(i40e, "mac_register() returned %d", status);
+	mac_free(mac);
+
+	return (status == 0);
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c
new file mode 100644
index 0000000000..ba9bea7b20
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_intr.c
@@ -0,0 +1,757 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * -------------------------
+ * Interrupt Handling Theory
+ * -------------------------
+ *
+ * There are a couple different sets of interrupts that we need to worry about:
+ *
+ *   - Interrupts from receive queues
+ *   - Interrupts from transmit queues
+ *   - 'Other Interrupts', such as the administrative queue
+ *
+ * 'Other Interrupts' are asynchronous events such as a link status change event
+ * being posted to the administrative queue, unrecoverable ECC errors, and more.
+ * If we have something being posted to the administrative queue, then we go
+ * through and process it, because it's generally enabled as a separate logical
+ * interrupt. Note, we may need to do more here eventually. To re-enable the
+ * interrupts from the 'Other Interrupts' section, we need to clear the PBA and
+ * write ENA to PFINT_ICR0.
+ *
+ * Interrupts from the transmit and receive queues indicate that our requests
+ * have been processed. In the rx case, it means that we have data that we
+ * should take a look at and send up the stack. In the tx case, it means that
+ * data which we got from MAC has now been sent out on the wire and we can free
+ * the associated data. Most of the logic for acting upon the presence of this
+ * data can be found in i40e_transceiver.c which handles all of the DMA, rx, and
+ * tx operations. This file is dedicated to handling and dealing with interrupt
+ * processing.
+ *
+ * All devices supported by this driver support three kinds of interrupts:
+ *
+ *   o Extended Message Signaled Interrupts (MSI-X)
+ *   o Message Signaled Interrupts (MSI)
+ *   o Legacy PCI interrupts (INTx)
+ *
+ * Generally speaking the hardware logically handles MSI and INTx the same and
+ * restricts us to only using a single interrupt, which isn't the interesting
+ * case. With MSI-X available, each physical function of the device provides the
+ * opportunity for multiple interrupts which is what we'll focus on.
+ *
+ * --------------------
+ * Interrupt Management
+ * --------------------
+ *
+ * By default, the admin queue, which consists of the asynchronous other
+ * interrupts, is always bound to MSI-X vector zero. Next, we spread out all of
+ * the other interrupts that we have available to us over the remaining
+ * interrupt vectors.
+ *
+ * This means that there may be multiple queues, both tx and rx, which are
+ * mapped to the same interrupt. When the interrupt fires, we'll have to check
+ * all of them for servicing, before we go through and indicate that the
+ * interrupt is claimed.
+ *
+ * The hardware provides the means of mapping various queues to MSI-X interrupts
+ * by programming the I40E_QINT_RQCTL() and I40E_QINT_TQCTL() registers. These
+ * registers can also be used to enable and disable whether or not the queue is
+ * a source of interrupts. As part of this, the hardware requires that we
+ * maintain a linked list of queues for each interrupt vector. While it may seem
+ * like this is only there for the purposes of ITRs, that's not the case. The
+ * first queue must be programmed in I40E_QINT_LNKLSTN(%vector) register. Each
+ * queue defines the next one in either the I40E_QINT_RQCTL or I40E_QINT_TQCTL
+ * register.
+ *
+ * Because we only have a single queue enabled at the moment and we always have
+ * two interrupts, we do something pretty simple and just know that there's one
+ * data queue in the interrupt handler. Longer term, we'll need to think harder
+ * about this, but for the moment it'll have to suffice.
+ *
+ * Finally, the individual interrupt vector itself has the ability to be enabled
+ * and disabled. The overall interrupt is controlled through the
+ * I40E_PFINT_DYN_CTLN() register. This is used to turn on and off the interrupt
+ * as a whole.
+ *
+ * Note that this means that both the individual queue and the interrupt as a
+ * whole can be toggled and re-enabled.
+ *
+ * -------------------
+ * Non-MSIX Management
+ * -------------------
+ *
+ * We may have a case where the Operating System is unable to actually allocate
+ * any MSI-X to the system. In such a world, there is only one transmit/receive
+ * queue pair and it is bound to the same interrupt with index zero. The
+ * hardware doesn't allow us access to additional interrupt vectors in these
+ * modes. Note that technically we could support more transmit/receive queues if
+ * we wanted.
+ *
+ * In this world, because the interrupts for the admin queue and traffic are
+ * mixed together, we have to consult ICR0 to determine what has occurred. The
+ * QINT_TQCTL and QINT_RQCTL registers have a field, 'MSI-X 0 index' which
+ * allows us to set a specific bit in ICR0. There are up to seven such bits;
+ * however, we only use bits 0 and 1, for the rx and tx queue respectively.
+ * These are captured by the I40E_INTR_NOTX_{R|T}X_QUEUE and
+ * I40E_INTR_NOTX_{R|T}X_MASK macros respectively.
+ *
+ * Unfortunately, these corresponding queue bits have no corresponding entry in
+ * the ICR0_ENA register. So instead, when enabling interrupts on the queues, we
+ * end up enabling it on the queue registers rather than on the MSI-X registers.
+ * In the MSI-X world, because they can be enabled and disabled, this is
+ * different and the queues can always be enabled and disabled, but the
+ * interrupts themselves are toggled (ignoring the question of interrupt
+ * blanking for polling on rings).
+ *
+ * Finally, we still have to set up the interrupt linked list, but the list is
+ * instead rooted at the register I40E_PFINT_LNKLST0, rather than being tied to
+ * one of the other MSI-X registers.
+ *
+ * --------------------
+ * Interrupt Moderation
+ * --------------------
+ *
+ * The XL710 hardware has three different interrupt moderation registers per
+ * interrupt. Unsurprisingly, we use these for:
+ *
+ *   o RX interrupts
+ *   o TX interrupts
+ *   o 'Other interrupts' (link status change, admin queue, etc.)
+ *
+ * By default, we throttle 'other interrupts' the most, then TX interrupts, and
+ * then RX interrupts. The default values for these were based on trying to
+ * reason about both the importance and frequency of events. Generally speaking
+ * 'other interrupts' are not very frequent and they're not important for the
+ * I/O data path in and of itself (though they may indicate issues with the I/O
+ * data path).
+ *
+ * On the flip side, when we're not polling, RX interrupts are very important.
+ * The longer we wait for them, the more latency that we inject into the system.
+ * However, if we allow interrupts to occur too frequently, we risk a few
+ * problems:
+ *
+ *  1) Abusing system resources. Without proper interrupt blanking and polling,
+ *     we can see upwards of 200k-300k interrupts per second on the system.
+ *
+ *  2) Not enough data coalescing to enable polling. In other words, the more
+ *     data that we allow to build up, the more likely we'll be able to enable
+ *     polling mode, allowing us to better handle bulk data.
+ *
+ * In-between the 'other interrupts' and the RX interrupts we have the TX
+ * interrupts, which drive the reclamation of TX buffers. This operation is
+ * not quite as important as we generally size the ring large enough that we
+ * should be able to reclaim a substantial amount of the descriptors that we
+ * have used per interrupt. So while it's important that this interrupt occur,
+ * we don't necessarily need it firing as frequently as RX; it doesn't, on its
+ * own, induce additional latency into the system.
+ *
+ * Based on all this we currently assign static ITR values for the system. While
+ * we could move to a dynamic system (the hardware supports that), we'd want to
+ * make sure that we're seeing problems from this that we believe would be
+ * generally helped by the added complexity.
+ *
+ * Based on this, the default values that we have allow for the following
+ * interrupt thresholds:
+ *
+ *    o 20k interrupts/s for RX
+ *    o 5k interrupts/s for TX
+ *    o 2k interrupts/s for 'Other Interrupts'
+ */
+
+#include "i40e_sw.h"
+
+#define	I40E_INTR_NOTX_QUEUE	0
+#define	I40E_INTR_NOTX_INTR	0
+#define	I40E_INTR_NOTX_RX_QUEUE	0
+#define	I40E_INTR_NOTX_RX_MASK	(1 << I40E_PFINT_ICR0_QUEUE_0_SHIFT)
+#define	I40E_INTR_NOTX_TX_QUEUE	1
+#define	I40E_INTR_NOTX_TX_MASK	(1 << I40E_PFINT_ICR0_QUEUE_1_SHIFT)
+
+void
+i40e_intr_set_itr(i40e_t *i40e, i40e_itr_index_t itr, uint_t val)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	int vec;
+
+	VERIFY3U(val, <=, I40E_MAX_ITR);
+	VERIFY3U(itr, <, I40E_ITR_INDEX_NONE);
+
+	/*
+	 * With MSI-X, every ITR except the 'other' one is programmed on each
+	 * I/O vector, where vector N is reached through index N - 1. In all
+	 * remaining cases the ITR belongs to interrupt zero.
+	 */
+	if (itr != I40E_ITR_INDEX_OTHER &&
+	    i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		for (vec = 1; vec < i40e->i40e_intr_count; vec++) {
+			I40E_WRITE_REG(hw, I40E_PFINT_ITRN(itr, vec - 1), val);
+		}
+	} else {
+		I40E_WRITE_REG(hw, I40E_PFINT_ITR0(itr), val);
+	}
+}
+
+/*
+ * Re-arm interrupt zero, which is where the adminq lives. The adminq has no
+ * traditional queue behind it from an interrupt perspective; it is just a
+ * cause on ICR0. When MSI-X is not in use, the data path shares interrupt
+ * zero, so this call turns those interrupts back on as well.
+ */
+static void
+i40e_intr_adminq_enable(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t ctl;
+
+	ctl = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT;
+	ctl |= I40E_PFINT_DYN_CTL0_INTENA_MASK;
+	ctl |= I40E_PFINT_DYN_CTL0_CLEARPBA_MASK;
+	I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, ctl);
+	i40e_flush(hw);
+}
+
+static void
+i40e_intr_adminq_disable(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t ctl;	/* no INTENA bit: the write masks vector 0 */
+
+	ctl = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT;
+	I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, ctl);
+}
+
+static void
+i40e_intr_io_enable(i40e_t *i40e, int vector)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t ctl;		/* vector N is run by DYN_CTLN(N - 1) */
+
+	ctl = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
+	ctl |= I40E_PFINT_DYN_CTLN_INTENA_MASK;
+	ctl |= I40E_PFINT_DYN_CTLN_CLEARPBA_MASK;
+	I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), ctl);
+}
+
+static void
+i40e_intr_io_disable(i40e_t *i40e, int vector)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t ctl;	/* no INTENA bit: the write masks the vector */
+
+	ctl = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
+	I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), ctl);
+}
+
+/*
+ * Turn on interrupt delivery for the data path. With MSI-X that means enabling
+ * each I/O vector itself. Without MSI-X there is no per-vector control, so we
+ * set the CAUSE_ENA bit in the rx and tx queue control registers instead.
+ */
+void
+i40e_intr_io_enable_all(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t qctl;
+	int vec;
+
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		/* Vector zero is not an I/O vector, so start from one. */
+		for (vec = 1; vec < i40e->i40e_intr_count; vec++)
+			i40e_intr_io_enable(i40e, vec);
+		return;
+	}
+
+	qctl = I40E_READ_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE));
+	qctl |= I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), qctl);
+
+	qctl = I40E_READ_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE));
+	qctl |= I40E_QINT_TQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), qctl);
+}
+
+/*
+ * The inverse of i40e_intr_io_enable_all(). With MSI-X we disable each I/O
+ * vector itself. Without MSI-X there is no per-vector control, so we clear
+ * the CAUSE_ENA bit in the rx and tx queue control registers instead.
+ */
+void
+i40e_intr_io_disable_all(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t qctl;
+	int vec;
+
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		/* Vector zero is not an I/O vector, so start from one. */
+		for (vec = 1; vec < i40e->i40e_intr_count; vec++)
+			i40e_intr_io_disable(i40e, vec);
+		return;
+	}
+
+	qctl = I40E_READ_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE));
+	qctl &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), qctl);
+
+	qctl = I40E_READ_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE));
+	qctl &= ~I40E_QINT_TQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), qctl);
+}
+
+/*
+ * As part of disabling the tx and rx queues we're technically supposed to
+ * remove the linked list entries. The simplest way is to overwrite the head of
+ * each list with I40E_QUEUE_TYPE_EOL (0x7FF): LNKLST0 when MSI-X is not in
+ * use, otherwise the LNKLSTN register behind every I/O vector.
+ *
+ * Note all of the FM register access checks are performed by the caller.
+ */
+void
+i40e_intr_io_clear_cause(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t eol = I40E_QUEUE_TYPE_EOL;
+	int vec;
+
+	if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) {
+		I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, eol);
+		return;
+	}
+
+	for (vec = 1; vec < i40e->i40e_intr_count; vec++) {
+#ifdef DEBUG
+		uint32_t reg;
+
+		/*
+		 * The list head may only be modified while its vector is
+		 * disabled, so check that INTENA is clear beforehand.
+		 */
+		reg = I40E_READ_REG(hw, I40E_PFINT_DYN_CTLN(vec - 1));
+		VERIFY0(reg & I40E_PFINT_DYN_CTLN_INTENA_MASK);
+#endif
+		I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vec - 1), eol);
+	}
+
+	i40e_flush(hw);
+}
+
+/*
+ * Tear down interrupt handling. The only real work is disabling the adminq;
+ * DEBUG builds first verify that the I/O vectors were already quiesced.
+ */
+void
+i40e_intr_chip_fini(i40e_t *i40e)
+{
+#ifdef DEBUG
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t reg;
+	int vec;
+
+	/*
+	 * By this point every I/O vector must have been disabled and its
+	 * interrupt linked list reset to I40E_QUEUE_TYPE_EOL.
+	 */
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		for (vec = 1; vec < i40e->i40e_intr_count; vec++) {
+			reg = I40E_READ_REG(hw, I40E_PFINT_DYN_CTLN(vec - 1));
+			VERIFY0(reg & I40E_PFINT_DYN_CTLN_INTENA_MASK);
+
+			reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(vec - 1));
+			VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL);
+		}
+	}
+#endif
+
+	i40e_intr_adminq_disable(i40e);
+}
+
+/*
+ * Wire the single rx/tx queue pair up to MSI-X vector 1. The list rooted at
+ * LNKLSTN(0) is: rx queue 0 -> tx queue 0 -> end of list. Note that we always
+ * enable the queues as interrupt sources here, even though we don't enable
+ * the MSI-X interrupt vector itself.
+ */
+static void
+i40e_intr_init_queue_msix(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t reg;
+
+	/*
+	 * Because we only have a single queue, just do something simple now.
+	 * How this all works will need to really be properly redone based on
+	 * the bit maps, etc. Note that we skip the ITR logic for the moment,
+	 * just to make our lives as explicit and simple as possible.
+	 */
+	reg = (0 << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
+	I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(0), reg);
+
+	/* rx queue 0: signals vector 1 and hands off to tx queue 0. */
+	reg = (1 << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
+	    (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+	    (0 << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
+	    I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(0), reg);
+
+	/* tx queue 0: signals vector 1 and terminates the list. */
+	reg = (1 << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
+	    (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) |
+	    I40E_QINT_TQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_TQCTL(0), reg);
+}
+
+/*
+ * Set up a single queue pair to share interrupt zero with the admin queue in
+ * the non-MSI-X world. The list rooted at LNKLST0 is: rx queue -> tx queue ->
+ * end of list. Note we do not enable the queues as interrupt causes here;
+ * unlike with MSI-X, CAUSE_ENA is the only control we have over them.
+ */
+static void
+i40e_intr_init_queue_shared(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t reg;
+
+	VERIFY(i40e->i40e_intr_type == DDI_INTR_TYPE_FIXED ||
+	    i40e->i40e_intr_type == DDI_INTR_TYPE_MSI);
+
+	reg = (I40E_INTR_NOTX_QUEUE << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT);
+	I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, reg);
+
+	/* rx queue: reports through ICR0 queue bit 0, then the tx queue. */
+	reg = (I40E_INTR_NOTX_INTR << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
+	    (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+	    (I40E_INTR_NOTX_RX_QUEUE << I40E_QINT_RQCTL_MSIX0_INDX_SHIFT) |
+	    (I40E_INTR_NOTX_QUEUE << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT);
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), reg);
+
+	/* tx queue: reports through ICR0 queue bit 1 and ends the list. */
+	reg = (I40E_INTR_NOTX_INTR << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
+	    (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
+	    (I40E_INTR_NOTX_TX_QUEUE << I40E_QINT_TQCTL_MSIX0_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
+	    (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT);
+	I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), reg);
+}
+
+/*
+ * Enable the specified rx queue as an interrupt source by setting CAUSE_ENA in
+ * its QINT_RQCTL register. Only for use by the GLDv3's interrupt blanking
+ * routines: DEBUG builds assert the general lock is held and the bit is clear.
+ */
+void
+i40e_intr_rx_queue_enable(i40e_t *i40e, uint_t queue)
+{
+	uint32_t reg;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+	ASSERT(queue < i40e->i40e_num_trqpairs);
+
+	reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue));
+	ASSERT0(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK);
+	reg |= I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
+}
+
+/*
+ * Disable the specified rx queue as an interrupt source by clearing CAUSE_ENA
+ * in its QINT_RQCTL register. Only for use by the GLDv3's interrupt blanking
+ * routines: DEBUG builds assert the general lock is held and the bit is set.
+ */
+void
+i40e_intr_rx_queue_disable(i40e_t *i40e, uint_t queue)
+{
+	uint32_t reg;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+	ASSERT(queue < i40e->i40e_num_trqpairs);
+
+	reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue));
+	ASSERT3U(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK, ==,
+	    I40E_QINT_RQCTL_CAUSE_ENA_MASK);
+	reg &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+	I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
+}
+
+/*
+ * Start up the chip's interrupt handling. We not only configure the adminq
+ * here, but we also go through and configure all of the actual queues, the
+ * interrupt linked lists, and the default ITR values.
+ */
+void
+i40e_intr_chip_init(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t reg;
+
+	/*
+	 * Ensure that all non adminq interrupts are disabled at the chip level.
+	 */
+	i40e_intr_io_disable_all(i40e);
+
+	I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, 0);
+	(void) I40E_READ_REG(hw, I40E_PFINT_ICR0);
+
+	/*
+	 * Always enable all of the other-class interrupts to be on their own
+	 * ITR. This only needs to be set on interrupt zero, which has its own
+	 * special setting.
+	 */
+	reg = I40E_ITR_INDEX_OTHER << I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT;
+	I40E_WRITE_REG(hw, I40E_PFINT_STAT_CTL0, reg);
+
+	/*
+	 * Enable interrupt types we expect to receive. At the moment, this
+	 * is limited to the adminq; however, we'll want to review 11.2.2.9.22
+	 * for more types here as we add support for detecting them, handling
+	 * them, and resetting the device as appropriate.
+	 */
+	reg = I40E_PFINT_ICR0_ENA_ADMINQ_MASK;
+	I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, reg);
+
+	/*
+	 * Always set the interrupt linked list to empty. We'll come back and
+	 * change this in i40e_intr_init_queue_shared() if MSI-X is not in use.
+	 */
+	I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, I40E_QUEUE_TYPE_EOL);
+
+	i40e_intr_adminq_enable(i40e);
+
+	/*
+	 * Set up all of the queues and map them to interrupts based on the bit
+	 * assignments.
+	 */
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		i40e_intr_init_queue_msix(i40e);
+	} else {
+		i40e_intr_init_queue_shared(i40e);
+	}
+
+	/*
+	 * Finally set all of the default ITRs for the interrupts. Note that the
+	 * queues will have been set up above.
+	 */
+	i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr);
+	i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr);
+	i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER, i40e->i40e_other_itr);
+}
+
+static void
+i40e_intr_adminq_work(i40e_t *i40e)
+{
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	struct i40e_arq_event_info evt;
+	uint16_t pending = 1;
+
+	bzero(&evt, sizeof (evt));
+	evt.msg_buf = i40e->i40e_aqbuf;
+	evt.buf_len = I40E_ADMINQ_BUFSZ;
+
+	/*
+	 * Pull events off of the adminq one at a time, stopping once a call
+	 * fails or reports that zero events remain.
+	 */
+	do {
+		enum i40e_status_code ret;
+		uint16_t opcode;
+
+		/*
+		 * Any failure ends the loop. At the moment, the only error
+		 * code that seems to be returned is one saying that there's no
+		 * work, so we don't treat it as anything more than that.
+		 */
+		ret = i40e_clean_arq_element(hw, &evt, &pending);
+		if (ret != I40E_SUCCESS)
+			break;
+
+		/*
+		 * A link status event is the only one we act on. Longer term
+		 * we'll want to handle the other opcodes here as well.
+		 */
+		opcode = LE_16(evt.desc.opcode);
+		if (opcode == i40e_aqc_opc_get_link_status) {
+			mutex_enter(&i40e->i40e_general_lock);
+			i40e_link_check(i40e);
+			mutex_exit(&i40e->i40e_general_lock);
+		}
+	} while (pending != 0);
+}
+
+static void
+i40e_intr_rx_work(i40e_t *i40e, int queue)
+{
+	i40e_trqpair_t *pair;
+	mblk_t *chain;
+
+	ASSERT(queue < i40e->i40e_num_trqpairs);
+	pair = &i40e->i40e_trqpairs[queue];
+
+	/* Pull mblks off the ring under the rx lock; call MAC without it. */
+	mutex_enter(&pair->itrq_rx_lock);
+	chain = i40e_ring_rx(pair, I40E_POLL_NULL);
+	mutex_exit(&pair->itrq_rx_lock);
+
+	if (chain != NULL)
+		mac_rx_ring(i40e->i40e_mac_hdl, pair->itrq_macrxring, chain,
+		    pair->itrq_rxgen);
+}
+
+static void
+i40e_intr_tx_work(i40e_t *i40e, int queue)
+{
+	i40e_trqpair_t *pair = &i40e->i40e_trqpairs[queue];
+
+	/* All of the tx completion handling lives in the recycle routine. */
+	i40e_tx_recycle_ring(pair);
+}
+
+/*
+ * Service interrupt zero under MSI-X. At the moment, the only 'other'
+ * interrupt on ICR0 that we handle is the adminq. We should go through and
+ * support the other notifications at some point.
+ */
+static void
+i40e_intr_other_work(i40e_t *i40e)
+{
+	struct i40e_hw *hw = &i40e->i40e_hw_space;
+	uint32_t icr0, ena;
+
+	/* If the access handle reports a fault, flag the error and give up. */
+	icr0 = I40E_READ_REG(hw, I40E_PFINT_ICR0);
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		return;
+	}
+
+	if ((icr0 & I40E_PFINT_ICR0_ADMINQ_MASK) != 0)
+		i40e_intr_adminq_work(i40e);
+
+	/*
+	 * Make sure that the adminq cause is not masked in ICR0_ENA, and then
+	 * explicitly re-enable interrupt zero itself.
+	 */
+	ena = I40E_READ_REG(hw, I40E_PFINT_ICR0_ENA);
+	ena |= I40E_PFINT_ICR0_ENA_ADMINQ_MASK;
+	I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, ena);
+	i40e_intr_adminq_enable(i40e);
+}
+
+uint_t
+i40e_intr_msix(void *arg1, void *arg2)
+{
+	i40e_t *i40e = arg1;
+	int vector_idx = (int)(uintptr_t)arg2;
+
+	/*
+	 * Two vectors exist today: vector 0 is reserved for the adminq and
+	 * vector 1 carries the lone rx/tx queue pair. Longer term, we'll want
+	 * to also bridge some I/O onto vector 0.
+	 */
+	switch (vector_idx) {
+	case 0:
+		i40e_intr_other_work(i40e);
+		break;
+	default:
+		VERIFY(vector_idx == 1);
+
+		/*
+		 * i40e_intr_poll is assigned under a lock, but is read here
+		 * without one on purpose: a stale value costs at worst some
+		 * contention or, rarely, a duplicated packet, while locking
+		 * costs far more. Revisit this as ring support matures.
+		 */
+		if (i40e->i40e_intr_poll != B_TRUE)
+			i40e_intr_rx_work(i40e, 0);
+		i40e_intr_tx_work(i40e, 0);
+		i40e_intr_io_enable(i40e, 1);
+		break;
+	}
+
+	return (DDI_INTR_CLAIMED);
+}
+
+static uint_t
+i40e_intr_notx(i40e_t *i40e, boolean_t shared)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t icr0;
+	int ret = DDI_INTR_CLAIMED;
+
+	/* When shared, a suspended instance never claims the interrupt. */
+	if (shared == B_TRUE) {
+		mutex_enter(&i40e->i40e_general_lock);
+		if (i40e->i40e_state & I40E_SUSPENDED) {
+			mutex_exit(&i40e->i40e_general_lock);
+			return (DDI_INTR_UNCLAIMED);
+		}
+		mutex_exit(&i40e->i40e_general_lock);
+	}
+
+	icr0 = I40E_READ_REG(hw, I40E_PFINT_ICR0);
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		return (DDI_INTR_CLAIMED);
+	}
+
+	/*
+	 * Without MSI-X every cause is reported through ICR0. If nothing is
+	 * set there and the line is shared, then the interrupt was not ours.
+	 */
+	if (icr0 != 0) {
+		if (icr0 & I40E_PFINT_ICR0_ADMINQ_MASK)
+			i40e_intr_adminq_work(i40e);
+		if (icr0 & I40E_INTR_NOTX_RX_MASK)
+			i40e_intr_rx_work(i40e, 0);
+		if (icr0 & I40E_INTR_NOTX_TX_MASK)
+			i40e_intr_tx_work(i40e, 0);
+	} else if (shared == B_TRUE) {
+		ret = DDI_INTR_UNCLAIMED;
+	}
+
+	/* Interrupt zero is re-enabled whether or not we claimed anything. */
+	i40e_intr_adminq_enable(i40e);
+	return (ret);
+}
+
+/* ARGSUSED */
+uint_t
+i40e_intr_msi(void *arg1, void *arg2)
+{
+	i40e_t *i40e = arg1;	/* arg2 is unused */
+
+	return (i40e_intr_notx(i40e, B_FALSE));	/* not a shared interrupt */
+}
+
+/* ARGSUSED */
+uint_t
+i40e_intr_legacy(void *arg1, void *arg2)
+{
+	i40e_t *i40e = arg1;	/* arg2 is unused */
+
+	return (i40e_intr_notx(i40e, B_TRUE));	/* possibly shared interrupt */
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c
new file mode 100644
index 0000000000..91164abf87
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_main.c
@@ -0,0 +1,2883 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * i40e - Intel 10/40 Gb Ethernet driver
+ *
+ * The i40e driver is the main software device driver for the Intel 40 Gb family
+ * of devices. Note that these devices come in many flavors with both 40 GbE
+ * ports and 10 GbE ports. This device is the successor to the 82599 family of
+ * devices (ixgbe).
+ *
+ * Unlike previous generations of Intel 1 GbE and 10 GbE devices, the 40 GbE
+ * devices defined in the XL710 controller (previously known as Fortville) are a
+ * rather different beast and have a small switch embedded inside of them. In
+ * addition, the way that most of the programming is done has been overhauled.
+ * As opposed to just using PCIe memory mapped registers, it also has an
+ * administrative queue which is used to communicate with firmware running on
+ * the chip.
+ *
+ * Each physical function in the hardware shows up as a device that this driver
+ * will bind to. The hardware splits many resources evenly across all of the
+ * physical functions present on the device, while other resources are instead
+ * shared across the entire card and it's up to the device driver to
+ * intelligently partition them.
+ *
+ * ------------
+ * Organization
+ * ------------
+ *
+ * This driver is made up of several files which have their own theory
+ * statements spread across them. We'll touch on the high level purpose of each
+ * file here, and then we'll get into more discussion on how the device is
+ * generally modelled with respect to the interfaces in illumos.
+ *
+ * i40e_gld.c: This file contains all of the bindings to MAC and the networking
+ *             stack.
+ *
+ * i40e_intr.c: This file contains all of the interrupt service routines and
+ *              contains logic to enable and disable interrupts on the hardware.
+ *              It also contains the logic to map hardware resources such as the
+ *              rings to and from interrupts and controls their ability to fire.
+ *
+ *              There is a big theory statement on interrupts present there.
+ *
+ * i40e_main.c: The file that you're currently in. It interfaces with the
+ *              traditional OS DDI interfaces and is in charge of configuring
+ *              the device.
+ *
+ * i40e_osdep.[ch]: These files contain interfaces and definitions needed to
+ *                  work with Intel's common code for the device.
+ *
+ * i40e_stats.c: This file contains the general work and logic around our
+ *               kstats. A theory statement on their organization and use of the
+ *               hardware exists there.
+ *
+ * i40e_sw.h: This header file contains all of the primary structure definitions
+ *            and constants that are used across the entire driver.
+ *
+ * i40e_transceiver.c: This file contains all of the logic for sending and
+ *                     receiving data. It contains all of the ring and DMA
+ *                     allocation logic, as well as, the actual interfaces to
+ *                     send and receive data.
+ *
+ *                     A big theory statement on ring management, descriptors,
+ *                     and how it ties into the OS is present there.
+ *
+ * --------------
+ * General Design
+ * --------------
+ *
+ * Before we go too far into the general way we've laid out data structures and
+ * the like, it's worth taking some time to explain how the hardware is
+ * organized. This organization informs a lot of how we do things at this time
+ * in the driver.
+ *
+ * Each physical device consists of a number of one or more ports, which are
+ * considered physical functions in the PCI sense and thus each get enumerated
+ * by the system, resulting in an instance being created and attached to. While
+ * there are many resources that are unique to each physical function eg.
+ * instance of the device, there are many that are shared across all of them.
+ * Several resources have an amount reserved for each VSI and then a static pool
+ * of resources, available for all functions on the card.
+ *
+ * The most important resource in hardware are its transmit and receive queue
+ * pairs (i40e_trqpair_t). These should be thought of as rings in GLDv3
+ * parlance. There are a set number of these on each device; however, they are
+ * statically partitioned among all of the different physical functions.
+ *
+ * 'Fortville' (the code name for this device family) is basically a switch. To
+ * map MAC addresses and other things to queues, we end up having to create
+ * Virtual Station Interfaces (VSIs) and establish forwarding rules that direct
+ * traffic to a queue. A VSI owns a collection of queues and has a series of
+ * forwarding rules that point to it. One way to think of this is to treat it
+ * like MAC does a VNIC. When MAC refers to a group, a collection of rings and
+ * classification resources, that is a VSI in i40e.
+ *
+ * The set of VSIs is shared across the entire device, though there may be some
+ * amount that are reserved to each PF. Because the GLDv3 does not let us change
+ * the number of groups dynamically, we instead statically divide this amount
+ * evenly between all the functions that exist. In addition, we have the same
+ * problem with the mac address forwarding rules. There are a static number that
+ * exist shared across all the functions.
+ *
+ * To handle both of these resources, what we end up doing is going through and
+ * determining which functions belong to the same device. Nominally one might do
+ * this by having a nexus driver; however, a prime requirement for a nexus
+ * driver is identifying the various children and activating them. While it is
+ * possible to get this information from NVRAM, we would end up duplicating a
+ * lot of the PCI enumeration logic. Really, at the end of the day, the device
+ * doesn't give us the traditional identification properties we want from a
+ * nexus driver.
+ *
+ * Instead, we rely on some properties that are guaranteed to be unique. While
+ * it might be tempting to leverage the PBA or serial number of the device from
+ * NVRAM, there is nothing that says that two devices can't be mis-programmed to
+ * have the same values in NVRAM. Instead, we uniquely identify a group of
+ * functions based on their parent in the /devices tree, their PCI bus and PCI
+ * device identifiers. Using either on their own may not be sufficient.
+ *
+ * For each unique PCI device that we encounter, we'll create a i40e_device_t.
+ * From there, because we don't have a good way to tell the GLDv3 about sharing
+ * resources between everything, we'll end up just dividing the resources
+ * evenly between all of the functions. Longer term, if we don't have to declare
+ * to the GLDv3 that these resources are shared, then we'll maintain a pool and
+ * have each PF allocate from the pool in the device, thus if only two of four
+ * ports are being used, for example, then all of the resources can still be
+ * used.
+ *
+ * -------------------------------------------
+ * Transmit and Receive Queue Pair Allocations
+ * -------------------------------------------
+ *
+ * NVRAM ends up assigning each PF its own share of the transmit and receive LAN
+ * queue pairs, we have no way of modifying it, only observing it. From there,
+ * it's up to us to map these queues to VSIs and VFs. Since we don't support any
+ * VFs at this time, we only focus on assignments to VSIs.
+ *
+ * At the moment, we use a static mapping of transmit/receive queue pairs to a
+ * given VSI (eg. rings to a group). Though in the fullness of time, we want to
+ * make this something which is fully dynamic and take advantage of documented,
+ * but not yet available functionality for adding filters based on VXLAN and
+ * other encapsulation technologies.
+ *
+ * -------------------------------------
+ * Broadcast, Multicast, and Promiscuous
+ * -------------------------------------
+ *
+ * As part of the GLDv3, we need to make sure that we can handle receiving
+ * broadcast and multicast traffic, as well as enabling promiscuous mode when
+ * requested. GLDv3 requires that all broadcast and multicast traffic be
+ * retrieved by the default group, eg. the first one. This is the same thing as
+ * the default VSI.
+ *
+ * To receive broadcast traffic, we enable it through the admin queue, rather
+ * than use one of our filters for it. For multicast traffic, we reserve a
+ * certain number of the hash filters and assign them to a given PF. When we
+ * exceed those, we then switch to using promiscuous mode for multicast traffic.
+ *
+ * More specifically, once we exceed the number of filters (indicated because
+ * the i40e_t`i40e_resources.ifr_nmcastfilt ==
+ * i40e_t`i40e_resources.ifr_nmcastfilt_used), we then instead need to toggle
+ * promiscuous mode. If promiscuous mode is toggled then we keep track of the
+ * number of MACs added to it by incrementing i40e_t`i40e_mcast_promisc_count.
+ * That will stay enabled until that count reaches zero indicating that we have
+ * only added multicast addresses that we have a corresponding entry for.
+ *
+ * Because MAC itself wants to toggle promiscuous mode, which includes both
+ * unicast and multicast traffic, we go through and keep track of that
+ * ourselves. That is maintained through the use of the i40e_t`i40e_promisc_on
+ * member.
+ *
+ * --------------
+ * VSI Management
+ * --------------
+ *
+ * At this time, we currently only support a single MAC group, and thus a single
+ * VSI. This VSI is considered the default VSI and should be the only one that
+ * exists after a reset. Currently it is stored as the member
+ * i40e_t`i40e_vsi_id. While this works for the moment and for an initial
+ * driver, it's not sufficient for the longer-term path of the driver. Instead,
+ * we'll want to actually have a unique i40e_vsi_t structure which is used
+ * everywhere. Note that this means that every place that uses the
+ * i40e_t`i40e_vsi_id will need to be refactored.
+ *
+ * ----------------
+ * Structure Layout
+ * ----------------
+ *
+ * The following image relates the core data structures together. The primary
+ * structure in the system is the i40e_t. It itself contains multiple rings,
+ * i40e_trqpair_t's which contain the various transmit and receive data. The
+ * receive data is stored outside of the i40e_trqpair_t and instead in the
+ * i40e_rx_data_t. The i40e_t has a corresponding i40e_device_t which keeps
+ * track of per-physical device state. Finally, for every active descriptor,
+ * there is a corresponding control block, which is where the
+ * i40e_rx_control_block_t and the i40e_tx_control_block_t come from.
+ *
+ *   +-----------------------+       +-----------------------+
+ *   | Global i40e_t list    |       | Global Device list    |
+ *   |                       |    +--|                       |
+ *   | i40e_glist            |    |  | i40e_dlist            |
+ *   +-----------------------+    |  +-----------------------+
+ *       |                        v
+ *       |      +------------------------+      +-----------------------+
+ *       |      | Device-wide Structure  |----->| Device-wide Structure |--> ...
+ *       |      | i40e_device_t          |      | i40e_device_t         |
+ *       |      |                        |      +-----------------------+
+ *       |      | dev_info_t *     ------+--> Parent in devices tree.
+ *       |      | uint_t           ------+--> PCI bus number
+ *       |      | uint_t           ------+--> PCI device number
+ *       |      | uint_t           ------+--> Number of functions
+ *       |      | i40e_switch_rsrcs_t ---+--> Captured total switch resources
+ *       |      | list_t           ------+-------------+
+ *       |      +------------------------+             |
+ *       |                           ^                 |
+ *       |                           +--------+        |
+ *       |                                    |        v
+ *       |  +---------------------------+     |   +-------------------+
+ *       +->| GLDv3 Device, per PF      |-----|-->| GLDv3 Device (PF) |--> ...
+ *          | i40e_t                    |     |   | i40e_t            |
+ *          | **Primary Structure**     |     |   +-------------------+
+ *          |                           |     |
+ *          | i40e_device_t *         --+-----+
+ *          | i40e_state_t            --+---> Device State
+ *          | i40e_hw_t               --+---> Intel common code structure
+ *          | mac_handle_t            --+---> GLDv3 handle to MAC
+ *          | ddi_periodic_t          --+---> Link activity timer
+ *          | int (vsi_id)            --+---> VSI ID, main identifier
+ *          | i40e_func_rsrc_t        --+---> Available hardware resources
+ *          | i40e_switch_rsrc_t *    --+---> Switch resource snapshot
+ *          | i40e_sdu                --+---> Current MTU
+ *          | i40e_frame_max          --+---> Current HW frame size
+ *          | i40e_uaddr_t *          --+---> Array of assigned unicast MACs
+ *          | i40e_maddr_t *          --+---> Array of assigned multicast MACs
+ *          | i40e_mcast_promisccount --+---> Active multicast state
+ *          | i40e_promisc_on         --+---> Current promiscuous mode state
+ *          | int                     --+---> Number of transmit/receive pairs
+ *          | kstat_t *               --+---> PF kstats
+ *          | kstat_t *               --+---> VSI kstats
+ *          | i40e_pf_stats_t         --+---> PF kstat backing data
+ *          | i40e_vsi_stats_t        --+---> VSI kstat backing data
+ *          | i40e_trqpair_t *        --+---------+
+ *          +---------------------------+         |
+ *                                                |
+ *                                                v
+ *  +-------------------------------+       +-----------------------------+
+ *  | Transmit/Receive Queue Pair   |-------| Transmit/Receive Queue Pair |->...
+ *  | i40e_trqpair_t                |       | i40e_trqpair_t              |
+ *  + Ring Data Structure           |       +-----------------------------+
+ *  |                               |
+ *  | mac_ring_handle_t             +--> MAC RX ring handle
+ *  | mac_ring_handle_t             +--> MAC TX ring handle
+ *  | i40e_rxq_stat_t             --+--> RX Queue stats
+ *  | i40e_txq_stat_t             --+--> TX Queue stats
+ *  | uint32_t (tx ring size)       +--> TX Ring Size
+ *  | uint32_t (tx free list size)  +--> TX Free List Size
+ *  | i40e_dma_buffer_t     --------+--> TX Descriptor ring DMA
+ *  | i40e_tx_desc_t *      --------+--> TX descriptor ring
+ *  | volatile uint32_t *           +--> TX Write back head
+ *  | uint32_t               -------+--> TX ring head
+ *  | uint32_t               -------+--> TX ring tail
+ *  | uint32_t               -------+--> Num TX desc free
+ *  | i40e_tx_control_block_t *   --+--> TX control block array  ---+
+ *  | i40e_tx_control_block_t **  --+--> TCB work list          ----+
+ *  | i40e_tx_control_block_t **  --+--> TCB free list           ---+
+ *  | uint32_t               -------+--> Free TCB count             |
+ *  | i40e_rx_data_t *       -------+--+                            v
+ *  +-------------------------------+  |          +---------------------------+
+ *                                     |          | Per-TX Frame Metadata     |
+ *                                     |          | i40e_tx_control_block_t   |
+ *                +--------------------+          |                           |
+ *                |           mblk to transmit <--+---      mblk_t *          |
+ *                |           type of transmit <--+---      i40e_tx_type_t    |
+ *                |              TX DMA handle <--+---      ddi_dma_handle_t  |
+ *                v              TX DMA buffer <--+---      i40e_dma_buffer_t |
+ *    +------------------------------+            +---------------------------+
+ *    | Core Receive Data            |
+ *    | i40e_rx_data_t               |
+ *    |                              |
+ *    | i40e_dma_buffer_t          --+--> RX descriptor DMA Data
+ *    | i40e_rx_desc_t             --+--> RX descriptor ring
+ *    | uint32_t                   --+--> Next free desc.
+ *    | i40e_rx_control_block_t *  --+--> RX Control Block Array  ---+
+ *    | i40e_rx_control_block_t ** --+--> RCB work list           ---+
+ *    | i40e_rx_control_block_t ** --+--> RCB free list           ---+
+ *    +------------------------------+                               |
+ *                ^                                                  |
+ *                |     +---------------------------+                |
+ *                |     | Per-RX Frame Metadata     |<---------------+
+ *                |     | i40e_rx_control_block_t   |
+ *                |     |                           |
+ *                |     | mblk_t *              ----+--> Received mblk_t data
+ *                |     | uint32_t              ----+--> Reference count
+ *                |     | i40e_dma_buffer_t     ----+--> Receive data DMA info
+ *                |     | frtn_t                ----+--> mblk free function info
+ *                +-----+-- i40e_rx_data_t *        |
+ *                      +---------------------------+
+ *
+ * -------------
+ * Lock Ordering
+ * -------------
+ *
+ * In order to ensure that we don't deadlock, the following represents the
+ * lock order being used. When grabbing locks, follow the following order. Lower
+ * numbers are more important. Thus, the i40e_glock which is number 0, must be
+ * taken before any other locks in the driver. On the other hand, the
+ * i40e_t`i40e_stat_lock, has the highest number because it's the least
+ * important lock. Note, that just because one lock is higher than another does
+ * not mean that all intermediary locks are required.
+ *
+ * 0) i40e_glock
+ * 1) i40e_t`i40e_general_lock
+ *
+ * 2) i40e_trqpair_t`itrq_rx_lock
+ * 3) i40e_trqpair_t`itrq_tx_lock
+ * 4) i40e_t`i40e_rx_pending_lock
+ * 5) i40e_trqpair_t`itrq_tcb_lock
+ *
+ * 6) i40e_t`i40e_stat_lock
+ *
+ * Rules and expectations:
+ *
+ * 1) A thread holding locks belonging to one PF should not hold locks belonging
+ * to a second. If for some reason this becomes necessary, locks should be
+ * grabbed based on the list order in the i40e_device_t, which implies that the
+ * i40e_glock is held.
+ *
+ * 2) When grabbing locks between multiple transmit and receive queues, the
+ * locks for the lowest number transmit/receive queue should be grabbed first.
+ *
+ * 3) When grabbing both the transmit and receive lock for a given queue, always
+ * grab i40e_trqpair_t`itrq_rx_lock before the i40e_trqpair_t`itrq_tx_lock.
+ *
+ * 4) The following pairs of locks are not expected to be held at the same time:
+ *
+ * o i40e_t`i40e_rx_pending_lock and i40e_trqpair_t`itrq_tcb_lock
+ *
+ * -----------
+ * Future Work
+ * -----------
+ *
+ * At the moment the i40e driver is rather bare bones, allowing us to start
+ * getting data flowing and folks using it while we develop additional features.
+ * While bugs have been filed to cover this future work, the following gives an
+ * overview of expected work:
+ *
+ *  o TSO support
+ *  o RSS / multiple ring support
+ *  o Multiple group support
+ *  o DMA binding and breaking up the locking in ring recycling.
+ *  o Enhanced detection of device errors
+ *  o Participation in IRM
+ *  o FMA device reset
+ *  o Stall detection, temperature error detection, etc.
+ *  o More dynamic resource pools
+ */
+
+#include "i40e_sw.h"
+
+static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.0";
+
+/*
+ * The i40e_glock primarily protects the lists below and the i40e_device_t
+ * structures. It is taken before any other lock in the driver.
+ */
+static kmutex_t i40e_glock;
+static list_t i40e_glist;
+static list_t i40e_dlist;
+
+/*
+ * Register access attributes; i40e_fm_init() rewrites devacc_attr_access.
+ */
+static ddi_device_acc_attr_t i40e_regs_acc_attr = {
+	DDI_DEVICE_ATTR_V1,
+	DDI_STRUCTURE_LE_ACC,
+	DDI_STRICTORDER_ACC,
+	DDI_FLAGERR_ACC
+};
+
+/*
+ * Logging backend for this driver. The message goes through dev_err(), or
+ * through cmn_err() tagged with I40E_MODULE_NAME when there is no i40e_t. A
+ * leading '!' is put on the format unless console is set; see cmn_err(9F).
+ */
+static void
+i40e_dev_err(i40e_t *i40e, int level, boolean_t console, const char *fmt,
+    va_list ap)
+{
+	char msg[1024];
+
+	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
+
+	if (i40e != NULL) {
+		dev_err(i40e->i40e_dip, level, console ? "%s" : "!%s", msg);
+		return;
+	}
+	cmn_err(level, console ? "%s: %s" : "!%s: %s", I40E_MODULE_NAME, msg);
+}
+
+/*
+ * The C preprocessor's trailing-comma problem with variable arguments means a
+ * macro won't do, so each log level is instantiated as its own function.
+ * i40e_error() logs at CE_WARN with the console flag B_FALSE.
+ */
+void
+i40e_error(i40e_t *i40e, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	i40e_dev_err(i40e, CE_WARN, B_FALSE, fmt, args);
+	va_end(args);
+}
+
+/* Log at CE_NOTE with the console flag B_FALSE. */
+void
+i40e_log(i40e_t *i40e, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	i40e_dev_err(i40e, CE_NOTE, B_FALSE, fmt, args);
+	va_end(args);
+}
+
+/* Log at CE_NOTE with the console flag B_TRUE. */
+void
+i40e_notice(i40e_t *i40e, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	i40e_dev_err(i40e, CE_NOTE, B_TRUE, fmt, args);
+	va_end(args);
+}
+
+/* Drop this instance from its i40e_device_t; the last one out frees it. */
+static void
+i40e_device_rele(i40e_t *i40e)
+{
+	i40e_device_t *dev = i40e->i40e_device;
+
+	if (dev == NULL)
+		return;
+
+	mutex_enter(&i40e_glock);
+	VERIFY(dev->id_nreg > 0);
+	list_remove(&dev->id_i40e_list, i40e);
+	if (--dev->id_nreg == 0) {
+		size_t rsz = sizeof (i40e_switch_rsrc_t) * dev->id_rsrcs_alloc;
+		list_remove(&i40e_dlist, dev);
+		list_destroy(&dev->id_i40e_list);
+		kmem_free(dev->id_rsrcs, rsz);
+		kmem_free(dev, sizeof (*dev));
+	}
+	i40e->i40e_device = NULL;
+	mutex_exit(&i40e_glock);
+}
+
+/* Find or create the i40e_device_t for this PCI device; register i40e. */
+static i40e_device_t *
+i40e_device_find(i40e_t *i40e, dev_info_t *parent, uint_t bus, uint_t device)
+{
+	i40e_device_t *dev;
+
+	mutex_enter(&i40e_glock);
+	dev = list_head(&i40e_dlist);
+	while (dev != NULL) {
+		if (dev->id_parent == parent && dev->id_pci_bus == bus &&
+		    dev->id_pci_device == device) {
+			break;
+		}
+		dev = list_next(&i40e_dlist, dev);
+	}
+
+	if (dev == NULL) {
+		i40e_hw_t *hw = &i40e->i40e_hw_space;
+		size_t rsz;
+
+		ASSERT(hw->num_ports > 0);
+		ASSERT(hw->num_partitions > 0);
+		/*
+		 * The Intel common code doesn't keep the PCI function count
+		 * itself, so undo its calculation: functions are spread evenly
+		 * across ports in the rare case of partitions.
+		 */
+		dev = kmem_alloc(sizeof (*dev), KM_SLEEP);
+		dev->id_parent = parent;
+		dev->id_pci_bus = bus;
+		dev->id_pci_device = device;
+		dev->id_nfuncs = hw->num_ports * hw->num_partitions;
+		dev->id_nreg = 1;
+		dev->id_rsrcs_alloc = i40e->i40e_switch_rsrc_alloc;
+		dev->id_rsrcs_act = i40e->i40e_switch_rsrc_actual;
+		rsz = sizeof (i40e_switch_rsrc_t) * dev->id_rsrcs_alloc;
+		dev->id_rsrcs = kmem_alloc(rsz, KM_SLEEP);
+		bcopy(i40e->i40e_switch_rsrcs, dev->id_rsrcs, rsz);
+		list_create(&dev->id_i40e_list, sizeof (i40e_t),
+		    offsetof(i40e_t, i40e_dlink));
+		list_insert_tail(&i40e_dlist, dev);
+	} else {
+		VERIFY(dev->id_nreg < dev->id_nfuncs);
+		dev->id_nreg++;
+	}
+
+	list_insert_tail(&dev->id_i40e_list, i40e);
+	mutex_exit(&i40e_glock);
+
+	return (dev);
+}
+
+/* Cache a changed link state and pass it on to mac_link_update(). */
+static void
+i40e_link_state_set(i40e_t *i40e, link_state_t state)
+{
+	if (i40e->i40e_link_state != state) {
+		i40e->i40e_link_state = state;
+		mac_link_update(i40e->i40e_mac_hdl, i40e->i40e_link_state);
+	}
+}
+
+/*
+ * This is a basic link check routine. It calls i40e_get_link_status() and
+ * refreshes our cached link speed, duplex, and state to match the result.
+ * The caller must hold the i40e_general_lock.
+ */
+void
+i40e_link_check(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	enum i40e_aq_link_speed speed;
+	boolean_t link_up;
+	int rc;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	hw->phy.get_link_info = B_TRUE;
+	rc = i40e_get_link_status(hw, &link_up);
+	if (rc != I40E_SUCCESS) {
+		/* Count the failure and remember the last error code. */
+		i40e->i40e_s_link_status_errs++;
+		i40e->i40e_s_link_status_lasterr = rc;
+		return;
+	}
+
+	/*
+	 * Firmware abstracts all of the mac and phy information for us, so we
+	 * can use i40e_get_link_status to determine the current state.
+	 */
+	if (link_up != B_TRUE) {
+		i40e->i40e_link_speed = 0;
+		i40e->i40e_link_duplex = LINK_DUPLEX_UNKNOWN;
+		i40e_link_state_set(i40e, LINK_STATE_DOWN);
+		return;
+	}
+
+	/*
+	 * Translate from an i40e value to a value in Mbits/s.
+	 */
+	speed = i40e_get_link_speed(hw);
+	switch (speed) {
+	case I40E_LINK_SPEED_100MB:
+		i40e->i40e_link_speed = 100;
+		break;
+	case I40E_LINK_SPEED_1GB:
+		i40e->i40e_link_speed = 1000;
+		break;
+	case I40E_LINK_SPEED_10GB:
+		i40e->i40e_link_speed = 10000;
+		break;
+	case I40E_LINK_SPEED_20GB:
+		i40e->i40e_link_speed = 20000;
+		break;
+	case I40E_LINK_SPEED_40GB:
+		i40e->i40e_link_speed = 40000;
+		break;
+	default:
+		i40e->i40e_link_speed = 0;
+		break;
+	}
+
+	/*
+	 * At this time, hardware does not support half-duplex operation, hence
+	 * why we don't ask the hardware about our current duplex.
+	 */
+	i40e->i40e_link_duplex = LINK_DUPLEX_FULL;
+	i40e_link_state_set(i40e, LINK_STATE_UP);
+}
+
+/* Free every interrupt handle, then the handle array; failures are logged. */
+static void
+i40e_rem_intrs(i40e_t *i40e)
+{
+	int i, err;
+
+	for (i = 0; i < i40e->i40e_intr_count; i++) {
+		err = ddi_intr_free(i40e->i40e_intr_handles[i]);
+		if (err == DDI_SUCCESS)
+			continue;
+		i40e_log(i40e, "failed to free interrupt %d: %d", i, err);
+	}
+
+	kmem_free(i40e->i40e_intr_handles, i40e->i40e_intr_size);
+	i40e->i40e_intr_handles = NULL;
+}
+
+/* Remove the handler from each interrupt; a failure is only logged. */
+static void
+i40e_rem_intr_handlers(i40e_t *i40e)
+{
+	int i, err;
+
+	for (i = 0; i < i40e->i40e_intr_count; i++) {
+		err = ddi_intr_remove_handler(i40e->i40e_intr_handles[i]);
+		if (err == DDI_SUCCESS)
+			continue;
+		i40e_log(i40e, "failed to remove interrupt %d: %d", i, err);
+	}
+}
+
+/*
+ * illumos Fault Management Architecture (FMA) support.
+ */
+
+/* Fetch, then clear, the access handle's FM error state; return its status. */
+int
+i40e_check_acc_handle(ddi_acc_handle_t handle)
+{
+	ddi_fm_error_t fme;
+	ddi_fm_acc_err_get(handle, &fme, DDI_FME_VERSION);
+	ddi_fm_acc_err_clear(handle, DDI_FME_VERSION);
+	return (fme.fme_status);
+}
+
+/* Return the FM error status fetched for a DMA handle (not cleared). */
+int
+i40e_check_dma_handle(ddi_dma_handle_t handle)
+{
+	ddi_fm_error_t fme;
+	ddi_fm_dma_err_get(handle, &fme, DDI_FME_VERSION);
+	return (fme.fme_status);
+}
+
+/*
+ * FM error callback: post a PCI ereport for err and return its fme_status.
+ */
+/* ARGSUSED */
+static int
+i40e_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data)
+{
+	pci_ereport_post(dip, err, NULL);
+	return (err->fme_status);
+}
+
+/*
+ * Read the "fm_capable" property, clamp it, and register with the FM framework
+ * to match. The register access and DMA attributes are set up along the way.
+ */
+static void
+i40e_fm_init(i40e_t *i40e)
+{
+	const int all_caps = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
+	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
+	ddi_iblock_cookie_t iblk;
+	int caps;
+
+	caps = ddi_prop_get_int(DDI_DEV_T_ANY, i40e->i40e_dip,
+	    DDI_PROP_DONTPASS, "fm_capable", all_caps);
+	if (caps < 0)
+		caps = 0;
+	else if (caps > 0xf)
+		caps = all_caps;
+	i40e->i40e_fm_capabilities = caps;
+
+	/* Flag register access errors only if ACCCHK was requested. */
+	i40e_regs_acc_attr.devacc_attr_access =
+	    (caps & DDI_FM_ACCCHK_CAPABLE) ? DDI_FLAGERR_ACC : DDI_DEFAULT_ACC;
+
+	/*
+	 * Only register with IO Fault Services if we have some capability
+	 */
+	if (i40e->i40e_fm_capabilities) {
+		ddi_fm_init(i40e->i40e_dip, &i40e->i40e_fm_capabilities, &iblk);
+
+		if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities) ||
+		    DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) {
+			pci_ereport_setup(i40e->i40e_dip);
+		}
+
+		if (DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) {
+			ddi_fm_handler_register(i40e->i40e_dip,
+			    i40e_fm_error_cb, (void *)i40e);
+		}
+	}
+
+	/* Re-read the field; ddi_fm_init() was handed its address. */
+	if (i40e->i40e_fm_capabilities & DDI_FM_DMACHK_CAPABLE)
+		i40e_init_dma_attrs(i40e, B_TRUE);
+	else
+		i40e_init_dma_attrs(i40e, B_FALSE);
+}
+
+/* Undo i40e_fm_init(): tear down ereports, the error callback, and FM. */
+static void
+i40e_fm_fini(i40e_t *i40e)
+{
+	int caps = i40e->i40e_fm_capabilities;
+
+	if (caps == 0)
+		return;
+
+	if (DDI_FM_EREPORT_CAP(caps) || DDI_FM_ERRCB_CAP(caps))
+		pci_ereport_teardown(i40e->i40e_dip);
+	if (DDI_FM_ERRCB_CAP(caps))
+		ddi_fm_handler_unregister(i40e->i40e_dip);
+	ddi_fm_fini(i40e->i40e_dip);
+}
+
+/* Post a "DDI_FM_DEVICE.detail" class ereport when ereport capable. */
+void
+i40e_fm_ereport(i40e_t *i40e, char *detail)
+{
+	char ecls[FM_MAX_CLASS];
+	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);
+
+	(void) snprintf(ecls, sizeof (ecls), "%s.%s", DDI_FM_DEVICE, detail);
+	if (!DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities))
+		return;
+	ddi_fm_ereport_post(i40e->i40e_dip, ecls, ena, DDI_NOSLEEP,
+	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL);
+}
+
+/*
+ * Here we're trying to get the ID of the default VSI. Shortly after attach we
+ * expect the switch configuration to report exactly one element, the default
+ * VSI; each PF seems to not see any other devices, in part because of the
+ * simple switch mode that we're using. If we ever see more, we'll need to
+ * revisit this. Returns the SEID in host byte order, or -1 on failure. The
+ * response fields are little-endian, so they are converted with LE_16().
+ */
+static int
+i40e_get_vsi_id(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	struct i40e_aqc_get_switch_config_resp *sw_config;
+	uint8_t aq_buf[I40E_AQ_LARGE_BUF];
+	uint16_t next = 0;
+	int rc;
+
+	/* LINTED: E_BAD_PTR_CAST_ALIGN */
+	sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf;
+	rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next,
+	    NULL);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d\n",
+		    rc, hw->aq.asq_last_status);
+		return (-1);
+	}
+
+	if (LE_16(sw_config->header.num_reported) != 1) {
+		i40e_error(i40e, "encountered multiple (%d) switching units "
+		    "during attach, not proceeding",
+		    LE_16(sw_config->header.num_reported));
+		return (-1);
+	}
+
+	return (LE_16(sw_config->element[0].seid));
+}
+
+/*
+ * We need to fill the i40e_hw_t structure with the capabilities of this PF. We
+ * must also provide the memory for it; however, we don't need to keep it around
+ * after the call to the common code, which parses it into an internal
+ * structure. The buffer is resized to "needed" and retried once on ENOMEM.
+ */
+static boolean_t
+i40e_get_hw_capabilities(i40e_t *i40e, i40e_hw_t *hw)
+{
+	struct i40e_aqc_list_capabilities_element_resp *buf;
+	int rc;
+	size_t len;
+	uint16_t needed;
+	int nelems = I40E_HW_CAP_DEFAULT;
+
+	for (;;) {
+		len = nelems * sizeof (*buf);
+		ASSERT(len > 0);
+		buf = kmem_alloc(len, KM_SLEEP);
+		rc = i40e_aq_discover_capabilities(hw, buf, len,
+		    &needed, i40e_aqc_opc_list_func_capabilities, NULL);
+		kmem_free(buf, len);
+
+		if (hw->aq.asq_last_status == I40E_AQ_RC_ENOMEM &&
+		    nelems == I40E_HW_CAP_DEFAULT) {
+			if (nelems == needed) {
+				i40e_error(i40e, "Capability discovery failed "
+				    "due to byzantine common code");
+				return (B_FALSE);
+			}
+			nelems = needed;
+			continue;
+		} else if (hw->aq.asq_last_status != I40E_AQ_RC_OK) {
+			i40e_error(i40e, "Capability discovery failed: %d", rc);
+			return (B_FALSE);
+		}
+
+		break;
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Obtain the switch's capabilities as seen by this PF and keep it around for
+ * our later use. The buffer is grown for as long as firmware answers EINVAL.
+ */
+static boolean_t
+i40e_get_switch_resources(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint8_t cnt = 2;
+	uint8_t act;
+	size_t size;
+	i40e_switch_rsrc_t *buf;
+
+	for (;;) {
+		enum i40e_status_code ret;
+		size = cnt * sizeof (i40e_switch_rsrc_t);
+		ASSERT(size > 0);
+		if (size > UINT16_MAX)
+			return (B_FALSE);
+		buf = kmem_alloc(size, KM_SLEEP);
+		ret = i40e_aq_get_switch_resource_alloc(hw, &act, buf,
+		    cnt, NULL);
+		if (ret == I40E_ERR_ADMIN_QUEUE_ERROR &&
+		    hw->aq.asq_last_status == I40E_AQ_RC_EINVAL) {
+			kmem_free(buf, size);
+			/* cnt is a uint8_t; give up rather than let it wrap. */
+			if (cnt > UINT8_MAX - I40E_SWITCH_CAP_DEFAULT)
+				return (B_FALSE);
+			cnt += I40E_SWITCH_CAP_DEFAULT;
+			continue;
+		} else if (ret != I40E_SUCCESS) {
+			kmem_free(buf, size);
+			i40e_error(i40e,
+			    "failed to retrieve switch statistics: %d\n", ret);
+			return (B_FALSE);
+		}
+		break;
+	}
+
+	i40e->i40e_switch_rsrc_alloc = cnt;
+	i40e->i40e_switch_rsrc_actual = act;
+	i40e->i40e_switch_rsrcs = buf;
+	return (B_TRUE);
+}
+
+/*
+ * Release whatever i40e_get_available_resources() managed to set up: the
+ * unicast and multicast address tables, the switch resource snapshot, and our
+ * hold on the shared device structure. Each piece is only released if it is
+ * present, so this may be called after a partial setup.
+ */
+static void
+i40e_cleanup_resources(i40e_t *i40e)
+{
+	size_t len;
+
+	if (i40e->i40e_uaddrs != NULL) {
+		len = sizeof (i40e_uaddr_t) *
+		    i40e->i40e_resources.ifr_nmacfilt;
+		kmem_free(i40e->i40e_uaddrs, len);
+		i40e->i40e_uaddrs = NULL;
+	}
+
+	if (i40e->i40e_maddrs != NULL) {
+		len = sizeof (i40e_maddr_t) *
+		    i40e->i40e_resources.ifr_nmcastfilt;
+		kmem_free(i40e->i40e_maddrs, len);
+		i40e->i40e_maddrs = NULL;
+	}
+
+	if (i40e->i40e_switch_rsrcs != NULL) {
+		len = sizeof (i40e_switch_rsrc_t) *
+		    i40e->i40e_switch_rsrc_alloc;
+		ASSERT(len > 0);
+		kmem_free(i40e->i40e_switch_rsrcs, len);
+		i40e->i40e_switch_rsrcs = NULL;
+	}
+
+	if (i40e->i40e_device != NULL)
+		i40e_device_rele(i40e);
+}
+
+/*
+ * Determine how many VSIs, MAC filters, multicast filters and queue pairs this
+ * instance may use, and allocate the unicast and multicast address tables
+ * sized from those counts. Along the way we record our PCI device and
+ * function numbers in the i40e_hw_t and attach ourselves to the shared
+ * i40e_device_t for this bus/device.
+ *
+ * Returns B_FALSE if the "reg" property cannot be read or is empty, or if the
+ * switch resources cannot be retrieved. Returns B_TRUE otherwise. Anything
+ * allocated here is released by i40e_cleanup_resources().
+ */
+static boolean_t
+i40e_get_available_resources(i40e_t *i40e)
+{
+	dev_info_t *parent;
+	uint16_t bus, device, func;
+	uint_t nregs;
+	int *regs, i;
+	i40e_device_t *idp;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	parent = ddi_get_parent(i40e->i40e_dip);
+
+	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, i40e->i40e_dip, 0, "reg",
+	    &regs, &nregs) != DDI_PROP_SUCCESS) {
+		return (B_FALSE);
+	}
+
+	if (nregs < 1) {
+		ddi_prop_free(regs);
+		return (B_FALSE);
+	}
+
+	/* The first "reg" entry encodes our bus, device and function. */
+	bus = PCI_REG_BUS_G(regs[0]);
+	device = PCI_REG_DEV_G(regs[0]);
+	func = PCI_REG_FUNC_G(regs[0]);
+	ddi_prop_free(regs);
+
+	i40e->i40e_hw_space.bus.func = func;
+	i40e->i40e_hw_space.bus.device = device;
+
+	if (i40e_get_switch_resources(i40e) == B_FALSE) {
+		return (B_FALSE);
+	}
+
+	/*
+	 * To calculate the total amount of a resource we have available, we
+	 * need to add how many our i40e_t thinks it has guaranteed, if any, and
+	 * then we need to go through and divide the number of available on the
+	 * device, which was snapshotted before anyone should have allocated
+	 * anything, and use that to derive how many are available from the
+	 * pool. Longer term, we may want to turn this into something that's
+	 * more of a pool-like resource that everything can share (though that
+	 * may require some more assistance from MAC).
+	 *
+	 * Though for transmit and receive queue pairs, we just have to ask
+	 * firmware instead.
+	 */
+	idp = i40e_device_find(i40e, parent, bus, device);
+	i40e->i40e_device = idp;
+	i40e->i40e_resources.ifr_nvsis = 0;
+	i40e->i40e_resources.ifr_nvsis_used = 0;
+	i40e->i40e_resources.ifr_nmacfilt = 0;
+	i40e->i40e_resources.ifr_nmacfilt_used = 0;
+	i40e->i40e_resources.ifr_nmcastfilt = 0;
+	i40e->i40e_resources.ifr_nmcastfilt_used = 0;
+
+	/* First pass: what this function has been guaranteed and is using. */
+	for (i = 0; i < i40e->i40e_switch_rsrc_actual; i++) {
+		i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i];
+
+		switch (srp->resource_type) {
+		case I40E_AQ_RESOURCE_TYPE_VSI:
+			i40e->i40e_resources.ifr_nvsis +=
+			    LE_16(srp->guaranteed);
+			i40e->i40e_resources.ifr_nvsis_used = LE_16(srp->used);
+			break;
+		case I40E_AQ_RESOURCE_TYPE_MACADDR:
+			i40e->i40e_resources.ifr_nmacfilt +=
+			    LE_16(srp->guaranteed);
+			i40e->i40e_resources.ifr_nmacfilt_used =
+			    LE_16(srp->used);
+			break;
+		case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH:
+			i40e->i40e_resources.ifr_nmcastfilt +=
+			    LE_16(srp->guaranteed);
+			i40e->i40e_resources.ifr_nmcastfilt_used =
+			    LE_16(srp->used);
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * Second pass: add an even share (one per function on the device) of
+	 * the unallocated pool.
+	 *
+	 * NOTE(review): this loop is bounded by the device's id_rsrcs_act but
+	 * reads entries from this instance's own i40e_switch_rsrcs table. The
+	 * comment above suggests the device-wide snapshot was the intended
+	 * source; confirm against i40e_device_find() before changing it.
+	 */
+	for (i = 0; i < idp->id_rsrcs_act; i++) {
+		i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i];
+		switch (srp->resource_type) {
+		case I40E_AQ_RESOURCE_TYPE_VSI:
+			i40e->i40e_resources.ifr_nvsis +=
+			    LE_16(srp->total_unalloced) / idp->id_nfuncs;
+			break;
+		case I40E_AQ_RESOURCE_TYPE_MACADDR:
+			i40e->i40e_resources.ifr_nmacfilt +=
+			    LE_16(srp->total_unalloced) / idp->id_nfuncs;
+			break;
+		case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH:
+			i40e->i40e_resources.ifr_nmcastfilt +=
+			    LE_16(srp->total_unalloced) / idp->id_nfuncs;
+			break;
+		default:
+			break;
+		}
+	}
+
+	i40e->i40e_resources.ifr_nrx_queue = hw->func_caps.num_rx_qp;
+	i40e->i40e_resources.ifr_ntx_queue = hw->func_caps.num_tx_qp;
+
+	i40e->i40e_uaddrs = kmem_zalloc(sizeof (i40e_uaddr_t) *
+	    i40e->i40e_resources.ifr_nmacfilt, KM_SLEEP);
+	i40e->i40e_maddrs = kmem_zalloc(sizeof (i40e_maddr_t) *
+	    i40e->i40e_resources.ifr_nmcastfilt, KM_SLEEP);
+
+	/*
+	 * Initialize each unicast slot with a multicast address (first octet
+	 * 0x01) to mark it as invalid for sanity purposes. Think of it like
+	 * 0xdeadbeef.
+	 */
+	for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt; i++)
+		i40e->i40e_uaddrs[i].iua_mac[0] = 0x01;
+
+	return (B_TRUE);
+}
+
+/*
+ * Enable every allocated interrupt, using the block interface when the
+ * interrupt capabilities advertise it and one-at-a-time otherwise. If an
+ * individual enable fails, the vectors enabled so far are disabled again
+ * before B_FALSE is returned.
+ */
+static boolean_t
+i40e_enable_interrupts(i40e_t *i40e)
+{
+	int idx, ret;
+
+	if ((i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
+		ret = ddi_intr_block_enable(i40e->i40e_intr_handles,
+		    i40e->i40e_intr_count);
+		if (ret == DDI_SUCCESS)
+			return (B_TRUE);
+
+		i40e_error(i40e, "Interrupt block-enable failed: %d", ret);
+		return (B_FALSE);
+	}
+
+	for (idx = 0; idx < i40e->i40e_intr_count; idx++) {
+		ret = ddi_intr_enable(i40e->i40e_intr_handles[idx]);
+		if (ret == DDI_SUCCESS)
+			continue;
+
+		i40e_error(i40e, "Failed to enable interrupt %d: %d",
+		    idx, ret);
+
+		/* Unwind the vectors that were successfully enabled. */
+		for (idx--; idx >= 0; idx--)
+			(void) ddi_intr_disable(i40e->i40e_intr_handles[idx]);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Disable every allocated interrupt, using the block interface when the
+ * interrupt capabilities advertise it and one-at-a-time otherwise. We stop at
+ * the first failure and report it; remaining vectors are left untouched.
+ */
+static boolean_t
+i40e_disable_interrupts(i40e_t *i40e)
+{
+	int idx, ret;
+
+	if ((i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) != 0) {
+		ret = ddi_intr_block_disable(i40e->i40e_intr_handles,
+		    i40e->i40e_intr_count);
+		if (ret == DDI_SUCCESS)
+			return (B_TRUE);
+
+		i40e_error(i40e, "Interrupt block-disabled failed: %d", ret);
+		return (B_FALSE);
+	}
+
+	for (idx = 0; idx < i40e->i40e_intr_count; idx++) {
+		ret = ddi_intr_disable(i40e->i40e_intr_handles[idx]);
+		if (ret != DDI_SUCCESS) {
+			i40e_error(i40e, "Failed to disable interrupt %d: %d",
+			    idx, ret);
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Undo i40e_alloc_trqpairs(): destroy the per-ring locks, free the array of
+ * transmit/receive queue pairs if it exists, and then destroy the instance-wide
+ * condition variable and locks.
+ */
+static void
+i40e_free_trqpairs(i40e_t *i40e)
+{
+	if (i40e->i40e_trqpairs != NULL) {
+		size_t len = sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs;
+		int qp;
+
+		for (qp = 0; qp < i40e->i40e_num_trqpairs; qp++) {
+			i40e_trqpair_t *pair = &i40e->i40e_trqpairs[qp];
+
+			mutex_destroy(&pair->itrq_rx_lock);
+			mutex_destroy(&pair->itrq_tx_lock);
+			mutex_destroy(&pair->itrq_tcb_lock);
+
+			/*
+			 * The ring kstats are expected to have been torn down
+			 * already by the start/stop paths.
+			 */
+			ASSERT(pair->itrq_txkstat == NULL);
+			ASSERT(pair->itrq_rxkstat == NULL);
+		}
+
+		kmem_free(i40e->i40e_trqpairs, len);
+		i40e->i40e_trqpairs = NULL;
+	}
+
+	cv_destroy(&i40e->i40e_rx_pending_cv);
+	mutex_destroy(&i40e->i40e_rx_pending_lock);
+	mutex_destroy(&i40e->i40e_general_lock);
+}
+
+/*
+ * Set up the instance-wide locks and condition variable, then allocate the
+ * array of transmit/receive queue pairs and initialize each entry's back
+ * pointer, index and locks. All mutexes are initialized at the interrupt
+ * priority recorded in the i40e_t. Always returns B_TRUE.
+ */
+static boolean_t
+i40e_alloc_trqpairs(i40e_t *i40e)
+{
+	void *pri = DDI_INTR_PRI(i40e->i40e_intr_pri);
+	int qp;
+
+	/* Instance-wide synchronization primitives come first. */
+	mutex_init(&i40e->i40e_general_lock, NULL, MUTEX_DRIVER, pri);
+	mutex_init(&i40e->i40e_rx_pending_lock, NULL, MUTEX_DRIVER, pri);
+	cv_init(&i40e->i40e_rx_pending_cv, NULL, CV_DRIVER, NULL);
+
+	i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) *
+	    i40e->i40e_num_trqpairs, KM_SLEEP);
+
+	for (qp = 0; qp < i40e->i40e_num_trqpairs; qp++) {
+		i40e_trqpair_t *pair = &i40e->i40e_trqpairs[qp];
+
+		pair->itrq_i40e = i40e;
+		pair->itrq_index = qp;
+		mutex_init(&pair->itrq_rx_lock, NULL, MUTEX_DRIVER, pri);
+		mutex_init(&pair->itrq_tx_lock, NULL, MUTEX_DRIVER, pri);
+		mutex_init(&pair->itrq_tcb_lock, NULL, MUTEX_DRIVER, pri);
+	}
+
+	return (B_TRUE);
+}
+
+
+
+/*
+ * Fill in the queue pair and receive group counts if nothing has set them
+ * yet; a zero value means "not configured".
+ *
+ * However, at the moment, we cap all of these resources as we only support a
+ * single receive ring and a single group, so the hardware report passed in is
+ * not consulted.
+ */
+/* ARGSUSED */
+static void
+i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw)
+{
+	if (i40e->i40e_num_trqpairs == 0)
+		i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX;
+
+	if (i40e->i40e_num_rx_groups == 0)
+		i40e->i40e_num_rx_groups = I40E_GROUP_MAX;
+}
+
+/*
+ * Shut down the pieces of the Intel common code that we brought up: first the
+ * LAN HMC, then the admin queue. A failure of either step is logged and
+ * teardown continues regardless.
+ */
+static void
+i40e_common_code_fini(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	int status;
+
+	status = i40e_shutdown_lan_hmc(hw);
+	if (status != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to shutdown LAN hmc: %d", status);
+	}
+
+	status = i40e_shutdown_adminq(hw);
+	if (status != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to shutdown admin queue: %d", status);
+	}
+}
+
+/*
+ * Initialize and call Intel common-code routines, includes some setup
+ * the common code expects from the driver.  Also prints on failure, so
+ * the caller doesn't have to.
+ *
+ * The sequence is: reset the PF, initialize the shared code, bring up the
+ * admin queue, check the firmware API version, discover capabilities and
+ * resources, initialize and configure the LAN HMC, stop LLDP, read and
+ * validate the MAC addresses, and finally look up our VSI ID.
+ *
+ * Returns B_TRUE if every step succeeded and B_FALSE at the first failure.
+ *
+ * NOTE(review): failure paths after i40e_init_adminq() return without undoing
+ * the earlier steps here; presumably the caller is responsible for unwinding.
+ * Confirm against the attach path.
+ */
+static boolean_t
+i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw)
+{
+	int rc;
+
+	i40e_clear_hw(hw);
+	rc = i40e_pf_reset(hw);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to reset hardware: %d", rc);
+		i40e_fm_ereport(i40e, DDI_FM_DEVICE_NO_RESPONSE);
+		return (B_FALSE);
+	}
+
+	rc = i40e_init_shared_code(hw);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to initialize i40e core: %d", rc);
+		return (B_FALSE);
+	}
+
+	/* Size the admin send and receive queues before bringing them up. */
+	hw->aq.num_arq_entries = I40E_DEF_ADMINQ_SIZE;
+	hw->aq.num_asq_entries =  I40E_DEF_ADMINQ_SIZE;
+	hw->aq.arq_buf_size = I40E_ADMINQ_BUFSZ;
+	hw->aq.asq_buf_size = I40E_ADMINQ_BUFSZ;
+
+	rc = i40e_init_adminq(hw);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to initialize firmware admin queue: "
+		    "%d, potential firmware version mismatch", rc);
+		i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE);
+		return (B_FALSE);
+	}
+
+	/*
+	 * A firmware API version that is newer or older than the one we were
+	 * built against only produces a notice; it does not fail attach.
+	 */
+	if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
+	    hw->aq.api_min_ver > I40E_FW_API_VERSION_MINOR) {
+		i40e_notice(i40e, "The driver for the device detected a newer "
+		    "version of the NVM image (%d.%d) than expected (%d.%d).\n"
+		    "Please install the most recent version of the network "
+		    "driver.\n", hw->aq.api_maj_ver, hw->aq.api_min_ver,
+		    I40E_FW_API_VERSION_MAJOR, I40E_FW_API_VERSION_MINOR);
+	} else if (hw->aq.api_maj_ver < I40E_FW_API_VERSION_MAJOR ||
+	    hw->aq.api_min_ver < (I40E_FW_API_VERSION_MINOR - 1)) {
+		i40e_notice(i40e, "The driver for the device detected an older"
+		    " version of the NVM image (%d.%d) than expected (%d.%d)."
+		    "\nPlease update the NVM image.\n",
+		    hw->aq.api_maj_ver, hw->aq.api_min_ver,
+		    I40E_FW_API_VERSION_MAJOR, I40E_FW_API_VERSION_MINOR - 1);
+	}
+
+	i40e_clear_pxe_mode(hw);
+
+	/*
+	 * We need to call this so that the common code can discover
+	 * capabilities of the hardware, which it uses throughout the rest.
+	 */
+	if (!i40e_get_hw_capabilities(i40e, hw)) {
+		i40e_error(i40e, "failed to obtain hardware capabilities");
+		return (B_FALSE);
+	}
+
+	if (i40e_get_available_resources(i40e) == B_FALSE) {
+		i40e_error(i40e, "failed to obtain hardware resources");
+		return (B_FALSE);
+	}
+
+	i40e_hw_to_instance(i40e, hw);
+
+	/* The HMC is sized from the queue pair counts firmware reported. */
+	rc = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp,
+	    hw->func_caps.num_rx_qp, 0, 0);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to initialize hardware memory cache: "
+		    "%d\n", rc);
+		return (B_FALSE);
+	}
+
+	rc = i40e_configure_lan_hmc(hw, I40E_HMC_MODEL_DIRECT_ONLY);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to configure hardware memory cache: "
+		    "%d\n", rc);
+		return (B_FALSE);
+	}
+
+	/* Best effort: the result of stopping LLDP is deliberately ignored. */
+	(void) i40e_aq_stop_lldp(hw, TRUE, NULL);
+
+	rc = i40e_get_mac_addr(hw, hw->mac.addr);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to retrieve hardware mac address: %d",
+		    rc);
+		return (B_FALSE);
+	}
+
+	rc = i40e_validate_mac_addr(hw->mac.addr);
+	if (rc != 0) {
+		i40e_error(i40e, "failed to validate internal mac address: "
+		    "%d\n", rc);
+		return (B_FALSE);
+	}
+	/* Remember the validated address as the permanent address too. */
+	bcopy(hw->mac.addr, hw->mac.perm_addr, ETHERADDRL);
+	if ((rc = i40e_get_port_mac_addr(hw, hw->mac.port_addr)) !=
+	    I40E_SUCCESS) {
+		i40e_error(i40e, "failed to retrieve port mac address: %d",
+		    rc);
+		return (B_FALSE);
+	}
+
+	/*
+	 * We need to obtain the Virtual Station ID (VSI) before we can
+	 * perform other operations on the device.
+	 */
+	i40e->i40e_vsi_id = i40e_get_vsi_id(i40e);
+	if (i40e->i40e_vsi_id == -1) {
+		i40e_error(i40e, "failed to obtain VSI ID");
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Tear down an instance. Each step is gated on the matching bit in
+ * i40e_attach_progress, so this can be used both for a full detach and to
+ * unwind an attach that failed part way through. The admin queue buffer and
+ * the i40e_t itself are freed unconditionally at the end, after which the
+ * driver-private pointer on the devinfo node is cleared; the i40e_t must not
+ * be used by the caller once this returns.
+ */
+static void
+i40e_unconfigure(dev_info_t *devinfo, i40e_t *i40e)
+{
+	int rc;
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_ENABLE_INTR)
+		(void) i40e_disable_interrupts(i40e);
+
+	/* Stop the periodic timer, if one was actually scheduled. */
+	if ((i40e->i40e_attach_progress & I40E_ATTACH_LINK_TIMER) &&
+	    i40e->i40e_periodic_id != 0) {
+		ddi_periodic_delete(i40e->i40e_periodic_id);
+		i40e->i40e_periodic_id = 0;
+	}
+
+	/* A failure to unregister from MAC is logged; teardown continues. */
+	if (i40e->i40e_attach_progress & I40E_ATTACH_MAC) {
+		rc = mac_unregister(i40e->i40e_mac_hdl);
+		if (rc != 0) {
+			i40e_error(i40e, "failed to unregister from mac: %d",
+			    rc);
+		}
+	}
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_STATS) {
+		i40e_stats_fini(i40e);
+	}
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_ADD_INTR)
+		i40e_rem_intr_handlers(i40e);
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_RINGSLOCKS)
+		i40e_free_trqpairs(i40e);
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_INTR)
+		i40e_rem_intrs(i40e);
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_COMMON_CODE)
+		i40e_common_code_fini(i40e);
+
+	/*
+	 * Not gated on a progress bit: i40e_cleanup_resources() checks each
+	 * pointer itself before releasing it.
+	 */
+	i40e_cleanup_resources(i40e);
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_PROPS)
+		(void) ddi_prop_remove_all(devinfo);
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_REGS_MAP &&
+	    i40e->i40e_osdep_space.ios_reg_handle != NULL) {
+		ddi_regs_map_free(&i40e->i40e_osdep_space.ios_reg_handle);
+		i40e->i40e_osdep_space.ios_reg_handle = NULL;
+	}
+
+	if ((i40e->i40e_attach_progress & I40E_ATTACH_PCI_CONFIG) &&
+	    i40e->i40e_osdep_space.ios_cfg_handle != NULL) {
+		pci_config_teardown(&i40e->i40e_osdep_space.ios_cfg_handle);
+		i40e->i40e_osdep_space.ios_cfg_handle = NULL;
+	}
+
+	if (i40e->i40e_attach_progress & I40E_ATTACH_FM_INIT)
+		i40e_fm_fini(i40e);
+
+	/* Nothing below may dereference i40e once it has been freed. */
+	kmem_free(i40e->i40e_aqbuf, I40E_ADMINQ_BUFSZ);
+	kmem_free(i40e, sizeof (i40e_t));
+
+	ddi_set_driver_private(devinfo, NULL);
+}
+
+/*
+ * Last stage of initialization: publish identifying information as devinfo
+ * properties (the printed board assembly string when it can be read, plus the
+ * firmware version, firmware build and admin queue API version), record the
+ * bus information in the i40e_hw_t, and verify that the register access handle
+ * is still healthy.
+ *
+ * A failure to read the PBA string is only logged. Returns B_FALSE if the bus
+ * information cannot be set or the access handle check fails.
+ */
+static boolean_t
+i40e_final_init(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	struct i40e_osdep *osdep = OS_DEP(hw);
+	uint8_t pbanum[I40E_PBANUM_STRLEN];
+	enum i40e_status_code irc;
+	char buf[I40E_DDI_PROP_LEN];
+
+	pbanum[0] = '\0';
+	irc = i40e_read_pba_string(hw, pbanum, sizeof (pbanum));
+	if (irc == I40E_SUCCESS) {
+		(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
+		    "printed-board-assembly", (char *)pbanum);
+	} else {
+		i40e_log(i40e, "failed to read PBA string: %d", irc);
+	}
+
+	/* On DEBUG bits, make sure none of the strings below can truncate. */
+#ifdef	DEBUG
+	ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.fw_maj_ver,
+	    hw->aq.fw_min_ver) < sizeof (buf));
+	ASSERT(snprintf(NULL, 0, "%x", hw->aq.fw_build) < sizeof (buf));
+	ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.api_maj_ver,
+	    hw->aq.api_min_ver) < sizeof (buf));
+#endif
+
+	/* firmware-version: "<major>.<minor>" */
+	(void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.fw_maj_ver,
+	    hw->aq.fw_min_ver);
+	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
+	    "firmware-version", buf);
+
+	/* firmware-build: hexadecimal build number */
+	(void) snprintf(buf, sizeof (buf), "%x", hw->aq.fw_build);
+	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
+	    "firmware-build", buf);
+
+	/* api-version: "<major>.<minor>" of the admin queue API */
+	(void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.api_maj_ver,
+	    hw->aq.api_min_ver);
+	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
+	    "api-version", buf);
+
+	if (!i40e_set_hw_bus_info(hw)) {
+		return (B_FALSE);
+	}
+
+	if (i40e_check_acc_handle(osdep->ios_reg_handle) != DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Read the PCI identification registers (vendor, device, revision and the
+ * subsystem IDs) from configuration space into the i40e_hw_t, then ask the
+ * common code to derive the MAC type from them. Returns B_FALSE if the common
+ * code rejects the device.
+ */
+static boolean_t
+i40e_identify_hardware(i40e_t *i40e)
+{
+	struct i40e_osdep *osdep = &i40e->i40e_osdep_space;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	hw->vendor_id = pci_config_get16(osdep->ios_cfg_handle,
+	    PCI_CONF_VENID);
+	hw->device_id = pci_config_get16(osdep->ios_cfg_handle,
+	    PCI_CONF_DEVID);
+	hw->revision_id = pci_config_get8(osdep->ios_cfg_handle,
+	    PCI_CONF_REVID);
+	hw->subsystem_device_id = pci_config_get16(osdep->ios_cfg_handle,
+	    PCI_CONF_SUBSYSID);
+	hw->subsystem_vendor_id = pci_config_get16(osdep->ios_cfg_handle,
+	    PCI_CONF_SUBVENID);
+
+	/*
+	 * The hardware's bus information is filled in later, by
+	 * i40e_get_available_resources(). The common code doesn't seem to
+	 * require it to be set in any particular way; it appears to be mostly
+	 * for book-keeping.
+	 */
+
+	/* Let the common code set the MAC type for this adapter. */
+	if (i40e_set_mac_type(hw) == I40E_SUCCESS)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Map the adapter's register set. The size of the register set is looked up
+ * first and the whole of it is mapped; the mapped address lands in the
+ * i40e_hw_t while the access handle and size are kept in the osdep structure.
+ */
+static boolean_t
+i40e_regs_map(i40e_t *i40e)
+{
+	struct i40e_osdep *osdep = &i40e->i40e_osdep_space;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	dev_info_t *devinfo = i40e->i40e_dip;
+	off_t memsize;
+	int ret;
+
+	ret = ddi_dev_regsize(devinfo, I40E_ADAPTER_REGSET, &memsize);
+	if (ret != DDI_SUCCESS) {
+		i40e_error(i40e, "Used invalid register set to map PCIe regs");
+		return (B_FALSE);
+	}
+
+	ret = ddi_regs_map_setup(devinfo, I40E_ADAPTER_REGSET,
+	    (caddr_t *)&hw->hw_addr, 0, memsize, &i40e_regs_acc_attr,
+	    &osdep->ios_reg_handle);
+	if (ret != DDI_SUCCESS) {
+		i40e_error(i40e, "failed to map device registers: %d", ret);
+		return (B_FALSE);
+	}
+
+	osdep->ios_reg_size = memsize;
+	return (B_TRUE);
+}
+
+/*
+ * Recompute the values derived from the configured SDU. The maximum frame
+ * size is the SDU plus a VLAN-tagged Ethernet header and the FCS. The receive
+ * and transmit DMA buffer sizes are that frame size (plus the IP header
+ * alignment slop on the receive side) rounded up to the next multiple of 1K.
+ */
+void
+i40e_update_mtu(i40e_t *i40e)
+{
+	const uint32_t kmask = ((uint32_t)1 << 10) - 1;
+	uint32_t rx, tx, rx_kb, tx_kb;
+
+	i40e->i40e_frame_max = i40e->i40e_sdu +
+	    sizeof (struct ether_vlan_header) + ETHERFCSL;
+
+	/* Receive: whole kilobytes, plus one more if there is a remainder. */
+	rx = i40e->i40e_frame_max + I40E_BUF_IPHDR_ALIGNMENT;
+	rx_kb = rx >> 10;
+	if ((rx & kmask) > 0)
+		rx_kb += 1;
+	i40e->i40e_rx_buf_size = rx_kb << 10;
+
+	/* Transmit: same rounding, without the alignment slop. */
+	tx = i40e->i40e_frame_max;
+	tx_kb = tx >> 10;
+	if ((tx & kmask) > 0)
+		tx_kb += 1;
+	i40e->i40e_tx_buf_size = tx_kb << 10;
+}
+
+/*
+ * Look up an integer property on our own devinfo node (parents are not
+ * consulted) and clamp it into [min, max]. 'def' is used when the property is
+ * absent and is subject to the same clamping. The upper bound is applied
+ * before the lower bound.
+ */
+static int
+i40e_get_prop(i40e_t *i40e, char *prop, int min, int max, int def)
+{
+	int val = ddi_prop_get_int(DDI_DEV_T_ANY, i40e->i40e_dip,
+	    DDI_PROP_DONTPASS, prop, def);
+
+	if (val > max) {
+		val = max;
+	}
+
+	return ((val < min) ? min : val);
+}
+
+/*
+ * Load the driver's tunables from the devinfo node, clamping each to its
+ * supported range, and then derive the MTU-dependent values. Ring sizes are
+ * additionally rounded up to a multiple of I40E_DESC_ALIGN. When multiple
+ * rings are disabled, the queue pair and group counts are forced to their
+ * non-MSI-X values.
+ */
+static void
+i40e_init_properties(i40e_t *i40e)
+{
+	/* General settings. */
+	i40e->i40e_sdu = i40e_get_prop(i40e, "default_mtu",
+	    I40E_MIN_MTU, I40E_MAX_MTU, I40E_DEF_MTU);
+	i40e->i40e_intr_force = i40e_get_prop(i40e, "intr_force",
+	    I40E_INTR_NONE, I40E_INTR_LEGACY, I40E_INTR_NONE);
+	i40e->i40e_mr_enable = i40e_get_prop(i40e, "mr_enable",
+	    B_FALSE, B_TRUE, B_TRUE);
+
+	/*
+	 * Transmit side. The reschedule threshold's upper bound depends on
+	 * the final ring size, so the ring size must be settled first.
+	 */
+	i40e->i40e_tx_ring_size = i40e_get_prop(i40e, "tx_ring_size",
+	    I40E_MIN_TX_RING_SIZE, I40E_MAX_TX_RING_SIZE,
+	    I40E_DEF_TX_RING_SIZE);
+	if ((i40e->i40e_tx_ring_size % I40E_DESC_ALIGN) != 0) {
+		i40e->i40e_tx_ring_size = P2ROUNDUP(i40e->i40e_tx_ring_size,
+		    I40E_DESC_ALIGN);
+	}
+	i40e->i40e_tx_block_thresh = i40e_get_prop(i40e, "tx_resched_threshold",
+	    I40E_MIN_TX_BLOCK_THRESH,
+	    i40e->i40e_tx_ring_size - I40E_TX_MAX_COOKIE,
+	    I40E_DEF_TX_BLOCK_THRESH);
+	i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable",
+	    B_FALSE, B_TRUE, B_TRUE);
+	i40e->i40e_tx_dma_min = i40e_get_prop(i40e, "tx_dma_threshold",
+	    I40E_MIN_TX_DMA_THRESH, I40E_MAX_TX_DMA_THRESH,
+	    I40E_DEF_TX_DMA_THRESH);
+
+	/* Receive side. */
+	i40e->i40e_rx_ring_size = i40e_get_prop(i40e, "rx_ring_size",
+	    I40E_MIN_RX_RING_SIZE, I40E_MAX_RX_RING_SIZE,
+	    I40E_DEF_RX_RING_SIZE);
+	if ((i40e->i40e_rx_ring_size % I40E_DESC_ALIGN) != 0) {
+		i40e->i40e_rx_ring_size = P2ROUNDUP(i40e->i40e_rx_ring_size,
+		    I40E_DESC_ALIGN);
+	}
+	i40e->i40e_rx_limit_per_intr = i40e_get_prop(i40e, "rx_limit_per_intr",
+	    I40E_MIN_RX_LIMIT_PER_INTR,	I40E_MAX_RX_LIMIT_PER_INTR,
+	    I40E_DEF_RX_LIMIT_PER_INTR);
+	i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
+	    B_FALSE, B_TRUE, B_TRUE);
+	i40e->i40e_rx_dma_min = i40e_get_prop(i40e, "rx_dma_threshold",
+	    I40E_MIN_RX_DMA_THRESH, I40E_MAX_RX_DMA_THRESH,
+	    I40E_DEF_RX_DMA_THRESH);
+
+	/* Interrupt throttling rates. */
+	i40e->i40e_tx_itr = i40e_get_prop(i40e, "tx_intr_throttle",
+	    I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_TX_ITR);
+	i40e->i40e_rx_itr = i40e_get_prop(i40e, "rx_intr_throttle",
+	    I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_RX_ITR);
+	i40e->i40e_other_itr = i40e_get_prop(i40e, "other_intr_throttle",
+	    I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_OTHER_ITR);
+
+	if (!i40e->i40e_mr_enable) {
+		i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
+		i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
+	}
+
+	/* Frame and buffer sizes follow from the SDU read above. */
+	i40e_update_mtu(i40e);
+}
+
+/*
+ * There are a few constraints on interrupts that we're currently imposing, some
+ * of which are restrictions from hardware. For a fuller treatment, see
+ * i40e_intr.c.
+ *
+ * Currently, to use MSI-X we require two interrupts be available though in
+ * theory we should participate in IRM and happily use more interrupts.
+ *
+ * Hardware only supports a single MSI being programmed and therefore if we
+ * don't have MSI-X interrupts available at this time, then we ratchet down the
+ * number of rings and groups available. Obviously, we only bother with a single
+ * fixed interrupt.
+ *
+ * On success the handles, count, priority, capabilities and type are recorded
+ * in the i40e_t and B_TRUE is returned. On failure B_FALSE is returned; once
+ * the handle array has been allocated, failures are unwound through
+ * i40e_rem_intrs().
+ */
+static boolean_t
+i40e_alloc_intr_handles(i40e_t *i40e, dev_info_t *devinfo, int intr_type)
+{
+	int request, actual, rc, min_intrs;
+	/*
+	 * Start from a known value: the failure messages below print the
+	 * count even when the DDI call that should have set it failed.
+	 */
+	int count = 0;
+
+	switch (intr_type) {
+	case DDI_INTR_TYPE_FIXED:
+	case DDI_INTR_TYPE_MSI:
+		request = 1;
+		min_intrs = 1;
+		break;
+	case DDI_INTR_TYPE_MSIX:
+		/*
+		 * At the moment, we always request two MSI-X while we still
+		 * only support a single interrupt. The upper bound on what's
+		 * supported by a given device is defined by MSI_X_PF_N in
+		 * GLPCI_CNF2. When we evolve, we should read it to determine
+		 * what the real max is.
+		 */
+		ASSERT(i40e->i40e_num_trqpairs == 1);
+		request = 2;
+		min_intrs = 2;
+		break;
+	default:
+		panic("bad interrupt type passed to i40e_alloc_intr_handles: "
+		    "%d", intr_type);
+		return (B_FALSE);
+	}
+
+	rc = ddi_intr_get_nintrs(devinfo, intr_type, &count);
+	if (rc != DDI_SUCCESS || count < min_intrs) {
+		i40e_log(i40e, "Get interrupt number failed, "
+		    "returned %d, count %d\n", rc, count);
+		return (B_FALSE);
+	}
+
+	rc = ddi_intr_get_navail(devinfo, intr_type, &count);
+	if (rc != DDI_SUCCESS || count < min_intrs) {
+		i40e_log(i40e, "Get AVAILABLE interrupt number failed, "
+		    "returned %d, count %d\n", rc, count);
+		return (B_FALSE);
+	}
+
+	actual = 0;
+	i40e->i40e_intr_count = 0;
+	i40e->i40e_intr_count_max = 0;
+	i40e->i40e_intr_count_min = 0;
+
+	i40e->i40e_intr_size = request * sizeof (ddi_intr_handle_t);
+	ASSERT(i40e->i40e_intr_size != 0);
+	i40e->i40e_intr_handles = kmem_alloc(i40e->i40e_intr_size, KM_SLEEP);
+
+	/* Never ask for more vectors than are currently available. */
+	rc = ddi_intr_alloc(devinfo, i40e->i40e_intr_handles, intr_type, 0,
+	    min(request, count), &actual, DDI_INTR_ALLOC_NORMAL);
+	if (rc != DDI_SUCCESS) {
+		i40e_log(i40e, "Interrupt allocation failed with %d.", rc);
+		goto alloc_handle_fail;
+	}
+
+	i40e->i40e_intr_count = actual;
+	i40e->i40e_intr_count_max = request;
+	i40e->i40e_intr_count_min = min_intrs;
+
+	if (actual < min_intrs) {
+		i40e_log(i40e, "actual (%d) is less than minimum (%d).",
+		    actual, min_intrs);
+		goto alloc_handle_fail;
+	}
+
+	/*
+	 * Record the priority and capabilities for our first vector.  Once
+	 * we have it, that's our priority until detach time.  Even if we
+	 * eventually participate in IRM, our priority shouldn't change.
+	 */
+	rc = ddi_intr_get_pri(i40e->i40e_intr_handles[0], &i40e->i40e_intr_pri);
+	if (rc != DDI_SUCCESS) {
+		i40e_log(i40e,
+		    "Getting interrupt priority failed with %d.", rc);
+		goto alloc_handle_fail;
+	}
+
+	rc = ddi_intr_get_cap(i40e->i40e_intr_handles[0], &i40e->i40e_intr_cap);
+	if (rc != DDI_SUCCESS) {
+		i40e_log(i40e,
+		    "Getting interrupt capabilities failed with %d.", rc);
+		goto alloc_handle_fail;
+	}
+
+	i40e->i40e_intr_type = intr_type;
+	return (B_TRUE);
+
+alloc_handle_fail:
+
+	i40e_rem_intrs(i40e);
+	return (B_FALSE);
+}
+
+/*
+ * Allocate interrupts, preferring MSI-X, then MSI, then fixed. A type is only
+ * attempted if the platform supports it and, for MSI-X and MSI, the
+ * intr_force setting does not rule it out. Before falling back from MSI-X the
+ * queue pair and group counts are dropped to their non-MSI-X values. Returns
+ * B_FALSE if no type could be allocated.
+ */
+static boolean_t
+i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo)
+{
+	int supported, rc;
+
+	rc = ddi_intr_get_supported_types(devinfo, &supported);
+	if (rc != DDI_SUCCESS) {
+		i40e_error(i40e, "failed to get supported interrupt types: %d",
+		    rc);
+		return (B_FALSE);
+	}
+
+	i40e->i40e_intr_type = 0;
+
+	if ((supported & DDI_INTR_TYPE_MSIX) &&
+	    i40e->i40e_intr_force <= I40E_INTR_MSIX &&
+	    i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX)) {
+		return (B_TRUE);
+	}
+
+	/*
+	 * Multiple transmit/receive pairs are only used with MSI-X, because
+	 * the device basically only supports a single MSI interrupt.
+	 */
+	i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
+	i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
+
+	if ((supported & DDI_INTR_TYPE_MSI) &&
+	    (i40e->i40e_intr_force <= I40E_INTR_MSI) &&
+	    i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSI)) {
+		return (B_TRUE);
+	}
+
+	if ((supported & DDI_INTR_TYPE_FIXED) &&
+	    i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_FIXED)) {
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Map different interrupts to MSI-X vectors. This is a no-op for the other
+ * interrupt types. Always returns B_TRUE.
+ */
+static boolean_t
+i40e_map_intrs_to_vectors(i40e_t *i40e)
+{
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		/*
+		 * At the moment, we only have one queue and one interrupt thus
+		 * both are on that one interrupt. However, longer term we need
+		 * to go back to using the ixgbe style map of queues to vectors
+		 * or walk the linked list from the device to know what to go
+		 * handle. Therefore, for the moment, we simply map our single
+		 * set of rings to the one I/O interrupt that exists for MSI-X,
+		 * which is vector 1.
+		 */
+		ASSERT(i40e->i40e_intr_count == 2);
+		ASSERT(i40e->i40e_num_trqpairs == 1);
+
+		i40e->i40e_trqpairs[0].itrq_rx_intrvec = 1;
+		i40e->i40e_trqpairs[0].itrq_tx_intrvec = 1;
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Register the interrupt handler(s) that match the allocated interrupt type.
+ * For MSI-X every vector gets i40e_intr_msix with its vector number as the
+ * second argument; if one registration fails, the handlers already added are
+ * removed again. MSI and fixed interrupts each register a single handler on
+ * the first handle. An unknown interrupt type is a programming error and
+ * panics.
+ */
+static boolean_t
+i40e_add_intr_handlers(i40e_t *i40e)
+{
+	int rc, vector;
+
+	switch (i40e->i40e_intr_type) {
+	case DDI_INTR_TYPE_MSIX:
+		for (vector = 0; vector < i40e->i40e_intr_count; vector++) {
+			rc = ddi_intr_add_handler(
+			    i40e->i40e_intr_handles[vector],
+			    (ddi_intr_handler_t *)i40e_intr_msix, i40e,
+			    (void *)(uintptr_t)vector);
+			if (rc == DDI_SUCCESS)
+				continue;
+
+			i40e_log(i40e, "Add interrupt handler (MSI-X) "
+			    "failed: return %d, vector %d", rc, vector);
+
+			/* Back out the handlers added before this one. */
+			while (--vector >= 0) {
+				(void) ddi_intr_remove_handler(
+				    i40e->i40e_intr_handles[vector]);
+			}
+			return (B_FALSE);
+		}
+		break;
+
+	case DDI_INTR_TYPE_MSI:
+		rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0],
+		    (ddi_intr_handler_t *)i40e_intr_msi, i40e, NULL);
+		if (rc != DDI_SUCCESS) {
+			i40e_log(i40e, "Add interrupt handler (MSI) failed: "
+			    "return %d", rc);
+			return (B_FALSE);
+		}
+		break;
+
+	case DDI_INTR_TYPE_FIXED:
+		rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0],
+		    (ddi_intr_handler_t *)i40e_intr_legacy, i40e, NULL);
+		if (rc != DDI_SUCCESS) {
+			i40e_log(i40e, "Add interrupt handler (legacy) failed:"
+			    " return %d", rc);
+			return (B_FALSE);
+		}
+		break;
+
+	default:
+		/* Cast to pacify lint */
+		panic("i40e_intr_type %p contains an unknown type: %d",
+		    (void *)i40e, i40e->i40e_intr_type);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Periodic callback. For now all it does is re-check the link state while
+ * holding the general lock. Longer term, this is the place to think about:
+ *
+ * o Stall Detection
+ * o Temperature sensor detection
+ * o Device resetting
+ * o Statistics updating to avoid wraparound
+ */
+static void
+i40e_timer(void *arg)
+{
+	i40e_t *i40e = (i40e_t *)arg;
+	kmutex_t *lock = &i40e->i40e_general_lock;
+
+	mutex_enter(lock);
+	i40e_link_check(i40e);
+	mutex_exit(lock);
+}
+
+/*
+ * Get the hardware state, and scribble away anything that needs scribbling:
+ * refresh the link information, read the PHY capabilities into the i40e_t,
+ * and clear the PHY interrupt mask. The caller must hold the general lock.
+ * Every failure here is logged and otherwise ignored; nothing is reported back
+ * to the caller.
+ */
+static void
+i40e_get_hw_state(i40e_t *i40e, i40e_hw_t *hw)
+{
+	int rc;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	(void) i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
+	i40e_link_check(i40e);
+
+	/*
+	 * Try and determine our PHY. An unknown PHY on the first attempt gets
+	 * one retry after a short delay, which may be needed to detect fiber
+	 * correctly.
+	 */
+	rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE, &i40e->i40e_phy,
+	    NULL);
+	if (rc == I40E_ERR_UNKNOWN_PHY) {
+		i40e_msec_delay(200);
+		rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE,
+		    &i40e->i40e_phy, NULL);
+	}
+
+	if (rc != I40E_SUCCESS) {
+		if (rc != I40E_ERR_UNKNOWN_PHY) {
+			i40e_error(i40e, "error getting physical capabilities: "
+			    "%d, %d", rc, hw->aq.asq_last_status);
+		} else {
+			i40e_error(i40e, "encountered unknown PHY type, "
+			    "not attaching.");
+		}
+	}
+
+	rc = i40e_update_link_info(hw);
+	if (rc != I40E_SUCCESS)
+		i40e_error(i40e, "failed to update link information: %d", rc);
+
+	/*
+	 * In general, we don't want to mask off (as in stop from being a cause)
+	 * any of the interrupts that the phy might be able to generate, so the
+	 * mask is set to zero.
+	 */
+	rc = i40e_aq_set_phy_int_mask(hw, 0, NULL);
+	if (rc != I40E_SUCCESS)
+		i40e_error(i40e, "failed to update phy link mask: %d\n", rc);
+}
+
+/*
+ * Go through and re-initialize any existing filters that we may have set up for
+ * this device. Note that we would only expect them to exist if hardware had
+ * already been initialized and we had just reset it. While we're not
+ * implementing this yet, we're keeping this around for when we add reset
+ * capabilities, so this isn't forgotten.
+ *
+ * Both arguments are currently unused.
+ */
+/* ARGSUSED */
+static void
+i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw)
+{
+	/* Intentionally empty: placeholder until reset support is added. */
+}
+
+/*
+ * Configure the hardware for the Virtual Station Interface (VSI).  Currently
+ * we only support one, but in the future we could instantiate more than one
+ * per attach-point.
+ *
+ * The current VSI parameters are fetched from firmware, the queue mapping and
+ * VLAN sections are overridden, the VSI statistics are set up using the
+ * counter index firmware reported, and the result is written back. Returns
+ * B_FALSE if any of those steps fails.
+ */
+static boolean_t
+i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw)
+{
+	struct i40e_vsi_context	ctx;
+	int err;
+
+	bzero(&ctx, sizeof (ctx));
+	ctx.seid = i40e->i40e_vsi_id;
+	ctx.pf_num = hw->pf_id;
+
+	err = i40e_aq_get_vsi_params(hw, &ctx, NULL);
+	if (err != I40E_SUCCESS) {
+		i40e_error(i40e, "get VSI params failed with %d\n", err);
+		return (B_FALSE);
+	}
+
+	/* We are updating both the queue map and the VLAN sections. */
+	ctx.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID |
+	    I40E_AQ_VSI_PROP_VLAN_VALID;
+
+	/*
+	 * Set the queue and traffic class bits.  Keep it simple for now.
+	 */
+	ctx.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG;
+	ctx.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES;
+	ctx.info.tc_mapping[0] = I40E_TRAFFIC_CLASS_NO_QUEUES;
+
+	ctx.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL |
+	    I40E_AQ_VSI_PVLAN_EMOD_NOTHING;
+
+	ctx.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF);
+
+	i40e->i40e_vsi_stat_id = LE16_TO_CPU(ctx.info.stat_counter_idx);
+	if (i40e_stat_vsi_init(i40e) == B_FALSE) {
+		return (B_FALSE);
+	}
+
+	err = i40e_aq_update_vsi_params(hw, &ctx, NULL);
+	if (err != I40E_SUCCESS) {
+		i40e_error(i40e, "Update VSI params failed with %d", err);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Wrapper to kick the chipset on. On firmware older than 4.33 the link is
+ * restarted first. After that we read the hardware state, initialize MAC
+ * addresses, program the filter control, initialize chip-level interrupt
+ * state and configure the VSI. Returns B_FALSE if the link restart, the
+ * filter control or the VSI configuration fails.
+ */
+static boolean_t
+i40e_chip_start(i40e_t *i40e)
+{
+	struct i40e_filter_control_settings fcs;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	int ret;
+
+	/* Firmware older than 4.33: pause briefly, then restart the link. */
+	if ((hw->aq.fw_maj_ver < 4) ||
+	    ((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33))) {
+		i40e_msec_delay(75);
+		ret = i40e_aq_set_link_restart_an(hw, TRUE, NULL);
+		if (ret != I40E_SUCCESS) {
+			i40e_error(i40e, "failed to restart link: admin queue "
+			    "error: %d\n", hw->aq.asq_last_status);
+			return (B_FALSE);
+		}
+	}
+
+	/* Determine hardware state, then initialize mac addresses. */
+	i40e_get_hw_state(i40e, hw);
+	i40e_init_macaddrs(i40e, hw);
+
+	/* Set up the filter control. */
+	bzero(&fcs, sizeof (fcs));
+	fcs.enable_ethtype = TRUE;
+	fcs.enable_macvlan = TRUE;
+	ret = i40e_set_filter_control(hw, &fcs);
+	if (ret != I40E_SUCCESS) {
+		i40e_error(i40e, "i40e_set_filter_control() returned %d", ret);
+		return (B_FALSE);
+	}
+
+	i40e_intr_chip_init(i40e);
+
+	if (i40e_config_vsi(i40e, hw)) {
+		i40e_flush(hw);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Ask hardware to disable each enabled rx queue. See 8.3.3.1.2 for more.
+ */
+static void
+i40e_shutdown_rx_rings(i40e_t *i40e)
+{
+	int i;
+	uint32_t reg;
+
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	/*
+	 * Step 1. The interrupt linked list (see i40e_intr.c for more
+	 * information) should have already been cleared before calling this
+	 * function. DEBUG kernels verify that each list head reads as EOL.
+	 */
+#ifdef	DEBUG
+	if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) {
+		for (i = 1; i < i40e->i40e_intr_count; i++) {
+			reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(i - 1));
+			VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL);
+		}
+	} else {
+		reg = I40E_READ_REG(hw, I40E_PFINT_LNKLST0);
+		VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL);
+	}
+
+#endif	/* DEBUG */
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		/*
+		 * Step 1. Request that the queue be disabled by clearing
+		 * QENA_REQ. It may not be set if we are unwinding from a
+		 * failure that left the ring set only partially enabled.
+		 */
+		reg = I40E_READ_REG(hw, I40E_QRX_ENA(i));
+		if (!(reg & I40E_QRX_ENA_QENA_REQ_MASK))
+			continue;
+		VERIFY((reg & I40E_QRX_ENA_QENA_REQ_MASK) ==
+		    I40E_QRX_ENA_QENA_REQ_MASK);
+		reg &= ~I40E_QRX_ENA_QENA_REQ_MASK;
+		I40E_WRITE_REG(hw, I40E_QRX_ENA(i), reg);
+	}
+
+	/*
+	 * Step 2. Wait for the disable to take, by having QENA_STAT in the FPM
+	 * be cleared. Note that we could still receive data in the queue during
+	 * this time. We don't actually wait for this now and instead defer it
+	 * to i40e_shutdown_rings_wait(), after the TX queues have been disabled
+	 * too. NOTE(review): two comments above are both labelled "Step 1".
+	 */
+}
+
+/*
+ * Start quiescing every tx queue: flag each queue as about to be disabled,
+ * pause, and then clear its enable request. This does not wait for the
+ * hardware to finish; see i40e_shutdown_rings_wait().
+ */
+static void
+i40e_shutdown_tx_rings(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	uint32_t val;
+	int q;
+
+	/*
+	 * Step 1. The interrupt linked list should already have been cleared.
+	 */
+#ifdef DEBUG
+	if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) {
+		val = I40E_READ_REG(hw, I40E_PFINT_LNKLST0);
+		VERIFY3U(val, ==, I40E_QUEUE_TYPE_EOL);
+	} else {
+		for (q = 1; q < i40e->i40e_intr_count; q++) {
+			val = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(q - 1));
+			VERIFY3U(val, ==, I40E_QUEUE_TYPE_EOL);
+		}
+	}
+#endif	/* DEBUG */
+
+	/*
+	 * Step 2. Set the SET_QDIS flag for every queue.
+	 */
+	for (q = 0; q < i40e->i40e_num_trqpairs; q++)
+		i40e_pre_tx_queue_cfg(hw, q, B_FALSE);
+
+	/* Step 3. Wait at least 400 usec (can be done once for all queues). */
+	drv_usecwait(500);
+
+	/*
+	 * Step 4. Clear the QENA_REQ flag which tells hardware to quiesce. If
+	 * QENA_REQ is not already set then that means that we likely already
+	 * tried to disable this queue.
+	 */
+	for (q = 0; q < i40e->i40e_num_trqpairs; q++) {
+		val = I40E_READ_REG(hw, I40E_QTX_ENA(q));
+		if ((val & I40E_QTX_ENA_QENA_REQ_MASK) != 0) {
+			val &= ~I40E_QTX_ENA_QENA_REQ_MASK;
+			I40E_WRITE_REG(hw, I40E_QTX_ENA(q), val);
+		}
+	}
+
+	/*
+	 * Step 5. Wait for all drains to finish. This will be done by the
+	 * hardware removing the QENA_STAT flag from the queue. Rather than
+	 * waiting here, we interleave it with all the others in
+	 * i40e_shutdown_rings_wait().
+	 */
+}
+
+/*
+ * Wait for every rx and tx queue to report itself disabled, i.e. Steps 2 and 5
+ * of the functions above. Gives up at the first queue that times out.
+ */
+static boolean_t
+i40e_shutdown_rings_wait(i40e_t *i40e)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	int q;
+
+	for (q = 0; q < i40e->i40e_num_trqpairs; q++) {
+		uint32_t stat;
+		int left;
+
+		/* Poll until QENA_STAT is clear: rx queue first, then tx. */
+		for (left = I40E_RING_WAIT_NTRIES; left > 0; left--) {
+			stat = I40E_READ_REG(hw, I40E_QRX_ENA(q));
+			if (!(stat & I40E_QRX_ENA_QENA_STAT_MASK))
+				break;
+			i40e_msec_delay(I40E_RING_WAIT_PAUSE);
+		}
+		if (stat & I40E_QRX_ENA_QENA_STAT_MASK) {
+			i40e_error(i40e, "timed out disabling rx queue %d\n",
+			    q);
+			return (B_FALSE);
+		}
+
+		for (left = I40E_RING_WAIT_NTRIES; left > 0; left--) {
+			stat = I40E_READ_REG(hw, I40E_QTX_ENA(q));
+			if (!(stat & I40E_QTX_ENA_QENA_STAT_MASK))
+				break;
+			i40e_msec_delay(I40E_RING_WAIT_PAUSE);
+		}
+		if (stat & I40E_QTX_ENA_QENA_STAT_MASK) {
+			i40e_error(i40e, "timed out disabling tx queue %d\n",
+			    q);
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+static boolean_t
+i40e_shutdown_rings(i40e_t *i40e)
+{
+	i40e_shutdown_rx_rings(i40e);	/* request rx queue disable */
+	i40e_shutdown_tx_rings(i40e);	/* request tx queue disable */
+	return (i40e_shutdown_rings_wait(i40e));	/* B_FALSE: timeout */
+}
+
+/*
+ * Point each rx descriptor at its work-list control block's DMA address.
+ */
+static void
+i40e_setup_rx_descs(i40e_trqpair_t *itrq)
+{
+	i40e_rx_data_t *rxd = itrq->itrq_rxdata;
+	int slot;
+
+	for (slot = 0; slot < rxd->rxd_ring_size; slot++) {
+		i40e_rx_control_block_t *rcb = rxd->rxd_work_list[slot];
+		i40e_rx_desc_t *desc = &rxd->rxd_desc_ring[slot];
+
+		desc->read.pkt_addr =
+		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
+		desc->read.hdr_addr = 0;
+	}
+}
+
+/*
+ * Fill in the rx queue context for this ring and install it: any existing
+ * context for the queue is cleared before the new one is set.
+ */
+static boolean_t
+i40e_setup_rx_hmc(i40e_trqpair_t *itrq)
+{
+	i40e_t *i40e = itrq->itrq_i40e;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	i40e_rx_data_t *rxd = itrq->itrq_rxdata;
+	struct i40e_hmc_obj_rxq ctx;
+	int rc;
+
+	bzero(&ctx, sizeof (ctx));
+	ctx.base = rxd->rxd_desc_area.dmab_dma_address / I40E_HMC_RX_CTX_UNIT;
+	ctx.qlen = rxd->rxd_ring_size;
+	VERIFY(i40e->i40e_rx_buf_size >= I40E_HMC_RX_DBUFF_MIN);
+	VERIFY(i40e->i40e_rx_buf_size <= I40E_HMC_RX_DBUFF_MAX);
+	ctx.dbuff = i40e->i40e_rx_buf_size >> I40E_RXQ_CTX_DBUFF_SHIFT;
+	ctx.hbuff = 0 >> I40E_RXQ_CTX_HBUFF_SHIFT;
+	ctx.dtype = I40E_HMC_RX_DTYPE_NOSPLIT;
+	ctx.dsize = I40E_HMC_RX_DSIZE_32BYTE;
+	ctx.crcstrip = I40E_HMC_RX_CRCSTRIP_ENABLE;
+	ctx.fc_ena = I40E_HMC_RX_FC_DISABLE;
+	ctx.l2tsel = I40E_HMC_RX_L2TAGORDER;
+	ctx.hsplit_0 = I40E_HMC_RX_HDRSPLIT_DISABLE;
+	ctx.hsplit_1 = I40E_HMC_RX_HDRSPLIT_DISABLE;
+	ctx.showiv = I40E_HMC_RX_INVLAN_DONTSTRIP;
+	ctx.rxmax = i40e->i40e_frame_max;
+	ctx.tphrdesc_ena = I40E_HMC_RX_TPH_DISABLE;
+	ctx.tphwdesc_ena = I40E_HMC_RX_TPH_DISABLE;
+	ctx.tphdata_ena = I40E_HMC_RX_TPH_DISABLE;
+	ctx.tphhead_ena = I40E_HMC_RX_TPH_DISABLE;
+	ctx.lrxqthresh = I40E_HMC_RX_LOWRXQ_NOINTR;
+
+	/* This must be set to 0x1, see Table 8-12 in section 8.3.3.2.2. */
+	ctx.prefena = I40E_HMC_RX_PREFENA;
+
+	rc = i40e_clear_lan_rx_queue_context(hw, itrq->itrq_index);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to clear rx queue %d context: %d\n",
+		    itrq->itrq_index, rc);
+		return (B_FALSE);
+	}
+
+	rc = i40e_set_lan_rx_queue_context(hw, itrq->itrq_index, &ctx);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to set rx queue %d context: %d\n",
+		    itrq->itrq_index, rc);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Program the descriptor rings and the device for rx, then enable every rx
+ * queue and confirm that it came up. See 8.3.3.1.1 for the full list of steps.
+ * Returns B_FALSE if a queue context can't be set or an enable times out.
+ */
+static boolean_t
+i40e_setup_rx_rings(i40e_t *i40e)
+{
+	int i;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
+		i40e_rx_data_t *rxd = itrq->itrq_rxdata;
+		uint32_t reg;
+
+		/*
+		 * Step 1. Program all receive ring descriptors.
+		 */
+		i40e_setup_rx_descs(itrq);
+
+		/*
+		 * Step 2. Program the queue's FPM/HMC context.
+		 */
+		if (i40e_setup_rx_hmc(itrq) == B_FALSE)
+			return (B_FALSE);
+
+		/*
+		 * Step 3. Clear the queue's tail pointer and set it to the end
+		 * of the space.
+		 */
+		I40E_WRITE_REG(hw, I40E_QRX_TAIL(i), 0);
+		I40E_WRITE_REG(hw, I40E_QRX_TAIL(i), rxd->rxd_ring_size - 1);
+
+		/*
+		 * Step 4. Enable the queue via the QENA_REQ.
+		 */
+		reg = I40E_READ_REG(hw, I40E_QRX_ENA(i));
+		VERIFY0(reg & (I40E_QRX_ENA_QENA_REQ_MASK |
+		    I40E_QRX_ENA_QENA_STAT_MASK));
+		reg |= I40E_QRX_ENA_QENA_REQ_MASK;
+		I40E_WRITE_REG(hw, I40E_QRX_ENA(i), reg);
+	}
+
+	/*
+	 * Note, we wait for every queue to be enabled before we start checking.
+	 * This will hopefully cause most queues to be enabled at this point.
+	 */
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		uint32_t j, reg;
+
+		/*
+		 * Step 5. Verify that QENA_STAT has been set. It's promised
+		 * that this should occur within about 10 us, but like other
+		 * systems, we give the card a bit more time.
+		 */
+		for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) {
+			reg = I40E_READ_REG(hw, I40E_QRX_ENA(i));
+
+			if (reg & I40E_QRX_ENA_QENA_STAT_MASK)
+				break;
+			i40e_msec_delay(I40E_RING_WAIT_PAUSE);
+		}
+
+		if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0) {
+			i40e_error(i40e, "failed to enable rx queue %d, timed "
+			    "out.", i);
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Build this ring's tx queue context and install it in the hardware.
+ */
+static boolean_t
+i40e_setup_tx_hmc(i40e_trqpair_t *itrq)
+{
+	i40e_t *i40e = itrq->itrq_i40e;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	struct i40e_vsi_context	vsi_ctx;
+	struct i40e_hmc_obj_txq ctx;
+	int rc;
+
+	bzero(&ctx, sizeof (ctx));
+	ctx.new_context = I40E_HMC_TX_NEW_CONTEXT;
+	ctx.base = itrq->itrq_desc_area.dmab_dma_address /
+	    I40E_HMC_TX_CTX_UNIT;
+	ctx.fc_ena = I40E_HMC_TX_FC_DISABLE;
+	ctx.timesync_ena = I40E_HMC_TX_TS_DISABLE;
+	ctx.fd_ena = I40E_HMC_TX_FD_DISABLE;
+	ctx.alt_vlan_ena = I40E_HMC_TX_ALT_VLAN_DISABLE;
+	ctx.head_wb_ena = I40E_HMC_TX_WB_ENABLE;
+	ctx.qlen = itrq->itrq_tx_ring_size;
+	ctx.tphrdesc_ena = I40E_HMC_TX_TPH_DISABLE;
+	ctx.tphrpacket_ena = I40E_HMC_TX_TPH_DISABLE;
+	ctx.tphwdesc_ena = I40E_HMC_TX_TPH_DISABLE;
+	ctx.head_wb_addr = itrq->itrq_desc_area.dmab_dma_address +
+	    sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
+
+	/*
+	 * crc and rdylist_act aren't really documented, but zeroing them seems
+	 * to be expected, so we do so explicitly. We should ask Intel why.
+	 */
+	ctx.crc = 0;
+	ctx.rdylist_act = 0;
+
+	/*
+	 * rdylist is supposed to hold the traffic class index for the first
+	 * device, so query the VSI parameters again to get the handle. Every
+	 * queue is assigned to traffic class zero; we don't use the others.
+	 */
+	bzero(&vsi_ctx, sizeof (vsi_ctx));
+	vsi_ctx.seid = i40e->i40e_vsi_id;
+	vsi_ctx.pf_num = hw->pf_id;
+	rc = i40e_aq_get_vsi_params(hw, &vsi_ctx, NULL);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "get VSI params failed with %d\n", rc);
+		return (B_FALSE);
+	}
+	ctx.rdylist = LE_16(vsi_ctx.info.qs_handle[0]);
+
+	rc = i40e_clear_lan_tx_queue_context(hw, itrq->itrq_index);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to clear tx queue %d context: %d\n",
+		    itrq->itrq_index, rc);
+		return (B_FALSE);
+	}
+
+	rc = i40e_set_lan_tx_queue_context(hw, itrq->itrq_index, &ctx);
+	if (rc != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to set tx queue %d context: %d\n",
+		    itrq->itrq_index, rc);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Program the descriptor rings and the device for tx and enable each queue.
+ * See 8.4.3.1.1 for the steps. Returns B_FALSE on context failure or timeout.
+ */
+static boolean_t
+i40e_setup_tx_rings(i40e_t *i40e)
+{
+	int i;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
+		uint32_t reg;
+
+		/*
+		 * Step 1. Clear the queue disable flag and verify that the
+		 * index is set correctly.
+		 */
+		i40e_pre_tx_queue_cfg(hw, i, B_TRUE);
+
+		/*
+		 * Step 2. Prepare the queue's FPM/HMC context.
+		 */
+		if (i40e_setup_tx_hmc(itrq) == B_FALSE)
+			return (B_FALSE);
+
+		/*
+		 * Step 3. Verify that it's clear that this PF owns this queue.
+		 */
+		reg = I40E_QTX_CTL_PF_QUEUE;
+		reg |= (hw->pf_id << I40E_QTX_CTL_PF_INDX_SHIFT) &
+		    I40E_QTX_CTL_PF_INDX_MASK;
+		I40E_WRITE_REG(hw, I40E_QTX_CTL(itrq->itrq_index), reg);
+		i40e_flush(hw);
+
+		/*
+		 * Step 4. Set the QENA_REQ flag.
+		 */
+		reg = I40E_READ_REG(hw, I40E_QTX_ENA(i));
+		VERIFY0(reg & (I40E_QTX_ENA_QENA_REQ_MASK |
+		    I40E_QTX_ENA_QENA_STAT_MASK));
+		reg |= I40E_QTX_ENA_QENA_REQ_MASK;
+		I40E_WRITE_REG(hw, I40E_QTX_ENA(i), reg);
+	}
+
+	/*
+	 * Note, we wait for every queue to be enabled before we start checking.
+	 * This will hopefully cause most queues to be enabled at this point.
+	 */
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		uint32_t j, reg;
+
+		/*
+		 * Step 5. Verify that QENA_STAT has been set. It's promised
+		 * that this should occur within about 10 us, but like BSD,
+		 * we'll try for up to 100 ms for this queue.
+		 */
+		for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) {
+			reg = I40E_READ_REG(hw, I40E_QTX_ENA(i));
+
+			if (reg & I40E_QTX_ENA_QENA_STAT_MASK)
+				break;
+			i40e_msec_delay(I40E_RING_WAIT_PAUSE);
+		}
+
+		if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0) {
+			i40e_error(i40e, "failed to enable tx queue %d, timed "
+			    "out", i);
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+void
+i40e_stop(i40e_t *i40e, boolean_t free_allocations)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	/*
+	 * Shutdown and drain the tx and rx pipeline. We do this using the
+	 * following steps.
+	 *
+	 * 1) Shutdown interrupts to all the queues (trying to keep the admin
+	 *    queue alive).
+	 *
+	 * 2) Remove all of the interrupt tx and rx causes by setting the
+	 *    interrupt linked lists to zero.
+	 *
+	 * 3) Shutdown the tx and rx rings. Because i40e_shutdown_rings() should
+	 *    wait for all the queues to be disabled, once we reach that point
+	 *    it should be safe to free associated data.
+	 *
+	 * 4) Wait 50ms after all that is done. This ensures that the rings are
+	 *    ready for programming again and we don't have to think about this
+	 *    in other parts of the driver.
+	 *
+	 * 5) Disable remaining chip interrupts, (admin queue, etc.)
+	 *
+	 * 6) Check the PCI config handle with FM. NOTE(review): the register
+	 *    handle is not checked here; confirm that this is intended.
+	 */
+	i40e_intr_io_disable_all(i40e);
+	i40e_intr_io_clear_cause(i40e);
+
+	if (i40e_shutdown_rings(i40e) == B_FALSE) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
+	}
+
+	delay(50 * drv_usectohz(1000));
+
+	i40e_intr_chip_fini(i40e);
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		mutex_enter(&i40e->i40e_trqpairs[i].itrq_rx_lock);
+		mutex_enter(&i40e->i40e_trqpairs[i].itrq_tx_lock);
+	}
+
+	/*
+	 * We should consider refactoring this to be part of the ring start /
+	 * stop routines at some point.
+	 */
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_stats_trqpair_fini(&i40e->i40e_trqpairs[i]);
+	}
+
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
+	}
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_tx_cleanup_ring(&i40e->i40e_trqpairs[i]);
+	}
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		mutex_exit(&i40e->i40e_trqpairs[i].itrq_rx_lock);
+		mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock);
+	}
+
+	i40e_stat_vsi_fini(i40e);
+
+	i40e->i40e_link_speed = 0;
+	i40e->i40e_link_duplex = 0;
+	i40e_link_state_set(i40e, LINK_STATE_UNKNOWN);
+
+	if (free_allocations) {
+		i40e_free_ring_mem(i40e, B_FALSE);
+	}
+}
+
+/*
+ * Bring the device up. If alloc is set, ring memory is allocated first. Then
+ * the per-queue statistics are set up, the chip is started, the rx and tx
+ * rings are programmed, broadcast reception and the MAC config are set and,
+ * last, I/O interrupts are enabled. The caller holds i40e_general_lock.
+ *
+ * Returns B_FALSE on failure, after tearing down whatever was set up here,
+ * including any ring memory that this call allocated.
+ */
+boolean_t
+i40e_start(i40e_t *i40e, boolean_t alloc)
+{
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	boolean_t rc = B_TRUE;
+	int i, j, err;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	if (alloc && i40e_alloc_ring_mem(i40e) == B_FALSE) {
+		i40e_error(i40e, "Failed to allocate ring memory");
+		return (B_FALSE);
+	}
+
+	/*
+	 * This should get refactored to be part of ring start and stop at
+	 * some point, along with most of the logic here.
+	 */
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		if (i40e_stats_trqpair_init(&i40e->i40e_trqpairs[i]))
+			continue;
+
+		/* Undo the stats set up so far and free memory we own. */
+		for (j = 0; j < i; j++)
+			i40e_stats_trqpair_fini(&i40e->i40e_trqpairs[j]);
+		if (alloc)
+			i40e_free_ring_mem(i40e, B_TRUE);
+		return (B_FALSE);
+	}
+
+	if (!i40e_chip_start(i40e)) {
+		i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE);
+		rc = B_FALSE;
+		goto done;
+	}
+
+	if (i40e_setup_rx_rings(i40e) == B_FALSE ||
+	    i40e_setup_tx_rings(i40e) == B_FALSE) {
+		rc = B_FALSE;
+		goto done;
+	}
+
+	/*
+	 * Enable broadcast traffic; however, do not enable multicast traffic.
+	 * That's handled exclusively through MAC's mc_multicst routines.
+	 */
+	err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL);
+	if (err != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to set default VSI: %d\n", err);
+		rc = B_FALSE;
+		goto done;
+	}
+
+	err = i40e_aq_set_mac_config(hw, i40e->i40e_frame_max, B_TRUE, 0, NULL);
+	if (err != I40E_SUCCESS) {
+		i40e_error(i40e, "failed to set MAC config: %d\n", err);
+		rc = B_FALSE;
+		goto done;
+	}
+
+	/*
+	 * Finally, make sure that we're happy from an FM perspective.
+	 */
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		rc = B_FALSE;
+		goto done;
+	}
+
+	/* Clear state bits prior to final interrupt enabling. */
+	atomic_and_32(&i40e->i40e_state,
+	    ~(I40E_ERROR | I40E_STALL | I40E_OVERTEMP));
+
+	i40e_intr_io_enable_all(i40e);
+
+done:
+	if (rc == B_FALSE) {
+		i40e_stop(i40e, B_FALSE);
+		if (alloc == B_TRUE)
+			i40e_free_ring_mem(i40e, B_TRUE);
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
+	}
+
+	return (rc);
+}
+
+/*
+ * Wait for the stack to return every rx buffer we have loaned to it. Returns
+ * B_FALSE, meaning detach must not continue, if a wait times out first.
+ */
+static boolean_t
+i40e_drain_rx(i40e_t *i40e)
+{
+	boolean_t drained = B_TRUE;
+
+	mutex_enter(&i40e->i40e_rx_pending_lock);
+	while (drained && i40e->i40e_rx_pending > 0) {
+		if (cv_reltimedwait(&i40e->i40e_rx_pending_cv,
+		    &i40e->i40e_rx_pending_lock,
+		    drv_usectohz(I40E_DRAIN_RX_WAIT), TR_CLOCK_TICK) == -1)
+			drained = B_FALSE;
+	}
+	mutex_exit(&i40e->i40e_rx_pending_lock);
+
+	return (drained);
+}
+
+static int
+i40e_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
+{
+	i40e_t *i40e;
+	struct i40e_osdep *osdep;
+	i40e_hw_t *hw;
+	int instance;
+
+	if (cmd != DDI_ATTACH)		/* DDI_RESUME etc. are not handled */
+		return (DDI_FAILURE);
+
+	instance = ddi_get_instance(devinfo);
+	i40e = kmem_zalloc(sizeof (i40e_t), KM_SLEEP);
+
+	i40e->i40e_aqbuf = kmem_zalloc(I40E_ADMINQ_BUFSZ, KM_SLEEP);
+	i40e->i40e_instance = instance;
+	i40e->i40e_dip = devinfo;
+
+	hw = &i40e->i40e_hw_space;
+	osdep = &i40e->i40e_osdep_space;
+	hw->back = osdep;
+	osdep->ios_i40e = i40e;
+
+	ddi_set_driver_private(devinfo, i40e);	/* i40e_detach() reads this */
+
+	i40e_fm_init(i40e);
+	i40e->i40e_attach_progress |= I40E_ATTACH_FM_INIT;
+
+	if (pci_config_setup(devinfo, &osdep->ios_cfg_handle) != DDI_SUCCESS) {
+		i40e_error(i40e, "Failed to map PCI configurations.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_PCI_CONFIG;
+
+	if (!i40e_identify_hardware(i40e)) {
+		i40e_error(i40e, "Failed to identify hardware");
+		goto attach_fail;
+	}
+
+	if (!i40e_regs_map(i40e)) {
+		i40e_error(i40e, "Failed to map device registers.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_REGS_MAP;
+
+	i40e_init_properties(i40e);
+	i40e->i40e_attach_progress |= I40E_ATTACH_PROPS;
+
+	if (!i40e_common_code_init(i40e, hw))
+		goto attach_fail;
+	i40e->i40e_attach_progress |= I40E_ATTACH_COMMON_CODE;
+
+	/*
+	 * When we participate in IRM (interrupt resource management), we
+	 * should make sure to register ourselves with it before callbacks.
+	 */
+	if (!i40e_alloc_intrs(i40e, devinfo)) {
+		i40e_error(i40e, "Failed to allocate interrupts.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_INTR;
+
+	if (!i40e_alloc_trqpairs(i40e)) {
+		i40e_error(i40e,
+		    "Failed to allocate receive & transmit rings.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_RINGSLOCKS;
+
+	if (!i40e_map_intrs_to_vectors(i40e)) {
+		i40e_error(i40e, "Failed to map interrupts to vectors.");
+		goto attach_fail;
+	}
+
+	if (!i40e_add_intr_handlers(i40e)) {
+		i40e_error(i40e, "Failed to add the interrupt handlers.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_ADD_INTR;
+
+	if (!i40e_final_init(i40e)) {
+		i40e_error(i40e, "Final initialization failed.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_INIT;
+
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
+		goto attach_fail;
+	}
+
+	if (!i40e_stats_init(i40e)) {
+		i40e_error(i40e, "Stats initialization failed.");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_STATS;
+
+	if (!i40e_register_mac(i40e)) {
+		i40e_error(i40e, "Failed to register to MAC/GLDv3");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_MAC;
+
+	i40e->i40e_periodic_id = ddi_periodic_add(i40e_timer, i40e,
+	    I40E_CYCLIC_PERIOD, DDI_IPL_0);
+	if (i40e->i40e_periodic_id == 0) {
+		i40e_error(i40e, "Failed to add the link-check timer");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_LINK_TIMER;
+
+	if (!i40e_enable_interrupts(i40e)) {
+		i40e_error(i40e, "Failed to enable DDI interrupts");
+		goto attach_fail;
+	}
+	i40e->i40e_attach_progress |= I40E_ATTACH_ENABLE_INTR;
+
+	atomic_or_32(&i40e->i40e_state, I40E_INITIALIZED);
+
+	mutex_enter(&i40e_glock);		/* guards i40e_glist */
+	list_insert_tail(&i40e_glist, i40e);
+	mutex_exit(&i40e_glock);
+
+	return (DDI_SUCCESS);
+
+attach_fail:
+	i40e_unconfigure(devinfo, i40e);	/* shared with i40e_detach() */
+	return (DDI_FAILURE);
+}
+
+static int
+i40e_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
+{
+	i40e_t *i40e;
+
+	if (cmd != DDI_DETACH)		/* DDI_SUSPEND etc. are not handled */
+		return (DDI_FAILURE);
+
+	i40e = (i40e_t *)ddi_get_driver_private(devinfo);
+	if (i40e == NULL) {
+		i40e_log(NULL, "i40e_detach() called with no i40e pointer!");
+		return (DDI_FAILURE);
+	}
+
+	if (i40e_drain_rx(i40e) == B_FALSE) {	/* rx buffers still on loan */
+		i40e_log(i40e, "timed out draining DMA resources, %d buffers "
+		    "remain", i40e->i40e_rx_pending);
+		return (DDI_FAILURE);
+	}
+
+	mutex_enter(&i40e_glock);		/* guards i40e_glist */
+	list_remove(&i40e_glist, i40e);
+	mutex_exit(&i40e_glock);
+
+	i40e_unconfigure(devinfo, i40e);	/* shared with i40e_attach() */
+
+	return (DDI_SUCCESS);
+}
+
+static struct cb_ops i40e_cb_ops = {
+	nulldev,		/* cb_open */
+	nulldev,		/* cb_close */
+	nodev,			/* cb_strategy */
+	nodev,			/* cb_print */
+	nodev,			/* cb_dump */
+	nodev,			/* cb_read */
+	nodev,			/* cb_write */
+	nodev,			/* cb_ioctl */
+	nodev,			/* cb_devmap */
+	nodev,			/* cb_mmap */
+	nodev,			/* cb_segmap */
+	nochpoll,		/* cb_chpoll */
+	ddi_prop_op,		/* cb_prop_op */
+	NULL,			/* cb_stream */
+	D_MP | D_HOTPLUG,	/* cb_flag */
+	CB_REV,			/* cb_rev */
+	nodev,			/* cb_aread */
+	nodev			/* cb_awrite */
+};
+
+static struct dev_ops i40e_dev_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* devo_refcnt */
+	NULL,			/* devo_getinfo */
+	nulldev,		/* devo_identify */
+	nulldev,		/* devo_probe */
+	i40e_attach,		/* devo_attach */
+	i40e_detach,		/* devo_detach */
+	nodev,			/* devo_reset */
+	&i40e_cb_ops,		/* devo_cb_ops */
+	NULL,			/* devo_bus_ops */
+	ddi_power,		/* devo_power */
+	ddi_quiesce_not_supported /* devo_quiesce */
+};
+
+static struct modldrv i40e_modldrv = {
+	&mod_driverops,		/* drv_modops: this module is a driver */
+	i40e_ident,		/* drv_linkinfo */
+	&i40e_dev_ops		/* drv_dev_ops */
+};
+
+static struct modlinkage i40e_modlinkage = {
+	MODREV_1,		/* ml_rev */
+	&i40e_modldrv,		/* ml_linkage[0] */
+	NULL			/* ml_linkage[] terminator */
+};
+
+/* Module Initialization Functions. */
+int
+_init(void)
+{
+	int ret;
+
+	/* Set up the driver's global state before installing the module. */
+	list_create(&i40e_glist, sizeof (i40e_t), offsetof(i40e_t, i40e_glink));
+	list_create(&i40e_dlist, sizeof (i40e_device_t),
+	    offsetof(i40e_device_t, id_link));
+	mutex_init(&i40e_glock, NULL, MUTEX_DRIVER, NULL);
+	mac_init_ops(&i40e_dev_ops, I40E_MODULE_NAME);
+
+	ret = mod_install(&i40e_modlinkage);
+	if (ret == DDI_SUCCESS)
+		return (ret);
+
+	/* Installation failed: undo the setup above in reverse order. */
+	mac_fini_ops(&i40e_dev_ops);
+	mutex_destroy(&i40e_glock);
+	list_destroy(&i40e_dlist);
+	list_destroy(&i40e_glist);
+	return (ret);
+}
+
+/* Module information entry point: report on our linkage. */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&i40e_modlinkage, modinfop));
+}
+
+/* Unload entry point: global state goes away only if removal succeeds. */
+int
+_fini(void)
+{
+	int ret = mod_remove(&i40e_modlinkage);
+
+	if (ret == DDI_SUCCESS) {
+		mac_fini_ops(&i40e_dev_ops);
+		mutex_destroy(&i40e_glock);
+		list_destroy(&i40e_dlist);
+		list_destroy(&i40e_glist);
+	}
+	return (ret);
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_osdep.c b/usr/src/uts/common/io/i40e/i40e_osdep.c
new file mode 100644
index 0000000000..41a13ee4ec
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_osdep.c
@@ -0,0 +1,236 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include "i40e_sw.h"
+#include "i40e_type.h"
+#include "i40e_alloc.h"
+#include "i40e_osdep.h"
+
+#include <sys/dtrace.h>
+
+/* ARGSUSED */
+i40e_status
+i40e_allocate_virt_mem(struct i40e_hw *hw, struct i40e_virt_mem *mem, u32 size)
+{
+	mem->va = kmem_zalloc(size, KM_SLEEP);	/* zeroed; sleeps for memory */
+	mem->size = size;			/* needed by kmem_free() later */
+	return (I40E_SUCCESS);
+}
+
+/* ARGSUSED */
+i40e_status
+i40e_free_virt_mem(struct i40e_hw *hw, struct i40e_virt_mem *mem)
+{
+	if (mem->va != NULL)			/* skip if nothing to free */
+		kmem_free(mem->va, mem->size);	/* NOTE: va is not cleared */
+	return (I40E_SUCCESS);
+}
+
+/*
+ * Allocate, zero and bind a DMA region of size bytes for the common code,
+ * honoring its alignment. On failure nothing is left allocated.
+ */
+/* ARGSUSED */
+i40e_status
+i40e_allocate_dma_mem(struct i40e_hw *hw, struct i40e_dma_mem *mem,
+    enum i40e_memory_type type, u64 size, u32 alignment)
+{
+	int rc;
+	i40e_t *i40e = OS_DEP(hw)->ios_i40e;
+	size_t len;
+	ddi_dma_cookie_t cookie;
+	uint_t cookie_num;
+	ddi_dma_attr_t attr;
+
+	/*
+	 * Because we need to honor the specified alignment, we need to
+	 * dynamically construct the attributes. We save the alignment for
+	 * debugging purposes.
+	 */
+	bcopy(&i40e->i40e_static_dma_attr, &attr, sizeof (ddi_dma_attr_t));
+	attr.dma_attr_align = alignment;
+	mem->idm_alignment = alignment;
+	rc = ddi_dma_alloc_handle(i40e->i40e_dip, &attr, DDI_DMA_DONTWAIT,
+	    NULL, &mem->idm_dma_handle);
+	if (rc != DDI_SUCCESS) {
+		mem->idm_dma_handle = NULL;
+		i40e_error(i40e, "failed to allocate DMA handle for common "
+		    "code: %d", rc);
+		/*
+		 * Swallow unknown errors and treat them like we do
+		 * DDI_DMA_NORESOURCES, in other words, a memory error.
+		 */
+		if (rc == DDI_DMA_BADATTR)
+			return (I40E_ERR_PARAM);
+		return (I40E_ERR_NO_MEMORY);
+	}
+
+	rc = ddi_dma_mem_alloc(mem->idm_dma_handle, size,
+	    &i40e->i40e_buf_acc_attr, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
+	    NULL, (caddr_t *)&mem->va, &len, &mem->idm_acc_handle);
+	if (rc != DDI_SUCCESS) {
+		mem->idm_acc_handle = NULL;
+		mem->va = NULL;
+		ASSERT(mem->idm_dma_handle != NULL);
+		ddi_dma_free_handle(&mem->idm_dma_handle);
+		mem->idm_dma_handle = NULL;
+
+		i40e_error(i40e, "failed to allocate %llu bytes of DMA memory "
+		    "for common code", (u_longlong_t)size);
+		return (I40E_ERR_NO_MEMORY);
+	}
+
+	bzero(mem->va, len);
+
+	rc = ddi_dma_addr_bind_handle(mem->idm_dma_handle, NULL, mem->va, len,
+	    DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL,
+	    &cookie, &cookie_num);
+	if (rc != DDI_DMA_MAPPED) {
+		mem->pa = 0;
+		ASSERT(mem->idm_acc_handle != NULL);
+		ddi_dma_mem_free(&mem->idm_acc_handle);
+		mem->idm_acc_handle = NULL;
+		mem->va = NULL;
+		ASSERT(mem->idm_dma_handle != NULL);
+		ddi_dma_free_handle(&mem->idm_dma_handle);
+		mem->idm_dma_handle = NULL;
+
+		i40e_error(i40e, "failed to bind %lu byte sized dma region: %d",
+		    (ulong_t)len, rc);
+		switch (rc) {
+		case DDI_DMA_INUSE:
+			return (I40E_ERR_NOT_READY);
+		case DDI_DMA_TOOBIG:
+			return (I40E_ERR_INVALID_SIZE);
+		case DDI_DMA_NOMAPPING:
+		case DDI_DMA_NORESOURCES:
+		default:
+			return (I40E_ERR_NO_MEMORY);
+		}
+	}
+
+	ASSERT(cookie_num == 1);
+	mem->pa = cookie.dmac_laddress;
+	/*
+	 * Lint dislikes narrowing the common code's u64 to a size_t here.
+	 */
+	mem->size = (size_t)size;
+
+	return (I40E_SUCCESS);
+}
+
+/* ARGSUSED */
+i40e_status
+i40e_free_dma_mem(struct i40e_hw *hw, struct i40e_dma_mem *mem)
+{
+	if (mem->pa != 0) {			/* region is still bound */
+		VERIFY(mem->idm_dma_handle != NULL);
+		(void) ddi_dma_unbind_handle(mem->idm_dma_handle);
+		mem->pa = 0;
+		mem->size = 0;
+	}
+
+	if (mem->idm_acc_handle != NULL) {	/* memory is still allocated */
+		ddi_dma_mem_free(&mem->idm_acc_handle);
+		mem->idm_acc_handle = NULL;
+		mem->va = NULL;
+	}
+
+	if (mem->idm_dma_handle != NULL) {	/* handle is still allocated */
+		ddi_dma_free_handle(&mem->idm_dma_handle);
+		mem->idm_dma_handle = NULL;
+	}
+
+	/*
+	 * Whatever state mem started in, it must be fully torn down by now.
+	 */
+	ASSERT(mem->pa == 0);
+	ASSERT(mem->va == NULL);
+	ASSERT(mem->size == 0);
+	mem->idm_alignment = UINT32_MAX;
+
+	return (I40E_SUCCESS);
+}
+
+/*
+ * The common code asks for 'spinlocks'; we give it adaptive mutexes. At this
+ * time these are only used to maintain the adminq's data. As such they are
+ * only taken outside of interrupt context and, even then, never by anything
+ * that runs above lock level, i.e. in a high-level interrupt. That is why
+ * MUTEX_DRIVER with no interrupt priority (NULL) is used below.
+ */
+void
+i40e_init_spinlock(struct i40e_spinlock *lock)
+{
+	mutex_init(&lock->ispl_mutex, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+i40e_acquire_spinlock(struct i40e_spinlock *lock)
+{
+	mutex_enter(&lock->ispl_mutex);
+}
+
+void
+i40e_release_spinlock(struct i40e_spinlock *lock)
+{
+	mutex_exit(&lock->ispl_mutex);
+}
+
+void
+i40e_destroy_spinlock(struct i40e_spinlock *lock)
+{
+	mutex_destroy(&lock->ispl_mutex);
+}
+
+/*
+ * Read the PCIe link status register from config space and pass it on to
+ * i40e_set_pci_config_data(). Returns B_FALSE if the capability isn't found.
+ */
+boolean_t
+i40e_set_hw_bus_info(struct i40e_hw *hw)
+{
+	uint16_t cap_base, link_sts;
+	int rc;
+
+	rc = pci_lcap_locate(OS_DEP(hw)->ios_cfg_handle, PCI_CAP_ID_PCI_E,
+	    &cap_base);
+	if (rc != DDI_SUCCESS) {
+		i40e_error(OS_DEP(hw)->ios_i40e, "failed to locate PCIe "
+		    "capability block: %d", rc);
+		return (B_FALSE);
+	}
+
+	link_sts = pci_config_get16(OS_DEP(hw)->ios_cfg_handle,
+	    cap_base + PCIE_LINKSTS);
+	i40e_set_pci_config_data(hw, link_sts);
+
+	return (B_TRUE);
+}
+
+/* ARGSUSED */
+void
+i40e_debug(void *hw, u32 mask, char *fmt, ...)
+{
+	char buf[1024];		/* messages longer than this are truncated */
+	va_list args;		/* output goes only to the DTrace probe below */
+
+	va_start(args, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, args);
+	va_end(args);
+
+	DTRACE_PROBE2(i40e__debug, uint32_t, mask, char *, buf);
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_osdep.h b/usr/src/uts/common/io/i40e/i40e_osdep.h
new file mode 100644
index 0000000000..12f498bc72
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_osdep.h
@@ -0,0 +1,201 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _I40E_OSDEP_H
+#define	_I40E_OSDEP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci_cap.h>
+#include <sys/sysmacros.h>
+
+/*
+ * For the moment, we use this to basically deal with a few custom changes
+ * particularly around mutex initialization. This is used to indicate that we
+ * should take illumos variants.
+ */
+#define	I40E_ILLUMOS 1
+
+#define	DEBUGOUT(S)				i40e_debug(NULL, 0, S)
+#define	DEBUGOUT1(S, A)				i40e_debug(NULL, 0, S, A)
+#define	DEBUGOUT2(S, A, B)			i40e_debug(NULL, 0, S, A, B)
+#define	DEBUGOUT3(S, A, B, C)			i40e_debug(NULL, 0, S, A, B, C)
+#define	DEBUGOUT4(S, A, B, C, D)		\
+	i40e_debug(NULL, 0, S, A, B, C, D)
+#define	DEBUGOUT5(S, A, B, C, D, E)		\
+	i40e_debug(NULL, 0, S, A, B, C, D, E)
+#define	DEBUGOUT6(S, A, B, C, D, E, F)		\
+	i40e_debug(NULL, 0, S, A, B, C, D, E, F)
+#define	DEBUGOUT7(S, A, B, C, D, E, F, G)	\
+	i40e_debug(NULL, 0, S, A, B, C, D, E, F, G)
+#define	DEBUGFUNC(F)				DEBUGOUT(F);
+
+
+#define	UNREFERENCED_PARAMETER(x)		_NOTE(ARGUNUSED(x))
+#define	UNREFERENCED_1PARAMETER(_p)		UNREFERENCED_PARAMETER(_p)
+#define	UNREFERENCED_2PARAMETER(_p, _q)		_NOTE(ARGUNUSED(_p, _q))
+#define	UNREFERENCED_3PARAMETER(_p, _q, _r)	_NOTE(ARGUNUSED(_p, _q, _r))
+#define	UNREFERENCED_4PARAMETER(_p, _q, _r, _s)	_NOTE(ARGUNUSED(_p, _q,_r, _s))
+
+#define	INLINE  inline
+
+/*
+ * The mdb dmod needs to use this code as well, but mdb already defines TRUE and
+ * FALSE in the module API. Thus we don't define these if we're building the
+ * dmod, as indicated by _I40E_MDB_DMOD. However, if we don't define these, then
+ * the shared code will be upset.
+ */
+#ifndef _I40E_MDB_DMOD
+#define	FALSE	B_FALSE
+#define	false	B_FALSE
+#define	TRUE	B_TRUE
+#define	true	B_TRUE
+#endif /* _I40E_MDB_DMOD */
+
+
+#define	CPU_TO_LE16(o)	LE_16(o)
+#define	CPU_TO_LE32(s)	LE_32(s)
+#define	CPU_TO_LE64(h)	LE_64(h)
+#define	LE16_TO_CPU(a)	LE_16(a)
+#define	LE32_TO_CPU(c)	LE_32(c)
+#define	LE64_TO_CPU(k)	LE_64(k)
+
+#define	I40E_NTOHS(a)	ntohs(a)
+#define	I40E_NTOHL(a)	ntohl(a)
+#define	I40E_HTONS(a)	htons(a)
+#define	I40E_HTONL(a)	htonl(a)
+
+#define	i40e_memset(a, b, c, d)  memset((a), (b), (c))
+#define	i40e_memcpy(a, b, c, d)  bcopy((b), (a), (c))
+
+#define	i40e_usec_delay(x) drv_usecwait(x)
+#define	i40e_msec_delay(x) drv_usecwait(1000 * (x))
+
+#define	FIELD_SIZEOF(x, y) (sizeof (((x*)0)->y))
+
+#define	BIT(a) 		(1UL << (a))
+#define	BIT_ULL(a) 	(1ULL << (a))
+
+typedef boolean_t	bool;
+
+typedef uint8_t		u8;
+typedef int8_t		s8;
+typedef uint16_t	u16;
+typedef int16_t		s16;
+typedef uint32_t	u32;
+typedef int32_t		s32;
+typedef uint64_t	u64;
+
+/* long string relief */
+typedef enum i40e_status_code i40e_status;
+
+#define	__le16  u16
+#define	__le32  u32
+#define	__le64  u64
+#define	__be16  u16
+#define	__be32  u32
+#define	__be64  u64
+
+/*
+ * Most other systems use spin locks for interrupts. However, illumos always
+ * uses a single kmutex_t for both and we decide what to do based on IPL (hint:
+ * it's not going to be a true spin lock, we'll use an adaptive mutex).
+ */
+struct i40e_spinlock {
+	kmutex_t ispl_mutex;
+};
+
+/*
+ * Note, while prefetch is strictly not present on all architectures, (it's an
+ * SSE extension on i386), it is expected that the platforms provide it.
+ */
+#define	prefetch(x) prefetch_read_many(x)
+
+struct i40e_osdep {
+	off_t			ios_reg_size;
+	ddi_acc_handle_t 	ios_reg_handle;
+	ddi_acc_handle_t 	ios_cfg_handle;
+	struct i40e		*ios_i40e;
+};
+
+/*
+ * This structure and its members are defined by the common code. This means we
+ * cannot structure prefix it, even if we want to.
+ */
+struct i40e_virt_mem {
+	void 	*va;
+	u32	size;
+};
+
+/*
+ * The first three members of this structure are defined by the common code.
+ * This means we cannot structure prefix them, even if we wanted to.
+ */
+struct i40e_dma_mem {
+	void			*va;	/* Virtual address. */
+	u64			pa;	/* Physical (DMA/Hardware) address. */
+	size_t			size;	/* Buffer size. */
+
+	/* illumos-private members */
+	ddi_acc_handle_t	idm_acc_handle;	/* Data access handle */
+	ddi_dma_handle_t	idm_dma_handle;	/* DMA handle */
+	uint32_t		idm_alignment;	/* Requested alignment */
+};
+
+struct i40e_hw; /* forward decl */
+
+#define	OS_DEP(hw) ((struct i40e_osdep *)((hw)->back))
+#define	i40e_read_pci_cfg(hw, reg) \
+	(pci_config_get16(OS_DEP(hw)->ios_cfg_handle, (reg)))
+#define	i40e_write_pci_cfg(hw, reg, value) \
+	(pci_config_put16(OS_DEP(hw)->ios_cfg_handle, (reg), (value)))
+
+/*
+ * Intel expects that the symbols wr32 and rd32 be defined to something which
+ * can read and write the 32-bit register in PCI space.
+ *
+ * To make it easier for readers and satisfy the general agreement that macros
+ * should be in all capitals, we use our own versions of these macros.
+ */
+#define	wr32(hw, reg, value) \
+	ddi_put32(OS_DEP(hw)->ios_reg_handle, \
+	    (uint32_t *)((uintptr_t)(hw)->hw_addr + (reg)), (value))
+#define	rd32(hw, reg) \
+	ddi_get32(OS_DEP(hw)->ios_reg_handle, \
+	    (uint32_t *)((uintptr_t)(hw)->hw_addr + (reg)))
+#define	I40E_WRITE_REG	wr32
+#define	I40E_READ_REG	rd32
+
+/*
+ * The use of GLGEN_STAT presumes that we're only using this file for a PF
+ * driver. If we end up doing a VF driver, then we'll want to logically change
+ * this.
+ */
+#define	i40e_flush(hw) (void) rd32(hw, I40E_GLGEN_STAT)
+
+extern void i40e_debug(void *, u32, char *, ...);
+extern boolean_t i40e_set_hw_bus_info(struct i40e_hw *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _I40E_OSDEP_H */
diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c
new file mode 100644
index 0000000000..c7dd403fc8
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_stats.c
@@ -0,0 +1,1310 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include "i40e_sw.h"
+
+/*
+ * -------------------
+ * Statistics Overview
+ * -------------------
+ *
+ * As part of managing the driver and understanding what's going on, we keep
+ * track of statistics from two different sources:
+ *
+ *   - Statistics from the device
+ *   - Statistics maintained by the driver
+ *
+ * Generally, the hardware provides us traditional IETF and MIB Ethernet
+ * statistics, for example, the total packets in and out, various errors in
+ * packets, the negotiated status etc. The driver, on the other hand, primarily
+ * contains statistics around driver-specific issues, such as information about
+ * checksumming on receive and transmit and the data in and out of a specific
+ * ring.
+ *
+ * We export statistics in two different forms. The first form is the required
+ * GLDv3 endpoints, specifically:
+ *
+ *   - The general GLDv3 mc_getstat interface
+ *   - The GLDv3 ring mri_stat interface
+ *
+ * The second form in which we export statistics is through kstats. kstats are
+ * exported in different ways. Particularly we arrange the kstats to mirror the
+ * layout of the device. Currently we have kstats which capture both the IEEE
+ * and driver-implementation specific stats. There are kstats for each of the
+ * following structures:
+ *
+ *   - Each physical function
+ *   - Each VSI
+ *   - Each Queue
+ *
+ * The PF's kstat is called 'pfstats' so as not to collide with other system
+ * provided kstats. Thus, for instance 0, usually the first PF, the full kstat
+ * would be: i40e:0:pfstats:.
+ *
+ * The kstat for each VSI is called vsi_%id. So for the first PF, which is
+ * instance zero and the first vsi, which has id 0, it will be named vsi_0 and
+ * the full kstat would be i40e:0:vsi_0:.
+ *
+ * The kstat for each queue is trqpair_tx_%queue and trqpair_rx_%queue. Note
+ * that these are labeled based on their local index, which may mean that
+ * different instances have overlapping sets of queues. This isn't a problem as
+ * the kstats will always use the instance number of the pf to distinguish it in
+ * the kstat tuple.
+ *
+ * ---------------------
+ * Hardware Arrangements
+ * ---------------------
+ *
+ * The hardware keeps statistics at each physical function/MAC (PF) and it keeps
+ * statistics on each virtual station interface (VSI). Currently we only use one
+ * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited
+ * number of statistics units available. While every PF is guaranteed to have a
+ * statistics unit, it is possible that we will run out for a given VSI. We'll
+ * have to figure out an appropriate strategy here when we end up supporting
+ * multiple VSIs.
+ *
+ * The hardware keeps these statistics as 32-bit and 48-bit counters. We are
+ * required to read them and then compute the differences between them. The
+ * 48-bit counters span more than one 32-bit register in the BAR. The hardware
+ * suggests that to read them, we perform 64-bit reads of the lower of the two
+ * registers that make up a 48-bit stat. The hardware guarantees that the reads
+ * of those two registers will be atomic and we'll get a consistent value, not a
+ * property it has for every read of two registers.
+ *
+ * For every kstat we have based on this, we have a corresponding uint64_t that
+ * we keep around as a base value in a separate structure. Whenever we read a
+ * value, we end up grabbing the current value, calculating a difference between
+ * the previously stored value and the current one, and updating the kstat with
+ * that difference. After which, we go through and update the base value that we
+ * stored. This is all encapsulated in i40e_stat_get_uint32() and
+ * i40e_stat_get_uint48().
+ *
+ * The only unfortunate thing here is that the hardware doesn't give us any kind
+ * of overflow counter. It just tries to make sure that the uint32_t and
+ * uint48_t counters are large enough to hopefully not overflow right away. This
+ * isn't the most reassuring statement and we should investigate ways of
+ * ensuring that if a system is active, but not actively measured, we don't lose
+ * data.
+ *
+ * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the
+ * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in
+ * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All
+ * of this data is protected by the i40e_stat_lock, which should be taken last,
+ * when acquiring locks.
+ */
+
+static void
+i40e_stat_get_uint48(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat,
+    uint64_t *base, boolean_t init)
+{
+	ddi_acc_handle_t regh = i40e->i40e_osdep_space.ios_reg_handle;
+	uintptr_t addr = (uintptr_t)i40e->i40e_hw_space.hw_addr + reg;
+	uint64_t cur, prev;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_stat_lock));
+
+	/* A single 64-bit read covers both halves of the 48-bit counter. */
+	cur = ddi_get64(regh, (uint64_t *)addr);
+
+	/* When initializing we only record the baseline; no kstat update. */
+	if (init == B_TRUE) {
+		*base = cur;
+		return;
+	}
+
+	/*
+	 * The counter is only 48 bits wide even though it spans two 32-bit
+	 * registers, so a value below the baseline means it wrapped at 2^48.
+	 */
+	prev = *base;
+	if (cur >= prev)
+		kstat->value.ui64 += cur - prev;
+	else
+		kstat->value.ui64 += 0x1000000000000ULL - prev + cur;
+	*base = cur;
+}
+
+static void
+i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat,
+    uint64_t *base, boolean_t init)
+{
+	ddi_acc_handle_t regh = i40e->i40e_osdep_space.ios_reg_handle;
+	uintptr_t addr = (uintptr_t)i40e->i40e_hw_space.hw_addr + reg;
+	uint64_t cur, prev;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_stat_lock));
+
+	/* The 32-bit register value is widened so the math is 64-bit. */
+	cur = ddi_get32(regh, (uint32_t *)addr);
+
+	/* When initializing we only record the baseline; no kstat update. */
+	if (init == B_TRUE) {
+		*base = cur;
+		return;
+	}
+
+	/*
+	 * The hardware counter is only 32 bits wide, so a value below the
+	 * baseline means that it wrapped at 2^32.
+	 */
+	prev = *base;
+	if (cur >= prev)
+		kstat->value.ui64 += cur - prev;
+	else
+		kstat->value.ui64 += 0x100000000ULL - prev + cur;
+	*base = cur;
+}
+
+static void
+i40e_stat_vsi_update(i40e_t *i40e, boolean_t init)
+{
+	i40e_vsi_stats_t *ivs;
+	i40e_vsi_kstats_t *ivk;
+	int id = i40e->i40e_vsi_stat_id;
+
+	ASSERT(i40e->i40e_vsi_kstat != NULL);
+	ivs = &i40e->i40e_vsi_stat;
+	ivk = i40e->i40e_vsi_kstat->ks_data;
+
+	mutex_enter(&i40e->i40e_stat_lock);
+
+	i40e_stat_get_uint48(i40e, I40E_GLV_GORCL(id), &ivk->ivk_rx_bytes,
+	    &ivs->ivs_rx_bytes, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_UPRCL(id), &ivk->ivk_rx_unicast,
+	    &ivs->ivs_rx_unicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_MPRCL(id), &ivk->ivk_rx_multicast,
+	    &ivs->ivs_rx_multicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_BPRCL(id), &ivk->ivk_rx_broadcast,
+	    &ivs->ivs_rx_broadcast, init);
+
+	i40e_stat_get_uint32(i40e, I40E_GLV_RDPC(id), &ivk->ivk_rx_discards,
+	    &ivs->ivs_rx_discards, init);
+	i40e_stat_get_uint32(i40e, I40E_GLV_RUPP(id),
+	    &ivk->ivk_rx_unknown_protocol,
+	    &ivs->ivs_rx_unknown_protocol,
+	    init);
+
+	i40e_stat_get_uint48(i40e, I40E_GLV_GOTCL(id), &ivk->ivk_tx_bytes,
+	    &ivs->ivs_tx_bytes, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_UPTCL(id), &ivk->ivk_tx_unicast,
+	    &ivs->ivs_tx_unicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_MPTCL(id), &ivk->ivk_tx_multicast,
+	    &ivs->ivs_tx_multicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLV_BPTCL(id), &ivk->ivk_tx_broadcast,
+	    &ivs->ivs_tx_broadcast, init);
+
+	i40e_stat_get_uint32(i40e, I40E_GLV_TEPC(id), &ivk->ivk_tx_errors,
+	    &ivs->ivs_tx_errors, init);
+
+	mutex_exit(&i40e->i40e_stat_lock);
+
+	/*
+	 * We follow ixgbe's lead here: if a kstat update didn't work 100%,
+	 * then we mark the service as unaffected, as opposed to what we do
+	 * when fetching things for MAC directly.
+	 */
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_UNAFFECTED);
+	}
+}
+
+static int
+i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw)
+{
+	/* These statistics are read-only; refuse any attempt to write. */
+	if (rw == KSTAT_WRITE) {
+		return (EACCES);
+	}
+
+	/* ks_private is the i40e_t stashed by i40e_stat_vsi_init(). */
+	i40e_stat_vsi_update((i40e_t *)ksp->ks_private, B_FALSE);
+	return (0);
+}
+
+void
+i40e_stat_vsi_fini(i40e_t *i40e)
+{
+	if (i40e->i40e_vsi_kstat == NULL)
+		return;
+	kstat_delete(i40e->i40e_vsi_kstat);
+	i40e->i40e_vsi_kstat = NULL;
+}
+
+boolean_t
+i40e_stat_vsi_init(i40e_t *i40e)
+{
+	kstat_t *ksp;
+	i40e_vsi_kstats_t *ivk;
+	char buf[64];
+
+	(void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id);
+
+	ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip),
+	    buf, "net", KSTAT_TYPE_NAMED,
+	    sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0);
+
+	if (ksp == NULL) {
+		i40e_error(i40e, "Failed to create kstats for VSI %d",
+		    i40e->i40e_vsi_id);
+		return (B_FALSE);
+	}
+
+	i40e->i40e_vsi_kstat = ksp;
+	ivk = ksp->ks_data;
+	ksp->ks_update = i40e_stat_vsi_kstat_update;
+	ksp->ks_private = i40e;
+
+	kstat_named_init(&ivk->ivk_rx_bytes, "rx_bytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_rx_unicast, "rx_unicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_rx_multicast, "rx_multicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_rx_broadcast, "rx_broadcast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_rx_discards, "rx_discards",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_rx_unknown_protocol, "rx_unknown_protocol",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_tx_bytes, "tx_bytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_tx_unicast, "tx_unicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_tx_multicast, "tx_multicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_tx_broadcast, "tx_broadcast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ivk->ivk_tx_errors, "tx_errors",
+	    KSTAT_DATA_UINT64);
+
+	bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t));
+	i40e_stat_vsi_update(i40e, B_TRUE);
+	kstat_install(i40e->i40e_vsi_kstat);
+
+	return (B_TRUE);
+}
+
+static void
+i40e_stat_pf_update(i40e_t *i40e, boolean_t init)
+{
+	i40e_pf_stats_t *ips;
+	i40e_pf_kstats_t *ipk;
+	int port = i40e->i40e_hw_space.port;
+	int i;
+
+	ASSERT(i40e->i40e_pf_kstat != NULL);
+	ips = &i40e->i40e_pf_stat;
+	ipk = i40e->i40e_pf_kstat->ks_data;
+
+	mutex_enter(&i40e->i40e_stat_lock);
+
+	/* 64-bit PCIe regs */
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_GORCL(port),
+	    &ipk->ipk_rx_bytes, &ips->ips_rx_bytes, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_UPRCL(port),
+	    &ipk->ipk_rx_unicast, &ips->ips_rx_unicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port),
+	    &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port),
+	    &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_GOTCL(port),
+	    &ipk->ipk_tx_bytes, &ips->ips_tx_bytes, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_UPTCL(port),
+	    &ipk->ipk_tx_unicast, &ips->ips_tx_unicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_MPTCL(port),
+	    &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port),
+	    &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, init);
+
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC64L(port),
+	    &ipk->ipk_rx_size_64, &ips->ips_rx_size_64, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC127L(port),
+	    &ipk->ipk_rx_size_127, &ips->ips_rx_size_127, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC255L(port),
+	    &ipk->ipk_rx_size_255, &ips->ips_rx_size_255, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC511L(port),
+	    &ipk->ipk_rx_size_511, &ips->ips_rx_size_511, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC1023L(port),
+	    &ipk->ipk_rx_size_1023, &ips->ips_rx_size_1023, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC1522L(port),
+	    &ipk->ipk_rx_size_1522, &ips->ips_rx_size_1522, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC9522L(port),
+	    &ipk->ipk_rx_size_9522, &ips->ips_rx_size_9522, init);
+
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC64L(port),
+	    &ipk->ipk_tx_size_64, &ips->ips_tx_size_64, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC127L(port),
+	    &ipk->ipk_tx_size_127, &ips->ips_tx_size_127, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC255L(port),
+	    &ipk->ipk_tx_size_255, &ips->ips_tx_size_255, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC511L(port),
+	    &ipk->ipk_tx_size_511, &ips->ips_tx_size_511, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC1023L(port),
+	    &ipk->ipk_tx_size_1023, &ips->ips_tx_size_1023, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC1522L(port),
+	    &ipk->ipk_tx_size_1522, &ips->ips_tx_size_1522, init);
+	i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC9522L(port),
+	    &ipk->ipk_tx_size_9522, &ips->ips_tx_size_9522, init);
+
+	/* 32-bit PCIe regs */
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_LXONRXC(port),
+	    &ipk->ipk_link_xon_rx, &ips->ips_link_xon_rx, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_LXOFFRXC(port),
+	    &ipk->ipk_link_xoff_rx, &ips->ips_link_xoff_rx, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_LXONTXC(port),
+	    &ipk->ipk_link_xon_tx, &ips->ips_link_xon_tx, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_LXOFFTXC(port),
+	    &ipk->ipk_link_xoff_tx, &ips->ips_link_xoff_tx, init);
+
+	for (i = 0; i < 8; i++) {
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_PXONRXC(port, i),
+		    &ipk->ipk_priority_xon_rx[i], &ips->ips_priority_xon_rx[i],
+		    init);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_PXOFFRXC(port, i),
+		    &ipk->ipk_priority_xoff_rx[i],
+		    &ips->ips_priority_xoff_rx[i],
+		    init);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_PXONTXC(port, i),
+		    &ipk->ipk_priority_xon_tx[i], &ips->ips_priority_xon_tx[i],
+		    init);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_PXOFFTXC(port, i),
+		    &ipk->ipk_priority_xoff_tx[i],
+		    &ips->ips_priority_xoff_tx[i],
+		    init);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RXON2OFFCNT(port, i),
+		    &ipk->ipk_priority_xon_2_xoff[i],
+		    &ips->ips_priority_xon_2_xoff[i],
+		    init);
+	}
+
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port),
+	    &ipk->ipk_crc_errors, &ips->ips_crc_errors, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port),
+	    &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_MLFC(port),
+	    &ipk->ipk_mac_local_faults, &ips->ips_mac_local_faults, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_MRFC(port),
+	    &ipk->ipk_mac_remote_faults, &ips->ips_mac_remote_faults, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port),
+	    &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port),
+	    &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port),
+	    &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port),
+	    &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port),
+	    &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RDPC(port),
+	    &ipk->ipk_rx_discards, &ips->ips_rx_discards, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_LDPC(port),
+	    &ipk->ipk_rx_vm_discards, &ips->ips_rx_vm_discards, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port),
+	    &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards, init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_TDOLD(port),
+	    &ipk->ipk_tx_dropped_link_down, &ips->ips_tx_dropped_link_down,
+	    init);
+	i40e_stat_get_uint32(i40e, I40E_GLPRT_RUPP(port),
+	    &ipk->ipk_rx_unknown_protocol, &ips->ips_rx_unknown_protocol, init);
+
+	/* 64-bit */
+	i40e_stat_get_uint48(i40e, I40E_GL_RXERR1_L(port), &ipk->ipk_rx_err1,
+	    &ips->ips_rx_err1, init);
+	i40e_stat_get_uint48(i40e, I40E_GL_RXERR2_L(port), &ipk->ipk_rx_err2,
+	    &ips->ips_rx_err2, init);
+
+	mutex_exit(&i40e->i40e_stat_lock);
+
+	/*
+	 * We follow ixgbe's lead here: if a kstat update didn't work 100%,
+	 * then we mark the service as unaffected, as opposed to what we do
+	 * when fetching things for MAC directly.
+	 */
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_UNAFFECTED);
+	}
+}
+
+static int
+i40e_stat_pf_kstat_update(kstat_t *ksp, int rw)
+{
+	/* These statistics are read-only; refuse any attempt to write. */
+	if (rw == KSTAT_WRITE) {
+		return (EACCES);
+	}
+
+	/* ks_private is the i40e_t stashed by i40e_stat_pf_init(). */
+	i40e_stat_pf_update((i40e_t *)ksp->ks_private, B_FALSE);
+	return (0);
+}
+
+
+static boolean_t
+i40e_stat_pf_init(i40e_t *i40e)
+{
+	kstat_t *ksp;
+	i40e_pf_kstats_t *ipk;
+
+	ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip),
+	    "pfstats", "net", KSTAT_TYPE_NAMED,
+	    sizeof (i40e_pf_kstats_t) / sizeof (kstat_named_t), 0);
+	if (ksp == NULL) {
+		i40e_error(i40e, "Could not create kernel statistics.");
+		return (B_FALSE);
+	}
+
+	i40e->i40e_pf_kstat = ksp;
+	ipk = ksp->ks_data;
+	ksp->ks_update = i40e_stat_pf_kstat_update;
+	ksp->ks_private = i40e;
+
+	kstat_named_init(&ipk->ipk_rx_bytes, "rx_bytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_unicast, "rx_unicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_multicast, "rx_multicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_broadcast, "rx_broadcast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_bytes, "tx_bytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_unicast, "tx_unicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_multicast, "tx_multicast",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_broadcast, "tx_broadcast",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_rx_size_64, "rx_size_64",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_127, "rx_size_127",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_255, "rx_size_255",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_511, "rx_size_511",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_1023, "rx_size_1023",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_1522, "rx_size_1522",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_size_9522, "rx_size_9522",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_tx_size_64, "tx_size_64",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_127, "tx_size_127",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_255, "tx_size_255",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_511, "tx_size_511",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_1023, "tx_size_1023",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_1522, "tx_size_1522",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_size_9522, "tx_size_9522",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_link_xon_rx, "link_xon_rx",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_link_xoff_rx, "link_xoff_rx",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_link_xon_tx, "link_xon_tx",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_link_xoff_tx, "link_xoff_tx",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[0], "priority_xon_rx[0]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[0], "priority_xoff_rx[0]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[0], "priority_xon_tx[0]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[0], "priority_xoff_tx[0]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[0],
+	    "priority_xon_2_xoff[0]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[1], "priority_xon_rx[1]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[1], "priority_xoff_rx[1]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[1], "priority_xon_tx[1]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[1], "priority_xoff_tx[1]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[1],
+	    "priority_xon_2_xoff[1]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[2], "priority_xon_rx[2]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[2], "priority_xoff_rx[2]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[2], "priority_xon_tx[2]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[2], "priority_xoff_tx[2]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[2],
+	    "priority_xon_2_xoff[2]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[3], "priority_xon_rx[3]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[3], "priority_xoff_rx[3]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[3], "priority_xon_tx[3]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[3], "priority_xoff_tx[3]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[3],
+	    "priority_xon_2_xoff[3]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[4], "priority_xon_rx[4]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[4], "priority_xoff_rx[4]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[4], "priority_xon_tx[4]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[4], "priority_xoff_tx[4]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[4],
+	    "priority_xon_2_xoff[4]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[5], "priority_xon_rx[5]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[5], "priority_xoff_rx[5]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[5], "priority_xon_tx[5]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[5], "priority_xoff_tx[5]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[5],
+	    "priority_xon_2_xoff[5]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[6], "priority_xon_rx[6]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[6], "priority_xoff_rx[6]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[6], "priority_xon_tx[6]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[6], "priority_xoff_tx[6]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[6],
+	    "priority_xon_2_xoff[6]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_priority_xon_rx[7], "priority_xon_rx[7]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_rx[7], "priority_xoff_rx[7]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_tx[7], "priority_xon_tx[7]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xoff_tx[7], "priority_xoff_tx[7]",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_priority_xon_2_xoff[7],
+	    "priority_xon_2_xoff[7]",
+	    KSTAT_DATA_UINT64);
+
+	kstat_named_init(&ipk->ipk_crc_errors, "crc_errors",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_illegal_bytes, "illegal_bytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_mac_local_faults, "mac_local_faults",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_mac_remote_faults, "mac_remote_faults",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_length_errors, "rx_length_errors",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_undersize, "rx_undersize",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_fragments, "rx_fragments",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_oversize, "rx_oversize",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_jabber, "rx_jabber",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_discards, "rx_discards",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_vm_discards, "rx_vm_discards",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_short_discards, "rx_short_discards",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_tx_dropped_link_down, "tx_dropped_link_down",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_unknown_protocol, "rx_unknown_protocol",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_err1, "rx_err1",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ipk->ipk_rx_err2, "rx_err2",
+	    KSTAT_DATA_UINT64);
+
+
+	bzero(&i40e->i40e_pf_stat, sizeof (i40e_pf_stats_t));
+	i40e_stat_pf_update(i40e, B_TRUE);
+
+	kstat_install(i40e->i40e_pf_kstat);
+
+	return (B_TRUE);
+}
+
+void
+i40e_stats_fini(i40e_t *i40e)
+{
+	ASSERT(i40e->i40e_vsi_kstat == NULL);
+	if (i40e->i40e_pf_kstat != NULL) {
+		kstat_delete(i40e->i40e_pf_kstat);
+		i40e->i40e_pf_kstat = NULL;
+	}
+
+	mutex_destroy(&i40e->i40e_stat_lock);
+}
+
+boolean_t
+i40e_stats_init(i40e_t *i40e)
+{
+	mutex_init(&i40e->i40e_stat_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* On success the lock stays initialized for i40e_stats_fini(). */
+	if (i40e_stat_pf_init(i40e) != B_FALSE)
+		return (B_TRUE);
+	mutex_destroy(&i40e->i40e_stat_lock);
+	return (B_FALSE);
+}
+
+/*
+ * For Nemo/GLDv3: fetch the statistic 'stat' into *val. Returns 0, ECANCELED
+ * when suspended, ENOTSUP if unhandled, or EIO if the FM handle check fails.
+ */
+int
+i40e_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+	i40e_t *i40e = (i40e_t *)arg;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	int port = i40e->i40e_hw_space.port;
+	i40e_pf_stats_t *ips;
+	i40e_pf_kstats_t *ipk;
+
+	ASSERT(i40e->i40e_pf_kstat != NULL);
+	ips = &i40e->i40e_pf_stat;
+	ipk = i40e->i40e_pf_kstat->ks_data;
+
+	/*
+	 * We need both locks, as various stats are protected by different
+	 * things here.
+	 */
+	mutex_enter(&i40e->i40e_general_lock);
+
+	if (i40e->i40e_state & I40E_SUSPENDED) {
+		mutex_exit(&i40e->i40e_general_lock);
+		return (ECANCELED);
+	}
+
+	mutex_enter(&i40e->i40e_stat_lock);
+
+	/*
+	 * Unfortunately the GLDv3 conflates two rather different things here.
+	 * We're combining statistics about the physical port represented by
+	 * this instance with statistics that describe the properties of the
+	 * logical interface. As such, we're going to use the various aspects of
+	 * the port to describe these stats as they represent what the physical
+	 * instance is doing, even though that means some tools may be
+	 * confused and that to see the logical traffic on the interface itself
+	 * sans VNICs and the like will require more work.
+	 *
+	 * Stats which are not listed in this switch statement are unimplemented
+	 * at this time in hardware or don't currently apply to the device.
+	 */
+	switch (stat) {
+	/* MIB-II stats (RFC 1213 and RFC 1573) */
+	case MAC_STAT_IFSPEED:
+		*val = i40e->i40e_link_speed * 1000000ull;
+		break;
+	case MAC_STAT_MULTIRCV:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port),
+		    &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, B_FALSE);
+		*val = ipk->ipk_rx_multicast.value.ui64;
+		break;
+	case MAC_STAT_BRDCSTRCV:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port),
+		    &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, B_FALSE);
+		*val = ipk->ipk_rx_broadcast.value.ui64;
+		break;
+	case MAC_STAT_MULTIXMT:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_MPTCL(port),
+		    &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, B_FALSE);
+		*val = ipk->ipk_tx_multicast.value.ui64;
+		break;
+	case MAC_STAT_BRDCSTXMT:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port),
+		    &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, B_FALSE);
+		*val = ipk->ipk_tx_broadcast.value.ui64;
+		break;
+	case MAC_STAT_NORCVBUF:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RDPC(port),
+		    &ipk->ipk_rx_discards, &ips->ips_rx_discards, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_LDPC(port),
+		    &ipk->ipk_rx_vm_discards, &ips->ips_rx_vm_discards,
+		    B_FALSE);
+		*val = ipk->ipk_rx_discards.value.ui64 +
+		    ipk->ipk_rx_vm_discards.value.ui64;
+		break;
+	/*
+	 * Note, that some RXERR2 stats are also duplicated by the switch filter
+	 * stats; however, since we're not using those at this time, it seems
+	 * reasonable to include them.
+	 */
+	case MAC_STAT_IERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port),
+		    &ipk->ipk_crc_errors, &ips->ips_crc_errors, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port),
+		    &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port),
+		    &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors,
+		    B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GL_RXERR1_L(port),
+		    &ipk->ipk_rx_err1, &ips->ips_rx_err1, B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GL_RXERR2_L(port),
+		    &ipk->ipk_rx_err2, &ips->ips_rx_err2, B_FALSE);
+
+		*val = ipk->ipk_crc_errors.value.ui64 +
+		    ipk->ipk_illegal_bytes.value.ui64 +
+		    ipk->ipk_rx_length_errors.value.ui64 +
+		    ipk->ipk_rx_err1.value.ui64 +
+		    ipk->ipk_rx_err2.value.ui64;
+		break;
+	case MAC_STAT_UNKNOWNS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RUPP(port),
+		    &ipk->ipk_rx_unknown_protocol,
+		    &ips->ips_rx_unknown_protocol,
+		    B_FALSE);
+		*val = ipk->ipk_rx_unknown_protocol.value.ui64;
+		break;
+	case MAC_STAT_RBYTES:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_GORCL(port),
+		    &ipk->ipk_rx_bytes, &ips->ips_rx_bytes, B_FALSE);
+		*val = ipk->ipk_rx_bytes.value.ui64;
+		break;
+	case MAC_STAT_IPACKETS:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_UPRCL(port),
+		    &ipk->ipk_rx_unicast, &ips->ips_rx_unicast, B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port),
+		    &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port),
+		    &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, B_FALSE);
+		*val = ipk->ipk_rx_unicast.value.ui64 +
+		    ipk->ipk_rx_multicast.value.ui64 +
+		    ipk->ipk_rx_broadcast.value.ui64;
+		break;
+	case MAC_STAT_OBYTES:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_GOTCL(port),
+		    &ipk->ipk_tx_bytes, &ips->ips_tx_bytes, B_FALSE);
+		*val = ipk->ipk_tx_bytes.value.ui64;
+		break;
+	case MAC_STAT_OPACKETS:
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_UPTCL(port),
+		    &ipk->ipk_tx_unicast, &ips->ips_tx_unicast, B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_MPTCL(port),
+		    &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, B_FALSE);
+		i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port),
+		    &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, B_FALSE);
+		*val = ipk->ipk_tx_unicast.value.ui64 +
+		    ipk->ipk_tx_multicast.value.ui64 +
+		    ipk->ipk_tx_broadcast.value.ui64;
+		break;
+	case MAC_STAT_UNDERFLOWS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port),
+		    &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port),
+		    &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port),
+		    &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards,
+		    B_FALSE);
+		*val = ipk->ipk_rx_undersize.value.ui64 +
+		    ipk->ipk_rx_fragments.value.ui64 +
+		    ipk->ipk_rx_short_discards.value.ui64;
+		break;
+	case MAC_STAT_OVERFLOWS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port),
+		    &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port),
+		    &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, B_FALSE);
+		/* Sum the counters refreshed above: oversize and jabber. */
+		*val = ipk->ipk_rx_oversize.value.ui64 +
+		    ipk->ipk_rx_jabber.value.ui64;
+		break;
+
+	/* RFC 1643 stats */
+	case ETHER_STAT_FCS_ERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port),
+		    &ipk->ipk_crc_errors, &ips->ips_crc_errors, B_FALSE);
+		*val = ipk->ipk_crc_errors.value.ui64;
+		break;
+	case ETHER_STAT_TOOLONG_ERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port),
+		    &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, B_FALSE);
+		*val = ipk->ipk_rx_oversize.value.ui64;
+		break;
+	case ETHER_STAT_MACRCV_ERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port),
+		    &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port),
+		    &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors,
+		    B_FALSE);
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port),
+		    &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, B_FALSE);
+		*val = ipk->ipk_illegal_bytes.value.ui64 +
+		    ipk->ipk_rx_length_errors.value.ui64 +
+		    ipk->ipk_rx_fragments.value.ui64;
+		break;
+	/* MII/GMII stats */
+
+	/*
+	 * The transceiver address is apparently the same as the port number.
+	 */
+	case ETHER_STAT_XCVR_ADDR:
+		*val = i40e->i40e_hw_space.port;
+		break;
+	case ETHER_STAT_XCVR_ID:
+		switch (hw->phy.media_type) {
+		case I40E_MEDIA_TYPE_BASET:
+			/*
+			 * Transform the data into the ID, generally omitting
+			 * the revision. Cast first so bit 31 can't sign-extend.
+			 */
+			*val = (uint32_t)i40e->i40e_phy.phy_id[3] << 24 |
+			    i40e->i40e_phy.phy_id[2] << 16 |
+			    i40e->i40e_phy.phy_id[1] << 8;
+			break;
+		case I40E_MEDIA_TYPE_FIBER:
+		case I40E_MEDIA_TYPE_BACKPLANE:
+		case I40E_MEDIA_TYPE_CX4:
+		case I40E_MEDIA_TYPE_DA:
+		case I40E_MEDIA_TYPE_VIRTUAL:
+			*val = i40e->i40e_phy.phy_id[0] |
+			    i40e->i40e_phy.phy_id[1] << 8 |
+			    i40e->i40e_phy.phy_id[2] << 16;
+			break;
+		case I40E_MEDIA_TYPE_UNKNOWN:
+		default:
+			goto unimpl;
+		}
+		break;
+	case ETHER_STAT_XCVR_INUSE:
+		switch (hw->phy.link_info.phy_type) {
+		case I40E_PHY_TYPE_100BASE_TX:
+			*val = XCVR_100T2;
+			break;
+		case I40E_PHY_TYPE_1000BASE_T:
+			*val = XCVR_1000T;
+			break;
+		default:
+			*val = XCVR_UNDEFINED;
+			break;
+		}
+		break;
+
+	/*
+	 * This group answers the question of do we support a given speed in
+	 * theory.
+	 */
+	case ETHER_STAT_CAP_100FDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0;
+		break;
+	case ETHER_STAT_CAP_1000FDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0;
+		break;
+	case ETHER_STAT_CAP_10GFDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0;
+		break;
+	case ETHER_STAT_CAP_40GFDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0;
+		break;
+
+	/*
+	 * These ask are we currently advertising these speeds and abilities.
+	 * Until we support setting these because we're working with a copper
+	 * PHY, then the only things we advertise are based on the link PHY
+	 * speeds. In other words, we advertise everything we support.
+	 */
+	case ETHER_STAT_ADV_CAP_100FDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0;
+		break;
+	case ETHER_STAT_ADV_CAP_1000FDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0;
+		break;
+	case ETHER_STAT_ADV_CAP_10GFDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0;
+		break;
+	case ETHER_STAT_ADV_CAP_40GFDX:
+		*val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0;
+		break;
+
+	/*
+	 * These ask if the peer supports these speeds, e.g. what did they tell
+	 * us in auto-negotiation. Unfortunately, hardware doesn't appear to
+	 * give us a way to determine whether or not they actually support
+	 * something, only what they have enabled. This means that all we can
+	 * tell the user is the speed that we're currently at, unfortunately.
+	 */
+	case ETHER_STAT_LP_CAP_100FDX:
+		*val = i40e->i40e_link_speed == 100;
+		break;
+	case ETHER_STAT_LP_CAP_1000FDX:
+		*val = i40e->i40e_link_speed == 1000;
+		break;
+	case ETHER_STAT_LP_CAP_10GFDX:
+		*val = i40e->i40e_link_speed == 10000;
+		break;
+	case ETHER_STAT_LP_CAP_40GFDX:
+		*val = i40e->i40e_link_speed == 40000;
+		break;
+
+	/*
+	 * Statistics for unsupported speeds. Note that these often have the
+	 * same constraints as the other ones. For example, we can't answer the
+	 * question of the ETHER_STAT_LP_CAP family because hardware doesn't
+	 * give us any way of knowing whether or not it does.
+	 */
+	case ETHER_STAT_CAP_100HDX:
+	case ETHER_STAT_CAP_1000HDX:
+	case ETHER_STAT_CAP_10FDX:
+	case ETHER_STAT_CAP_10HDX:
+	case ETHER_STAT_CAP_100T4:
+	case ETHER_STAT_CAP_100GFDX:
+	case ETHER_STAT_CAP_2500FDX:
+	case ETHER_STAT_CAP_5000FDX:
+	case ETHER_STAT_ADV_CAP_1000HDX:
+	case ETHER_STAT_ADV_CAP_100HDX:
+	case ETHER_STAT_ADV_CAP_10FDX:
+	case ETHER_STAT_ADV_CAP_10HDX:
+	case ETHER_STAT_ADV_CAP_100T4:
+	case ETHER_STAT_ADV_CAP_100GFDX:
+	case ETHER_STAT_ADV_CAP_2500FDX:
+	case ETHER_STAT_ADV_CAP_5000FDX:
+	case ETHER_STAT_LP_CAP_1000HDX:
+	case ETHER_STAT_LP_CAP_100HDX:
+	case ETHER_STAT_LP_CAP_10FDX:
+	case ETHER_STAT_LP_CAP_10HDX:
+	case ETHER_STAT_LP_CAP_100T4:
+	case ETHER_STAT_LP_CAP_100GFDX:
+	case ETHER_STAT_LP_CAP_2500FDX:
+	case ETHER_STAT_LP_CAP_5000FDX:
+		*val = 0;
+		break;
+
+	case ETHER_STAT_LINK_DUPLEX:
+		*val = i40e->i40e_link_duplex;
+		break;
+	case ETHER_STAT_TOOSHORT_ERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port),
+		    &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, B_FALSE);
+
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port),
+		    &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards,
+		    B_FALSE);
+		*val = ipk->ipk_rx_undersize.value.ui64 +
+		    ipk->ipk_rx_short_discards.value.ui64;
+		break;
+	case ETHER_STAT_JABBER_ERRORS:
+		i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port),
+		    &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, B_FALSE);
+		*val = ipk->ipk_rx_jabber.value.ui64;
+		break;
+
+	/*
+	 * Non-Link speed related capabilities.
+	 */
+	case ETHER_STAT_CAP_AUTONEG:
+		*val = 1;
+		break;
+
+	case ETHER_STAT_ADV_CAP_AUTONEG:
+		*val = 1;
+		break;
+
+	case ETHER_STAT_LP_CAP_AUTONEG:
+		*val = (hw->phy.link_info.an_info & I40E_AQ_LP_AN_ABILITY) != 0;
+		break;
+
+	case ETHER_STAT_LINK_AUTONEG:
+		*val = 1;
+		break;
+
+	/*
+	 * Note that while the hardware does support the pause functionality, at
+	 * this time we do not use it at all and effectively disable it.
+	 */
+	case ETHER_STAT_CAP_ASMPAUSE:
+		*val = (i40e->i40e_phy.abilities &
+		    I40E_AQ_PHY_FLAG_PAUSE_RX) != 0;
+		break;
+	case ETHER_STAT_CAP_PAUSE:
+		*val = (i40e->i40e_phy.abilities &
+		    I40E_AQ_PHY_FLAG_PAUSE_TX) != 0;
+		break;
+
+	/*
+	 * Because we don't support these at this time, they are always
+	 * hard-coded to zero.
+	 */
+	case ETHER_STAT_ADV_CAP_ASMPAUSE:
+	case ETHER_STAT_ADV_CAP_PAUSE:
+		*val = 0;
+		break;
+
+	/*
+	 * Like the other LP fields, we can only answer the question have we
+	 * enabled it, not whether the other end actually supports it.
+	 */
+	case ETHER_STAT_LP_CAP_ASMPAUSE:
+	case ETHER_STAT_LINK_ASMPAUSE:
+		*val = (hw->phy.link_info.an_info & I40E_AQ_LINK_PAUSE_RX) != 0;
+		break;
+	case ETHER_STAT_LP_CAP_PAUSE:
+	case ETHER_STAT_LINK_PAUSE:
+		*val = (hw->phy.link_info.an_info & I40E_AQ_LINK_PAUSE_TX) != 0;
+		break;
+
+	default:
+	unimpl:
+		mutex_exit(&i40e->i40e_stat_lock);
+		mutex_exit(&i40e->i40e_general_lock);
+		return (ENOTSUP);
+	}
+
+	mutex_exit(&i40e->i40e_stat_lock);
+	mutex_exit(&i40e->i40e_general_lock);
+
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Per-ring rx statistics: copy the ring's byte or packet counter into *val.
+ * Fails with ECANCELED while suspended; any other stat zeroes *val (ENOTSUP).
+ */
+int
+i40e_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
+{
+	i40e_trqpair_t *ring = (i40e_trqpair_t *)rh;
+	i40e_rxq_stat_t *rxs = &ring->itrq_rxstat;
+
+	if ((ring->itrq_i40e->i40e_state & I40E_SUSPENDED) != 0)
+		return (ECANCELED);
+
+	if (stat == MAC_STAT_RBYTES) {
+		*val = rxs->irxs_bytes.value.ui64;
+	} else if (stat == MAC_STAT_IPACKETS) {
+		*val = rxs->irxs_packets.value.ui64;
+	} else {
+		*val = 0;
+		return (ENOTSUP);
+	}
+
+	return (0);
+}
+
+/*
+ * Per-ring tx statistics: copy the ring's byte or packet counter into *val.
+ * Fails with ECANCELED while suspended; any other stat zeroes *val (ENOTSUP).
+ */
+int
+i40e_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
+{
+	i40e_trqpair_t *ring = (i40e_trqpair_t *)rh;
+	i40e_txq_stat_t *txs = &ring->itrq_txstat;
+
+	if ((ring->itrq_i40e->i40e_state & I40E_SUSPENDED) != 0)
+		return (ECANCELED);
+
+	if (stat == MAC_STAT_OBYTES) {
+		*val = txs->itxs_bytes.value.ui64;
+	} else if (stat == MAC_STAT_OPACKETS) {
+		*val = txs->itxs_packets.value.ui64;
+	} else {
+		*val = 0;
+		return (ENOTSUP);
+	}
+
+	return (0);
+}
+
+/*
+ * When we end up refactoring all of the queue assignments and have non-static
+ * queue to VSI mappings, then we may need to revisit the general locking
+ * strategy that we employ and have the kstat creation / deletion be part of the
+ * ring start and stop routines.
+ */
+void
+i40e_stats_trqpair_fini(i40e_trqpair_t *itrq)
+{
+	/* Drop the tx kstat if it exists; the pointer ends up NULL. */
+	if (itrq->itrq_txkstat != NULL)
+		kstat_delete(itrq->itrq_txkstat);
+	itrq->itrq_txkstat = NULL;
+
+	/* Likewise for the rx kstat. */
+	if (itrq->itrq_rxkstat != NULL)
+		kstat_delete(itrq->itrq_rxkstat);
+	itrq->itrq_rxkstat = NULL;
+}
+
+/*
+ * Create and install the virtual "trqpair_tx_<n>" and "trqpair_rx_<n>" named
+ * kstats for a ring pair, backed by its itrq_txstat and itrq_rxstat, with all
+ * counters zeroed. Returns B_FALSE, leaving no kstat behind, on failure.
+ */
+boolean_t
+i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
+{
+	char buf[128];
+	int inst = ddi_get_instance(itrq->itrq_i40e->i40e_dip);
+	i40e_txq_stat_t *tx = &itrq->itrq_txstat;
+	i40e_rxq_stat_t *rx = &itrq->itrq_rxstat;
+	kstat_named_t *knp;
+
+	(void) snprintf(buf, sizeof (buf), "trqpair_tx_%d", itrq->itrq_index);
+	itrq->itrq_txkstat = kstat_create(I40E_MODULE_NAME, inst, buf, "net",
+	    KSTAT_TYPE_NAMED, sizeof (*tx) / sizeof (*knp), KSTAT_FLAG_VIRTUAL);
+	if (itrq->itrq_txkstat == NULL)
+		return (B_FALSE);
+
+	(void) snprintf(buf, sizeof (buf), "trqpair_rx_%d", itrq->itrq_index);
+	itrq->itrq_rxkstat = kstat_create(I40E_MODULE_NAME, inst, buf, "net",
+	    KSTAT_TYPE_NAMED, sizeof (*rx) / sizeof (*knp), KSTAT_FLAG_VIRTUAL);
+	if (itrq->itrq_rxkstat == NULL) {
+		kstat_delete(itrq->itrq_txkstat);
+		itrq->itrq_txkstat = NULL;
+		return (B_FALSE);
+	}
+
+	itrq->itrq_txkstat->ks_data = tx;
+	itrq->itrq_rxkstat->ks_data = rx;
+
+	/* Transmit counters: name each entry, then start it at zero. */
+	knp = &tx->itxs_bytes;
+	kstat_named_init(knp, "tx_bytes", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_packets;
+	kstat_named_init(knp, "tx_packets", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_descriptors;
+	kstat_named_init(knp, "tx_descriptors", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_recycled;
+	kstat_named_init(knp, "tx_recycled", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_meoifail;
+	kstat_named_init(knp, "tx_hck_meoifail", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_nol2info;
+	kstat_named_init(knp, "tx_hck_nol2info", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_nol3info;
+	kstat_named_init(knp, "tx_hck_nol3info", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_nol4info;
+	kstat_named_init(knp, "tx_hck_nol4info", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_badl3;
+	kstat_named_init(knp, "tx_hck_badl3", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_hck_badl4;
+	kstat_named_init(knp, "tx_hck_badl4", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_err_notcb;
+	kstat_named_init(knp, "tx_err_notcb", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_err_nodescs;
+	kstat_named_init(knp, "tx_err_nodescs", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_err_context;
+	kstat_named_init(knp, "tx_err_context", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &tx->itxs_num_unblocked;
+	kstat_named_init(knp, "tx_num_unblocked", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+
+	/* Receive counters, handled the same way. */
+	knp = &rx->irxs_bytes;
+	kstat_named_init(knp, "rx_bytes", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_packets;
+	kstat_named_init(knp, "rx_packets", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_rx_desc_error;
+	kstat_named_init(knp, "rx_desc_error", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_rx_intr_limit;
+	kstat_named_init(knp, "rx_intr_limit", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_rx_bind_norcb;
+	kstat_named_init(knp, "rx_bind_norcb", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_rx_bind_nomp;
+	kstat_named_init(knp, "rx_bind_nomp", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_rx_copy_nomem;
+	kstat_named_init(knp, "rx_copy_nomem", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_v4hdrok;
+	kstat_named_init(knp, "rx_hck_v4hdrok", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_l4hdrok;
+	kstat_named_init(knp, "rx_hck_l4hdrok", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_unknown;
+	kstat_named_init(knp, "rx_hck_unknown", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_nol3l4p;
+	kstat_named_init(knp, "rx_hck_nol3l4p", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_iperr;
+	kstat_named_init(knp, "rx_hck_iperr", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_eiperr;
+	kstat_named_init(knp, "rx_hck_eiperr", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_l4err;
+	kstat_named_init(knp, "rx_hck_l4err", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_v6skip;
+	kstat_named_init(knp, "rx_hck_v6skip", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_set;
+	kstat_named_init(knp, "rx_hck_set", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+	knp = &rx->irxs_hck_miss;
+	kstat_named_init(knp, "rx_hck_miss", KSTAT_DATA_UINT64);
+	knp->value.ui64 = 0;
+
+	kstat_install(itrq->itrq_txkstat);
+	kstat_install(itrq->itrq_rxkstat);
+
+	return (B_TRUE);
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h
new file mode 100644
index 0000000000..04959b1590
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_sw.h
@@ -0,0 +1,974 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Please see i40e_main.c for an introduction to the device driver, its layout,
+ * and more.
+ */
+
+#ifndef	_I40E_SW_H
+#define	_I40E_SW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strlog.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/kstat.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/dlpi.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_ether.h>
+#include <sys/vlan.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+#include <sys/pcie.h>
+#include <sys/sdt.h>
+#include <sys/ethernet.h>
+#include <sys/pattr.h>
+#include <sys/strsubr.h>
+#include <sys/netlb.h>
+#include <sys/random.h>
+#include <inet/common.h>
+#include <inet/tcp.h>
+#include <inet/ip.h>
+#include <inet/mi.h>
+#include <inet/nd.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+#include <sys/bitmap.h>
+#include <sys/cpuvar.h>
+#include <sys/ddifm.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/disp.h>
+#include <sys/fm/io/ddi.h>
+#include <sys/list.h>
+#include <sys/debug.h>
+#include <sys/sdt.h>
+#include "i40e_type.h"
+#include "i40e_osdep.h"
+#include "i40e_prototype.h"
+#include "i40e_xregs.h"
+
+#define	I40E_MODULE_NAME "i40e"
+
+#define	I40E_ADAPTER_REGSET	1
+
+/*
+ * Configuration constants. Note that the hardware defines a minimum bound of 32
+ * descriptors and requires that the programming of the descriptor lengths be
+ * aligned in units of 32 descriptors.
+ */
+#define	I40E_MIN_TX_RING_SIZE	64
+#define	I40E_MAX_TX_RING_SIZE	4096
+#define	I40E_DEF_TX_RING_SIZE	1024
+
+#define	I40E_MIN_RX_RING_SIZE	64
+#define	I40E_MAX_RX_RING_SIZE	4096
+#define	I40E_DEF_RX_RING_SIZE	1024
+
+#define	I40E_DESC_ALIGN		32
+
+/*
+ * Sizes used for asynchronous processing of the adminq. We allocate a fixed
+ * size buffer for each instance of the device during attach time, rather than
+ * allocating and freeing one during interrupt processing.
+ *
+ * We also define the descriptor size of the admin queue here.
+ */
+#define	I40E_ADMINQ_BUFSZ	4096
+#define	I40E_MAX_ADMINQ_SIZE	1024
+#define	I40E_DEF_ADMINQ_SIZE	256
+
+/*
+ * Note, while the min and maximum values are based upon the sizing of the ring
+ * itself, the default is taken from ixgbe without much thought. It's basically
+ * been cargo culted. See i40e_transceiver.c for a bit more information.
+ */
+#define	I40E_MIN_RX_LIMIT_PER_INTR	16
+#define	I40E_MAX_RX_LIMIT_PER_INTR	4096
+#define	I40E_DEF_RX_LIMIT_PER_INTR	256
+
+/*
+ * Valid MTU ranges. Note that the XL710's maximum payload is actually 9728.
+ * However, we need to adjust for the ETHERFCSL (4 bytes) and the Ethernet VLAN
+ * header size (18 bytes) to get the actual maximum frame we can use. If
+ * different adapters end up with different sizes, we should make this value a
+ * bit more dynamic.
+ */
+#define	I40E_MAX_MTU	9706
+#define	I40E_MIN_MTU	ETHERMIN
+#define	I40E_DEF_MTU	ETHERMTU
+
+/*
+ * Interrupt throttling related values. Interrupt throttling values are defined
+ * in two microsecond increments. Note that a value of zero basically says do no
+ * ITR activity. A helpful way to think about these is that setting the ITR to a
+ * value will allow a certain number of interrupts per second.
+ *
+ * Our default values for RX allow 20k interrupts per second while our default
+ * values for TX allow for 5k interrupts per second. For other class interrupts,
+ * we limit ourselves to a rate of 2k/s.
+ */
+#define	I40E_MIN_ITR		0x0000
+#define	I40E_MAX_ITR		0x0FF0
+#define	I40E_DEF_RX_ITR		0x0019
+#define	I40E_DEF_TX_ITR		0x0064
+#define	I40E_DEF_OTHER_ITR	0x00FA
+
+/*
+ * Indexes into the three ITR registers that we have.
+ */
+typedef enum i40e_itr_index {
+	I40E_ITR_INDEX_RX	= 0x0,
+	I40E_ITR_INDEX_TX	= 0x1,
+	I40E_ITR_INDEX_OTHER	= 0x2,
+	I40E_ITR_INDEX_NONE 	= 0x3
+} i40e_itr_index_t;
+
+
+/*
+ * Table 1-5 of the PRM notes that LSO supports up to 256 KB.
+ */
+#define	I40E_LSO_MAXLEN	(256 * 1024)
+
+#define	I40E_CYCLIC_PERIOD NANOSEC	/* 1 second */
+#define	I40E_DRAIN_RX_WAIT	(500 * MILLISEC)	/* In us */
+
+/*
+ * All the other queue types are defined by the common code. However, this
+ * is the constant to indicate that it's terminated.
+ */
+#define	I40E_QUEUE_TYPE_EOL	0x7FF
+
+/*
+ * See the comments in i40e_buf.c as to the purpose of this value and how it's
+ * used to ensure that the IP header is eventually aligned when it's received by
+ * the OS.
+ */
+#define	I40E_BUF_IPHDR_ALIGNMENT	2
+
+/*
+ * The XL710 controller has a limit of eight buffers being allowed to be used
+ * for the transmission of a single frame. This is defined in 8.4.1 - Transmit
+ * Packet in System Memory.
+ */
+#define	I40E_TX_MAX_COOKIE	8
+
+/*
+ * Sizing to determine the amount of available descriptors at which we'll
+ * consider ourselves blocked. Also, when we have these available, we'll then
+ * consider ourselves available to transmit to MAC again. Strictly speaking, the
+ * MAX is based on the ring size. The default sizing is based on ixgbe.
+ */
+#define	I40E_MIN_TX_BLOCK_THRESH	I40E_TX_MAX_COOKIE
+#define	I40E_DEF_TX_BLOCK_THRESH	I40E_MIN_TX_BLOCK_THRESH
+
+/*
+ * Sizing for DMA thresholds. These are used to indicate whether or not we
+ * should perform a bcopy or a DMA binding of a given message block. The range
+ * allows for setting things such that we'll always do a bcopy (a high value) or
+ * always perform a DMA binding (a low value).
+ */
+#define	I40E_MIN_RX_DMA_THRESH		0
+#define	I40E_DEF_RX_DMA_THRESH		256
+#define	I40E_MAX_RX_DMA_THRESH		INT32_MAX
+
+#define	I40E_MIN_TX_DMA_THRESH		0
+#define	I40E_DEF_TX_DMA_THRESH		256
+#define	I40E_MAX_TX_DMA_THRESH		INT32_MAX
+
+/*
+ * Resource sizing counts. There are various aspects of hardware where we may
+ * have some variable number of elements that we need to handle. Such as the
+ * hardware capabilities and switch capacities. We cannot know a priori how many
+ * elements to do, so instead we take a starting guess and then will grow it up
+ * to an upper bound on a number of elements, to limit memory consumption in
+ * case of a hardware bug.
+ */
+#define	I40E_HW_CAP_DEFAULT	40
+#define	I40E_SWITCH_CAP_DEFAULT	25
+
+/*
+ * Host Memory Context related constants.
+ */
+#define	I40E_HMC_RX_CTX_UNIT		128
+#define	I40E_HMC_RX_DBUFF_MIN		1024
+#define	I40E_HMC_RX_DBUFF_MAX		(16 * 1024 - 128)
+#define	I40E_HMC_RX_DTYPE_NOSPLIT	0
+#define	I40E_HMC_RX_DSIZE_32BYTE	1
+#define	I40E_HMC_RX_CRCSTRIP_ENABLE	1
+#define	I40E_HMC_RX_FC_DISABLE		0
+#define	I40E_HMC_RX_L2TAGORDER		1
+#define	I40E_HMC_RX_HDRSPLIT_DISABLE	0
+#define	I40E_HMC_RX_INVLAN_DONTSTRIP	0
+#define	I40E_HMC_RX_TPH_DISABLE		0
+#define	I40E_HMC_RX_LOWRXQ_NOINTR	0
+#define	I40E_HMC_RX_PREFENA		1
+
+#define	I40E_HMC_TX_CTX_UNIT		128
+#define	I40E_HMC_TX_NEW_CONTEXT		1
+#define	I40E_HMC_TX_FC_DISABLE		0
+#define	I40E_HMC_TX_TS_DISABLE		0
+#define	I40E_HMC_TX_FD_DISABLE		0
+#define	I40E_HMC_TX_ALT_VLAN_DISABLE	0
+#define	I40E_HMC_TX_WB_ENABLE		1
+#define	I40E_HMC_TX_TPH_DISABLE		0
+
+/*
+ * Whenever we establish and create a VSI, we need to assign some number of
+ * queues that it's allowed to access from the PF. Because we only have a single
+ * VSI per PF at this time, we assign it all the queues.
+ *
+ * Many of the devices support what's called Data-center Bridging. Which is a
+ * feature that we don't have much use of at this time. However, we still need
+ * to fill in this information. We follow the guidance of the note in Table 7-80
+ * which talks about bytes 62-77. It says that if we don't want to assign
+ * anything to traffic classes, we should set the field to zero. Effectively
+ * this means that everything in the system is assigned to traffic class zero.
+ */
+#define	I40E_ASSIGN_ALL_QUEUES		0
+#define	I40E_TRAFFIC_CLASS_NO_QUEUES	0
+
+/*
+ * This defines the error mask that we care about from rx descriptors. Currently
+ * we're only concerned with the general errors and oversize errors.
+ */
+#define	I40E_RX_ERR_BITS	((1 << I40E_RX_DESC_ERROR_RXE_SHIFT) | \
+	(1 << I40E_RX_DESC_ERROR_OVERSIZE_SHIFT))
+
+/*
+ * Property sizing macros for firmware versions, etc. They need to be large
+ * enough to hold 32-bit quantities transformed to strings as %d.%d or %x.
+ */
+#define	I40E_DDI_PROP_LEN	64
+
+/*
+ * We currently consolidate some overrides that we use in the code here. These
+ * will be gone in the fullness of time, but as we're bringing up the device,
+ * this is what we use.
+ */
+#define	I40E_GROUP_MAX		1
+#define	I40E_TRQPAIR_MAX	1
+
+#define	I40E_GROUP_NOMSIX	1
+#define	I40E_TRQPAIR_NOMSIX	1
+
+/*
+ * It seems reasonable to cast this to void because the only reason that we
+ * should be getting a DDI_FAILURE is due to the fact that we specify addresses
+ * out of range. Because we specify no offset or address, it shouldn't happen.
+ */
+#ifdef	DEBUG
+#define	I40E_DMA_SYNC(handle, flag)	ASSERT0(ddi_dma_sync( \
+					    (handle)->dmab_dma_handle, 0, 0, \
+					    (flag)))
+#else	/* !DEBUG */
+#define	I40E_DMA_SYNC(handle, flag)	((void) ddi_dma_sync( \
+					    (handle)->dmab_dma_handle, 0, 0, \
+					    (flag)))
+#endif	/* DEBUG */
+
+/*
+ * Constants related to ring startup and teardown. These refer to the amount of
+ * time that we're willing to wait for a ring to spin up and spin down.
+ */
+#define	I40E_RING_WAIT_NTRIES	10
+#define	I40E_RING_WAIT_PAUSE	10	/* ms */
+
+/*
+ * Bit flags for attach_progress
+ */
+typedef enum i40e_attach_state {
+	I40E_ATTACH_PCI_CONFIG	= 0x0001,	/* PCI config setup */
+	I40E_ATTACH_REGS_MAP	= 0x0002,	/* Registers mapped */
+	I40E_ATTACH_PROPS	= 0x0004,	/* Properties initialized */
+	I40E_ATTACH_ALLOC_INTR	= 0x0008,	/* Interrupts allocated */
+	I40E_ATTACH_ALLOC_RINGSLOCKS	= 0x0010, /* Rings & locks allocated */
+	I40E_ATTACH_ADD_INTR	= 0x0020,	/* Intr handlers added */
+	I40E_ATTACH_COMMON_CODE	= 0x0040, /* Intel code initialized */
+	I40E_ATTACH_INIT	= 0x0080,	/* Device initialized */
+	I40E_ATTACH_STATS	= 0x0200,	/* Kstats created */
+	I40E_ATTACH_MAC		= 0x0800,	/* MAC registered */
+	I40E_ATTACH_ENABLE_INTR	= 0x1000,	/* DDI interrupts enabled */
+	I40E_ATTACH_FM_INIT	= 0x2000,	/* FMA initialized */
+	I40E_ATTACH_LINK_TIMER	= 0x4000,	/* link check timer */
+} i40e_attach_state_t;
+
+
+/*
+ * State flags describing what's going on in the device. Some of these flags
+ * indicate some aspirational work that needs to happen in the driver.
+ *
+ * I40E_UNKNOWN:	The device has yet to be started.
+ * I40E_INITIALIZED:	The device has been fully attached.
+ * I40E_STARTED:	The device has come out of the GLDV3 start routine.
+ * I40E_SUSPENDED:	The device is suspended and I/O among other things
+ * 			should not occur. This happens because of an actual
+ * 			DDI_SUSPEND or interrupt adjustments.
+ * I40E_STALL:		The tx stall detection logic has found a stall.
+ * I40E_OVERTEMP:	The device has encountered a temperature alarm.
+ * I40E_INTR_ADJUST:	Our interrupts are being manipulated and therefore we
+ * 			shouldn't be manipulating their state.
+ * I40E_ERROR:		We've detected an FM error and degraded the device.
+ */
+typedef enum i40e_state {
+	I40E_UNKNOWN		= 0x00,
+	I40E_INITIALIZED	= 0x01,
+	I40E_STARTED		= 0x02,
+	I40E_SUSPENDED		= 0x04,
+	I40E_STALL		= 0x08,
+	I40E_OVERTEMP		= 0x20,
+	I40E_INTR_ADJUST	= 0x40,
+	I40E_ERROR		= 0x80
+} i40e_state_t;
+
+
+/*
+ * Slightly more usable typedef names for the Intel common code structures that
+ * we use.
+ */
+typedef struct i40e_hw i40e_hw_t;
+typedef struct i40e_aqc_switch_resource_alloc_element_resp i40e_switch_rsrc_t;
+
+/*
+ * Handles and addresses of DMA buffers.
+ */
+typedef struct i40e_dma_buffer {
+	caddr_t		dmab_address;		/* Virtual address */
+	uint64_t	dmab_dma_address;	/* DMA (Hardware) address */
+	ddi_acc_handle_t dmab_acc_handle;	/* Data access handle */
+	ddi_dma_handle_t dmab_dma_handle;	/* DMA handle */
+	size_t		dmab_size;		/* Buffer size */
+	size_t		dmab_len;		/* Data length in the buffer */
+} i40e_dma_buffer_t;
+
+/*
+ * RX Control Block (rcb)
+ */
+typedef struct i40e_rx_control_block {
+	mblk_t			*rcb_mp;	/* Associated mblk_t */
+	uint32_t		rcb_ref;	/* Refcount, presumably */
+	i40e_dma_buffer_t	rcb_dma;	/* DMA buffer */
+	frtn_t			rcb_free_rtn;	/* STREAMS free routine */
+	struct i40e_rx_data	*rcb_rxd;	/* Associated rx data */
+} i40e_rx_control_block_t;
+
+typedef enum {
+	I40E_TX_NONE,
+	I40E_TX_COPY,
+	I40E_TX_DMA
+} i40e_tx_type_t;
+
+typedef struct i40e_tx_desc i40e_tx_desc_t;
+typedef union i40e_32byte_rx_desc i40e_rx_desc_t;
+
+typedef struct i40e_tx_control_block {
+	struct i40e_tx_control_block	*tcb_next;	/* Next tcb in list */
+	mblk_t				*tcb_mp;	/* Associated mblk_t */
+	i40e_tx_type_t			tcb_type;	/* An I40E_TX_* value */
+	ddi_dma_handle_t		tcb_dma_handle;	/* DMA handle */
+	i40e_dma_buffer_t		tcb_dma;	/* DMA buffer */
+} i40e_tx_control_block_t;
+
+/*
+ * Receive ring data (used below).
+ */
+typedef struct i40e_rx_data {
+	struct i40e	*rxd_i40e;
+
+	/*
+	 * RX descriptor ring definitions
+	 */
+	i40e_dma_buffer_t rxd_desc_area;	/* DMA buffer of rx desc ring */
+	i40e_rx_desc_t *rxd_desc_ring;		/* Rx desc ring */
+	uint32_t rxd_desc_next;			/* Index of next rx desc */
+
+	/*
+	 * RX control block list definitions
+	 */
+	kmutex_t		rxd_free_lock;	/* Lock to protect free data */
+	i40e_rx_control_block_t	*rxd_rcb_area;	/* Array of control blocks */
+	i40e_rx_control_block_t	**rxd_work_list; /* Work list of rcbs */
+	i40e_rx_control_block_t	**rxd_free_list; /* Free list of rcbs */
+	uint32_t		rxd_rcb_free;	/* Number of free rcbs */
+
+	/*
+	 * RX software ring settings
+	 */
+	uint32_t	rxd_ring_size;	/* Rx descriptor ring size */
+	uint32_t	rxd_free_list_size;	/* Rx free list size */
+
+	/*
+	 * RX outstanding data. This is used to keep track of outstanding loaned
+	 * descriptors after we've shut down receiving information. Note these
+	 * are protected by the i40e_t`i40e_rx_pending_lock.
+	 */
+	uint32_t	rxd_rcb_pending;
+	boolean_t	rxd_shutdown;
+} i40e_rx_data_t;
+
+/*
+ * Structures for unicast and multicast addresses. Note that we keep the VSI id
+ * around for unicast addresses, since they may belong to different VSIs.
+ * However, since all multicast addresses belong to the default VSI, we don't
+ * duplicate that information.
+ */
+typedef struct i40e_uaddr {
+	uint8_t iua_mac[ETHERADDRL];	/* Unicast MAC address */
+	int	iua_vsi;		/* VSI id for this address */
+} i40e_uaddr_t;
+
+typedef struct i40e_maddr {
+	uint8_t ima_mac[ETHERADDRL];	/* Multicast MAC address */
+} i40e_maddr_t;
+
+/*
+ * Collection of RX statistics on a given queue.
+ */
+typedef struct i40e_rxq_stat {
+	/*
+	 * The i40e hardware does not maintain statistics on a per-ring basis,
+	 * only on a per-PF and per-VSI level. As such, to satisfy the GLDv3, we
+	 * need to maintain our own stats for packets and bytes.
+	 */
+	kstat_named_t	irxs_bytes;	/* Bytes in on queue */
+	kstat_named_t	irxs_packets;	/* Packets in on queue */
+
+	/*
+	 * The following set of stats cover non-checksum data path issues.
+	 */
+	kstat_named_t	irxs_rx_desc_error;	/* Error bit set on desc */
+	kstat_named_t	irxs_rx_copy_nomem;	/* allocb failure for copy */
+	kstat_named_t	irxs_rx_intr_limit;	/* Hit i40e_rx_limit_per_intr */
+	kstat_named_t	irxs_rx_bind_norcb;	/* No replacement rcb free */
+	kstat_named_t	irxs_rx_bind_nomp;	/* No mblk_t in bind rcb */
+
+	/*
+	 * The following set of statistics covers rx checksum related activity.
+	 * These are all primarily set in i40e_rx_hcksum. If rx checksum
+	 * activity is disabled, then these should all be zero.
+	 */
+	kstat_named_t	irxs_hck_v4hdrok;	/* Valid IPv4 Header */
+	kstat_named_t	irxs_hck_l4hdrok;	/* Valid L4 Header */
+	kstat_named_t	irxs_hck_unknown;	/* !pinfo.known */
+	kstat_named_t	irxs_hck_nol3l4p;	/* Missing L3L4P bit in desc */
+	kstat_named_t	irxs_hck_iperr;		/* IPE error bit set */
+	kstat_named_t	irxs_hck_eiperr;	/* EIPE error bit set */
+	kstat_named_t	irxs_hck_l4err;		/* L4E error bit set */
+	kstat_named_t	irxs_hck_v6skip;	/* IPv6 case hw fails on */
+	kstat_named_t	irxs_hck_set;		/* Total times we set cksum */
+	kstat_named_t	irxs_hck_miss;		/* Times with zero cksum bits */
+} i40e_rxq_stat_t;
+
+/*
+ * Collection of TX Statistics on a given queue
+ */
+typedef struct i40e_txq_stat {
+	kstat_named_t	itxs_bytes;		/* Bytes out on queue */
+	kstat_named_t	itxs_packets;		/* Packets out on queue */
+	kstat_named_t	itxs_descriptors;	/* Descriptors issued */
+	kstat_named_t	itxs_recycled;		/* Descriptors reclaimed */
+	/*
+	 * Various failure conditions.
+	 */
+	kstat_named_t	itxs_hck_meoifail;	/* ether offload failures */
+	kstat_named_t	itxs_hck_nol2info;	/* Missing l2 info */
+	kstat_named_t	itxs_hck_nol3info;	/* Missing l3 info */
+	kstat_named_t	itxs_hck_nol4info;	/* Missing l4 info */
+	kstat_named_t	itxs_hck_badl3;		/* Not IPv4/IPv6 */
+	kstat_named_t	itxs_hck_badl4;		/* Bad L4 Payload */
+
+	kstat_named_t	itxs_err_notcb;		/* No tcb's available */
+	kstat_named_t	itxs_err_nodescs;	/* No descriptors available */
+	kstat_named_t	itxs_err_context;	/* Total context failures */
+
+	kstat_named_t	itxs_num_unblocked;	/* Number of MAC unblocks */
+} i40e_txq_stat_t;
+
+/*
+ * An instance of an XL710 transmit/receive queue pair. This currently
+ * represents a combination of both a transmit and receive ring, though they
+ * should really be split apart into separate logical structures. Unfortunately,
+ * during initial work we mistakenly joined them together.
+ */
+typedef struct i40e_trqpair {
+	struct i40e *itrq_i40e;		/* Associated i40e_t. */
+
+	/* Receive-side structures. */
+	kmutex_t itrq_rx_lock;
+	mac_ring_handle_t itrq_macrxring; /* Receive ring handle. */
+	i40e_rx_data_t *itrq_rxdata;	/* Receive ring rx data. */
+	uint64_t itrq_rxgen;		/* Generation number for mac/GLDv3. */
+	uint32_t itrq_index;		/* Queue index in the PF */
+	uint32_t itrq_rx_intrvec;	/* Receive interrupt vector. */
+
+	/* Receive-side stats. */
+	i40e_rxq_stat_t	itrq_rxstat;
+	kstat_t	*itrq_rxkstat;
+
+	/* Transmit-side structures. */
+	kmutex_t itrq_tx_lock;
+	mac_ring_handle_t itrq_mactxring; /* Transmit ring handle. */
+	uint32_t itrq_tx_intrvec;	/* Transmit interrupt vector. */
+	boolean_t itrq_tx_blocked;	/* Does MAC think we're blocked? */
+
+	/*
+	 * TX data sizing
+	 */
+	uint32_t		itrq_tx_ring_size;
+	uint32_t		itrq_tx_free_list_size;
+
+	/*
+	 * TX descriptor ring data
+	 */
+	i40e_dma_buffer_t	itrq_desc_area;	/* DMA buffer of tx desc ring */
+	i40e_tx_desc_t		*itrq_desc_ring; /* TX Desc ring */
+	volatile uint32_t	*itrq_desc_wbhead; /* TX write-back index */
+	uint32_t		itrq_desc_head;	/* Last index hw freed */
+	uint32_t		itrq_desc_tail;	/* Index of next free desc */
+	uint32_t		itrq_desc_free;	/* Number of free descriptors */
+
+	/*
+	 * TX control block (tcb) data
+	 */
+	kmutex_t		itrq_tcb_lock;
+	i40e_tx_control_block_t	*itrq_tcb_area;	/* Array of control blocks */
+	i40e_tx_control_block_t	**itrq_tcb_work_list;	/* In use tcb */
+	i40e_tx_control_block_t	**itrq_tcb_free_list;	/* Available tcb */
+	uint32_t		itrq_tcb_free;	/* Count of free tcb */
+
+	/* Transmit-side stats. */
+	i40e_txq_stat_t		itrq_txstat;
+	kstat_t			*itrq_txkstat;
+
+} i40e_trqpair_t;
+
+/*
+ * VSI statistics.
+ *
+ * This mirrors the i40e_eth_stats structure but transforms it into a kstat.
+ * Note that the stock statistic structure also includes entries for tx
+ * discards. However, this is not actually implemented for the VSI (see Table
+ * 7-221), so we don't include the member, which would always have a value
+ * of zero. This choice was made to minimize confusion to someone looking at
+ * these, as a value of zero does not necessarily equate to the fact that it's
+ * not implemented.
+ */
+typedef struct i40e_vsi_stats {
+	uint64_t ivs_rx_bytes;			/* gorc */
+	uint64_t ivs_rx_unicast;		/* uprc */
+	uint64_t ivs_rx_multicast;		/* mprc */
+	uint64_t ivs_rx_broadcast;		/* bprc */
+	uint64_t ivs_rx_discards;		/* rdpc */
+	uint64_t ivs_rx_unknown_protocol;	/* rupp */
+	uint64_t ivs_tx_bytes;			/* gotc */
+	uint64_t ivs_tx_unicast;		/* uptc */
+	uint64_t ivs_tx_multicast;		/* mptc */
+	uint64_t ivs_tx_broadcast;		/* bptc */
+	uint64_t ivs_tx_errors;			/* tepc */
+} i40e_vsi_stats_t;
+
+typedef struct i40e_vsi_kstats {
+	kstat_named_t	ivk_rx_bytes;			/* gorc */
+	kstat_named_t	ivk_rx_unicast;			/* uprc */
+	kstat_named_t	ivk_rx_multicast;		/* mprc */
+	kstat_named_t	ivk_rx_broadcast;		/* bprc */
+	kstat_named_t	ivk_rx_discards;		/* rdpc */
+	kstat_named_t	ivk_rx_unknown_protocol;	/* rupp */
+	kstat_named_t	ivk_tx_bytes;			/* gotc */
+	kstat_named_t	ivk_tx_unicast;			/* uptc */
+	kstat_named_t	ivk_tx_multicast;		/* mptc */
+	kstat_named_t	ivk_tx_broadcast;		/* bptc */
+	kstat_named_t	ivk_tx_errors;			/* tepc */
+} i40e_vsi_kstats_t;
+
+/*
+ * For pf statistics, we opt not to use the standard statistics as defined by
+ * the Intel common code. This also currently combines statistics that are
+ * global across the entire device.
+ */
+typedef struct i40e_pf_stats {
+	uint64_t ips_rx_bytes;			/* gorc */
+	uint64_t ips_rx_unicast;		/* uprc */
+	uint64_t ips_rx_multicast;		/* mprc */
+	uint64_t ips_rx_broadcast;		/* bprc */
+	uint64_t ips_tx_bytes;			/* gotc */
+	uint64_t ips_tx_unicast;		/* uptc */
+	uint64_t ips_tx_multicast;		/* mptc */
+	uint64_t ips_tx_broadcast;		/* bptc */
+
+	uint64_t ips_rx_size_64;		/* prc64 */
+	uint64_t ips_rx_size_127;		/* prc127 */
+	uint64_t ips_rx_size_255;		/* prc255 */
+	uint64_t ips_rx_size_511;		/* prc511 */
+	uint64_t ips_rx_size_1023;		/* prc1023 */
+	uint64_t ips_rx_size_1522;		/* prc1522 */
+	uint64_t ips_rx_size_9522;		/* prc9522 */
+
+	uint64_t ips_tx_size_64;		/* ptc64 */
+	uint64_t ips_tx_size_127;		/* ptc127 */
+	uint64_t ips_tx_size_255;		/* ptc255 */
+	uint64_t ips_tx_size_511;		/* ptc511 */
+	uint64_t ips_tx_size_1023;		/* ptc1023 */
+	uint64_t ips_tx_size_1522;		/* ptc1522 */
+	uint64_t ips_tx_size_9522;		/* ptc9522 */
+
+	uint64_t ips_link_xon_rx;		/* lxonrxc */
+	uint64_t ips_link_xoff_rx;		/* lxoffrxc */
+	uint64_t ips_link_xon_tx;		/* lxontxc */
+	uint64_t ips_link_xoff_tx;		/* lxofftxc */
+	uint64_t ips_priority_xon_rx[8];	/* pxonrxc[8] */
+	uint64_t ips_priority_xoff_rx[8];	/* pxoffrxc[8] */
+	uint64_t ips_priority_xon_tx[8];	/* pxontxc[8] */
+	uint64_t ips_priority_xoff_tx[8];	/* pxofftxc[8] */
+	uint64_t ips_priority_xon_2_xoff[8];	/* rxon2offcnt[8] */
+
+	uint64_t ips_crc_errors;		/* crcerrs */
+	uint64_t ips_illegal_bytes;		/* illerrc */
+	uint64_t ips_mac_local_faults;		/* mlfc */
+	uint64_t ips_mac_remote_faults;		/* mrfc */
+	uint64_t ips_rx_length_errors;		/* rlec */
+	uint64_t ips_rx_undersize;		/* ruc */
+	uint64_t ips_rx_fragments;		/* rfc */
+	uint64_t ips_rx_oversize;		/* roc */
+	uint64_t ips_rx_jabber;			/* rjc */
+	uint64_t ips_rx_discards;		/* rdpc */
+	uint64_t ips_rx_vm_discards;		/* ldpc */
+	uint64_t ips_rx_short_discards;		/* mspdc */
+	uint64_t ips_tx_dropped_link_down;	/* tdold */
+	uint64_t ips_rx_unknown_protocol;	/* rupp */
+	uint64_t ips_rx_err1;			/* rxerr1 */
+	uint64_t ips_rx_err2;			/* rxerr2 */
+} i40e_pf_stats_t;
+
+typedef struct i40e_pf_kstats {
+	kstat_named_t ipk_rx_bytes;		/* gorc */
+	kstat_named_t ipk_rx_unicast;		/* uprc */
+	kstat_named_t ipk_rx_multicast;		/* mprc */
+	kstat_named_t ipk_rx_broadcast;		/* bprc */
+	kstat_named_t ipk_tx_bytes;		/* gotc */
+	kstat_named_t ipk_tx_unicast;		/* uptc */
+	kstat_named_t ipk_tx_multicast;		/* mptc */
+	kstat_named_t ipk_tx_broadcast;		/* bptc */
+
+	kstat_named_t ipk_rx_size_64;		/* prc64 */
+	kstat_named_t ipk_rx_size_127;		/* prc127 */
+	kstat_named_t ipk_rx_size_255;		/* prc255 */
+	kstat_named_t ipk_rx_size_511;		/* prc511 */
+	kstat_named_t ipk_rx_size_1023;		/* prc1023 */
+	kstat_named_t ipk_rx_size_1522;		/* prc1522 */
+	kstat_named_t ipk_rx_size_9522;		/* prc9522 */
+
+	kstat_named_t ipk_tx_size_64;		/* ptc64 */
+	kstat_named_t ipk_tx_size_127;		/* ptc127 */
+	kstat_named_t ipk_tx_size_255;		/* ptc255 */
+	kstat_named_t ipk_tx_size_511;		/* ptc511 */
+	kstat_named_t ipk_tx_size_1023;		/* ptc1023 */
+	kstat_named_t ipk_tx_size_1522;		/* ptc1522 */
+	kstat_named_t ipk_tx_size_9522;		/* ptc9522 */
+
+	kstat_named_t ipk_link_xon_rx;		/* lxonrxc */
+	kstat_named_t ipk_link_xoff_rx;		/* lxoffrxc */
+	kstat_named_t ipk_link_xon_tx;		/* lxontxc */
+	kstat_named_t ipk_link_xoff_tx;		/* lxofftxc */
+	kstat_named_t ipk_priority_xon_rx[8];	/* pxonrxc[8] */
+	kstat_named_t ipk_priority_xoff_rx[8];	/* pxoffrxc[8] */
+	kstat_named_t ipk_priority_xon_tx[8];	/* pxontxc[8] */
+	kstat_named_t ipk_priority_xoff_tx[8];	/* pxofftxc[8] */
+	kstat_named_t ipk_priority_xon_2_xoff[8];	/* rxon2offcnt[8] */
+
+	kstat_named_t ipk_crc_errors;		/* crcerrs */
+	kstat_named_t ipk_illegal_bytes;	/* illerrc */
+	kstat_named_t ipk_mac_local_faults;	/* mlfc */
+	kstat_named_t ipk_mac_remote_faults;	/* mrfc */
+	kstat_named_t ipk_rx_length_errors;	/* rlec */
+	kstat_named_t ipk_rx_undersize;		/* ruc */
+	kstat_named_t ipk_rx_fragments;		/* rfc */
+	kstat_named_t ipk_rx_oversize;		/* roc */
+	kstat_named_t ipk_rx_jabber;		/* rjc */
+	kstat_named_t ipk_rx_discards;		/* rdpc */
+	kstat_named_t ipk_rx_vm_discards;	/* ldpc */
+	kstat_named_t ipk_rx_short_discards;	/* mspdc */
+	kstat_named_t ipk_tx_dropped_link_down;	/* tdold */
+	kstat_named_t ipk_rx_unknown_protocol;	/* rupp */
+	kstat_named_t ipk_rx_err1;		/* rxerr1 */
+	kstat_named_t ipk_rx_err2;		/* rxerr2 */
+} i40e_pf_kstats_t;
+
+/*
+ * Resources that are pooled and specific to a given i40e_t.
+ */
+typedef struct i40e_func_rsrc {
+	uint_t	ifr_nrx_queue;		/* Number of rx queues */
+	uint_t	ifr_nrx_queue_used;	/* Number of rx queues used */
+	uint_t	ifr_ntx_queue;		/* Number of tx queues */
+	uint_t	ifr_trx_queue_used;	/* NOTE(review): tx queues used? */
+	uint_t	ifr_nvsis;		/* Number of VSIs */
+	uint_t	ifr_nvsis_used;		/* Number of VSIs used */
+	uint_t	ifr_nmacfilt;		/* Number of MAC filters */
+	uint_t	ifr_nmacfilt_used;	/* Number of MAC filters used */
+	uint_t	ifr_nmcastfilt;		/* Number of multicast filters */
+	uint_t	ifr_nmcastfilt_used;	/* Number of mcast filters used */
+} i40e_func_rsrc_t;
+
+/*
+ * Main i40e per-instance state.
+ */
+typedef struct i40e {
+	list_node_t	i40e_glink;		/* Global list link */
+	list_node_t	i40e_dlink;		/* Device list link */
+	kmutex_t	i40e_general_lock;	/* General device lock */
+
+	/*
+	 * General Data and management
+	 */
+	dev_info_t	*i40e_dip;
+	int		i40e_instance;
+	int		i40e_fm_capabilities;
+	uint_t		i40e_state;
+	i40e_attach_state_t i40e_attach_progress;
+	mac_handle_t	i40e_mac_hdl;
+	ddi_periodic_t	i40e_periodic_id;
+
+	/*
+	 * Embedded common code data structures and memory for the common
+	 * code.
+	 */
+	struct i40e_hw				i40e_hw_space;
+	struct i40e_osdep			i40e_osdep_space;
+	struct i40e_aq_get_phy_abilities_resp	i40e_phy;
+	void					*i40e_aqbuf;
+
+	/*
+	 * Device state, switch information, and resources.
+	 */
+	int			i40e_vsi_id;
+	struct i40e_device	*i40e_device;
+	i40e_func_rsrc_t	i40e_resources;
+	uint16_t		i40e_switch_rsrc_alloc;
+	uint16_t		i40e_switch_rsrc_actual;
+	i40e_switch_rsrc_t	*i40e_switch_rsrcs;
+	i40e_uaddr_t		*i40e_uaddrs;
+	i40e_maddr_t		*i40e_maddrs;
+	int			i40e_mcast_promisc_count;
+	boolean_t		i40e_promisc_on;
+	link_state_t		i40e_link_state;
+	uint32_t		i40e_link_speed;	/* In Mbps */
+	link_duplex_t		i40e_link_duplex;
+	uint_t			i40e_sdu;
+	uint_t			i40e_frame_max;
+
+	/*
+	 * Transmit and receive information, tunables, and MAC info.
+	 */
+	i40e_trqpair_t	*i40e_trqpairs;
+	boolean_t	i40e_mr_enable;
+	int		i40e_num_trqpairs;
+	uint_t		i40e_other_itr;
+
+	int		i40e_num_rx_groups;
+	int		i40e_num_rx_descs;
+	mac_group_handle_t i40e_rx_group_handle;
+	uint32_t	i40e_rx_ring_size;
+	uint32_t	i40e_rx_buf_size;
+	boolean_t	i40e_rx_hcksum_enable;
+	uint32_t	i40e_rx_dma_min;
+	uint32_t	i40e_rx_limit_per_intr;
+	uint_t		i40e_rx_itr;
+
+	int		i40e_num_tx_descs;
+	uint32_t	i40e_tx_ring_size;
+	uint32_t	i40e_tx_buf_size;
+	uint32_t	i40e_tx_block_thresh;
+	boolean_t	i40e_tx_hcksum_enable;
+	uint32_t	i40e_tx_dma_min;
+	uint_t		i40e_tx_itr;
+
+	/*
+	 * Interrupt state
+	 *
+	 * Note that the use of a single boolean_t for i40e_intr_poll isn't
+	 * really the best design. When we have more than a single ring on the
+	 * device working, we'll transition to using something more
+	 * sophisticated.
+	 */
+	uint_t		i40e_intr_pri;
+	uint_t		i40e_intr_force;
+	uint_t		i40e_intr_type;
+	int		i40e_intr_cap;
+	uint32_t	i40e_intr_count;
+	uint32_t	i40e_intr_count_max;
+	uint32_t	i40e_intr_count_min;
+	size_t		i40e_intr_size;
+	ddi_intr_handle_t *i40e_intr_handles;
+	ddi_cb_handle_t	i40e_callback_handle;
+	boolean_t	i40e_intr_poll;
+
+	/*
+	 * DMA attributes. See i40e_buf.c for why we have copies of them in the
+	 * i40e_t.
+	 */
+	ddi_dma_attr_t		i40e_static_dma_attr;
+	ddi_dma_attr_t		i40e_txbind_dma_attr;
+	ddi_device_acc_attr_t	i40e_desc_acc_attr;
+	ddi_device_acc_attr_t	i40e_buf_acc_attr;
+
+	/*
+	 * The following two fields are used to protect and keep track of
+	 * outstanding, loaned buffers to MAC. If we have these, we can't
+	 * detach as we have active DMA memory outstanding.
+	 */
+	kmutex_t	i40e_rx_pending_lock;
+	kcondvar_t	i40e_rx_pending_cv;
+	uint32_t	i40e_rx_pending;
+
+	/*
+	 * PF statistics and VSI statistics.
+	 */
+	kmutex_t		i40e_stat_lock;
+	kstat_t			*i40e_pf_kstat;
+	kstat_t			*i40e_vsi_kstat;
+	i40e_pf_stats_t		i40e_pf_stat;
+	i40e_vsi_stats_t	i40e_vsi_stat;
+	uint16_t		i40e_vsi_stat_id;
+
+	/*
+	 * Misc. stats and counters that should maybe one day be kstats.
+	 */
+	uint64_t	i40e_s_link_status_errs;
+	uint32_t	i40e_s_link_status_lasterr;
+} i40e_t;
+
+/*
+ * The i40e_device represents a PCI device which encapsulates multiple physical
+ * functions, each of which is represented as an i40e_t. This is used to track
+ * the use of pooled resources throughout all of the various devices.
+ */
+typedef struct i40e_device {
+	list_node_t	id_link;
+	dev_info_t	*id_parent;
+	uint_t		id_pci_bus;
+	uint_t		id_pci_device;
+	uint_t		id_nfuncs;	/* Total number of functions */
+	uint_t		id_nreg;	/* Total number present */
+	list_t		id_i40e_list;	/* List of i40e_t's registered */
+	i40e_switch_rsrc_t	*id_rsrcs; /* Switch resources for this PF */
+	uint_t		id_rsrcs_alloc;	/* Total allocated resources */
+	uint_t		id_rsrcs_act;	/* Actual number of resources */
+} i40e_device_t;
+
+/* Values for the interrupt forcing on the NIC. */
+#define	I40E_INTR_NONE			0
+#define	I40E_INTR_MSIX			1
+#define	I40E_INTR_MSI			2
+#define	I40E_INTR_LEGACY		3
+
+/* Hint that we don't want to do any polling; parenthesized for safe expansion */
+#define	I40E_POLL_NULL			(-1)
+
+/*
+ * Logging functions.
+ */
+extern void i40e_error(i40e_t *, const char *, ...);
+extern void i40e_notice(i40e_t *, const char *, ...);
+extern void i40e_log(i40e_t *, const char *, ...);
+
+/*
+ * General link handling functions.
+ */
+extern void i40e_link_check(i40e_t *);
+extern void i40e_update_mtu(i40e_t *);
+
+/*
+ * FMA functions.
+ */
+extern int i40e_check_acc_handle(ddi_acc_handle_t);
+extern int i40e_check_dma_handle(ddi_dma_handle_t);
+extern void i40e_fm_ereport(i40e_t *, char *);
+
+/*
+ * Interrupt handlers and interrupt handler setup.
+ */
+extern void i40e_intr_chip_init(i40e_t *);
+extern void i40e_intr_chip_fini(i40e_t *);
+extern uint_t i40e_intr_msix(void *, void *);
+extern uint_t i40e_intr_msi(void *, void *);
+extern uint_t i40e_intr_legacy(void *, void *);
+extern void i40e_intr_io_enable_all(i40e_t *);
+extern void i40e_intr_io_disable_all(i40e_t *);
+extern void i40e_intr_io_clear_cause(i40e_t *);
+extern void i40e_intr_rx_queue_disable(i40e_t *, uint_t);
+extern void i40e_intr_rx_queue_enable(i40e_t *, uint_t);
+extern void i40e_intr_set_itr(i40e_t *, i40e_itr_index_t, uint_t);
+
+/*
+ * Receive-side functions
+ */
+extern mblk_t *i40e_ring_rx(i40e_trqpair_t *, int);
+extern mblk_t *i40e_ring_rx_poll(void *, int);
+extern void i40e_rx_recycle(caddr_t);
+
+/*
+ * Transmit-side functions
+ */
+extern mblk_t *i40e_ring_tx(void *, mblk_t *);
+extern void i40e_tx_recycle_ring(i40e_trqpair_t *);
+extern void i40e_tx_cleanup_ring(i40e_trqpair_t *);
+
+/*
+ * Statistics functions.
+ */
+extern boolean_t i40e_stats_init(i40e_t *);
+extern void i40e_stats_fini(i40e_t *);
+extern boolean_t i40e_stat_vsi_init(i40e_t *);
+extern void i40e_stat_vsi_fini(i40e_t *);
+extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *);
+extern void i40e_stats_trqpair_fini(i40e_trqpair_t *);
+extern int i40e_m_stat(void *, uint_t, uint64_t *);
+extern int i40e_rx_ring_stat(mac_ring_driver_t, uint_t, uint64_t *);
+extern int i40e_tx_ring_stat(mac_ring_driver_t, uint_t, uint64_t *);
+
+/*
+ * MAC/GLDv3 functions, and functions called by MAC/GLDv3 support code.
+ */
+extern boolean_t i40e_register_mac(i40e_t *);
+extern boolean_t i40e_start(i40e_t *, boolean_t);
+extern void i40e_stop(i40e_t *, boolean_t);
+
+/*
+ * DMA & buffer functions and attributes
+ */
+extern void i40e_init_dma_attrs(i40e_t *, boolean_t);
+extern boolean_t i40e_alloc_ring_mem(i40e_t *);
+extern void i40e_free_ring_mem(i40e_t *, boolean_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _I40E_SW_H */
diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c
new file mode 100644
index 0000000000..06f82f856e
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c
@@ -0,0 +1,2266 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include "i40e_sw.h"
+
+/*
+ * ---------------------------------------------------------
+ * Buffer and Memory Management, Receiving, and Transmitting
+ * ---------------------------------------------------------
+ *
+ * Each physical function (PF), which is what we think of as an instance of the
+ * device driver, has a series of associated transmit and receive queue pairs.
+ * Effectively, what we think of in MAC as rings. Each of these has their own
+ * ring of descriptors which is used as part of doing DMA activity.
+ *
+ * The transmit ring of descriptors are 16-byte entries which are used to send
+ * packets, program filters, etc. The receive ring of descriptors are either
+ * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor
+ * format so that we're in a better position if we ever want to leverage that
+ * information later on.
+ *
+ * However, these rings are just for descriptors, they don't talk or deal with
+ * how we actually store the memory that we need for DMA or the associated
+ * information that we need for keeping track of message blocks. To correspond
+ * to the hardware descriptor ring which is how we communicate with hardware, we
+ * introduce a control block which keeps track of our required metadata like DMA
+ * mappings.
+ *
+ * There are two main considerations that dictate how much memory and buffers
+ * we end up allocating. Those are:
+ *
+ *   o The size of the ring (controlled through the driver.conf file)
+ *
+ *   o The maximum size frame we can receive.
+ *
+ * The size of the rings currently defaults to 1024 descriptors and is stored in
+ * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size.
+ *
+ * While the size of the rings is controlled by the driver.conf, the maximum
+ * size frame is informed primarily through the use of dladm and the setting of
+ * the MTU property on the device. From the MTU, we then go and do some
+ * machinations. The first thing we do is we then have to add in space for the
+ * Ethernet header, potentially a VLAN header, and the FCS check. This value is
+ * what's stored as i40e_t`i40e_frame_max and is derived any time
+ * i40e_t`i40e_sdu changes.
+ *
+ * This size is then rounded up to the nearest 1k chunk, which represents the
+ * actual amount of memory that we'll allocate for a single frame.
+ *
+ * Note, that for rx, we do something that might be unexpected. We always add
+ * an extra two bytes to the frame size that we allocate. We then offset the DMA
+ * address that we receive a packet into by two bytes. This ensures that the IP
+ * header will always be 4 byte aligned because the MAC header is either 14 or
+ * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
+ * and MAC's lives easier.
+ *
+ * Both the rx and tx descriptor rings (which are what we use to communicate
+ * with hardware) are allocated as a single region of DMA memory which is the
+ * size of the descriptor (32 bytes and 16 bytes respectively) times the total
+ * number of descriptors for an rx and tx ring.
+ *
+ * While the rx and tx descriptors are allocated using DMA-based memory, the
+ * control blocks for each of them are allocated using normal kernel memory.
+ * They aren't special from a DMA perspective. We'll go over the design of both
+ * receiving and transmitting separately, as they have slightly different
+ * control blocks and different ways that we manage the relationship between
+ * control blocks and descriptors.
+ *
+ * ---------------------------------
+ * RX Descriptors and Control Blocks
+ * ---------------------------------
+ *
+ * For every descriptor in the ring that the driver has, we need some associated
+ * memory, which means that we need to have the receive specific control block.
+ * We have a couple different, but related goals:
+ *
+ *   o Once we've completed the mc_start GLDv3 endpoint, we do not want to do
+ *     any additional memory allocations or DMA allocations if we don't have to.
+ *
+ *   o We'd like to try and do as much zero-copy as possible, while taking into
+ *     account the cost of mapping in DMA resources.
+ *
+ *   o We'd like to have every receive descriptor available.
+ *
+ * Now, these rules are a bit in tension with one another. The act of mapping in
+ * is an exercise of trying to find the break-even point between page table
+ * updates and bcopy. We currently start by using the same metrics that ixgbe
+ * used; however, it should be known that this value has effectively been
+ * cargo-culted across to yet another driver, sorry.
+ *
+ * If we receive a packet which is larger than our copy threshold, we'll create
+ * a message block out of the DMA memory via desballoc(9F) and send that up to
+ * MAC that way. This will cause us to be notified when the message block is
+ * then freed because it has been consumed, dropped, or otherwise. Otherwise, if
+ * it's less than the threshold, we'll try to use allocb and bcopy it into the
+ * block, thus allowing us to immediately reuse the DMA resource. Note, on debug
+ * builds, we allow someone to whack the variable i40e_debug_rx_mode to override
+ * the behavior and always do a bcopy or a DMA bind.
+ *
+ * To try and ensure that the device always has blocks that it can receive data
+ * into, we maintain two lists of control blocks, a working list and a free
+ * list. Each list is sized equal to the number of descriptors in the rx ring.
+ * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
+ * equal to twice the number of descriptors in the ring and we assign them
+ * equally to the free list and to the working list. Each control block also has
+ * DMA memory allocated and associated with it, which will be used to receive
+ * the actual packet data. All of a received frame's data will end up in a
+ * single DMA buffer.
+ *
+ * During operation, we always maintain the invariant that each rx descriptor
+ * has an associated rx control block which lives in the working list. If we
+ * feel that we should loan up DMA memory to MAC in the form of a message block,
+ * we can only do so if we can maintain this invariant. To do that, we swap in
+ * one of the buffers from the free list. If none are available, then we resort
+ * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the
+ * size.
+ *
+ * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is
+ * called on the block, at which point we restore the rx control block to the
+ * free list and are able to reuse the DMA memory again. While the scheme may
+ * seem odd, it importantly keeps us out of trying to do any DMA allocations in
+ * the normal path of operation, even though we may still have to allocate
+ * message blocks and copy.
+ *
+ * The following state machine describes the life time of a rx control block. In
+ * the diagram we abbreviate the rx ring descriptor entry as rxd and the rx
+ * control block entry as rcb.
+ *
+ *             |                                   |
+ *             * ... 1/2 of all initial rcb's  ... *
+ *             |                                   |
+ *             v                                   v
+ *     +------------------+               +------------------+
+ *     | rcb on free list |---*---------->| rcb on work list |
+ *     +------------------+   .           +------------------+
+ *             ^              . moved to          |
+ *             |                replace rcb       * . . Frame received,
+ *             |                loaned to         |     entry on free list
+ *             |                MAC + co.         |     available. rcb's
+ *             |                                  |     memory made into mblk_t
+ *             * . freemsg(9F)                    |     and sent up to MAC.
+ *             |   called on                      |
+ *             |   loaned rcb                     |
+ *             |   and it is                      v
+ *             |   recycled.              +-------------------+
+ *             +--------------------<-----| rcb loaned to MAC |
+ *                                        +-------------------+
+ *
+ * Finally, note that every rx control block has a reference count on it. One
+ * reference is added as long as the driver has had the GLDv3 mc_start endpoint
+ * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
+ * no other DLPI consumers remain, then we'll decrement the reference count by
+ * one. Whenever we loan up the rx control block and associated buffer to MAC,
+ * then we bump the reference count again. Even though the device is stopped,
+ * there may still be loaned frames in upper levels that we'll want to account
+ * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
+ * that it is cleaned up.
+ *
+ * --------------------
+ * Managing the RX Ring
+ * --------------------
+ *
+ * The receive ring descriptors are arranged in a circular buffer with a head
+ * and tail pointer. There are both the conventional head and tail pointers
+ * which are used to partition the ring into two portions, a portion that we,
+ * the operating system, manage and a portion that is managed by hardware. When
+ * hardware owns a descriptor in the ring, it means that it is waiting for data
+ * to be filled in. However, when a portion of the ring is owned by the driver,
+ * then that means that the descriptor has been consumed and we need to go take
+ * a look at it.
+ *
+ * The initial head is configured to be zero by writing it as such in the
+ * receive queue context in the FPM (function private memory from the host). The
+ * initial tail is written to be the last descriptor. This is written to via the
+ * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between
+ * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD,
+ * the only values we ever consult ourselves are the TAIL register and our own
+ * state tracking. Effectively, we cache the HEAD register and then update it
+ * ourselves based on our work.
+ *
+ * When we iterate over the rx descriptors and thus the received frames, we are
+ * either in an interrupt context or we've been asked by MAC to poll on the
+ * ring. If we've been asked to poll on the ring, we have a maximum number of
+ * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
+ * exceed that count, then we do not process it. When in interrupt context, we
+ * don't have a strict byte count. However, to ensure liveness, we limit the
+ * amount of data based on a configuration value
+ * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this
+ * is based on similar numbers that are used for ixgbe. After some additional
+ * time in the field, we'll have a sense as to whether or not it should be
+ * changed.
+ *
+ * When processing, we start at our own HEAD pointer
+ * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start
+ * processing. Every RX descriptor has what's described as the DD bit. This bit
+ * (the LSB of the second 8-byte word), indicates whether or not the descriptor
+ * is done.  When we give descriptors to the hardware, this value is always
+ * zero. When the hardware has finished a descriptor, it will always be one.
+ *
+ * The first thing that we check is whether the DD bit indicates that the
+ * current HEAD is ready. If it isn't, then we're done. That's the primary
+ * invariant of processing a frame. If it's done, then there are a few other
+ * things that we want to look at. In the same status word as the DD bit, there
+ * are two other important bits:
+ *
+ *   o End of Packet (EOP)
+ *   o Error bits
+ *
+ * The end of packet indicates that we have reached the last descriptor. Now,
+ * you might ask when would there be more than one descriptor. The reason for
+ * that might be due to large receive offload (lro) or header splitting
+ * functionality, which presently isn't supported in the driver. The error bits
+ * in the frame are only valid when EOP is set.
+ *
+ * If error bits are set on the frame, then we still consume it; however, we
+ * will not generate an mblk_t to send up to MAC. If there are no error bits
+ * set, then we'll consume the descriptor either using bcopy or DMA binding. See
+ * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information
+ * on how that selection is made.
+ *
+ * Regardless of whether we construct an mblk_t or encounter an error, we end up
+ * resetting the descriptor. This re-arms the descriptor for hardware and in the
+ * process, we may end up assigning it a new receive control block. After we do
+ * this, we always update our HEAD pointer, no matter what.
+ *
+ * Finally, once we've consumed as much as we will in a given window, we go and
+ * update the TAIL register to indicate all the frames we've consumed. We only
+ * do a single bulk write for the ring.
+ *
+ * ---------------------------------
+ * TX Descriptors and Control Blocks
+ * ---------------------------------
+ *
+ * While the transmit path is similar in spirit to the receive path, it works
+ * differently due to the fact that all data is originated by the operating
+ * system and not by the device.
+ *
+ * Like rx, there is both a descriptor ring that we use to communicate to the
+ * hardware and which points to the memory used to transmit a frame. Similarly,
+ * there is a corresponding transmit control block. Each transmit control block
+ * has a region of DMA memory allocated to it; however, the way we use it
+ * varies.
+ *
+ * The driver is asked to process a single frame at a time. That message block
+ * may be made up of multiple fragments linked together by the mblk_t`b_cont
+ * member. The device has a hard limit of up to 8 buffers being allowed for use
+ * for a single logical frame. For each fragment, we'll try and use an entry
+ * from the tx descriptor ring and then we'll allocate a corresponding tx
+ * control block. Depending on the size of the fragment, we may copy it around
+ * or we might instead try to do DMA binding of the fragment.
+ *
+ * If we exceed the number of blocks that fit, we'll try to pull up the block
+ * and then we'll do a DMA bind and send it out.
+ *
+ * If we don't have enough space in the ring or tx control blocks available,
+ * then we'll return the unprocessed message block to MAC. This will induce flow
+ * control and once we recycle enough entries, we'll once again enable sending
+ * on the ring.
+ *
+ * We size the working list as equal to the number of descriptors in the ring.
+ * We size the free list as equal to 1.5 times the number of descriptors in the
+ * ring. We'll allocate a number of tx control block entries equal to the number
+ * of entries in the free list. By default, all entries are placed in the free
+ * list. As we come along and try to send something, we'll allocate entries from
+ * the free list and add them to the working list, where they'll stay until the
+ * hardware indicates that all of the data has been written back to us. The
+ * reason that we start with 1.5x is to help facilitate having more than one TX
+ * buffer associated with the DMA activity.
+ *
+ * --------------------
+ * Managing the TX Ring
+ * --------------------
+ *
+ * The transmit descriptor ring is driven by us. We maintain our own notion of a
+ * HEAD and TAIL register and we update the hardware with updates to the TAIL
+ * register. When the hardware is done writing out data, it updates us by
+ * writing back to a specific address, not by updating the individual
+ * descriptors. That address is a 4-byte region after the main transmit
+ * descriptor ring. This is why the descriptor ring has an extra descriptor's
+ * worth allocated to it.
+ *
+ * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and
+ * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames,
+ * we'll update the tail there and in the I40E_QTX_TAIL() register. At various
+ * points in time, through both interrupts, and our own internal checks, we'll
+ * sync the write-back head portion of the DMA space. Based on the index it
+ * reports back, we'll free everything between our current HEAD and the
+ * indicated index and update HEAD to the new index.
+ *
+ * When a frame comes in, we try to use a number of transmit control blocks and
+ * we'll transition them from the free list to the work list. They'll get moved
+ * to the entry on the work list that corresponds with the transmit descriptor
+ * they correspond to. Once we are indicated that the corresponding descriptor
+ * has been freed, we'll return it to the list.
+ *
+ * The tx control block free list is managed by keeping track of the number
+ * of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to index
+ * into the free list and add things to it. In effect, we always push and pop
+ * from the tail and protect it with a single lock,
+ * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not
+ * stand up to further performance testing; however, it does allow us to get off
+ * the ground with the device driver.
+ *
+ * The following image describes where a given transmit control block lives in
+ * its lifetime:
+ *
+ *             |
+ *             * ... Initial placement for all tcb's
+ *             |
+ *             v
+ *    +------------------+                       +------------------+
+ *    | tcb on free list |---*------------------>| tcb on work list |
+ *    +------------------+   .                   +------------------+
+ *             ^             . tcb allocated               |
+ *             |               to send frame               v
+ *             |               or fragment on              |
+ *             |               wire, mblk from             |
+ *             |               MAC associated.             |
+ *             |                                           |
+ *             +------*-------------------------------<----+
+ *                    .
+ *                    . Hardware indicates
+ *                      entry transmitted.
+ *                      tcb recycled, mblk
+ *                      from MAC freed.
+ *
+ * ------------
+ * Blocking MAC
+ * ------------
+ *
+ * When transmitting, we can run out of descriptors and ring entries. When
+ * such a case happens, we return the mblk_t to MAC to indicate that we've been
+ * blocked. At that point in time, MAC becomes blocked and will not transmit
+ * anything out that specific ring until we notify MAC. To indicate that we're
+ * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
+ *
+ * When we recycle tx descriptors then we'll end up signaling MAC by calling
+ * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
+ * start sending frames out to us again.
+ */
+
+/*
+ * DMA alignment follows the smallest supported page size of the platform: 8 KiB
+ * (0x2000) on SPARC, 4 KiB (0x1000) on x86. Other architectures fail to build.
+ */
+#if	defined(__sparc)
+#define	I40E_DMA_ALIGNMENT 0x2000ull
+#elif defined(__x86)
+#define	I40E_DMA_ALIGNMENT 0x1000ull
+#else
+#error	"unknown architecture for i40e"
+#endif
+
+/*
+ * This structure is used to maintain information and flags related to
+ * transmitting a frame. itc_cmdflags is the set of flags that we need to OR
+ * into the command word (generally checksumming related), while itc_offsets
+ * holds the word offsets which are required for IP and L4 checksumming.
+ */
+typedef struct i40e_tx_context {
+	enum i40e_tx_desc_cmd_bits	itc_cmdflags;
+	uint32_t			itc_offsets;
+} i40e_tx_context_t;
+
+/*
+ * Toggles on debug builds which can be used to override our RX behaviour based
+ * on thresholds.
+ */
+#ifdef	DEBUG
+typedef enum {
+	I40E_DEBUG_RX_DEFAULT	= 0,	/* decide based on thresholds */
+	I40E_DEBUG_RX_BCOPY	= 1,	/* always bcopy */
+	I40E_DEBUG_RX_DMABIND	= 2	/* always DMA bind */
+} i40e_debug_rx_t;
+
+i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
+#endif	/* DEBUG */
+
+/*
+ * Notes on the following pair of DMA attributes. The first attribute,
+ * i40e_static_dma_attr, is designed to be used for both the descriptor rings
+ * and the static buffers that we associate with control blocks. For this
+ * reason, we force an SGL length of one. While technically the driver supports
+ * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
+ * management here. In addition, when the Intel common code wants to allocate
+ * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
+ * the static dma attr.
+ *
+ * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
+ * binding a bunch of mblk_t fragments to go out the door. Note that the main
+ * difference here is that we're allowed a larger SGL, I40E_TX_MAX_COOKIE.
+ *
+ * Note, we default to setting DDI_DMA_FLAGERR here. However, because we could
+ * have multiple instances which have different FMA error checking
+ * capabilities, or end up on different buses, we make these i40e_g_* templates
+ * static and const and copy them into the i40e_t for the given device with the
+ * values that reflect the actual capabilities.
+ */
+static const ddi_dma_attr_t i40e_g_static_dma_attr = {
+	DMA_ATTR_V0,			/* version number */
+	0x0000000000000000ull,		/* low address */
+	0xFFFFFFFFFFFFFFFFull,		/* high address */
+	0x00000000FFFFFFFFull,		/* dma counter max */
+	I40E_DMA_ALIGNMENT,		/* alignment */
+	0x00000FFF,			/* burst sizes */
+	0x00000001,			/* minimum transfer size */
+	0x00000000FFFFFFFFull,		/* maximum transfer size */
+	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
+	1,				/* scatter/gather list length */
+	0x00000001,			/* granularity */
+	DDI_DMA_FLAGERR			/* DMA flags */
+};
+
+static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
+	DMA_ATTR_V0,			/* version number */
+	0x0000000000000000ull,		/* low address */
+	0xFFFFFFFFFFFFFFFFull,		/* high address */
+	0x00000000FFFFFFFFull,		/* dma counter max */
+	I40E_DMA_ALIGNMENT,		/* alignment */
+	0x00000FFF,			/* burst sizes */
+	0x00000001,			/* minimum transfer size */
+	0x00000000FFFFFFFFull,		/* maximum transfer size */
+	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
+	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
+	0x00000001,			/* granularity */
+	DDI_DMA_FLAGERR			/* DMA flags */
+};
+
+/*
+ * Next, we have the access attributes for these structures. The descriptor
+ * rings are all strictly little endian, while the data buffers are just arrays
+ * of bytes representing frames. Because of this, we purposefully simplify the
+ * driver programming life by programming the descriptor ring as little endian,
+ * while for the buffer data we keep it as unstructured.
+ *
+ * Note that, to keep the Intel common code operating in a reasonable way, when
+ * we allocate DMA memory for it, we do not use byte swapping and thus use the
+ * standard i40e_buf_acc_attr.
+ */
+static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_STRUCTURE_LE_ACC,		/* descriptors are little endian */
+	DDI_STRICTORDER_ACC
+};
+
+static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,		/* frame data is never byte swapped */
+	DDI_STRICTORDER_ACC
+};
+
+/*
+ * i40e_next_desc() and i40e_prev_desc() are type-safe replacements for macros
+ * that step a descriptor index forwards or backwards around a ring. They are
+ * marked inline to try and keep the data path hot, just as the macros they
+ * replaced were effectively inlined. This one returns the index that is count
+ * entries after base in a ring of size entries.
+ */
+static inline int
+i40e_next_desc(int base, int count, int size)
+{
+	int next;
+
+	ASSERT(base >= 0);
+	ASSERT(count > 0);
+	ASSERT(size > 0);
+
+	next = base + count;
+	if (next >= size) {
+		next -= size;
+	}
+
+	ASSERT(next >= 0 && next < size);
+	return (next);
+}
+
+/* Return the index that is count entries before base, wrapping below zero. */
+static inline int
+i40e_prev_desc(int base, int count, int size)
+{
+	int prev;
+
+	ASSERT(base >= 0);
+	ASSERT(count > 0);
+	ASSERT(size > 0);
+
+	prev = base - count;
+	if (prev < 0) {
+		prev += size;
+	}
+
+	ASSERT(prev >= 0 && prev < size);
+	return (prev);
+}
+
+/*
+ * Free DMA memory that is represented by a i40e_dma_buffer_t, undoing only the
+ * stages (binding, memory, handle) that are present. dmab_dma_address is an
+ * integer DMA address, hence it is tested and reset with 0 rather than NULL.
+ */
+static void
+i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
+{
+	if (dmap->dmab_dma_address != 0) {
+		VERIFY(dmap->dmab_dma_handle != NULL);
+		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
+		dmap->dmab_dma_address = 0;
+		dmap->dmab_size = 0;
+	}
+
+	if (dmap->dmab_acc_handle != NULL) {
+		ddi_dma_mem_free(&dmap->dmab_acc_handle);
+		dmap->dmab_acc_handle = NULL;
+		dmap->dmab_address = NULL;
+	}
+
+	if (dmap->dmab_dma_handle != NULL) {
+		ddi_dma_free_handle(&dmap->dmab_dma_handle);
+		dmap->dmab_dma_handle = NULL;
+	}
+
+	/*
+	 * The code above should have reset all of these; catch sloppiness.
+	 */
+	ASSERT(dmap->dmab_dma_address == 0);
+	ASSERT(dmap->dmab_address == NULL);
+	ASSERT(dmap->dmab_size == 0);
+	dmap->dmab_len = 0;
+}
+
+/*
+ * Allocate size bytes of DMA memory based on the passed in attributes. This
+ * fills in the information in dmap and is designed for all of our single cookie
+ * allocations.
+ *
+ * The memory is streaming if stream is B_TRUE and consistent otherwise, and it
+ * is zeroed before being bound if zero is B_TRUE. Every DDI call is made with
+ * DDI_DMA_DONTWAIT, so this never blocks waiting for resources. Returns B_TRUE
+ * on success; on failure an error is logged, anything that was set up in dmap
+ * is released again, and B_FALSE is returned.
+ */
+static boolean_t
+i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
+    ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
+    boolean_t zero, size_t size)
+{
+	int ret;
+	uint_t flags;
+	size_t len;
+	ddi_dma_cookie_t cookie;
+	uint_t ncookies;
+
+	if (stream == B_TRUE)
+		flags = DDI_DMA_STREAMING;
+	else
+		flags = DDI_DMA_CONSISTENT;
+
+	/* Step one: Allocate the DMA handle */
+	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
+	    NULL, &dmap->dmab_dma_handle);
+	if (ret != DDI_SUCCESS) {
+		i40e_error(i40e, "failed to allocate dma handle for I/O "
+		    "buffers: %d", ret);
+		dmap->dmab_dma_handle = NULL;
+		return (B_FALSE);
+	}
+
+	/* Step two: Allocate the DMA memory */
+	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
+	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
+	    &dmap->dmab_acc_handle);
+	if (ret != DDI_SUCCESS) {
+		/* size is a size_t, so it cannot be printed with %d */
+		i40e_error(i40e, "failed to allocate %lu bytes of DMA for I/O "
+		    "buffers", (ulong_t)size);
+		dmap->dmab_address = NULL;
+		dmap->dmab_acc_handle = NULL;
+		i40e_free_dma_buffer(dmap);
+		return (B_FALSE);
+	}
+
+	/* Step three: Optionally zero */
+	if (zero == B_TRUE)
+		bzero(dmap->dmab_address, len);
+
+	/* Step four: Bind the memory */
+	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
+	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
+	    NULL, &cookie, &ncookies);
+	if (ret != DDI_DMA_MAPPED) {
+		i40e_error(i40e, "failed to bind %lu byte sized dma region: %d",
+		    (ulong_t)size, ret);
+		i40e_free_dma_buffer(dmap);
+		return (B_FALSE);
+	}
+
+	/* This routine is only meant for single cookie allocations. */
+	VERIFY(ncookies == 1);
+	dmap->dmab_dma_address = cookie.dmac_laddress;
+	dmap->dmab_size = len;
+	dmap->dmab_len = 0;
+	return (B_TRUE);
+}
+
+/*
+ * Release an i40e_rx_data_t along with the rcb area and the free and work
+ * lists hanging off of it. This function is called once the last pending rcb
+ * has been freed by the upper levels of the system.
+ */
+static void
+i40e_free_rx_data(i40e_rx_data_t *rxd)
+{
+	const size_t rcbsz = sizeof (i40e_rx_control_block_t);
+	const size_t ptrsz = sizeof (i40e_rx_control_block_t *);
+
+	VERIFY(rxd->rxd_rcb_pending == 0);
+
+	if (rxd->rxd_rcb_area != NULL) {
+		kmem_free(rxd->rxd_rcb_area,
+		    rcbsz * (rxd->rxd_free_list_size + rxd->rxd_ring_size));
+		rxd->rxd_rcb_area = NULL;
+	}
+
+	if (rxd->rxd_free_list != NULL) {
+		kmem_free(rxd->rxd_free_list, ptrsz * rxd->rxd_free_list_size);
+		rxd->rxd_free_list = NULL;
+	}
+
+	/* No need to clear rxd_work_list; rxd itself is freed right below. */
+	if (rxd->rxd_work_list != NULL) {
+		kmem_free(rxd->rxd_work_list, ptrsz * rxd->rxd_ring_size);
+	}
+
+	kmem_free(rxd, sizeof (*rxd));
+}
+
+/*
+ * Allocate a ring's i40e_rx_data_t and its work list, free list, and rcb area.
+ * Returns B_FALSE if any allocation fails, freeing whatever was allocated.
+ */
+static boolean_t
+i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
+{
+	i40e_rx_data_t *rxd;
+	const size_t psz = sizeof (i40e_rx_control_block_t *);
+
+	if ((rxd = kmem_zalloc(sizeof (*rxd), KM_NOSLEEP)) == NULL)
+		return (B_FALSE);
+	itrq->itrq_rxdata = rxd;
+	rxd->rxd_i40e = i40e;
+	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
+	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
+	rxd->rxd_rcb_free = rxd->rxd_free_list_size;
+
+	rxd->rxd_work_list = kmem_zalloc(psz * rxd->rxd_ring_size, KM_NOSLEEP);
+	if (rxd->rxd_work_list == NULL) {
+		i40e_error(i40e, "failed to allocate rx work list for a ring "
+		    "of %d entries for ring %d", rxd->rxd_ring_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+
+	rxd->rxd_free_list = kmem_zalloc(psz * rxd->rxd_free_list_size,
+	    KM_NOSLEEP);
+	if (rxd->rxd_free_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry rx free list "
+		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
+	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
+	if (rxd->rxd_rcb_area == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
+		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+	return (B_TRUE);
+
+cleanup:
+	i40e_free_rx_data(rxd);
+	itrq->itrq_rxdata = NULL;
+	return (B_FALSE);
+}
+
+/*
+ * Free all of the memory that we've allocated for DMA. Note that we may have
+ * buffers that we've loaned up to the OS which are still outstanding. The
+ * descriptor ring is always freed, as we no longer need it. Each rcb then has
+ * its reference dropped: if that was the last one, its message block and DMA
+ * resources are freed here; otherwise it is counted as pending in both the
+ * rxd and the i40e_t so that it can be cleaned up once it comes back to us.
+ * The pending counts are updated while holding i40e_t`i40e_rx_pending_lock.
+ */
+static void
+i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
+{
+	i40e_t *i40e = rxd->rxd_i40e;
+	uint32_t nrcbs;
+	uint32_t idx;
+
+	i40e_free_dma_buffer(&rxd->rxd_desc_area);
+	rxd->rxd_desc_ring = NULL;
+	rxd->rxd_desc_next = 0;
+
+	mutex_enter(&i40e->i40e_rx_pending_lock);
+
+	nrcbs = rxd->rxd_ring_size + rxd->rxd_free_list_size;
+	for (idx = 0; idx < nrcbs; idx++) {
+		i40e_rx_control_block_t *rcb = &rxd->rxd_rcb_area[idx];
+
+		VERIFY(rcb != NULL);
+
+		/*
+		 * If we're cleaning up from a failed creation attempt, then an
+		 * entry may never have been assembled, which would mean that
+		 * its reference count is zero. Such an entry is left alone;
+		 * nothing else should be modifying it at this point, as
+		 * references can no longer be added, only removed.
+		 */
+		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
+			continue;
+
+		if (atomic_dec_32_nv(&rcb->rcb_ref) != 0) {
+			/* Still loaned out; account for it as pending. */
+			atomic_inc_32(&rxd->rxd_rcb_pending);
+			atomic_inc_32(&i40e->i40e_rx_pending);
+			continue;
+		}
+
+		freemsg(rcb->rcb_mp);
+		rcb->rcb_mp = NULL;
+		i40e_free_dma_buffer(&rcb->rcb_dma);
+	}
+
+	mutex_exit(&i40e->i40e_rx_pending_lock);
+}
+
+/*
+ * Initialize the DMA memory for the descriptor ring and for each frame in the
+ * control block list. The first rxd_ring_size rcbs populate the work list and
+ * the rest populate the free list. On failure B_FALSE is returned without
+ * unwinding anything here; whatever was allocated so far is left in place.
+ */
+static boolean_t
+i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
+{
+	i40e_t *i40e = rxd->rxd_i40e;
+	size_t ringsz, bufsz;
+	int idx, nrcbs;
+
+	/* The descriptor ring comes first: zeroed, consistent DMA memory. */
+	ringsz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
+	VERIFY(ringsz > 0);
+	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
+	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
+	    B_TRUE, ringsz) == B_FALSE) {
+		i40e_error(i40e, "failed to allocate DMA resources "
+		    "for rx descriptor ring");
+		return (B_FALSE);
+	}
+	rxd->rxd_desc_next = 0;
+	rxd->rxd_desc_ring =
+	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
+
+	bufsz = i40e->i40e_rx_buf_size;
+	VERIFY(bufsz > 0);
+
+	nrcbs = rxd->rxd_ring_size + rxd->rxd_free_list_size;
+	for (idx = 0; idx < nrcbs; idx++) {
+		i40e_rx_control_block_t *rcb = &rxd->rxd_rcb_area[idx];
+		i40e_dma_buffer_t *dmap;
+
+		VERIFY(rcb != NULL);
+		dmap = &rcb->rcb_dma;
+
+		if (idx >= rxd->rxd_ring_size) {
+			rxd->rxd_free_list[idx - rxd->rxd_ring_size] = rcb;
+		} else {
+			rxd->rxd_work_list[idx] = rcb;
+		}
+
+		if (i40e_alloc_dma_buffer(i40e, dmap,
+		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
+		    B_TRUE, B_FALSE, bufsz) == B_FALSE) {
+			i40e_error(i40e, "failed to allocate rx dma buffer");
+			return (B_FALSE);
+		}
+
+		/*
+		 * Offset the buffer by I40E_BUF_IPHDR_ALIGNMENT; the big
+		 * theory statement explains how this helps IP with alignment.
+		 * The rcb starts out holding a single reference. Note, we
+		 * don't worry about whether or not desballoc gives us an
+		 * mblk_t, as a missing one is a common case that has to be
+		 * handled later on in the system.
+		 */
+		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
+		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
+		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
+
+		rcb->rcb_ref = 1;
+		rcb->rcb_rxd = rxd;
+		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
+		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
+		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
+		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Release the transmit resources of a ring pair: the per-tcb DMA buffers and
+ * bind handles, the tcb area, the free and work lists, and the descriptor
+ * ring. Each piece is checked before it is freed, which lets
+ * i40e_alloc_tx_dma() use this as its failure path.
+ */
+static void
+i40e_free_tx_dma(i40e_trqpair_t *itrq)
+{
+	if (itrq->itrq_tcb_area != NULL) {
+		uint32_t idx;
+
+		for (idx = 0; idx < itrq->itrq_tx_free_list_size; idx++) {
+			i40e_tx_control_block_t *tcb =
+			    itrq->itrq_tcb_area + idx;
+
+			i40e_free_dma_buffer(&tcb->tcb_dma);
+			if (tcb->tcb_dma_handle == NULL)
+				continue;
+			ddi_dma_free_handle(&tcb->tcb_dma_handle);
+			tcb->tcb_dma_handle = NULL;
+		}
+
+		kmem_free(itrq->itrq_tcb_area,
+		    sizeof (i40e_tx_control_block_t) *
+		    itrq->itrq_tx_free_list_size);
+		itrq->itrq_tcb_area = NULL;
+	}
+
+	if (itrq->itrq_tcb_free_list != NULL) {
+		kmem_free(itrq->itrq_tcb_free_list,
+		    sizeof (i40e_tx_control_block_t *) *
+		    itrq->itrq_tx_free_list_size);
+		itrq->itrq_tcb_free_list = NULL;
+	}
+
+	if (itrq->itrq_tcb_work_list != NULL) {
+		kmem_free(itrq->itrq_tcb_work_list,
+		    sizeof (i40e_tx_control_block_t *) *
+		    itrq->itrq_tx_ring_size);
+		itrq->itrq_tcb_work_list = NULL;
+	}
+
+	/* The descriptor ring goes last. */
+	i40e_free_dma_buffer(&itrq->itrq_desc_area);
+	itrq->itrq_desc_ring = NULL;
+}
+
+/*
+ * Allocate the transmit resources for a single ring pair: the tx descriptor
+ * ring (with one extra descriptor backing the write-back head), the work
+ * list, the free list, and the tx control blocks, each of which gets a DMA
+ * handle for binding and a DMA buffer for copying.
+ *
+ * Ring memory is allocated as part of the GLDv3 start routine, so none of the
+ * allocations here may block. Returns B_TRUE on success. If the descriptor
+ * ring itself cannot be allocated B_FALSE is returned directly; any later
+ * failure releases everything through i40e_free_tx_dma() before returning
+ * B_FALSE.
+ */
+static boolean_t
+i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
+{
+	int i, ret;
+	size_t dmasz;
+	i40e_tx_control_block_t *tcb;
+	i40e_t *i40e = itrq->itrq_i40e;
+
+	/* The free list is one and a half times the size of the ring. */
+	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
+	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
+	    (i40e->i40e_tx_ring_size >> 1);
+
+	/*
+	 * Allocate an additional tx descriptor for the writeback head.
+	 */
+	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
+	dmasz += sizeof (i40e_tx_desc_t);
+
+	VERIFY(dmasz > 0);
+	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
+	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
+	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
+		i40e_error(i40e, "failed to allocate DMA resources for tx "
+		    "descriptor ring");
+		return (B_FALSE);
+	}
+	itrq->itrq_desc_ring =
+	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
+	/* The write-back head lives in the extra descriptor past the ring. */
+	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
+	    itrq->itrq_tx_ring_size);
+	itrq->itrq_desc_head = 0;
+	itrq->itrq_desc_tail = 0;
+	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
+
+	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
+	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
+	if (itrq->itrq_tcb_work_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tx work list "
+		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * Like the other allocations in this function this must not block,
+	 * so use KM_NOSLEEP and handle failure.
+	 */
+	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
+	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
+	if (itrq->itrq_tcb_free_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tx free list "
+		    "for ring %d", itrq->itrq_tx_free_list_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * We allocate enough tx control blocks to cover the free list.
+	 */
+	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
+	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
+	if (itrq->itrq_tcb_area == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
+		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * For each tcb, allocate DMA memory.
+	 */
+	dmasz = i40e->i40e_tx_buf_size;
+	VERIFY(dmasz > 0);
+	tcb = itrq->itrq_tcb_area;
+	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
+		VERIFY(tcb != NULL);
+
+		/*
+		 * Allocate both a DMA buffer which we'll use for when we copy
+		 * packets for transmission and allocate a DMA handle which
+		 * we'll use when we bind data.
+		 */
+		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
+		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
+		    &tcb->tcb_dma_handle);
+		if (ret != DDI_SUCCESS) {
+			i40e_error(i40e, "failed to allocate DMA handle for tx "
+			    "data binding on ring %d: %d", itrq->itrq_index,
+			    ret);
+			tcb->tcb_dma_handle = NULL;
+			goto cleanup;
+		}
+
+		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
+		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
+		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
+			/*
+			 * There is no error code to report here, so the
+			 * message carries exactly two conversions; dmasz is
+			 * a size_t and is printed as an unsigned long.
+			 */
+			i40e_error(i40e, "failed to allocate %lu bytes of "
+			    "DMA for tx data binding on ring %d",
+			    (ulong_t)dmasz, itrq->itrq_index);
+			goto cleanup;
+		}
+
+		itrq->itrq_tcb_free_list[i] = tcb;
+	}
+
+	/* Every control block starts out on the free list. */
+	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
+
+	return (B_TRUE);
+
+cleanup:
+	i40e_free_tx_dma(itrq);
+	return (B_FALSE);
+}
+
+/*
+ * Free all memory associated with all of the rings on this i40e instance. Note,
+ * this is done as part of the GLDv3 stop routine, and it is also the unwind
+ * path for a failed i40e_alloc_ring_mem(), in which case failed_init is
+ * B_TRUE and some ring pairs may never have been set up.
+ */
+void
+i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
+{
+	int i;
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
+
+		/*
+		 * When unwinding a failed allocation we walk every ring pair,
+		 * including ones that were never reached. Those have no rx
+		 * data (presumably a NULL itrq_rxdata -- TODO confirm against
+		 * i40e_alloc_rx_data()), and i40e_free_rx_dma() dereferences
+		 * its argument, so skip the rx teardown for them.
+		 */
+		if (rxd != NULL) {
+			/*
+			 * Clean up our rx data. We have to free DMA resources
+			 * first and then if we have no more pending RCB's,
+			 * then we'll go ahead and clean things up. Note, we
+			 * can't set the stopped flag on the rx data until
+			 * after we've done the first pass of the pending
+			 * resources. Otherwise we might race with
+			 * i40e_rx_recycle on determining who should free the
+			 * i40e_rx_data_t above.
+			 */
+			i40e_free_rx_dma(rxd, failed_init);
+
+			mutex_enter(&i40e->i40e_rx_pending_lock);
+			rxd->rxd_shutdown = B_TRUE;
+			if (rxd->rxd_rcb_pending == 0) {
+				i40e_free_rx_data(rxd);
+				i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
+			}
+			mutex_exit(&i40e->i40e_rx_pending_lock);
+		}
+
+		i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
+	}
+}
+
+/*
+ * Allocate the DMA and non-DMA resources for every ring pair on this i40e
+ * instance. This is called from the GLDv3 start routine, so blocking
+ * allocations should not be used. If any step fails for any ring pair,
+ * everything is released through i40e_free_ring_mem() and B_FALSE is returned.
+ */
+boolean_t
+i40e_alloc_ring_mem(i40e_t *i40e)
+{
+	int idx;
+
+	for (idx = 0; idx < i40e->i40e_num_trqpairs; idx++) {
+		i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[idx];
+
+		/* rx bookkeeping, then rx DMA, then tx DMA, in that order. */
+		if (i40e_alloc_rx_data(i40e, itrq) == B_FALSE ||
+		    i40e_alloc_rx_dma(itrq->itrq_rxdata) == B_FALSE ||
+		    i40e_alloc_tx_dma(itrq) == B_FALSE) {
+			i40e_free_ring_mem(i40e, B_TRUE);
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+
+/*
+ * FMA capability support can differ between i40e instances, so each i40e_t
+ * carries private copies of the global DMA and access attributes. The copies
+ * are refreshed here and DDI_DMA_FLAGERR is set or cleared in the DMA
+ * attributes according to `fma'.
+ */
+void
+i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
+{
+	/* Access attributes are copied verbatim. */
+	bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
+	    sizeof (ddi_device_acc_attr_t));
+	bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
+	    sizeof (ddi_device_acc_attr_t));
+
+	/* DMA attributes are copied and then adjusted for FMA. */
+	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
+	    sizeof (ddi_dma_attr_t));
+	bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
+	    sizeof (ddi_dma_attr_t));
+
+	if (fma != B_TRUE) {
+		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
+		i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
+		return;
+	}
+
+	i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+	i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+}
+
+/*
+ * Push an rx control block onto the top of the ring's free list. The list is
+ * protected by rxd_free_lock.
+ */
+static void
+i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
+{
+	i40e_rx_control_block_t **slot;
+
+	mutex_enter(&rxd->rxd_free_lock);
+	ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
+	slot = &rxd->rxd_free_list[rxd->rxd_rcb_free];
+	ASSERT(*slot == NULL);
+	*slot = rcb;
+	rxd->rxd_rcb_free++;
+	mutex_exit(&rxd->rxd_free_lock);
+}
+
+/*
+ * Pop an rx control block off the top of the ring's free list. Returns NULL
+ * when the free list is empty.
+ */
+static i40e_rx_control_block_t *
+i40e_rcb_alloc(i40e_rx_data_t *rxd)
+{
+	i40e_rx_control_block_t *rcb = NULL;
+
+	mutex_enter(&rxd->rxd_free_lock);
+	if (rxd->rxd_rcb_free != 0) {
+		rxd->rxd_rcb_free--;
+		rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
+		VERIFY(rcb != NULL);
+		rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
+	}
+	mutex_exit(&rxd->rxd_free_lock);
+
+	return (rcb);
+}
+
+/*
+ * This is the callback that we get from the OS when freemsg(9F) has been called
+ * on a loaned descriptor. It is installed as the free routine of every rcb,
+ * with the rcb itself as the argument.
+ *
+ * In the normal case the rcb gets a fresh mblk_t, goes back on the free list
+ * and the reference that was taken for the loan is dropped. In addition, if we
+ * take the last reference count here, then the ring is being torn down: we
+ * release the rcb's mblk_t and DMA buffer, update the pending counts and, if
+ * this was the last pending rcb of a ring that has passed its shutdown point,
+ * tear down all of the rx data.
+ */
+void
+i40e_rx_recycle(caddr_t arg)
+{
+	uint32_t ref;
+	i40e_rx_control_block_t *rcb;
+	i40e_rx_data_t *rxd;
+	i40e_t *i40e;
+
+	/* LINTED: E_BAD_PTR_CAST_ALIGN */
+	rcb = (i40e_rx_control_block_t *)arg;
+	rxd = rcb->rcb_rxd;
+	i40e = rxd->rxd_i40e;
+
+	/*
+	 * It's possible for this to be called with a reference count of zero.
+	 * That will happen when we're doing the freemsg after taking the last
+	 * reference because we're tearing down everything and this rcb is not
+	 * outstanding. There is nothing left to recycle in that case.
+	 */
+	if (rcb->rcb_ref == 0)
+		return;
+
+	/*
+	 * Give the rcb a new mblk_t and return it to the free list. Don't
+	 * worry about failure of desballoc here. It'll only become fatal
+	 * if we're trying to use it and we can't in i40e_rx_bind().
+	 */
+	rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
+	    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
+	i40e_rcb_free(rxd, rcb);
+
+	/*
+	 * It's possible that the rcb was being used while we are shutting down
+	 * the device. In that case, we'll take the final reference from the
+	 * device here. Note that the freemsg below re-enters this function,
+	 * which returns immediately because the reference count is now zero.
+	 */
+	ref = atomic_dec_32_nv(&rcb->rcb_ref);
+	if (ref == 0) {
+		freemsg(rcb->rcb_mp);
+		rcb->rcb_mp = NULL;
+		i40e_free_dma_buffer(&rcb->rcb_dma);
+
+		mutex_enter(&i40e->i40e_rx_pending_lock);
+		atomic_dec_32(&rxd->rxd_rcb_pending);
+		atomic_dec_32(&i40e->i40e_rx_pending);
+
+		/*
+		 * If this was the last block and it's been indicated that we've
+		 * passed the shutdown point, we should clean up and wake up
+		 * anyone waiting on i40e_rx_pending_cv.
+		 */
+		if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
+			i40e_free_rx_data(rxd);
+			cv_broadcast(&i40e->i40e_rx_pending_cv);
+		}
+
+		mutex_exit(&i40e->i40e_rx_pending_lock);
+	}
+}
+
+/*
+ * Receive a frame by loaning the DMA buffer behind ring slot `index' to the
+ * stack rather than copying it. A replacement rcb is taken from the free list
+ * and, on success, installed in the work list in place of the loaned one; the
+ * loaned rcb gains a reference that i40e_rx_recycle() drops when the mblk_t is
+ * eventually freed.
+ *
+ * The caller must hold itrq_rx_lock. Returns the mblk_t describing `plen'
+ * bytes of data, or NULL if no replacement rcb or mblk_t could be obtained or
+ * the DMA handle failed its FM check. On every failure the work list is left
+ * exactly as it was, so the caller may fall back to i40e_rx_copy().
+ */
+static mblk_t *
+i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
+    uint32_t plen)
+{
+	mblk_t *mp;
+	i40e_t *i40e = rxd->rxd_i40e;
+	i40e_rx_control_block_t *rcb, *rep_rcb;
+
+	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
+	ASSERT(index < rxd->rxd_ring_size);
+
+	if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
+		itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
+		return (NULL);
+	}
+
+	rcb = rxd->rxd_work_list[index];
+
+	/*
+	 * Check to make sure we have a mblk_t. If we don't, this is our last
+	 * chance to try and get one.
+	 */
+	if (rcb->rcb_mp == NULL) {
+		rcb->rcb_mp =
+		    desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
+		    rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
+		if (rcb->rcb_mp == NULL) {
+			itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
+			/*
+			 * rcb still occupies the work list, so it is the
+			 * unused replacement that has to go back on the free
+			 * list. Freeing rcb instead would leak rep_rcb and
+			 * leave rcb on both lists at once.
+			 */
+			i40e_rcb_free(rxd, rep_rcb);
+			return (NULL);
+		}
+	}
+
+	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
+
+	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		/* As above: return the replacement, not the in-use rcb. */
+		i40e_rcb_free(rxd, rep_rcb);
+		return (NULL);
+	}
+
+	/*
+	 * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
+	 */
+	mp = rcb->rcb_mp;
+	atomic_inc_32(&rcb->rcb_ref);
+	mp->b_wptr = mp->b_rptr + plen;
+	mp->b_next = mp->b_cont = NULL;
+
+	/* The loan succeeded; the replacement now owns this ring slot. */
+	rxd->rxd_work_list[index] = rep_rcb;
+	return (mp);
+}
+
+/*
+ * Receive a frame by copying it out of the ring's DMA buffer into a newly
+ * allocated message block, leaving the rcb in ring slot `index' in place. See
+ * the big theory statement for more information on when we copy versus bind.
+ * Returns NULL if the DMA handle fails its FM check or no mblk_t can be
+ * allocated.
+ */
+static mblk_t *
+i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
+    uint32_t plen)
+{
+	i40e_rx_control_block_t *rcb;
+	i40e_t *i40e = rxd->rxd_i40e;
+	mblk_t *copy;
+
+	ASSERT(index < rxd->rxd_ring_size);
+	rcb = rxd->rxd_work_list[index];
+
+	I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
+
+	if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		return (NULL);
+	}
+
+	/* Leave room in front of the data for the IP header alignment. */
+	if ((copy = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0)) == NULL) {
+		itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++;
+		return (NULL);
+	}
+
+	copy->b_rptr += I40E_BUF_IPHDR_ALIGNMENT;
+	copy->b_wptr = copy->b_rptr + plen;
+	bcopy(rcb->rcb_dma.dmab_address, copy->b_rptr, plen);
+
+	return (copy);
+}
+
+/*
+ * Determine if the device has enabled any checksum flags for us. The level of
+ * checksum computed will depend on the type of packet that we have, which is
+ * contained in ptype. For example, the checksum logic it does will vary
+ * depending on whether or not the packet is considered tunneled, whether it
+ * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are
+ * valid.
+ *
+ * While there are additional checksums that we could recognize here, we'll need
+ * to get some additional GLDv3 enhancements to be able to properly describe
+ * them.
+ *
+ * `status' and `err' are the status and error fields of the rx descriptor and
+ * `ptype' is its packet type. Any checksum results found are attached to `mp'
+ * with mac_hcksum_set(); the per-ring rx statistics record which path was
+ * taken.
+ */
+static void
+i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err,
+    uint32_t ptype)
+{
+	uint32_t cksum;
+	struct i40e_rx_ptype_decoded pinfo;
+
+	ASSERT(ptype <= 255);
+	pinfo = decode_rx_desc_ptype(ptype);
+
+	cksum = 0;
+
+	/*
+	 * If the ptype isn't something that we know in the driver, then we
+	 * shouldn't even consider moving forward.
+	 */
+	if (pinfo.known == 0) {
+		itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++;
+		return;
+	}
+
+	/*
+	 * If hardware didn't set the L3L4P bit on the frame, then there is no
+	 * checksum offload to consider.
+	 */
+	if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) {
+		itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++;
+		return;
+	}
+
+	/*
+	 * The device tells us that checksums on IPv6 packets which carry a
+	 * Destination Options Header or a Routing header shouldn't be trusted.
+	 * Discard all checksums in this case.
+	 */
+	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
+	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 &&
+	    (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) {
+		itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++;
+		return;
+	}
+
+	/*
+	 * The hardware denotes three kinds of possible errors. Two are reserved
+	 * for inner and outer IP checksum errors (IPE and EIPE) and the latter
+	 * is for L4 checksum errors (L4E). If there is only one IP header, then
+	 * the only thing that we care about is IPE. Note that since we don't
+	 * support inner checksums, we will ignore IPE being set on tunneled
+	 * packets and only care about EIPE.
+	 *
+	 * The tunnel type is compared against I40E_RX_PTYPE_TUNNEL_NONE, the
+	 * same constant the L4 check below uses for this field.
+	 */
+	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
+	    pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
+		if (pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE) {
+			if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) {
+				itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++;
+			} else {
+				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
+				cksum |= HCK_IPV4_HDRCKSUM_OK;
+			}
+		} else {
+			if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) {
+				itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++;
+			} else {
+				itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++;
+				cksum |= HCK_IPV4_HDRCKSUM_OK;
+			}
+		}
+	}
+
+	/*
+	 * We only have meaningful L4 checksums in the case of IP->L4 and
+	 * IP->IP->L4. There is not outer L4 checksum data available in any
+	 * other case. Further, we don't bother reporting the valid checksum in
+	 * the case of IP->IP->L4 set.
+	 */
+	if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
+	    pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE &&
+	    (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP ||
+	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP ||
+	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP ||
+	    pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) {
+		ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4);
+		if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) {
+			itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++;
+		} else {
+			itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++;
+			cksum |= HCK_FULLCKSUM_OK;
+		}
+	}
+
+	/* Only touch the mblk_t if the hardware validated something. */
+	if (cksum != 0) {
+		itrq->itrq_rxstat.irxs_hck_set.value.ui64++;
+		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
+	} else {
+		itrq->itrq_rxstat.irxs_hck_miss.value.ui64++;
+	}
+}
+
+mblk_t *
+i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
+{
+	i40e_t *i40e;
+	i40e_hw_t *hw;
+	i40e_rx_data_t *rxd;
+	uint32_t cur_head;
+	i40e_rx_desc_t *cur_desc;
+	i40e_rx_control_block_t *rcb;
+	uint64_t rx_bytes, rx_frames;
+	uint64_t stword;
+	mblk_t *mp, *mp_head, **mp_tail;
+
+	ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));	/* caller holds the lock */
+	rxd = itrq->itrq_rxdata;
+	i40e = itrq->itrq_i40e;
+	hw = &i40e->i40e_hw_space;
+
+	if (!(i40e->i40e_state & I40E_STARTED) ||
+	    (i40e->i40e_state & I40E_OVERTEMP) ||
+	    (i40e->i40e_state & I40E_SUSPENDED) ||
+	    (i40e->i40e_state & I40E_ERROR))
+		return (NULL);	/* device is not in a state to receive */
+
+	/*
+	 * From here on we walk the rx descriptor ring starting at
+	 * rxd_desc_next, turn every completed descriptor into an mblk_t
+	 * (bound or copied), re-arm the descriptor, and return the resulting
+	 * b_next-linked chain, or NULL if nothing was received.
+	 *
+	 * Before we do anything else, we have to make sure that all of the DMA
+	 * buffers are synced up and then check to make sure that they're
+	 * actually good from an FM perspective.
+	 */
+	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL);
+	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		return (NULL);
+	}
+
+	/*
+	 * Prepare our stats. We do a limited amount of processing in both
+	 * polling and interrupt context. The limit in interrupt context is
+	 * based on frames, in polling context based on bytes.
+	 */
+	rx_bytes = rx_frames = 0;
+	mp_head = NULL;
+	mp_tail = &mp_head;
+
+	/*
+	 * At this point, the descriptor ring is available to check. We'll try
+	 * and process until we either run out of poll_bytes or descriptors.
+	 */
+	cur_head = rxd->rxd_desc_next;
+	cur_desc = &rxd->rxd_desc_ring[cur_head];
+	stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
+
+	/*
+	 * Note, the primary invariant of this loop should be that cur_head,
+	 * cur_desc, and stword always point to the currently processed
+	 * descriptor. When we leave the loop, it should point to a descriptor
+	 * that HAS NOT been processed. Meaning, that if we haven't consumed the
+	 * frame, the descriptor should not be advanced.
+	 */
+	while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) {
+		uint32_t error, eop, plen, ptype;
+
+		/*
+		 * The DD, PLEN, and EOP bits are the only ones that are valid
+		 * in every frame. The error information is only valid when EOP
+		 * is set in the same frame.
+		 *
+		 * At this time, because we don't do any LRO or header
+		 * splitting, we expect that every frame should have EOP set in
+		 * it. When later functionality comes in, we'll want to
+		 * re-evaluate this.
+		 */
+		eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
+		VERIFY(eop != 0);
+
+		error = (stword & I40E_RXD_QW1_ERROR_MASK) >>
+		    I40E_RXD_QW1_ERROR_SHIFT;
+		if (error & I40E_RX_ERR_BITS) {
+			itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++;
+			goto discard;
+		}
+
+		plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
+		    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
+
+		ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >>
+		    I40E_RXD_QW1_PTYPE_SHIFT;
+
+		/*
+		 * This packet contains valid data. We should check to see if
+		 * we're actually going to consume it based on its length (to
+		 * ensure that we don't overshoot our quota). We determine
+		 * whether to bcopy or bind the DMA resources based on the size
+		 * of the frame. However, if on debug, we allow it to be
+		 * overridden for testing purposes.
+		 *
+		 * We should be smarter about this and do DMA binding for
+		 * larger frames, but for now, it's really more important that
+		 * we actually just get something simple working.
+		 */
+
+		/*
+		 * Ensure we don't exceed our polling quota by reading this
+		 * frame. Note we only bump bytes now, we bump frames later.
+		 */
+		if ((poll_bytes != I40E_POLL_NULL) &&
+		    (rx_bytes + plen) > poll_bytes)
+			break;
+		rx_bytes += plen;
+
+		mp = NULL;
+		if (plen >= i40e->i40e_rx_dma_min)
+			mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
+		if (mp == NULL)	/* too small to bind, or the bind failed */
+			mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
+
+		if (mp != NULL) {
+			if (i40e->i40e_rx_hcksum_enable)
+				i40e_rx_hcksum(itrq, mp, stword, error, ptype);
+			*mp_tail = mp;
+			mp_tail = &mp->b_next;
+		}
+
+		/*
+		 * Now we need to prepare this frame for use again. See the
+		 * discussion in the big theory statements.
+		 *
+		 * The descriptor is re-armed with the DMA address of whatever
+		 * rcb currently occupies this slot of the work list, which is
+		 * why the work list is re-read below instead of reusing an
+		 * earlier pointer: after a successful i40e_rx_bind() the slot
+		 * holds the replacement rcb, while after a copy, a failed
+		 * receive, or a descriptor error it still holds the original
+		 * one.
+		 */
+discard:
+		rcb = rxd->rxd_work_list[cur_head];
+		cur_desc->read.pkt_addr =
+		    CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address);
+		cur_desc->read.hdr_addr = 0;
+
+		/*
+		 * Finally, update our loop invariants.
+		 */
+		cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size);
+		cur_desc = &rxd->rxd_desc_ring[cur_head];
+		stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len);
+
+		/*
+		 * To help provide liveness, we limit the amount of data that
+		 * we'll end up counting. Note that in these cases, an interrupt
+		 * is not dissimilar from a polling request. Frames that were
+		 * discarded or could not be received still count here.
+		 */
+		rx_frames++;
+		if (rx_frames > i40e->i40e_rx_limit_per_intr) {
+			itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++;
+			break;
+		}
+	}
+
+	/*
+	 * As we've modified the ring, we need to make sure that we sync the
+	 * descriptor ring for the device. Next, we update the hardware and
+	 * update our notion of where the head for us to read from hardware is
+	 * next.
+	 */
+	I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV);
+	if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) !=
+	    DDI_FM_OK) {
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+	}
+
+	if (rx_frames != 0) {
+		uint32_t tail;
+		ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle;
+		rxd->rxd_desc_next = cur_head;
+		tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size);
+
+		I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail);
+		if (i40e_check_acc_handle(rh) != DDI_FM_OK) {
+			ddi_fm_service_impact(i40e->i40e_dip,
+			    DDI_SERVICE_DEGRADED);
+			atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		}
+
+		itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes;
+		itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames;
+	}
+
+#ifdef DEBUG
+	if (rx_frames == 0) {
+		ASSERT(rx_bytes == 0);
+	}
+#endif
+
+	return (mp_head);
+}
+
+/*
+ * This function is called by the GLDv3 when it wants to poll on a ring. The
+ * only primary difference from when we call this during an interrupt is that we
+ * have a limit on the number of bytes that we should consume.
+ *
+ * `arg' is the i40e_trqpair_t for the ring and `poll_bytes' is the byte budget,
+ * which must be positive. Returns the chain of received frames, or NULL if
+ * there is nothing to hand up.
+ */
+mblk_t *
+i40e_ring_rx_poll(void *arg, int poll_bytes)
+{
+	i40e_trqpair_t *itrq = arg;
+	mblk_t *mp;
+
+	/*
+	 * Reject every non-positive budget, not just zero. i40e_ring_rx()
+	 * compares the budget against an unsigned running byte count, so a
+	 * negative value would otherwise never limit how much is consumed.
+	 */
+	ASSERT(poll_bytes > 0);
+	if (poll_bytes <= 0)
+		return (NULL);
+
+	mutex_enter(&itrq->itrq_rx_lock);
+	mp = i40e_ring_rx(itrq, poll_bytes);
+	mutex_exit(&itrq->itrq_rx_lock);
+
+	return (mp);
+}
+
+/*
+ * This is a structure I wish someone would fill out for me for dorking with the
+ * checksums. When we get some more experience with this, we should go ahead and
+ * consider adding this to MAC.
+ *
+ * mac_ether_offload_flags_t records which groups of fields in a
+ * mac_ether_offload_info_t are valid; mac_ether_offload_info() sets a flag
+ * once it has parsed the corresponding header. The two *CKSUM_SET flags and
+ * the checksum mblk/offset fields are not filled in by the parser below.
+ */
+typedef enum mac_ether_offload_flags {
+	MEOI_L2INFO_SET		= 0x01,	/* meoi_l2hlen, meoi_l3proto valid */
+	MEOI_VLAN_TAGGED	= 0x02,	/* frame has a VLAN header */
+	MEOI_L3INFO_SET		= 0x04,	/* meoi_l3hlen, meoi_l4proto valid */
+	MEOI_L3CKSUM_SET	= 0x08,
+	MEOI_L4INFO_SET		= 0x10,	/* meoi_l4hlen valid */
+	MEOI_L4CKSUM_SET	= 0x20
+} mac_ether_offload_flags_t;
+
+typedef struct mac_ether_offload_info {
+	mac_ether_offload_flags_t	meoi_flags;	/* Which fields are set */
+	uint8_t		meoi_l2hlen;	/* How long is the Ethernet header? */
+	uint16_t	meoi_l3proto;	/* What's the Ethertype */
+	uint8_t		meoi_l3hlen;	/* How long is the L3 header? */
+	uint8_t		meoi_l4proto;	/* What is the payload type? */
+	uint8_t		meoi_l4hlen;	/* How long is the L4 header */
+	mblk_t		*meoi_l3ckmp;	/* Which mblk has the l3 checksum */
+	off_t		meoi_l3ckoff;	/* What's the offset to it */
+	mblk_t		*meoi_l4ckmp;	/* Which mblk has the L4 checksum */
+	off_t		meoi_l4off;	/* What is the offset to it? */
+} mac_ether_offload_info_t;
+
+/*
+ * This is something that we'd like to make a general MAC function. Before we do
+ * that, we should add support for TSO.
+ *
+ * We should really keep track of our offset and not walk everything every
+ * time. I can't imagine that this will be kind to us at high packet rates;
+ * however, for the moment, let's leave that.
+ *
+ * This walks a message block chain without pulling up to fill in the context
+ * information. Note that the data we care about could be hidden across more
+ * than one mblk_t.
+ *
+ * i40e_meoi_get_uint8() fetches the single byte found `off' bytes into the
+ * message `mp' and stores it in `out'. It returns 0 on success and -1 if the
+ * offset is negative or lies beyond the end of the message.
+ */
+static int
+i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
+{
+	size_t mpsize;
+	uint8_t *bp;
+
+	mpsize = msgsize(mp);
+	/*
+	 * Only a single byte is read, so the bound is one byte past `off';
+	 * checking for two would wrongly reject the last byte of the message.
+	 */
+	if (off < 0 || off + sizeof (uint8_t) > mpsize)
+		return (-1);
+
+	/* Walk forward to the mblk_t that contains the requested offset. */
+	mpsize = MBLKL(mp);
+	while (off >= mpsize) {
+		mp = mp->b_cont;
+		off -= mpsize;
+		mpsize = MBLKL(mp);
+	}
+
+	bp = mp->b_rptr + off;
+	*out = *bp;
+	return (0);
+}
+
+/*
+ * Fetch the 16-bit network-order value found `off' bytes into the message `mp'
+ * and store it, in host order, in `out'. The two bytes may live in different
+ * mblk_t's. Returns 0 on success and -1 if the offset is negative or the value
+ * would extend beyond the end of the message; `out' is only written on
+ * success.
+ */
+static int
+i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
+{
+	size_t mpsize;
+	uint8_t *bp;
+	uint16_t val;
+
+	mpsize = msgsize(mp);
+	/* Check for negative offsets and for overflow */
+	if (off < 0 || off + sizeof (uint16_t) > mpsize)
+		return (-1);
+
+	/* Walk forward to the mblk_t that contains the first byte. */
+	mpsize = MBLKL(mp);
+	while (off >= mpsize) {
+		mp = mp->b_cont;
+		off -= mpsize;
+		mpsize = MBLKL(mp);
+	}
+
+	/*
+	 * Data is in network order. Note the second byte of data might be in
+	 * the next mp.
+	 */
+	bp = mp->b_rptr + off;
+	val = (uint16_t)(*bp << 8);
+	if (off + 1 == mpsize) {
+		/*
+		 * The low byte is in a later block. Skip over any zero-length
+		 * blocks on the way, as b_rptr of an empty block does not
+		 * point at message data.
+		 */
+		do {
+			mp = mp->b_cont;
+		} while (mp != NULL && MBLKL(mp) == 0);
+		if (mp == NULL)
+			return (-1);
+		bp = mp->b_rptr;
+	} else {
+		bp++;
+	}
+
+	*out = val | *bp;
+	return (0);
+}
+
+/*
+ * Parse the Ethernet, IP and L4 headers at the front of `mp' and record their
+ * lengths and protocols in `meoi', setting the matching MEOI_*INFO_SET flag
+ * after each layer is understood. Parsing simply stops, successfully, at the
+ * first layer whose protocol is not recognized. Returns -1 if a header field
+ * cannot be read or holds an invalid length, and 0 otherwise.
+ */
+static int
+mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
+{
+	uint8_t l2len, l3len, l4len, l4proto;
+	uint16_t etype;
+	size_t off;
+
+	bzero(meoi, sizeof (mac_ether_offload_info_t));
+
+	/* Layer 2: plain Ethernet or a single VLAN tag. */
+	off = offsetof(struct ether_header, ether_type);
+	if (i40e_meoi_get_uint16(mp, off, &etype) != 0)
+		return (-1);
+
+	if (etype != ETHERTYPE_VLAN) {
+		l2len = sizeof (struct ether_header);
+	} else {
+		off = offsetof(struct ether_vlan_header, ether_type);
+		if (i40e_meoi_get_uint16(mp, off, &etype) != 0)
+			return (-1);
+		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
+		l2len = sizeof (struct ether_vlan_header);
+	}
+	meoi->meoi_l3proto = etype;
+	meoi->meoi_l2hlen = l2len;
+	meoi->meoi_flags |= MEOI_L2INFO_SET;
+
+	/* Layer 3: IPv4 or IPv6. */
+	if (etype == ETHERTYPE_IP) {
+		/*
+		 * The IPv4 header length is variable, so it has to be read
+		 * out of the header itself (in units of 32-bit words).
+		 */
+		off = offsetof(ipha_t, ipha_version_and_hdr_length) + l2len;
+		if (i40e_meoi_get_uint8(mp, off, &l3len) != 0)
+			return (-1);
+		l3len &= 0x0f;
+		if (l3len < 5 || l3len > 0x0f)
+			return (-1);
+		l3len *= 4;
+		off = offsetof(ipha_t, ipha_protocol) + l2len;
+		if (i40e_meoi_get_uint8(mp, off, &l4proto) != 0)
+			return (-1);
+	} else if (etype == ETHERTYPE_IPV6) {
+		l3len = 40;
+		off = offsetof(ip6_t, ip6_nxt) + l2len;
+		if (i40e_meoi_get_uint8(mp, off, &l4proto) != 0)
+			return (-1);
+	} else {
+		return (0);
+	}
+	meoi->meoi_l4proto = l4proto;
+	meoi->meoi_l3hlen = l3len;
+	meoi->meoi_flags |= MEOI_L3INFO_SET;
+
+	/* Layer 4: TCP, UDP or SCTP. */
+	if (l4proto == IPPROTO_TCP) {
+		/* The TCP data offset is the high nibble, in 32-bit words. */
+		off = offsetof(tcph_t, th_offset_and_rsrvd) + l2len + l3len;
+		if (i40e_meoi_get_uint8(mp, off, &l4len) != 0)
+			return (-1);
+		l4len = (l4len & 0xf0) >> 4;
+		if (l4len < 5 || l4len > 0xf)
+			return (-1);
+		l4len *= 4;
+	} else if (l4proto == IPPROTO_UDP) {
+		l4len = sizeof (struct udphdr);
+	} else if (l4proto == IPPROTO_SCTP) {
+		l4len = sizeof (sctp_hdr_t);
+	} else {
+		return (0);
+	}
+
+	meoi->meoi_l4hlen = l4len;
+	meoi->meoi_flags |= MEOI_L4INFO_SET;
+	return (0);
+}
+
+/*
+ * Work out what has to be fed into a tx descriptor so that the hardware
+ * performs the checksum offloads requested for this mblk_t.
+ *
+ * On return, itc_cmdflags in `tctx' holds the command flags to or into the
+ * descriptor and itc_offsets holds the encoded MAC, IP and L4 header lengths.
+ * `tctx' is left zeroed when tx checksum offload is disabled or the mblk_t
+ * requests none. Returns 0 on success and non-zero if the headers could not be
+ * parsed or do not carry enough information for the requested offload; the
+ * per-ring tx statistics record the reason.
+ */
+static int
+i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
+    i40e_tx_context_t *tctx)
+{
+	i40e_txq_stat_t *stats = &itrq->itrq_txstat;
+	mac_ether_offload_info_t info;
+	uint32_t ckflags, ckstart;
+	int err;
+
+	bzero(tctx, sizeof (i40e_tx_context_t));
+
+	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
+		return (0);
+
+	mac_hcksum_get(mp, &ckstart, NULL, NULL, NULL, &ckflags);
+	if (ckflags == 0)
+		return (0);
+
+	err = mac_ether_offload_info(mp, &info);
+	if (err != 0) {
+		stats->itxs_hck_meoifail.value.ui64++;
+		return (err);
+	}
+
+	/*
+	 * An IPv4 header checksum was requested. Make sure the L2 and L3
+	 * information is there and really describes IPv4 before programming
+	 * the command flags and header lengths.
+	 */
+	if ((ckflags & HCK_IPV4_HDRCKSUM) != 0) {
+		if ((info.meoi_flags & MEOI_L2INFO_SET) == 0) {
+			stats->itxs_hck_nol2info.value.ui64++;
+			return (-1);
+		}
+		if ((info.meoi_flags & MEOI_L3INFO_SET) == 0) {
+			stats->itxs_hck_nol3info.value.ui64++;
+			return (-1);
+		}
+		if (info.meoi_l3proto != ETHERTYPE_IP) {
+			stats->itxs_hck_badl3.value.ui64++;
+			return (-1);
+		}
+		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		tctx->itc_offsets |= (info.meoi_l2hlen >> 1) <<
+		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
+		tctx->itc_offsets |= (info.meoi_l3hlen >> 2) <<
+		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Nothing more to do unless an L4 checksum was asked for as well. */
+	if ((ckflags & HCK_PARTIALCKSUM) == 0)
+		return (0);
+
+	if ((info.meoi_flags & MEOI_L4INFO_SET) == 0) {
+		stats->itxs_hck_nol4info.value.ui64++;
+		return (-1);
+	}
+
+	/*
+	 * The L4 offload needs the IP information in the descriptor too. If
+	 * the IPv4 header checksum path above did not already supply it, do
+	 * so now.
+	 */
+	if ((ckflags & HCK_IPV4_HDRCKSUM) == 0) {
+		if ((info.meoi_flags & MEOI_L2INFO_SET) == 0) {
+			stats->itxs_hck_nol2info.value.ui64++;
+			return (-1);
+		}
+		if ((info.meoi_flags & MEOI_L3INFO_SET) == 0) {
+			stats->itxs_hck_nol3info.value.ui64++;
+			return (-1);
+		}
+
+		switch (info.meoi_l3proto) {
+		case ETHERTYPE_IP:
+			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4;
+			break;
+		case ETHERTYPE_IPV6:
+			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6;
+			break;
+		default:
+			stats->itxs_hck_badl3.value.ui64++;
+			return (-1);
+		}
+		tctx->itc_offsets |= (info.meoi_l2hlen >> 1) <<
+		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
+		tctx->itc_offsets |= (info.meoi_l3hlen >> 2) <<
+		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	switch (info.meoi_l4proto) {
+	case IPPROTO_TCP:
+		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
+		break;
+	case IPPROTO_UDP:
+		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
+		break;
+	case IPPROTO_SCTP:
+		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
+		break;
+	default:
+		stats->itxs_hck_badl4.value.ui64++;
+		return (-1);
+	}
+
+	tctx->itc_offsets |= (info.meoi_l4hlen >> 2) <<
+	    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+
+	return (0);
+}
+
+/*
+ * Return a transmit control block to the ring's free list. The free list is
+ * used as a stack whose depth is itrq_tcb_free; both are protected by
+ * itrq_tcb_lock.
+ */
+static void
+i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
+{
+	ASSERT(tcb != NULL);
+
+	mutex_enter(&itrq->itrq_tcb_lock);
+	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
+	/* Store into the first unused slot and bump the count in one step. */
+	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free++] = tcb;
+	mutex_exit(&itrq->itrq_tcb_lock);
+}
+
+/*
+ * Take a transmit control block off of the ring's free list. Returns NULL
+ * when the free list is empty; otherwise returns the most recently freed
+ * control block.
+ */
+static i40e_tx_control_block_t *
+i40e_tcb_alloc(i40e_trqpair_t *itrq)
+{
+	i40e_tx_control_block_t *tcb = NULL;
+
+	mutex_enter(&itrq->itrq_tcb_lock);
+	if (itrq->itrq_tcb_free != 0) {
+		/* Pop the top entry and clear the slot it occupied. */
+		itrq->itrq_tcb_free--;
+		tcb = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
+		itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
+		ASSERT(tcb != NULL);
+	}
+	mutex_exit(&itrq->itrq_tcb_lock);
+
+	return (tcb);
+}
+
+/*
+ * Release whatever a control block is holding on to -- its copy buffer
+ * contents or its DMA binding, plus the associated mblk_t chain -- and put it
+ * back into the I40E_TX_NONE state. This is used when recycling completed
+ * descriptors and when tearing a ring down. Resetting a control block that is
+ * already I40E_TX_NONE, or that has an unrecognized type, is fatal.
+ */
+static void
+i40e_tcb_reset(i40e_tx_control_block_t *tcb)
+{
+	if (tcb->tcb_type == I40E_TX_COPY) {
+		/* The copy buffer is reused; just mark it as empty. */
+		tcb->tcb_dma.dmab_len = 0;
+	} else if (tcb->tcb_type == I40E_TX_DMA) {
+		(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
+	} else if (tcb->tcb_type == I40E_TX_NONE) {
+		/* Cast to pacify lint */
+		panic("trying to free tcb %p with bad type none\n", (void *)tcb);
+	} else {
+		panic("unknown i40e tcb type: %d", tcb->tcb_type);
+	}
+
+	tcb->tcb_type = I40E_TX_NONE;
+	freemsg(tcb->tcb_mp);
+	tcb->tcb_mp = NULL;
+	tcb->tcb_next = NULL;
+}
+
+/*
+ * This is called as part of shutting down to clean up all outstanding
+ * descriptors. Similar to recycle, except we don't re-arm anything and instead
+ * just return control blocks to the free list.
+ *
+ * The caller must hold itrq_tx_lock. On return every descriptor in the ring
+ * is accounted for as free and itrq_desc_head has been advanced to where the
+ * walk stopped, which is expected to be itrq_desc_tail.
+ */
+void
+i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
+{
+	uint32_t index;
+
+	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
+	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
+
+	/*
+	 * Because we should have shut down the chip at this point, it should be
+	 * safe to just clean up all the entries between our head and tail.
+	 */
+#ifdef	DEBUG
+	/*
+	 * On DEBUG builds, confirm that neither the enable request nor the
+	 * enable status bit is still set for this transmit queue.
+	 */
+	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
+	    I40E_QTX_ENA(itrq->itrq_index));
+	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
+	    I40E_QTX_ENA_QENA_STAT_MASK));
+#endif
+
+	/*
+	 * Walk forward from the head, releasing one control block per
+	 * outstanding descriptor, until the free count covers the whole ring.
+	 */
+	index = itrq->itrq_desc_head;
+	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
+		i40e_tx_control_block_t *tcb;
+
+		tcb = itrq->itrq_tcb_work_list[index];
+		VERIFY(tcb != NULL);
+		itrq->itrq_tcb_work_list[index] = NULL;
+		i40e_tcb_reset(tcb);
+		i40e_tcb_free(itrq, tcb);
+
+		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
+		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
+		itrq->itrq_desc_free++;
+	}
+
+	ASSERT(index == itrq->itrq_desc_tail);
+	itrq->itrq_desc_head = index;
+}
+
+/*
+ * We're here either by hook or by crook. We need to see if there are transmit
+ * descriptors available for us to go and clean up and return to the hardware.
+ * We may also be blocked, and if so, we should make sure that we let it know
+ * we're good to go.
+ *
+ * This takes and drops itrq_tx_lock itself. Completed control blocks are
+ * unlinked from the work list while the lock is held, but are only reset and
+ * returned to the free list after the lock has been dropped.
+ */
+void
+i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
+{
+	uint32_t wbhead, toclean, count;
+	i40e_tx_control_block_t *tcbhead;
+	i40e_t *i40e = itrq->itrq_i40e;
+
+	mutex_enter(&itrq->itrq_tx_lock);
+
+	/*
+	 * If every descriptor is already free there is nothing to reclaim;
+	 * the only work left is to unblock MAC if we had previously asked it
+	 * to stop sending.
+	 */
+	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
+	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
+		if (itrq->itrq_tx_blocked == B_TRUE) {
+			itrq->itrq_tx_blocked = B_FALSE;
+			mac_tx_ring_update(i40e->i40e_mac_hdl,
+			    itrq->itrq_mactxring);
+			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
+		}
+		mutex_exit(&itrq->itrq_tx_lock);
+		return;
+	}
+
+	/*
+	 * Now we need to try and see if there's anything available. The head
+	 * write-back location is synced for the kernel and then read below;
+	 * the writer guarantees that it does not use relaxed ordering.
+	 */
+	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
+	    (uintptr_t)itrq->itrq_desc_wbhead,
+	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));
+
+	/*
+	 * If the DMA handle has faulted, give up on this pass and mark the
+	 * instance as being in an error state.
+	 */
+	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
+	    DDI_FM_OK) {
+		mutex_exit(&itrq->itrq_tx_lock);
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+		return;
+	}
+
+	wbhead = *itrq->itrq_desc_wbhead;
+	toclean = itrq->itrq_desc_head;
+	count = 0;
+	tcbhead = NULL;
+
+	/*
+	 * Collect every control block between our head and the written-back
+	 * head onto a private list (linked through tcb_next) so that they can
+	 * be cleaned up after the tx lock is dropped.
+	 */
+	while (toclean != wbhead) {
+		i40e_tx_control_block_t *tcb;
+
+		tcb = itrq->itrq_tcb_work_list[toclean];
+		itrq->itrq_tcb_work_list[toclean] = NULL;
+		ASSERT(tcb != NULL);
+		tcb->tcb_next = tcbhead;
+		tcbhead = tcb;
+
+		/*
+		 * We zero this out for sanity purposes.
+		 */
+		bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
+		toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
+		count++;
+	}
+
+	itrq->itrq_desc_head = wbhead;
+	itrq->itrq_desc_free += count;
+	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
+	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
+
+	/*
+	 * Only unblock MAC once we have strictly more free descriptors than
+	 * the block threshold.
+	 */
+	if (itrq->itrq_tx_blocked == B_TRUE &&
+	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
+		itrq->itrq_tx_blocked = B_FALSE;
+
+		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
+		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
+	}
+
+	mutex_exit(&itrq->itrq_tx_lock);
+
+	/*
+	 * Now clean up the tcb.
+	 */
+	while (tcbhead != NULL) {
+		i40e_tx_control_block_t *tcb = tcbhead;
+
+		tcbhead = tcb->tcb_next;
+		i40e_tcb_reset(tcb);
+		i40e_tcb_free(itrq, tcb);
+	}
+
+	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
+}
+
+/*
+ * We've been asked to send a message block on the wire. We'll only have a
+ * single chain. There will not be any b_next pointers; however, there may be
+ * multiple b_cont blocks.
+ *
+ * We may do one of three things with any given mblk_t chain:
+ *
+ *   1) Drop it
+ *   2) Transmit it
+ *   3) Return it
+ *
+ * If we return it to MAC, then MAC will flow control on our behalf. In other
+ * words, it won't send us anything until we tell it that it's okay to send us
+ * something.
+ *
+ * arg is the i40e_trqpair_t for the ring and mp is the chain to send. NULL is
+ * returned when the chain has been consumed (either handed to hardware or
+ * dropped); mp itself is returned when we are out of control blocks or
+ * descriptors, in which case the ring is also marked as blocked.
+ */
+mblk_t *
+i40e_ring_tx(void *arg, mblk_t *mp)
+{
+	const mblk_t *nmp;
+	size_t mpsize;
+	i40e_tx_control_block_t *tcb;
+	i40e_tx_desc_t *txdesc;
+	i40e_tx_context_t tctx;
+	int cmd, type;
+
+	i40e_trqpair_t *itrq = arg;
+	i40e_t *i40e = itrq->itrq_i40e;
+	i40e_hw_t *hw = &i40e->i40e_hw_space;
+	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+	ASSERT(mp->b_next == NULL);
+
+	/*
+	 * If the device isn't in a state where it can transmit, drop the
+	 * frame rather than asking MAC to hold on to it.
+	 */
+	if (!(i40e->i40e_state & I40E_STARTED) ||
+	    (i40e->i40e_state & I40E_OVERTEMP) ||
+	    (i40e->i40e_state & I40E_SUSPENDED) ||
+	    (i40e->i40e_state & I40E_ERROR) ||
+	    (i40e->i40e_link_state != LINK_STATE_UP)) {
+		freemsg(mp);
+		return (NULL);
+	}
+
+	/*
+	 * Figure out the relevant context about this frame that we might need
+	 * for enabling checksum, lso, etc. This also fills in information that
+	 * we might set around the packet type, etc.
+	 */
+	if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
+		freemsg(mp);
+		itrq->itrq_txstat.itxs_err_context.value.ui64++;
+		return (NULL);
+	}
+
+	/*
+	 * For the primordial driver we can punt on doing any recycling right
+	 * now; however, longer term we need to probably do some more pro-active
+	 * recycling to cut back on stalls in the tx path.
+	 */
+
+	/*
+	 * Total up the size of the chain. It is checked against the size of
+	 * the control block's copy buffer once we have one. Note that longer
+	 * term this will be false, particularly when we have the world of TSO.
+	 */
+	mpsize = 0;
+	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
+		mpsize += MBLKL(nmp);
+	}
+
+	/*
+	 * First we allocate our tx control block and prepare the packet for
+	 * transmit before we do a final check for descriptors. We do it this
+	 * way to minimize the time under the tx lock.
+	 */
+	tcb = i40e_tcb_alloc(itrq);
+	if (tcb == NULL) {
+		txs->itxs_err_notcb.value.ui64++;
+		goto txfail;
+	}
+
+	/*
+	 * For transmitting a block, we're currently going to use just a
+	 * single control block and bcopy all of the fragments into it. We
+	 * should be more intelligent about doing DMA binding or otherwise, but
+	 * for getting off the ground this will have to do.
+	 */
+	ASSERT(tcb->tcb_dma.dmab_len == 0);
+
+	/*
+	 * The copy below has no bound other than the length of the chain, so
+	 * a chain larger than the copy buffer must never reach it. An ASSERT
+	 * alone would leave non-DEBUG kernels writing past the end of the DMA
+	 * buffer, so check at run time and drop the frame. The control block
+	 * has not been given a type yet, so it goes straight back onto the
+	 * free list rather than through i40e_tcb_reset().
+	 */
+	if (mpsize > tcb->tcb_dma.dmab_size) {
+		DTRACE_PROBE2(i40e__tx__oversize, i40e_trqpair_t *, itrq,
+		    size_t, mpsize);
+		i40e_tcb_free(itrq, tcb);
+		freemsg(mp);
+		return (NULL);
+	}
+
+	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
+		size_t clen = MBLKL(nmp);
+		void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
+
+		bcopy(nmp->b_rptr, coff, clen);
+		tcb->tcb_dma.dmab_len += clen;
+	}
+	ASSERT(tcb->tcb_dma.dmab_len == mpsize);
+
+	/*
+	 * While there's really no need to keep the mp here, but let's just do
+	 * it to help with our own debugging for now.
+	 */
+	tcb->tcb_mp = mp;
+	tcb->tcb_type = I40E_TX_COPY;
+	I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
+
+	mutex_enter(&itrq->itrq_tx_lock);
+	if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
+		txs->itxs_err_nodescs.value.ui64++;
+		mutex_exit(&itrq->itrq_tx_lock);
+		goto txfail;
+	}
+
+	/*
+	 * Build up the descriptor and send it out. Thankfully at the moment
+	 * we only need a single desc, because we're not doing anything fancy
+	 * yet.
+	 */
+	ASSERT(itrq->itrq_desc_free > 0);
+	itrq->itrq_desc_free--;
+	txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
+	itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
+	itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
+	    itrq->itrq_tx_ring_size);
+
+	/*
+	 * Note, we always set EOP and RS which indicates that this is the last
+	 * data frame and that we should ask for it to be transmitted. We also
+	 * must always set ICRC, because that is an internal bit that must be
+	 * set to one for data descriptors. The remaining bits in the command
+	 * descriptor depend on checksumming and are determined based on the
+	 * information set up in i40e_tx_context().
+	 */
+	type = I40E_TX_DESC_DTYPE_DATA;
+	cmd = I40E_TX_DESC_CMD_EOP |
+	    I40E_TX_DESC_CMD_RS |
+	    I40E_TX_DESC_CMD_ICRC |
+	    tctx.itc_cmdflags;
+	txdesc->buffer_addr =
+	    CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
+	txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
+	    ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
+	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
+	    ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+
+	/*
+	 * Now, finally, sync the DMA data and alert hardware.
+	 */
+	I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV);
+
+	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
+	    itrq->itrq_desc_tail);
+	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
+	    DDI_FM_OK) {
+		/*
+		 * Note, we can't really go through and clean this up very well,
+		 * because the memory has been given to the device, so just
+		 * indicate it's been transmitted.
+		 */
+		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+	}
+
+	txs->itxs_bytes.value.ui64 += mpsize;
+	txs->itxs_packets.value.ui64++;
+	txs->itxs_descriptors.value.ui64++;
+
+	mutex_exit(&itrq->itrq_tx_lock);
+
+	return (NULL);
+
+txfail:
+	/*
+	 * We ran out of resources. Return it to MAC and indicate that we'll
+	 * need to signal MAC. If there are allocated tcb's, return them now.
+	 * Make sure to reset their message block's, since we'll return them
+	 * back to MAC.
+	 */
+	if (tcb != NULL) {
+		tcb->tcb_mp = NULL;
+		i40e_tcb_reset(tcb);
+		i40e_tcb_free(itrq, tcb);
+	}
+
+	mutex_enter(&itrq->itrq_tx_lock);
+	itrq->itrq_tx_blocked = B_TRUE;
+	mutex_exit(&itrq->itrq_tx_lock);
+
+	return (mp);
+}
diff --git a/usr/src/uts/common/io/i40e/i40e_xregs.h b/usr/src/uts/common/io/i40e/i40e_xregs.h
new file mode 100644
index 0000000000..1bf3a1f0be
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/i40e_xregs.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _I40E_XREGS_H
+#define	_I40E_XREGS_H
+
+/*
+ * This file contains extra register definitions and other things that would
+ * nominally come from the Intel common code, but do not due to bugs, errata,
+ * etc. Ideally we'll get to a point where we can remove this file.
+ */
+#include "i40e_type.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The MSPDC register is missing from the current datasheet.
+ *
+ * I40E_GLPRT_MSPDC(_i) yields the register offset for index _i; consecutive
+ * indices are 8 bytes apart. The value mask covers all 32 bits.
+ *
+ * NOTE(review): the maximum-index macro below is spelled MSDPC while every
+ * other name here is spelled MSPDC. Confirm against its users before
+ * renaming it.
+ */
+#define	I40E_GLPRT_MSPDC(_i)		(0x00300060 + ((_i) * 8)) /* _i=0...3 */
+#define	I40E_GLPRT_MSDPC_MAX_INDEX	3
+#define	I40E_GLPRT_MSPDC_MSPDC_SHIFT	0
+#define	I40E_GLPRT_MSPDC_MSPDC_MASK	\
+	I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MSPDC_MSPDC_SHIFT)
+
+/*
+ * The RXERR* registers are technically correct from the perspective of their
+ * addresses; however, the other associated constants are not correct. Instead,
+ * we have new definitions here in the interim.
+ *
+ * Each macro yields the register offset for index _i, with consecutive
+ * indices 8 bytes apart.
+ */
+
+#define	I40E_X_GL_RXERR1_L(_i)		(0x00318000 + ((_i) * 8))
+
+#define	I40E_X_GL_RXERR2_L(_i)		(0x0031c000 + ((_i) * 8))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _I40E_XREGS_H */
diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c
new file mode 100644
index 0000000000..baa36cfc8d
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.c
@@ -0,0 +1,1504 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2015 The MathWorks, Inc.  All rights reserved.
+ */
+
+/*
+ * Support for the inotify facility, a Linux-borne facility for asynchronous
+ * notification of certain events on specified files or directories.  Our
+ * implementation broadly leverages the file event monitoring facility, and
+ * would actually be quite straightforward were it not for a very serious
+ * blunder in the inotify interface:  in addition to allowing for one to be
+ * notified on events on a particular file or directory, inotify also allows
+ * for one to be notified on certain events on files _within_ a watched
+ * directory -- even though those events have absolutely nothing to do with
+ * the directory itself.  This leads to all sorts of madness because file
+ * operations are (of course) not undertaken on paths but rather on open
+ * files -- and the relationships between open files and the paths that resolve
+ * to those files are neither static nor isomorphic.  We implement this
+ * concept by having _child watches_ when directories are watched with events
+ * in IN_CHILD_EVENTS.  We add child watches when a watch on a directory is
+ * first added, and we modify those child watches dynamically as files are
+ * created, deleted, moved into or moved out of the specified directory.  This
+ * mechanism works well, absent hard links.  Hard links, unfortunately, break
+ * this rather badly, and the user is warned that watches on directories that
+ * have multiple directory entries referring to the same file may behave
+ * unexpectedly.
+ */
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/inotify.h>
+#include <sys/fem.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vmem.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include <sys/cyclic.h>
+#include <sys/filio.h>
+
+struct inotify_state;
+struct inotify_kevent;
+
+typedef struct inotify_watch inotify_watch_t;
+typedef struct inotify_state inotify_state_t;
+typedef struct inotify_kevent inotify_kevent_t;
+
+/*
+ * A single watch on a vnode.  A watch is either a top-level watch (inw_parent
+ * is NULL), which is assigned a watch descriptor and is indexed in the owning
+ * state's by-vnode and by-descriptor AVL trees, or a child watch, which holds
+ * a reference on its parent, is linked into the parent's inw_children tree
+ * and carries a private copy of its name.  Once a watch has been zombified
+ * while operations are still in flight, inw_parent is reused to link it onto
+ * the state's zombie list.
+ */
+struct inotify_watch {
+	kmutex_t inw_lock;			/* lock protecting ref count */
+	int inw_refcnt;				/* reference count */
+	uint8_t inw_zombie:1;			/* boolean: is zombie */
+	uint8_t inw_fired:1;			/* boolean: fired one-shot */
+	uint8_t inw_active:1;			/* boolean: watch is active */
+	uint8_t inw_orphaned:1;			/* boolean: orphaned */
+	kcondvar_t inw_cv;			/* condvar for zombifier */
+	uint32_t inw_mask;			/* mask of watch */
+	int32_t inw_wd;				/* watch descriptor */
+	vnode_t *inw_vp;			/* underlying vnode */
+	inotify_watch_t *inw_parent;		/* parent, if a child */
+	avl_node_t inw_byvp;			/* watches by vnode */
+	avl_node_t inw_bywd;			/* watches by descriptor */
+	avl_tree_t inw_children;		/* children, if a parent */
+	char *inw_name;				/* name, if a child */
+	list_node_t inw_orphan;			/* orphan list */
+	cred_t *inw_cred;			/* cred, if orphaned */
+	inotify_state_t *inw_state;		/* corresponding state */
+};
+
+/*
+ * A queued event.  Events are allocated with extra space after ine_event to
+ * hold the (padded) name, if any; the size of that extra space is recorded
+ * in ine_event.len.
+ */
+struct inotify_kevent {
+	inotify_kevent_t *ine_next;		/* next event in queue */
+	struct inotify_event ine_event;		/* event (variable size) */
+};
+
+/*
+ * Total size of a queued event: the fixed structure plus its trailing name.
+ */
+#define	INOTIFY_EVENT_LENGTH(ev) \
+	(sizeof (inotify_kevent_t) + (ev)->ine_event.len)
+
+/*
+ * State for one inotify instance: its watches (indexed both by vnode and by
+ * watch descriptor), its queue of pending events, and the bookkeeping needed
+ * to clean up orphaned and zombie watches.  ins_lock protects this state.
+ */
+struct inotify_state {
+	kmutex_t ins_lock;			/* lock protecting state */
+	avl_tree_t ins_byvp;			/* watches by vnode */
+	avl_tree_t ins_bywd;			/* watches by descriptor */
+	vmem_t *ins_wds;			/* watch identifier arena */
+	int ins_maxwatches;			/* maximum number of watches */
+	int ins_maxevents;			/* maximum number of events */
+	int ins_nevents;			/* current # of events */
+	int32_t ins_size;			/* total size of events */
+	inotify_kevent_t *ins_head;		/* head of event queue */
+	inotify_kevent_t *ins_tail;		/* tail of event queue */
+	pollhead_t ins_pollhd;			/* poll head */
+	kcondvar_t ins_cv;			/* condvar for reading */
+	list_t ins_orphans;			/* orphan list */
+	ddi_periodic_t ins_cleaner;		/* cyclic for cleaning */
+	inotify_watch_t *ins_zombies;		/* zombie watch list */
+	cred_t *ins_cred;			/* creator's credentials */
+	inotify_state_t *ins_next;		/* next state on global list */
+};
+
+/*
+ * Tunables (exported read-only in lx-branded zones via /proc).
+ */
+int	inotify_maxwatches = 8192;		/* max watches per instance */
+int	inotify_maxevents = 16384;		/* max events */
+int	inotify_maxinstances = 128;		/* max instances per user */
+
+/*
+ * Internal global variables.
+ */
+static kmutex_t		inotify_lock;		/* lock protecting state */
+static dev_info_t	*inotify_devi;		/* device info */
+static fem_t		*inotify_femp;		/* FEM pointer */
+static vmem_t		*inotify_minor;		/* minor number arena */
+static void		*inotify_softstate;	/* softstate pointer */
+static inotify_state_t	*inotify_state;		/* global list of state */
+
+static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
+static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
+static void inotify_watch_delete(inotify_watch_t *, uint32_t);
+static void inotify_watch_remove(inotify_state_t *state,
+	inotify_watch_t *watch);
+
+/*
+ * Monitor for VOP_CLOSE: call vnext_close() and, if it succeeds, fire
+ * IN_CLOSE_WRITE when FWRITE is set in flag and IN_CLOSE_NOWRITE otherwise.
+ * Returns whatever vnext_close() returned.
+ */
+static int
+inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
+    cred_t *cr, caller_context_t *ct)
+{
+	/* The watch is fetched before vf is handed to vnext_close(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	uint64_t event = (flag & FWRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
+	int err;
+
+	err = vnext_close(vf, flag, count, offset, cr, ct);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_event(watch, event, NULL);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_CREATE: call vnext_create() and, if it succeeds, hand the
+ * resulting vnode and name to inotify_watch_insert() and fire IN_CREATE with
+ * that name.  Returns whatever vnext_create() returned.
+ */
+static int
+inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
+    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+    vsecattr_t *vsecp)
+{
+	/* The watch is fetched before vf is handed to vnext_create(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_create(vf, name, vap, excl, mode, vpp, cr, flag, ct,
+	    vsecp);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_insert(watch, *vpp, name);
+	inotify_watch_event(watch, IN_CREATE, name);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_LINK: call vnext_link() and, if it succeeds, hand the
+ * source vnode and the new name to inotify_watch_insert() and fire IN_CREATE
+ * with that name.  Returns whatever vnext_link() returned.
+ */
+static int
+inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
+    caller_context_t *ct, int flags)
+{
+	/* The watch is fetched before vf is handed to vnext_link(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_link(vf, svp, tnm, cr, ct, flags);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_insert(watch, svp, tnm);
+	inotify_watch_event(watch, IN_CREATE, tnm);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_MKDIR: call vnext_mkdir() and, if it succeeds, hand the
+ * new directory's vnode and name to inotify_watch_insert() and fire
+ * IN_CREATE | IN_ISDIR with that name.  Returns whatever vnext_mkdir()
+ * returned.
+ */
+static int
+inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
+    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+	/* The watch is fetched before vf is handed to vnext_mkdir(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_mkdir(vf, name, vap, vpp, cr, ct, flags, vsecp);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_insert(watch, *vpp, name);
+	inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_OPEN: call vnext_open() and fire IN_OPEN if it succeeds.
+ * Returns whatever vnext_open() returned.
+ */
+static int
+inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
+{
+	/* The watch is fetched before vf is handed to vnext_open(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_open(vf, mode, cr, ct);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_event(watch, IN_OPEN, NULL);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_READ: call vnext_read() and then fire IN_ACCESS.  The
+ * event is fired whether or not the read succeeded.  Returns whatever
+ * vnext_read() returned.
+ */
+static int
+inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	/* The watch is fetched before vf is handed to vnext_read(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	rval = vnext_read(vf, uiop, ioflag, cr, ct);
+	inotify_watch_event(watch, IN_ACCESS, NULL);
+
+	return (rval);
+}
+
+/*
+ * Monitor for VOP_READDIR: call vnext_readdir() and then fire
+ * IN_ACCESS | IN_ISDIR.  The event is fired whether or not the readdir
+ * succeeded.  Returns whatever vnext_readdir() returned.
+ */
+static int
+inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	/* The watch is fetched before vf is handed to vnext_readdir(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
+	inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);
+
+	return (rval);
+}
+
+/*
+ * Monitor for VOP_REMOVE: call vnext_remove() and, if it succeeds, fire
+ * IN_DELETE with the removed name.  Returns whatever vnext_remove()
+ * returned.
+ */
+int
+inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	/* The watch is fetched before vf is handed to vnext_remove(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_remove(vf, nm, cr, ct, flags);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_event(watch, IN_DELETE, nm);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_RMDIR: call vnext_rmdir() and, if it succeeds, fire
+ * IN_DELETE | IN_ISDIR with the removed name.  Returns whatever
+ * vnext_rmdir() returned.
+ */
+int
+inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
+    caller_context_t *ct, int flags)
+{
+	/* The watch is fetched before vf is handed to vnext_rmdir(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_rmdir(vf, nm, cdir, cr, ct, flags);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_SETATTR: call vnext_setattr() and fire IN_ATTRIB if it
+ * succeeds.  Returns whatever vnext_setattr() returned.
+ */
+static int
+inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	/* The watch is fetched before vf is handed to vnext_setattr(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int err;
+
+	err = vnext_setattr(vf, vap, flags, cr, ct);
+	if (err != 0)
+		return (err);
+
+	inotify_watch_event(watch, IN_ATTRIB, NULL);
+	return (0);
+}
+
+/*
+ * Monitor for VOP_WRITE: call vnext_write() and then fire IN_MODIFY.  The
+ * event is fired whether or not the write succeeded.  Returns whatever
+ * vnext_write() returned.
+ */
+static int
+inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	/* The watch is fetched before vf is handed to vnext_write(). */
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	rval = vnext_write(vf, uiop, ioflag, cr, ct);
+	inotify_watch_event(watch, IN_MODIFY, NULL);
+
+	return (rval);
+}
+
+/*
+ * Monitor for VOP_VNEVENT: translate the vnode event into the corresponding
+ * inotify event(s) on the watch, deleting the watch where the event means
+ * that the watched object has been moved or deleted, and then pass the
+ * vnode event on via vnext_vnevent(), whose result is returned.
+ */
+static int
+inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name,
+    caller_context_t *ct)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+
+	switch (vnevent) {
+	case VE_RENAME_SRC:
+		inotify_watch_event(watch, IN_MOVE_SELF, NULL);
+		inotify_watch_delete(watch, IN_MOVE_SELF);
+		break;
+	case VE_REMOVE:
+		/*
+		 * Linux will apparently fire an IN_ATTRIB event when the link
+		 * count changes (including when it drops to 0 on a remove).
+		 * This is merely somewhat odd; what is amazing is that this
+		 * IN_ATTRIB event is not visible on an inotify watch on the
+		 * parent directory.  (IN_ATTRIB events are normally sent to
+		 * watches on the parent directory).  While it's hard to
+		 * believe that this constitutes desired semantics, ltp
+		 * unfortunately tests this case (if implicitly); in the name
+		 * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are
+		 * explicitly watching the file that has been removed.
+		 */
+		if (watch->inw_parent == NULL)
+			inotify_watch_event(watch, IN_ATTRIB, NULL);
+
+		/*FALLTHROUGH*/
+	case VE_RENAME_DEST:
+		inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+		inotify_watch_delete(watch, IN_DELETE_SELF);
+		break;
+	case VE_RMDIR:
+		/*
+		 * It seems that IN_ISDIR should really be OR'd in here, but
+		 * Linux doesn't seem to do that in this case; for the sake of
+		 * bug-for-bug compatibility, we don't do it either.
+		 */
+		inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+		inotify_watch_delete(watch, IN_DELETE_SELF);
+		break;
+	case VE_CREATE:
+	case VE_TRUNCATE:
+	case VE_RESIZE:
+		inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
+		break;
+	case VE_LINK:
+		inotify_watch_event(watch, IN_ATTRIB, NULL);
+		break;
+	case VE_RENAME_SRC_DIR:
+		inotify_watch_event(watch, IN_MOVED_FROM, name);
+		break;
+	case VE_RENAME_DEST_DIR:
+		/*
+		 * If we weren't given a name, fall back to the path recorded
+		 * on the vnode that was passed in.
+		 */
+		if (name == NULL)
+			name = dvp->v_path;
+
+		inotify_watch_insert(watch, dvp, name);
+		inotify_watch_event(watch, IN_MOVED_TO, name);
+		break;
+	case VE_SUPPORT:
+	case VE_MOUNTEDOVER:
+	case VE_PRE_RENAME_SRC:
+	case VE_PRE_RENAME_DEST:
+	case VE_PRE_RENAME_DEST_DIR:
+		/* These events have no inotify equivalent. */
+		break;
+	}
+
+	return (vnext_vnevent(vf, vnevent, dvp, name, ct));
+}
+
+/*
+ * Template pairing each vnode operation that we monitor with its inotify
+ * monitor function.  The list is terminated by a NULL entry.
+ */
+const fs_operation_def_t inotify_vnodesrc_template[] = {
+	VOPNAME_CLOSE,		{ .femop_close = inotify_fop_close },
+	VOPNAME_CREATE,		{ .femop_create = inotify_fop_create },
+	VOPNAME_LINK,		{ .femop_link = inotify_fop_link },
+	VOPNAME_MKDIR,		{ .femop_mkdir = inotify_fop_mkdir },
+	VOPNAME_OPEN,		{ .femop_open = inotify_fop_open },
+	VOPNAME_READ,		{ .femop_read = inotify_fop_read },
+	VOPNAME_READDIR,	{ .femop_readdir = inotify_fop_readdir },
+	VOPNAME_REMOVE,		{ .femop_remove = inotify_fop_remove },
+	VOPNAME_RMDIR,		{ .femop_rmdir = inotify_fop_rmdir },
+	VOPNAME_SETATTR,	{ .femop_setattr = inotify_fop_setattr },
+	VOPNAME_WRITE,		{ .femop_write = inotify_fop_write },
+	VOPNAME_VNEVENT,	{ .femop_vnevent = inotify_fop_vnevent },
+	NULL, NULL
+};
+
+/*
+ * Comparator ordering watches by watch descriptor.  Returns -1, 0 or 1
+ * when lhs sorts before, equal to, or after rhs, respectively.
+ */
+static int
+inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+	int32_t lwd = lhs->inw_wd, rwd = rhs->inw_wd;
+
+	if (lwd == rwd)
+		return (0);
+
+	return (lwd < rwd ? -1 : 1);
+}
+
+/*
+ * Comparator ordering watches by the address of their underlying vnode.
+ * Returns -1, 0 or 1 when lhs sorts before, equal to, or after rhs,
+ * respectively.
+ */
+static int
+inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+	uintptr_t lvp = (uintptr_t)lhs->inw_vp;
+	uintptr_t rvp = (uintptr_t)rhs->inw_vp;
+
+	if (lvp == rvp)
+		return (0);
+
+	return (lvp < rvp ? -1 : 1);
+}
+
+/*
+ * Take an additional reference on a watch.  The watch must already have at
+ * least one reference.
+ */
+static void
+inotify_watch_hold(inotify_watch_t *watch)
+{
+	kmutex_t *lock = &watch->inw_lock;
+
+	mutex_enter(lock);
+	VERIFY(watch->inw_refcnt > 0);
+	watch->inw_refcnt++;
+	mutex_exit(lock);
+}
+
+/*
+ * Drop a reference on a watch.  This must never be the last reference.  If
+ * the drop leaves exactly one reference on a zombie watch, signal inw_cv.
+ */
+static void
+inotify_watch_release(inotify_watch_t *watch)
+{
+	kmutex_t *lock = &watch->inw_lock;
+
+	mutex_enter(lock);
+	VERIFY(watch->inw_refcnt > 1);
+	watch->inw_refcnt--;
+
+	/*
+	 * We're down to our last reference; kick anyone that might be
+	 * waiting.
+	 */
+	if (watch->inw_refcnt == 1 && watch->inw_zombie)
+		cv_signal(&watch->inw_cv);
+
+	mutex_exit(lock);
+}
+
+/*
+ * Queue an event for a watch.  mask is the set of events that occurred; it
+ * is first reduced to the events that the watch is interested in, and
+ * nothing is queued if no event (or only IN_ISDIR) remains.  name is the
+ * name to report with the event, or NULL.  For an event on a child watch,
+ * the event is reported against the parent using the child's name.
+ *
+ * If IN_REMOVAL is set in mask, the caller must already hold the state's
+ * ins_lock, and neither the lock nor the poll wakeup is handled here.
+ * Otherwise the lock is taken and dropped here.
+ *
+ * An event that is identical to the one at the tail of the queue is dropped
+ * (coalesced).  If the queue is full, the event is replaced by a nameless
+ * IN_Q_OVERFLOW event with a watch descriptor of -1.
+ */
+static void
+inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
+{
+	inotify_kevent_t *event, *tail;
+	inotify_state_t *state = watch->inw_state;
+	uint32_t wd = watch->inw_wd, cookie = 0, len = 0;
+	boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
+	inotify_watch_t *source = watch;
+
+	if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
+		return;
+
+	if (watch->inw_parent != NULL) {
+		/*
+		 * This is an event on the child; if this isn't a valid child
+		 * event, return.  Otherwise, we move our watch to be our
+		 * parent (which we know is around because we have a hold on
+		 * it) and continue.
+		 */
+		if (!(mask & IN_CHILD_EVENTS))
+			return;
+
+		name = watch->inw_name;
+		watch = watch->inw_parent;
+		wd = watch->inw_wd;
+	}
+
+	if (!removal) {
+		mutex_enter(&state->ins_lock);
+
+		if (watch->inw_zombie ||
+		    watch->inw_fired || !watch->inw_active) {
+			mutex_exit(&state->ins_lock);
+			return;
+		}
+	} else {
+		if (!watch->inw_active)
+			return;
+
+		VERIFY(MUTEX_HELD(&state->ins_lock));
+	}
+
+	/*
+	 * If this is an operation on a directory and it's a child event
+	 * (even if it's not on a child), we specify IN_ISDIR.
+	 */
+	if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
+		mask |= IN_ISDIR;
+
+	if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
+		cookie = (uint32_t)curthread->t_did;
+
+	if (state->ins_nevents >= state->ins_maxevents) {
+		/*
+		 * We're at our maximum number of events -- turn our event
+		 * into an IN_Q_OVERFLOW event, which will be coalesced if
+		 * it's already the tail event.  The overflow event carries
+		 * no name; were the name kept, overflow events for
+		 * different names would not coalesce with one another and
+		 * the queue could grow without bound.
+		 */
+		mask = IN_Q_OVERFLOW;
+		wd = (uint32_t)-1;
+		cookie = 0;
+		name = NULL;
+	}
+
+	/*
+	 * Determine the (padded) length of the name before the coalescing
+	 * check below, which compares it against the tail event.
+	 */
+	if (name != NULL) {
+		len = strlen(name) + 1;
+		len = roundup(len, sizeof (struct inotify_event));
+	}
+
+	if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
+	    tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
+	    ((tail->ine_event.len == 0 && len == 0) ||
+	    (name != NULL && tail->ine_event.len != 0 &&
+	    strcmp(tail->ine_event.name, name) == 0))) {
+		/*
+		 * This is an implicitly coalesced event; we're done.
+		 */
+		if (!removal)
+			mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
+	event->ine_event.wd = wd;
+	event->ine_event.mask = (uint32_t)mask;
+	event->ine_event.cookie = cookie;
+	event->ine_event.len = len;
+
+	if (name != NULL)
+		strcpy(event->ine_event.name, name);
+
+	if (tail != NULL) {
+		tail->ine_next = event;
+	} else {
+		/* The queue was empty; wake anyone waiting to read. */
+		VERIFY(state->ins_head == NULL);
+		state->ins_head = event;
+		cv_broadcast(&state->ins_cv);
+	}
+
+	state->ins_tail = event;
+	state->ins_nevents++;
+	state->ins_size += sizeof (event->ine_event) + len;
+
+	if (removal)
+		return;
+
+	if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
+		/*
+		 * If this is a one-shot, we need to remove the watch.  (Note
+		 * that this will recurse back into inotify_watch_event() to
+		 * fire the IN_IGNORED event -- but with "removal" set.)
+		 */
+		watch->inw_fired = 1;
+		inotify_watch_remove(state, watch);
+	}
+
+	mutex_exit(&state->ins_lock);
+	pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+}
+
+/*
+ * Free a watch along with its name, if it has one.  The caller must hold
+ * the watch's lock and the watch must be down to exactly one reference; the
+ * lock is freed along with the watch rather than being dropped.
+ */
+static void
+inotify_watch_destroy(inotify_watch_t *watch)
+{
+	char *name = watch->inw_name;
+
+	VERIFY(MUTEX_HELD(&watch->inw_lock));
+
+	if (name != NULL)
+		kmem_free(name, strlen(name) + 1);
+
+	kmem_free(watch, sizeof (*watch));
+}
+
+/*
+ * Zombify a watch.  By the time we come in here, it must be true that the
+ * watch has already been fem_uninstall()'d -- the only reference should be
+ * in the state's data structure.  If we can get away with freeing it, we'll
+ * do that -- but if the reference count is greater than one due to an active
+ * vnode operation, we'll put this watch on the zombie list on the state
+ * structure.
+ *
+ * The caller must hold the state's ins_lock, and the watch must not already
+ * be a zombie.
+ */
+static void
+inotify_watch_zombify(inotify_watch_t *watch)
+{
+	inotify_state_t *state = watch->inw_state;
+
+	VERIFY(MUTEX_HELD(&state->ins_lock));
+	VERIFY(!watch->inw_zombie);
+
+	watch->inw_zombie = 1;
+
+	if (watch->inw_parent != NULL) {
+		/*
+		 * A child watch gives back the hold that it took on its
+		 * parent when it was added.
+		 */
+		inotify_watch_release(watch->inw_parent);
+	} else {
+		/*
+		 * A top-level watch is removed from both of the state's
+		 * trees and its watch descriptor is returned to the arena.
+		 */
+		avl_remove(&state->ins_byvp, watch);
+		avl_remove(&state->ins_bywd, watch);
+		vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
+		watch->inw_wd = -1;
+	}
+
+	mutex_enter(&watch->inw_lock);
+
+	if (watch->inw_refcnt == 1) {
+		/*
+		 * There are no operations in flight and there is no way
+		 * for anyone to discover this watch -- we can destroy it.
+		 * (The watch's lock is freed with it, so there is no
+		 * mutex_exit() on this path.)
+		 */
+		inotify_watch_destroy(watch);
+	} else {
+		/*
+		 * There are operations in flight; we will need to enqueue
+		 * this for later destruction.  Note that inw_parent is
+		 * reused here as the link for the zombie list.
+		 */
+		watch->inw_parent = state->ins_zombies;
+		state->ins_zombies = watch;
+		mutex_exit(&watch->inw_lock);
+	}
+}
+
+static inotify_watch_t *
+inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent,
+    const char *name, vnode_t *vp, uint32_t mask)
+{
+	inotify_watch_t *watch;
+	int err;
+
+	VERIFY(MUTEX_HELD(&state->ins_lock));
+
+	watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP);
+
+	watch->inw_vp = vp;
+	watch->inw_mask = mask;
+	watch->inw_state = state;
+	watch->inw_refcnt = 1;
+
+	if (parent == NULL) {
+		watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds,
+		    1, VM_BESTFIT | VM_SLEEP);
+		avl_add(&state->ins_byvp, watch);
+		avl_add(&state->ins_bywd, watch);
+
+		avl_create(&watch->inw_children,
+		    (int(*)(const void *, const void *))inotify_watch_cmpvp,
+		    sizeof (inotify_watch_t),
+		    offsetof(inotify_watch_t, inw_byvp));
+	} else {
+		VERIFY(name != NULL);
+		inotify_watch_hold(parent);
+		watch->inw_mask &= IN_CHILD_EVENTS;
+		watch->inw_parent = parent;
+		watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+		strcpy(watch->inw_name, name);
+
+		avl_add(&parent->inw_children, watch);
+	}
+
+	/*
+	 * Add our monitor to the vnode.  We must not have the watch lock held
+	 * when we do this, as it will immediately hold our watch.
+	 */
+	err = fem_install(vp, inotify_femp, watch, OPARGUNIQ,
+	    (void (*)(void *))inotify_watch_hold,
+	    (void (*)(void *))inotify_watch_release);
+
+	VERIFY(err == 0);
+
+	return (watch);
+}
+
+/*
+ * Remove a (non-child) watch.  This is called from either synchronous context
+ * via inotify_rm_watch() or monitor context via either a vnevent or a
+ * one-shot.
+ */
+static void
+inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
+{
+	inotify_watch_t *child;
+	int err;
+
+	VERIFY(MUTEX_HELD(&state->ins_lock));
+	VERIFY(watch->inw_parent == NULL);
+
+	err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
+	VERIFY(err == 0);
+
+	/*
+	 * If we have children, we're going to remove them all and set them
+	 * all to be zombies.
+	 */
+	while ((child = avl_first(&watch->inw_children)) != NULL) {
+		VERIFY(child->inw_parent == watch);
+		avl_remove(&watch->inw_children, child);
+
+		err = fem_uninstall(child->inw_vp, inotify_femp, child);
+		VERIFY(err == 0);
+
+		/*
+		 * If this child watch has been orphaned, remove it from the
+		 * state's list of orphans.
+		 */
+		if (child->inw_orphaned) {
+			list_remove(&state->ins_orphans, child);
+			crfree(child->inw_cred);
+		}
+
+		VN_RELE(child->inw_vp);
+
+		/*
+		 * We're down (or should be down) to a single reference to
+		 * this child watch; it's safe to zombify it.
+		 */
+		inotify_watch_zombify(child);
+	}
+
+	inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
+	VN_RELE(watch->inw_vp);
+
+	/*
+	 * It's now safe to zombify the watch -- we know that the only reference
+	 * can come from operations in flight.
+	 */
+	inotify_watch_zombify(watch);
+}
+
+/*
+ * Delete a watch.  Should only be called from VOP context.
+ */
+static void
+inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
+{
+	inotify_state_t *state = watch->inw_state;
+	inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
+	int err;
+
+	if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
+		return;
+
+	mutex_enter(&state->ins_lock);
+
+	if (watch->inw_zombie) {
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	if ((parent = watch->inw_parent) == NULL) {
+		if (event == IN_DELETE_SELF) {
+			/*
+			 * If we're here because we're being deleted and we
+			 * are not a child watch, we need to delete the entire
+			 * watch, children and all.
+			 */
+			inotify_watch_remove(state, watch);
+		}
+
+		mutex_exit(&state->ins_lock);
+		return;
+	} else {
+		if (event == IN_DELETE_SELF &&
+		    !(parent->inw_mask & IN_EXCL_UNLINK)) {
+			/*
+			 * This is a child watch for a file that is being
+			 * removed and IN_EXCL_UNLINK has not been specified;
+			 * indicate that it is orphaned and add it to the list
+			 * of orphans.  (This list will be checked by the
+			 * cleaning cyclic to determine when the watch has
+			 * become the only hold on the vnode, at which point
+			 * the watch can be zombified.)  Note that we check
+			 * if the watch is orphaned before we orphan it:  hard
+			 * links make it possible for VE_REMOVE to be called
+			 * multiple times on the same vnode. (!)
+			 */
+			if (!watch->inw_orphaned) {
+				watch->inw_orphaned = 1;
+				watch->inw_cred = CRED();
+				crhold(watch->inw_cred);
+				list_insert_head(&state->ins_orphans, watch);
+			}
+
+			mutex_exit(&state->ins_lock);
+			return;
+		}
+
+		if (watch->inw_orphaned) {
+			/*
+			 * If we're here, a file was orphaned and then later
+			 * moved -- which almost certainly means that hard
+			 * links are on the scene.  We choose the orphan over
+			 * the move because we don't want to spuriously
+			 * drop events if we can avoid it.
+			 */
+			crfree(watch->inw_cred);
+			list_remove(&state->ins_orphans, watch);
+		}
+	}
+
+	if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
+		/*
+		 * This watch has already been deleted from the parent.
+		 */
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	avl_remove(&parent->inw_children, watch);
+	err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
+	VERIFY(err == 0);
+
+	VN_RELE(watch->inw_vp);
+
+	/*
+	 * It's now safe to zombify the watch -- which won't actually delete
+	 * it as we know that the reference count is greater than 1.
+	 */
+	inotify_watch_zombify(watch);
+	mutex_exit(&state->ins_lock);
+}
+
+/*
+ * Insert a new child watch.  Should only be called from VOP context when
+ * a child is created in a watched directory.
+ */
+static void
+inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
+{
+	inotify_state_t *state = watch->inw_state;
+	inotify_watch_t key = { .inw_vp = vp };
+	inotify_watch_t *child;
+
+	if (!(watch->inw_mask & IN_CHILD_EVENTS))
+		return;
+
+	mutex_enter(&state->ins_lock);
+
+	/*
+	 * Only a live, top-level watch gains a child, and only when a vnode
+	 * was supplied that is not already among its children.  On success
+	 * the new child watch keeps the vnode hold taken here.
+	 */
+	if (!watch->inw_zombie && watch->inw_parent == NULL && vp != NULL &&
+	    avl_find(&watch->inw_children, &key, NULL) == NULL) {
+		VN_HOLD(vp);
+		child = inotify_watch_add(state, watch, name, vp,
+		    watch->inw_mask);
+		VERIFY(child != NULL);
+	}
+
+	mutex_exit(&state->ins_lock);
+}
+
+
+static int
+inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
+    int32_t *wdp)
+{
+	inotify_watch_t *watch, cmp = { .inw_vp = vp };
+	uint32_t set;
+
+	set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;
+
+	/*
+	 * Lookup our vnode to determine if we already have a watch on it.
+	 */
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+		/*
+		 * We don't have this watch; allocate a new one, provided that
+		 * we have fewer than our limit.
+		 */
+		if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
+			mutex_exit(&state->ins_lock);
+			return (ENOSPC);
+		}
+
+		VN_HOLD(vp);
+		watch = inotify_watch_add(state, NULL, NULL, vp, set);
+		*wdp = watch->inw_wd;
+		mutex_exit(&state->ins_lock);
+
+		return (0);
+	}
+
+	VERIFY(!watch->inw_zombie);
+
+	if (!(mask & IN_MASK_ADD)) {
+		/*
+		 * Note that if we're resetting our event mask and we're
+		 * transitioning from an event mask that includes child events
+		 * to one that doesn't, there will be potentially some stale
+		 * child watches.  This is basically fine:  they won't fire,
+		 * and they will correctly be removed when the watch is
+		 * removed.
+		 */
+		watch->inw_mask = 0;
+	}
+
+	watch->inw_mask |= set;
+
+	*wdp = watch->inw_wd;
+
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+static int
+inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
+{
+	inotify_watch_t *watch, cmp = { .inw_vp = vp };
+	vnode_t *cvp;
+	int err;
+
+	/*
+	 * Verify that the specified child doesn't have a directory component
+	 * within it.
+	 */
+	if (strchr(name, '/') != NULL)
+		return (EINVAL);
+
+	/*
+	 * Lookup the underlying file.  Note that this will succeed even if
+	 * we don't have permissions to actually read the file.
+	 */
+	if ((err = lookupnameat(name,
+	    UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
+		return (err);
+	}
+
+	/*
+	 * Use our vnode to find our watch, and then add our child watch to it.
+	 */
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+		/*
+		 * This is unexpected -- it means that we don't have the
+		 * watch that we thought we had.
+		 */
+		mutex_exit(&state->ins_lock);
+		VN_RELE(cvp);
+		return (ENXIO);
+	}
+
+	/*
+	 * Now lookup the child vnode in the watch; we'll only add it if it
+	 * isn't already there.
+	 */
+	cmp.inw_vp = cvp;
+
+	if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+		mutex_exit(&state->ins_lock);
+		VN_RELE(cvp);
+		return (0);
+	}
+
+	watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
+	VERIFY(watch != NULL);
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+static int
+inotify_rm_watch(inotify_state_t *state, int32_t wd)
+{
+	inotify_watch_t key = { .inw_wd = wd }, *target;
+	int rval = 0;
+
+	mutex_enter(&state->ins_lock);
+	/* An unknown descriptor is an error; a known one is torn down. */
+	if ((target = avl_find(&state->ins_bywd, &key, NULL)) != NULL)
+		inotify_watch_remove(state, target);
+	else
+		rval = EINVAL;
+
+	mutex_exit(&state->ins_lock);
+
+	return (rval);
+}
+
+static int
+inotify_activate(inotify_state_t *state, int32_t wd)
+{
+	inotify_watch_t key = { .inw_wd = wd }, *target;
+	int rval = EINVAL;
+
+	mutex_enter(&state->ins_lock);
+
+	/* Mark the watch active only if the descriptor names a watch. */
+	if ((target = avl_find(&state->ins_bywd, &key, NULL)) != NULL) {
+		target->inw_active = 1;
+		rval = 0;
+	}
+
+	mutex_exit(&state->ins_lock);
+
+	return (rval);
+}
+
+/*
+ * Called periodically as a cyclic to process the orphans and zombies.
+ */
+static void
+inotify_clean(void *arg)
+{
+	inotify_state_t *state = arg;
+	inotify_watch_t *watch, *parent, *next, **prev;
+	cred_t *savecred;
+	int err;
+
+	mutex_enter(&state->ins_lock);
+
+	for (watch = list_head(&state->ins_orphans);
+	    watch != NULL; watch = next) {
+		next = list_next(&state->ins_orphans, watch);
+
+		VERIFY(!watch->inw_zombie);
+		VERIFY((parent = watch->inw_parent) != NULL);
+
+		if (watch->inw_vp->v_count > 1)
+			continue;
+
+		avl_remove(&parent->inw_children, watch);
+		err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
+		VERIFY(err == 0);
+
+		list_remove(&state->ins_orphans, watch);
+
+		/*
+		 * For purposes of releasing the vnode, we need to switch our
+		 * cred to be the cred of the orphaning thread (which we held
+		 * at the time this watch was orphaned).
+		 */
+		savecred = curthread->t_cred;
+		curthread->t_cred = watch->inw_cred;
+		VN_RELE(watch->inw_vp);
+		crfree(watch->inw_cred);
+		curthread->t_cred = savecred;
+
+		inotify_watch_zombify(watch);
+	}
+
+	prev = &state->ins_zombies;
+
+	while ((watch = *prev) != NULL) {
+		mutex_enter(&watch->inw_lock);
+
+		if (watch->inw_refcnt == 1) {
+			*prev = watch->inw_parent;
+			inotify_watch_destroy(watch);
+			continue;
+		}
+
+		prev = &watch->inw_parent;
+		mutex_exit(&watch->inw_lock);
+	}
+
+	mutex_exit(&state->ins_lock);
+}
+
+/*ARGSUSED*/
+static int
+inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+	inotify_state_t *state;
+	major_t major = getemajor(*devp);
+	minor_t minor = getminor(*devp);
+	int instances = 0;
+	char c[64];
+
+	if (minor != INOTIFYMNRN_INOTIFY)
+		return (ENXIO);
+
+	mutex_enter(&inotify_lock);
+
+	for (state = inotify_state; state != NULL; state = state->ins_next) {
+		if (state->ins_cred == cred_p)
+			instances++;
+	}
+
+	if (instances >= inotify_maxinstances) {
+		mutex_exit(&inotify_lock);
+		return (EMFILE);
+	}
+
+	minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
+	    VM_BESTFIT | VM_SLEEP);
+
+	if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
+		vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+		mutex_exit(&inotify_lock);
+		return (ENOMEM);	/* 0 would report success */
+	}
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+	*devp = makedevice(major, minor);
+
+	crhold(cred_p);
+	state->ins_cred = cred_p;
+	state->ins_next = inotify_state;
+	inotify_state = state;
+
+	(void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
+	state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
+	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+
+	avl_create(&state->ins_bywd,
+	    (int(*)(const void *, const void *))inotify_watch_cmpwd,
+	    sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_bywd));
+
+	avl_create(&state->ins_byvp,
+	    (int(*)(const void *, const void *))inotify_watch_cmpvp,
+	    sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_byvp));
+
+	list_create(&state->ins_orphans, sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_orphan));
+
+	state->ins_maxwatches = inotify_maxwatches;
+	state->ins_maxevents = inotify_maxevents;
+
+	mutex_exit(&inotify_lock);
+
+	state->ins_cleaner = ddi_periodic_add(inotify_clean,
+	    state, NANOSEC, DDI_IPL_0);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
+{
+	inotify_state_t *state;
+	inotify_kevent_t *event;
+	minor_t minor = getminor(dev);
+	int err = 0, nevents = 0;
+	size_t len;
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	mutex_enter(&state->ins_lock);
+
+	while (state->ins_head == NULL) {
+		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+			mutex_exit(&state->ins_lock);
+			return (EAGAIN);
+		}
+
+		if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
+			mutex_exit(&state->ins_lock);
+			return (EINTR);
+		}
+	}
+
+	/*
+	 * We have events and we have our lock; return as many as we can.
+	 */
+	while ((event = state->ins_head) != NULL) {
+		len = sizeof (event->ine_event) + event->ine_event.len;
+
+		if (uio->uio_resid < len) {
+			if (nevents == 0)
+				err = EINVAL;
+			break;
+		}
+
+		nevents++;
+
+		if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
+			break;
+
+		VERIFY(state->ins_nevents > 0);
+		state->ins_nevents--;
+
+		VERIFY(state->ins_size > 0);
+		state->ins_size -= len;
+
+		if ((state->ins_head = event->ine_next) == NULL) {
+			VERIFY(event == state->ins_tail);
+			VERIFY(state->ins_nevents == 0);
+			state->ins_tail = NULL;
+		}
+
+		kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+	}
+
+	mutex_exit(&state->ins_lock);
+
+	return (err);
+}
+
+/*ARGSUSED*/
+static int
+inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	inotify_state_t *state;
+
+	state = ddi_get_soft_state(inotify_softstate, getminor(dev));
+
+	mutex_enter(&state->ins_lock);
+
+	if (state->ins_head == NULL) {
+		/* Empty queue: report nothing, and offer our pollhead. */
+		*reventsp = 0;
+		if (!anyyet)
+			*phpp = &state->ins_pollhd;
+	} else {
+		/* Queued events: report whichever read bits were asked for. */
+		*reventsp = events & (POLLRDNORM | POLLIN);
+	}
+
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+	inotify_state_t *state;
+	minor_t minor = getminor(dev);
+	file_t *fp;
+	int rval;
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	switch (cmd) {
+	case INOTIFYIOC_ADD_WATCH: {
+		inotify_addwatch_t addwatch;
+		file_t *fp;
+
+		if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
+			return (EFAULT);
+
+		if ((fp = getf(addwatch.inaw_fd)) == NULL)
+			return (EBADF);
+
+		rval = inotify_add_watch(state, fp->f_vnode,
+		    addwatch.inaw_mask, rv);
+
+		releasef(addwatch.inaw_fd);
+		return (rval);
+	}
+
+	case INOTIFYIOC_ADD_CHILD: {
+		inotify_addchild_t addchild;
+		char name[MAXPATHLEN];
+
+		if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
+			return (EFAULT);
+
+		if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
+			return (EFAULT);
+
+		if ((fp = getf(addchild.inac_fd)) == NULL)
+			return (EBADF);
+
+		rval = inotify_add_child(state, fp->f_vnode, name);
+
+		releasef(addchild.inac_fd);
+		return (rval);
+	}
+
+	case INOTIFYIOC_RM_WATCH:
+		return (inotify_rm_watch(state, arg));
+
+	case INOTIFYIOC_ACTIVATE:
+		return (inotify_activate(state, arg));
+
+	case FIONREAD: {
+		int32_t size;
+
+		mutex_enter(&state->ins_lock);
+		size = state->ins_size;
+		mutex_exit(&state->ins_lock);
+
+		if (copyout(&size, (void *)arg, sizeof (size)) != 0)
+			return (EFAULT);
+
+		return (0);
+	}
+
+	default:
+		break;
+	}
+
+	return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+{
+	inotify_state_t *state, **sp;
+	inotify_watch_t *watch, *zombies;
+	inotify_kevent_t *event;
+	minor_t minor = getminor(dev);
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	if (state->ins_pollhd.ph_list != NULL) {
+		pollwakeup(&state->ins_pollhd, POLLERR);
+		pollhead_clean(&state->ins_pollhd);
+	}
+
+	mutex_enter(&state->ins_lock);
+
+	/*
+	 * First, destroy all of our watches.
+	 */
+	while ((watch = avl_first(&state->ins_bywd)) != NULL)
+		inotify_watch_remove(state, watch);
+
+	/*
+	 * And now destroy our event queue.
+	 */
+	while ((event = state->ins_head) != NULL) {
+		state->ins_head = event->ine_next;
+		kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+	}
+
+	zombies = state->ins_zombies;
+	state->ins_zombies = NULL;
+	mutex_exit(&state->ins_lock);
+
+	/*
+	 * Now that our state lock is dropped, we can synchronously wait on
+	 * any zombies.
+	 */
+	while ((watch = zombies) != NULL) {
+		zombies = zombies->inw_parent;
+
+		mutex_enter(&watch->inw_lock);
+
+		while (watch->inw_refcnt > 1)
+			cv_wait(&watch->inw_cv, &watch->inw_lock);
+
+		inotify_watch_destroy(watch);
+	}
+
+	if (state->ins_cleaner != NULL) {
+		ddi_periodic_delete(state->ins_cleaner);
+		state->ins_cleaner = NULL;
+	}
+
+	mutex_enter(&inotify_lock);
+
+	/*
+	 * Remove our state from our global list, and release our hold on
+	 * the cred.
+	 */
+	for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next))
+		VERIFY(*sp != NULL);
+
+	*sp = (*sp)->ins_next;
+	crfree(state->ins_cred);
+	vmem_destroy(state->ins_wds);
+
+	ddi_soft_state_free(inotify_softstate, minor);
+	vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+
+	mutex_exit(&inotify_lock);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+	mutex_enter(&inotify_lock);
+
+	if (ddi_soft_state_init(&inotify_softstate,
+	    sizeof (inotify_state_t), 0) != 0) {
+		cmn_err(CE_NOTE, "/dev/inotify failed to create soft state");
+		mutex_exit(&inotify_lock);
+		return (DDI_FAILURE);
+	}
+
+	if (ddi_create_minor_node(devi, "inotify", S_IFCHR,
+	    INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) {
+		cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node");
+		ddi_soft_state_fini(&inotify_softstate);
+		mutex_exit(&inotify_lock);
+		return (DDI_FAILURE);
+	}
+
+	if (fem_create("inotify_fem",
+	    inotify_vnodesrc_template, &inotify_femp) != 0) {
+		cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state");
+		ddi_remove_minor_node(devi, NULL);
+		ddi_soft_state_fini(&inotify_softstate);
+		mutex_exit(&inotify_lock);
+		return (DDI_FAILURE);
+	}
+
+	ddi_report_dev(devi);
+	inotify_devi = devi;
+
+	inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE,
+	    UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0,
+	    VM_SLEEP | VMC_IDENTIFIER);
+
+	mutex_exit(&inotify_lock);
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	/*
+	 * A suspend request succeeds without doing any work; any command
+	 * other than a detach is refused.
+	 */
+	if (cmd == DDI_SUSPEND)
+		return (DDI_SUCCESS);
+
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	/* Release the driver-global resources under inotify_lock. */
+	mutex_enter(&inotify_lock);
+	fem_free(inotify_femp);
+	vmem_destroy(inotify_minor);
+
+	ddi_remove_minor_node(inotify_devi, NULL);
+	inotify_devi = NULL;
+
+	ddi_soft_state_fini(&inotify_softstate);
+	mutex_exit(&inotify_lock);
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	/*
+	 * Answer with the recorded devinfo node or with instance 0.
+	 */
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*result = (void *)inotify_devi;
+		return (DDI_SUCCESS);
+	case DDI_INFO_DEVT2INSTANCE:
+		*result = (void *)0;
+		return (DDI_SUCCESS);
+	default:
+		break;
+	}
+
+	return (DDI_FAILURE);
+}
+
+static struct cb_ops inotify_cb_ops = {
+	inotify_open,		/* open */
+	inotify_close,		/* close */
+	nulldev,		/* strategy */
+	nulldev,		/* print */
+	nodev,			/* dump */
+	inotify_read,		/* read */
+	nodev,			/* write */
+	inotify_ioctl,		/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	inotify_poll,		/* poll */
+	ddi_prop_op,		/* cb_prop_op */
+	0,			/* streamtab  */
+	D_NEW | D_MP		/* Driver compatibility flag */
+};
+
+static struct dev_ops inotify_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* refcnt */
+	inotify_info,		/* get_dev_info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	inotify_attach,		/* attach */
+	inotify_detach,		/* detach */
+	nodev,			/* reset */
+	&inotify_cb_ops,	/* driver operations */
+	NULL,			/* bus operations */
+	nodev,			/* dev power */
+	ddi_quiesce_not_needed,	/* quiesce */
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,		/* module type (this is a pseudo driver) */
+	"inotify support",	/* name of module */
+	&inotify_ops,		/* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modldrv,
+	NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf
new file mode 100644
index 0000000000..ce9da6180f
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+#
+
+name="inotify" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
index 848e3470c7..c29a762e06 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
@@ -271,7 +271,7 @@ static adapter_info_t ixgbe_82599eb_cap = {
 	128,		/* default number of rx queues */
 	64,		/* maximum number of rx groups */
 	1,		/* minimum number of rx groups */
-	1,		/* default number of rx groups */
+	32,		/* default number of rx groups */
 	128,		/* maximum number of tx queues */
 	1,		/* minimum number of tx queues */
 	8,		/* default number of tx queues */
@@ -302,7 +302,7 @@ static adapter_info_t ixgbe_X540_cap = {
 	128,		/* default number of rx queues */
 	64,		/* maximum number of rx groups */
 	1,		/* minimum number of rx groups */
-	1,		/* default number of rx groups */
+	32,		/* default number of rx groups */
 	128,		/* maximum number of tx queues */
 	1,		/* minimum number of tx queues */
 	8,		/* default number of tx queues */
@@ -1792,6 +1792,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
     void *arg1, void *arg2)
 {
 	ixgbe_t *ixgbe = (ixgbe_t *)arg1;
+	int prev = ixgbe->intr_cnt;
 
 	switch (cbaction) {
 	/* IRM callback */
@@ -1805,7 +1806,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
 		if (ixgbe_intr_adjust(ixgbe, cbaction, count) !=
 		    DDI_SUCCESS) {
 			ixgbe_error(ixgbe,
-			    "IRM CB: Failed to adjust interrupts");
+			    "IRM CB: Failed to adjust interrupts [%d %d %d]",
+			    cbaction, count, prev);
 			goto cb_fail;
 		}
 		break;
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
index 8944fcbff3..25da45be39 100644
--- a/usr/src/uts/common/io/ksocket/ksocket.c
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/file.h>
@@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks)
 			cv_signal(&so->so_closing_cv);
 	}
 }
+
+int
+ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg)
+{
+	return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg));
+}
+
+void
+ksocket_krecv_unblock(ksocket_t ks)
+{
+	so_krecv_unblock(KSTOSO(ks));
+}
diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h
index ac5251540f..516a68d358 100644
--- a/usr/src/uts/common/io/ksocket/ksocket_impl.h
+++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h
@@ -22,11 +22,17 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef	_INET_KSOCKET_KSOCKET_IMPL_H
 #define	_INET_KSOCKET_KSOCKET_IMPL_H
 
+/*
+ * Note that if this relationship ever changes, the logic in ksocket_krecv_set
+ * must be updated and we must maintain local state about this on whatever the
+ * new ksocket object is.
+ */
 #define	KSTOSO(ks)	((struct sonode *)(ks))
 #define	SOTOKS(so)	((ksocket_t)(uintptr_t)(so))
 
diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c
index c9f0c63b69..5233fcd0b4 100644
--- a/usr/src/uts/common/io/ksyms.c
+++ b/usr/src/uts/common/io/ksyms.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 
@@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred)
 	char *addr;
 	void *hptr = NULL;
 	ksyms_buflist_hdr_t hdr;
+
+	/*
+	 * This device should never be visible in a zone, but if it somehow
+	 * does get created we refuse to allow the zone to use it.
+	 */
+	if (crgetzoneid(cred) != GLOBAL_ZONEID)
+		return (EACCES);
+
 	bzero(&hdr, sizeof (struct ksyms_buflist_hdr));
 	list_create(&hdr.blist, PAGESIZE,
 	    offsetof(ksyms_buflist_t, buflist_node));
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index 1d30dc3478..1bf49a5b44 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -3141,6 +3141,9 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
 	case MAC_PROP_WL_MLME:
 		minsize = sizeof (wl_mlme_t);
 		break;
+	case MAC_PROP_VN_PROMISC_FILTERED:
+		minsize = sizeof (boolean_t);
+		break;
 	}
 
 	return (valsize >= minsize);
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 8f0ec9eb67..18a6613424 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*
@@ -3263,6 +3263,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
 	mac_cb_info_t	*mcbi;
 	int rc;
 
+	if ((flags & MAC_PROMISC_FLAGS_NO_COPY) &&
+	    (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) {
+		return (EINVAL);
+	}
+
 	i_mac_perim_enter(mip);
 
 	if ((rc = mac_start((mac_handle_t)mip)) != 0) {
@@ -3271,7 +3276,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
 	}
 
 	if ((mcip->mci_state_flags & MCIS_IS_VNIC) &&
-	    type == MAC_CLIENT_PROMISC_ALL) {
+	    type == MAC_CLIENT_PROMISC_ALL &&
+	    (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED)) {
 		/*
 		 * The function is being invoked by the upper MAC client
 		 * of a VNIC. The VNIC should only see the traffic
@@ -3308,6 +3314,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
 	mpip->mpi_strip_vlan_tag =
 	    ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0);
 	mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0);
+	mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0);
 
 	mcbi = &mip->mi_promisc_cb_info;
 	mutex_enter(mcbi->mcbi_lockp);
@@ -3944,15 +3951,22 @@ mac_client_get_effective_resources(mac_client_handle_t mch,
 
 static void
 mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
-    boolean_t loopback)
+    boolean_t loopback, boolean_t local)
 {
 	mblk_t *mp_copy, *mp_next;
 
-	if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) {
+	if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag ||
+	    (mpip->mpi_do_fixups && local)) {
 		mp_copy = copymsg(mp);
 		if (mp_copy == NULL)
 			return;
 
+		if (mpip->mpi_do_fixups && local) {
+			mp_copy = mac_fix_cksum(mp_copy);
+			if (mp_copy == NULL)
+				return;
+		}
+
 		if (mpip->mpi_strip_vlan_tag) {
 			mp_copy = mac_strip_vlan_tag_chain(mp_copy);
 			if (mp_copy == NULL)
@@ -4009,7 +4023,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
  */
 void
 mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
-    mac_client_impl_t *sender)
+    mac_client_impl_t *sender, boolean_t local)
 {
 	mac_promisc_impl_t *mpip;
 	mac_cb_t *mcb;
@@ -4049,8 +4063,10 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
 
 			if (is_sender ||
 			    mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
-			    is_mcast)
-				mac_promisc_dispatch_one(mpip, mp, is_sender);
+			    is_mcast) {
+				mac_promisc_dispatch_one(mpip, mp, is_sender,
+				    local);
+			}
 		}
 	}
 	MAC_PROMISC_WALKER_DCR(mip);
@@ -4079,7 +4095,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
 			mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
 			if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED &&
 			    !is_mcast) {
-				mac_promisc_dispatch_one(mpip, mp, B_FALSE);
+				mac_promisc_dispatch_one(mpip, mp, B_FALSE,
+				    B_FALSE);
 			}
 		}
 	}
@@ -4150,16 +4167,15 @@ mac_info_get(const char *name, mac_info_t *minfop)
 /*
  * To get the capabilities that MAC layer cares about, such as rings, factory
  * mac address, vnic or not, it should directly invoke this function.  If the
- * link is part of a bridge, then the only "capability" it has is the inability
- * to do zero copy.
+ * link is part of a bridge, then the link is unable to do zero copy.
  */
 boolean_t
 i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
 {
 	mac_impl_t *mip = (mac_impl_t *)mh;
 
-	if (mip->mi_bridge_link != NULL)
-		return (cap == MAC_CAPAB_NO_ZCOPY);
+	if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY)
+		return (B_TRUE);
 	else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB)
 		return (mip->mi_getcapab(mip->mi_driver, cap, cap_data));
 	else
@@ -4338,7 +4354,13 @@ mac_addr_len(mac_handle_t mh)
 boolean_t
 mac_is_vnic(mac_handle_t mh)
 {
-	return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC);
+	return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0);
+}
+
+boolean_t
+mac_is_overlay(mac_handle_t mh)
+{
+	return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0);
 }
 
 mac_handle_t
@@ -5552,3 +5574,23 @@ mac_client_set_rings(mac_client_handle_t mch, int rxrings, int txrings)
 		mrp->mrp_ntxrings = txrings;
 	}
 }
+
+/*
+ * Report whether MPT_FLAG_PROMISC_FILTERED is set in the client's
+ * mci_protect_flags.  The masked value is normalized so that callers always
+ * receive exactly B_TRUE or B_FALSE, never the raw flag bit.
+ */
+boolean_t
+mac_get_promisc_filtered(mac_client_handle_t mch)
+{
+	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
+
+	return ((mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED) != 0);
+}
+
+/*
+ * Set or clear MPT_FLAG_PROMISC_FILTERED in the client's mci_protect_flags
+ * according to 'enable'.  The perimeter of the client's MAC is asserted to
+ * be held.
+ */
+void
+mac_set_promisc_filtered(mac_client_handle_t mch, boolean_t enable)
+{
+	mac_client_impl_t	*client = (mac_client_impl_t *)mch;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)client->mci_mip));
+
+	if (!enable) {
+		client->mci_protect_flags &= ~MPT_FLAG_PROMISC_FILTERED;
+		return;
+	}
+	client->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED;
+}
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 14d94981cd..0459506784 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -603,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
  *
  * TODO: Cleanup and tighten some of the assumptions.
  */
+boolean_t mac_check_overlay = B_TRUE;
 boolean_t mac_use_bw_heuristic = B_TRUE;
 static int
 mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
@@ -610,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
 	uint64_t cpu_speed, bw = 0;
 	int srings = 0;
 	boolean_t bw_enabled = B_FALSE;
+	mac_client_impl_t *mcip = flent->fe_mcip;
 
 	ASSERT(!(flent->fe_type & FLOW_USER));
 	if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
@@ -637,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
 			 */
 			if (mac_soft_ring_enable)
 				srings = srings * 2;
+		} else if (mac_check_overlay == B_TRUE &&
+		    (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) {
+			/* Is this a VNIC on an overlay? */
+			mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
+			if (mac_is_overlay(mh) == B_TRUE) {
+				srings = mac_rx_soft_ring_10gig_count;
+			}
 		}
+
+
 	} else {
 		/*
 		 * Soft ring computation using CPU speed and specified
diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c
index 805b5d36f9..da83dc643e 100644
--- a/usr/src/uts/common/io/mac/mac_protect.c
+++ b/usr/src/uts/common/io/mac/mac_protect.c
@@ -2576,6 +2576,9 @@ mac_protect_init(mac_client_impl_t *mcip)
 	    sizeof (dhcpv6_addr_t), offsetof(dhcpv6_addr_t, da_node));
 	avl_create(&mcip->mci_v6_slaac_ip, compare_slaac_ip,
 	    sizeof (slaac_addr_t), offsetof(slaac_addr_t, sla_node));
+
+	if (mcip->mci_state_flags & MCIS_IS_VNIC)
+		mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED;
 }
 
 void
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index 57d1996d84..98b770786a 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -350,6 +351,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp)
 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
 		mip->mi_state_flags |= MIS_IS_AGGR;
 
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
+		mip->mi_state_flags |= MIS_IS_OVERLAY;
+
 	mac_addr_factory_init(mip);
 
 	/*
@@ -670,7 +674,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
 	mac_impl_t *mip = (mac_impl_t *)mh;
 
 	if (mip->mi_promisc_list != NULL)
-		mac_promisc_dispatch(mip, mp, NULL);
+		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
 }
 
 /*
@@ -691,7 +695,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
 	 * this MAC, pass them a copy if appropriate.
 	 */
 	if (mip->mi_promisc_list != NULL)
-		mac_promisc_dispatch(mip, mp_chain, NULL);
+		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
 
 	if (mr != NULL) {
 		/*
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index 148f739d52..0e2cb864c9 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -1370,7 +1370,7 @@ int mac_srs_worker_wakeup_ticks = 0;
  * said, the constant is left as a static variable to allow it to be
  * dynamically tuned in the field if and as needed.
  */
-static uintptr_t mac_rx_srs_stack_needed = 10240;
+static uintptr_t mac_rx_srs_stack_needed = 13312;
 static uint_t mac_rx_srs_stack_toodeep;
 
 #ifndef STACK_GROWTH_DOWN
@@ -2310,7 +2310,7 @@ check_again:
 				if (smcip->mci_mip->mi_promisc_list != NULL) {
 					mutex_exit(lock);
 					mac_promisc_dispatch(smcip->mci_mip,
-					    head, NULL);
+					    head, NULL, B_FALSE);
 					mutex_enter(lock);
 				}
 			}
@@ -4450,8 +4450,10 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
 				 * check is done inside the MAC_TX()
 				 * macro.
 				 */
-				if (mip->mi_promisc_list != NULL)
-					mac_promisc_dispatch(mip, mp, src_mcip);
+				if (mip->mi_promisc_list != NULL) {
+					mac_promisc_dispatch(mip, mp, src_mcip,
+					    B_TRUE);
+				}
 
 				do_switch = ((src_mcip->mci_state_flags &
 				    dst_mcip->mci_state_flags &
diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c
index 31972f94d8..c1a5c9c069 100644
--- a/usr/src/uts/common/io/mac/mac_stat.c
+++ b/usr/src/uts/common/io/mac/mac_stat.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname,
 	kstat_t		*ksp;
 	kstat_named_t	*knp;
 
-	ksp = kstat_create(modname, 0, statname, "net",
-	    KSTAT_TYPE_NAMED, count, 0);
+	ksp = kstat_create_zone(modname, 0, statname, "net",
+	    KSTAT_TYPE_NAMED, count, 0, getzoneid());
 
 	if (ksp == NULL)
 		return (NULL);
@@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip)
 	major_t		major = getmajor(mip->mi_phy_dev);
 
 	count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount;
-	ksp = kstat_create((const char *)ddi_major_to_name(major),
+	ksp = kstat_create_zone((const char *)ddi_major_to_name(major),
 	    getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME,
-	    MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0);
+	    MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid());
 	if (ksp == NULL)
 		return;
 
diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c
index cdbeb0d422..8955b3d935 100644
--- a/usr/src/uts/common/io/mem.c
+++ b/usr/src/uts/common/io/mem.c
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*
@@ -221,10 +221,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
 	switch (getminor(*devp)) {
 	case M_NULL:
 	case M_ZERO:
+		/* standard devices */
+		break;
+
 	case M_MEM:
 	case M_KMEM:
 	case M_ALLKMEM:
-		/* standard devices */
+		/*
+		 * These devices should never be visible in a zone, but if they
+		 * somehow do get created we refuse to allow the zone to use
+		 * them.
+		 */
+		if (crgetzoneid(cred) != GLOBAL_ZONEID)
+			return (EACCES);
 		break;
 
 	default:
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf
index cfda434e23..6c585c6a42 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas.conf
+++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf
@@ -13,3 +13,11 @@
 # Fast-Path specific flag. Default is "yes".
 # mrsas-enable-fp="yes";
 
+flow_control="dmult" queue="qsort" tape="sctp";
+
+# MSI specific flag. To enable MSI modify the flag value to "yes"
+mrsas-enable-msi="yes";
+
+# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes"
+mrsas-enable-fp="yes";
+
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..187088ff34
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2014, Thales UK Limited
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..cde8b65b37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+NFAST CRYPTO ACCELERATOR DRIVER
diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h
new file mode 100644
index 0000000000..b9021942b2
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/autoversion.h
@@ -0,0 +1,21 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/* AUTOGENERATED - DO NOT EDIT */
+#ifndef AUTOVERSION_H
+#define AUTOVERSION_H
+
+#define VERSION_RELEASEMAJOR 2
+#define VERSION_RELEASEMINOR 26
+#define VERSION_RELEASEPATCH 40
+#define VERSION_NO "2.26.40cam999"
+#define VERSION_COMPNAME "nfdrv"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c
new file mode 100644
index 0000000000..a04b1fd5b0
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/drvlist.c
@@ -0,0 +1,19 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_cmd.h"
+
+const nfpcmd_dev *nfp_drvlist[] = {
+  &i21285_cmddev,
+  &i21555_cmddev,
+  NULL
+};
+
diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c
new file mode 100644
index 0000000000..684be703ea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/hostif.c
@@ -0,0 +1,1192 @@
+/*
+
+hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh  Original solaris 2.6
+21/05/1999 jsh  added support for solaris 2.5
+10/06/1999 jsh  added support for solaris 2.7 (32 and 64 bit)
+??/??/2001 jsh  added support for solaris 2.8 (32 and 64 bit)
+16/10/2001 jsh  moved from nfast to new structure in nfdrv
+12/02/2002 jsh  added high level interrupt support
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+
+#include "nfp.h"
+
+/* mapped memory attributes, no-swap endianness (done in higher level) */
+static struct ddi_device_acc_attr nosw_attr = {
+  DDI_DEVICE_ATTR_V0,
+  DDI_NEVERSWAP_ACC,
+  DDI_STRICTORDER_ACC
+};
+
+/* dma attributes */
+static ddi_dma_attr_t dma_attrs = {
+  DMA_ATTR_V0,            /* version number */
+  (uint64_t)0x0,          /* low address */
+  (uint64_t)0xffffffff,   /* high address */
+  (uint64_t)0xffffff,     /* DMA counter max */
+  (uint64_t)0x1,          /* alignment */
+  0x0c,                   /* burst sizes */
+  0x1,                    /* minimum transfer size */
+  (uint64_t)0x3ffffff,    /* maximum transfer size */
+  (uint64_t)0x7fff,       /* maximum segment size */
+  1,                      /* no scatter/gather lists */
+  1,                      /* granularity */
+  0                       /* DMA flags */
+};
+
+/*
+ * Debug message control
+ * Debug Levels:
+ *  0 = no messages
+ *  1 = Errors
+ *  2 = Subroutine calls & control flow
+ *  3 = I/O Data (verbose!)
+ * Can be set with adb or in the /etc/system file with
+ * "set nfp:nfp_debug=<value>"
+ */
+
+int nfp_debug= 1;
+
+static void *state_head; /* opaque handle top of state structs */
+
+static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp);
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp);
+static int nfp_release_dev( dev_info_t *dip );
+
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_strategy(struct buf *bp);
+
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp);
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+                    struct pollhead **phpp);
+
+static void nfp_wrtimeout (void *pdev);
+static void nfp_rdtimeout (void *pdev);
+
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result);
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok);
+static void nfp_write_complete_final(nfp_dev *pdev, int ok);
+
+/* nfp file ops --------------------------------------------------- */
+
+static struct cb_ops nfp_cb_ops = {
+  nfp_open,
+  nfp_close,
+  nodev,                /* no nfp_strategy */
+  nodev,                /* no print routine */
+  nodev,                /* no dump routine */
+  nfp_read,
+  nfp_write,
+  nfp_ioctl,
+  nodev,                /* no devmap routine */
+  nodev,                /* no mmap routine */
+  nodev,                /* no segmap routine */
+  nfp_chpoll,
+  ddi_prop_op,
+  0,            /* not a STREAMS driver, no cb_str routine */
+  D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */
+  CB_REV,
+  nodev,                /* aread */
+  nodev                 /* awrite */
+};
+
+static struct dev_ops nfp_ops = {
+  DEVO_REV,               /* DEVO_REV indicated by manual */
+  0,                      /* device reference count       */
+  nfp_getinfo,
+  nulldev,                /* identify */
+  nulldev,                /* probe */
+  nfp_attach,
+  nfp_detach,
+  nodev,                  /* device reset routine         */
+  &nfp_cb_ops,
+  (struct bus_ops *)0,    /* bus operations               */
+};
+
+extern struct mod_ops mod_driverops;
+static struct modldrv modldrv = {
+  &mod_driverops,
+  NFP_DRVNAME,
+  &nfp_ops,
+};
+
+static struct modlinkage modlinkage = {
+  MODREV_1,               /* MODREV_1 indicated by manual */
+  (void *)&modldrv,
+  NULL,                   /* termination of list of linkage structures */
+};
+
+/* interface resource allocation */
+
+/*
+ * nfp_alloc_pci_push: allocate the resources needed for PCI Push, if they
+ * are not already allocated.
+ *
+ * The read buffer is DMA memory obtained from ddi_dma_mem_alloc() and bound
+ * to read_dma_handle.  pdev->read_buf is only published once the handle,
+ * the memory and the binding are all in place, so read_buf is non-NULL
+ * exactly while rd_dma_ok is set and is owned by acchandle (it must never
+ * be passed to kmem_free()).
+ *
+ * Returns non-zero if the resources are available, 0 on failure.
+ */
+int nfp_alloc_pci_push( nfp_dev *pdev ) {
+  nfp_err ret;
+  uint_t cookie_count;
+  size_t real_length;
+  caddr_t dma_buf = NULL;
+
+  if( pdev->rd_dma_ok )
+    return pdev->rd_dma_ok;
+
+  /* allocate dma handle for read buffer */
+  ret = ddi_dma_alloc_handle( pdev->dip,
+                              &dma_attrs,
+                              DDI_DMA_DONTWAIT,
+                              NULL,
+                              &pdev->read_dma_handle );
+  if( ret != DDI_SUCCESS ) {
+    nfp_log( NFP_DBG1,
+             "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)",
+             ret );
+    return 0;
+  }
+
+  /* Allocate the memory for dma transfers */
+  ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr,
+                          DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL,
+                          &dma_buf, &real_length, &pdev->acchandle);
+  if (ret != DDI_SUCCESS) {
+    nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret);
+    ddi_dma_free_handle( &pdev->read_dma_handle );
+    return 0;
+  }
+
+  /* start from a zeroed buffer so stale kernel memory is never copied out */
+  bzero( dma_buf, real_length );
+
+  ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle,
+                                  NULL, /* kernel address space */
+                                  dma_buf, real_length,
+                                  DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */
+                                  DDI_DMA_DONTWAIT, NULL,
+                                  &pdev->read_dma_cookie, &cookie_count );
+  if( ret != DDI_DMA_MAPPED ) {
+    nfp_log( NFP_DBG1,
+             "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)",
+             ret);
+    goto fail_unbound;
+  }
+  if( cookie_count > 1 ) {
+    nfp_log( NFP_DBG1,
+             "nfp_alloc_pci_push: error:"
+             " ddi_dma_addr_bind_handle wants %d transfers",
+             cookie_count);
+    /* the binding must be torn down before its memory is released */
+    (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+    goto fail_unbound;
+  }
+
+  pdev->read_buf = (void *)dma_buf;
+  pdev->rd_dma_ok = 1;
+  return pdev->rd_dma_ok;
+
+fail_unbound:
+  ddi_dma_mem_free(&pdev->acchandle);
+  ddi_dma_free_handle( &pdev->read_dma_handle );
+  return 0;
+}
+
+/*
+ * nfp_free_pci_push: release the resources allocated by
+ * nfp_alloc_pci_push().  Teardown runs in the reverse order of setup:
+ * unbind the handle, free the DMA memory, then free the handle.  read_buf
+ * points into that DMA memory, so it is only cleared here, not freed
+ * separately.
+ */
+void nfp_free_pci_push( nfp_dev *pdev ) {
+  if( !pdev->rd_dma_ok )
+    return;
+
+  (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL);
+  (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+  ddi_dma_mem_free(&pdev->acchandle);
+  ddi_dma_free_handle( &pdev->read_dma_handle );
+  pdev->read_buf = NULL;
+  pdev->rd_dma_ok = 0;
+}
+
+/* include definition of nfp_set_ifvers() */
+#define nfp_ifvers NFDEV_IF_PCI_PUSH
+#include "nfp_ifvers.c"
+#undef nfp_ifvers
+
+/*--------------------*/
+/*  nfp_isr           */
+/*--------------------*/
+
+/*
+ * nfp_isr: hardware interrupt handler.  Runs the command device's isr
+ * callback under low_mutex and claims the interrupt only when that callback
+ * succeeded and reported that it handled it.
+ */
+static u_int nfp_isr( char *pdev_in ) {
+  /* LINTED: alignment */
+  nfp_dev *dev = (nfp_dev *)pdev_in;
+  nfp_err status;
+  int claimed;
+
+  nfp_log( NFP_DBG3, "nfp_isr: entered");
+
+  if( dev == NULL ) {
+    nfp_log( NFP_DBG1, "nfp_isr: cannot find dev");
+    return DDI_INTR_UNCLAIMED;
+  }
+
+  /* Serialise the callback: another CPU may enter the handler concurrently. */
+  mutex_enter(&dev->low_mutex);
+  status = dev->cmddev->isr( dev->common.cmdctx, &claimed );
+  mutex_exit(&dev->low_mutex);
+
+  if( status ) {
+    nfp_log( NFP_DBG1, "nfp_isr: failed");
+    return DDI_INTR_UNCLAIMED;
+  }
+  if( claimed )
+    return DDI_INTR_CLAIMED;
+
+  nfp_log( NFP_DBG3, "nfp_isr: unclaimed");
+  return DDI_INTR_UNCLAIMED;
+}
+
+/*
+ * nfp_soft_isr: soft interrupt handler.  nfp_read_complete() and
+ * nfp_write_complete() record a pending completion in high_read/high_write
+ * under high_mutex and call ddi_trigger_softintr(); this routine collects
+ * those flags and finishes the completions outside the mutex.
+ *
+ * Returns DDI_INTR_CLAIMED if a read or write completion was pending,
+ * DDI_INTR_UNCLAIMED otherwise.
+ */
+static u_int nfp_soft_isr( char *pdev_in ) {
+  /* LINTED: alignment */
+  nfp_dev *pdev= (nfp_dev *)pdev_in;
+  int rd, wr;
+
+  nfp_log( NFP_DBG3, "nfp_soft_isr: entered");
+
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev");
+    return DDI_INTR_UNCLAIMED;
+  }
+  rd= wr= 0;
+
+  /*
+   * Collect both flags in a single critical section.  high_mutex must be
+   * released exactly once, after high_write has been examined.
+   */
+  mutex_enter(&pdev->high_mutex);
+  if(pdev->high_read) {
+    pdev->high_read= 0;
+    rd= 1;
+  }
+  if(pdev->high_write) {
+    pdev->high_write= 0;
+    wr= 1;
+  }
+  mutex_exit(&pdev->high_mutex);
+
+  if(rd) {
+    nfp_log( NFP_DBG3, "nfp_soft_isr: read done");
+    nfp_read_complete_final(pdev, pdev->rd_ok);
+  }
+  if(wr) {
+    nfp_log( NFP_DBG3, "nfp_soft_isr: write done");
+    nfp_write_complete_final(pdev, pdev->wr_ok);
+  }
+  if( rd || wr )
+    return DDI_INTR_CLAIMED;
+
+  nfp_log( NFP_DBG2, "nfp_soft_isr: unclaimed");
+  return DDI_INTR_UNCLAIMED;
+}
+
+
+/*-------------------------*/
+/*  nfp_read               */
+/*-------------------------*/
+
+/*
+ * nfp_read_complete: note that a read has finished with status 'ok'.
+ * When high_intr is set the result is stashed under high_mutex and a soft
+ * interrupt is triggered; otherwise the completion is finished directly.
+ */
+void nfp_read_complete(nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_read_complete: entering");
+
+  if(!pdev->high_intr) {
+    nfp_read_complete_final( pdev, ok );
+  } else {
+    nfp_log(NFP_DBG2, "nfp_read_complete: high_intr");
+    mutex_enter(&pdev->high_mutex);
+    nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered");
+    if(pdev->high_read) {
+      /* a previous completion has not been consumed yet */
+      nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!");
+    }
+    pdev->high_read= 1;
+    pdev->rd_ok= ok;
+    nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex");
+    mutex_exit(&pdev->high_mutex);
+    ddi_trigger_softintr(pdev->soft_int_id);
+  }
+
+  nfp_log( NFP_DBG2,"nfp_read_complete: exiting");
+}
+
+/*
+ * nfp_read_complete_final: finish a read completion.  Cancels the read
+ * timeout if one is recorded, marks the read as no longer outstanding and
+ * ready to be collected with status 'ok', then wakes cv waiters and pollers.
+ */
+static void nfp_read_complete_final(nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: entering");
+  /* nfp_rdtimeout() zeroes rdtimeout before calling here, so it skips this */
+  if(pdev->rdtimeout)
+    (void) untimeout(pdev->rdtimeout);
+  /* a completion with no read outstanding is unexpected: log it, carry on */
+  if(!pdev->rd_outstanding) {
+    nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding");
+  }
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok);
+  /* update the read state and signal rd_cv under isr_mutex */
+  mutex_enter(&pdev->isr_mutex);
+  pdev->rd_outstanding= 0;
+  pdev->rd_ready= 1;
+  pdev->rd_ok= ok;
+  cv_broadcast(&pdev->rd_cv);
+  mutex_exit(&pdev->isr_mutex);
+  /* notify pollers (see nfp_chpoll) that data can now be read */
+  pollwakeup (&pdev->pollhead, POLLRDNORM);
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting");
+}
+
+/*
+ * nfp_rdtimeout: timeout(9F) callback armed by the ENSUREREADING ioctl.
+ * Clears the recorded timeout id and completes the read as failed.
+ */
+static void nfp_rdtimeout( void *pdev_in )
+{
+  nfp_dev *dev = (nfp_dev *)pdev_in;
+
+  nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out");
+
+  if (dev != NULL) {
+    dev->rdtimeout= 0;
+    nfp_read_complete_final(dev, 0);
+    return;
+  }
+
+  nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." );
+}
+
+/*
+ * nfp_read: read entry point.  Checks that soft state exists for the minor
+ * number, then hands the transfer to nfp_strategy() through physio().
+ * Returns ENODEV if there is no such device, otherwise physio()'s result.
+ */
+/* ARGSUSED */
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) {
+  int ret;
+  nfp_log( NFP_DBG2, "nfp_read: entered" );
+  /* a NULL soft state means no device is attached for this minor */
+  if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+    nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev");
+    return (ENODEV);
+  }
+  nfp_log( NFP_DBG2, "nfp_read: about to physio." );
+  ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+  if(ret)
+    nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+  return ret;
+}
+
+/*-------------------------*/
+/*  nfp_write              */
+/*-------------------------*/
+
+/*
+ * nfp_write_complete: note that a write has finished with status 'ok'.
+ * When high_intr is set the result is stashed under high_mutex and a soft
+ * interrupt is triggered; otherwise the completion is finished directly.
+ */
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+  if(!pdev->high_intr) {
+    nfp_write_complete_final( pdev, ok );
+  } else {
+    mutex_enter(&pdev->high_mutex);
+    if(pdev->high_write) {
+      /* a previous completion has not been consumed yet */
+      nfp_log(NFP_DBG1, "nfp_write_complete: high_write allread set!");
+    }
+    pdev->high_write= 1;
+    pdev->wr_ok= ok;
+    mutex_exit(&pdev->high_mutex);
+    ddi_trigger_softintr(pdev->soft_int_id);
+  }
+
+  nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+/*
+ * nfp_write_complete_final: finish the write recorded in pdev->wr_bp.
+ * Cancels the write timeout if one is recorded, sets the buf's residual and
+ * error from 'ok', marks the device write-ready, completes the buf with
+ * biodone() and wakes pollers.  Does nothing further if no write buf is
+ * recorded.
+ */
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+  struct buf *local_wr_bp;
+  nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+  /* nfp_wrtimeout() zeroes wrtimeout before calling here, so it skips this */
+  if(pdev->wrtimeout)
+    (void) untimeout(pdev->wrtimeout);
+
+  if (!pdev->wr_bp) {
+    nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+    return;
+  }
+
+  /* undo the bp_mapin() done in nfp_strategy() */
+  bp_mapout(pdev->wr_bp);
+  /* on failure report the whole request as not transferred */
+  pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+  /* Make sure we set wr_ready before calling biodone to avoid a race */
+  pdev->wr_ready = 1;
+  bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+  /* clear wr_bp before biodone() so the device no longer references the buf */
+  local_wr_bp = pdev->wr_bp;
+  pdev->wr_bp = 0;
+  biodone(local_wr_bp);
+  /* NOTE(review): this message mentions isr_mutex, but no mutex is held here */
+  nfp_log( NFP_DBG2, "nfp_write_complete_final: isr_mutex extited");
+  pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+  nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+/*
+ * nfp_wrtimeout: timeout(9F) callback armed by the write path of
+ * nfp_strategy().  Clears the recorded timeout id and completes the write
+ * as failed.
+ */
+static void nfp_wrtimeout( void *pdev_in )
+{
+  nfp_dev *dev = (nfp_dev *)pdev_in;
+
+  nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+  if (dev != NULL) {
+    dev->wrtimeout= 0;
+    nfp_write_complete_final(dev, 0);
+    return;
+  }
+
+  nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+}
+
+/*
+ * nfp_write: write entry point.  Checks that soft state exists for the
+ * minor number, then hands the transfer to nfp_strategy() through physio().
+ * Returns ENODEV if there is no such device, otherwise physio()'s result.
+ */
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+  int ret;
+  nfp_log( NFP_DBG2, "nfp_write: entered." );
+  if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+    nfp_log( NFP_DBG1, "nfp_write: unable to get nfp_dev.");
+    return (ENODEV);
+  }
+  nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+  ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+  if(ret)
+    nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+  return ret;
+}
+
+/*-------------------------*/
+/*  nfp_strategy           */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) \
+      nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+      (thebp)->b_resid = (thebp)->b_bcount; \
+      bioerror ((thebp), err); \
+      biodone ((thebp));
+
+/*
+ * nfp_strategy: strategy routine driven by physio() from nfp_read() and
+ * nfp_write().
+ *
+ * Read: requires a completed read (rd_ready) that succeeded (rd_ok).  For
+ * the PCI Push interface the data is copied out of the DMA read buffer,
+ * whose byte count is the little-endian word at offset 4 and whose payload
+ * starts at offset 8; otherwise the command device's read_block callback
+ * fills the buf.  The buf is completed before returning.
+ *
+ * Write: starts the transfer with the command device's write_block callback
+ * and arms a timeout; the buf is completed later by
+ * nfp_write_complete_final().
+ *
+ * Always returns 0; failures are reported through the buf.
+ */
+static int nfp_strategy(struct buf *bp) {
+  register struct nfp_dev *pdev;
+  nfp_err ne;
+  
+  nfp_log( NFP_DBG2, "nfp_strategy: entered." );
+  if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) {
+    NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev");
+    return (0);
+  }
+
+  if (bp->b_flags & B_READ) {
+    int count;
+    /* read */
+    if (!pdev->rd_ready) {
+      NFP_STRAT_ERR (bp,ENXIO,"read called when not ready");
+      return (0);
+    }
+    pdev->rd_ready=0;
+    pdev->rd_pending = 0;
+    if( !pdev->rd_ok) {
+      NFP_STRAT_ERR (bp,ENXIO,"read failed");
+      return (0);
+    }
+    /* copy data from module */
+    if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+      nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer");
+      if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS )
+      {
+        NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed");
+        return (0);
+      }
+      /* LINTED: alignment */
+      count= *(unsigned int *)(pdev->read_buf+4);
+      count= FROM_LE32_MEM(&count);
+      nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count);
+      /*
+       * The count comes from the device: reject it if it would overrun the
+       * caller's buffer or run past the payload area of the read buffer.
+       */
+      if(count<0 || count>bp->b_bcount || count>(int)(NFP_READBUF_SIZE-8)) {
+        NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device");
+        nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count);
+        return (0);
+      }
+      bp_mapin (bp);
+      bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count );
+      bp_mapout (bp);
+    } else {
+      bp_mapin (bp);
+      ne=  pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count );
+      bp_mapout (bp);
+      if( ne != NFP_SUCCESS) {
+        NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed");
+        return (0);
+      }
+    }
+    bioerror(bp, 0);
+    bp->b_resid = 0;
+    biodone (bp);
+  } else {
+    /* write */
+    if (!pdev->wr_ready) {
+      NFP_STRAT_ERR (bp,ENXIO,"write called when not ready");
+      return (0);
+    }
+    if (pdev->wr_bp) {
+      NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL");
+      return (0);
+    }
+    pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+    pdev->wr_bp = bp;
+    pdev->wr_ready = 0;
+    bp_mapin (bp);
+    ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount,  pdev->common.cmdctx);
+    if( ne != NFP_SUCCESS ) {
+      /* the write never started: undo the setup above and fail the buf */
+      bp_mapout (bp);
+      (void) untimeout(pdev->wrtimeout);
+      pdev->wr_bp = 0;
+      pdev->wr_ready = 1;
+      NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed");
+      return (0);
+    }
+  }
+  nfp_log( NFP_DBG2, "nfp_strategy: leaving");
+
+  return (0);
+}
+
+
+/*--------------------*/
+/*  poll / select     */
+/*--------------------*/
+
+/*
+ * nfp_chpoll: poll entry point.  Reports POLLWRNORM and/or POLLRDNORM when
+ * requested and the corresponding wr_ready/rd_ready flag is set.  If nothing
+ * is ready and 'anyyet' is zero, the device's pollhead is handed back
+ * through phpp.  Always returns 0.
+ */
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+                      struct pollhead **phpp) {
+  nfp_dev *dev_state = ddi_get_soft_state(state_head, getminor(dev));
+  short ready = 0;
+
+  if (dev_state == NULL) {
+    nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev");
+    *reventsp=0;
+    return (0);
+  }
+  nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events);
+
+  if ((events & POLLWRNORM) && dev_state->wr_ready) {
+    nfp_log( NFP_DBG2, "nfp_chpoll: write ready");
+    ready |= POLLWRNORM;
+  }
+
+  if ((events & POLLRDNORM) && dev_state->rd_ready) {
+    nfp_log( NFP_DBG2, "nfp_chpoll: read ready");
+    ready |= POLLRDNORM;
+  }
+
+  if (ready == 0 && anyyet == 0)
+    *phpp = &dev_state->pollhead;
+  *reventsp = ready;
+
+  nfp_log( NFP_DBG2, "nfp_chpoll: leaving");
+  return (0);
+}
+
+
+/*--------------------*/
+/*  ioctl             */
+/*--------------------*/
+
+/*
+ * nfp_ioctl: ioctl entry point.
+ *
+ *   NFDEV_IOCTL_ENQUIRY        copy out the bus/slot numbers taken from the
+ *                              first cell of the "reg" property (left at
+ *                              all-ones if the property is unavailable).
+ *   NFDEV_IOCTL_ENSUREREADING  copy in a length and ask the module to start
+ *                              a read of that size, guarded by a timeout.
+ *   NFDEV_IOCTL_PCI_IFVERS     copy in and select the interface version.
+ *   NFDEV_IOCTL_STATS          copy out the device statistics.
+ *
+ * Returns 0 on success or an errno value: ENXIO if there is no soft state
+ * for the minor, EFAULT on copyin/copyout failure, EINVAL for a bad length
+ * or unknown command, EIO when a read is already outstanding or the
+ * interface version cannot be set, or the translated module error.
+ */
+/* ARGSUSED */
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) {
+  register struct nfp_dev *pdev;
+
+  nfp_log( NFP_DBG2, "nfp_ioctl: entered." );
+
+  if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
+    nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev.");
+    return (ENXIO);
+  }
+
+  switch (cmd) {
+  case NFDEV_IOCTL_ENQUIRY:
+    {
+      int *regp = NULL;
+      int reglen = 0;
+      nfdev_enquiry_str enq_data;
+
+      /* zero the reply so no uninitialised kernel stack bytes are copied out */
+      bzero( &enq_data, sizeof(enq_data) );
+      enq_data.busno = (unsigned int)-1;
+      enq_data.slotno = (unsigned char)-1;
+
+      /* get our bus and slot num */
+      if (ddi_getlongprop (DDI_DEV_T_NONE,
+                           pdev->dip, 0, "reg",
+                           (caddr_t)&regp, &reglen) == DDI_PROP_SUCCESS) {
+        nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." );
+        /* "reg" is an array of 32-bit cells; only the first one is needed */
+        if( regp != NULL && reglen >= (int)sizeof(*regp) ) {
+          enq_data.busno = ((*regp)>>16) & 0xff;
+          enq_data.slotno = ((*regp)>>11) & 0x1f;
+          nfp_log( NFP_DBG2, "busno %d, slotno %d.",
+                   enq_data.busno, enq_data.slotno );
+        }
+        /* ddi_getlongprop() allocated the value buffer; the caller frees it */
+        if( regp != NULL && reglen > 0 )
+          kmem_free( regp, reglen );
+      } else
+        nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." );
+
+      if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) {
+        nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+        return EFAULT;
+      }
+    }
+    break;
+
+  case NFDEV_IOCTL_ENSUREREADING:
+    {
+      unsigned int addr, len;
+      nfp_err  ret;
+      if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) {
+        nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+        return (EFAULT);
+      }
+      /* signal a read to the module */
+      nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len );
+      if (len>8192) {
+        nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len );
+        return EINVAL;
+      }
+      if (pdev->rd_outstanding==1) {
+        nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding.");
+        return EIO;
+      }
+
+      /* with PCI Push the module DMAs into our read buffer; otherwise addr is 0 */
+      addr= 0;
+      if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+        if( len > NFP_READBUF_SIZE ) {
+          nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len );
+          return EINVAL;
+        }
+        addr= pdev->read_dma_cookie.dmac_address;
+      }
+
+      pdev->rd_outstanding = 1;
+      nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1");
+
+      /* setup timeout timer */
+      pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+
+      nfp_log( NFP_DBG2, "nfp_ioctl: read request");
+      ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx);
+      if ( ret != NFP_SUCCESS ) {
+        /* the request never started: cancel the timer and clear the state */
+        (void) untimeout(pdev->rdtimeout);
+        pdev->rdtimeout = 0;
+        pdev->rd_outstanding = 0;
+        nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed ");
+        return nfp_oserr( ret );
+      }
+    }
+    break;
+
+  case NFDEV_IOCTL_PCI_IFVERS:
+    {
+      int vers;
+
+      nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS");
+
+      if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) {
+        nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+        return (EFAULT);
+      }
+
+      if( pdev->rd_outstanding ) {
+        nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers);
+        return EIO;
+      }
+
+      /* nfp_set_ifvers() reports failure by leaving ifvers different from vers */
+      nfp_set_ifvers(pdev, vers);
+      if( pdev->ifvers != vers ) {
+        nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers);
+        return EIO;
+      }
+    }
+    break;
+
+  case NFDEV_IOCTL_STATS:
+    {
+      if( ddi_copyout( (char *)&(pdev->common.stats),
+                       (void *)arg,
+                       sizeof(nfdev_stats_str),
+                       mode ) != 0 ) {
+        nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+        return EFAULT;
+      }
+    }
+    break;
+
+  default:
+    nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." );
+    return EINVAL;
+  }
+
+  return 0;
+}
+
+/*
+ * nfp_open: open(9E) entry point.  Only character opens are accepted and
+ * the device is exclusive-open: a second open fails with EBUSY.  Every
+ * open resets ifvers to NFDEV_IF_STANDARD and clears rd_ready.
+ * Returns 0 on success, otherwise a non-zero error code.
+ */
+/* ARGSUSED */
+int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+  nfp_err ret;
+  register struct nfp_dev *pdev;
+
+  nfp_log( NFP_DBG2, "entered nfp_open." );
+  pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev));
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev.");
+    return (ENODEV);
+  }
+
+  if( otyp != OTYP_CHR ) {
+    nfp_log( NFP_DBG1, "nfp_open: not opened as character device");
+    return (EINVAL);
+  }
+
+  mutex_enter(&pdev->busy_mutex);
+  if (pdev->busy) {
+    mutex_exit(&pdev->busy_mutex);
+    nfp_log( NFP_DBG1, "nfp_open: device busy");
+    return EBUSY;
+  }
+  pdev->busy= 1;
+  mutex_exit(&pdev->busy_mutex);
+
+  /* use oldest possible interface until told otherwise */
+  pdev->ifvers= NFDEV_IF_STANDARD;
+  nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers);
+  pdev->rd_ready= 0; /* drop any old data */
+  ret = pdev->cmddev->open(pdev->common.cmdctx);
+  if( ret != NFP_SUCCESS ) {
+    nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed ");
+    /* a failed open gets no close(9E), so give the device back here */
+    mutex_enter(&pdev->busy_mutex);
+    pdev->busy= 0;
+    mutex_exit(&pdev->busy_mutex);
+    return nfp_oserr( ret );
+  }
+  nfp_log( NFP_DBG2, "nfp_open: done");
+  return 0;
+}
+
+/*
+ * nfp_close: close(9E) entry point.  If a read is still outstanding, waits
+ * on rd_cv for at most NFP_TIMEOUT_SEC seconds, then closes the command
+ * device and clears the exclusive-open flag set by nfp_open().
+ */
+/* ARGSUSED */
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) {
+  nfp_dev *pdev;
+  nfp_err ret;
+
+  nfp_log( NFP_DBG2, "nfp_close: entered");
+
+  pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev));
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_close: cannot find dev.");
+    return ENODEV;
+  }
+
+  mutex_enter(&pdev->isr_mutex);
+  if(pdev->rd_outstanding) {
+    int lbolt, err;
+    nfp_get_lbolt(&lbolt, err);
+    if(!err)
+      (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) );
+  }
+  mutex_exit(&pdev->isr_mutex);
+  ret = pdev->cmddev->close(pdev->common.cmdctx);
+  /* close(9E) is not retried: drop busy even on failure, or every later open gets EBUSY */
+  mutex_enter(&pdev->busy_mutex);
+  pdev->busy= 0;
+  mutex_exit(&pdev->busy_mutex);
+  if (ret != NFP_SUCCESS ) {
+    nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed");
+    return nfp_oserr( ret );
+  }
+  return 0;
+}
+
+/****************************************************************************
+
+  nfp driver config
+
+ ****************************************************************************/
+
+/*
+ * nfp_getinfo: getinfo(9E) entry point.  arg carries the dev_t being
+ * queried; its minor number is used as the instance number.  Returns
+ * DDI_SUCCESS with *result filled in, or DDI_FAILURE with *result NULL.
+ */
+/* ARGSUSED */
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) {
+  int error;
+  nfp_dev *pdev;
+
+  nfp_log( NFP_DBG2, "nfp_getinfo: entered" );
+
+  switch (infocmd) {
+  case DDI_INFO_DEVT2DEVINFO:
+    pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg));
+    if (pdev == NULL) {
+      /* instance not attached: fail with the DDI return code, not an errno */
+      nfp_log( NFP_DBG1, "nfp_getinfo: cannot find dev.");
+      *result = NULL;
+      error = DDI_FAILURE;
+    } else {
+      /*
+       * don't need to use a MUTEX even though we are
+       * accessing our instance structure; dev->dip
+       * never changes.
+       */
+      *result = pdev->dip;
+      error = DDI_SUCCESS;
+    }
+    break;
+  case DDI_INFO_DEVT2INSTANCE:
+    /* needs no soft state, so it also works while the instance is unattached */
+    *result = (void *)(uintptr_t)getminor((dev_t)arg);
+    error = DDI_SUCCESS;
+    break;
+  default:
+    *result = NULL;
+    error = DDI_FAILURE;
+    break;
+  }
+
+  nfp_log( NFP_DBG2, "nfp_getinfo: leaving." );
+  return (error);
+}
+
+/*
+ * nfp_release_dev: tear down what nfp_attach() set up for dip's instance.
+ * Returns DDI_SUCCESS, or nfp_oserr() of a cmddev->destroy failure (the
+ * remaining teardown steps are then skipped). */
+static int nfp_release_dev( dev_info_t *dip ) {
+  nfp_dev *pdev;
+  int instance, i;
+  nfp_err ret;
+
+  nfp_log( NFP_DBG2, "nfp_release_dev: entering" );
+
+  instance = ddi_get_instance(dip);
+  pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+  if (pdev) {
+    nfp_log( NFP_DBG3, "nfp_release_dev: removing device" );
+
+    nfp_free_pci_push(pdev);
+
+    if( pdev->cmddev ) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" );
+       ret = pdev->cmddev->destroy(pdev->common.cmdctx);
+       if (ret != NFP_SUCCESS) {
+         nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed ");
+         return nfp_oserr( ret );
+       }
+    }
+    /* NOTE(review): nfp_attach() stores the cookies before it initialises the locks destroyed below -- verify cleanup after a partial attach */
+    if(pdev->high_iblock_cookie) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" );
+      ddi_remove_softintr(pdev->soft_int_id);
+      ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie);
+      mutex_destroy( &pdev->busy_mutex );
+      cv_destroy( &pdev->rd_cv );
+      mutex_destroy( &pdev->isr_mutex );
+      mutex_destroy( &pdev->high_mutex );
+    } else if(pdev->iblock_cookie) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" );
+      ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie);
+      mutex_destroy( &pdev->busy_mutex );
+      cv_destroy( &pdev->rd_cv );
+      mutex_destroy( &pdev->isr_mutex );
+    }
+    if(pdev->low_iblock_cookie) { /* NOTE(review): second ddi_remove_intr() on inumber 0; nfp_attach() adds one handler -- confirm */
+      ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie);
+      mutex_destroy( &pdev->low_mutex);
+    }
+
+    for(i=0;i<6;i++) {
+      if( pdev->common.extra[i] ) {
+        nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i );
+        ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]);
+      }
+    }
+
+    ddi_remove_minor_node(dip, NULL);
+
+    if (pdev->conf_handle)
+      pci_config_teardown( &pdev->conf_handle );
+
+    ddi_soft_state_free(state_head, instance);
+  }
+  nfp_log( NFP_DBG2, "nfp_release: finished" );
+
+  return DDI_SUCCESS;
+}
+
+
+/*
+ * nfp_attach: attach(9E) entry point (DDI_ATTACH only).  Identifies the card
+ * from its PCI IDs via nfp_drvlist, maps the BARs the command device asks for,
+ * creates the minor node and command device and installs the interrupt handler.
+ * Returns DDI_SUCCESS, or DDI_FAILURE after undoing the work at bailout.
+ */
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) {
+  int instance;
+  nfp_dev *pdev = NULL;
+  int intres;
+  uint16_t device, vendor, sub_device, sub_vendor;
+  long *outp;
+  nfpcmd_dev const *cmddev;
+  int index, i;
+  nfp_err ret;
+
+  nfp_log( NFP_DBG2, "nfp_attach: entered." );
+
+  if (cmd != DDI_ATTACH) {
+    /* nothing was set up by this call; bailout would tear down a live instance */
+    nfp_log( NFP_DBG1, "nfp_attach: bad command." );
+    return DDI_FAILURE;
+  }
+
+  instance = ddi_get_instance(dip);
+  if (ddi_soft_state_zalloc(state_head, instance) != 0) {
+    nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." );
+    goto bailout;
+  }
+
+  pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_attach: cannot find dev.");
+    goto bailout;  /* attach(9E) reports DDI_FAILURE, not an errno */
+  }
+  pdev->dip = dip;
+
+  /* map in pci config registers */
+  if (pci_config_setup(dip, &pdev->conf_handle)) {
+    nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." );
+    goto bailout;
+  }
+
+  /* find out what we have got */
+  vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID );
+  device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID );
+  sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID );
+  sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID );
+
+  index= 0;
+  while( (cmddev = nfp_drvlist[index++]) != NULL ) {
+    if( cmddev->vendorid == vendor &&
+        cmddev->deviceid == device &&
+        cmddev->sub_vendorid == sub_vendor &&
+        cmddev->sub_deviceid == sub_device )
+      break;
+  }
+  if( !cmddev ) {
+    nfp_log( NFP_DBG1, "nfp_attach: unknonw device." );
+    goto bailout;
+  }
+
+  /* map BARs */
+  for( i=0; i<6; i++ ) {
+    if( cmddev->bar_sizes[i] ) {
+      off_t size;
+      if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) {
+        nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i );
+        goto bailout;
+      }
+      if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) {
+        nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) );
+        goto bailout;
+      }
+      if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i],
+                         0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) {
+        nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i );
+        goto bailout;
+      }
+      nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size );
+    }
+  }
+
+  pdev->read_buf = NULL;
+  pdev->rd_dma_ok = 0;
+
+  /* attach to minor node */
+  if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) {
+    ddi_remove_minor_node(dip, NULL);
+    nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." );
+    goto bailout;
+  }
+
+  pdev->wr_ready = 1;
+  pdev->rd_ready = 0;
+  pdev->rd_pending = 0;
+  pdev->rd_outstanding = 0;
+  pdev->busy=0;
+  pdev->cmddev= cmddev;
+
+  ret = pdev->cmddev->create(&pdev->common);
+  if( ret != NFP_SUCCESS) {
+    nfp_log( NFP_DBG1, "nfp_attach: failed to create command device");
+    goto bailout;
+  }
+  pdev->common.dev= pdev;
+
+  if (ddi_intr_hilevel(dip, 0) != 0){
+    nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt");
+    if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." );
+      goto bailout;
+    }
+    if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." );
+      goto bailout;
+    }
+    mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER,
+                (void *)pdev->high_iblock_cookie);
+    mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER,
+                (void *)pdev->low_iblock_cookie);
+    if (ddi_add_intr(dip, 0, NULL,
+                NULL, nfp_isr,
+                (caddr_t)pdev) != DDI_SUCCESS) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." );
+      goto bailout;
+    }
+    if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH,
+                &pdev->iblock_cookie) ) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." );
+      goto bailout;
+    }
+    mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER,
+                (void *)pdev->iblock_cookie);
+    if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id,
+                &pdev->iblock_cookie, NULL,
+                nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS)
+                goto bailout;
+    pdev->high_intr= 1;
+  } else {
+    nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt");
+    if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." );
+      goto bailout;
+    }
+    mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie);
+    if (ddi_add_intr(dip, 0, NULL,
+                     (ddi_idevice_cookie_t *)NULL, nfp_isr,
+                     (caddr_t)pdev) != DDI_SUCCESS) {
+      nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." );
+      goto bailout;
+    }
+  }
+  mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL );
+  cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL );
+
+  /* get our bus and slot num; only DDI_PROP_SUCCESS leaves outp/intres valid */
+  if (ddi_getlongprop (DDI_DEV_T_NONE,
+                       pdev->dip, 0, "reg",
+                       (caddr_t)&outp, &intres) == DDI_PROP_SUCCESS) {
+    nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." );
+    if( intres > 0 ) {
+      nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.",
+               ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f );
+      kmem_free(outp, intres);  /* ddi_getlongprop() buffers belong to the caller */
+    }
+  }
+
+  nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." );
+  return DDI_SUCCESS;
+
+bailout:
+  (void) nfp_release_dev( dip );
+
+  return DDI_FAILURE;
+}
+
+/*
+ * nfp_detach: detach(9E) entry point.  Only DDI_DETACH is handled; every
+ * other command is refused with DDI_FAILURE.  The instance is torn down
+ * by nfp_release_dev(), whose return value is ignored, and DDI_SUCCESS
+ * is reported.
+ */
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) {
+  int rv = DDI_FAILURE;
+
+  if (cmd == DDI_DETACH) {
+    (void) nfp_release_dev(dip);
+    rv = DDI_SUCCESS;
+  }
+
+  return (rv);
+}
+
+/*
+ * _init: loadable-module entry point.  Sets up the soft-state anchor and
+ * installs the module; the soft state is released again if mod_install()
+ * fails.  Returns 0 on success, otherwise the failing call's error code.
+ */
+int _init(void) {
+  register int error;
+
+  nfp_log( NFP_DBG2, "_init: entered" );
+  error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1);
+  if (error != 0) {
+    nfp_log( NFP_DBG1, "_init: soft_state_init() failed" );
+    return (error);
+  }
+  error = mod_install(&modlinkage);
+  if (error != 0) {
+    nfp_log( NFP_DBG1, "_init: mod_install() failed" );
+    ddi_soft_state_fini(&state_head);
+  }
+  nfp_log( NFP_DBG2, "_init: leaving" );
+  return (error);
+}
+
+/*
+ * _info: module entry point; hands modinfop to mod_info() for modlinkage.
+ */
+int _info(struct modinfo *modinfop) {
+  int rv;
+  nfp_log( NFP_DBG2, "_info: entered" );
+  rv = mod_info(&modlinkage, modinfop);
+  return (rv);
+}
+
+/*
+ * _fini: module unload entry point.  The soft-state anchor is destroyed
+ * only after mod_remove() has succeeded; its status is returned either way.
+ */
+
+int _fini(void) {
+  int status;
+
+  nfp_log( NFP_DBG2, "_fini: entered" );
+
+  status = mod_remove(&modlinkage);
+  if (status == 0) {
+    ddi_soft_state_fini(&state_head);
+    nfp_log( NFP_DBG2, "_fini: leaving" );
+  } else {
+    nfp_log( NFP_DBG2, "_fini: mod_remove() failed." );
+  }
+
+  return (status);
+}
+
diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c
new file mode 100644
index 0000000000..f51a09188d
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.c
@@ -0,0 +1,310 @@
+/*
+
+i21285.c: nCipher PCI HSM intel/digital 21285 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+09/10/2001 jsh  Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21285.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/*
+ * create: use the nfp_cdev itself as command context and write
+ * DOORBELL_ENABLE | POSTLIST_ENABLE at I21285_OFFSET_INTERRUPT_MASK.
+ */
+static nfp_err i21285_create( nfp_cdev *pdev ) {
+  unsigned int mask_le;
+
+  nfp_log( NFP_DBG2, "i21285_create: entered");
+  pdev->cmdctx= pdev;
+  nfp_log( NFP_DBG2, "i21285_create: enable doorbell");
+  if( pdev->bar[ IOBAR ] ) {
+    TO_LE32_IO( &mask_le, DOORBELL_ENABLE | POSTLIST_ENABLE);
+    nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, mask_le );
+    return NFP_SUCCESS;
+  }
+  nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR );
+  return NFP_ENOMEM;
+}
+
+/*
+ * destroy: write DOORBELL_DISABLE | POSTLIST_DISABLE at
+ * I21285_OFFSET_INTERRUPT_MASK in the I/O BAR.  ctx is the nfp_cdev.
+ * Returns NFP_ENODEV for a NULL ctx and NFP_ENOMEM for an unmapped I/O BAR.
+ */
+static nfp_err i21285_destroy( void * ctx ) {
+  nfp_cdev *cdev = (nfp_cdev *)ctx;
+  unsigned int mask_le;
+
+  nfp_log( NFP_DBG2, "i21285_destroy: entered");
+  if( cdev == NULL ) {
+    nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev");
+    return NFP_ENODEV;
+  }
+  if( !cdev->bar[ IOBAR ] ) {
+    nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+  TO_LE32_IO( &mask_le, DOORBELL_DISABLE | POSTLIST_DISABLE );
+  nfp_outl( cdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, mask_le );
+  return NFP_SUCCESS;
+}
+
+/* open: no per-open work for this command device; always NFP_SUCCESS */
+
+/* ARGSUSED */
+static nfp_err i21285_open( void * ctx ) {
+  nfp_log( NFP_DBG2, "i21285_open: entered");
+
+  return NFP_SUCCESS;
+}
+
+/* close: no per-close work for this command device; always NFP_SUCCESS */
+
+/* ARGSUSED */
+static nfp_err i21285_close( void * ctx ) {
+  nfp_log( NFP_DBG2, "i21285_close: entered");
+
+  return NFP_SUCCESS;
+}
+
+/* isr: service doorbell bits until the masked register reads 0 or 0xffff. */
+/* ctx is the nfp_cdev; *handled becomes 1 if any doorbell bit was seen.   */
+static nfp_err i21285_isr( void *ctx, int *handled ) {
+  nfp_cdev *pdev;
+  unsigned int doorbell;
+  unsigned int tmp32;
+
+  nfp_log( NFP_DBG3, "i21285_isr: entered");
+
+  *handled= 0;
+  pdev= (nfp_cdev *)ctx;
+  if(!pdev) {
+    nfp_log( NFP_DBG1, "i21285_isr: NULL pdev");
+    return NFP_ENODEV;
+  }
+  /* only the low 16 bits are examined; 0xffff presumably means the device is not responding -- confirm */
+  doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+  doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
+  while( doorbell && doorbell != 0xffff) {
+    *handled= 1;
+    /* service interrupts */
+    if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+      TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+      nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+      nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+      nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+    }
+
+    if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+       TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED );
+       nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+      nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+      nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+    }
+    /* any other bit is logged and written back to the doorbell register */
+    if( doorbell & ~(NFAST_INT_DEVICE_READ_OK  | NFAST_INT_DEVICE_READ_FAILED |
+                     NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+      nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell );
+      TO_LE32_IO( &tmp32, 0xffff & doorbell );
+      nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+    }
+    doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+    doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
+  }
+  return 0;
+}
+
+/* write: copy a user block of len bytes to the module and post a write request. */
+/* Returns NFP_SUCCESS once the doorbell is written, else the first error hit.   */
+static nfp_err i21285_write( const char *block, int len, void *ctx ) {
+  nfp_cdev *cdev;
+  unsigned int hdr[2];
+  nfp_err ne;
+  unsigned int tmp32;
+
+  nfp_log( NFP_DBG2, "i21285_write: entered");
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21285_write: NULL pdev");
+    return NFP_ENODEV;
+  }
+  /* NOTE(review): len is not range-checked here -- presumably the caller bounds it; confirm */
+  nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]);
+  nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]);
+  if(!cdev->bar[ MEMBAR ]) {
+    nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR );
+    return NFP_ENOMEM;
+  }
+  ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed");
+    return ne;
+  }
+  TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+  TO_LE32_MEM(&hdr[1], len);
+  /* the 8-byte header (control word, length) goes out after the data */
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed");
+    return ne;
+  }
+
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed");
+    return ne;
+  }
+  /* the length read back must equal the length just written */
+  TO_LE32_MEM( &tmp32, len );
+  if ( hdr[0] != tmp32 ) {
+    nfp_log( NFP_DBG1, "i21285_write: length not written");
+    return NFP_EIO;
+  }
+
+  TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST);
+
+  nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+  nfp_log( NFP_DBG2, "i21285_write: done");
+  return NFP_SUCCESS;
+}
+
+/*
+ * read: fetch the reply length from NFPCI_JOBS_RD_LENGTH and, if it lies
+ * between 0 and len, copy that many bytes from NFPCI_JOBS_RD_DATA to the
+ * user buffer.  *rcount is the byte count delivered, 0 on every failure.
+ */
+static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) {
+  nfp_cdev *dev = (nfp_cdev *)ctx;
+  nfp_err rc;
+  int nbytes;
+
+  nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len);
+  *rcount= 0;
+  if( dev == NULL ) {
+    nfp_log( NFP_DBG1, "i21285_read: NULL pdev");
+    return NFP_ENODEV;
+  }
+  if( !dev->bar[ MEMBAR ] ) {
+    nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR );
+    return NFP_ENOMEM;
+  }
+  rc= nfp_copy_from_dev( dev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&nbytes, 4);
+  if( rc ) {
+    nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed.");
+    return rc;
+  }
+  nbytes= FROM_LE32_MEM(&nbytes);
+  if( nbytes > len || nbytes < 0 ) {
+    nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", nbytes);
+    return NFP_EIO;
+  }
+  rc= nfp_copy_to_user_from_dev( dev, MEMBAR, NFPCI_JOBS_RD_DATA, block, nbytes);
+  if( rc ) {
+    nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed.");
+    return rc;
+  }
+  nfp_log( NFP_DBG2, "i21285_read: done");
+  *rcount= nbytes;
+  return NFP_SUCCESS;
+}
+
+/* chupdate: not yet implemented; logs "NYI" and reports NFP_SUCCESS */
+
+/* ARGSUSED */
+static nfp_err i21285_chupdate( char *data, int len, void *ctx ) {
+  nfp_log( NFP_DBG1, "i21285_chupdate: NYI");
+  return NFP_SUCCESS;
+}
+
+/*
+ * ensure reading: post a read request for up to len bytes: write the read
+ * control header, check that the length reads back, then write the doorbell.
+ * addr must be 0; anything else is rejected with NFP_EINVAL, returned as a
+ * positive code like every other nfp_err in this file.
+ */
+static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) {
+  nfp_cdev *cdev;
+  unsigned int hdr[2];
+  unsigned int tmp32;
+  nfp_err ne;
+
+  nfp_log( NFP_DBG2, "i21285_ensure_reading: entered");
+  if(addr) {
+    nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr");
+    return NFP_EINVAL;
+  }
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev");
+    return NFP_ENODEV;
+  }
+  if(!cdev->bar[ MEMBAR ]) {
+    nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR );
+    return NFP_ENXIO;
+  }
+  nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+  nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+  TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL);
+  TO_LE32_MEM( &hdr[1], len);
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed");
+    return ne;
+  }
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed");
+    return ne;
+  }
+  TO_LE32_MEM( &tmp32, len );
+  if ( hdr[0] != tmp32 ) {
+    nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written");
+    return NFP_EIO;
+  }
+  TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST );
+  nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+  return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+/* Bundles the 21285 identification data with the entry points defined above; */
+/* presumably listed in nfp_drvlist, which nfp_attach() searches -- confirm.  */
+const nfpcmd_dev i21285_cmddev = {
+  "nCipher Gen 1 PCI",
+  PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285,
+  PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1,
+  { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, /* slots 1 and 2 match IOBAR and MEMBAR */
+  NFP_CMD_FLG_NEED_IOBUF,
+  i21285_create,
+  i21285_destroy,
+  i21285_open,
+  i21285_close,
+  i21285_isr,
+  i21285_write,
+  i21285_read,
+  i21285_chupdate,
+  i21285_ensure_reading,
+  0, /* no debug */
+};
+  
diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h
new file mode 100644
index 0000000000..4ea1d853ec
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.h
@@ -0,0 +1,43 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_I21285_H
+#define NFP_I21285_H
+
+#ifndef PCI_VENDOR_ID_DEC
+#define PCI_VENDOR_ID_DEC               0x1011
+#endif
+#ifndef PCI_DEVICE_ID_DEC_21285
+#define PCI_DEVICE_ID_DEC_21285         0x1065
+#endif
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER           0x0100
+#endif
+
+#ifndef PCI_DEVICE_ID_NFAST_GEN1
+#define PCI_DEVICE_ID_NFAST_GEN1	0x0100
+#endif
+
+#define I21285_OFFSET_DOORBELL		0x60
+#define I21285_OFFSET_INTERRUPT_MASK	0x34
+/* values OR-ed together and written at I21285_OFFSET_INTERRUPT_MASK */
+#define DOORBELL_ENABLE 0x0
+#define DOORBELL_DISABLE 0x4
+
+#define POSTLIST_ENABLE 0x0
+#define POSTLIST_DISABLE 0x8
+/* BAR indices used with the nfp_cdev bar[] array by i21285.c */
+#define IOBAR	1
+#define MEMBAR	2
+
+#define IOSIZE	0x80
+#define MEMSIZE	0x100000
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c
new file mode 100644
index 0000000000..82024dc800
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.c
@@ -0,0 +1,423 @@
+/*
+
+i21555.c: nCipher PCI HSM intel 21555 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+09/10/2001 jsh  Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/* started ------------------------------------------------------
+ * Check that device is ready to talk, by checking that
+ * the i21555 has master enabled on its secondary interface.
+ * Returns NFP_SUCCESS if CFG_CMD_MASTER is set or config reads are not
+ * implemented, NFP_ESTARTING if it is clear, otherwise the read error.
+ */
+static nfp_err i21555_started( nfp_cdev *pdev ) {
+  unsigned int tmp32;
+#ifdef CONFIGSPACE_DEBUG
+  unsigned int reg32[64];
+  int i;
+#endif
+  nfp_err ne;
+
+  nfp_log( NFP_DBG2, "i21555_started: entered");
+
+#ifdef CONFIGSPACE_DEBUG
+  /* Suck up all the registers */
+  for (i=0; i < 64; i++) {
+    ne = nfp_config_inl( pdev, i*4, &reg32[i] );
+  }
+
+  for (i=0; i < 16; i++) {
+    int j = i * 4;
+    nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4,
+        reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]);
+  }
+#endif
+
+  ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 );
+  if (ne) {
+    /* succeed if PCI config reads are not implemented */
+    if (ne == NFP_EUNKNOWN)
+      return NFP_SUCCESS;
+    nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed");
+    return ne;
+  }
+  /* keep only the low 16 bits, which is where CFG_CMD_MASTER is tested */
+  tmp32= FROM_LE32_IO(&tmp32) & 0xffff;
+
+  if ( tmp32 & CFG_CMD_MASTER ) {
+    nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32);
+    return NFP_SUCCESS;
+  } else {
+    nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32);
+    return NFP_ESTARTING;
+  }
+}
+
+/*
+ * create: use the nfp_cdev as its own command context, then write
+ * I21555_DOORBELL_PRI_ENABLE at the PRI_SET_MASK and PRI_CLEAR_MASK offsets.
+ */
+static nfp_err i21555_create( nfp_cdev *pdev ) {
+  unsigned int mask_le;
+  nfp_log( NFP_DBG2, "i21555_create: entered");
+  pdev->cmdctx= pdev;
+  if( pdev->bar[ IOBAR ] ) {
+    nfp_log( NFP_DBG2, "i21555_create: enable doorbell");
+    TO_LE32_IO( &mask_le, I21555_DOORBELL_PRI_ENABLE );
+    nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, mask_le );
+    nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, mask_le );
+    return NFP_SUCCESS;
+  }
+  nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR );
+  return NFP_ENOMEM;
+}
+
+/*
+ * destroy: write I21555_DOORBELL_PRI_DISABLE at the primary doorbell
+ * SET_MASK and CLEAR_MASK offsets.  ctx is the nfp_cdev; NFP_ENODEV is
+ * returned for a NULL ctx and NFP_ENOMEM for an unmapped I/O BAR.
+ */
+static nfp_err i21555_destroy( void * ctx ) {
+  nfp_cdev *cdev = (nfp_cdev *)ctx;
+  unsigned int mask_le;
+
+  nfp_log( NFP_DBG2, "i21555_destroy: entered");
+  if( cdev == NULL ) {
+    nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev");
+    return NFP_ENODEV;
+  }
+  if( !cdev->bar[ IOBAR ] ) {
+    nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+  TO_LE32_IO( &mask_le, I21555_DOORBELL_PRI_DISABLE );
+  nfp_outl( cdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, mask_le );
+  nfp_outl( cdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, mask_le );
+  return NFP_SUCCESS;
+}
+
+/* open: no per-open work for this command device; always NFP_SUCCESS */
+
+/* ARGSUSED */
+static nfp_err i21555_open( void * ctx ) {
+
+  nfp_log( NFP_DBG2, "i21555_open: entered");
+
+  return NFP_SUCCESS;
+}
+
+/* close: no per-close work for this command device; always NFP_SUCCESS */
+
+/* ARGSUSED */
+static nfp_err i21555_close( void * ctx ) {
+  nfp_log( NFP_DBG2, "i21555_close: entered");
+
+  return NFP_SUCCESS;
+}
+
+/* isr: service primary doorbell bits until the register reads 0 or 0xffff.  */
+/* ctx is the nfp_cdev; *handled becomes 1 only if a doorbell bit was seen. */
+static nfp_err i21555_isr( void *ctx, int *handled ) {
+  nfp_cdev *pdev;
+  nfp_err ne;
+  unsigned short doorbell;
+  unsigned short tmp16;
+
+  nfp_log( NFP_DBG3, "i21555_isr: entered");
+
+  *handled= 0;
+  pdev= (nfp_cdev *)ctx;
+  if(!pdev) {
+    nfp_log( NFP_DBG1, "i21555_isr: NULL pdev");
+    return NFP_ENODEV;
+  }
+  /* counted for every call, including ones rejected below */
+  pdev->stats.isr++;
+
+  if(!pdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_isr: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+
+  /* This interrupt may not be from our module, so check that it actually is
+   * us before handling it.
+   */
+  ne = i21555_started( pdev );
+  if (ne) {
+    if (ne != NFP_ESTARTING) {
+      nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed");
+    }
+    return ne;
+  }
+
+  doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+  doorbell= FROM_LE16_IO(&doorbell);
+  while( doorbell && doorbell != 0xffff) {
+    *handled= 1;
+    /* service interrupts */
+    if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+      pdev->stats.isr_write++;
+      TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+      nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+      nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+      nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+    }
+
+    if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+      pdev->stats.isr_read++;
+      TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED);
+      nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+      nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+      nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+    }
+    /* any other bit: write the whole doorbell value to PRI_CLEAR and log it */
+    if( doorbell & ~(NFAST_INT_DEVICE_READ_OK  | NFAST_INT_DEVICE_READ_FAILED |
+                     NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+      TO_LE16_IO(&tmp16,doorbell);
+      nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+      nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell );
+    }
+    doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+    doorbell= FROM_LE16_IO(&doorbell);
+  }
+  nfp_log( NFP_DBG3, "i21555_isr: exiting");
+  return 0;
+}
+
+/* write: copy a user block of len bytes to the module and post a write request. */
+/* Returns NFP_SUCCESS once the doorbell is written, else the first error hit.   */
+static nfp_err i21555_write( const char *block, int len, void *ctx) {
+  nfp_cdev *cdev;
+  unsigned int hdr[2];
+  nfp_err ne;
+  unsigned short tmp16;
+  unsigned int tmp32;
+
+  nfp_log( NFP_DBG2, "i21555_write: entered");
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_write: NULL cdev");
+    return NFP_ENODEV;
+  }
+  /* counted as a failure up front; undone on the success path at the end */
+  cdev->stats.write_fail++;
+
+  if(!cdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+
+  ne = i21555_started( cdev );
+  if (ne) {
+    if (ne != NFP_ESTARTING) {
+      nfp_log( NFP_DBG1, "i21555_write: i21555_started failed");
+    }
+    return ne;
+  }
+  /* NOTE(review): bar[ MEMBAR ] is used below without the NULL check done for IOBAR -- confirm */
+  nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+  nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+  nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); 
+  ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed");
+    return ne;
+  }
+  TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+  TO_LE32_MEM(&hdr[1], len);
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed");
+    return ne;
+  }
+
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed");
+    return ne;
+  }
+  /* the length read back must equal the length just written */
+  TO_LE32_MEM(&tmp32, len);
+  if ( hdr[0] != tmp32 ) {
+    nfp_log( NFP_DBG1, "i21555_write: length not written");
+    return NFP_EIO;
+  }
+  TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16);
+  nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+  cdev->stats.write_fail--;
+  cdev->stats.write_block++;
+  cdev->stats.write_byte += len;
+
+  nfp_log( NFP_DBG2, "i21555_write: done");
+  return NFP_SUCCESS;
+}
+
+/* read ------------------------------------------------------- */
+/* Copy the reply waiting in the card's read window (at most len bytes) into block; *rcount receives the byte count. Returns NFP_SUCCESS or an NFP_E* code. */
+static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) {
+  nfp_cdev *cdev;
+  nfp_err ne;
+  int count;
+
+  nfp_log( NFP_DBG2, "i21555_read: entered");
+  *rcount= 0;
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_read: NULL pdev");
+    return NFP_ENODEV;
+  }
+  /* count a failure up front; the success path at the bottom takes it back */
+  cdev->stats.read_fail++;
+
+  if(!cdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+  /* the length word holds the reply size in little-endian order */
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed.");
+    return ne;
+  }
+  count= FROM_LE32_MEM(&count);
+  if(count<0 || count>len) {
+    nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count);
+    return NFP_EIO;
+  }
+  ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed.");
+    return ne;
+  }
+  nfp_log( NFP_DBG2, "i21555_read: done");
+  *rcount= count;
+  cdev->stats.read_fail--;
+  cdev->stats.read_block++;
+  cdev->stats.read_byte += count; /* bytes actually transferred, not the size of the caller's buffer */
+  return NFP_SUCCESS;
+}
+
+/* chupdate  ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_chupdate( char *data, int len, void *ctx ) {
+  nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); /* stub: only logs, all three arguments are ignored */
+  return NFP_SUCCESS; /* reports success although nothing was done */
+}
+
+/* ensure reading -------------------------------------------------- */
+/* Post a read request for up to len bytes and ring the doorbell; a non-zero addr selects the PCI push header with addr as its third word. ctx is the nfp_cdev. Returns NFP_SUCCESS or an NFP_E* code. */
+static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) {
+  nfp_cdev *cdev;
+  unsigned int hdr[3];
+  unsigned short tmp16;
+  unsigned int tmp32;
+  nfp_err ne;
+  int hdr_len;
+
+  nfp_log( NFP_DBG2, "i21555_ensure_reading: entered");
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev");
+    return NFP_ENODEV;
+  }
+  /* count a failure up front; the success path at the bottom takes it back */
+  cdev->stats.ensure_fail++;
+
+  if(!cdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+
+  ne = i21555_started( cdev );
+  if (ne) {
+    if (ne != NFP_ESTARTING) {
+      nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed");
+    }
+    return ne;
+  }
+  /* bar[] entries are pointers: print them with %p, %x expects an unsigned int */
+  nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %p", (void *)cdev->bar[ MEMBAR ]);
+  nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %p", (void *)cdev->bar[ IOBAR ]);
+  if(addr) {
+    nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr);
+    TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH);
+    TO_LE32_MEM(&hdr[1], len);
+    TO_LE32_MEM(&hdr[2], addr);
+    hdr_len= 12;
+  } else {
+    TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+    TO_LE32_MEM(&hdr[1], len);
+    hdr_len= 8;
+  }
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed");
+    return ne;
+  }
+  /* read the length word back to check that the header really reached the card */
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed");
+    return ne;
+  }
+
+  TO_LE32_MEM(&tmp32, len);
+
+  if ( hdr[0] != tmp32 ) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written");
+    return NFP_EIO;
+  }
+  TO_LE16_IO( &tmp16, NFAST_INT_HOST_READ_REQUEST >> 16);
+  nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+  cdev->stats.ensure_fail--;
+  cdev->stats.ensure++;
+
+  return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+
+const nfpcmd_dev i21555_cmddev = { /* positional initialiser: order must follow struct nfpcmd_dev */
+  "nCipher Gen 2 PCI", /* name */
+  PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, /* vendorid, deviceid */
+  PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, /* sub_vendorid, sub_deviceid */
+  { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, /* bar_sizes: slot IOBAR (1) is I/O space, slot MEMBAR (2) is the job RAM */
+  NFP_CMD_FLG_NEED_IOBUF, /* flags */
+  i21555_create, /* create */
+  i21555_destroy, /* destroy */
+  i21555_open, /* open */
+  i21555_close, /* close */
+  i21555_isr, /* isr */
+  i21555_write, /* write_block */
+  i21555_read, /* read_block */
+  i21555_chupdate, /* channel_update */
+  i21555_ensure_reading, /* ensure_reading */
+  i21555_debug, /* debug */
+};
diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h
new file mode 100644
index 0000000000..d8f3965938
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.h
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef I21555_H
+#define I21555_H
+
+#ifndef PCI_VENDOR_ID_INTEL
+#define PCI_VENDOR_ID_INTEL             0x8086
+#endif
+
+#ifndef PCI_DEVICE_ID_INTEL_21555
+#define PCI_DEVICE_ID_INTEL_21555       0xb555
+#endif
+
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER           0x0100
+#endif
+
+#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1
+#define PCI_SUBSYSTEM_ID_NFAST_REV1     0x0100
+#endif
+
+#define I21555_OFFSET_DOORBELL_PRI_SET		0x9C
+#define I21555_OFFSET_DOORBELL_SEC_SET		0x9E
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR	0x98
+
+#define I21555_OFFSET_DOORBELL_PRI_SET_MASK	0xA4
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK	0xA0
+
+#define I21555_DOORBELL_PRI_ENABLE 0x0000
+#define I21555_DOORBELL_PRI_DISABLE 0xFFFF
+
+#define I21555_CFG_SEC_CMD_STATUS 0x44
+
+#define CFG_CMD_MASTER 0x0004
+
+#define IOBAR   1
+#define MEMBAR  2
+
+#define IOSIZE  0x100
+
+extern nfp_err i21555_debug( int cmd, void *ctx );
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c
new file mode 100644
index 0000000000..183ace8275
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555d.c
@@ -0,0 +1,28 @@
+/*
+
+i21555d.c: nCipher PCI HSM intel 21555 debug ioctl
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+15/05/2002 jsh  Original, does nothing
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+
+/* ARGSUSED */
+nfp_err i21555_debug( int cmd, void *ctx) {
+  nfp_log( NFP_DBG1, "i21555_debug: entered");
+  /* stub: cmd and ctx are ignored and every call is answered with NFP_EUNKNOWN */
+  return NFP_EUNKNOWN;
+}
diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h
new file mode 100644
index 0000000000..8a97bf2c63
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-common.h
@@ -0,0 +1,141 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+/** \file nfdev-common.h
+ *
+ * \brief nFast device driver (not generic SCSI) ioctl struct definition file
+ *  include NFDEV-$(system) for ioctl number definitions
+ *
+ *  1998.07.13	jsh	Started
+ *
+ * 
+ */
+
+#ifndef NFDEV_COMMON_H
+#define NFDEV_COMMON_H
+
+/**
+ * Result of the ENQUIRY ioctl.
+ */
+typedef struct nfdev_enquiry_str {
+  unsigned int  busno; /**< Which bus is the PCI device on. */
+  unsigned char slotno; /**< Which slot is the PCI device in. */
+  unsigned char reserved[3]; /**< for consistent struct alignment */
+} nfdev_enquiry_str;
+
+/**
+ * Result of the STATS ioctl.
+ */
+typedef struct nfdev_stats_str {
+  unsigned long  isr; /**< Count interrupts. */
+  unsigned long  isr_read; /**< Count read interrupts. */
+  unsigned long  isr_write; /**< Count write interrupts. */
+  unsigned long  write_fail; /**< Count write failures. */
+  unsigned long  write_block; /**< Count blocks written. */
+  unsigned long  write_byte; /**< Count bytes written. */
+  unsigned long  read_fail; /**< Count read failures. */
+  unsigned long  read_block; /**< Count blocks read. */
+  unsigned long  read_byte; /**< Count bytes read. */
+  unsigned long  ensure_fail; /**< Count read request failures. */
+  unsigned long  ensure; /**< Count read requests. */
+} nfdev_stats_str;
+
+/**
+ * Input to the CONTROL ioctl.
+ */
+typedef struct nfdev_control_str {
+  unsigned control; /**< Control flags. */
+} nfdev_control_str;
+
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001
+
+/** Index of control bits indicating desired mode
+ *
+ * Desired mode follows the M_ModuleMode enumeration.
+ */
+#define NFDEV_CONTROL_MODE_SHIFT 1
+
+/** Detect a backwards-compatible control value
+ *
+ * Returns true if the requested control value "makes no difference", i.e.
+ * the failure of an attempt to set it is therefore uninteresting.
+ */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1)
+
+/**
+ * Result of the STATUS ioctl.
+ */
+typedef struct nfdev_status_str {
+  unsigned  status; /**< Status flags. */
+  char      error[8]; /**< Error string. */
+} nfdev_status_str;
+
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001
+
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002
+
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004
+
+/** HSM failed
+ *
+ * Consult error[] for additional information.
+ */
+#define NFDEV_STATUS_FAILED 0x0008
+
+/** Standard PCI interface. */
+#define NFDEV_IF_STANDARD	0x01
+
+/** PCI interface with results pushed from device
+ *  via DMA.
+ */
+#define NFDEV_IF_PCI_PUSH	0x02
+
+/* platform independent base ioctl numbers */
+
+/** Enquiry ioctl.
+ *  \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY        0x01
+/** Channel Update ioctl.
+ *  \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE       0x02
+/** Ensure Reading ioctl.
+ *  Signal a read request to the device.
+ *  \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING  0x03
+/** Device Count ioctl.
+ *  Not implemented on all platforms.
+ *  \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT       0x04
+/** Internal Debug ioctl.
+ *  Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG          0x05
+/** PCI Interface Version ioctl.
+ *  \param (int) Maximum PCI interface version
+ *   supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS     0x06
+/** Statistics ioctl.
+ *  \return nfdev_stats_str holding the driver's statistics counters. */
+#define NFDEV_IOCTL_NUM_STATS          0x07
+
+/** Module control ioctl
+ * \param (nfdev_control_str) Value to write to HSM control register
+ */
+#define NFDEV_IOCTL_NUM_CONTROL        0x08
+
+/** Module state ioctl
+ * \return (nfdev_status_str) Values read from HSM status/error registers
+ */
+#define NFDEV_IOCTL_NUM_STATUS         0x09
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h
new file mode 100644
index 0000000000..923b902e46
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h
@@ -0,0 +1,37 @@
+/*
+
+nfdev-solaris.h: nFast solaris specific device ioctl interface.
+
+(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+14/07/1998 jsh  Original
+
+*/
+
+#ifndef NFDEV_SOLARIS_H
+#define NFDEV_SOLARIS_H
+
+#include "nfdev-common.h"
+
+#define NFDEV_IOCTL_TYPE ('n'<<8)
+
+#define NFDEV_IOCTL_ENQUIRY		( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_ENQUIRY )
+#define NFDEV_IOCTL_ENSUREREADING	( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_ENSUREREADING )
+#define NFDEV_IOCTL_DEVCOUNT		( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_DEVCOUNT )
+#define NFDEV_IOCTL_DEBUG		( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_DEBUG )
+#define NFDEV_IOCTL_PCI_IFVERS		( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_PCI_IFVERS )
+#define NFDEV_IOCTL_STATS		( NFDEV_IOCTL_TYPE | \
+					  NFDEV_IOCTL_NUM_STATS )
+
+#endif /* NFDEV_SOLARIS_H */
diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h
new file mode 100644
index 0000000000..9704f04fbc
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp.h
@@ -0,0 +1,113 @@
+/*
+
+nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7
+
+(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh  Original solaris 2.6
+21/05/1999 jsh  added support for solaris 2.5
+10/06/1999 jsh  added support for solaris 2.7 (32 and 64 bit)
+16/10/2001 jsh  moved from nfast to new structure in nfdrv
+
+*/
+
+#ifndef NFP_H
+#define NFP_H
+
+#ifndef _KERNEL
+#error Hello?  this is a driver, please compile with -D_KERNEL
+#endif
+
+#if ( CH_KERNELVER < 260 )
+typedef int ioctlptr_t;
+typedef unsigned short uint16_t;
+#define DDI_GET32    ddi_getl
+#define DDI_PUT32    ddi_putl
+#define DDI_GET16    ddi_getw
+#define DDI_PUT16    ddi_putw
+#define DDI_REP_GET8 ddi_rep_getb
+#define DDI_REP_PUT8 ddi_rep_putb
+#define DDI_REP_GET32 ddi_rep_getl
+#define DDI_REP_PUT32 ddi_rep_putl
+#define PCI_CONFIG_GET16 pci_config_getw
+#else /* ( CH_KERNELVER >= 260 ) */
+typedef intptr_t ioctlptr_t;
+#define DDI_GET32    ddi_get32
+#define DDI_PUT32    ddi_put32
+#define DDI_GET16    ddi_get16
+#define DDI_PUT16    ddi_put16
+#define DDI_REP_GET8 ddi_rep_get8
+#define DDI_REP_PUT8 ddi_rep_put8
+#define DDI_REP_GET32 ddi_rep_get32
+#define DDI_REP_PUT32 ddi_rep_put32
+#define PCI_CONFIG_GET16 pci_config_get16
+#endif
+
+#if ( CH_KERNELVER < 270 )
+typedef int nfp_timeout_t;
+#define EXTRA_CB_FLAGS 0
+#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap)
+#else /* ( CH_KERNELVER >= 270 ) */
+typedef timeout_id_t nfp_timeout_t;
+#define EXTRA_CB_FLAGS D_64BIT
+#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap)
+#endif
+
+typedef struct nfp_dev {
+  int rd_ok;
+  int wr_ok;
+
+  int ifvers;
+
+  /* for PCI push read interface */
+  unsigned char *read_buf;
+  ddi_dma_handle_t read_dma_handle;
+  ddi_dma_cookie_t read_dma_cookie;
+
+  ddi_acc_handle_t acchandle;
+
+  int rd_dma_ok;
+
+  nfp_timeout_t wrtimeout;
+  nfp_timeout_t rdtimeout;
+
+  struct buf *wr_bp;
+  int wr_ready;
+  int rd_ready;
+  int rd_pending;
+  int rd_outstanding;
+  kcondvar_t rd_cv;
+
+  struct pollhead pollhead;
+  dev_info_t *dip;
+
+  ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */
+  ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */
+  kmutex_t high_mutex;
+  kmutex_t low_mutex;
+  int high_intr;
+  ddi_softintr_t soft_int_id;
+  int high_read;
+  int high_write;
+
+  ddi_iblock_cookie_t iblock_cookie; /* for mutex */
+  kmutex_t isr_mutex;
+
+  kmutex_t busy_mutex;
+  int busy;
+ 
+  ddi_acc_handle_t conf_handle;
+
+  nfp_cdev common;
+  const nfpcmd_dev *cmddev;
+} nfp_dev;
+
+extern struct nfp_dev *nfp_dev_list[];
+
+#endif /* NFP_H */
diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h
new file mode 100644
index 0000000000..db8af0b2f9
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_cmd.h
@@ -0,0 +1,68 @@
+/*
+
+nfp_cmd.h: nCipher PCI HSM command driver declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh  Original
+
+*/
+
+#ifndef NFPCMD_H
+#define NFPCMD_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* read and write called with userspace buffer */
+
+typedef struct nfpcmd_dev {
+  const char *name;
+  unsigned short vendorid, deviceid,
+                 sub_vendorid, sub_deviceid;
+  unsigned int bar_sizes[6];    /* includes IO bit */
+  unsigned int flags;
+  nfp_err (*create)(struct nfp_cdev *pdev);
+  nfp_err (*destroy)(void * ctx);
+  nfp_err (*open)(void * ctx);
+  nfp_err (*close)(void * ctx);
+  nfp_err (*isr)(void *ctx, int *handled);
+  nfp_err (*write_block)( const char *ublock, int len, void *ctx );
+  nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount);
+  nfp_err (*channel_update)( char *data, int len, void *ctx);
+  nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx );
+  nfp_err (*debug)( int cmd, void *ctx);
+} nfpcmd_dev;
+
+#define NFP_CMD_FLG_NEED_IOBUF	0x1
+
+/* list of all supported drivers ---------------------------------------- */
+
+extern const nfpcmd_dev *nfp_drvlist[];
+
+extern const nfpcmd_dev i21285_cmddev;
+extern const nfpcmd_dev i21555_cmddev;
+extern const nfpcmd_dev bcm5820_cmddev;
+
+#ifndef PCI_BASE_ADDRESS_SPACE_IO
+#define PCI_BASE_ADDRESS_SPACE_IO	0x1
+#endif
+
+#define NFP_MAXDEV	16
+
+
+#define NFP_MEMBAR_MASK    ~0xf
+#define NFP_IOBAR_MASK     ~0x3
+/*
+   This masks off the bottom bits of the PCI_CSR_BAR which signify that the
+   BAR is an IO BAR rather than a MEM BAR 
+*/ 
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h
new file mode 100644
index 0000000000..d1d2100fea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_common.h
@@ -0,0 +1,68 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_COMMON_H
+#define NFP_COMMON_H
+
+#include <sys/types.h>
+#include <sys/conf.h>
+
+typedef uint32_t UINT32;
+typedef uint8_t BYTE;
+
+#define DEFINE_NFPCI_PACKED_STRUCTS
+#include "nfpci.h"
+#include "nfdev-solaris.h"
+
+typedef int oserr_t;
+
+#if CH_BIGENDIAN
+/* FROM_LEnn_*(x) yields *x converted from little-endian; TO_LEnn_*(x,y) stores y in *x as little-endian. */
+/* Big Endian Sparc */
+/* SWP32 and SWP16 expand their argument several times: only pass side-effect-free values. */
+#define SWP32(x) \
+( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) )
+
+#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) )
+
+#define FROM_LE32_IO(x)		SWP32(*(x))
+#define TO_LE32_IO(x,y)		(*(x)=SWP32(y))
+
+#define FROM_LE32_MEM(x)	SWP32(*(x))
+#define TO_LE32_MEM(x,y)	(*(x)=SWP32(y))
+
+#define FROM_LE16_IO(x)		SWP16(*(x))
+#define TO_LE16_IO(x,y)		(*(x)=SWP16(y))
+
+#else
+
+/* Little Endian x86 */
+/* host order already is little-endian: plain load and store, arguments parenthesised so expressions are safe */
+#define FROM_LE32_IO(x) (*(x))
+#define TO_LE32_IO(x,y) (*(x)=(y))
+
+#define FROM_LE32_MEM(x) (*(x))
+#define TO_LE32_MEM(x,y) (*(x)=(y))
+
+#define FROM_LE16_IO(x) (*(x))
+#define TO_LE16_IO(x,y) (*(x)=(y))
+
+#endif /* !CH_BIGENDIAN */
+
+#include <sys/types.h>
+
+#if CH_KERNELVER == 260
+#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt )
+#else
+#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; }
+#endif
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h
new file mode 100644
index 0000000000..d64cb78fd4
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_error.h
@@ -0,0 +1,48 @@
+/*
+
+nfp_error.h: nCipher PCI HSM error handling
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+05/12/2001 jsh  Original
+
+*/
+
+#ifndef NFP_ERROR_H
+#define NFP_ERROR_H
+
+#include "nfp_common.h"
+
+#define NFP_SUCCESS	0x0
+#define NFP_EFAULT      0x1
+#define NFP_ENOMEM	0x2
+#define NFP_EINVAL	0x3
+#define NFP_EIO		0x4
+#define NFP_ENXIO	0x5
+#define NFP_ENODEV	0x6
+#define NFP_EINTR	0x7
+#define NFP_ESTARTING	0x8
+#define NFP_EAGAIN	0x9
+#define NFP_EUNKNOWN	0x100
+
+typedef int nfp_err;
+
+extern oserr_t nfp_oserr( nfp_err nerr );
+extern nfp_err nfp_error( oserr_t oerr );
+
+#define nfr( x) \
+  return nfp_error((x)) /* expands to a return statement: leaves the calling function with the mapped code */
+/* nfer: evaluate x once as an oserr_t; if non-zero, log "fn: msg" and return nfp_error(err) from the caller */
+#define nfer(x, fn, msg) \
+  { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } }
+/* er: same as nfer, but x already is an nfp_err and is returned unchanged */
+#define er(x, fn, msg ) \
+{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } }
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h
new file mode 100644
index 0000000000..3e7d8187e5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_hostif.h
@@ -0,0 +1,54 @@
+/*
+
+nfp_hostif.h: nCipher PCI HSM host interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh  Original
+
+*/
+
+#ifndef NFP_HOSTIF_H
+#define NFP_HOSTIF_H
+
+#include "nfdev-common.h"
+
+struct nfp_dev;
+
+/* common device structure */
+
+typedef struct nfp_cdev {
+  unsigned char *bar[6];
+  void *extra[6];
+
+  int busno;
+  int slotno;
+
+  void *cmdctx;
+
+  char *iobuf;
+
+  struct nfp_dev* dev;
+
+  struct nfdev_stats_str stats;
+
+} nfp_cdev;
+
+/* callbacks from command drivers -------------------------------------- */
+
+void nfp_read_complete(  struct nfp_dev *pdev, int ok);
+void nfp_write_complete( struct nfp_dev *pdev, int ok);
+
+#define NFP_READ_MAX (8 * 1024)
+#define NFP_READBUF_SIZE (NFP_READ_MAX + 8)
+#define NFP_TIMEOUT_SEC 10
+
+#define NFP_DRVNAME "nCipher nFast PCI driver"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c
new file mode 100644
index 0000000000..807b4f24c5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+ * nfp_ifvers.c  - common pci interface versioning
+ *
+ * uses:
+ *
+ * int pdev->ifvers
+ *     device interface version
+ *
+ * int nfp_ifvers
+ *     interface version limit
+ * 
+ * int nfp_alloc_pci_push( nfp_dev *pdev )
+ *     allocates resources needed for PCI Push,
+ *     if not already allocated, and return True if successful
+ *
+ * void nfp_free_pci_push( nfp_dev *pdev ) {
+ *     frees any resources allocated to PCI Push
+ */
+
+void nfp_set_ifvers( nfp_dev *pdev, int vers ) {
+  /* a non-zero nfp_ifvers is an upper limit on the version that may be selected */
+  if( nfp_ifvers != 0 && nfp_ifvers < vers ) {
+    nfp_log( NFP_DBG2,
+             "nfp_set_ifvers: can't set ifvers %d"
+             " as nfp_ifvers wants max ifvers %d",
+             vers, nfp_ifvers);
+    return;
+  }
+  if( vers < NFDEV_IF_PCI_PUSH ) {
+    /* below PCI push the push resources are not wanted: release them */
+    nfp_free_pci_push(pdev);
+  } else if( !nfp_alloc_pci_push(pdev) ) {
+    nfp_log( NFP_DBG1,
+             "nfp_set_ifvers: can't set ifvers %d"
+             " as resources not available",
+             vers);
+    return;
+  }
+  pdev->ifvers= vers;
+  nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers);
+}
diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h
new file mode 100644
index 0000000000..17ffe469ce
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_osif.h
@@ -0,0 +1,105 @@
+/*
+
+nfp_osif.h: nCipher PCI HSM OS interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh  Original
+
+*/
+
+#ifndef NFP_OSIF_H
+#define NFP_OSIF_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* general typedefs ----------------------------------------------- */
+
+typedef volatile unsigned int reg32;
+typedef volatile unsigned short reg16;
+typedef volatile unsigned char reg8;
+
+/* semaphores, mutexes and events --------------------------------- */
+
+#if 0
+extern nfp_err nfp_sema_init( nfp_sema *sema, int initial);
+extern void nfp_sema_destroy( nfp_sema *sema );
+extern void nfp_sema_post( nfp_sema *sema );
+extern void nfp_sema_wait( nfp_sema *sema );
+extern int nfp_sema_wait_sig( nfp_sema *sema );
+
+extern nfp_err nfp_mutex_init( nfp_mutex *mutex );
+extern void nfp_mutex_destroy( nfp_mutex *mutex );
+extern void nfp_mutex_enter( nfp_mutex *mutex );
+extern void nfp_mutex_exit( nfp_mutex *mutex );
+
+extern nfp_err nfp_event_init( nfp_event *event );
+extern void nfp_event_destroy( nfp_event *event );
+extern void nfp_event_set( nfp_event *event );
+extern void nfp_event_clear( nfp_event *event );
+extern void nfp_event_wait( nfp_event *event );
+extern void nfp_event_wait_sig( nfp_event *event );
+
+#endif
+
+/* timeouts ------------------------------------------------------ */
+
+extern void nfp_sleep( int ms );
+
+/* memory handling ----------------------------------------------- */
+
+#define KMALLOC_DMA	0
+#define KMALLOC_CACHED	1
+
+extern void *nfp_kmalloc( int size, int flags );
+extern void *nfp_krealloc( void *ptr, int size, int flags );
+extern void nfp_kfree( void * );
+
+/* config space access ------------------------------------------------ */
+
+/* return Little Endian 32 bit config register */
+extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res );
+
+/* io space access ------------------------------------------------ */
+
+extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset );
+extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset );
+extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data );
+extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data );
+
+/* user and device memory space access ---------------------------- */
+
+/* NB these 2 functions are not guaranteed to be re-entrant for a given device */
+extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len);
+extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len);
+
+extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len );
+extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len );
+
+extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len );
+extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len);
+
+/* debug ------------------------------------------------------------ */
+
+#define NFP_DBG1	1
+#define NFP_DBGE	NFP_DBG1
+#define NFP_DBG2	2
+#define NFP_DBG3	3
+#define NFP_DBG4	4
+
+#ifdef STRANGE_VARARGS
+extern void nfp_log();
+#else
+extern void nfp_log( int severity, const char *format, ...);
+#endif
+
+extern int nfp_debug;
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h
new file mode 100644
index 0000000000..793f5995e6
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfpci.h
@@ -0,0 +1,171 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+*
+*  NFPCI.H	- nFast PCI interface definition file
+*
+*
+*
+*  1998.06.09	IH	Started
+*
+* The interface presented by nFast PCI devices consists of:
+*
+* A region of shared RAM used for data transfer & control information
+* A doorbell interrupt register, so both sides can give each other interrupts
+* A number of DMA channels for transferring data
+*/
+
+#ifndef NFPCI_H
+#define NFPCI_H
+
+/* Sizes of some regions */
+#define NFPCI_RAM_MINSIZE	0x00100000
+/* This is the minimum size of shared RAM. In future it may be possible to
+   negotiate larger sizes of shared RAM or auto-detect how big it is */
+#define NFPCI_RAM_MINSIZE_JOBS	0x00020000 /* standard jobs only */
+#define NFPCI_RAM_MINSIZE_KERN	0x00040000 /* standard and kernel jobs */
+
+/* Offsets within shared memory space.
+   The following main regions are:
+     jobs input area
+     jobs output area
+     kernel jobs input area
+     kernel output area
+*/
+
+#define NFPCI_OFFSET_JOBS		0x00000000
+#define NFPCI_OFFSET_JOBS_WR		0x00000000
+#define NFPCI_OFFSET_JOBS_RD		0x00010000
+#define NFPCI_OFFSET_KERN		0x00020000
+#define NFPCI_OFFSET_KERN_WR		0x00020000
+#define NFPCI_OFFSET_KERN_RD		0x00030000
+
+/* Interrupts, defined by bit position in doorbell register */
+
+/* Interrupts from device to host */
+#define NFAST_INT_DEVICE_WRITE_OK               0x00000001
+#define NFAST_INT_DEVICE_WRITE_FAILED           0x00000002
+#define NFAST_INT_DEVICE_READ_OK                0x00000004
+#define NFAST_INT_DEVICE_READ_FAILED            0x00000008
+#define NFAST_INT_DEVICE_KERN_WRITE_OK		0x00000010
+#define NFAST_INT_DEVICE_KERN_WRITE_FAILED	0x00000020
+#define NFAST_INT_DEVICE_KERN_READ_OK		0x00000040
+#define NFAST_INT_DEVICE_KERN_READ_FAILED	0x00000080
+
+/* Interrupts from host to device */
+#define NFAST_INT_HOST_WRITE_REQUEST            0x00010000
+#define NFAST_INT_HOST_READ_REQUEST             0x00020000
+#define NFAST_INT_HOST_DEBUG                    0x00040000
+#define NFAST_INT_HOST_KERN_WRITE_REQUEST	0x00080000
+#define NFAST_INT_HOST_KERN_READ_REQUEST	0x00100000
+
+/* Ordinary job submission ------------------------ */
+
+/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined
+   by the following (byte) address offsets... */
+
+#define NFPCI_OFFSET_CONTROL	0x0
+#define NFPCI_OFFSET_LENGTH	0x4
+#define NFPCI_OFFSET_DATA	0x8
+#define NFPCI_OFFSET_PUSH_ADDR	0x8
+
+#define NFPCI_JOBS_WR_CONTROL	(NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_WR_LENGTH	(NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_WR_DATA	(NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_JOBS_WR_LEN		(0x0000FFF8)
+
+#define NFPCI_JOBS_RD_CONTROL	(NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_RD_LENGTH	(NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_RD_DATA	(NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_JOBS_RD_PUSH_ADDR	(NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_JOBS_RD_LEN		(0x000FFF8)
+
+/* Kernel interface job submission ---------------- */
+
+#define NFPCI_KERN_WR_CONTROL   (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_WR_LENGTH    (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_WR_DATA      (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_KERN_WR_LEN      (0x0000FFF8)
+
+#define NFPCI_KERN_RD_CONTROL   (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_RD_LENGTH    (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_RD_DATA      (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_KERN_RD_ADDR      (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_KERN_RD_LEN		(0x000FFF8)
+
+#ifdef DEFINE_NFPCI_PACKED_STRUCTS
+typedef struct
+{
+  UINT32	controlword;
+  UINT32	length;		/* length of data to follow */
+  union {
+    BYTE	data[1];
+    UINT32	addr;
+  } uu;
+}
+  NFPCI_JOBS_BLOCK;
+#endif
+
+
+#define NFPCI_JOB_CONTROL		0x00000001
+#define NFPCI_JOB_CONTROL_PCI_PUSH	0x00000002
+/*
+   The 'Control' word is analogous to the SCSI read/write address;
+   1 = standard push/pull IO
+   2 = push/push IO
+
+   To submit a block of job data, the host:
+   - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL
+   - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data
+   - copies the data to NFPCI_JOBS_WR_DATA
+   - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register
+   - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back
+
+   To read a block of jobs back, the host:
+   - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL
+   - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+   - sets interrupt NFAST_INT_HOST_READ_REQUEST
+   - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+   - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at
+	NFPCI_JOBS_RD_LENGTH to its actual length.
+
+   Optionally the host can request the PCI read data to be pushed to host PCI mapped ram:
+   - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max
+        size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8
+   - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH
+   - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+   - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of
+        the buffer
+   - sets interrupt NFAST_INT_HOST_READ_REQUEST
+   - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+   - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer.  The
+        module will set NFPCI_OFFSET_LENGTH to the actual length.
+*/
+
+#define NFPCI_SCRATCH_CONTROL       0
+
+#define NFPCI_SCRATCH_CONTROL_HOST_MOI   (1<<0)
+#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1
+#define NFPCI_SCRATCH_CONTROL_MODE_MASK  (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT)
+
+#define NFPCI_SCRATCH_STATUS        1
+
+#define NFPCI_SCRATCH_STATUS_MONITOR_MOI         (1<<0)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI     (1<<1)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2)
+#define NFPCI_SCRATCH_STATUS_ERROR               (1<<3)
+
+#define NFPCI_SCRATCH_ERROR_LO      2
+#define NFPCI_SCRATCH_ERROR_HI      3
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c
new file mode 100644
index 0000000000..fba62f9a37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/osif.c
@@ -0,0 +1,184 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+#include "nfp.h"
+#include "autoversion.h"
+
+/* config space access ---------------------------------- */
+
+nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) {
+  unsigned int cfgval;
+  /* No device, or no config-space handle for it, means nothing to read. */
+  if ( !pdev || !pdev->dev )
+    return NFP_ENODEV;
+  if ( !pdev->dev->conf_handle )
+    return NFP_ENODEV;
+  /* pci_config_get32() does byte swapping, so TO_LE32_IO() puts it back to LE */
+  cfgval = pci_config_get32( pdev->dev->conf_handle, offset );
+  TO_LE32_IO(res, cfgval);
+  return NFP_SUCCESS;
+}
+
+/* user space memory access ---------------------------------- */
+
+nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len) {
+  bcopy(ubuf, kbuf, len); /* plain memory copy: ubuf is read directly, not through copyin() */
+  return 0; /* always reports success. NOTE(review): confirm callers only pass kernel-addressable buffers */
+}
+
+nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) {
+  bcopy(kbuf, ubuf, len); /* plain memory copy: ubuf is written directly, not through copyout() */
+  return 0; /* always reports success. NOTE(review): confirm callers only pass kernel-addressable buffers */
+}
+
+nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) {
+  /* Dirty hack on Solaris: we are called from strategy, so "ubuf" is in fact kernel memory and is handed straight to nfp_copy_to_dev(). */
+  return nfp_copy_to_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) {
+  /* Dirty hack on Solaris: we are called from strategy, so "ubuf" is in fact kernel memory and is handed straight to nfp_copy_from_dev(). */
+  return nfp_copy_from_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) {
+  if( len & 0x3 || offset & 0x3 ) /* len or offset is not a multiple of 4: copy len single bytes */
+    DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR);
+  else /* both are multiples of 4: copy len / 4 32-bit words instead */
+    /* LINTED: alignment */
+    DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR);
+  return NFP_SUCCESS; /* unconditional: no failure is detected or reported here */
+}
+
+nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) {
+  if( len & 0x3 || offset & 0x3 ) /* len or offset is not a multiple of 4: copy len single bytes */
+    DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR );
+  else /* both are multiples of 4: copy len / 4 32-bit words instead */
+    /* LINTED: alignment */
+    DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR );
+  return NFP_SUCCESS; /* unconditional: no failure is detected or reported here */
+}
+
+/* pci io space access --------------------------------------- */
+
+unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { /* 32-bit read at bar[bar] + offset */
+  nfp_log( NFP_DBG3, "nfp_inl: addr %lx", (unsigned long) ((uintptr_t) pdev->bar[bar] + offset)); /* %lx: the address need not fit in an int */
+  /* LINTED: alignment */
+  return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) );
+}
+
+unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { /* 16-bit read at bar[bar] + offset */
+  nfp_log( NFP_DBG3, "nfp_inw: addr %lx", (unsigned long) ((uintptr_t) pdev->bar[bar] + offset)); /* %lx: the address need not fit in an int */
+  /* LINTED: alignment */
+  return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) );
+}
+
+void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { /* 32-bit write at bar[bar] + offset */
+  nfp_log( NFP_DBG3, "nfp_outl: addr %lx, data %x", (unsigned long) ((uintptr_t) pdev->bar[bar] + offset), data); /* %lx: the address need not fit in an int */
+  /* LINTED: alignment */
+  DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data );
+}
+
+void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { /* 16-bit write at bar[bar] + offset */
+  nfp_log( NFP_DBG3, "nfp_outw: addr %lx, data %x", (unsigned long) ((uintptr_t) pdev->bar[bar] + offset), data); /* label was "nfp_outl"; %lx: the address need not fit in an int */
+  /* LINTED: alignment */
+  DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data );
+}
+
+/* logging ---------------------------------------------------- */
+
+void nfp_log( int level, const char *fmt, ...)
+{
+  char msg[256];
+  va_list args;
+  int muted = 0;
+
+  /* A debug level is emitted only when nfp_debug is at least that level. */
+  switch (level) {
+  case NFP_DBG4: muted = (nfp_debug < 4); break;
+  case NFP_DBG3: muted = (nfp_debug < 3); break;
+  case NFP_DBG2: muted = (nfp_debug < 2); break;
+  case NFP_DBG1: muted = (nfp_debug < 1); break;
+  default: break; /* every other level is always logged */
+  }
+  if (muted)
+    return;
+
+  va_start(args, fmt);
+  (void) vsnprintf(msg, sizeof (msg), fmt, args);
+  va_end(args);
+  cmn_err(CE_CONT, "!" VERSION_COMPNAME " " VERSION_NO ": %s\n", msg);
+}
+
+struct errstr {
+  int oserr; /* OS errno value */
+  nfp_err nferr; /* nfp_err code paired with it */
+};
+
+/* Pairs scanned in order by nfp_error() and nfp_oserr(); the { 0, 0 } row ends the scan. */
+static struct errstr errtab[] = {
+  { EFAULT, NFP_EFAULT },
+  { ENOMEM, NFP_ENOMEM },
+  { EINVAL, NFP_EINVAL },
+  { EIO,    NFP_EIO    },
+  { ENXIO,  NFP_ENXIO  },
+  { ENODEV, NFP_ENODEV  },
+  { EINVAL, NFP_EUNKNOWN }, /* only matched by nfp_oserr(): nfp_error(EINVAL) stops at the NFP_EINVAL row above */
+  { 0, 0 }
+};
+
+nfp_err nfp_error( int oserr )
+{
+  const struct errstr *entry;
+
+  /* Zero means no error; otherwise take the first errtab row that matches. */
+  if (oserr == 0)
+    return 0;
+  for (entry = errtab; entry->nferr; entry++) {
+    if (entry->oserr == oserr)
+      return entry->nferr;
+  }
+  return NFP_EUNKNOWN;
+}
+
+int nfp_oserr( nfp_err nferr )
+{
+  const struct errstr *entry;
+
+  /* Success maps to 0; otherwise use the first errtab row for nferr, else EIO. */
+  if (nferr == NFP_SUCCESS)
+    return 0;
+  for (entry = errtab; entry->nferr; entry++) {
+    if (entry->nferr == nferr)
+      return entry->oserr;
+  }
+  return EIO;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
new file mode 100644
index 0000000000..3f34ec3b58
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -0,0 +1,2184 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay Devices
+ *
+ * Overlay devices provide a means for creating overlay networks, a means of
+ * multiplexing multiple logical, isolated, and discrete layer two and layer
+ * three networks on top of one physical network.
+ *
+ * In general, these overlay devices encapsulate the logic to answer two
+ * different questions:
+ *
+ *   1) How should I transform a packet to put it on the wire?
+ *   2) Where should I send a transformed packet?
+ *
+ * Each overlay device is presented to the user as a GLDv3 device. While the
+ * link itself cannot have an IP interface created on top of it, it allows for
+ * additional GLDv3 devices, such as a VNIC, to be created on top of it which
+ * can be plumbed up with IP interfaces.
+ *
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * The logical overlay device that a user sees in dladm(1M) is a combination of
+ * two different components that work together. The first component is this
+ * kernel module, which is responsible for answering question one -- how should
+ * I transform a packet to put it on the wire.
+ *
+ * The second component is what we call the virtual ARP daemon, or varpd. It is
+ * a userland component that is responsible for answering the second question --
+ * Where should I send a transformed packet. Instances of the kernel overlay
+ * GLDv3 device ask varpd the question of where should a packet go.
+ *
+ * The split was done for a few reasons. Importantly, we wanted to keep the act
+ * of generating encapsulated packets in the kernel so as to ensure that the
+ * general data path was fast and also kept simple. On the flip side, while the
+ * question of where should something go may be simple, it may often be
+ * complicated and need to interface with several different external or
+ * distributed systems. In those cases, it's simpler to allow for the full
+ * flexibility of userland to be brought to bear to solve that problem and in
+ * general, the path isn't very common.
+ *
+ * The following is what makes up the logical overlay device that a user would
+ * create with dladm(1M).
+ *
+ *       Kernel                                     Userland
+ *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
+ *   . +--------+   +--------+  +--------+   .   .                       .
+ *   . | VNIC 0 |   | VNIC 1 |  | VNIC 2 |   .   .                       .
+ *   . +--------+   +--------+  +--------+   .   .                       .
+ *   .     |            |           |        .   .                       .
+ *   .     |            |           |        .   .                       .
+ *   .     +------------+-----------+        .   .                       .
+ *   .                  |              . . /dev/overlay                  .
+ *   .           +--------------+      .     .   .       +------------+  .
+ *   .           |              |      .     .   .       |            |  .
+ *   .           |    Overlay   |======*=================|   Virtual  |  .
+ *   .           | GLDv3 Device |========================| ARP Daemon |  .
+ *   .           |              |            .   .       |            |  .
+ *   .           +--------------+            .   .       +------------+  .
+ *   .                  |                    .   .              |        .
+ *   .                  |                    .   .              |        .
+ *   .           +----------------+          .   .         +--------+    .
+ *   .           |  Overlay       |          .   .         | varpd  |    .
+ *   .           |  Encapsulation |          .   .         | Lookup |    .
+ *   .           |  Plugin        |          .   .         | Plugin |    .
+ *   .           +----------------+          .   .         +--------+    .
+ *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
+ *
+ *
+ * This image shows the two different components and where they live.
+ * Importantly, it also shows that both the kernel overlay device and the
+ * userland varpd both support plugins. The plugins actually implement the
+ * things that users care about and the APIs have been designed to try to
+ * minimize the amount of things that a module writer needs to worry about.
+ *
+ * IDENTIFIERS
+ *
+ * Every overlay device is defined by a unique identifier which is the overlay
+ * identifier. Its purpose is similar to that of a VLAN identifier, it's a
+ * unique number that is used to differentiate between different entries on the
+ * wire.
+ *
+ * ENCAPSULATION
+ *
+ * An overlay encapsulation plugin is a kernel miscellaneous module whose
+ * purpose is to contain knowledge about how to transform packets to put them
+ * onto the wire and to take them off. An example of an encapsulation plugin is
+ * vxlan. It's also how support for things like nvgre or geneve would be brought
+ * into the system.
+ *
+ * Each encapsulation plugin defines a series of operation vectors and
+ * properties. For the full details on everything they should provide, please
+ * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
+ * for telling the system what information is required to send a packet. For
+ * example, vxlan is defined to send everything over a UDP packet and therefore
+ * requires a port and an IP address, while nvgre on the other hand is its own
+ * IP type and therefore just requires an IP address. In addition, it also
+ * provides information about the kind of socket that should be created. This is
+ * used by the kernel multiplexor; more on that in the Kernel Components
+ * section.
+ *
+ * LOOKUPS
+ *
+ * The kernel communicates requests for lookups over the character device
+ * /dev/overlay. varpd is responsible for listening for requests on that device
+ * and answering them. The character device is specific to the target path and
+ * varpd.
+ *
+ * Much as the kernel overlay module handles the bulk of the scaffolding but
+ * leaves the important work to the encapsulation plugin, varpd provides a
+ * similar role and leaves the full brunt of lookups to a userland dynamic
+ * shared object which implements the logic of lookups.
+ *
+ * Each lookup plugin defines a series of operation vectors and properties. For
+ * the full details on everything that they should provide, please read
+ * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
+ * address and asked to give an address on the physical network that it should
+ * be sent to. In addition, they handle questions related to how to handle
+ * things like broadcast and multicast traffic, etc.
+ *
+ * ----------
+ * Properties
+ * ----------
+ *
+ * A device from a dladm perspective has a unique set of properties that are
+ * combined from three different sources:
+ *
+ *   1) Generic properties that every overlay device has
+ *   2) Properties that are specific to the encapsulation plugin
+ *   3) Properties that are specific to the lookup plugin
+ *
+ * All of these are exposed in a single set of properties in dladm. Note that
+ * these are not necessarily traditional link properties. However, if something
+ * is both a traditional GLDv3 link property, say the MTU of a device, and a
+ * specific property here, then the driver ensures that all existing GLDv3
+ * specific means of manipulating it are used and wraps up its private property
+ * interfaces to ensure that works.
+ *
+ * Properties in the second and third category are prefixed with the name of
+ * their module. For example, the vxlan encapsulation module has a property
+ * called the 'listen_ip'. This property would show up in dladm as
+ * 'vxlan/listen_ip'. This allows different plugins to both use similar names
+ * for similar properties and to also have independent name spaces so that
+ * overlapping names do not conflict with anything else.
+ *
+ * While the kernel combines both sets one and two into a single coherent view,
+ * it does not do anything with respect to the properties that are owned by the
+ * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
+ * charge of bridging these two worlds into one magical experience for the user.
+ * It carries the burden of knowing about both overlay specific and varpd
+ * specific properties. Importantly, we want to maintain this distinction. We
+ * don't want to treat the kernel as an arbitrary key/value store for varpd and
+ * we want the kernel to own its own data and not have to ask userland for
+ * information that it owns.
+ *
+ * Every property in the system has the following attributes:
+ *
+ *   o A name
+ *   o A type
+ *   o A size
+ *   o Permissions
+ *   o Default value
+ *   o Valid value ranges
+ *   o A value
+ *
+ * Everything except for the value is obtained by callers through the propinfo
+ * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
+ * currently 256 bytes.
+ *
+ * The following are the supported types of properties:
+ *
+ * 	OVERLAY_PROP_T_INT
+ *
+ * 		A signed integer, its length is 8 bytes, corresponding to a
+ * 		int64_t.
+ *
+ * 	OVERLAY_PROP_T_UINT
+ *
+ * 		An unsigned integer, its length is 8 bytes, corresponding to a
+ * 		uint64_t.
+ *
+ * 	OVERLAY_PROP_T_IP
+ *
+ * 		A struct in6_addr, it has a fixed size.
+ *
+ * 	OVERLAY_PROP_T_STRING
+ *
+ * 		A null-terminated character string encoded in either ASCII or
+ * 		UTF-8. Note that the size of the string includes the null
+ * 		terminator.
+ *
+ * The next thing that we apply to a property is its permission. The permissions
+ * are put together by the bitwise or of the following flags and values.
+ *
+ * 	OVERLAY_PROP_PERM_REQ
+ *
+ * 		This indicates a required property. A property that is required
+ * 		must be set by a consumer before the device can be created. If a
+ * 		required property has a default property, this constraint is
+ * 		loosened because the default property defines the value.
+ *
+ * 	OVERLAY_PROP_PERM_READ
+ *
+ * 		This indicates that a property can be read. All properties will
+ * 		have this value set.
+ *
+ * 	OVERLAY_PROP_PERM_WRITE
+ *
+ * 		This indicates that a property can be written to and thus
+ * 		updated by userland. Properties that are only intended to
+ * 		display information, will not have OVERLAY_PROP_PERM_WRITE set.
+ *
+ * In addition, a few additional values are defined as a convenience to
+ * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
+ * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
+ * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
+ * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
+ * property should generally be a constant across its lifetime.
+ *
+ * A property may optionally have a default value. If it does have a default
+ * value, and that property is not set to be a different value, then the default
+ * value is inherited automatically. It also means that if the default value is
+ * acceptable, there is no need to set the value for a required property. For
+ * example, the vxlan module has the vxlan/listen_port property which is
+ * required, but has a default value of 4789 (the IANA assigned port). Because
+ * of that default value, there is no need for it to be set.
+ *
+ * Finally, a property may declare a list of valid values. These valid values
+ * are used for display purposes, they are not enforced by the broader system,
+ * but merely allow a means for the information to be communicated to the user
+ * through dladm(1M). Like a default value, this is optional.
+ *
+ * The general scaffolding does not do very much with respect to the getting and
+ * setting of properties. That is really owned by the individual plugins
+ * themselves.
+ *
+ * -----------------------------
+ * Destinations and Plugin Types
+ * -----------------------------
+ *
+ * Both encapsulation and lookup plugins define the kinds of destinations that
+ * they know how to support. There are three different pieces of information
+ * that can currently be used to address a destination, all of which are
+ * summarized in the type overlay_point_t. Any combination of these is
+ * supported.
+ *
+ * 	OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * 		An Ethernet MAC address is required.
+ *
+ * 	OVERLAY_PLUGIN_D_IP
+ *
+ * 		An IP address is required. All IP addresses used by the overlay
+ * 		system are transmitted as IPv6 addresses. IPv4 addresses can be
+ * 		represented by using IPv4-mapped IPv6 addresses.
+ *
+ * 	OVERLAY_PLUGIN_D_PORT
+ *
+ * 		A TCP/UDP port is required.
+ *
+ * A kernel encapsulation plugin declares which of these that it requires, it's
+ * a static set. On the other hand, a userland lookup plugin can be built to
+ * support all of these or any combination thereof. It gets passed the required
+ * destination type, based on the kernel encapsulation method, and then it makes
+ * the determination as to whether or not it supports it. For example, the
+ * direct plugin can support either an IP or both an IP and a port, it simply
+ * doesn't display the direct/dest_port property in the cases where a port is
+ * not required to support this.
+ *
+ * The user lookup plugins have two different modes of operation which
+ * determines how they interact with the broader system and how look ups are
+ * performed. These types are:
+ *
+ * 	OVERLAY_TARGET_POINT
+ *
+ * 		A point to point plugin has a single static definition for where
+ * 		to send all traffic. Every packet in the system always gets sent
+ * 		to the exact same destination which is programmed into the
+ * 		kernel when the general device is activated.
+ *
+ * 	OVERLAY_TARGET_DYNAMIC
+ *
+ * 		A dynamic plugin does not have a single static definition.
+ * 		Instead, for each destination, the kernel makes an asynchronous
+ * 		request to varpd to determine where the packet should be routed,
+ * 		and if a specific destination is found, then that destination is
+ * 		cached in the overlay device's target cache.
+ *
+ * This distinction, while important for the general overlay device's operation,
+ * is not important to the encapsulation plugins. They don't need to know about
+ * any of these pieces. It's just a concern for varpd, the userland plugin, and
+ * the general overlay scaffolding.
+ *
+ * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
+ * maintain a target cache, and instead just keeps track of the destination and
+ * always sends encapsulated packets to that address. When the target type is of
+ * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
+ * destinations. These destinations are kept around in an instance of a
+ * reference hash that is specific to the given overlay device. Entries in the
+ * cache can be invalidated and replaced by varpd and its lookup plugins.
+ *
+ * ----------------------------------
+ * Kernel Components and Architecture
+ * ----------------------------------
+ *
+ * There are multiple pieces inside the kernel that work together, there is the
+ * general overlay_dev_t structure, which is the logical GLDv3 device, but it
+ * itself has references to things like an instance of an encapsulation plugin,
+ * a pointer to a mux and a target cache. It can roughly be summarized in the
+ * following image:
+ *
+ *     +------------------+
+ *     | global           |
+ *     | overlay list     |
+ *     | overlay_dev_list |
+ *     +------------------+
+ *        |
+ *        |  +-----------------------+            +---------------+
+ *        +->| GLDv3 Device          |----------->| GLDv3 Device  | -> ...
+ *           | overlay_dev_t         |            | overlay_dev_t |
+ *           |                       |            +---------------+
+ *           |                       |
+ *           | mac_handle_t     -----+---> GLDv3 handle to MAC
+ *           | datalink_id_t    -----+---> Datalink ID used by DLS
+ *           | overlay_dev_flag_t ---+---> Device state
+ *           | uint_t           -----+---> Current device MTU
+ *           | uint_t           -----+---> In-progress RX operations
+ *           | uint_t           -----+---> In-progress TX operations
+ *           | char[]           -----+---> FMA degraded message
+ *           | void *           -----+---> plugin private data
+ *           | overlay_target_t * ---+---------------------+
+ *           | overlay_plugin_t * ---+---------+           |
+ *           +-----------------------+         |           |
+ *                           ^                 |           |
+ *   +--------------------+  |                 |           |
+ *   | Kernel Socket      |  |                 |           |
+ *   | Multiplexor        |  |                 |           |
+ *   | overlay_mux_t      |  |                 |           |
+ *   |                    |  |                 |           |
+ *   | avl_tree_t        -+--+                 |           |
+ *   | uint_t            -+--> socket family   |           |
+ *   | uint_t            -+--> socket type     |           |
+ *   | uint_t            -+--> socket protocol |           |
+ *   | ksocket_t         -+--> I/O socket      |           |
+ *   | struct sockaddr * -+--> ksocket address |           |
+ *   | overlay_plugin_t --+--------+           |           |
+ *   +--------------------+        |           |           |
+ *                                 |           |           |
+ *   +-------------------------+   |           |           |
+ *   | Encap Plugin            |<--+-----------+           |
+ *   | overlay_plugin_t        |                           |
+ *   |                         |                           |
+ *   | char *               ---+--> plugin name            |
+ *   | overlay_plugin_ops_t * -+--> plugin downcalls       |
+ *   | char ** (props)      ---+--> property list          |
+ *   | uint_t               ---+--> id length              |
+ *   | overlay_plugin_flags_t -+--> plugin flags           |
+ *   | overlay_plugin_dest_t --+--> destination type       v
+ *   +-------------------------+                    +-------------------------+
+ *                                                  |   Target Cache          |
+ *                                                  |   overlay_target_t      |
+ *                                                  |                         |
+ *                                    cache mode <--+- overlay_target_mode_t  |
+ *                                     dest type <--+- overlay_plugin_dest_t  |
+ *                                   cache flags <--+- overlay_target_flag_t  |
+ *                                     varpd id  <--+- uint64_t               |
+ *                       outstanding varpd reqs. <--+- uint_t                 |
+ *                   OVERLAY_TARGET_POINT state  <--+- overlay_target_point_t |
+ *               OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t   |
+ *                                              |   +-------------------------+
+ *                      +-----------------------+
+ *                      |
+ *                      v
+ *   +-------------------------------+   +------------------------+
+ *   | Target Entry                  |-->| Target Entry           |--> ...
+ *   | overlay_target_entry_t        |   | overlay_target_entry_t |
+ *   |                               |   +------------------------+
+ *   |                               |
+ *   | overlay_target_entry_flags_t -+--> Entry flags
+ *   | uint8_t[ETHERADDRL]        ---+--> Target MAC address
+ *   | overlay_target_point_t     ---+--> Target underlay address
+ *   | mblk_t *                   ---+--> outstanding mblk head
+ *   | mblk_t *                   ---+--> outstanding mblk tail
+ *   | size_t                     ---+--> outstanding mblk size
+ *   +-------------------------------+
+ *
+ * The primary entries that we care about are the overlay_dev_t, which
+ * correspond to each overlay device that is created with dladm(1M). Globally,
+ * these devices are maintained in a simple list_t which is protected with a
+ * lock.  Hence, these include important information such as the mac_handle_t
+ * and a datalink_id_t which is used to interact with the broader MAC and DLS
+ * ecosystem. We also maintain additional information such as the current state,
+ * outstanding operations, the mtu, and importantly, the plugin's private data.
+ * This is the instance of an encapsulation plugin that gets created as part of
+ * creating an overlay device. Another aspect of this is that the overlay_dev_t
+ * also includes information with respect to FMA. For more information, see the
+ * FMA section.
+ *
+ * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
+ * is the encapsulation plugin. This allows the device to make downcalls into it
+ * based on doing things like getting and setting properties. Otherwise, the
+ * plugin itself is a fairly straightforward entity. They are maintained in an
+ * (not pictured above) list. The plugins themselves mostly maintain things like
+ * the static list of properties, what kind of destination they require, and the
+ * operations vector. A given module may contain more if necessary.
+ *
+ * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
+ * maintains a ksocket and it is through the mux that we send and receive
+ * message blocks. The mux represents a socket type and address, as well as a
+ * plugin. Multiple overlay_dev_t devices may then share the same mux. For
+ * example, consider the case where you have different instances of vxlan all on
+ * the same underlay network. These would all logically share the same IP
+ * address and port that packets are sent and received on; however, what differs
+ * is the decapsulation ID.
+ *
+ * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
+ * a socket, we enable a direct callback on the ksocket. This means that
+ * whenever a message block chain is received, rather than sitting there and
+ * getting a callback in a context and kicking that back out to a taskq, the
+ * data instead comes directly into the callback function overlay_mux_recv().
+ *
+ * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
+ * function) to transmit. It receives encapsulated packets, decapsulates them to
+ * determine the overlay identifier, looks up the given device that matches that
+ * identifier, and then causes the broader MAC world to receive the packet with
+ * a call to mac_rx().
+ *
+ * Today, we don't do too much that's special with the ksocket; however, as
+ * hardware is gaining understanding for these encapsulation protocols, we'll
+ * probably want to think of better ways to get those capabilities passed down
+ * and potentially better ways to program receive filters so they get directly
+ * to us. Though, that's all fantasy future land.
+ *
+ * The next part of the puzzle is the target cache. The purpose of the target
+ * cache is to cache where we should send a packet on the underlay network,
+ * given its mac address. The target cache operates in two modes depending on
+ * whether the lookup module was declared to OVERLAY_TARGET_POINT or
+ * OVERLAY_TARGET_DYNAMIC.
+ *
+ * In the case where the target cache has been programmed to be
+ * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
+ * which has the destination that we send everything, no matter the destination
+ * mac address.
+ *
+ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
+ * are much more interesting and as a result, more complicated. We primarily
+ * store lists of overlay_target_entry_t's which are stored in both an avl tree
+ * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * is only used for a few of the target ioctls used to dump data such that we
+ * can get a consistent iteration order for things like dladm show-overlay -t.
+ * The key that we use for the reference hashtable is based on the mac address
+ * in the cache and currently we just do a simple CRC32 to transform it into a
+ * hash.
+ *
+ * Each entry maintains a set of flags to indicate the current status of the
+ * request. The flags may indicate one of three states: that current cache entry
+ * is valid, that the current cache entry has been directed to drop all output,
+ * and that the current cache entry is invalid and may be being looked up. In
+ * the case where it's valid, we just take the destination address and run with
+ * it.
+ *
+ * If it's invalid and a lookup has not been made, then we start the process
+ * that prepares a query that will make its way up to varpd. The cache entry
+ * maintains a message block chain of outstanding message blocks and a
+ * size. These lists are populated only when we don't know the answer as to
+ * where should these be sent. The size entry is used to cap the amount of
+ * outstanding data that we don't know the answer to. If we exceed a cap on the
+ * amount of outstanding data (currently 1 Mb), then we'll drop any additional
+ * packets. Once we get an answer indicating a valid destination, we transmit
+ * any outstanding data to that place. The full story on how we look that up
+ * is discussed in the section on the Target Cache Life Cycle.
+ *
+ * ------------------------
+ * FMA and Degraded Devices
+ * ------------------------
+ *
+ * Every kernel overlay device keeps track of its FMA state. Today in FMA we
+ * cannot represent partitions between resources nor can we represent that a
+ * given minor node of a pseudo device has failed -- if we degrade the overlay
+ * device, then the entire dev_info_t is degraded. However, we still want to be
+ * able to indicate to administrators that things may go wrong.
+ *
+ * To this end, we've added a notion of a degraded state to every overlay
+ * device. This state is primarily dictated by userland and it can happen for
+ * various reasons. Generally, because a userland lookup plugin has been
+ * partitioned, or something has gone wrong such that there is no longer any
+ * userland lookup module for a device, then we'll mark it degraded.
+ *
+ * As long as any of our minor instances is degraded, then we'll fire off the
+ * FMA event to note that. Once the last degraded instance is no longer
+ * degraded, then we'll end up telling FMA that we're all clean.
+ *
+ * To help administrators get a better sense of which of the various minor
+ * devices is wrong, we store the odd_fmamsg[] character array. This character
+ * array can be fetched by running dladm show-overlay -f.
+ *
+ * Note, that it's important that we do not update the link status of the
+ * devices. We want to remain up as much as possible. By changing the link in a
+ * degraded state, this may end up making things worse. We may still actually
+ * have information in the target cache and if we mark the link down, that'll
+ * result in not being able to use it. The reason being that this'll mark all
+ * the downstream VNICs down which will go to IP and from there we end up
+ * dealing with sadness.
+ *
+ * -----------------------
+ * Target Cache Life Cycle
+ * -----------------------
+ *
+ * This section only applies when we have a lookup plugin of
+ * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
+ * OVERLAY_TARGET_POINT.
+ *
+ * While we got into the target cache in the general architecture section, it's
+ * worth going into more details as to how this actually works and showing some
+ * examples and state machines. Recall that a target cache entry basically has
+ * the following state transition diagram:
+ *
+ * Initial state
+ *    . . .           . . . first access       . . . varpd lookup enqueued
+ *        .           .                        .
+ *        .           .                        .
+ *     +-------+      .     +----------+       .
+ *     |  No   |------*---->| Invalid  |-------*----+
+ *     | Entry |            |  Entry   |            |
+ *     +-------+            +----------+            |
+ *                 varpd      ^      ^   varpd      |
+ *                 invalidate |      |   drop       |
+ *                      . . . *      * . .          v
+ *          +-------+         |      |         +---------+
+ *          | Entry |--->-----+      +----<----| Entry   |
+ *          | Valid |<----------*---------<----| Pending |->-+     varpd
+ *          +-------+           .              +---------+   * . . drop, but
+ *                              . varpd                ^     |     other queued
+ *                              . success              |     |     entries
+ *                                                     +-----+
+ *
+ * When the table is first created, it is empty. As we attempt to lookup entries
+ * and we find there is no entry at all, we'll create a new table entry for it.
+ * At that point the entry is technically in an invalid state, that means that
+ * we have no valid data from varpd. In that case, we'll go ahead and queue the
+ * packet into the entry's pending chain, and queue a varpd lookup, setting the
+ * OVERLAY_ENTRY_F_PENDING flag in the process.
+ *
+ * If additional mblk_t's come in for this entry, we end up appending them to
+ * the tail of the chain, if and only if, we don't exceed the threshold for the
+ * amount of space they can take up. An entry remains pending until we get a
+ * varpd reply. If varpd replies with a valid result, we move to the valid
+ * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
+ * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
+ *
+ * Once an entry is valid, it stays valid until user land tells us to invalidate
+ * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
+ * OVERLAY_TARG_CACHE_SET respectively.
+ *
+ * If the lookup fails with a call to drop the packet, then the next state is
+ * determined by the state of the queue. If the set of outstanding entries is
+ * empty, then we just transition back to the invalid state. If instead, the
+ * set of outstanding entries is not empty, then we'll queue another entry and
+ * stay in the same state, repeating this until the number of requests is
+ * drained.
+ *
+ * The following images describe the flow of a given lookup and where the
+ * overlay_target_entry_t is at any given time.
+ *
+ *     +-------------------+
+ *     | Invalid Entry     |		An entry starts off as an invalid entry
+ *     | de:ad:be:ef:00:00 |		and only exists in the target cache.
+ *     +-------------------+
+ *
+ * 	~~~~
+ *
+ *     +---------------------+
+ *     | Global list_t       |		A mblk_t comes in for an entry. We
+ *     | overlay_target_list |		append it to the overlay_target_list.
+ *     +---------------------+
+ *                   |
+ *                   v
+ *             +-------------------+      +-------------------+
+ *             | Pending Entry     |----->| Pending Entry     |--->...
+ *             | 42:5e:1a:10:d6:2d |      | de:ad:be:ef:00:00 |
+ *             +-------------------+      +-------------------+
+ *
+ * 	~~~~
+ *
+ *     +--------------------------+
+ *     | /dev/overlay minor state |	User land said that it would look up an
+ *     | overlay_target_hdl_t     |	entry for us. We remove it from the
+ *     +--------------------------+	global list and add it to the handle's
+ *                  |			outstanding list.
+ *                  |
+ *                  v
+ *            +-------------------+      +-------------------+
+ *            | Pending Entry     |----->| Pending Entry     |
+ *            | 90:b8:d0:79:02:dd |      | de:ad:be:ef:00:00 |
+ *            +-------------------+      +-------------------+
+ *
+ * 	~~~~
+ *
+ *     +-------------------+
+ *     | Valid Entry       |		varpd returned an answer with
+ *     | de:ad:be:ef:00:00 |		OVERLAY_IOC_RESPOND and the target cache
+ *     | 10.169.23.42:4789 |		entry is now populated with a
+ *     +-------------------+		destination and marked as valid
+ *
+ *
+ * The lookup mechanism is performed via a series of operations on the character
+ * pseudo-device /dev/overlay. The only thing that uses this device is the
+ * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
+ * granting a new minor number which maintains its own state. We maintain this
+ * state so that way if an outstanding lookup was queued to something that
+ * crashed or closed its handle without responding, we can know about this and
+ * thus handle it appropriately.
+ *
+ * When a lookup is first created it's added to our global list of outstanding
+ * lookups. To service requests, userland is required to perform an ioctl to ask
+ * for a request. We will block it in the kernel a set amount of time waiting
+ * for a request. When we give a request to a given minor instance of the
+ * device, we remove it from the global list and append the request to the
+ * device's list of outstanding entries, for the reasons we discussed above.
+ * When a lookup comes in, we give user land a smaller amount of information
+ * specific to that packet, the overlay_targ_lookup_t. It includes a request id
+ * to identify this, and then the overlay id, the varpd id, the header and
+ * packet size, the source and destination mac address, the SAP, and any
+ * potential VLAN header.
+ *
+ * At that point, it stays in that outstanding list until one of two ioctls are
+ * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
+ * userland may also perform other operations. For example, it may use
+ * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
+ * analysis of what to do beyond what we gave it initially. This is useful for
+ * providing proxy arp and the like. Finally, there are two other ioctls that
+ * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
+ * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
+ * causes us to encapsulate and send out the packet they've given us.
+ *
+ *
+ * Finally, through the target cache, several ioctls are provided to allow for
+ * interrogation and management of the cache. They allow for individual entries
+ * to be retrieved, set, or have the entire table flushed. For the full set of
+ * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
+ *
+ * ------------------
+ * Sample Packet Flow
+ * ------------------
+ *
+ * There are a lot of pieces here; hopefully an example of how this all fits
+ * together will help clarify and elucidate what's going on. We're going to
+ * first track an outgoing packet, e.g. one that is sent from an IP interface on
+ * a VNIC on top of an overlay device, and then we'll look at what it means to
+ * respond to that.
+ *
+ *
+ *    +----------------+        +--------------+            +------------------+
+ *    | IP/DLS send    |------->| MAC sends it |----------->| mblk_t reaches   |
+ *    | packet to MAC  |        | to the GLDv3 |            | overlay GLDv3 tx |
+ *    +----------------+        | VNIC device  |            | overlay_m_tx()   |
+ *                              +--------------+            +------------------+
+ *                                                                   |
+ *                             . lookup              . cache         |
+ *                             . drop                . miss          v
+ *            +---------+      .       +--------+    .      +------------------+
+ *            | freemsg |<-----*-------| varpd  |<---*------| Lookup each mblk |
+ *            | mblk_t  |              | lookup |           | in the target    |
+ *            +---------+              | queued |           | cache            |
+ *                ^                    +--------+           +------------------+
+ *      on send   |                        |                         |     cache
+ *      error . . *                        *. . lookup               * . . hit
+ *                |                        |    success              v
+ *                |                        |                +------------------+
+ *    +-----------------+                  +--------------->| call plugin      |
+ *    | Send out        |                                   | ovpo_encap() to  |
+ *    | overlay_mux_t's |<----------------------------------| get encap mblk_t |
+ *    | ksocket         |                                   +------------------+
+ *    +-----------------+
+ *
+ * The receive end point looks a little different and looks more like:
+ *
+ *  +------------------+     +----------------+    +-----------+
+ *  | mblk_t comes off |---->| enter netstack |--->| delivered |---+
+ *  | the physical     |     | IP stack       |    |     to    |   * . . direct
+ *  | device           |     +----------------+    |  ksocket  |   |   callback
+ *  +------------------+                           +-----------+   |
+ *                       . overlay id                              |
+ *                       . not found                               v
+ *       +-----------+   .      +-----------------+       +--------------------+
+ *       | freemsg   |<--*------| call plugin     |<------| overlay_mux_recv() |
+ *       | mblk_t    |          | ovpo_decap() to |       +--------------------+
+ *       +-----------+          | decap mblk_t    |
+ *                              +-----------------+
+ *                                     |
+ *                                     * . . overlay id
+ *                                     v     found
+ *                                 +--------+      +----------------+
+ *                                 | adjust |----->| call mac_rx    |
+ *                                 | mblk_t |      | on original    |
+ *                                 +--------+      | decaped packet |
+ *                                                 +----------------+
+ *
+ * ------------------
+ * Netstack Awareness
+ * ------------------
+ *
+ * In the above image we note that this enters a netstack. Today the only
+ * netstack that can be is the global zone as the overlay driver itself is not
+ * exactly netstack aware. What this really means is that varpd cannot run in a
+ * non-global zone and an overlay device cannot belong to a non-global zone.
+ * Non-global zones can still have a VNIC assigned to them that's been created
+ * over the overlay device the same way they would if it had been created over
+ * an etherstub or a physical device.
+ *
+ * The majority of the work to make it netstack aware is straightforward and the
+ * biggest thing is to create a netstack module that allows us to hook into
+ * netstack (and thus zone) creation and destruction.  From there, we need to
+ * amend the target cache lookup routines that we discussed earlier to not have
+ * a global outstanding list and a global list of handles, but rather, one per
+ * netstack.
+ *
+ * For the mux, we'll need to open the ksocket in the context of the zone, we
+ * can likely do this with a properly composed credential, but we'll need to do
+ * some more work on that path. Finally, we'll want to make sure the dld ioctls
+ * are aware of the zoneid of the caller and we use that appropriately and store
+ * it in the overlay_dev_t.
+ *
+ * -----------
+ * GLDv3 Notes
+ * -----------
+ *
+ * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
+ * relevant and other parts are much less relevant for us. For example, the
+ * GLDv3 is used to toggle the device being put into and out of promiscuous
+ * mode, to program MAC addresses for unicast and multicast hardware filters.
+ * Today, an overlay device doesn't have a notion of promiscuous mode nor does
+ * it have a notion of unicast and multicast addresses programmed into the
+ * device. Instead, for the purposes of the hardware filter, we don't do
+ * anything and just always accept new addresses being added and removed.
+ *
+ * If the GLDv3 start function has not been called, then we will not use this
+ * device for I/O purposes. Any calls to transmit or receive should be dropped,
+ * though the GLDv3 guarantees us that transmit will not be called without
+ * calling start. Similarly, once stop is called, then no packets can be dealt
+ * with.
+ *
+ * Today we don't support the stat interfaces, though there's no good reason
+ * that we shouldn't assemble some of the stats based on what we have in the
+ * future.
+ *
+ * When it comes to link properties, many of the traditional link properties do
+ * not apply and many others MAC handles for us. For example, we don't need to
+ * implement anything for overlay_m_getprop() to deal with returning the MTU, as
+ * MAC never calls into us for that. As such, there isn't much of anything to
+ * support in terms of properties.
+ *
+ * Today, we don't support any notion of hardware capabilities. However, if
+ * future NIC hardware or other changes to the system cause it to make sense for
+ * us to emulate logical groups, then we should do that. However, we still do
+ * implement a capab function so that we can identify ourselves as an overlay
+ * device to the broader MAC framework. This is done mostly so that a device
+ * created on top of us can have fanout rings as we don't try to lie about a
+ * speed for our device.
+ *
+ * The other question is what should be done for a device's MTU and margin. We
+ * set our minimum supported MTU to be the minimum value that an IP network may
+ * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
+ * have our upper bound set to 8900. This value comes from the fact that a lot
+ * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
+ * bytes, which isn't exactly the most accurate number, but it'll be good enough
+ * for now. Because of that, our default MTU off of these devices is 1400, as
+ * the default MTU for everything is usually 1500 or whatever the underlying
+ * device is at; however, this is a bit simpler than asking the netstack what
+ * are all the IP interfaces at. It also calls into question how PMTU and PMTU
+ * discovery should work here. The challenge, especially for
+ * OVERLAY_TARGET_DYNAMIC is that the MTU to any given place will vary and it's
+ * not clear that if you have a single bad entry that the overall MTU should be
+ * lowered. Instead, we should figure out a better way of determining these
+ * kinds of PMTU errors and appropriately alerting the administrator via FMA.
+ *
+ * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
+ * or not the underlying encapsulation device supports VLAN tags. If it does,
+ * then we'll set the margin to allow for it, otherwise, we will not.
+ */
+
+#include <sys/conf.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/policy.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/ddifm.h>
+
+#include <sys/dls.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_ether.h>
+#include <sys/vlan.h>
+
+#include <sys/overlay_impl.h>
+
+/* The driver's dev_info_t; handed to MAC as m_dip when a device registers. */
+dev_info_t *overlay_dip;
+/* Protects overlay_dev_list. */
+static kmutex_t overlay_dev_lock;
+/* Every overlay_dev_t that overlay_i_create() has successfully created. */
+static list_t overlay_dev_list;
+/* All-zero MAC address that is given to MAC as each device's source address. */
+static uint8_t overlay_macaddr[ETHERADDRL] =
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+/*
+ * Identifiers for the device-level properties. The values are listed in the
+ * same order as the names in overlay_dev_props[] and so appear to be intended
+ * as indices into that table.
+ */
+typedef enum overlay_dev_prop {
+	OVERLAY_DEV_P_MTU = 0,
+	OVERLAY_DEV_P_VNETID,
+	OVERLAY_DEV_P_ENCAP,
+	OVERLAY_DEV_P_VARPDID
+} overlay_dev_prop_t;
+
+/* Number of entries in overlay_dev_props[]. */
+#define	OVERLAY_DEV_NPROPS	4
+/* Names of the device-level properties. */
+static const char *overlay_dev_props[] = {
+	"mtu",
+	"vnetid",
+	"encap",
+	"varpd/id"
+};
+
+/*
+ * Bounds and default for a device's MTU. The reasoning behind these values is
+ * covered in the GLDv3 Notes section of the big theory statement.
+ */
+#define	OVERLAY_MTU_MIN	576
+#define	OVERLAY_MTU_DEF	1400
+#define	OVERLAY_MTU_MAX	8900
+
+/*
+ * Find the overlay device whose datalink id is 'id' and take a hold on it by
+ * bumping odd_ref. Returns the held device, or NULL if no device on
+ * overlay_dev_list has that id. A successful hold must be dropped with
+ * overlay_hold_rele().
+ */
+overlay_dev_t *
+overlay_hold_by_dlid(datalink_id_t id)
+{
+	overlay_dev_t *odd;
+
+	mutex_enter(&overlay_dev_lock);
+	odd = list_head(&overlay_dev_list);
+	while (odd != NULL && odd->odd_linkid != id)
+		odd = list_next(&overlay_dev_list, odd);
+
+	if (odd != NULL) {
+		/* Take the hold before the list lock is dropped. */
+		mutex_enter(&odd->odd_lock);
+		odd->odd_ref++;
+		mutex_exit(&odd->odd_lock);
+	}
+	mutex_exit(&overlay_dev_lock);
+
+	return (odd);
+}
+
+/*
+ * Drop a hold on an overlay device by decrementing odd_ref under odd_lock.
+ * The caller must currently have a hold on the device.
+ */
+void
+overlay_hold_rele(overlay_dev_t *odd)
+{
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_ref > 0);
+	odd->odd_ref -= 1;
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * Note that an I/O operation is beginning on the device. 'flag' must be
+ * exactly one of OVERLAY_F_IN_RX or OVERLAY_F_IN_TX. The matching in-flight
+ * counter is incremented and the flag is set in odd_flags. The caller must
+ * hold odd_lock and must pair this with a call to overlay_io_done().
+ */
+void
+overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	odd->odd_flags |= flag;
+	if ((flag & OVERLAY_F_IN_RX) != 0)
+		odd->odd_rxcount++;
+	if ((flag & OVERLAY_F_IN_TX) != 0)
+		odd->odd_txcount++;
+}
+
+/*
+ * Note that an I/O operation started with overlay_io_start() has finished.
+ * The matching in-flight counter is decremented. When a counter reaches zero
+ * its flag is cleared from odd_flags and anyone blocked on odd_iowait is
+ * woken up. The caller must hold odd_lock.
+ */
+void
+overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	boolean_t drained = B_FALSE;
+
+	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	if ((flag & OVERLAY_F_IN_RX) != 0) {
+		ASSERT(odd->odd_rxcount > 0);
+		if (--odd->odd_rxcount == 0) {
+			odd->odd_flags &= ~OVERLAY_F_IN_RX;
+			drained = B_TRUE;
+		}
+	}
+
+	if ((flag & OVERLAY_F_IN_TX) != 0) {
+		ASSERT(odd->odd_txcount > 0);
+		if (--odd->odd_txcount == 0) {
+			odd->odd_flags &= ~OVERLAY_F_IN_TX;
+			drained = B_TRUE;
+		}
+	}
+
+	/* Only wake waiters when a direction has actually gone idle. */
+	if (drained == B_TRUE)
+		cv_broadcast(&odd->odd_iowait);
+}
+
+/*
+ * Block until none of the I/O flags in 'flag' remain set in odd_flags. 'flag'
+ * may only contain bits from OVERLAY_F_IOMASK. The caller must hold odd_lock;
+ * it is dropped while sleeping on odd_iowait and is held again on return.
+ */
+static void
+overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	while ((odd->odd_flags & flag) != 0)
+		cv_wait(&odd->odd_iowait, &odd->odd_lock);
+}
+
+/*
+ * Call func(odd, arg) on each overlay device in list order while holding
+ * overlay_dev_lock. The walk stops early as soon as func returns non-zero.
+ */
+void
+overlay_dev_iter(overlay_dev_iter_f func, void *arg)
+{
+	overlay_dev_t *odd;
+
+	mutex_enter(&overlay_dev_lock);
+	odd = list_head(&overlay_dev_list);
+	while (odd != NULL) {
+		if (func(odd, arg) != 0)
+			break;
+		odd = list_next(&overlay_dev_list, odd);
+	}
+	mutex_exit(&overlay_dev_lock);
+}
+
+/*
+ * GLDv3 statistics entry point. We don't provide any statistics today, so
+ * every request fails with ENOTSUP.
+ */
+/* ARGSUSED */
+static int
+overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+	return (ENOTSUP);
+}
+
+/*
+ * GLDv3 start entry point. Asks the encapsulation plugin what kind of socket
+ * it needs, opens (or shares) the corresponding mux, and attaches this device
+ * to it, marking the device OVERLAY_F_IN_MUX.
+ *
+ * Returns EAGAIN if the device has not been activated yet, otherwise the
+ * error from the plugin's ovpo_socket() or from overlay_mux_open(), or 0 on
+ * success.
+ */
+static int
+overlay_m_start(void *arg)
+{
+	overlay_dev_t *odd = arg;
+	overlay_mux_t *mux;
+	struct sockaddr_storage ss;
+	struct sockaddr *sa = (struct sockaddr *)&ss;
+	socklen_t salen;
+	int domain, family, prot, err;
+	boolean_t activated;
+
+	mutex_enter(&odd->odd_lock);
+	activated = (odd->odd_flags & OVERLAY_F_ACTIVATED) != 0;
+	mutex_exit(&odd->odd_lock);
+	if (activated == B_FALSE)
+		return (EAGAIN);
+
+	err = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
+	    &family, &prot, sa, &salen);
+	if (err != 0)
+		return (err);
+
+	mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, sa,
+	    salen, &err);
+	if (mux == NULL)
+		return (err);
+
+	overlay_mux_add_dev(mux, odd);
+	odd->odd_mux = mux;
+
+	mutex_enter(&odd->odd_lock);
+	ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
+	odd->odd_flags |= OVERLAY_F_IN_MUX;
+	mutex_exit(&odd->odd_lock);
+
+	return (0);
+}
+
+/*
+ * GLDv3 stop entry point. Sets OVERLAY_F_MDDROP so that new transmits are
+ * dropped, waits for all in-flight I/O to finish, and then detaches the
+ * device from its mux and closes it.
+ */
+static void
+overlay_m_stop(void *arg)
+{
+	overlay_dev_t *odd = arg;
+
+	/*
+	 * The MAC perimeter is held here, so we don't have to worry about
+	 * synchronizing this with respect to metadata operations.
+	 */
+	mutex_enter(&odd->odd_lock);
+	VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
+	VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	/* No I/O is in flight any more; it's safe to let go of the mux. */
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	overlay_mux_close(odd->odd_mux);
+	odd->odd_mux = NULL;
+
+	mutex_enter(&odd->odd_lock);
+	odd->odd_flags &= ~(OVERLAY_F_IN_MUX | OVERLAY_F_MDDROP);
+	VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * GLDv3 promiscuous mode entry point. An overlay device has no notion of
+ * promiscuous mode, so there is nothing to toggle and we always report
+ * success. For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_promisc(void *arg, boolean_t on)
+{
+	return (0);
+}
+
+/*
+ * GLDv3 multicast filter entry point. An overlay device doesn't program
+ * multicast addresses anywhere, so every add or remove request is accepted
+ * without doing anything. For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+	return (0);
+}
+
+/*
+ * GLDv3 unicast address entry point. An overlay device doesn't program
+ * unicast addresses anywhere, so every request is accepted without doing
+ * anything. For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_unicast(void *arg, const uint8_t *macaddr)
+{
+	return (0);
+}
+
+/*
+ * GLDv3 transmit entry point. Walks the chain of outgoing packets and, for
+ * each one, looks up its destination in the target cache, has the plugin
+ * build the encapsulation header, and hands the result to the mux.
+ *
+ * If the device is being stopped (OVERLAY_F_MDDROP) or is not attached to a
+ * mux, the whole chain is freed and NULL is returned. Otherwise the return
+ * value is whatever part of the chain had not been looked at yet when we
+ * stopped: NULL when every packet was handled, or the remaining packets when
+ * encapsulation or the mux transmit failed part way through.
+ */
+mblk_t *
+overlay_m_tx(void *arg, mblk_t *mp_chain)
+{
+	overlay_dev_t *odd = arg;
+	mblk_t *mp, *ep;
+	int ret;
+	ovep_encap_info_t einfo;
+	struct msghdr hdr;
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+	    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+		mutex_exit(&odd->odd_lock);
+		freemsgchain(mp_chain);
+		return (NULL);
+	}
+	overlay_io_start(odd, OVERLAY_F_IN_TX);
+	mutex_exit(&odd->odd_lock);
+
+	bzero(&hdr, sizeof (struct msghdr));
+	bzero(&einfo, sizeof (ovep_encap_info_t));
+	einfo.ovdi_id = odd->odd_vid;
+
+	/*
+	 * mp_chain always points at the part of the chain that we have not
+	 * yet detached, which is why the loop advances with mp = mp_chain.
+	 */
+	for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+		socklen_t slen;
+		struct sockaddr_storage storage;
+
+		mp_chain = mp->b_next;
+		mp->b_next = NULL;
+		ep = NULL;
+
+		ret = overlay_target_lookup(odd, mp,
+		    (struct sockaddr *)&storage, &slen);
+		if (ret != OVERLAY_TARGET_OK) {
+			/*
+			 * We only free the packet when told to drop it. For
+			 * any other result we leave it alone and move on to
+			 * the next packet.
+			 */
+			if (ret == OVERLAY_TARGET_DROP)
+				freemsg(mp);
+			continue;
+		}
+
+		hdr.msg_name = &storage;
+		hdr.msg_namelen = slen;
+
+		ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
+		    &einfo, &ep);
+		if (ret != 0 || ep == NULL) {
+			freemsg(mp);
+			break;
+		}
+
+		/* Prepend the encapsulation header to the original packet. */
+		ep->b_cont = mp;
+		if (overlay_mux_tx(odd->odd_mux, &hdr, ep) != 0)
+			break;
+	}
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_done(odd, OVERLAY_F_IN_TX);
+	mutex_exit(&odd->odd_lock);
+	return (mp_chain);
+}
+
+/*
+ * GLDv3 ioctl entry point. We don't implement any ioctls through this path,
+ * so the message is negatively acknowledged with ENOTSUP.
+ */
+/* ARGSUSED */
+static void
+overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+	miocnak(q, mp, 0, ENOTSUP);
+}
+
+/*
+ * GLDv3 capability entry point. The only capability we claim is
+ * MAC_CAPAB_OVERLAY, which tells MAC that we're an overlay device; everything
+ * else is reported as unsupported.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+	return (cap == MAC_CAPAB_OVERLAY ? B_TRUE : B_FALSE);
+}
+
+/*
+ * GLDv3 set-property entry point. The only property that can be set is the
+ * MTU; anything else fails with ENOTSUP. An MTU outside of
+ * [OVERLAY_MTU_MIN, OVERLAY_MTU_MAX] fails with EINVAL. Otherwise the new
+ * value is recorded in odd_mtu and pushed to MAC as the maximum SDU; if MAC
+ * rejects it, the previous MTU is restored and MAC's error is returned.
+ */
+/* ARGSUSED */
+static int
+overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, const void *pr_val)
+{
+	overlay_dev_t *odd = arg;
+	uint32_t newmtu, prevmtu;
+	int ret;
+
+	if (pr_num != MAC_PROP_MTU)
+		return (ENOTSUP);
+
+	bcopy(pr_val, &newmtu, sizeof (newmtu));
+	if (newmtu < OVERLAY_MTU_MIN || newmtu > OVERLAY_MTU_MAX)
+		return (EINVAL);
+
+	mutex_enter(&odd->odd_lock);
+	prevmtu = odd->odd_mtu;
+	odd->odd_mtu = newmtu;
+	if ((ret = mac_maxsdu_update(odd->odd_mh, newmtu)) != 0) {
+		/* MAC refused the new SDU; put the old MTU back. */
+		odd->odd_mtu = prevmtu;
+	}
+	mutex_exit(&odd->odd_lock);
+
+	return (ret);
+}
+
+/*
+ * GLDv3 get-property entry point. No property can be read through here, so
+ * every request fails with ENOTSUP. As the big theory statement notes, MAC
+ * never calls into us for the MTU.
+ */
+/* ARGSUSED */
+static int
+overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, void *pr_val)
+{
+	return (ENOTSUP);
+}
+
+/*
+ * GLDv3 property-info entry point. We only describe the MTU, reporting its
+ * default value and the range of values that overlay_m_setprop() accepts.
+ * Nothing is filled in for any other property.
+ */
+/* ARGSUSED */
+static void
+overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    mac_prop_info_handle_t prh)
+{
+	if (pr_num == MAC_PROP_MTU) {
+		mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
+		mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN,
+		    OVERLAY_MTU_MAX);
+	}
+}
+
+/*
+ * The GLDv3 entry points that every overlay device registers with MAC.
+ * mc_callbacks advertises which of the optional entry points we supply: ioctl,
+ * getcapab, setprop, getprop and propinfo.
+ */
+static mac_callbacks_t overlay_m_callbacks = {
+	.mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
+	    MC_PROPINFO),
+	.mc_getstat = overlay_m_stat,
+	.mc_start = overlay_m_start,
+	.mc_stop = overlay_m_stop,
+	.mc_setpromisc = overlay_m_promisc,
+	.mc_multicst = overlay_m_multicast,
+	.mc_unicst = overlay_m_unicast,
+	.mc_tx = overlay_m_tx,
+	.mc_ioctl = overlay_m_ioctl,
+	.mc_getcapab = overlay_m_getcapab,
+	.mc_getprop = overlay_m_getprop,
+	.mc_setprop = overlay_m_setprop,
+	.mc_propinfo = overlay_m_propinfo
+};
+
+/*
+ * Check that 'name', held in a buffer of 'buflen' bytes, is usable as a name.
+ * To be valid it must be non-empty, be NUL-terminated within the buffer, not
+ * contain a '/', and be valid UTF-8. Returns B_TRUE if so, B_FALSE otherwise.
+ */
+static boolean_t
+overlay_valid_name(const char *name, size_t buflen)
+{
+	size_t len;
+	int u8err;
+
+	/* Find the terminator without ever reading past the buffer. */
+	for (len = 0; len < buflen; len++) {
+		if (name[len] == '\0')
+			break;
+	}
+
+	/* Reject the empty string and anything that isn't terminated. */
+	if (len == 0 || len == buflen)
+		return (B_FALSE);
+
+	/* We now know the string is terminated, so strchr() is safe. */
+	if (strchr(name, '/') != NULL)
+		return (B_FALSE);
+
+	if (u8_validate((char *)name, len, NULL, U8_VALIDATE_ENTIRE,
+	    &u8err) < 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Throw away a device that overlay_i_create() could not finish creating:
+ * finalize the plugin's private state, release the plugin that was obtained
+ * from overlay_plugin_lookup(), and free the device itself. The device must
+ * not be on overlay_dev_list.
+ */
+static void
+overlay_i_create_cleanup(overlay_dev_t *odd)
+{
+	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+	overlay_plugin_rele(odd->odd_plugin);
+	kmem_free(odd, sizeof (overlay_dev_t));
+}
+
+/*
+ * ioctl handler that creates the overlay device described by the
+ * overlay_ioc_create_t in 'karg'. We look up and initialize the requested
+ * encapsulation plugin, validate the virtual network id against the size of
+ * id that the plugin supports, register the device with MAC, create its
+ * devnet, and finally add it to overlay_dev_list.
+ *
+ * Returns 0 on success. Otherwise returns:
+ *
+ *	EINVAL	the encapsulation name is invalid, the plugin failed to
+ *		initialize, the virtual network id is too large for the
+ *		plugin, or mac_alloc() failed
+ *	ENOENT	there is no plugin with the requested name
+ *	EEXIST	the link id is already in use, or another device using the
+ *		same plugin already has this virtual network id
+ *
+ * or whatever error mac_register() or dls_devnet_create() returned.
+ */
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int err;
+	uint64_t maxid;
+	overlay_dev_t *odd, *o;
+	mac_register_t *mac;
+	overlay_ioc_create_t *oicp = karg;
+
+	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+		return (EINVAL);
+
+	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+	odd->odd_linkid = oicp->oic_linkid;
+	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+	if (odd->odd_plugin == NULL) {
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (ENOENT);
+	}
+	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+	    &odd->odd_pvoid);
+	if (err != 0) {
+		overlay_i_create_cleanup(odd);
+		return (EINVAL);
+	}
+
+	/*
+	 * Make sure that our virtual network id is valid for the given plugin
+	 * that we're working with. A plugin with a full 8-byte id is handled
+	 * separately because shifting a 64-bit value by 64 bits is undefined.
+	 */
+	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+	maxid = UINT64_MAX;
+	if (odd->odd_plugin->ovp_id_size != 8)
+		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+	if (oicp->oic_vnetid > maxid) {
+		overlay_i_create_cleanup(odd);
+		return (EINVAL);
+	}
+	odd->odd_vid = oicp->oic_vnetid;
+
+	/*
+	 * Note that overlay_dev_lock is not held yet, so there is nothing to
+	 * drop if this fails.
+	 */
+	mac = mac_alloc(MAC_VERSION);
+	if (mac == NULL) {
+		overlay_i_create_cleanup(odd);
+		return (EINVAL);
+	}
+
+	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	mac->m_driver = odd;
+	mac->m_dip = overlay_dip;
+	mac->m_dst_addr = NULL;
+	mac->m_callbacks = &overlay_m_callbacks;
+	mac->m_pdata = NULL;
+	mac->m_pdata_size = 0;
+
+	mac->m_priv_props = NULL;
+
+	/* Let mac handle this itself. */
+	mac->m_instance = (uint_t)-1;
+
+	/*
+	 * There is no real source address that should be used here, but saying
+	 * that we're not ethernet is going to cause its own problems. At the
+	 * end of the day, this is fine.
+	 */
+	mac->m_src_addr = overlay_macaddr;
+
+	/*
+	 * Start with the default MTU as the max SDU. If the MTU is changed, the
+	 * SDU will be changed to reflect that.
+	 */
+	mac->m_min_sdu = 1;
+	mac->m_max_sdu = OVERLAY_MTU_DEF;
+	mac->m_multicast_sdu = 0;
+
+	/*
+	 * The underlying device doesn't matter, instead this comes from the
+	 * encapsulation protocol and whether or not they allow VLAN tags.
+	 */
+	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+		mac->m_margin = VLAN_TAGSZ;
+	} else {
+		mac->m_margin = 0;
+	}
+
+	/*
+	 * Today, we have no MAC virtualization, it may make sense in the future
+	 * to go ahead and emulate some subset of this, but it doesn't today.
+	 */
+	mac->m_v12n = MAC_VIRT_NONE;
+
+	/*
+	 * Refuse to create a device whose link id is already taken or whose
+	 * (plugin, virtual network id) pair duplicates an existing device. The
+	 * mac_register_t has not been handed to MAC yet, so we must free it
+	 * ourselves on these paths.
+	 */
+	mutex_enter(&overlay_dev_lock);
+	for (o = list_head(&overlay_dev_list); o != NULL;
+	    o = list_next(&overlay_dev_list, o)) {
+		if (o->odd_linkid == oicp->oic_linkid ||
+		    (o->odd_vid == oicp->oic_vnetid &&
+		    o->odd_plugin == odd->odd_plugin)) {
+			mutex_exit(&overlay_dev_lock);
+			mac_free(mac);
+			overlay_i_create_cleanup(odd);
+			return (EEXIST);
+		}
+	}
+
+	err = mac_register(mac, &odd->odd_mh);
+	mac_free(mac);
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		overlay_i_create_cleanup(odd);
+		return (err);
+	}
+
+	err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+	    crgetzoneid(cred));
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		(void) mac_unregister(odd->odd_mh);
+		overlay_i_create_cleanup(odd);
+		return (err);
+	}
+
+	mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
+	odd->odd_ref = 0;
+	odd->odd_flags = 0;
+	list_insert_tail(&overlay_dev_list, odd);
+	mutex_exit(&overlay_dev_lock);
+
+	return (0);
+}
+
+/*
+ * ioctl handler that activates the overlay device named by the
+ * overlay_ioc_activate_t in 'karg'. Before activating we check that every
+ * plugin property marked OVERLAY_PROP_PERM_REQ has a non-empty value and that
+ * the device has OVERLAY_F_VARPD set. On success the device is marked
+ * OVERLAY_F_ACTIVATED and its link state is reported to MAC as up.
+ *
+ * Returns 0 on success. Otherwise returns:
+ *
+ *	ENOENT	no device has the given link id
+ *	EEXIST	the device has already been activated
+ *	EINVAL	a required property has no value
+ *	ENXIO	OVERLAY_F_VARPD is not set on the device
+ *
+ * or whatever error the plugin's ovpo_propinfo() or ovpo_getprop() returned.
+ */
+/* ARGSUSED */
+static int
+overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int i, ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_activate_t *oiap = karg;
+	overlay_ioc_propinfo_t *infop;
+	overlay_ioc_prop_t *oip;
+	overlay_prop_handle_t phdl;
+
+	odd = overlay_hold_by_dlid(oiap->oia_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
+	oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
+	phdl = (overlay_prop_handle_t)infop;
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		mutex_exit(&odd->odd_lock);
+		ret = EEXIST;
+		goto done;
+	}
+	mutex_exit(&odd->odd_lock);
+
+	/* Every property that the plugin requires must have a value. */
+	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+		const char *pname = odd->odd_plugin->ovp_props[i];
+
+		bzero(infop, sizeof (overlay_ioc_propinfo_t));
+		overlay_prop_init(phdl);
+		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
+		if (ret != 0)
+			goto done;
+
+		if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
+			continue;
+
+		bzero(oip, sizeof (overlay_ioc_prop_t));
+		oip->oip_size = sizeof (oip->oip_value);
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    pname, oip->oip_value, &oip->oip_size);
+		if (ret != 0)
+			goto done;
+
+		if (oip->oip_size == 0) {
+			ret = EINVAL;
+			goto done;
+		}
+	}
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+		mutex_exit(&odd->odd_lock);
+		ret = ENXIO;
+		goto done;
+	}
+
+	ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
+	odd->odd_flags |= OVERLAY_F_ACTIVATED;
+
+	/*
+	 * Now that we've activated ourselves, we should indicate to the world
+	 * that we're up. Note that we may not be able to perform lookups at
+	 * this time, but our notion of being 'up' isn't dependent on that
+	 * ability.
+	 */
+	mac_link_update(odd->odd_mh, LINK_STATE_UP);
+	mutex_exit(&odd->odd_lock);
+	ret = 0;
+
+done:
+	/* odd_lock is never held by the time we get here. */
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+	kmem_free(oip, sizeof (overlay_ioc_prop_t));
+
+	return (ret);
+}
+
+/*
+ * OVERLAY_IOC_DELETE handler: tear down the overlay device named by
+ * oid_linkid.
+ *
+ * Returns ENOENT when no such device exists, and EBUSY when anything other
+ * than this call holds a reference or the device is still flagged
+ * OVERLAY_F_IN_MUX. Errors from dls_devnet_destroy() and mac_disable() are
+ * handed back unchanged. Returns 0 once the device has been freed.
+ */
+/* ARGSUSED */
+static int
+overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	overlay_ioc_delete_t *oidp = karg;
+	overlay_dev_t *odd;
+	datalink_id_t tid;
+	int ret;
+
+	odd = overlay_hold_by_dlid(oidp->oid_linkid);
+	if (odd == NULL) {
+		return (ENOENT);
+	}
+
+	mutex_enter(&odd->odd_lock);
+	/* If we're not the only hold, we're busy */
+	if (odd->odd_ref != 1) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (EBUSY);
+	}
+
+	if (odd->odd_flags & OVERLAY_F_IN_MUX) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (EBUSY);
+	}
+
+	/*
+	 * The state checks are done, so drop odd_lock before calling into dls
+	 * and mac. The success path below destroys this mutex, which must not
+	 * happen while it is held, and the failure paths would otherwise
+	 * return to the caller still owning it.
+	 */
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * To remove this, we need to first remove it from dls and then remove
+	 * it from mac. The act of removing it from mac will check if there are
+	 * devices on top of this, eg. vnics. If there are, then that will fail
+	 * and we'll have to go through and recreate the dls entry. Only after
+	 * that has succeeded do we go through and actually free everything.
+	 */
+	ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
+	if (ret != 0) {
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	ASSERT(oidp->oid_linkid == tid);
+	ret = mac_disable(odd->odd_mh);
+	if (ret != 0) {
+		/* Best effort: put the dls entry back before reporting. */
+		(void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+		    crgetzoneid(cred));
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	overlay_target_quiesce(odd->odd_target);
+
+	mutex_enter(&overlay_dev_lock);
+	list_remove(&overlay_dev_list, odd);
+	mutex_exit(&overlay_dev_lock);
+
+	cv_destroy(&odd->odd_iowait);
+	mutex_destroy(&odd->odd_lock);
+	overlay_target_free(odd);
+	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+	overlay_plugin_rele(odd->odd_plugin);
+	kmem_free(odd, sizeof (overlay_dev_t));
+
+	return (0);
+}
+
+/*
+ * OVERLAY_IOC_NPROPS handler: report how many properties the device named by
+ * oipn_linkid exposes, i.e. the generic device properties plus those of its
+ * encapsulation plugin. Returns ENOENT when the device cannot be found.
+ */
+/* ARGSUSED */
+static int
+overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_ioc_nprops_t *req = karg;
+	overlay_dev_t *dev = overlay_hold_by_dlid(req->oipn_linkid);
+
+	if (dev == NULL)
+		return (ENOENT);
+
+	req->oipn_nprops = OVERLAY_DEV_NPROPS + dev->odd_plugin->ovp_nprops;
+	overlay_hold_rele(dev);
+	return (0);
+}
+
+/*
+ * overlay_plugin_walk() callback: add the plugin's name to the string range
+ * of the property handle passed in arg. Always returns 0.
+ */
+static int
+overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
+{
+	overlay_prop_set_range_str((overlay_prop_handle_t)arg, opp->ovp_name);
+	return (0);
+}
+
+/*
+ * Translate a property name into its numeric id. Generic device properties
+ * take ids [0, OVERLAY_DEV_NPROPS); the plugin's properties follow, offset by
+ * OVERLAY_DEV_NPROPS. Returns 0 and stores the id in *id on a match, or
+ * ENOENT (leaving *id untouched) when the name is unknown.
+ */
+static int
+overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
+{
+	int idx = 0;
+
+	/* Generic device properties are checked first. */
+	while (idx < OVERLAY_DEV_NPROPS) {
+		if (strcmp(overlay_dev_props[idx], name) == 0) {
+			*id = idx;
+			return (0);
+		}
+		idx++;
+	}
+
+	/* Then the names exported by the encapsulation plugin. */
+	for (idx = 0; idx < odd->odd_plugin->ovp_nprops; idx++) {
+		if (strcmp(odd->odd_plugin->ovp_props[idx], name) != 0)
+			continue;
+		*id = idx + OVERLAY_DEV_NPROPS;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Fill in the property handle for the MTU property using what MAC reports
+ * for MAC_PROP_MTU: permission, default value and the first uint32 range.
+ * The caller must hold the MAC perimeter. If MAC cannot supply the
+ * information the handle is left as it was.
+ */
+static void
+overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
+{
+	mac_propval_range_t range;
+	uint32_t def;
+	uint_t perm;
+	int err;
+
+	ASSERT(MAC_PERIM_HELD(odd->odd_mh));
+
+	/* Ask MAC for a single range entry. */
+	bzero(&range, sizeof (mac_propval_range_t));
+	range.mpr_count = 1;
+	err = mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
+	    sizeof (def), &range, &perm);
+	if (err != 0)
+		return;
+
+	/* Map the MAC permission onto the overlay equivalent, if any. */
+	switch (perm) {
+	case MAC_PROP_PERM_READ:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		break;
+	case MAC_PROP_PERM_WRITE:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
+		break;
+	case MAC_PROP_PERM_RW:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+		break;
+	default:
+		break;
+	}
+
+	overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+	overlay_prop_set_default(phdl, &def, sizeof (def));
+	overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
+	    range.mpr_range_uint32[0].mpur_max);
+}
+
+/*
+ * OVERLAY_IOC_PROPINFO handler: describe one property (permissions, type,
+ * default and range) of the device named by oipi_linkid.
+ *
+ * The property is selected by oipi_id, or by oipi_name when oipi_id is -1, in
+ * which case the resolved id is written back to oipi_id. Generic device
+ * properties are described here; plugin properties are delegated to the
+ * plugin's ovpo_propinfo entry point.
+ *
+ * Returns 0 on success, ENOENT for an unknown device or property name, EINVAL
+ * for an id that is out of range, or whatever the plugin returns.
+ */
+/* ARGSUSED */
+static int
+overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_dev_t *odd;
+	int ret;
+	mac_perim_handle_t mph;
+	uint_t propid = UINT_MAX;
+	overlay_ioc_propinfo_t *oip = karg;
+	/* The ioctl argument itself doubles as the property handle. */
+	overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
+
+	odd = overlay_hold_by_dlid(oip->oipi_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_prop_init(phdl);
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+
+	/*
+	 * If the id is -1, then the property that we're looking for is named in
+	 * oipi_name and we should fill in its id. Otherwise, we've been given
+	 * an id and we need to turn that into a name for our plugin's sake. The
+	 * id is our own fabrication for property discovery.
+	 */
+	if (oip->oipi_id == -1) {
+		/*
+		 * Determine if it's a known generic property or it belongs to a
+		 * module by checking against the list of known names.
+		 */
+		/* The name arrived from the caller; force termination. */
+		oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+		if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
+		    &propid)) != 0) {
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+		oip->oipi_id = propid;
+		/* Ids past the generic set belong to the plugin. */
+		if (propid >= OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+			    oip->oipi_name, phdl);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+
+		}
+	} else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
+		/* Convert the global id into an index into ovp_props[]. */
+		uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+		    odd->odd_plugin->ovp_props[id], phdl);
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (ret);
+	} else if (oip->oipi_id < -1) {
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (EINVAL);
+	} else {
+		/* A generic property given by id: fill in its name. */
+		ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oipi_id >= 0);
+		propid = oip->oipi_id;
+		(void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
+		    sizeof (oip->oipi_name));
+	}
+
+	/* Only generic device properties reach this point. */
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		overlay_i_propinfo_mtu(odd, phdl);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+		overlay_prop_set_nodefault(phdl);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
+		overlay_prop_set_nodefault(phdl);
+		/* The valid values are the names of the known plugins. */
+		overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
+		break;
+	case OVERLAY_DEV_P_VARPDID:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+		overlay_prop_set_nodefault(phdl);
+		break;
+	default:
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (ENOENT);
+	}
+
+	overlay_hold_rele(odd);
+	mac_perim_exit(mph);
+	return (0);
+}
+
+/*
+ * OVERLAY_IOC_GETPROP handler: fetch the current value of one property of
+ * the device named by oip_linkid.
+ *
+ * The property is selected by oip_id, or by oip_name when oip_id is -1. Ids
+ * below OVERLAY_DEV_NPROPS are the generic device properties answered here;
+ * ids at or above it, and names that are not generic, are passed to the
+ * encapsulation plugin's ovpo_getprop entry point. On success oip_value and
+ * oip_size describe the value.
+ *
+ * Returns 0 on success, ENOENT for an unknown device or generic property,
+ * EINVAL for an id that is out of range, or whatever the plugin returns.
+ */
+/* ARGSUSED */
+static int
+overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid, mtu;
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	oip->oip_size = OVERLAY_PROP_SIZEMAX;
+	/* The name arrived from the caller; force termination. */
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	if (oip->oip_id == -1) {
+		int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the generic names, so it can only belong to the
+		 * plugin. This test has to sit after the loop: inside it, i
+		 * can never equal OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, &oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		/* Convert the global id into an index into ovp_props[]. */
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/* Valid indices are [0, ovp_nprops). */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    &oip->oip_size);
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/*
+		 * The MTU is always set and retrieved through MAC, to allow for
+		 * MAC to do whatever it wants, as really that property belongs
+		 * to MAC. This is important for things where vnics have hold on
+		 * the MTU.
+		 */
+		mac_sdu_get(odd->odd_mh, NULL, &mtu);
+		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+		oip->oip_size = sizeof (uint_t);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		/*
+		 * While it's read-only while inside of a mux, we're not in a
+		 * context that can guarantee that. Therefore we always grab the
+		 * overlay_dev_t's odd_lock.
+		 */
+		mutex_enter(&odd->odd_lock);
+		bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
+		mutex_exit(&odd->odd_lock);
+		oip->oip_size = sizeof (uint64_t);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+		oip->oip_size = strlcpy((char *)oip->oip_value,
+		    odd->odd_plugin->ovp_name, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VARPDID:
+		/* Only meaningful once OVERLAY_F_VARPD is set on the device. */
+		mutex_enter(&odd->odd_lock);
+		if (odd->odd_flags & OVERLAY_F_VARPD) {
+			const uint64_t val = odd->odd_target->ott_id;
+			bcopy(&val, oip->oip_value, sizeof (uint64_t));
+			oip->oip_size = sizeof (uint64_t);
+		} else {
+			oip->oip_size = 0;
+		}
+		mutex_exit(&odd->odd_lock);
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	overlay_hold_rele(odd);
+	mac_perim_exit(mph);
+	return (ret);
+}
+
+/*
+ * Change the virtual network id of an overlay device.
+ *
+ * When the device is not in a mux the id is simply updated under odd_lock.
+ * Otherwise the device is keyed by its id in the mux's AVL tree, so it has
+ * to be pulled out of the mux, updated and re-inserted, with new I/O dropped
+ * (OVERLAY_F_MDDROP) and outstanding I/O drained while that happens.
+ */
+static void
+overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
+{
+	mutex_enter(&odd->odd_lock);
+
+	/* Simple case, not active */
+	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+		odd->odd_vid = vnetid;
+		mutex_exit(&odd->odd_lock);
+		return;
+	}
+
+	/*
+	 * In the hard case, we need to set the drop flag, quiesce I/O and then
+	 * we can go ahead and do everything.
+	 */
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	mutex_enter(&odd->odd_lock);
+	odd->odd_vid = vnetid;
+	mutex_exit(&odd->odd_lock);
+	overlay_mux_add_dev(odd->odd_mux, odd);
+
+	/*
+	 * The device is back in the mux under its new id, so it stays marked
+	 * OVERLAY_F_IN_MUX. What has to be undone is the drop flag set above;
+	 * leaving it set (or clearing OVERLAY_F_IN_MUX instead) would make the
+	 * receive path discard every packet for this device from now on.
+	 */
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * OVERLAY_IOC_SETPROP handler: set one property of the device named by
+ * oip_linkid. Properties can only be changed before the device has been
+ * activated.
+ *
+ * The property is selected by oip_id, or by oip_name when oip_id is -1. Ids
+ * below OVERLAY_DEV_NPROPS are the generic device properties handled here;
+ * ids at or above it, and names that are not generic, are passed to the
+ * encapsulation plugin's ovpo_setprop entry point.
+ *
+ * Returns 0 on success; EINVAL for an oversized value, an out-of-range id or
+ * a bad virtual network id; ENOENT for an unknown device or generic
+ * property; ENOTSUP when the device is already activated; EPERM for the
+ * read-only generic properties; or whatever MAC or the plugin returns.
+ */
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid = UINT_MAX;
+	mac_perim_handle_t mph;
+	uint64_t maxid, *vidp;
+
+	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+		return (EINVAL);
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	/* The name arrived from the caller; force termination. */
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		/*
+		 * Unwind in the reverse order of acquisition and give back
+		 * the hold taken above, otherwise the device could never be
+		 * deleted again.
+		 */
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_exit(&odd->odd_lock);
+	if (oip->oip_id == -1) {
+		int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the generic names, so it can only belong to the
+		 * plugin. This test has to sit after the loop: inside it, i
+		 * can never equal OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		/* Convert the global id into an index into ovp_props[]. */
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/* Valid indices are [0, ovp_nprops). */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			mac_perim_exit(mph);
+			overlay_hold_rele(odd);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    oip->oip_size);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/* The MTU belongs to MAC, so let MAC validate and set it. */
+		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+		    oip->oip_value, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		if (oip->oip_size != sizeof (uint64_t)) {
+			ret = EINVAL;
+			break;
+		}
+		vidp = (uint64_t *)oip->oip_value;
+		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+		/*
+		 * maxid is the largest value that fits in the plugin's id
+		 * field of ovp_id_size bytes. The shift is skipped for eight
+		 * bytes because shifting by 64 bits is undefined.
+		 * NOTE(review): the >= below also rejects maxid itself;
+		 * confirm that the largest id is intentionally reserved.
+		 */
+		maxid = UINT64_MAX;
+		if (odd->odd_plugin->ovp_id_size != 8)
+			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+			    1ULL;
+		if (*vidp >= maxid) {
+			ret = EINVAL;
+			break;
+		}
+		overlay_setprop_vnetid(odd, *vidp);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+	case OVERLAY_DEV_P_VARPDID:
+		/* These are read-only. */
+		ret = EPERM;
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	return (ret);
+}
+
+/*
+ * OVERLAY_IOC_STATUS handler: report whether the device named by ois_linkid
+ * is healthy or degraded, along with the recorded FMA message when it is
+ * degraded. Returns ENOENT when the device cannot be found.
+ */
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_ioc_status_t *os = karg;
+	overlay_dev_t *odd = overlay_hold_by_dlid(os->ois_linkid);
+
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) == 0) {
+		/* Healthy: no message to report. */
+		os->ois_status = OVERLAY_I_OK;
+		os->ois_message[0] = '\0';
+	} else {
+		os->ois_status = OVERLAY_I_DEGRADED;
+		if (odd->odd_fmamsg == NULL) {
+			os->ois_message[0] = '\0';
+		} else {
+			(void) strlcpy(os->ois_message, odd->odd_fmamsg,
+			    OVERLAY_STATUS_BUFLEN);
+		}
+	}
+	mutex_exit(&odd->odd_lock);
+
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/*
+ * The ioctls this driver registers with dld in overlay_attach(). Each entry
+ * names the command, the copyin/copyout flags, the size of the argument
+ * structure and the handler. The last member is the privilege check, which
+ * is secpolicy_dl_config for everything except OVERLAY_IOC_STATUS, where it
+ * is NULL.
+ */
+static dld_ioc_info_t overlay_ioc_list[] = {
+	{ OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
+		overlay_i_create, secpolicy_dl_config },
+	{ OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
+		overlay_i_activate, secpolicy_dl_config },
+	{ OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
+		overlay_i_delete, secpolicy_dl_config },
+	{ OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
+		sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
+		secpolicy_dl_config },
+	{ OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
+		sizeof (overlay_ioc_prop_t), overlay_i_getprop,
+		secpolicy_dl_config },
+	{ OVERLAY_IOC_SETPROP, DLDCOPYIN,
+		sizeof (overlay_ioc_prop_t), overlay_i_setprop,
+		secpolicy_dl_config },
+	{ OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
+		sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
+		secpolicy_dl_config },
+	{ OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
+		sizeof (overlay_ioc_status_t), overlay_i_status,
+		NULL }
+};
+
+/*
+ * attach(9E) entry point. Only DDI_ATTACH of instance 0 is supported, and
+ * only one instance may be attached at a time. Sets up FMA, creates the
+ * OVERLAY_CTL minor node and registers the driver's ioctls with dld.
+ * Anything set up here is undone again before a failure is reported.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int fmcap = DDI_FM_EREPORT_CAPABLE;
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
+		return (DDI_FAILURE);
+
+	ddi_fm_init(dip, &fmcap, NULL);
+
+	if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) {
+		/* Undo ddi_fm_init() so a failed attach leaves no state. */
+		ddi_fm_fini(dip);
+		return (DDI_FAILURE);
+	}
+
+	if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
+	    DLDIOCCNT(overlay_ioc_list)) != 0) {
+		ddi_remove_minor_node(dip, OVERLAY_CTL);
+		ddi_fm_fini(dip);
+		return (DDI_FAILURE);
+	}
+
+	overlay_dip = dip;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * getinfo(9E) entry point. The driver has a single instance, so the dev_t to
+ * devinfo query answers with overlay_dip and the dev_t to instance query
+ * answers with 0. Any other command fails.
+ */
+/* ARGSUSED */
+static int
+overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
+{
+	if (cmd == DDI_INFO_DEVT2DEVINFO) {
+		*resp = (void *)overlay_dip;
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd == DDI_INFO_DEVT2INSTANCE) {
+		*resp = (void *)0;
+		return (DDI_SUCCESS);
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * detach(9E) entry point. Only DDI_DETACH is supported, and it is refused
+ * while any overlay device still exists or the target subsystem reports
+ * itself busy. Otherwise everything overlay_attach() set up is undone.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	mutex_enter(&overlay_dev_lock);
+	if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
+		mutex_exit(&overlay_dev_lock);
+		/*
+		 * detach(9E) reports failure with DDI_FAILURE; an errno such
+		 * as EBUSY is not a valid return value here.
+		 */
+		return (DDI_FAILURE);
+	}
+	mutex_exit(&overlay_dev_lock);
+
+	dld_ioc_unregister(OVERLAY_IOC);
+	ddi_remove_minor_node(dip, OVERLAY_CTL);
+	ddi_fm_fini(dip);
+	overlay_dip = NULL;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Character device entry points for the overlay control node. Only open,
+ * close and ioctl are implemented, all by the overlay_target_* routines;
+ * every other data-path entry point is nodev.
+ */
+static struct cb_ops overlay_cbops = {
+	overlay_target_open,	/* cb_open */
+	overlay_target_close,	/* cb_close */
+	nodev,			/* cb_strategy */
+	nodev,			/* cb_print */
+	nodev,			/* cb_dump */
+	nodev,			/* cb_read */
+	nodev,			/* cb_write */
+	overlay_target_ioctl,	/* cb_ioctl */
+	nodev,			/* cb_devmap */
+	nodev,			/* cb_mmap */
+	nodev,			/* cb_segmap */
+	nochpoll,		/* cb_chpoll */
+	ddi_prop_op,		/* cb_prop_op */
+	NULL,			/* cb_stream */
+	D_MP,			/* cb_flag */
+	CB_REV,			/* cb_rev */
+	nodev,			/* cb_aread */
+	nodev,			/* cb_awrite */
+};
+
+/*
+ * Driver operations vector: wires up getinfo, attach and detach and points
+ * at overlay_cbops for the character device entry points. Quiesce is
+ * declared as not supported.
+ */
+static struct dev_ops overlay_dev_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* devo_refcnt */
+	overlay_getinfo,	/* devo_getinfo */
+	nulldev,		/* devo_identify */
+	nulldev,		/* devo_probe */
+	overlay_attach,		/* devo_attach */
+	overlay_detach,		/* devo_detach */
+	nulldev,		/* devo_reset */
+	&overlay_cbops,		/* devo_cb_ops */
+	NULL,			/* devo_bus_ops */
+	NULL,			/* devo_power */
+	ddi_quiesce_not_supported	/* devo_quiesce */
+};
+
+/* Module linkage element describing this module as a device driver. */
+static struct modldrv overlay_modldrv = {
+	&mod_driverops,
+	"Overlay Network Driver",
+	&overlay_dev_ops
+};
+
+/* Linkage handed to mod_install(), mod_info() and mod_remove() below. */
+static struct modlinkage overlay_linkage = {
+	MODREV_1,
+	&overlay_modldrv
+};
+
+/*
+ * Set up the driver-global state: the device list and its lock, followed by
+ * the mux, plugin and target subsystems. Always returns DDI_SUCCESS.
+ */
+static int
+overlay_init(void)
+{
+	/* The global list of overlay devices and the lock guarding it. */
+	list_create(&overlay_dev_list, sizeof (overlay_dev_t),
+	    offsetof(overlay_dev_t, odd_link));
+	mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Subsystems come up in this order; overlay_fini() reverses it. */
+	overlay_mux_init();
+	overlay_plugin_init();
+	overlay_target_init();
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Undo overlay_init(): shut the subsystems down in the reverse of the order
+ * they were started, then destroy the global device list and its lock.
+ */
+static void
+overlay_fini(void)
+{
+	overlay_target_fini();
+	overlay_plugin_fini();
+	overlay_mux_fini();
+
+	list_destroy(&overlay_dev_list);
+	mutex_destroy(&overlay_dev_lock);
+}
+
+/*
+ * Module load entry point: initialize the driver's global state and install
+ * the module. If installation fails the global state is torn down again and
+ * the error is returned.
+ */
+int
+_init(void)
+{
+	int err = overlay_init();
+
+	if (err != DDI_SUCCESS)
+		return (err);
+
+	mac_init_ops(NULL, "overlay");
+	if ((err = mod_install(&overlay_linkage)) == DDI_SUCCESS)
+		return (0);
+
+	overlay_fini();
+	return (err);
+}
+
+/* Module information entry point: defer to mod_info() for our linkage. */
+int
+_info(struct modinfo *modinfop)
+{
+	int ret = mod_info(&overlay_linkage, modinfop);
+
+	return (ret);
+}
+
+/*
+ * Module unload entry point: the global state is only torn down once
+ * mod_remove() has succeeded; otherwise its error is returned untouched.
+ */
+int
+_fini(void)
+{
+	int err = mod_remove(&overlay_linkage);
+
+	if (err == 0)
+		overlay_fini();
+
+	return (err);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf
new file mode 100644
index 0000000000..4b62fafd94
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015, Joyent, Inc.
+#
+
+name="overlay" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile
new file mode 100644
index 0000000000..800d72dc2b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.mapfile
@@ -0,0 +1,46 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING:  STOP NOW.  DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+#	usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+    global:
+	# DDI Interfaces
+	_fini;
+	_init;
+	_info;
+
+	# Encapsulation Plugin interfaces
+	overlay_plugin_alloc;
+	overlay_plugin_free;
+	overlay_plugin_register;
+	overlay_plugin_unregister;
+    local:
+	*;
+};
diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c
new file mode 100644
index 0000000000..0701d08e8b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_fm.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device FMA operations.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/ddifm.h>
+#include <sys/overlay_impl.h>
+
+/*
+ * overlay_fm_count is the number of overlay devices currently flagged
+ * OVERLAY_F_DEGRADED. overlay_fm_degrade() and overlay_fm_restore() update
+ * it with overlay_fm_lock held.
+ */
+kmutex_t overlay_fm_lock;
+uint_t overlay_fm_count;
+
+/* Initialize the FMA bookkeeping: the lock and a zero degraded count. */
+void
+overlay_fm_init(void)
+{
+	mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL);
+	overlay_fm_count = 0;
+}
+
+/*
+ * Tear down the FMA bookkeeping. No device may still be counted as degraded
+ * at this point; that is enforced with VERIFY.
+ */
+void
+overlay_fm_fini(void)
+{
+	VERIFY(overlay_fm_count == 0);
+	mutex_destroy(&overlay_fm_lock);
+}
+
+/*
+ * Mark an overlay device as degraded. A non-NULL msg always replaces the
+ * device's stored FMA message, even when the device was already degraded.
+ * The first device to become degraded also reports DDI_SERVICE_DEGRADED for
+ * the driver as a whole.
+ */
+void
+overlay_fm_degrade(overlay_dev_t *odd, const char *msg)
+{
+	mutex_enter(&overlay_fm_lock);
+	mutex_enter(&odd->odd_lock);
+
+	if (msg != NULL)
+		(void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN);
+
+	/* Only a healthy -> degraded transition changes the accounting. */
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) == 0) {
+		odd->odd_flags |= OVERLAY_F_DEGRADED;
+		if (++overlay_fm_count == 1) {
+			ddi_fm_service_impact(overlay_dip,
+			    DDI_SERVICE_DEGRADED);
+		}
+	}
+
+	mutex_exit(&odd->odd_lock);
+	mutex_exit(&overlay_fm_lock);
+}
+
+/*
+ * Clear the degraded state of an overlay device, if it has one, along with
+ * its stored FMA message. When the last degraded device is restored the
+ * driver as a whole reports DDI_SERVICE_RESTORED.
+ */
+void
+overlay_fm_restore(overlay_dev_t *odd)
+{
+	mutex_enter(&overlay_fm_lock);
+	mutex_enter(&odd->odd_lock);
+
+	/* Nothing to do for a device that is not degraded. */
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+		odd->odd_fmamsg[0] = '\0';
+		odd->odd_flags &= ~OVERLAY_F_DEGRADED;
+		if (--overlay_fm_count == 0) {
+			ddi_fm_service_impact(overlay_dip,
+			    DDI_SERVICE_RESTORED);
+		}
+	}
+
+	mutex_exit(&odd->odd_lock);
+	mutex_exit(&overlay_fm_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
new file mode 100644
index 0000000000..9f70e8c83e
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -0,0 +1,354 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device ksocket multiplexer.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ksynch.h>
+#include <sys/ksocket.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+
+#include <sys/overlay_impl.h>
+
+#include <sys/sdt.h>
+
+/*
+ * Fire a DTrace probe recording that a message is being dropped and why.
+ * This does not free anything itself; callers follow it with freemsg().
+ */
+#define	OVERLAY_FREEMSG(mp, reason) \
+    DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
+
+/*
+ * Every open overlay_mux_t, linked through omux_lnode. overlay_mux_lock is
+ * held while the list is searched or modified.
+ */
+static list_t overlay_mux_list;
+static kmutex_t overlay_mux_lock;
+
+/* Create the global mux list and the lock that protects it. */
+void
+overlay_mux_init(void)
+{
+	mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&overlay_mux_list, sizeof (overlay_mux_t),
+	    offsetof(overlay_mux_t, omux_lnode));
+}
+
+/* Destroy the global mux list and its lock. */
+void
+overlay_mux_fini(void)
+{
+	list_destroy(&overlay_mux_list);
+	mutex_destroy(&overlay_mux_lock);
+}
+
+/*
+ * AVL comparator for a mux's device tree: orders overlay devices by their
+ * virtual network id (odd_vid). Returns -1, 0 or 1.
+ */
+static int
+overlay_mux_comparator(const void *a, const void *b)
+{
+	const overlay_dev_t *lhs = a;
+	const overlay_dev_t *rhs = b;
+
+	if (lhs->odd_vid < rhs->odd_vid)
+		return (-1);
+	if (lhs->odd_vid > rhs->odd_vid)
+		return (1);
+	return (0);
+}
+
+/*
+ * This is the central receive data path. We need to decode the packet, if we
+ * can, and then deliver it to the appropriate overlay.
+ *
+ * Installed as the ksocket receive callback by overlay_mux_open(), with the
+ * mux as its argument. Every message in the chain is consumed: it is either
+ * handed to mac_rx() for the matching device or freed. Always returns B_TRUE.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
+    void *arg)
+{
+	mblk_t *mp, *nmp, *fmp;
+	overlay_mux_t *mux = arg;
+
+	/*
+	 * We may have received a chain of messages. Each message in the
+	 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
+	 * If we aren't getting that, we should probably drop that for the
+	 * moment.
+	 */
+	for (mp = mpchain; mp != NULL; mp = nmp) {
+		struct T_unitdata_ind *tudi;
+		ovep_encap_info_t infop;
+		overlay_dev_t od, *odd;
+		int ret;
+
+		/* Detach this message from the rest of the chain. */
+		nmp = mp->b_next;
+		mp->b_next = NULL;
+
+		if (DB_TYPE(mp) != M_PROTO) {
+			OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
+			freemsg(mp);
+			continue;
+		}
+
+		if (mp->b_cont == NULL) {
+			OVERLAY_FREEMSG(mp, "missing a b_cont");
+			freemsg(mp);
+			continue;
+		}
+
+		tudi = (struct T_unitdata_ind *)mp->b_rptr;
+		if (tudi->PRIM_type != T_UNITDATA_IND) {
+			OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
+			freemsg(mp);
+			continue;
+		}
+
+		/*
+		 * In the future, we'll care about the source information
+		 * for purposes of telling varpd for oob invalidation. But for
+		 * now, just drop that block.
+		 */
+		fmp = mp;
+		mp = fmp->b_cont;
+		fmp->b_cont = NULL;
+		freemsg(fmp);
+
+		/*
+		 * Decap and deliver.
+		 */
+		bzero(&infop, sizeof (ovep_encap_info_t));
+		ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
+		if (ret != 0) {
+			OVERLAY_FREEMSG(mp, "decap failed");
+			freemsg(mp);
+			continue;
+		}
+		/*
+		 * Strip the encapsulation header. In the common case it sits
+		 * entirely inside the first block with payload after it, and
+		 * advancing b_rptr is enough. Otherwise consume it block by
+		 * block, freeing each block that is used up completely.
+		 */
+		if (MBLKL(mp) > infop.ovdi_hdr_size) {
+			mp->b_rptr += infop.ovdi_hdr_size;
+		} else {
+			while (infop.ovdi_hdr_size != 0) {
+				size_t rem, blkl;
+
+				if (mp == NULL)
+					break;
+
+				blkl = MBLKL(mp);
+				rem = MIN(infop.ovdi_hdr_size, blkl);
+				infop.ovdi_hdr_size -= rem;
+				mp->b_rptr += rem;
+				if (rem == blkl) {
+					fmp = mp;
+					mp = fmp->b_cont;
+					fmp->b_cont = NULL;
+					/*
+					 * NOTE(review): the probe is handed
+					 * mp (the next block), not the fmp
+					 * block being freed; confirm intent.
+					 */
+					OVERLAY_FREEMSG(mp,
+					    "freed a fmp block");
+					freemsg(fmp);
+				}
+			}
+			/* The header consumed the whole message. */
+			if (mp == NULL) {
+				OVERLAY_FREEMSG(mp, "freed it all...");
+				continue;
+			}
+		}
+
+
+		/*
+		 * Look the destination device up by the decoded id, using a
+		 * stack overlay_dev_t with only odd_vid set as the search key.
+		 */
+		od.odd_vid = infop.ovdi_id;
+		mutex_enter(&mux->omux_lock);
+		odd = avl_find(&mux->omux_devices, &od, NULL);
+		if (odd == NULL) {
+			mutex_exit(&mux->omux_lock);
+			OVERLAY_FREEMSG(mp, "no matching vid");
+			freemsg(mp);
+			continue;
+		}
+		mutex_enter(&odd->odd_lock);
+		if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+		    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+			mutex_exit(&odd->odd_lock);
+			mutex_exit(&mux->omux_lock);
+			OVERLAY_FREEMSG(mp, "dev dropped");
+			freemsg(mp);
+			continue;
+		}
+		/*
+		 * Mark receive I/O as in progress before dropping the locks,
+		 * so the delivery below happens with neither lock held.
+		 */
+		overlay_io_start(odd, OVERLAY_F_IN_RX);
+		mutex_exit(&odd->odd_lock);
+		mutex_exit(&mux->omux_lock);
+
+		mac_rx(odd->odd_mh, NULL, mp);
+
+		mutex_enter(&odd->odd_lock);
+		overlay_io_done(odd, OVERLAY_F_IN_RX);
+		mutex_exit(&odd->odd_lock);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Register a given device with a socket backend. If no such device socket
+ * exists, create a new one.
+ *
+ * An existing mux is shared when its domain, family, protocol and bound
+ * address all match; its reference count is then bumped. A matching mux that
+ * belongs to a different plugin is refused with EEXIST. Otherwise a new
+ * kernel socket is created, bound to addr, offered to the plugin's optional
+ * ovpo_sockopt hook and set up to deliver to overlay_mux_recv().
+ *
+ * Returns the mux, or NULL on failure. The error (0 on success) is stored
+ * through errp when errp is not NULL. overlay_mux_lock is dropped again on
+ * every return path.
+ */
+overlay_mux_t *
+overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
+    struct sockaddr *addr, socklen_t len, int *errp)
+{
+	int err;
+	overlay_mux_t *mux;
+	ksocket_t ksock;
+
+	/* Let the rest of the function store errors unconditionally. */
+	if (errp == NULL)
+		errp = &err;
+
+	mutex_enter(&overlay_mux_lock);
+	for (mux = list_head(&overlay_mux_list); mux != NULL;
+	    mux = list_next(&overlay_mux_list, mux)) {
+		if (domain == mux->omux_domain &&
+		    family == mux->omux_family &&
+		    protocol == mux->omux_protocol &&
+		    len == mux->omux_alen &&
+		    bcmp(addr, mux->omux_addr, len) == 0) {
+
+			/* A socket is owned by a single type of plugin. */
+			if (opp != mux->omux_plugin) {
+				mutex_exit(&overlay_mux_lock);
+				*errp = EEXIST;
+				return (NULL);
+			}
+
+			mutex_enter(&mux->omux_lock);
+			mux->omux_count++;
+			mutex_exit(&mux->omux_lock);
+			mutex_exit(&overlay_mux_lock);
+			*errp = 0;
+			return (mux);
+		}
+	}
+
+	/*
+	 * Today we aren't zone-aware and only exist in the global zone. When we
+	 * allow for things to exist in the non-global zone, we'll want to use a
+	 * credential that's actually specific to the zone.
+	 */
+	*errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
+	    kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		return (NULL);
+	}
+
+	*errp = ksocket_bind(ksock, addr, len, kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	/*
+	 * Ask our lower layer to optionally toggle anything they need on this
+	 * socket. Because a socket is owned by a single type of plugin, we can
+	 * then ask it to perform any additional socket set up it'd like to do.
+	 */
+	if (opp->ovp_ops->ovpo_sockopt != NULL &&
+	    (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
+	list_link_init(&mux->omux_lnode);
+	mux->omux_ksock = ksock;
+	mux->omux_plugin = opp;
+	mux->omux_domain = domain;
+	mux->omux_family = family;
+	mux->omux_protocol = protocol;
+	mux->omux_addr = kmem_alloc(len, KM_SLEEP);
+	bcopy(addr, mux->omux_addr, len);
+	mux->omux_alen = len;
+	mux->omux_count = 1;
+	avl_create(&mux->omux_devices, overlay_mux_comparator,
+	    sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
+	mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Once this is called, we need to expect to rx data */
+	*errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		mutex_destroy(&mux->omux_lock);
+		avl_destroy(&mux->omux_devices);
+		kmem_free(mux->omux_addr, len);
+		kmem_free(mux, sizeof (overlay_mux_t));
+		return (NULL);
+	}
+
+	list_insert_tail(&overlay_mux_list, mux);
+	mutex_exit(&overlay_mux_lock);
+
+	*errp = 0;
+	return (mux);
+}
+
+/*
+ * Drop one reference on a mux obtained from overlay_mux_open(). When the
+ * last reference goes away the mux is unlinked from the global list, its
+ * socket is closed and all of its resources are released.
+ */
+void
+overlay_mux_close(overlay_mux_t *mux)
+{
+	mutex_enter(&overlay_mux_lock);
+	mutex_enter(&mux->omux_lock);
+	mux->omux_count--;
+	if (mux->omux_count != 0) {
+		mutex_exit(&mux->omux_lock);
+		mutex_exit(&overlay_mux_lock);
+		return;
+	}
+	/* Once off the list, overlay_mux_open() can no longer find it. */
+	list_remove(&overlay_mux_list, mux);
+	mutex_exit(&mux->omux_lock);
+	mutex_exit(&overlay_mux_lock);
+
+	ksocket_close(mux->omux_ksock, kcred);
+	avl_destroy(&mux->omux_devices);
+	/* Pairs with the mutex_init() in overlay_mux_open(). */
+	mutex_destroy(&mux->omux_lock);
+	kmem_free(mux->omux_addr, mux->omux_alen);
+	kmem_free(mux, sizeof (overlay_mux_t));
+}
+
+/*
+ * Insert an overlay device into the mux's AVL tree of devices, which is
+ * ordered by odd_vid, under omux_lock.
+ */
+void
+overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+	mutex_enter(&mux->omux_lock);
+	avl_add(&mux->omux_devices, odd);
+	mutex_exit(&mux->omux_lock);
+}
+
+/* Remove an overlay device from the mux's AVL tree, under omux_lock. */
+void
+overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+	mutex_enter(&mux->omux_lock);
+	avl_remove(&mux->omux_devices, odd);
+	mutex_exit(&mux->omux_lock);
+}
+
+/*
+ * Send a message out through the mux's kernel socket. If the send fails the
+ * message is freed here. Returns the result of ksocket_sendmblk().
+ */
+int
+overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
+{
+	/*
+	 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
+	 * that isn't actually supported by UDP at this time.
+	 */
+	int ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
+
+	if (ret == 0)
+		return (0);
+
+	freemsg(mp);
+	return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c
new file mode 100644
index 0000000000..348ddb92a2
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_plugin.c
@@ -0,0 +1,281 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device encapsulation plugin management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/sysmacros.h>
+#include <sys/modctl.h>
+
+#include <sys/overlay_impl.h>
+
+/* Cache from which overlay_plugin_t structures are allocated. */
+static kmem_cache_t *overlay_plugin_cache;
+/* Held around every access to overlay_plugin_list in this file. */
+static kmutex_t overlay_plugin_lock;
+/* List of registered plugins, linked through ovp_link. */
+static list_t overlay_plugin_list;
+
+/* Directory name handed to modload() when looking for a plugin module. */
+#define	OVERLAY_MODDIR	"overlay"
+
+/*
+ * kmem cache constructor for overlay_plugin_t. Initializes the per-plugin
+ * mutex and the list linkage; always returns 0.
+ */
+/* ARGSUSED */
+static int
+overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags)
+{
+	overlay_plugin_t *opp = buf;
+
+	mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL);
+	list_link_init(&opp->ovp_link);
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_plugin_t. Asserts that the plugin is no
+ * longer on a list and tears down its mutex.
+ */
+/* ARGSUSED */
+static void
+overlay_plugin_cache_destructor(void *buf, void *arg)
+{
+	overlay_plugin_t *opp = buf;
+	ASSERT(list_link_active(&opp->ovp_link) == 0);
+	mutex_destroy(&opp->ovp_mutex);
+}
+
+/*
+ * Set up the plugin subsystem's global state: the list lock, the kmem cache
+ * for overlay_plugin_t and the (initially empty) list of plugins.
+ */
+void
+overlay_plugin_init(void)
+{
+	mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0);
+
+	/*
+	 * In the future we may want to have a reaper to unload unused modules
+	 * to help the kernel be able to reclaim memory.
+	 */
+	overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache",
+	    sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor,
+	    overlay_plugin_cache_destructor, NULL, NULL, NULL, 0);
+	list_create(&overlay_plugin_list, sizeof (overlay_plugin_t),
+	    offsetof(overlay_plugin_t, ovp_link));
+}
+
+/*
+ * Tear down the global state created by overlay_plugin_init(). All plugins
+ * must already have been unregistered; this is enforced with VERIFY.
+ */
+void
+overlay_plugin_fini(void)
+{
+	mutex_enter(&overlay_plugin_lock);
+	VERIFY(list_is_empty(&overlay_plugin_list));
+	mutex_exit(&overlay_plugin_lock);
+
+	list_destroy(&overlay_plugin_list);
+	kmem_cache_destroy(overlay_plugin_cache);
+	mutex_destroy(&overlay_plugin_lock);
+}
+
+/*
+ * Allocate a zeroed registration structure stamped with the requested
+ * version. Returns NULL when the version is not one that is understood.
+ */
+overlay_plugin_register_t *
+overlay_plugin_alloc(uint_t version)
+{
+	overlay_plugin_register_t *regp = NULL;
+
+	/* OVEP_VERSION_ONE is the only version accepted here. */
+	if (version == OVEP_VERSION_ONE) {
+		regp = kmem_zalloc(sizeof (*regp), KM_SLEEP);
+		regp->ovep_version = version;
+	}
+
+	return (regp);
+}
+
+/*
+ * Release a registration structure obtained from overlay_plugin_alloc().
+ */
+void
+overlay_plugin_free(overlay_plugin_register_t *ovrp)
+{
+	kmem_free(ovrp, sizeof (overlay_plugin_register_t));
+}
+
+/*
+ * Validate a plugin's registration and add it to the list of known plugins.
+ *
+ * Returns 0 on success, EINVAL when the registration is malformed (bad
+ * version, missing name/ops/entry points, unknown flags, bad id size or
+ * destination, or an over-long property name), ENOTSUP when the id size is
+ * larger than 8 bytes and EEXIST when a plugin of the same name is already
+ * registered.
+ *
+ * The name, ops vector and property table are referenced, not copied.
+ */
+int
+overlay_plugin_register(overlay_plugin_register_t *ovrp)
+{
+	overlay_plugin_t *opp, *ipp;
+
+	/* Sanity check parameters of the registration */
+	if (ovrp->ovep_version != OVEP_VERSION_ONE)
+		return (EINVAL);
+
+	if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL)
+		return (EINVAL);
+
+	if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0)
+		return (EINVAL);
+
+	if (ovrp->ovep_id_size < 1)
+		return (EINVAL);
+
+	/* Don't support anything that has an id size larger than 8 bytes */
+	if (ovrp->ovep_id_size > 8)
+		return (ENOTSUP);
+
+	if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID)
+		return (EINVAL);
+
+	if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0)
+		return (EINVAL);
+
+	/* Every entry point is mandatory and no callbacks may be set. */
+	if (ovrp->ovep_ops->ovpo_callbacks != 0)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_init == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_fini == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_encap == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_decap == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_socket == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_getprop == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_setprop == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_propinfo == NULL)
+		return (EINVAL);
+
+
+	opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP);
+	opp->ovp_active = 0;
+	opp->ovp_name = ovrp->ovep_name;
+	opp->ovp_ops = ovrp->ovep_ops;
+	opp->ovp_props = ovrp->ovep_props;
+	opp->ovp_id_size = ovrp->ovep_id_size;
+	opp->ovp_flags = ovrp->ovep_flags;
+	opp->ovp_dest = ovrp->ovep_dest;
+
+	/* Count the NULL-terminated property table, checking name lengths. */
+	opp->ovp_nprops = 0;
+	if (ovrp->ovep_props != NULL) {
+		while (ovrp->ovep_props[opp->ovp_nprops] != NULL) {
+			if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >=
+			    OVERLAY_PROP_NAMELEN) {
+				/*
+				 * overlay_plugin_lock is not held yet, so
+				 * there is nothing to drop on this path.
+				 */
+				kmem_cache_free(overlay_plugin_cache, opp);
+				return (EINVAL);
+			}
+			opp->ovp_nprops++;
+		}
+	}
+
+	/* Reject duplicate names, then publish the plugin. */
+	mutex_enter(&overlay_plugin_lock);
+	for (ipp = list_head(&overlay_plugin_list); ipp != NULL;
+	    ipp = list_next(&overlay_plugin_list, ipp)) {
+		if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) {
+			mutex_exit(&overlay_plugin_lock);
+			kmem_cache_free(overlay_plugin_cache, opp);
+			return (EEXIST);
+		}
+	}
+	list_insert_tail(&overlay_plugin_list, opp);
+	mutex_exit(&overlay_plugin_lock);
+
+	return (0);
+}
+
+/*
+ * Remove the named plugin from the list and free it. Returns ENOENT when no
+ * such plugin is registered and EBUSY when it still has active users.
+ */
+int
+overlay_plugin_unregister(const char *name)
+{
+	overlay_plugin_t *victim;
+	boolean_t busy;
+
+	mutex_enter(&overlay_plugin_lock);
+	victim = list_head(&overlay_plugin_list);
+	while (victim != NULL && strcmp(victim->ovp_name, name) != 0)
+		victim = list_next(&overlay_plugin_list, victim);
+
+	if (victim == NULL) {
+		mutex_exit(&overlay_plugin_lock);
+		return (ENOENT);
+	}
+
+	/* Sample the active count under the plugin's own lock. */
+	mutex_enter(&victim->ovp_mutex);
+	busy = (victim->ovp_active > 0) ? B_TRUE : B_FALSE;
+	mutex_exit(&victim->ovp_mutex);
+
+	if (busy == B_TRUE) {
+		mutex_exit(&overlay_plugin_lock);
+		return (EBUSY);
+	}
+
+	list_remove(&overlay_plugin_list, victim);
+	mutex_exit(&overlay_plugin_lock);
+
+	kmem_cache_free(overlay_plugin_cache, victim);
+	return (0);
+}
+
+/*
+ * Find a plugin by name and take a hold on it (ovp_active is bumped). If the
+ * plugin is not registered, make a single attempt to modload() it from
+ * OVERLAY_MODDIR and search again. Returns NULL when the plugin cannot be
+ * found. Holds are dropped with overlay_plugin_rele().
+ */
+overlay_plugin_t *
+overlay_plugin_lookup(const char *name)
+{
+	overlay_plugin_t *cur;
+	boolean_t loaded = B_FALSE;
+
+	for (;;) {
+		mutex_enter(&overlay_plugin_lock);
+		cur = list_head(&overlay_plugin_list);
+		while (cur != NULL && strcmp(name, cur->ovp_name) != 0)
+			cur = list_next(&overlay_plugin_list, cur);
+
+		if (cur != NULL) {
+			mutex_enter(&cur->ovp_mutex);
+			cur->ovp_active++;
+			mutex_exit(&cur->ovp_mutex);
+		}
+		mutex_exit(&overlay_plugin_lock);
+
+		/* Either we found it, or we already tried loading it. */
+		if (cur != NULL || loaded == B_TRUE)
+			return (cur);
+
+		/*
+		 * The plugin may exist as a module that simply has not been
+		 * loaded yet; try to load it exactly once.
+		 */
+		if (modload(OVERLAY_MODDIR, (char *)name) == -1)
+			return (NULL);
+		loaded = B_TRUE;
+	}
+}
+
+/*
+ * Drop a hold on a plugin by decrementing ovp_active under ovp_mutex. The
+ * count must be positive on entry.
+ */
+void
+overlay_plugin_rele(overlay_plugin_t *opp)
+{
+	mutex_enter(&opp->ovp_mutex);
+	ASSERT(opp->ovp_active > 0);
+	opp->ovp_active--;
+	mutex_exit(&opp->ovp_mutex);
+}
+
+/*
+ * Call func on every registered plugin, stopping early as soon as func
+ * returns non-zero. overlay_plugin_lock is held across every callback.
+ */
+void
+overlay_plugin_walk(overlay_plugin_walk_f func, void *arg)
+{
+	overlay_plugin_t *cur;
+
+	mutex_enter(&overlay_plugin_lock);
+	cur = list_head(&overlay_plugin_list);
+	while (cur != NULL) {
+		if (func(cur, arg) != 0)
+			break;
+		cur = list_next(&overlay_plugin_list, cur);
+	}
+	mutex_exit(&overlay_plugin_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
new file mode 100644
index 0000000000..ba1ea2a629
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * Routines for manipulating property information structures.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/overlay_impl.h>
+
+/*
+ * Reset the "possible values" area of a property info structure: it is
+ * treated as a mac_propval_range_t, zeroed, and oipi_posssize is set to the
+ * size of that header.
+ */
+void
+overlay_prop_init(overlay_prop_handle_t phdl)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+
+	infop->oipi_posssize = sizeof (mac_propval_range_t);
+	bzero(rangep, sizeof (mac_propval_range_t));
+}
+
+/*
+ * Record the property's name, copying at most OVERLAY_PROP_NAMELEN bytes
+ * (including the terminator) via strlcpy().
+ */
+void
+overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	(void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN);
+}
+
+/*
+ * Record the property's protection value in the info structure.
+ */
+void
+overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	infop->oipi_prot = prot;
+}
+
+/*
+ * Record the property's type in the info structure.
+ */
+void
+overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	infop->oipi_type = type;
+}
+
+/*
+ * Copy len bytes of default value into the info structure and record the
+ * length. Returns E2BIG when len exceeds OVERLAY_PROP_SIZEMAX, EOVERFLOW
+ * when len is negative, and 0 on success.
+ */
+int
+overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	int ret = 0;
+
+	if (len > OVERLAY_PROP_SIZEMAX) {
+		ret = E2BIG;
+	} else if (len < 0) {
+		ret = EOVERFLOW;
+	} else {
+		bcopy(def, infop->oipi_default, len);
+		infop->oipi_defsize = (uint32_t)len;
+	}
+
+	return (ret);
+}
+
+/*
+ * Mark the property as having no default: the first byte of the default
+ * buffer is cleared and the recorded default size is set to zero.
+ */
+void
+overlay_prop_set_nodefault(overlay_prop_handle_t phdl)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	infop->oipi_default[0] = '\0';
+	infop->oipi_defsize = 0;
+}
+
+/*
+ * Append a [min, max] uint32 range to the property's possible values. The
+ * request is silently ignored when ranges of a different type have already
+ * been recorded or when the possible-values buffer has no room left.
+ */
+void
+overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min,
+    uint32_t max)
+{
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+	mac_propval_uint32_range_t *slot;
+
+	if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32)
+		return;
+
+	if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) >
+	    sizeof (infop->oipi_poss))
+		return;
+
+	/* The next free slot sits at the current count. */
+	slot = &rangep->u.mpr_uint32[rangep->mpr_count];
+	slot->mpur_min = min;
+	slot->mpur_max = max;
+
+	rangep->mpr_type = MAC_PROPVAL_UINT32;
+	rangep->mpr_count++;
+	infop->oipi_posssize += sizeof (mac_propval_uint32_range_t);
+}
+
+/*
+ * Append a string to the property's possible values. Strings are packed back
+ * to back, each with its terminator, starting at mpur_nextbyte. The request
+ * is silently ignored when values of a different type have already been
+ * recorded or when the string does not fit in the remaining space.
+ */
+void
+overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str)
+{
+	size_t need = strlen(str) + 1; /* string plus its null terminator */
+	overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+	mac_propval_str_range_t *pstr = &rangep->u.mpr_str;
+	size_t avail;
+
+	if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR)
+		return;
+
+	if (infop->oipi_posssize + need > sizeof (infop->oipi_poss))
+		return;
+
+	avail = sizeof (infop->oipi_poss) - infop->oipi_posssize;
+	(void) strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str,
+	    avail);
+
+	pstr->mpur_nextbyte += need;
+	infop->oipi_posssize += need;
+	rangep->mpr_type = MAC_PROPVAL_STR;
+	rangep->mpr_count++;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
new file mode 100644
index 0000000000..f4147b56d1
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -0,0 +1,1651 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device target cache management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+#include <sys/vlan.h>
+#include <sys/crc32.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/overlay_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * This is a total straw man, but at least it's a prime number. Here we're
+ * going to have to go through and do a lot of evaluation and understanding as
+ * to how these target caches should grow and shrink, as well as memory
+ * pressure and evictions. This just gives us a starting point that'll be 'good
+ * enough', until it's not.
+ */
+#define	OVERLAY_HSIZE	823
+
+/*
+ * We use this data structure to keep track of what requests have been actively
+ * allocated to a given instance so we know what to put back on the pending
+ * list. The trailing comment on each member names the lock that protects it;
+ * RO members are not annotated with a lock.
+ */
+typedef struct overlay_target_hdl {
+	minor_t oth_minor;		/* RO */
+	zoneid_t oth_zoneid;		/* RO */
+	int oth_oflags;			/* RO */
+	list_node_t oth_link;		/* overlay_target_lock */
+	kmutex_t oth_lock;		/* protects oth_outstanding */
+	list_t	oth_outstanding;	/* oth_lock */
+} overlay_target_hdl_t;
+
+/*
+ * Signatures of the three per-ioctl callbacks stored in
+ * overlay_target_ioctl_t: copy the argument in, act on it, copy it back out.
+ */
+typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
+typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
+typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
+
+/*
+ * Description of a single target ioctl: its command number, permission and
+ * copyout requirements, and the callbacks used to service it.
+ */
+typedef struct overaly_target_ioctl {
+	int		oti_cmd;	/* ioctl id */
+	boolean_t	oti_write;	/* ioctl requires FWRITE */
+	boolean_t	oti_ncopyout;	/* copyout data? */
+	overlay_target_copyin_f oti_copyin;	/* copyin func */
+	overlay_target_ioctl_f oti_func; /* function to call */
+	overlay_target_copyout_f oti_copyout;	/* copyout func */
+	size_t		oti_size;	/* size of user level structure */
+} overlay_target_ioctl_t;
+
+/* Cache of overlay_target_t structures. */
+static kmem_cache_t *overlay_target_cache;
+/* Cache of overlay_target_entry_t structures. */
+static kmem_cache_t *overlay_entry_cache;
+/* ID space named "overlay_target_minors", covering 1 up to INT32_MAX. */
+static id_space_t *overlay_thdl_idspace;
+/* DDI soft state anchor sized for overlay_target_hdl_t. */
+static void *overlay_thdl_state;
+
+/*
+ * When we support overlay devices in the NGZ, then all of these need to become
+ * zone aware, by plugging into the netstack engine and becoming per-netstack
+ * data.
+ */
+static list_t overlay_thdl_list;
+static kmutex_t overlay_target_lock;
+static kcondvar_t overlay_target_condvar;
+static list_t overlay_target_list;
+static boolean_t overlay_target_excl;
+
+/*
+ * Outstanding data per hash table entry: the limit, in bytes, on the packets
+ * that may be queued on a single entry while its lookup is unresolved.
+ * Packets that would push an entry past this are dropped.
+ */
+static int overlay_ent_size = 128 * 1024;
+
+/*
+ * kmem cache constructor for overlay_target_t. Initializes the target's lock
+ * and condition variable; always returns 0.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+	overlay_target_t *ott = buf;
+
+	mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
+	return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_target_t. Undoes the constructor by
+ * destroying the condition variable and lock.
+ */
+/* ARGSUSED */
+static void
+overlay_target_cache_destructor(void *buf, void *arg)
+{
+	overlay_target_t *ott = buf;
+
+	cv_destroy(&ott->ott_cond);
+	mutex_destroy(&ott->ott_lock);
+}
+
+/*
+ * kmem cache constructor for overlay_target_entry_t. Zeroes the whole entry
+ * and then initializes its lock; always returns 0.
+ */
+/* ARGSUSED */
+static int
+overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+	overlay_target_entry_t *ote = buf;
+
+	bzero(ote, sizeof (overlay_target_entry_t));
+	mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
+	return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_target_entry_t. Destroys the entry's
+ * lock.
+ */
+/* ARGSUSED */
+static void
+overlay_entry_cache_destructor(void *buf, void *arg)
+{
+	overlay_target_entry_t *ote = buf;
+
+	mutex_destroy(&ote->ote_lock);
+}
+
+/*
+ * Hash function for the target refhash: runs the CRC32 macro over the
+ * ETHERADDRL bytes at v, starting from -1U and using crc32_table.
+ */
+static uint64_t
+overlay_mac_hash(const void *v)
+{
+	uint32_t crc;
+	CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+	return (crc);
+}
+
+/*
+ * Comparison function for the target refhash: compares two ETHERADDRL-byte
+ * keys with bcmp() and returns its result.
+ */
+static int
+overlay_mac_cmp(const void *a, const void *b)
+{
+	return (bcmp(a, b, ETHERADDRL));
+}
+
+/*
+ * Destructor passed to refhash_create() for target entries. Frees any
+ * packets still queued on the entry, resets the fields that are filled in
+ * while the entry is in use, and returns the entry to overlay_entry_cache.
+ */
+/* ARGSUSED */
+static void
+overlay_target_entry_dtor(void *arg)
+{
+	overlay_target_entry_t *ote = arg;
+
+	ote->ote_flags = 0;
+	bzero(ote->ote_addr, ETHERADDRL);
+	ote->ote_ott = NULL;
+	ote->ote_odd = NULL;
+	freemsgchain(ote->ote_chead);
+	ote->ote_chead = ote->ote_ctail = NULL;
+	ote->ote_mbsize = 0;
+	ote->ote_vtime = 0;
+	kmem_cache_free(overlay_entry_cache, ote);
+}
+
+/*
+ * AVL comparator for target entries. Orders entries by comparing their
+ * addresses byte by byte, from the first byte to the last, and returns
+ * -1, 0 or 1.
+ */
+static int
+overlay_mac_avl(const void *a, const void *b)
+{
+	const overlay_target_entry_t *lhs = a;
+	const overlay_target_entry_t *rhs = b;
+	int idx = 0;
+
+	while (idx < ETHERADDRL) {
+		if (lhs->ote_addr[idx] != rhs->ote_addr[idx]) {
+			return ((lhs->ote_addr[idx] > rhs->ote_addr[idx]) ?
+			    1 : -1);
+		}
+		idx++;
+	}
+
+	return (0);
+}
+
+/*
+ * Set up the global state of the target subsystem: the soft state for open
+ * handles, the target and entry kmem caches, the global lock and condition
+ * variable, the list of queued lookup entries, the list of open handles and
+ * the id space for minor numbers.
+ */
+void
+overlay_target_init(void)
+{
+	int ret;
+	ret = ddi_soft_state_init(&overlay_thdl_state,
+	    sizeof (overlay_target_hdl_t), 1);
+	VERIFY(ret == 0);
+	overlay_target_cache = kmem_cache_create("overlay_target",
+	    sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
+	    overlay_target_cache_destructor, NULL, NULL, NULL, 0);
+	overlay_entry_cache = kmem_cache_create("overlay_entry",
+	    sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
+	    overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
+	mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
+	list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
+	    offsetof(overlay_target_entry_t, ote_qlink));
+	list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
+	    offsetof(overlay_target_hdl_t, oth_link));
+	overlay_thdl_idspace = id_space_create("overlay_target_minors",
+	    1, INT32_MAX);
+}
+
+/*
+ * Tear down everything created by overlay_target_init(), in the reverse
+ * order of creation.
+ */
+void
+overlay_target_fini(void)
+{
+	id_space_destroy(overlay_thdl_idspace);
+	list_destroy(&overlay_thdl_list);
+	list_destroy(&overlay_target_list);
+	cv_destroy(&overlay_target_condvar);
+	mutex_destroy(&overlay_target_lock);
+	kmem_cache_destroy(overlay_entry_cache);
+	kmem_cache_destroy(overlay_target_cache);
+	ddi_soft_state_fini(&overlay_thdl_state);
+}
+
+/*
+ * Free the target attached to an overlay device, if any. For a dynamic
+ * target this empties and destroys both the AVL tree and the refhash of
+ * entries first. No requests may be outstanding (ott_ocount must be zero).
+ * Note that odd->odd_target itself is left pointing at the freed target.
+ */
+void
+overlay_target_free(overlay_dev_t *odd)
+{
+	if (odd->odd_target == NULL)
+		return;
+
+	if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+		refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+		avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+		overlay_target_entry_t *ote;
+
+		/*
+		 * Our AVL tree and hashtable contain the same elements,
+		 * therefore we should just remove it from the tree, but then
+		 * delete the entries when we remove them from the hash table
+		 * (which happens through the refhash dtor).
+		 */
+		while ((ote = avl_first(ap)) != NULL)
+			avl_remove(ap, ote);
+
+		avl_destroy(ap);
+		/*
+		 * NOTE(review): this removes the current element and then
+		 * passes it to refhash_next(); that is only safe if the
+		 * refhash iterator keeps the element alive until it has
+		 * advanced -- confirm against the refhash implementation.
+		 */
+		for (ote = refhash_first(rp); ote != NULL;
+		    ote = refhash_next(rp, ote)) {
+			refhash_remove(rp, ote);
+		}
+		refhash_destroy(rp);
+	}
+
+	ASSERT(odd->odd_target->ott_ocount == 0);
+	kmem_cache_free(overlay_target_cache, odd->odd_target);
+}
+
+/*
+ * Report whether any target handles are currently on overlay_thdl_list.
+ * Returns 1 when the list is non-empty and 0 otherwise.
+ */
+int
+overlay_target_busy()
+{
+	int busy;
+
+	mutex_enter(&overlay_target_lock);
+	busy = list_is_empty(&overlay_thdl_list) ? 0 : 1;
+	mutex_exit(&overlay_target_lock);
+
+	return (busy);
+}
+
+/*
+ * Put an entry on the global list of pending lookups and wake one waiter.
+ * The owning target's outstanding count is bumped for each entry queued. If
+ * the target is being torn down, the entry is not queued at all.
+ *
+ * Locks are taken in the order overlay_target_lock, then ott_lock.
+ */
+static void
+overlay_target_queue(overlay_target_entry_t *entry)
+{
+	mutex_enter(&overlay_target_lock);
+	mutex_enter(&entry->ote_ott->ott_lock);
+	if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
+		mutex_exit(&entry->ote_ott->ott_lock);
+		mutex_exit(&overlay_target_lock);
+		return;
+	}
+	entry->ote_ott->ott_ocount++;
+	mutex_exit(&entry->ote_ott->ott_lock);
+	list_insert_tail(&overlay_target_list, entry);
+	cv_signal(&overlay_target_condvar);
+	mutex_exit(&overlay_target_lock);
+}
+
+/*
+ * Mark a target as being torn down, which stops overlay_target_queue() from
+ * queueing further entries for it, and then block until its count of
+ * outstanding requests has dropped to zero. A NULL target is a no-op.
+ */
+void
+overlay_target_quiesce(overlay_target_t *ott)
+{
+	if (ott == NULL)
+		return;
+	mutex_enter(&ott->ott_lock);
+	ott->ott_flags |= OVERLAY_T_TEARDOWN;
+	while (ott->ott_ocount != 0)
+		cv_wait(&ott->ott_cond, &ott->ott_lock);
+	mutex_exit(&ott->ott_lock);
+}
+
+/*
+ * Determine where a packet leaving an overlay device should be sent.
+ *
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
+ * this time, say for NVGRE, we drop all packets for any other destination
+ * mode.
+ *
+ * Returns OVERLAY_TARGET_OK when sock and *slenp have been filled in with an
+ * IPv6 socket address, OVERLAY_TARGET_DROP when the packet should be
+ * discarded, and OVERLAY_TARGET_ASYNC when the packet has been queued on a
+ * target entry pending a lookup.
+ */
+int
+overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
+    socklen_t *slenp)
+{
+	int ret;
+	struct sockaddr_in6 *v6;
+	overlay_target_t *ott;
+	mac_header_info_t mhi;
+	overlay_target_entry_t *entry;
+
+	ASSERT(odd->odd_target != NULL);
+
+	/*
+	 * At this point, the overlay device is in a mux which means that it's
+	 * been activated. At this point, parts of the target, such as the mode
+	 * and the destination are now read-only and we don't have to worry
+	 * about synchronization for them.
+	 */
+	ott = odd->odd_target;
+	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+		return (OVERLAY_TARGET_DROP);
+
+	v6 = (struct sockaddr_in6 *)sock;
+	bzero(v6, sizeof (struct sockaddr_in6));
+	v6->sin6_family = AF_INET6;
+
+	/* Point-to-point targets have one fixed destination for everything. */
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		mutex_enter(&ott->ott_lock);
+		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
+		    sizeof (struct in6_addr));
+		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
+		mutex_exit(&ott->ott_lock);
+		*slenp = sizeof (struct sockaddr_in6);
+
+		return (OVERLAY_TARGET_OK);
+	}
+
+	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
+
+	/*
+	 * Note we only want the MAC address here, therefore we won't bother
+	 * using mac_vlan_header_info(). If any caller needs the vlan info at
+	 * this point, this should change to a call to mac_vlan_header_info().
+	 */
+	if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+		return (OVERLAY_TARGET_DROP);
+	mutex_enter(&ott->ott_lock);
+	entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+	    mhi.mhi_daddr);
+	if (entry == NULL) {
+		/*
+		 * First packet for this destination MAC: create an entry,
+		 * park the packet on it and queue the entry for lookup.
+		 */
+		entry = kmem_cache_alloc(overlay_entry_cache,
+		    KM_NOSLEEP | KM_NORMALPRI);
+		if (entry == NULL) {
+			mutex_exit(&ott->ott_lock);
+			return (OVERLAY_TARGET_DROP);
+		}
+		bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+		entry->ote_chead = entry->ote_ctail = mp;
+		entry->ote_mbsize = msgsize(mp);
+		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+		entry->ote_ott = ott;
+		entry->ote_odd = odd;
+		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+		avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+		mutex_exit(&ott->ott_lock);
+		overlay_target_queue(entry);
+		return (OVERLAY_TARGET_ASYNC);
+	}
+	/* Hold the entry so that we can drop ott_lock while examining it. */
+	refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+	mutex_exit(&ott->ott_lock);
+
+	mutex_enter(&entry->ote_lock);
+	if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+		ret = OVERLAY_TARGET_DROP;
+	} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+		bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
+		    sizeof (struct in6_addr));
+		v6->sin6_port = htons(entry->ote_dest.otp_port);
+		*slenp = sizeof (struct sockaddr_in6);
+		ret = OVERLAY_TARGET_OK;
+	} else {
+		/*
+		 * The entry is not resolved yet: append the packet to its
+		 * chain unless that would exceed overlay_ent_size.
+		 */
+		size_t mlen = msgsize(mp);
+
+		if (mlen + entry->ote_mbsize > overlay_ent_size) {
+			ret = OVERLAY_TARGET_DROP;
+		} else {
+			if (entry->ote_ctail != NULL) {
+				ASSERT(entry->ote_ctail->b_next ==
+				    NULL);
+				entry->ote_ctail->b_next = mp;
+				entry->ote_ctail = mp;
+			} else {
+				entry->ote_chead = mp;
+				entry->ote_ctail = mp;
+			}
+			entry->ote_mbsize += mlen;
+			/* Only queue the entry if it isn't pending already. */
+			if ((entry->ote_flags &
+			    OVERLAY_ENTRY_F_PENDING) == 0) {
+				entry->ote_flags |=
+				    OVERLAY_ENTRY_F_PENDING;
+				overlay_target_queue(entry);
+			}
+			ret = OVERLAY_TARGET_ASYNC;
+		}
+	}
+	mutex_exit(&entry->ote_lock);
+
+	mutex_enter(&ott->ott_lock);
+	refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+	mutex_exit(&ott->ott_lock);
+
+	return (ret);
+}
+
+/*
+ * ioctl handler: fill in an overlay_targ_info_t for the device named by
+ * oti_linkid with the plugin's destination requirements, the device's
+ * degraded/active state and its virtual network id. Returns ENOENT when no
+ * such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_info_t *oti = arg;
+
+	odd = overlay_hold_by_dlid(oti->oti_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	oti->oti_flags = 0;
+	oti->oti_needs = odd->odd_plugin->ovp_dest;
+	if (odd->odd_flags & OVERLAY_F_DEGRADED)
+		oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED)
+		oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
+	oti->oti_vnetid = odd->odd_vid;
+	mutex_exit(&odd->odd_lock);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/*
+ * ioctl handler: associate a lookup target with an overlay device.
+ *
+ * The request must name an existing device (ENOENT otherwise), carry a
+ * non-zero id, a supported mode and a destination description matching what
+ * the device's plugin requires; point targets must also supply a usable
+ * address and/or port (EINVAL otherwise). EEXIST is returned when the device
+ * already has a target associated with it, in which case everything that was
+ * built for the new target is torn down again.
+ */
+/* ARGSUSED */
+static int
+overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_targ_associate_t *ota = arg;
+
+	odd = overlay_hold_by_dlid(ota->ota_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	if (ota->ota_id == 0) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_mode != OVERLAY_TARGET_POINT &&
+	    ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_mode == OVERLAY_TARGET_POINT) {
+		if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
+			    IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
+			    IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
+				overlay_hold_rele(odd);
+				return (EINVAL);
+			}
+		}
+
+		if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
+			if (ota->ota_point.otp_port == 0) {
+				overlay_hold_rele(odd);
+				return (EINVAL);
+			}
+		}
+	}
+
+	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
+	ott->ott_flags = 0;
+	ott->ott_ocount = 0;
+	ott->ott_mode = ota->ota_mode;
+	ott->ott_dest = ota->ota_provides;
+	ott->ott_id = ota->ota_id;
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
+		    sizeof (overlay_target_point_t));
+	} else {
+		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+		    overlay_mac_hash, overlay_mac_cmp,
+		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_reflink),
+		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
+		    sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_avllink));
+	}
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_VARPD) {
+		mutex_exit(&odd->odd_lock);
+		/*
+		 * The target was never published, so its tree and hash are
+		 * still empty; destroy them here or they would be leaked.
+		 */
+		if (ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+			avl_destroy(&ott->ott_u.ott_dyn.ott_tree);
+			refhash_destroy(ott->ott_u.ott_dyn.ott_dhash);
+		}
+		kmem_cache_free(overlay_target_cache, ott);
+		overlay_hold_rele(odd);
+		return (EEXIST);
+	}
+
+	odd->odd_flags |= OVERLAY_F_VARPD;
+	odd->odd_target = ott;
+	mutex_exit(&odd->odd_lock);
+
+	overlay_hold_rele(odd);
+
+
+	return (0);
+}
+
+
+/*
+ * ioctl handler: mark the device named by otd_linkid as degraded, passing
+ * the caller's message buffer to overlay_fm_degrade(). Returns ENOENT when
+ * no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_degrade_t *otd = arg;
+
+	odd = overlay_hold_by_dlid(otd->otd_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_fm_degrade(odd, otd->otd_buf);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/*
+ * ioctl handler: call overlay_fm_restore() on the device named by
+ * otid_linkid. Returns ENOENT when no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_id_t *otid = arg;
+
+	odd = overlay_hold_by_dlid(otid->otid_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_fm_restore(odd);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/*
+ * ioctl handler: clear OVERLAY_F_VARPD on the device named by otid_linkid.
+ * Only the flag is cleared here; odd_target is left in place. Returns ENOENT
+ * when no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_id_t *otid = arg;
+
+	odd = overlay_hold_by_dlid(otid->otid_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	odd->odd_flags &= ~OVERLAY_F_VARPD;
+	mutex_exit(&odd->odd_lock);
+
+	overlay_hold_rele(odd);
+	return (0);
+
+}
+
+/*
+ * ioctl handler: hand the next pending lookup to the caller.
+ *
+ * Waits until an entry shows up on overlay_target_list, giving up with ETIME
+ * once a deadline one second after entry to this function has passed. The
+ * first packet queued on the entry is parsed and described in the
+ * overlay_targ_lookup_t, and the entry is moved onto the handle's
+ * outstanding list. The entry's address is used as the request id.
+ *
+ * NOTE(review): the two "goto again" paths take an entry off the global list
+ * without decrementing ott_ocount, which overlay_target_queue() incremented
+ * when the entry was queued -- confirm that the accounting relied on by
+ * overlay_target_quiesce() still balances on those paths.
+ */
+static int
+overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_lookup_t *otl = arg;
+	overlay_target_entry_t *entry;
+	clock_t ret, timeout;
+	mac_header_info_t mhi;
+
+	/* Absolute deadline, computed once and shared by every retry. */
+	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
+again:
+	mutex_enter(&overlay_target_lock);
+	while (list_is_empty(&overlay_target_list)) {
+		ret = cv_timedwait(&overlay_target_condvar,
+		    &overlay_target_lock, timeout);
+		if (ret == -1) {
+			mutex_exit(&overlay_target_lock);
+			return (ETIME);
+		}
+	}
+	entry = list_remove_head(&overlay_target_list);
+	mutex_exit(&overlay_target_lock);
+	mutex_enter(&entry->ote_lock);
+	/* Entries that are already valid need no lookup; skip them. */
+	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+		ASSERT(entry->ote_chead == NULL);
+		mutex_exit(&entry->ote_lock);
+		goto again;
+	}
+	ASSERT(entry->ote_chead != NULL);
+
+	/*
+	 * If we have a bogon that doesn't have a valid mac header, drop it and
+	 * try again.
+	 */
+	if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+	    &mhi) != 0) {
+		boolean_t queue = B_FALSE;
+		mblk_t *mp = entry->ote_chead;
+		entry->ote_chead = mp->b_next;
+		mp->b_next = NULL;
+		if (entry->ote_ctail == mp)
+			entry->ote_ctail = entry->ote_chead;
+		entry->ote_mbsize -= msgsize(mp);
+		/* Requeue the entry if it still has packets waiting. */
+		if (entry->ote_chead != NULL)
+			queue = B_TRUE;
+		mutex_exit(&entry->ote_lock);
+		if (queue == B_TRUE)
+			overlay_target_queue(entry);
+		freemsg(mp);
+		goto again;
+	}
+
+	otl->otl_dlid = entry->ote_odd->odd_linkid;
+	otl->otl_reqid = (uintptr_t)entry;
+	otl->otl_varpdid = entry->ote_ott->ott_id;
+	otl->otl_vnetid = entry->ote_odd->odd_vid;
+
+	otl->otl_hdrsize = mhi.mhi_hdrsize;
+	otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
+	bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
+	bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
+	otl->otl_dsttype = mhi.mhi_dsttype;
+	otl->otl_sap = mhi.mhi_bindsap;
+	otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+	mutex_exit(&entry->ote_lock);
+
+	mutex_enter(&thdl->oth_lock);
+	list_insert_tail(&thdl->oth_outstanding, entry);
+	mutex_exit(&thdl->oth_lock);
+
+	return (0);
+}
+
+/*
+ * ioctl handler: supply the answer to an outstanding lookup.
+ *
+ * The request id must match an entry on this handle's outstanding list
+ * (EINVAL otherwise). The answer is stored in the entry, which is marked
+ * valid and no longer pending, and every packet that was queued on it is
+ * transmitted. Finally the target's outstanding count is dropped and anyone
+ * waiting on it is signalled.
+ */
+static int
+overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
+{
+	const overlay_targ_resp_t *otr = arg;
+	overlay_target_entry_t *entry;
+	mblk_t *mp;
+
+	mutex_enter(&thdl->oth_lock);
+	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+	    entry = list_next(&thdl->oth_outstanding, entry)) {
+		if ((uintptr_t)entry == otr->otr_reqid)
+			break;
+	}
+
+	if (entry == NULL) {
+		mutex_exit(&thdl->oth_lock);
+		return (EINVAL);
+	}
+	list_remove(&thdl->oth_outstanding, entry);
+	mutex_exit(&thdl->oth_lock);
+
+	/* Detach the queued packets under the lock; send them without it. */
+	mutex_enter(&entry->ote_lock);
+	bcopy(&otr->otr_answer, &entry->ote_dest,
+	    sizeof (overlay_target_point_t));
+	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+	mp = entry->ote_chead;
+	entry->ote_chead = NULL;
+	entry->ote_ctail = NULL;
+	entry->ote_mbsize = 0;
+	entry->ote_vtime = gethrtime();
+	mutex_exit(&entry->ote_lock);
+
+	/*
+	 * For now do an in-situ drain.
+	 */
+	mp = overlay_m_tx(entry->ote_odd, mp);
+	freemsgchain(mp);
+
+	mutex_enter(&entry->ote_ott->ott_lock);
+	entry->ote_ott->ott_ocount--;
+	cv_signal(&entry->ote_ott->ott_cond);
+	mutex_exit(&entry->ote_ott->ott_lock);
+
+	return (0);
+}
+
+/*
+ * ioctl handler: drop the packet at the head of an outstanding lookup.
+ *
+ * The request id must match an entry on this handle's outstanding list
+ * (EINVAL otherwise). The first queued packet is freed; if more packets
+ * remain, the entry stays pending and is queued again so that the next
+ * packet gets looked up, otherwise the pending flag is cleared. In all cases
+ * the target's outstanding count for this request is dropped at the end.
+ */
+static int
+overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
+{
+	const overlay_targ_resp_t *otr = arg;
+	overlay_target_entry_t *entry;
+	mblk_t *mp;
+	boolean_t queue = B_FALSE;
+
+	mutex_enter(&thdl->oth_lock);
+	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+	    entry = list_next(&thdl->oth_outstanding, entry)) {
+		if ((uintptr_t)entry == otr->otr_reqid)
+			break;
+	}
+
+	if (entry == NULL) {
+		mutex_exit(&thdl->oth_lock);
+		return (EINVAL);
+	}
+	list_remove(&thdl->oth_outstanding, entry);
+	mutex_exit(&thdl->oth_lock);
+
+	mutex_enter(&entry->ote_lock);
+
+	/* Safeguard against a confused varpd */
+	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+		DTRACE_PROBE1(overlay__target__valid__drop,
+		    overlay_target_entry_t *, entry);
+		mutex_exit(&entry->ote_lock);
+		goto done;
+	}
+
+	/* Unlink the head packet, if there is one, from the entry's chain. */
+	mp = entry->ote_chead;
+	if (mp != NULL) {
+		entry->ote_chead = mp->b_next;
+		mp->b_next = NULL;
+		if (entry->ote_ctail == mp)
+			entry->ote_ctail = entry->ote_chead;
+		entry->ote_mbsize -= msgsize(mp);
+	}
+	if (entry->ote_chead != NULL) {
+		queue = B_TRUE;
+		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+	} else {
+		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+	}
+	mutex_exit(&entry->ote_lock);
+
+	if (queue == B_TRUE)
+		overlay_target_queue(entry);
+	freemsg(mp);
+
+done:
+	mutex_enter(&entry->ote_ott->ott_lock);
+	entry->ote_ott->ott_ocount--;
+	cv_signal(&entry->ote_ott->ott_cond);
+	mutex_exit(&entry->ote_ott->ott_lock);
+
+	return (0);
+}
+
+/*
+ * Copyin routine for overlay_targ_pkt_t that copes with 32-bit callers.
+ *
+ * A native-sized structure is always allocated and returned through outp and
+ * bsize. For an ILP32 caller the 32-bit layout is copied in and its buffer
+ * pointer is then widened in place. Returns EFAULT when the copyin fails; in
+ * that case the buffer has been freed, although *outp still holds its
+ * address, so callers must not use *outp on failure.
+ */
+/* ARGSUSED */
+static int
+overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_pkt_t *pkt;
+	overlay_targ_pkt32_t *pkt32;
+
+	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
+	*outp = pkt;
+	*bsize = sizeof (overlay_targ_pkt_t);
+	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+		uintptr_t addr;
+
+		if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
+		    flags & FKIOCTL) != 0) {
+			kmem_free(pkt, *bsize);
+			return (EFAULT);
+		}
+		/* Read the 32-bit pointer before overwriting it in place. */
+		pkt32 = (overlay_targ_pkt32_t *)pkt;
+		addr = pkt32->otp_buf;
+		pkt->otp_buf = (void *)addr;
+	} else {
+		if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
+			kmem_free(pkt, *bsize);
+			return (EFAULT);
+		}
+	}
+	return (0);
+}
+
+static int
+overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
+    int flags)
+{
+	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+		overlay_targ_pkt_t *pkt = buf;
+		overlay_targ_pkt32_t *pkt32 = buf;
+		uintptr_t addr = (uintptr_t)pkt->otp_buf;
+		pkt32->otp_buf = (caddr32_t)addr;
+		if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
+		    flags & FKIOCTL) != 0)
+			return (EFAULT);
+	} else {
+		if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
+			return (EFAULT);
+	}
+	return (0);
+}
+
+static int
+overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	mblk_t *mp;
+	size_t mlen;
+	size_t boff;
+
+	mutex_enter(&thdl->oth_lock);
+	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+	    entry = list_next(&thdl->oth_outstanding, entry)) {
+		if ((uintptr_t)entry == pkt->otp_reqid)
+			break;
+	}
+
+	if (entry == NULL) {
+		mutex_exit(&thdl->oth_lock);
+		return (EINVAL);
+	}
+	mutex_enter(&entry->ote_lock);
+	mutex_exit(&thdl->oth_lock);
+	mp = entry->ote_chead;
+	/* Protect against a rogue varpd */
+	if (mp == NULL) {
+		mutex_exit(&entry->ote_lock);
+		return (EINVAL);
+	}
+	mlen = MIN(msgsize(mp), pkt->otp_size);
+	pkt->otp_size = mlen;
+	boff = 0;
+	while (mlen > 0) {
+		size_t wlen = MIN(MBLKL(mp), mlen);
+		if (ddi_copyout(mp->b_rptr,
+		    (void *)((uintptr_t)pkt->otp_buf + boff),
+		    wlen, 0) != 0) {
+			mutex_exit(&entry->ote_lock);
+			return (EFAULT);
+		}
+		mlen -= wlen;
+		boff += wlen;
+		mp = mp->b_cont;
+	}
+	mutex_exit(&entry->ote_lock);
+	return (0);
+}
+
+static int
+overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+	} else {
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_start(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+
+	mac_rx(odd->odd_mh, NULL, mp);
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_done(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+	if (pkt->otp_linkid != UINT64_MAX)
+		overlay_hold_rele(odd);	/* drop the by-dlid hold */
+	return (0);
+}
+
+static int
+overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+	} else {
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	mp = overlay_m_tx(odd, mp);
+	freemsgchain(mp);
+	if (pkt->otp_linkid != UINT64_MAX)
+		overlay_hold_rele(odd);	/* drop the by-dlid hold */
+	return (0);
+}
+
+typedef struct overlay_targ_list_int {
+	boolean_t	otli_count;
+	uint32_t	otli_cur;
+	uint32_t	otli_nents;
+	uint32_t	otli_ents[];
+} overlay_targ_list_int_t;
+
+static int
+overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_list_t n;
+	overlay_targ_list_int_t *otl;
+
+	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	/*
+	 */
+	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
+		return (EINVAL);
+	*bsize = sizeof (overlay_targ_list_int_t) +
+	    sizeof (uint32_t) * n.otl_nents;
+	otl = kmem_zalloc(*bsize, KM_SLEEP);
+	otl->otli_cur = 0;
+	otl->otli_nents = n.otl_nents;
+	if (otl->otli_nents != 0) {
+		otl->otli_count = B_FALSE;
+		if (ddi_copyin((void *)((uintptr_t)ubuf +
+		    offsetof(overlay_targ_list_t, otl_ents)),
+		    otl->otli_ents, n.otl_nents * sizeof (uint32_t),
+		    flags & FKIOCTL) != 0) {
+			kmem_free(otl, *bsize);
+			return (EFAULT);
+		}
+	} else {
+		otl->otli_count = B_TRUE;
+	}
+
+	*outp = otl;
+	return (0);
+}
+
+static int
+overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
+{
+	overlay_targ_list_int_t *otl = arg;
+
+	if (otl->otli_cur < otl->otli_nents)
+		otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
+	otl->otli_cur++;
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
+{
+	overlay_targ_list_int_t *otl = buf;
+
+	if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	if (otl->otli_count == B_FALSE) {
+		if (ddi_copyout(otl->otli_ents,
+		    (void *)((uintptr_t)ubuf +
+		    offsetof(overlay_targ_list_t, otl_ents)),
+		    sizeof (uint32_t) * otl->otli_nents,
+		    flags & FKIOCTL) != 0)
+			return (EFAULT);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
+{
+	int ret = 0;
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_targ_cache_t *otc = arg;
+
+	odd = overlay_hold_by_dlid(otc->otc_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_POINT &&
+	    ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		otc->otc_entry.otce_flags = 0;
+		bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
+		    sizeof (overlay_target_point_t));
+	} else {
+		overlay_target_entry_t *ote;
+		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+		    otc->otc_entry.otce_mac);
+		if (ote != NULL) {
+			mutex_enter(&ote->ote_lock);
+			if ((ote->ote_flags &
+			    OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+				if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+					otc->otc_entry.otce_flags =
+					    OVERLAY_TARGET_CACHE_DROP;
+				} else {
+					otc->otc_entry.otce_flags = 0;
+					bcopy(&ote->ote_dest,
+					    &otc->otc_entry.otce_dest,
+					    sizeof (overlay_target_point_t));
+				}
+				ret = 0;
+			} else {
+				ret = ENOENT;
+			}
+			mutex_exit(&ote->ote_lock);
+		} else {
+			ret = ENOENT;
+		}
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t *ote;
+	overlay_targ_cache_t *otc = arg;
+	mblk_t *mp = NULL;
+
+	if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+		return (EINVAL);
+
+	odd = overlay_hold_by_dlid(otc->otc_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+	    otc->otc_entry.otce_mac);
+	if (ote == NULL) {
+		ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+		bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
+		ote->ote_chead = ote->ote_ctail = NULL;
+		ote->ote_mbsize = 0;
+		ote->ote_ott = ott;
+		ote->ote_odd = odd;
+		mutex_enter(&ote->ote_lock);
+		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+		avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+	} else {
+		mutex_enter(&ote->ote_lock);
+	}
+
+	if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
+		ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
+	} else {
+		ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+		bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+		    sizeof (overlay_target_point_t));
+		mp = ote->ote_chead;
+		ote->ote_chead = NULL;
+		ote->ote_ctail = NULL;
+		ote->ote_mbsize = 0;
+		ote->ote_vtime = gethrtime();
+	}
+
+	mutex_exit(&ote->ote_lock);
+	mutex_exit(&ott->ott_lock);
+
+	if (mp != NULL) {
+		mp = overlay_m_tx(ote->ote_odd, mp);
+		freemsgchain(mp);
+	}
+
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
+{
+	int ret = 0;
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t *ote;
+	overlay_targ_cache_t *otc = arg;
+
+	odd = overlay_hold_by_dlid(otc->otc_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+	    otc->otc_entry.otce_mac);
+	if (ote != NULL) {
+		mutex_enter(&ote->ote_lock);
+		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+		mutex_exit(&ote->ote_lock);
+		ret = 0;
+	} else {
+		ret = ENOENT;
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
+{
+	avl_tree_t *avl;
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t *ote;
+	overlay_targ_cache_t *otc = arg;
+
+	odd = overlay_hold_by_dlid(otc->otc_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	avl = &ott->ott_u.ott_dyn.ott_tree;
+
+	/* Invalidate every entry in place; none are removed from the tree. */
+	for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+		mutex_enter(&ote->ote_lock);
+		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+		mutex_exit(&ote->ote_lock);
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+static int
+overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_cache_iter_t base, *iter;
+
+	if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
+		return (E2BIG);
+	if (base.otci_count == 0)
+		return (EINVAL);
+
+	/* Zeroed: entry bytes the iterator never fills get copied out too. */
+	*bsize = sizeof (overlay_targ_cache_iter_t) +
+	    base.otci_count * sizeof (overlay_targ_cache_entry_t);
+	iter = kmem_zalloc(*bsize, KM_SLEEP);
+	bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
+	*outp = iter;
+
+	return (0);
+}
+
+typedef struct overlay_targ_cache_marker {
+	uint8_t		otcm_mac[ETHERADDRL];
+	uint16_t	otcm_done;
+} overlay_targ_cache_marker_t;
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t lookup, *ent;
+	overlay_targ_cache_marker_t *mark;
+	avl_index_t where;
+	avl_tree_t *avl;
+	uint16_t written = 0;
+	overlay_targ_cache_iter_t *iter = arg;
+
+	mark = (void *)&iter->otci_marker;
+	if (mark->otcm_done != 0) {
+		iter->otci_count = 0;
+		return (0);
+	}
+
+	odd = overlay_hold_by_dlid(iter->otci_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
+	    ott->ott_mode != OVERLAY_TARGET_POINT) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Holding this lock across the entire iteration probably isn't very
+	 * good. We should perhaps add an r/w lock for the avl tree. But we'll
+	 * wait until we know it's necessary before we do more.
+	 */
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
+		bzero(out->otce_mac, ETHERADDRL);
+		out->otce_flags = 0;
+		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
+		    sizeof (overlay_target_point_t));
+		written++;
+		mark->otcm_done = 1;
+		goto done;	/* no dynamic tree to walk */
+	}
+
+	avl = &ott->ott_u.ott_dyn.ott_tree;
+	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+	ent = avl_find(avl, &lookup, &where);
+
+	/*
+	 * NULL ent means that the entry does not exist, so we want to start
+	 * with the closest node in the tree. This means that we implicitly rely
+	 * on the tree's order and the first node will be the mac
+	 * 00:00:00:00:00:00 and the last will be ff:ff:ff:ff:ff:ff.
+	 */
+	if (ent == NULL) {
+		ent = avl_nearest(avl, where, AVL_AFTER);
+		if (ent == NULL) {
+			mark->otcm_done = 1;
+			goto done;
+		}
+	}
+
+	for (; ent != NULL && written < iter->otci_count;
+	    ent = AVL_NEXT(avl, ent)) {
+		overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
+		mutex_enter(&ent->ote_lock);
+		if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
+			mutex_exit(&ent->ote_lock);
+			continue;
+		}
+		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+		out->otce_flags = 0;
+		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
+			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
+		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
+			bcopy(&ent->ote_dest, &out->otce_dest,
+			    sizeof (overlay_target_point_t));
+		written++;
+		mutex_exit(&ent->ote_lock);
+	}
+
+	if (ent != NULL) {
+		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+	} else {
+		mark->otcm_done = 1;
+	}
+
+done:
+	iter->otci_count = written;
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
+    int flags)
+{
+	size_t outsize;
+	const overlay_targ_cache_iter_t *iter = buf;
+
+	outsize = sizeof (overlay_targ_cache_iter_t) +
+	    iter->otci_count * sizeof (overlay_targ_cache_entry_t);
+
+	if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+static overlay_target_ioctl_t overlay_target_ioctab[] = {
+	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
+		NULL, overlay_target_info,
+		NULL, sizeof (overlay_targ_info_t)	},
+	{ OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
+		NULL, overlay_target_associate,
+		NULL, sizeof (overlay_targ_associate_t)	},
+	{ OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
+		NULL, overlay_target_disassociate,
+		NULL, sizeof (overlay_targ_id_t)	},
+	{ OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
+		NULL, overlay_target_degrade,
+		NULL, sizeof (overlay_targ_degrade_t)	},
+	{ OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
+		NULL, overlay_target_restore,
+		NULL, sizeof (overlay_targ_id_t)	},
+	{ OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
+		NULL, overlay_target_lookup_request,
+		NULL, sizeof (overlay_targ_lookup_t)	},
+	{ OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
+		NULL, overlay_target_lookup_respond,
+		NULL, sizeof (overlay_targ_resp_t)	},
+	{ OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
+		NULL, overlay_target_lookup_drop,
+		NULL, sizeof (overlay_targ_resp_t)	},
+	{ OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
+		overlay_target_pkt_copyin,
+		overlay_target_packet,
+		overlay_target_pkt_copyout,
+		sizeof (overlay_targ_pkt_t)		},
+	{ OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
+		overlay_target_pkt_copyin,
+		overlay_target_inject,
+		NULL, sizeof (overlay_targ_pkt_t)	},
+	{ OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
+		overlay_target_pkt_copyin,
+		overlay_target_resend,
+		NULL, sizeof (overlay_targ_pkt_t)	},
+	{ OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
+		overlay_target_list_copyin,
+		overlay_target_ioctl_list,
+		overlay_target_list_copyout,
+		sizeof (overlay_targ_list_t)		},
+	{ OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
+		NULL, overlay_target_cache_get,
+		NULL, sizeof (overlay_targ_cache_t)	},
+	{ OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
+		NULL, overlay_target_cache_set,
+		NULL, sizeof (overlay_targ_cache_t)	},
+	{ OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
+		NULL, overlay_target_cache_remove,
+		NULL, sizeof (overlay_targ_cache_t)	},
+	{ OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
+		NULL, overlay_target_cache_flush,
+		NULL, sizeof (overlay_targ_cache_t)	},
+	{ OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
+		overlay_target_cache_iter_copyin,
+		overlay_target_cache_iter,
+		overlay_target_cache_iter_copyout,
+		sizeof (overlay_targ_cache_iter_t)		},
+	{ 0 }
+};
+
+int
+overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
+{
+	minor_t mid;
+	overlay_target_hdl_t *thdl;
+
+	if (secpolicy_dl_config(credp) != 0)
+		return (EPERM);
+
+	if (getminor(*devp) != 0)
+		return (ENXIO);
+
+	if (otype & OTYP_BLK)
+		return (EINVAL);
+
+	if (flags & ~(FREAD | FWRITE | FEXCL))
+		return (EINVAL);
+
+	if ((flags & FWRITE) &&
+	    !(flags & FEXCL))
+		return (EINVAL);
+
+	if (!(flags & FREAD) && !(flags & FWRITE))
+		return (EINVAL);
+
+	if (crgetzoneid(credp) != GLOBAL_ZONEID)
+		return (EPERM);
+
+	mid = id_alloc(overlay_thdl_idspace);
+	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
+		id_free(overlay_thdl_idspace, mid);
+		return (ENXIO);
+	}
+
+	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
+	VERIFY(thdl != NULL);
+	thdl->oth_minor = mid;
+	thdl->oth_zoneid = crgetzoneid(credp);
+	thdl->oth_oflags = flags;
+	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
+	    offsetof(overlay_target_entry_t, ote_qlink));
+	*devp = makedevice(getmajor(*devp), mid);
+
+	mutex_enter(&overlay_target_lock);
+	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
+		mutex_exit(&overlay_target_lock);
+		list_destroy(&thdl->oth_outstanding);
+		mutex_destroy(&thdl->oth_lock);
+		ddi_soft_state_free(overlay_thdl_state, mid);
+		id_free(overlay_thdl_idspace, mid);
+		return (EEXIST);
+	} else if ((flags & FEXCL) != 0) {
+		VERIFY(overlay_target_excl == B_FALSE);
+		overlay_target_excl = B_TRUE;
+	}
+	list_insert_tail(&overlay_thdl_list, thdl);
+	mutex_exit(&overlay_target_lock);
+
+	return (0);
+}
+
+/* ARGSUSED */
+int
+overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	overlay_target_ioctl_t *ioc;
+	overlay_target_hdl_t *thdl;
+
+	if (secpolicy_dl_config(credp) != 0)
+		return (EPERM);
+
+	if ((thdl = ddi_get_soft_state(overlay_thdl_state,
+	    getminor(dev))) == NULL)
+		return (ENXIO);
+
+	for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
+		int ret;
+		caddr_t buf;
+		size_t bufsize;
+
+		if (ioc->oti_cmd != cmd)
+			continue;
+
+		if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
+			return (EBADF);
+
+		if (ioc->oti_copyin == NULL) {
+			bufsize = ioc->oti_size;
+			buf = kmem_alloc(bufsize, KM_SLEEP);
+			if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
+			    mode & FKIOCTL) != 0) {
+				kmem_free(buf, bufsize);
+				return (EFAULT);
+			}
+		} else {
+			if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
+			    (void **)&buf, &bufsize, mode)) != 0)
+				return (ret);
+		}
+
+		ret = ioc->oti_func(thdl, buf);
+		if (ret == 0 && ioc->oti_size != 0 &&
+		    ioc->oti_ncopyout == B_TRUE) {
+			if (ioc->oti_copyout == NULL) {
+				if (ddi_copyout(buf, (void *)(uintptr_t)arg,
+				    bufsize, mode & FKIOCTL) != 0)
+					ret = EFAULT;
+			} else {
+				ret = ioc->oti_copyout((void *)(uintptr_t)arg,
+				    buf, bufsize, mode);
+			}
+		}
+
+		kmem_free(buf, bufsize);
+		return (ret);
+	}
+
+	return (ENOTTY);
+}
+
+/* ARGSUSED */
+int
+overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
+{
+	overlay_target_hdl_t *thdl;
+	overlay_target_entry_t *entry;
+	minor_t mid = getminor(dev);
+
+	if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
+		return (ENXIO);
+
+	mutex_enter(&overlay_target_lock);
+	list_remove(&overlay_thdl_list, thdl);
+	mutex_enter(&thdl->oth_lock);
+	while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
+		list_insert_tail(&overlay_target_list, entry);
+	cv_signal(&overlay_target_condvar);
+	mutex_exit(&thdl->oth_lock);
+	if ((thdl->oth_oflags & FEXCL) != 0) {
+		VERIFY(overlay_target_excl == B_TRUE);
+		overlay_target_excl = B_FALSE;
+	}
+	mutex_exit(&overlay_target_lock);
+
+	list_destroy(&thdl->oth_outstanding);
+	mutex_destroy(&thdl->oth_lock);
+	mid = thdl->oth_minor;
+	ddi_soft_state_free(overlay_thdl_state, mid);
+	id_free(overlay_thdl_idspace, mid);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
new file mode 100644
index 0000000000..8b4e4ecb42
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
@@ -0,0 +1,372 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * VXLAN encapsulation module
+ *
+ *
+ * The VXLAN header looks as follows in network byte order:
+ *
+ * |0        3| 4 |5                     31|
+ * +----------+---+------------------------+
+ * | Reserved | I | Reserved               |
+ * +---------------------------------------+
+ * | Virtual Network ID         | Reserved |
+ * +----------------------------+----------+
+ * |0                         23|24      31|
+ *
+ * All reserved values must be 0. The I bit must be 1. We call the top
+ * word the VXLAN magic field for the time being. The second word is
+ * definitely not the most friendly way to operate. Specifically, the ID
+ * is a 24-bit big endian value, but we have to make sure not to use the
+ * reserved byte.
+ *
+ * For us, VXLAN encapsulation is a fairly straightforward implementation. It
+ * only has two properties, a listen_ip and a listen_port. These determine the
+ * address and port on which we listen. While we do not have a default
+ * address to listen upon, we do have a default port, which is the IANA assigned
+ * port for VXLAN -- 4789.
+ */
+
+#include <sys/overlay_plugin.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/byteorder.h>
+#include <sys/vxlan.h>
+#include <inet/ip.h>
+#include <netinet/in.h>
+#include <sys/strsun.h>
+#include <netinet/udp.h>
+
+static const char *vxlan_ident = "vxlan";
+static uint16_t vxlan_defport = IPPORT_VXLAN;
+
+/*
+ * Should we enable UDP source port hashing for fanout.
+ */
+boolean_t vxlan_fanout = B_TRUE;
+
+static const char *vxlan_props[] = {
+	"vxlan/listen_ip",
+	"vxlan/listen_port",
+	NULL
+};
+
+typedef struct vxlan {
+	kmutex_t vxl_lock;
+	overlay_handle_t vxl_oh;
+	uint16_t vxl_lport;
+	boolean_t vxl_hladdr;
+	struct in6_addr vxl_laddr;
+} vxlan_t;
+
+static int
+vxlan_o_init(overlay_handle_t oh, void **outp)
+{
+	vxlan_t *vxl;
+
+	vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
+	*outp = vxl;
+	mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
+	vxl->vxl_oh = oh;
+	vxl->vxl_lport = vxlan_defport;
+	vxl->vxl_hladdr = B_FALSE;
+
+	return (0);
+}
+
+static void
+vxlan_o_fini(void *arg)
+{
+	vxlan_t *vxl = arg;
+
+	mutex_destroy(&vxl->vxl_lock);
+	kmem_free(arg, sizeof (vxlan_t));
+}
+
+static int
+vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
+    socklen_t *slenp)
+{
+	vxlan_t *vxl = arg;
+	struct sockaddr_in6 *in;
+
+	in = (struct sockaddr_in6 *)addr;
+	*dp = AF_INET6;
+	*fp = SOCK_DGRAM;
+	*pp = 0;
+	bzero(in, sizeof (struct sockaddr_in6));
+	in->sin6_family = AF_INET6;
+
+	/*
+	 * We should consider a more expressive private errno set that
+	 * providers can use.
+	 */
+	mutex_enter(&vxl->vxl_lock);
+	if (vxl->vxl_hladdr == B_FALSE) {
+		mutex_exit(&vxl->vxl_lock);
+		return (EINVAL);
+	}
+	in->sin6_port = htons(vxl->vxl_lport);
+	in->sin6_addr = vxl->vxl_laddr;
+	mutex_exit(&vxl->vxl_lock);
+	*slenp = sizeof (struct sockaddr_in6);
+
+	return (0);
+}
+
+static int
+vxlan_o_sockopt(ksocket_t ksock)
+{
+	int val, err;
+	if (vxlan_fanout == B_FALSE)
+		return (0);
+
+	val = UDP_HASH_VXLAN;
+	err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
+	    sizeof (val), kcred);
+	return (err);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
+    mblk_t **outp)
+{
+	mblk_t *ob;
+	vxlan_hdr_t *vxh;
+
+	ASSERT(einfop->ovdi_id < (1 << 24));
+
+	/*
+	 * This allocation could get hot. We may want to have a good way to
+	 * cache and handle this allocation the same way that IP does with
+	 * keeping around a message block per entry, or basically treating this
+	 * as an immutable message block in the system. Basically freemsg() will
+	 * be a nop, but we'll do the right thing with respect to the rest of
+	 * the chain.
+	 */
+	ob = allocb(VXLAN_HDR_LEN, 0);
+	if (ob == NULL)
+		return (ENOMEM);
+
+	vxh = (vxlan_hdr_t *)ob->b_rptr;
+	vxh->vxlan_flags = htonl(VXLAN_F_VDI);
+	vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
+	ob->b_wptr += VXLAN_HDR_LEN;
+	*outp = ob;
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
+{
+	vxlan_hdr_t *vxh;
+
+	if (MBLKL(mp) < sizeof (vxlan_hdr_t))
+		return (EINVAL);
+	vxh = (vxlan_hdr_t *)mp->b_rptr;
+	if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
+		return (EINVAL);
+
+	dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
+	dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
+
+	return (0);
+}
+
+static int
+vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
+{
+	vxlan_t *vxl = arg;
+
+	/* vxlan/listen_ip */
+	if (strcmp(pr_name, vxlan_props[0]) == 0) {
+		if (*bufsize < sizeof (struct in6_addr))
+			return (EOVERFLOW);
+
+		mutex_enter(&vxl->vxl_lock);
+		if (vxl->vxl_hladdr == B_FALSE) {
+			*bufsize = 0;
+		} else {
+			bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
+			*bufsize = sizeof (struct in6_addr);
+		}
+		mutex_exit(&vxl->vxl_lock);
+		return (0);
+	}
+
+	/* vxlan/listen_port */
+	if (strcmp(pr_name, vxlan_props[1]) == 0) {
+		uint64_t val;
+		if (*bufsize < sizeof (uint64_t))
+			return (EOVERFLOW);
+
+		mutex_enter(&vxl->vxl_lock);
+		val = vxl->vxl_lport;
+		bcopy(&val, buf, sizeof (uint64_t));
+		*bufsize = sizeof (uint64_t);
+		mutex_exit(&vxl->vxl_lock);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+static int
+vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
+    uint32_t bufsize)
+{
+	vxlan_t *vxl = arg;
+
+	/* vxlan/listen_ip */
+	if (strcmp(pr_name, vxlan_props[0]) == 0) {
+		const struct in6_addr *ipv6 = buf;
+		if (bufsize != sizeof (struct in6_addr))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_V4COMPAT(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_MULTICAST(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_6TO4(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
+			ipaddr_t v4;
+			IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
+			if (IN_MULTICAST(v4))
+				return (EINVAL);
+		}
+
+		mutex_enter(&vxl->vxl_lock);
+		vxl->vxl_hladdr = B_TRUE;
+		bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
+		mutex_exit(&vxl->vxl_lock);
+
+		return (0);
+	}
+
+	/* vxlan/listen_port */
+	if (strcmp(pr_name, vxlan_props[1]) == 0) {
+		const uint64_t *valp = buf;
+		if (bufsize != 8)
+			return (EINVAL);
+
+		if (*valp == 0 || *valp > UINT16_MAX)
+			return (EINVAL);
+
+		mutex_enter(&vxl->vxl_lock);
+		vxl->vxl_lport = *valp;
+		mutex_exit(&vxl->vxl_lock);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+static int
+vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
+{
+	/* vxlan/listen_ip */
+	if (strcmp(pr_name, vxlan_props[0]) == 0) {
+		overlay_prop_set_name(phdl, vxlan_props[0]);
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
+		overlay_prop_set_nodefault(phdl);
+		return (0);
+	}
+
+	if (strcmp(pr_name, vxlan_props[1]) == 0) {
+		overlay_prop_set_name(phdl, vxlan_props[1]);
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+		(void) overlay_prop_set_default(phdl, &vxlan_defport,
+		    sizeof (vxlan_defport));
+		overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+static struct overlay_plugin_ops vxlan_o_ops = {
+	0,
+	vxlan_o_init,
+	vxlan_o_fini,
+	vxlan_o_encap,
+	vxlan_o_decap,
+	vxlan_o_socket,
+	vxlan_o_sockopt,
+	vxlan_o_getprop,
+	vxlan_o_setprop,
+	vxlan_o_propinfo
+};
+
+static struct modlmisc vxlan_modlmisc = {
+	&mod_miscops,
+	"VXLAN encap plugin"
+};
+
+static struct modlinkage vxlan_modlinkage = {
+	MODREV_1,
+	&vxlan_modlmisc
+};
+
+int
+_init(void)
+{
+	int err;
+	overlay_plugin_register_t *ovrp;
+
+	ovrp = overlay_plugin_alloc(OVEP_VERSION);
+	if (ovrp == NULL)
+		return (ENOTSUP);
+	ovrp->ovep_name = vxlan_ident;
+	ovrp->ovep_ops = &vxlan_o_ops;
+	ovrp->ovep_id_size = VXLAN_ID_LEN;
+	ovrp->ovep_flags = OVEP_F_VLAN_TAG;
+	ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+	ovrp->ovep_props = vxlan_props;
+
+	if ((err = overlay_plugin_register(ovrp)) == 0) {
+		if ((err = mod_install(&vxlan_modlinkage)) != 0) {
+			(void) overlay_plugin_unregister(vxlan_ident);
+		}
+	}
+
+	overlay_plugin_free(ovrp);
+	return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&vxlan_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int err;
+
+	if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
+		return (err);
+
+	return (mod_remove(&vxlan_modlinkage));
+}
diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c
index 39d5003b02..c48fecd133 100644
--- a/usr/src/uts/common/io/physmem.c
+++ b/usr/src/uts/common/io/physmem.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 
@@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 	int ret;
 	static int msg_printed = 0;
 
+	/*
+	 * This device should never be visible in a zone, but if it somehow
+	 * does get created we refuse to allow the zone to use it.
+	 */
+	if (crgetzoneid(credp) != GLOBAL_ZONEID)
+		return (EACCES);
+
 	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
 		return (EINVAL);
 	}
diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf
index 42248e93d6..08affec609 100644
--- a/usr/src/uts/common/io/pseudo.conf
+++ b/usr/src/uts/common/io/pseudo.conf
@@ -22,8 +22,7 @@
 #
 # Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
-#
-# ident	"%Z%%M%	%I%	%E% SMI"
+# Copyright 2014 Joyent, Inc.  All rights reserved.
 #
 # This file is private to the pseudonex driver.  It should not be edited.
 #
@@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0;
 # /pseudo; it has as its children the zone console pseudo nodes.
 #
 name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons";
+
+#
+# zfdnex is an alias for pseudo; this node is instantiated as a child of
+# /pseudo; it has as its children the zone fd pseudo nodes.
+#
+name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd";
diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c
index f83b0abf39..0ae06f88cc 100644
--- a/usr/src/uts/common/io/pseudonex.c
+++ b/usr/src/uts/common/io/pseudonex.c
@@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t);
 static int pseudonex_open(dev_t *, int, int, cred_t *);
 static int pseudonex_close(dev_t, int, int, cred_t *);
 static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int,
+    ddi_iblock_cookie_t *);
 static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *,
     void *);
 
@@ -90,6 +92,8 @@ static void *pseudonex_state;
 
 typedef struct pseudonex_state {
 	dev_info_t *pnx_devi;
+	int pnx_fmcap;
+	ddi_iblock_cookie_t pnx_fm_ibc;
 } pseudonex_state_t;
 
 static struct bus_ops pseudonex_bus_ops = {
@@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = {
 	NULL,			/* bus_intr_ctl */
 	NULL,			/* bus_config */
 	NULL,			/* bus_unconfig */
-	NULL,			/* bus_fm_init */
+	pseudonex_fm_init,	/* bus_fm_init */
 	NULL,			/* bus_fm_fini */
 	NULL,			/* bus_fm_access_enter */
 	NULL,			/* bus_fm_access_exit */
@@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 	pnx_state = ddi_get_soft_state(pseudonex_state, instance);
 	pnx_state->pnx_devi = devi;
 
+	pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE;
+	ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc);
+
 	if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance,
 	    DDI_NT_NEXUS, 0) != DDI_SUCCESS) {
 		ddi_remove_minor_node(devi, NULL);
@@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
 	if (cmd == DDI_SUSPEND)
 		return (DDI_SUCCESS);
 
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ddi_fm_fini(devi);
 	ddi_remove_minor_node(devi, NULL);
 	ddi_soft_state_free(pseudonex_state, instance);
 	return (DDI_SUCCESS);
@@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child)
 }
 
 static int
+pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap,
+    ddi_iblock_cookie_t *ibc)
+{
+	pseudonex_state_t *pnx_state;
+
+	pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip));
+	ASSERT(pnx_state != NULL);
+	ASSERT(ibc != NULL);
+	*ibc = pnx_state->pnx_fm_ibc;
+	return (pnx_state->pnx_fmcap & cap);
+}
+
+static int
 pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
     void *arg, void *result)
 {
diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c
index 400e9ffd10..07ffddc123 100644
--- a/usr/src/uts/common/io/ptm.c
+++ b/usr/src/uts/common/io/ptm.c
@@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp)
 	return (0);
 }
 
+static boolean_t
+ptmptsopencb(ptmptsopencb_arg_t arg)
+{
+	struct pt_ttys	*ptmp = (struct pt_ttys *)arg;
+	boolean_t rval;
+
+	PT_ENTER_READ(ptmp);
+	rval = (ptmp->pt_nullmsg != NULL);
+	PT_EXIT_READ(ptmp);
+	return (rval);
+}
+
 /*
  * The wput procedure will only handle ioctl and flush messages.
  */
@@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp)
 			miocack(qp, mp, 0, 0);
 			break;
 		}
+		case PTMPTSOPENCB:
+		{
+			mblk_t		*dp;	/* ioctl reply data */
+			ptmptsopencb_t	*ppocb;
+
+			/* only allow the kernel to invoke this ioctl */
+			if (iocp->ioc_cr != kcred) {
+				miocnak(qp, mp, 0, EINVAL);
+				break;
+			}
+
+			/* we don't support transparent ioctls */
+			ASSERT(iocp->ioc_count != TRANSPARENT);
+			if (iocp->ioc_count == TRANSPARENT) {
+				miocnak(qp, mp, 0, EINVAL);
+				break;
+			}
+
+			/* allocate a response message */
+			dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED);
+			if (dp == NULL) {
+				miocnak(qp, mp, 0, EAGAIN);
+				break;
+			}
+
+			/* initialize the ioctl results */
+			ppocb = (ptmptsopencb_t *)dp->b_rptr;
+			ppocb->ppocb_func = ptmptsopencb;
+			ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp;
+
+			/* send the reply data */
+			mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0);
+			qreply(qp, mp);
+			break;
+		}
 		}
 		break;
 
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
index 72c8800f3e..dc5e8eafc9 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  * Copyright 2014 OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
  */
@@ -72,6 +72,7 @@
 #include <sys/file.h>
 #include <sys/policy.h>
 #include <sys/model.h>
+#include <sys/refhash.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sysevent/dr.h>
@@ -99,7 +100,6 @@
 #include <sys/scsi/adapters/mpt_sas/mptsas_var.h>
 #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>
 #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h>
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
 #include <sys/raidioctl.h>
 
 #include <sys/fs/dv_node.h>	/* devfs_clean */
diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c
index ae1e7e0fc3..dc5dc22e37 100644
--- a/usr/src/uts/common/io/scsi/targets/sd.c
+++ b/usr/src/uts/common/io/scsi/targets/sd.c
@@ -3503,9 +3503,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc)
 		 * according to the successful response to the page
 		 * 0x2A mode sense request.
 		 */
-		scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
-		    "sd_set_mmc_caps: Mode Sense returned "
-		    "invalid block descriptor length\n");
+		/*
+		 * The following warning occurs due to the KVM CD-ROM
+		 * mishandling the multi-media commands.  Ignore it.
+		 * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
+		 *     "sd_set_mmc_caps: Mode Sense returned "
+		 *     "invalid block descriptor length\n");
+		 */
 		kmem_free(buf, BUFLEN_MODE_CDROM_CAP);
 		return;
 	}
@@ -4450,18 +4454,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen)
 {
 	struct scsi_inquiry	*sd_inq;
 	int 			rval = SD_SUCCESS;
+	char			*p;
+	int			chk_vidlen = 0, chk_pidlen = 0;
+	int			has_tail = 0;
+	static const int	VSZ = sizeof (sd_inq->inq_vid);
+	static const int	PSZ = sizeof (sd_inq->inq_pid);
 
 	ASSERT(un != NULL);
 	sd_inq = un->un_sd->sd_inq;
 	ASSERT(id != NULL);
 
 	/*
-	 * We use the inq_vid as a pointer to a buffer containing the
-	 * vid and pid and use the entire vid/pid length of the table
-	 * entry for the comparison. This works because the inq_pid
-	 * data member follows inq_vid in the scsi_inquiry structure.
+	 * We would like to use the inq_vid as a pointer to a buffer
+	 * containing the vid and pid and use the entire vid/pid length of
+	 * the table entry for the comparison.  However, this does not work
+	 * because, while the inq_pid data member follows inq_vid in the
+	 * scsi_inquiry structure, we do not control the contents of this
+	 * buffer, and some broken devices violate SPC 4.3.1 and return
+	 * fields with null bytes in them.
+	 */
+	chk_vidlen = MIN(VSZ, idlen);
+	p = id + chk_vidlen - 1;
+	while (*p == ' ' && chk_vidlen > 0) {
+		--p;
+		--chk_vidlen;
+	}
+
+	/*
+	 * If it's all spaces, check the whole thing.
 	 */
-	if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) {
+	if (chk_vidlen == 0)
+		chk_vidlen = MIN(VSZ, idlen);
+
+	if (idlen > VSZ) {
+		chk_pidlen = idlen - VSZ;
+		p = id + idlen - 1;
+		while (*p == ' ' && chk_pidlen > 0) {
+			--p;
+			--chk_pidlen;
+		}
+		if (chk_pidlen == 0)
+			chk_pidlen = MIN(PSZ, idlen - VSZ);
+	}
+
+	/*
+	 * There's one more thing we need to do here.  If the user specified
+	 * an ID with trailing spaces, we need to make sure the inquiry
+	 * vid/pid has only spaces or NULs after the check length; otherwise, it
+	 * can't match.
+	 */
+	if (idlen > chk_vidlen && chk_vidlen < VSZ) {
+		for (p = sd_inq->inq_vid + chk_vidlen;
+		    p < sd_inq->inq_vid + VSZ; ++p) {
+			if (*p != ' ' && *p != '\0') {
+				++has_tail;
+				break;
+			}
+		}
+	}
+	if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) {
+		for (p = sd_inq->inq_pid + chk_pidlen;
+		    p < sd_inq->inq_pid + PSZ; ++p) {
+			if (*p != ' ' && *p != '\0') {
+				++has_tail;
+				break;
+			}
+		}
+	}
+
+	if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 ||
+	    (idlen > VSZ &&
+	    strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) {
 		/*
 		 * The user id string is compared to the inquiry vid/pid
 		 * using a case insensitive comparison and ignoring
diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c
index 32f8f85f7a..4ab4f36d4e 100644
--- a/usr/src/uts/common/io/signalfd.c
+++ b/usr/src/uts/common/io/signalfd.c
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -19,97 +19,73 @@
  *
  * As described on the signalfd(3C) man page, the general idea behind these
  * file descriptors is that they can be used to synchronously consume signals
- * via the read(2) syscall. That capability already exists with the
- * sigwaitinfo(3C) function but the key advantage of signalfd is that, because
- * it is file descriptor based, poll(2) can be used to determine when signals
- * are available to be consumed.
+ * via the read(2) syscall.  While that capability already exists with the
+ * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
+ * descriptor based: It can use the event facilities (poll(2), /dev/poll,
+ * event ports) to notify interested parties when consumable signals arrive.
  *
- * The general implementation uses signalfd_state to hold both the signal set
- * and poll head for an open file descriptor. Because a process can be using
- * different sigfds with different signal sets, each signalfd_state poll head
- * can be thought of as an independent signal stream and the thread(s) waiting
- * on that stream will get poll notification when any signal in the
- * corresponding set is received.
+ * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
+ * will be allocated for it along with an associated signalfd_state_t struct.
+ * It is there where the mask of desired signals resides.
  *
- * The sigfd_proc_state_t struct lives on the proc_t and maintains per-proc
- * state for function callbacks and data when the proc needs to do work during
- * signal delivery for pollwakeup.
+ * Reading from the signalfd is straightforward and mimics the kernel behavior
+ * for sigtimedwait().  Signals continue to live on either the proc's p_sig, or
+ * thread's t_sig, member.  During a read operation, those which match the mask
+ * are consumed so they are no longer pending.
  *
- * The read side of the implementation is straightforward and mimics the
- * kernel behavior for sigtimedwait(). Signals continue to live on either
- * the proc's p_sig, or thread's t_sig, member. Read consumes the signal so
- * that it is no longer pending.
+ * The poll side is more complex.  Every time a signal is delivered, all of the
+ * signalfds on the process need to be examined in order to pollwake threads
+ * waiting for signal arrival.
  *
- * The poll side is more complex since all of the sigfds on the process need
- * to be examined every time a signal is delivered to the process in order to
- * pollwake any thread waiting in poll for that signal.
+ * When a thread polling on a signalfd requires a pollhead, several steps must
+ * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
+ * created for the calling process if it does not yet exist.  It is there where
+ * a list of sigfd_poll_waiter_t structures reside which associate pollheads to
+ * signalfd_state_t entries.  The sigfd_proc_state_t list is walked to find a
+ * sigfd_poll_waiter_t matching the signalfd_state_t which corresponds to the
+ * polled resource.  If one is found, it is reused.  Otherwise a new one is
+ * created, incrementing the refcount on the signalfd_state_t, and it is added
+ * to the sigfd_poll_waiter_t list.
  *
- * Because it is likely that a process will only be using one, or a few, sigfds,
- * but many total file descriptors, we maintain a list of sigfds which need
- * pollwakeup. The list lives on the proc's p_sigfd struct. In this way only
- * zero, or a few, of the state structs will need to be examined every time a
- * signal is delivered to the process, instead of having to examine all of the
- * file descriptors to find the state structs. When a state struct with a
- * matching signal set is found then pollwakeup is called.
+ * The complications imposed by fork(2) are why the pollhead is stored in the
+ * associated sigfd_poll_waiter_t instead of directly in the signalfd_state_t.
+ * More than one process can hold a reference to the signalfd at a time but
+ * arriving signals should wake only process-local pollers.  Additionally,
+ * signalfd_close is called only when the last referencing fd is closed, hiding
+ * occurrences of preceding threads which released their references.  This
+ * necessitates reference counting on the signalfd_state_t so it is able to
+ * persist after close until all poll references have been cleansed.  Doing so
+ * ensures that blocked pollers which hold references to the signalfd_state_t
+ * will be able to do clean-up after the descriptor itself has been closed.
  *
- * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list
- * will clear out on its own. There is an exit helper (signalfd_exit_helper)
- * which cleans up any remaining per-proc state when the process exits.
+ * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
+ * is called via the pointer in sigfd_proc_state_t.  It will walk over the
+ * sigfd_poll_waiter_t entries present in the list, searching for any
+ * associated with a signalfd_state_t with a matching signal mask.  The
+ * approach of keeping the poller list in p_sigfd was chosen because a process
+ * is likely to use few signalfds relative to its total file descriptors.  It
+ * reduces the work required for each received signal.
  *
- * The main complexity with signalfd is the interaction of forking and polling.
- * This interaction is complex because now two processes have a fd that
- * references the same dev_t (and its associated signalfd_state), but signals
- * go to only one of those processes. Also, we don't know when one of the
- * processes closes its fd because our 'close' entry point is only called when
- * the last fd is closed (which could be by either process).
+ * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list
+ * will clear out on its own.  Any per-process state which remains
+ * will be cleaned up by the exit helper (signalfd_exit_helper).
  *
- * Because the state struct is referenced by both file descriptors, and the
- * state struct represents a signal stream needing a pollwakeup, if both
- * processes were polling then both processes would get a pollwakeup when a
- * signal arrives for either process (that is, the pollhead is associated with
- * our dev_t so when a signal arrives the pollwakeup wakes up all waiters).
+ * The structures associated with signalfd state are designed to operate
+ * correctly across fork, but there is one caveat that applies.  Using
+ * fork-shared signalfd descriptors in conjunction with fork-shared caching poll
+ * descriptors (such as /dev/poll or event ports) will result in missed poll
+ * wake-ups.  This is caused by the pollhead identity of signalfd descriptors
+ * being dependent on the process they are polled from.  Because it has a
+ * thread-local cache, poll(2) is unaffected by this limitation.
  *
- * Fortunately this is not a common problem in practice, but the implementation
- * attempts to mitigate unexpected behavior. The typical behavior is that the
- * parent has been polling the signalfd (which is why it was open in the first
- * place) and the parent might have a pending signalfd_state (with the
- * pollhead) on its per-process sigfd_list. After the fork the child will
- * simply close that fd (among others) as part of the typical fork/close/exec
- * pattern. Because the child will never poll that fd, it will never get any
- * state onto its own sigfd_list (the child starts with a null list). The
- * intention is that the child sees no pollwakeup activity for signals unless
- * it explicitly reinvokes poll on the sigfd.
+ * Lock ordering:
  *
- * As background, there are two primary polling cases to consider when the
- * parent process forks:
- * 1) If any thread is blocked in poll(2) then both the parent and child will
- *    return from the poll syscall with EINTR. This means that if either
- *    process wants to re-poll on a sigfd then it needs to re-run poll and
- *    would come back in to the signalfd_poll entry point. The parent would
- *    already have the dev_t's state on its sigfd_list and the child would not
- *    have anything there unless it called poll again on its fd.
- * 2) If the process is using /dev/poll(7D) then the polling info is being
- *    cached by the poll device and the process might not currently be blocked
- *    on anything polling related. A subsequent DP_POLL ioctl will not invoke
- *    our signalfd_poll entry point again. Because the parent still has its
- *    sigfd_list setup, an incoming signal will hit our signalfd_pollwake_cb
- *    entry point, which in turn calls pollwake, and /dev/poll will do the
- *    right thing on DP_POLL. The child will not have a sigfd_list yet so the
- *    signal will not cause a pollwakeup. The dp code does its own handling for
- *    cleaning up its cache.
+ * 1. signalfd_lock
+ * 2. signalfd_state_t`sfd_lock
  *
- * This leaves only one odd corner case. If the parent and child both use
- * the dup-ed sigfd to poll then when a signal is delivered to either process
- * there is no way to determine which one should get the pollwakeup (since
- * both processes will be queued on the same signal stream poll head). What
- * happens in this case is that both processes will return from poll, but only
- * one of them will actually have a signal to read. The other will return
- * from read with EAGAIN, or block. This case is actually similar to the
- * situation within a single process which got two different sigfd's with the
- * same mask (or poll on two fd's that are dup-ed). Both would return from poll
- * when a signal arrives but only one read would consume the signal and the
- * other read would fail or block. Applications which poll on shared fd's
- * cannot assume that a subsequent read will actually obtain data.
+ * 1. proc_t`p_lock (to walk p_sigfd)
+ * 2. signalfd_state_t`sfd_lock
+ * 2a. signalfd_lock (after sfd_lock is dropped, when sfd_count falls to 0)
  */
 
 #include <sys/ddi.h>
@@ -123,118 +99,150 @@
 #include <sys/schedctl.h>
 #include <sys/id_space.h>
 #include <sys/sdt.h>
+#include <sys/brand.h>
 
 typedef struct signalfd_state signalfd_state_t;
 
 struct signalfd_state {
-	kmutex_t sfd_lock;			/* lock protecting state */
-	pollhead_t sfd_pollhd;			/* poll head */
-	k_sigset_t sfd_set;			/* signals for this fd */
-	signalfd_state_t *sfd_next;		/* next state on global list */
+	list_node_t	sfd_list;		/* node in global list */
+	kmutex_t	sfd_lock;		/* protects fields below */
+	uint_t		sfd_count;		/* ref count */
+	boolean_t	sfd_valid;		/* valid while open */
+	k_sigset_t	sfd_set;		/* signals for this fd */
 };
 
+typedef struct sigfd_poll_waiter {
+	list_node_t		spw_list;
+	signalfd_state_t	*spw_state;
+	pollhead_t		spw_pollhd;
+} sigfd_poll_waiter_t;
+
 /*
- * Internal global variables.
+ * Protects global state in signalfd_devi, signalfd_minor, signalfd_softstate,
+ * and signalfd_state (including sfd_list field of members)
  */
-static kmutex_t		signalfd_lock;		/* lock protecting state */
+static kmutex_t		signalfd_lock;
 static dev_info_t	*signalfd_devi;		/* device info */
 static id_space_t	*signalfd_minor;	/* minor number arena */
 static void		*signalfd_softstate;	/* softstate pointer */
-static signalfd_state_t	*signalfd_state;	/* global list of state */
+static list_t		signalfd_state;		/* global list of state */
+
 
-/*
- * If we don't already have an entry in the proc's list for this state, add one.
- */
 static void
-signalfd_wake_list_add(signalfd_state_t *state)
+signalfd_state_enter(signalfd_state_t *state)
 {
-	proc_t *p = curproc;
-	list_t *lst;
-	sigfd_wake_list_t *wlp;
-
-	ASSERT(MUTEX_HELD(&p->p_lock));
-	ASSERT(p->p_sigfd != NULL);
+	ASSERT(MUTEX_HELD(&state->sfd_lock));
+	ASSERT(state->sfd_count > 0);
+	VERIFY(state->sfd_valid == B_TRUE);
 
-	lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list;
-	for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) {
-		if (wlp->sigfd_wl_state == state)
-			break;
-	}
-
-	if (wlp == NULL) {
-		wlp = kmem_zalloc(sizeof (sigfd_wake_list_t), KM_SLEEP);
-		wlp->sigfd_wl_state = state;
-		list_insert_head(lst, wlp);
-	}
+	state->sfd_count++;
 }
 
 static void
-signalfd_wake_rm(list_t *lst, sigfd_wake_list_t *wlp)
+signalfd_state_release(signalfd_state_t *state, boolean_t locked)
 {
-	list_remove(lst, wlp);
-	kmem_free(wlp, sizeof (sigfd_wake_list_t));
+	ASSERT(MUTEX_HELD(&state->sfd_lock));
+	ASSERT(state->sfd_count > 0);
+
+	if (state->sfd_count == 1) {
+		VERIFY(state->sfd_valid == B_FALSE);
+		mutex_exit(&state->sfd_lock);
+		if (locked) {
+			ASSERT(MUTEX_HELD(&signalfd_lock));
+			list_remove(&signalfd_state, state);
+		} else {
+			ASSERT(MUTEX_NOT_HELD(&signalfd_lock));
+			mutex_enter(&signalfd_lock);
+			list_remove(&signalfd_state, state);
+			mutex_exit(&signalfd_lock);
+		}
+		kmem_free(state, sizeof (*state));
+		return;
+	}
+	state->sfd_count--;
+	mutex_exit(&state->sfd_lock);
 }
 
-static void
-signalfd_wake_list_rm(proc_t *p, signalfd_state_t *state)
+static sigfd_poll_waiter_t *
+signalfd_wake_list_add(sigfd_proc_state_t *pstate, signalfd_state_t *state)
 {
-	sigfd_wake_list_t *wlp;
-	list_t *lst;
+	list_t *lst = &pstate->sigfd_list;
+	sigfd_poll_waiter_t *pw;
 
-	ASSERT(MUTEX_HELD(&p->p_lock));
+	for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
+		if (pw->spw_state == state)
+			break;
+	}
 
-	if (p->p_sigfd == NULL)
-		return;
+	if (pw == NULL) {
+		pw = kmem_zalloc(sizeof (*pw), KM_SLEEP);
+
+		mutex_enter(&state->sfd_lock);
+		signalfd_state_enter(state);
+		pw->spw_state = state;
+		mutex_exit(&state->sfd_lock);
+		list_insert_head(lst, pw);
+	}
+	return (pw);
+}
+
+static sigfd_poll_waiter_t *
+signalfd_wake_list_rm(sigfd_proc_state_t *pstate, signalfd_state_t *state)
+{
+	list_t *lst = &pstate->sigfd_list;
+	sigfd_poll_waiter_t *pw;
 
-	lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list;
-	for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) {
-		if (wlp->sigfd_wl_state == state) {
-			signalfd_wake_rm(lst, wlp);
+	for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
+		if (pw->spw_state == state) {
 			break;
 		}
 	}
 
-	if (list_is_empty(lst)) {
-		((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL;
-		list_destroy(lst);
-		kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t));
-		p->p_sigfd = NULL;
+	if (pw != NULL) {
+		list_remove(lst, pw);
+		mutex_enter(&state->sfd_lock);
+		signalfd_state_release(state, B_FALSE);
+		pw->spw_state = NULL;
 	}
+
+	return (pw);
 }
 
 static void
 signalfd_wake_list_cleanup(proc_t *p)
 {
-	sigfd_wake_list_t *wlp;
+	sigfd_proc_state_t *pstate = p->p_sigfd;
+	sigfd_poll_waiter_t *pw;
 	list_t *lst;
 
 	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(pstate != NULL);
+
+	lst = &pstate->sigfd_list;
+	while ((pw = list_remove_head(lst)) != NULL) {
+		signalfd_state_t *state = pw->spw_state;
 
-	((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL;
+		pw->spw_state = NULL;
+		mutex_enter(&state->sfd_lock);
+		signalfd_state_release(state, B_FALSE);
 
-	lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list;
-	while (!list_is_empty(lst)) {
-		wlp = (sigfd_wake_list_t *)list_remove_head(lst);
-		kmem_free(wlp, sizeof (sigfd_wake_list_t));
+		pollwakeup(&pw->spw_pollhd, POLLERR);
+		pollhead_clean(&pw->spw_pollhd);
+		kmem_free(pw, sizeof (*pw));
 	}
+	list_destroy(lst);
+
+	p->p_sigfd = NULL;
+	kmem_free(pstate, sizeof (*pstate));
 }
 
 static void
 signalfd_exit_helper(void)
 {
 	proc_t *p = curproc;
-	list_t *lst;
-
-	/* This being non-null is the only way we can get here */
-	ASSERT(p->p_sigfd != NULL);
 
 	mutex_enter(&p->p_lock);
-	lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list;
-
 	signalfd_wake_list_cleanup(p);
-	list_destroy(lst);
-	kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t));
-	p->p_sigfd = NULL;
 	mutex_exit(&p->p_lock);
 }
 
@@ -254,35 +262,40 @@ static void
 signalfd_pollwake_cb(void *arg0, int sig)
 {
 	proc_t *p = (proc_t *)arg0;
+	sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;
 	list_t *lst;
-	sigfd_wake_list_t *wlp;
+	sigfd_poll_waiter_t *pw;
 
 	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(pstate != NULL);
 
-	if (p->p_sigfd == NULL)
-		return;
-
-	lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list;
-	wlp = list_head(lst);
-	while (wlp != NULL) {
-		signalfd_state_t *state = wlp->sigfd_wl_state;
+	lst = &pstate->sigfd_list;
+	pw = list_head(lst);
+	while (pw != NULL) {
+		signalfd_state_t *state = pw->spw_state;
+		sigfd_poll_waiter_t *next;
+		short pollev;
 
 		mutex_enter(&state->sfd_lock);
-
-		if (sigismember(&state->sfd_set, sig) &&
-		    state->sfd_pollhd.ph_list != NULL) {
-			sigfd_wake_list_t *tmp = wlp;
-
-			/* remove it from the list */
-			wlp = list_next(lst, wlp);
-			signalfd_wake_rm(lst, tmp);
-
-			mutex_exit(&state->sfd_lock);
-			pollwakeup(&state->sfd_pollhd, POLLRDNORM | POLLIN);
+		if (!state->sfd_valid) {
+			pollev = POLLERR;
+		} else if (sigismember(&state->sfd_set, sig)) {
+			pollev = POLLRDNORM | POLLIN;
 		} else {
 			mutex_exit(&state->sfd_lock);
-			wlp = list_next(lst, wlp);
+			pw = list_next(lst, pw);
+			continue;
 		}
+
+		signalfd_state_release(state, B_FALSE);
+		pw->spw_state = NULL;
+		pollwakeup(&pw->spw_pollhd, pollev);
+		pollhead_clean(&pw->spw_pollhd);
+
+		next = list_next(lst, pw);
+		list_remove(lst, pw);
+		kmem_free(pw, sizeof (*pw));
+		pw = next;
 	}
 }
 
@@ -290,7 +303,7 @@ _NOTE(ARGSUSED(1))
 static int
 signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 {
-	signalfd_state_t *state;
+	signalfd_state_t *state, **sstate;
 	major_t major = getemajor(*devp);
 	minor_t minor = getminor(*devp);
 
@@ -300,18 +313,20 @@ signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 	mutex_enter(&signalfd_lock);
 
 	minor = (minor_t)id_allocff(signalfd_minor);
-
 	if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) {
 		id_free(signalfd_minor, minor);
 		mutex_exit(&signalfd_lock);
 		return (ENODEV);
 	}
 
-	state = ddi_get_soft_state(signalfd_softstate, minor);
-	*devp = makedevice(major, minor);
+	state = kmem_zalloc(sizeof (*state), KM_SLEEP);
+	state->sfd_valid = B_TRUE;
+	state->sfd_count = 1;
+	list_insert_head(&signalfd_state, (void *)state);
 
-	state->sfd_next = signalfd_state;
-	signalfd_state = state;
+	sstate = ddi_get_soft_state(signalfd_softstate, minor);
+	*sstate = state;
+	*devp = makedevice(major, minor);
 
 	mutex_exit(&signalfd_lock);
 
@@ -405,6 +420,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
 	lwp->lwp_extsig = 0;
 	mutex_exit(&p->p_lock);
 
+	if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate)
+		BROP(p)->b_sigfd_translate(infop);
+
 	/* Convert k_siginfo into external, datamodel independent, struct. */
 	bzero(ssp, sizeof (*ssp));
 	ssp->ssi_signo = infop->si_signo;
@@ -439,7 +457,7 @@ _NOTE(ARGSUSED(2))
 static int
 signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
 {
-	signalfd_state_t *state;
+	signalfd_state_t *state, **sstate;
 	minor_t minor = getminor(dev);
 	boolean_t block = B_TRUE;
 	k_sigset_t set;
@@ -449,7 +467,8 @@ signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
 	if (uio->uio_resid < sizeof (signalfd_siginfo_t))
 		return (EINVAL);
 
-	state = ddi_get_soft_state(signalfd_softstate, minor);
+	sstate = ddi_get_soft_state(signalfd_softstate, minor);
+	state = *sstate;
 
 	if (uio->uio_fmode & (FNDELAY|FNONBLOCK))
 		block = B_FALSE;
@@ -462,15 +481,26 @@ signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
 		return (set_errno(EINVAL));
 
 	do  {
-		res = consume_signal(state->sfd_set, uio, block);
-		if (res == 0)
-			got_one = B_TRUE;
+		res = consume_signal(set, uio, block);
 
-		/*
-		 * After consuming one signal we won't block trying to consume
-		 * further signals.
-		 */
-		block = B_FALSE;
+		if (res == 0) {
+			/*
+			 * After consuming one signal, do not block while
+			 * trying to consume more.
+			 */
+			got_one = B_TRUE;
+			block = B_FALSE;
+
+			/*
+			 * Refresh the matching signal set in case it was
+			 * updated during the wait.
+			 */
+			mutex_enter(&state->sfd_lock);
+			set = state->sfd_set;
+			mutex_exit(&state->sfd_lock);
+			if (sigisempty(&set))
+				break;
+		}
 	} while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));
 
 	if (got_one)
@@ -499,13 +529,14 @@ static int
 signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
     struct pollhead **phpp)
 {
-	signalfd_state_t *state;
+	signalfd_state_t *state, **sstate;
 	minor_t minor = getminor(dev);
 	kthread_t *t = curthread;
 	proc_t *p = ttoproc(t);
 	short revents = 0;
 
-	state = ddi_get_soft_state(signalfd_softstate, minor);
+	sstate = ddi_get_soft_state(signalfd_softstate, minor);
+	state = *sstate;
 
 	mutex_enter(&state->sfd_lock);
 
@@ -515,39 +546,36 @@ signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
 	mutex_exit(&state->sfd_lock);
 
 	if (!(*reventsp = revents & events) && !anyyet) {
-		*phpp = &state->sfd_pollhd;
+		sigfd_proc_state_t *pstate;
+		sigfd_poll_waiter_t *pw;
 
 		/*
 		 * Enable pollwakeup handling.
 		 */
-		if (p->p_sigfd == NULL) {
-			sigfd_proc_state_t *pstate;
+		mutex_enter(&p->p_lock);
+		if ((pstate = (sigfd_proc_state_t *)p->p_sigfd) == NULL) {
 
-			pstate = kmem_zalloc(sizeof (sigfd_proc_state_t),
-			    KM_SLEEP);
+			mutex_exit(&p->p_lock);
+			pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP);
 			list_create(&pstate->sigfd_list,
-			    sizeof (sigfd_wake_list_t),
-			    offsetof(sigfd_wake_list_t, sigfd_wl_lst));
+			    sizeof (sigfd_poll_waiter_t),
+			    offsetof(sigfd_poll_waiter_t, spw_list));
+			pstate->sigfd_pollwake_cb = signalfd_pollwake_cb;
 
+			/* Check again, after blocking for the alloc. */
 			mutex_enter(&p->p_lock);
-			/* check again now that we're locked */
 			if (p->p_sigfd == NULL) {
 				p->p_sigfd = pstate;
 			} else {
 				/* someone beat us to it */
 				list_destroy(&pstate->sigfd_list);
-				kmem_free(pstate, sizeof (sigfd_proc_state_t));
+				kmem_free(pstate, sizeof (*pstate));
+				pstate = p->p_sigfd;
 			}
-			mutex_exit(&p->p_lock);
 		}
 
-		mutex_enter(&p->p_lock);
-		if (((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb ==
-		    NULL) {
-			((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb =
-			    signalfd_pollwake_cb;
-		}
-		signalfd_wake_list_add(state);
+		pw = signalfd_wake_list_add(pstate, state);
+		*phpp = &pw->spw_pollhd;
 		mutex_exit(&p->p_lock);
 	}
 
@@ -558,11 +586,12 @@ _NOTE(ARGSUSED(4))
 static int
 signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 {
-	signalfd_state_t *state;
+	signalfd_state_t *state, **sstate;
 	minor_t minor = getminor(dev);
 	sigset_t mask;
 
-	state = ddi_get_soft_state(signalfd_softstate, minor);
+	sstate = ddi_get_soft_state(signalfd_softstate, minor);
+	state = *sstate;
 
 	switch (cmd) {
 	case SIGNALFDIOC_MASK:
@@ -587,33 +616,42 @@ _NOTE(ARGSUSED(1))
 static int
 signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 {
-	signalfd_state_t *state, **sp;
+	signalfd_state_t *state, **sstate;
+	sigfd_poll_waiter_t *pw = NULL;
 	minor_t minor = getminor(dev);
 	proc_t *p = curproc;
 
-	state = ddi_get_soft_state(signalfd_softstate, minor);
-
-	if (state->sfd_pollhd.ph_list != NULL) {
-		pollwakeup(&state->sfd_pollhd, POLLERR);
-		pollhead_clean(&state->sfd_pollhd);
-	}
+	sstate = ddi_get_soft_state(signalfd_softstate, minor);
+	state = *sstate;
 
-	/* Make sure our state is removed from our proc's pollwake list. */
+	/* Make sure state is removed from this proc's pollwake list. */
 	mutex_enter(&p->p_lock);
-	signalfd_wake_list_rm(p, state);
-	mutex_exit(&p->p_lock);
+	if (p->p_sigfd != NULL) {
+		sigfd_proc_state_t *pstate = p->p_sigfd;
 
-	mutex_enter(&signalfd_lock);
+		pw = signalfd_wake_list_rm(pstate, state);
+		if (list_is_empty(&pstate->sigfd_list)) {
+			signalfd_wake_list_cleanup(p);
+		}
+	}
+	mutex_exit(&p->p_lock);
 
-	/* Remove our state from our global list. */
-	for (sp = &signalfd_state; *sp != state; sp = &((*sp)->sfd_next))
-		VERIFY(*sp != NULL);
+	if (pw != NULL) {
+		pollwakeup(&pw->spw_pollhd, POLLERR);
+		pollhead_clean(&pw->spw_pollhd);
+		kmem_free(pw, sizeof (*pw));
+	}
 
-	*sp = (*sp)->sfd_next;
+	mutex_enter(&signalfd_lock);
 
+	*sstate = NULL;
 	ddi_soft_state_free(signalfd_softstate, minor);
 	id_free(signalfd_minor, minor);
 
+	mutex_enter(&state->sfd_lock);
+	state->sfd_valid = B_FALSE;
+	signalfd_state_release(state, B_TRUE);
+
 	mutex_exit(&signalfd_lock);
 
 	return (0);
@@ -635,7 +673,7 @@ signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 	}
 
 	if (ddi_soft_state_init(&signalfd_softstate,
-	    sizeof (signalfd_state_t), 0) != 0) {
+	    sizeof (signalfd_state_t *), 0) != 0) {
 		cmn_err(CE_WARN, "signalfd failed to create soft state");
 		id_space_destroy(signalfd_minor);
 		mutex_exit(&signalfd_lock);
@@ -656,6 +694,9 @@ signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 
 	sigfd_exit_helper = signalfd_exit_helper;
 
+	list_create(&signalfd_state, sizeof (signalfd_state_t),
+	    offsetof(signalfd_state_t, sfd_list));
+
 	mutex_exit(&signalfd_lock);
 
 	return (DDI_SUCCESS);
@@ -673,10 +714,19 @@ signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 		return (DDI_FAILURE);
 	}
 
-	/* list should be empty */
-	VERIFY(signalfd_state == NULL);
-
 	mutex_enter(&signalfd_lock);
+
+	if (!list_is_empty(&signalfd_state)) {
+		/*
+		 * There are dangling poll waiters holding signalfd_state_t
+		 * entries on the global list.  Detach is not possible until
+		 * they purge themselves; drop signalfd_lock before failing.
+		 */
+		mutex_exit(&signalfd_lock);
+		return (DDI_FAILURE);
+	}
+	list_destroy(&signalfd_state);
+
 	id_space_destroy(signalfd_minor);
 
 	ddi_remove_minor_node(signalfd_devi, NULL);
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index e9af19ca18..994ca8baa8 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -1451,6 +1451,16 @@ copyb(mblk_t *bp)
 	ndp = nbp->b_datap;
 
 	/*
+	 * Copy the various checksum information that came in
+	 * originally.
+	 */
+	ndp->db_cksumstart = dp->db_cksumstart;
+	ndp->db_cksumend = dp->db_cksumend;
+	ndp->db_cksumstuff = dp->db_cksumstuff;
+	bcopy(dp->db_struioun.data, ndp->db_struioun.data,
+	    sizeof (dp->db_struioun.data));
+
+	/*
 	 * Well, here is a potential issue.  If we are trying to
 	 * trace a flow, and we copy the message, we might lose
 	 * information about where this message might have been.
diff --git a/usr/src/uts/common/io/udmf/dm9601reg.h b/usr/src/uts/common/io/udmf/dm9601reg.h
new file mode 100644
index 0000000000..a36f2b0fc8
--- /dev/null
+++ b/usr/src/uts/common/io/udmf/dm9601reg.h
@@ -0,0 +1,348 @@
+/*
+ * %W% %E%
+ * Macro definitions for Davicom DM9601 USB to fast ethernet controller
+ * based on Davicom DM9601E data sheet
+ * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com)
+ */
+
+#ifndef __DM9601_H__
+#define __DM9601_H__
+
+/*
+ * offset of registers
+ */
+#define	NCR	0x00U	/* network control register */
+#define	NSR	0x01U	/* network status register */
+#define	TCR	0x02U	/* tx control register */
+#define	TSR1	0x03U	/* tx status register 1 */
+#define	TSR2	0x04U	/* tx status register 2 */
+#define	RCR	0x05U	/* rx control register */
+#define	RSR	0x06U	/* rx status register */
+#define	ROCR	0x07U	/* rx overflow counter register */
+#define	BPTR	0x08U	/* back pressure threshold register */
+#define	FCTR	0x09U	/* flow control threshold register */
+#define	FCR	0x0aU	/* rx/tx flow control register */
+#define	EPCR	0x0bU	/* eeprom & phy control register */
+#define	EPAR	0x0cU	/* eeprom & phy address register */
+#define	EPDR	0x0dU	/* eeprom & phy data register (2byte) */
+#define	WCR	0x0fU	/* wake up control register */
+#define	PAR	0x10U	/* physical address register (6byte) */
+#define	MAR	0x16U	/* multicast address register (8byte) */
+#define	GPCR	0x1eU	/* general purpose control register */
+#define	GPR	0x1fU	/* general purpose register */
+#define	VID	0x28U	/* vendor ID (2byte) */
+#define	PID	0x2aU	/* product ID (2byte) */
+#define	CHIPR	0x2cU	/* chip revision */
+#define	USBDA	0xf0U	/* usb device address register */
+#define	RXC	0xf1U	/* received packet counter register */
+#define	TUSC	0xf2U	/* tx packet counter/usb status register */
+#define	USBC	0xf4U	/* usb control register */
+
+/*
+ * register definitions
+ */
+/* network control register */
+#define	NCR_EXT_PHY	0x80U	/* 1: select external phy */
+#define	NCR_WAKEEN	0x40U	/* 1: wake up event enable */
+#define	NCR_FCOL	0x10U	/* force collision mode for test */
+#define	NCR_FDX		0x08U	/* 1: full duplex mode (for external phy) */
+#define	NCR_LBK		0x06U
+#define		NCR_LBK_SHIFT		1
+#define		NCR_LBK_NORMAL	(0U << NCR_LBK_SHIFT)
+#define		NCR_LBK_MAC	(1U << NCR_LBK_SHIFT)
+#define		NCR_LBK_PHY_D	(2U << NCR_LBK_SHIFT)
+#define		NCR_LBK_PHY_A	(3U << NCR_LBK_SHIFT)
+#define	NCR_RST		0x01U	/* 1: reset, auto clear */
+
+#define	NCR_BITS	\
+	"\020"	\
+	"\010EXT_PHY"	\
+	"\007WAKEEN"	\
+	"\005FCOL"	\
+	"\004FDX"	\
+	"\001RST"
+
+/* network status register */
+#define	NSR_SPEED	0x80U	/* 1:10M 0:100M */
+#define	NSR_LINKST	0x40U	/* 1:ok 0:fail */
+#define	NSR_WAKEST	0x20U	/* 1:enabled */
+#define	NSR_TXFULL	0x10U	/* 1:tx fifo full */
+#define	NSR_TX2END	0x08U	/* tx packet2 complete status */
+#define	NSR_TX1END	0x04U	/* tx packet1 complete status */
+#define	NSR_RXOV	0x02U	/* rx fifo overflow */
+#define	NSR_RXRDY	0x01U	/* rx packet ready */
+
+#define	NSR_BITS	\
+	"\020"	\
+	"\010SPEED_10"	\
+	"\007LINKST_UP"	\
+	"\006WAKEST"	\
+	"\005TXFULL"	\
+	"\004TX2END"	\
+	"\003TX1END"	\
+	"\002RXOV"	\
+	"\001RXRDY"
+
+/* tx control register */
+#define	TCR_TJDIS	0x40U	/* tx jitter control */
+#define	TCR_EXCEDM	0x20U	/* excessive collision mode */
+#define	TCR_PAD_DIS2	0x10U	/* PAD appends disable for pkt2 */
+#define	TCR_CRC_DIS2	0x08U	/* CRC appends disable for pkt2 */
+#define	TCR_PAD_DIS1	0x04U	/* PAD appends disable for pkt1 */
+#define	TCR_CRC_DIS1	0x02U	/* CRC appends disable for pkt1 */
+
+#define	TCR_BITS	\
+	"\020"	\
+	"\007TJDIS"	\
+	"\006EXCEDM"	\
+	"\005PAD_DIS2"	\
+	"\004CRC_DIS2"	\
+	"\003PAD_DIS1"	\
+	"\002CRC_DIS1"
+
+/* tx status register (ro) */
+#define	TSR_TJTO	0x80U	/* tx jabber time out */
+#define	TSR_LC		0x40U	/* loss of carrier */
+#define	TSR_NC		0x20U	/* no carrier */
+#define	TSR_LATEC	0x10U	/* late collision */
+#define	TSR_COL		0x08U	/* collision */
+#define	TSR_EL		0x04U	/* excessive collision */
+
+#define	TSR_BITS	\
+	"\020"		\
+	"\010TJTO"	\
+	"\007LC"	\
+	"\006NC"	\
+	"\005LATEC"	\
+	"\004COL"	\
+	"\003EL"
+
+/* rx control register */
+#define	RCR_WTDIS	0x40U	/* watch dog timer disable */
+#define	RCR_DIS_LONG	0x20U	/* discard longer packets than 1522 */
+#define	RCR_DIS_CRC	0x10U	/* discard crc error packets */
+#define	RCR_ALL		0x08U	/* pass all multicast */
+#define	RCR_RUNT	0x04U	/* pass runt packets */
+#define	RCR_PRMSC	0x02U	/* promiscuous mode */
+#define	RCR_RXEN	0x01U	/* rx enable */
+
+#define	RCR_BITS	\
+	"\020"		\
+	"\007WTDIS"	\
+	"\006DIS_LONG"	\
+	"\005DIS_CRC"	\
+	"\004ALL"	\
+	"\003RUNT"	\
+	"\002PRMSC"	\
+	"\001RXEN"
+
+/* rx status register */
+#define	RSR_RF		0x80U	/* runt frame */
+#define	RSR_MF		0x40U	/* multicast frame */
+#define	RSR_LCS		0x20U	/* late collision seen */
+#define	RSR_RWTO	0x10U	/* receive watchdog timeout */
+#define	RSR_PLE		0x08U	/* physical layer error */
+#define	RSR_AE		0x04U	/* alignment error */
+#define	RSR_CE		0x02U	/* crc error */
+#define	RSR_FOE		0x01U	/* fifo overflow error */
+
+#define	RSR_BITS	\
+	"\020"		\
+	"\010RF"	\
+	"\007MF"	\
+	"\006LCS"	\
+	"\005RWTO"	\
+	"\004PLE"	\
+	"\003AE"	\
+	"\002CE"	\
+	"\001FOE"
+
+/* receive overflow counter register */
+#define	ROCR_RXFU	0x80U	/* receive overflow counter overflow */
+#define	ROCR_ROC	0x7fU	/* receive overflow counter */
+
+#define	ROCR_BITS	\
+	"\020"		\
+	"\010RXFU"
+
+/* back pressure threshold register */
+#define	BPTR_BPHW	0xf0U	/* high water overflow threshold */
+#define		BPTR_BPHW_SHIFT	4
+#define		BPTR_BPHW_UNIT	1024U
+#define		BPTR_BPHW_DEFAULT	(3 << BPTR_BPHW_SHIFT)	/* 3k */
+#define	BPTR_JPT	0x0fU	/* jam pattern time */
+#define		BPTR_JPT_SHIFT	0
+#define		BPTR_JPT_5us	(0U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_10us	(1U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_15us	(2U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_25us	(3U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_50us	(4U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_100us	(5U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_150us	(6U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_200us	(7U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_250us	(8U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_300us	(9U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_350us	(10U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_400us	(11U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_450us	(12U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_500us	(13U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_550us	(14U << BPTR_JPT_SHIFT)
+#define		BPTR_JPT_600us	(15U << BPTR_JPT_SHIFT)
+
+/* flow control threshold register */
+#define	FCTR_HWOT	0xf0U	/* rx fifo high water overflow threshold */
+#define		FCTR_HWOT_SHIFT	4
+#define		FCTR_HWOT_UNIT	1024U
+#define	FCTR_LWOT	0x0fU	/* rx fifo low water overflow threshold */
+#define		FCTR_LWOT_SHIFT	0
+#define		FCTR_LWOT_UNIT	1024U
+
+/* rx/tx flow control register; FCR_BITS names the bits (1-based) for %b */
+#define	FCR_TXPO	0x80U	/* tx pause packet */
+#define	FCR_TXPF	0x40U	/* tx pause packet */
+#define	FCR_TXPEN	0x20U	/* tx pause packet */
+#define	FCR_BKPA	0x10U	/* back pressure mode */
+#define	FCR_BKPM	0x08U	/* back pressure mode */
+#define	FCR_BKPS	0x04U	/* rx pause packet current status (r/c) */
+#define	FCR_RXPCS	0x02U	/* rx pause packet current status (ro) */
+#define	FCR_FLCE	0x01U	/* flow control enable */
+
+#define	FCR_BITS	\
+	"\020"		\
+	"\010TXPO"	\
+	"\007TXPF"	\
+	"\006TXPEN"	\
+	"\005BKPA"	\
+	"\004BKPM"	\
+	"\003BKPS"	\
+	"\002RXPCS"	\
+	"\001FLCE"
+
+/* EEPROM & PHY control register (0x0b); EPCR_BITS is 1-based for %b */
+#define	EPCR_REEP	0x20U	/* reload eeprom */
+#define	EPCR_WEP	0x10U	/* write eeprom enable */
+#define	EPCR_EPOS	0x08U	/* select device, 0:eeprom, 1:phy */
+#define	EPCR_ERPRR	0x04U	/* read command */
+#define	EPCR_ERPRW	0x02U	/* write command */
+#define	EPCR_ERRE	0x01U	/* eeprom/phy access in progress (ro) */
+
+#define	EPCR_BITS	\
+	"\020"		\
+	"\006REEP"	\
+	"\005WEP"	\
+	"\004EPOS"	\
+	"\003ERPRR"	\
+	"\002ERPRW"	\
+	"\001ERRE"
+
+/* EEPROM & PHY access register (0x0c) */
+#define	EPAR_PHYADR	0xc0U	/* phy address, internal phy(1) or external */
+#define		EPAR_PHYADR_SHIFT	6
+#define	EPAR_EROA	0x3fU	/* eeprom word addr or phy register addr */
+#define		EPAR_EROA_SHIFT	0
+
+/* EEPROM & PHY data register (0x0d(low)-0x0e(hi)) */
+
+/* wake up control register (0x0f); WCR_BITS is 1-based for %b */
+#define	WCR_LINKEN	0x20U	/* enable link status event */
+#define	WCR_SAMPLEEN	0x10U	/* enable sample frame event */
+#define	WCR_MAGICEN	0x08U	/* enable magic pkt event */
+#define	WCR_LINKST	0x04U	/* link status change occur ro */
+#define	WCR_SAMPLEST	0x02U	/* sample frame rx occur ro */
+#define	WCR_MAGICST	0x01U	/* magic pkt rx occur ro */
+
+#define	WCR_BITS	\
+	"\020"		\
+	"\006LINKEN"	\
+	"\005SAMPLEEN"	\
+	"\004MAGICEN"	\
+	"\003LINKST"	\
+	"\002SAMPLEST"	\
+	"\001MAGICST"
+
+/* physical address register (0x10-0x15) */
+/* multicast address register (0x16-0x1c) */
+/* general purpose control register (0x1e) */
+#define	GPCR_GEPCTRL	0x7f
+#define		GPCR_OUT(n)	(1U << (n))
+
+#define	GPCR_BITS	\
+	"\020"		\
+	"\006OUT5"	\
+	"\005OUT4"	\
+	"\004OUT3"	\
+	"\003OUT2"	\
+	"\002OUT1"	\
+	"\001OUT0"
+
+/* general purpose register (0x1f) */
+#define	GPR_GEPIO5	0x20U
+#define	GPR_GEPIO4	0x10U
+#define	GPR_GEPIO3	0x08U
+#define	GPR_GEPIO2	0x04U
+#define	GPR_GEPIO1	0x02U
+#define	GPR_GEPIO0	0x01U
+
+#define	GPR_BITS	\
+	"\020"		\
+	"\006GEPIO5"	\
+	"\005GEPIO4"	\
+	"\004GEPIO3"	\
+	"\003GEPIO2"	\
+	"\002GEPIO1"	\
+	"\001GEPIO0"
+
+/* vendor id register (0x28-0x29) */
+/* product id register (0x2a-0x2b) */
+/* chip revision register (0x2c) */
+
+/* usb device address register (0xf0) */
+#define	USBDA_USBFA	0x3fU	/* usb device address */
+#define		USBDA_USBFA_SHIFT	0
+
+/* receive packet counter register (0xf1) */
+
+/* transmitpacket counter/usb status register (0xf2) */
+#define	TUSR_RXFAULT	0x80U	/* indicate rx has unexpected condition */
+#define	TUSR_SUSFLAG	0x40U	/* indicate device has suspended condition */
+#define	TUSR_EP1RDY	0x20U	/* ready for read from ep1 pipe */
+#define	TUSR_SRAM	0x18U	/* sram size 0:32K, 1:48K, 2:16K, 3:64K */
+#define		TUSR_SRAM_SHIFT	3
+#define		TUSR_SRAM_32K	(0U << TUSR_SRAM_SHIFT)
+#define		TUSR_SRAM_48K	(1U << TUSR_SRAM_SHIFT)
+#define		TUSR_SRAM_16K	(2U << TUSR_SRAM_SHIFT)
+#define		TUSR_SRAM_64K	(3U << TUSR_SRAM_SHIFT)
+#define	TUSR_TXC2	0x04U	/* two or more packets in tx buffer */
+#define	TUSR_TXC1	0x02U	/* one packet in tx buffer */
+#define	TUSR_TXC0	0x01U	/* no packet in tx buffer */
+
+#define	TUSR_BITS	\
+	"\020"		\
+	"\010RXFAULT"	\
+	"\007SUSFLAG"	\
+	"\006EP1RDY"	\
+	"\003TXC2"	\
+	"\002TXC1"	\
+	"\001TXC0"
+
+/* usb control register (0xf4) */
+#define	USBC_EP3ACK	0x20U	/* ep3 always returns 8byte data if NAK=0 */
+#define	USBC_EP3NACK	0x10U	/* ep3 always returns NAK */
+#define	USBC_MEMTST	0x01U
+
+/* bulk message format */
+#define	TX_HEADER_SIZE	2
+#define	RX_HEADER_SIZE	3
+
+/* interrupt msg format */
+struct intr_msg {
+	uint8_t	im_nsr;
+	uint8_t	im_tsr1;
+	uint8_t	im_tsr2;
+	uint8_t	im_rsr;
+	uint8_t	im_rocr;
+	uint8_t	im_rxc;
+	uint8_t	im_txc;
+	uint8_t	im_gpr;
+};
+#endif /* __DM9601_H__ */
diff --git a/usr/src/uts/common/io/udmf/udmf_usbgem.c b/usr/src/uts/common/io/udmf/udmf_usbgem.c
new file mode 100644
index 0000000000..0637de054b
--- /dev/null
+++ b/usr/src/uts/common/io/udmf/udmf_usbgem.c
@@ -0,0 +1,1036 @@
+/*
+ * udmfE_usbgem.c : Davicom DM9601E USB to Fast Ethernet Driver for Solaris
+ *
+ * Copyright (c) 2009-2012 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#pragma ident "%W% %E%"
+
+/*
+ *  Changelog:
+ */
+
+/*
+ * TODO
+ */
+/* ======================================================= */
+
+/*
+ * Solaris system header files and macros
+ */
+
+/* minimum kernel headers for drivers */
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/byteorder.h>
+
+/* ethernet stuff */
+#include <sys/ethernet.h>
+
+/* interface card depend stuff */
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strlog.h>
+#include <sys/usb/usba.h>
+#include "usbgem.h"
+
+/* hardware stuff */
+#include "usbgem_mii.h"
+#include "dm9601reg.h"
+
+char	ident[] = "dm9601 usbnic driver v" VERSION;
+
+/*
+ * Useful macros
+ */
+#define	CHECK_AND_JUMP(err, label)	if (err != USB_SUCCESS) goto label
+#define	LE16P(p)	((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0])
+
+/*
+ * Debugging
+ */
+#ifdef DEBUG_LEVEL
+static int udmf_debug = DEBUG_LEVEL;
+#define	DPRINTF(n, args)	if (udmf_debug > (n)) cmn_err args
+#else
+#define	DPRINTF(n, args)
+#endif
+
+/*
+ * Our configuration for dm9601
+ */
+/* timeouts */
+#define	ONESEC	(drv_usectohz(1*1000000))
+
+/*
+ * Local device definitions
+ */
+struct udmf_dev {
+	/*
+	 * Misc HW information
+	 */
+	uint8_t	rcr;
+	uint8_t	last_nsr;
+	uint8_t	mac_addr[ETHERADDRL];
+};
+
+/*
+ * private functions
+ */
+
+/* mii operations */
+static uint16_t udmf_mii_read(struct usbgem_dev *, uint_t, int *errp);
+static void udmf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp);
+
+/* nic operations */
+static int udmf_reset_chip(struct usbgem_dev *);
+static int udmf_init_chip(struct usbgem_dev *);
+static int udmf_start_chip(struct usbgem_dev *);
+static int udmf_stop_chip(struct usbgem_dev *);
+static int udmf_set_media(struct usbgem_dev *);
+static int udmf_set_rx_filter(struct usbgem_dev *);
+static int udmf_get_stats(struct usbgem_dev *);
+static void udmf_interrupt(struct usbgem_dev *, mblk_t *);
+
+/* packet operations */
+static mblk_t *udmf_tx_make_packet(struct usbgem_dev *, mblk_t *);
+static mblk_t *udmf_rx_make_packet(struct usbgem_dev *, mblk_t *);
+
+/* =============================================================== */
+/*
+ * I/O functions
+ */
+/* =============================================================== */
+#define	OUT(dp, ix, len, buf, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	1,	\
+	/* wValue */	0,	\
+	/* wIndex */	(ix),	\
+	/* wLength */	(len),	\
+	/* value */	(buf),	\
+	/* size */	(len))) != USB_SUCCESS) goto label
+
+#define	OUTB(dp, ix, val, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	3,	\
+	/* wValue */	(val),	\
+	/* wIndex */	(ix),	\
+	/* wLength */	0,	\
+	/* value */	NULL,	\
+	/* size */	0)) != USB_SUCCESS) goto label
+
+#define	IN(dp, ix, len, buf, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	0,	\
+	/* wValue */	0,	\
+	/* wIndex */	(ix),	\
+	/* wLength */	(len),	\
+	/* valuep */	(buf),	\
+	/* size */	(len))) != USB_SUCCESS) goto label
+
+/* =============================================================== */
+/*
+ * Hardware manipulation
+ */
+/* =============================================================== */
+static void
+udmf_enable_phy(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+
+	/* de-assert reset signal to phy */
+	OUTB(dp, GPCR, GPCR_OUT(0), &err, usberr);
+	OUTB(dp, GPR, 0, &err, usberr);
+usberr:
+	;
+}
+
+static int
+udmf_reset_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	OUTB(dp, NCR, NCR_LBK_NORMAL | NCR_RST, &err, usberr);
+	drv_usecwait(100);
+usberr:
+	return (err);
+}
+
+/*
+ * Setup dm9601: write the loopback, tx, threshold, usb and wake-up
+ * control registers.  The rx control value is only staged in lp->rcr.
+ *
+ * Returns USB_SUCCESS, or the status of the first control write that
+ * failed; the writes following a failure are skipped.
+ */
+static int
+udmf_init_chip(struct usbgem_dev *dp)
+{
+	int		err = USB_SUCCESS;
+	struct udmf_dev	*lp = dp->private;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	OUTB(dp, NCR, NCR_LBK_NORMAL, &err, usberr);
+
+	/* tx control register: enable padding and crc generation */
+	OUTB(dp, TCR, 0, &err, usberr);
+
+	/* rx control register: will be set later by udmf_set_rx_filter() */
+	lp->rcr = RCR_RUNT;
+
+	/* back pressure threshold: */
+	OUTB(dp, BPTR, (2 << BPTR_BPHW_SHIFT) | BPTR_JPT_200us,
+	    &err, usberr);
+
+	/* flow control threshold: same as default */
+	OUTB(dp, FCTR, (3 << FCTR_HWOT_SHIFT) | (8 << FCTR_LWOT_SHIFT),
+	    &err, usberr);
+
+	/* usb control register */
+	OUTB(dp, USBC, USBC_EP3ACK | 0x06, &err, usberr);
+
+	/* flow control: will be set later by udmf_set_media() */
+
+	/* wake up control register: */
+	OUTB(dp, WCR, 0, &err, usberr);
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+static int
+udmf_start_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+	struct udmf_dev	*lp = dp->private;
+
+	/* enable Rx */
+	lp->rcr |= RCR_RXEN;
+	OUTB(dp, RCR, lp->rcr, &err, usberr);
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+static int
+udmf_stop_chip(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+	struct udmf_dev	*lp = dp->private;
+
+	/* disable rx */
+	lp->rcr &= ~RCR_RXEN;
+	OUTB(dp, RCR, lp->rcr, &err, usberr);
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+static int
+udmf_get_stats(struct usbgem_dev *dp)
+{
+	/* EMPTY */
+	return (USB_SUCCESS);
+}
+
+static uint_t
+udmf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr)
+{
+	return (usbgem_ether_crc_le(addr) & 0x3f);
+}
+
+static int
+udmf_set_rx_filter(struct usbgem_dev *dp)
+{
+	int		i;
+	uint8_t		rcr;
+	uint8_t		mode;
+	uint8_t		mhash[8];
+	uint8_t		*mac;
+	uint_t		h;
+	int		err = USB_SUCCESS;
+	struct udmf_dev	*lp = dp->private;
+	static uint8_t	invalid_mac[ETHERADDRL] = {0, 0, 0, 0, 0, 0};
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x",
+	    dp->name, __func__, dp->rxmode));
+
+	if (lp->rcr & RCR_RXEN) {
+		/* set promiscuous mode before changing rx filter mode */
+		OUTB(dp, RCR, lp->rcr | RCR_PRMSC, &err, usberr);
+	}
+
+	lp->rcr &= ~(RCR_ALL | RCR_PRMSC);
+	mode = 0;
+	bzero(mhash, sizeof (mhash));
+	mac = dp->cur_addr.ether_addr_octet;
+
+	if ((dp->rxmode & RXMODE_ENABLE) == 0) {
+		mac = invalid_mac;
+	} else if (dp->rxmode & RXMODE_PROMISC) {
+		/* promiscuous mode implies all multicast and all physical */
+		mode |= RCR_PRMSC;
+	} else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) {
+		/* accept all multicast packets */
+		mode |= RCR_ALL;
+	} else if (dp->mc_count > 0) {
+		/*
+		 * make hash table to select interesting
+		 * multicast address only.
+		 */
+		for (i = 0; i < dp->mc_count; i++) {
+			/* hash table is 64 = 2^6 bit width */
+			h = dp->mc_list[i].hash;
+			mhash[h / 8] |= 1 << (h % 8);
+		}
+	}
+
+	/* node address; NOTE(review): OUT sends cur_addr, not mac - confirm */
+	if (bcmp(mac, lp->mac_addr, ETHERADDRL) != 0) {
+		OUT(dp, PAR, ETHERADDRL, dp->cur_addr.ether_addr_octet,
+		    &err, usberr);
+		bcopy(mac, lp->mac_addr, ETHERADDRL);
+	}
+
+	/* set multicast hash table */
+	OUT(dp, MAR, sizeof (mhash), &mhash[0], &err, usberr);
+
+	/* update rcr */
+	lp->rcr |= mode;
+	OUTB(dp, RCR, lp->rcr, &err, usberr);
+
+#if DEBUG_LEVEL > 1
+	/* verify rcr */
+	IN(dp, RCR, 1, &rcr, &err, usberr);
+	cmn_err(CE_CONT, "!%s: %s: rcr:%b returned",
+	    dp->name, __func__, rcr, RCR_BITS);
+#endif
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+static int
+udmf_set_media(struct usbgem_dev *dp)
+{
+	int	err = USB_SUCCESS;
+	uint8_t	fcr;
+	struct udmf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* setup flow control */
+	fcr = 0;
+	if (dp->full_duplex) {
+		/* select flow control */
+		switch (dp->flow_control) {
+		case FLOW_CONTROL_RX_PAUSE:
+			fcr |= FCR_FLCE;
+			break;
+
+		case FLOW_CONTROL_TX_PAUSE:
+			fcr |= FCR_TXPEN;
+			break;
+
+		case FLOW_CONTROL_SYMMETRIC:
+			fcr |= FCR_FLCE | FCR_TXPEN;
+			break;
+		}
+	}
+
+	/* update flow control register */
+	OUTB(dp, FCR, fcr, &err, usberr);
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    err, err == USB_SUCCESS ? "success" : "error"));
+	return (err);
+}
+
+/*
+ * send/receive packet check
+ */
+static mblk_t *
+udmf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	int		n;
+	size_t		pkt_size;
+	mblk_t		*new;
+	mblk_t		*tp;
+	uint8_t		*bp;
+	uint8_t		*last_pos;
+	uint_t		align_mask;
+
+	pkt_size = msgdsize(mp);
+	align_mask = 63;
+
+	/*
+	 * re-allocate the mp
+	 */
+
+	/* minimum ethernet packet size of ETHERMIN */
+	pkt_size = max(pkt_size, ETHERMIN);
+
+#if 0 /* CONFIG_ADD_TX_DELIMITOR_ALWAYS */
+	pkt_size += TX_HEADER_SIZE;
+#endif
+	if (((pkt_size + TX_HEADER_SIZE) & align_mask) == 0) {
+		/* padding is required in usb communication */
+		pkt_size += TX_HEADER_SIZE;
+	}
+
+	if ((new = allocb(TX_HEADER_SIZE + pkt_size, 0)) == NULL) {
+		return (NULL);
+	}
+	new->b_wptr = new->b_rptr + TX_HEADER_SIZE + pkt_size;
+
+	/* add a header */
+	bp = new->b_rptr;
+	bp[0] = (uint8_t)pkt_size;
+	bp[1] = (uint8_t)(pkt_size >> 8);
+	bp += TX_HEADER_SIZE;
+
+	/* copy contents of the buffer */
+	for (tp = mp; tp; tp = tp->b_cont) {
+		n = tp->b_wptr - tp->b_rptr;
+		bcopy(tp->b_rptr, bp, n);
+		bp += n;
+	}
+
+	/* clear the rest including the next zero length header */
+	last_pos = new->b_wptr;
+	while (bp < last_pos) {
+		*bp++ = 0;
+	}
+
+	return (new);
+}
+
+static void
+udmf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n)
+{
+	int	i;
+
+	for (i = 0; i < n; i += 8, bp += 8) {
+		cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x",
+		    bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]);
+	}
+}
+
+static mblk_t *
+udmf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	int	len;
+	uint8_t	rx_stat;
+
+	len = mp->b_wptr - mp->b_rptr;
+	if (len <= RX_HEADER_SIZE) {
+		/* the bulk-in frame doesn't include a valid ethernet packet */
+		return (NULL);
+	}
+
+	/* byte 0 of the rx header is the rx status; account any errors */
+	rx_stat = mp->b_rptr[0];
+	if (rx_stat & (RSR_RF |  RSR_LCS | RSR_RWTO |
+	    RSR_PLE | RSR_AE | RSR_CE |  RSR_FOE)) {
+		if (rx_stat & RSR_RF) {
+			dp->stats.runt++;
+		}
+		if (rx_stat & RSR_LCS) {
+			/* late collision */
+			dp->stats.rcv_internal_err++;
+		}
+		if (rx_stat & RSR_RWTO) {
+			/* rx timeout */
+			dp->stats.rcv_internal_err++;
+		}
+		if (rx_stat & RSR_PLE) {
+			/* physical layer error */
+			dp->stats.rcv_internal_err++;
+		}
+		if (rx_stat & RSR_AE) {
+			/* alignment error */
+			dp->stats.frame++;
+		}
+		if (rx_stat & RSR_CE) {
+			/* crc error */
+			dp->stats.crc++;
+		}
+		if (rx_stat & RSR_FOE) {
+			/* fifo overflow error */
+			dp->stats.overflow++;
+		}
+		dp->stats.errrcv++;
+	}
+	len = LE16P(&mp->b_rptr[1]);
+	mp->b_rptr += RX_HEADER_SIZE;
+	if (len > mp->b_wptr - mp->b_rptr) {
+		/* never trust the header beyond the data actually received */
+		len = mp->b_wptr - mp->b_rptr;
+	}
+	if (len >= ETHERFCSL) {
+		len -= ETHERFCSL;
+	}
+	mp->b_wptr = mp->b_rptr + len;
+
+	return (mp);
+}
+
+/*
+ * MII Interfaces
+ */
+static uint16_t
+udmf_ep_read(struct usbgem_dev *dp, uint_t which, uint_t addr, int *errp)
+{
+	int	i;
+	uint8_t	epcr;
+	uint16_t	val;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d",
+	    dp->name, __func__, addr));
+
+	OUTB(dp, EPAR, addr, errp, usberr);
+	OUTB(dp, EPCR, which | EPCR_ERPRR, errp, usberr);
+
+	for (i = 0; i < 100; i++) {
+		IN(dp, EPCR, sizeof (epcr), &epcr, errp, usberr);
+		if ((epcr & EPCR_ERRE) == 0) {
+			/* done */
+			IN(dp, EPDR, sizeof (val), &val, errp, usberr);
+			val = LE_16(val);
+			goto done;
+		}
+		drv_usecwait(10);
+	}
+	/* timeout */
+	cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__);
+	val = 0;
+done:
+	OUTB(dp, EPCR, 0, errp, usberr);
+	return (val);
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    *errp, *errp == USB_SUCCESS ? "success" : "error"));
+	return (0);
+}
+
+static void
+udmf_ep_write(struct usbgem_dev *dp, uint_t which, uint_t addr,
+    uint16_t val, int *errp)
+{
+	int	i;
+	uint8_t	epcr;
+
+	DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__));
+
+	val = LE_16(val);
+	OUT(dp, EPDR, sizeof (val), &val, errp, usberr);
+
+	OUTB(dp, EPAR, addr, errp, usberr);
+
+	OUTB(dp, EPCR, which | EPCR_WEP | EPCR_ERPRW, errp, usberr);
+
+	for (i = 0; i < 100; i++) {
+		IN(dp, EPCR, 1, &epcr, errp, usberr);
+		if ((epcr & EPCR_ERRE) == 0) {
+			/* done */
+			goto done;
+		}
+		drv_usecwait(10);
+	}
+	/* timeout */
+	cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__);
+done:
+	OUTB(dp, EPCR, 0, errp, usberr);
+	return;
+
+usberr:
+	DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)",
+	    dp->name, __func__,
+	    *errp, *errp == USB_SUCCESS ? "success" : "error"));
+}
+
+/* Read phy register `index' via the shared eeprom/phy access helper. */
+static uint16_t
+udmf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp)
+{
+	/* EPAR operand: phy address field combined with register index */
+	uint_t	epar = (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index;
+
+	/* EPCR_EPOS selects the phy rather than the eeprom */
+	return (udmf_ep_read(dp, EPCR_EPOS, epar, errp));
+}
+
+/*
+ * udmf_mii_write: usbgem MII write callback.
+ *
+ * Writes `val' to PHY register `index' of the PHY at dp->mii_phy_addr by
+ * calling udmf_ep_write() with EPCR_EPOS set.
+ */
+static void
+udmf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp)
+{
+	uint_t	phy_reg;
+
+	/* PHY address in the upper bits, register number in the lower */
+	phy_reg = (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index;
+
+	udmf_ep_write(dp, EPCR_EPOS, phy_reg, val, errp);
+}
+
+/*
+ * udmf_interrupt: usbgem interrupt callback.
+ *
+ * The message is interpreted as a struct intr_msg.  When the NSR_LINKST
+ * bit of im_nsr differs from the value seen in the previous message,
+ * usbgem_mii_update_link() is called.  The current im_nsr is remembered in
+ * lp->last_nsr for the next comparison.
+ */
+static void
+udmf_interrupt(struct usbgem_dev *dp, mblk_t *mp)
+{
+	struct intr_msg	*imp;
+	struct udmf_dev	*lp = dp->private;
+
+	imp = (struct intr_msg *)&mp->b_rptr[0];
+
+	/* the pointer difference is not an int; cast it to match %d */
+	DPRINTF(4, (CE_CONT,
+	    "!%s: %s: size:%d, nsr:%b tsr1:%b tsr2:%b"
+	    " rsr:%b rocr:%b rxc:%02x txc:%b gpr:%b",
+	    dp->name, __func__, (int)(mp->b_wptr - mp->b_rptr),
+	    imp->im_nsr, NSR_BITS,
+	    imp->im_tsr1, TSR_BITS,
+	    imp->im_tsr2, TSR_BITS,
+	    imp->im_rsr, RSR_BITS,
+	    imp->im_rocr, ROCR_BITS,
+	    imp->im_rxc,
+	    imp->im_txc, TUSR_BITS,
+	    imp->im_gpr, GPR_BITS));
+
+	/* only a change of the link status bit triggers a link update */
+	if ((lp->last_nsr ^ imp->im_nsr) & NSR_LINKST) {
+		usbgem_mii_update_link(dp);
+	}
+
+	lp->last_nsr = imp->im_nsr;
+}
+
+/* ======================================================== */
+/*
+ * OS depend (device driver DKI) routine
+ */
+/* ======================================================== */
+/*
+ * udmf_eeprom_read: read EEPROM word `index'.
+ *
+ * Thin wrapper around udmf_ep_read() with no selector bits set in `which'
+ * (the MII accessors pass EPCR_EPOS instead).
+ */
+static uint16_t
+udmf_eeprom_read(struct usbgem_dev *dp, uint_t index, int *errp)
+{
+	return (udmf_ep_read(dp, 0, index, errp));
+}
+
+#ifdef DEBUG_LEVEL
+/*
+ * udmf_eeprom_dump: debugging aid that logs `size' EEPROM words, four per
+ * line, each line prefixed with the index of its first word.  Read errors
+ * are not examined.
+ */
+static void
+udmf_eeprom_dump(struct usbgem_dev *dp, int size)
+{
+	int		base;
+	int		k;
+	int		err = USB_SUCCESS;
+	uint16_t	w[4];
+
+	cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name);
+
+	for (base = 0; base < size; base += 4) {
+		/* fetch four consecutive words, in ascending order */
+		for (k = 0; k < 4; k++) {
+			w[k] = udmf_eeprom_read(dp, base + k, &err);
+		}
+		cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x",
+		    base, w[0], w[1], w[2], w[3]);
+	}
+}
+#endif
+
+/*
+ * udmf_attach_chip: usbgem attach_chip callback.
+ *
+ * Reads the MAC address from EEPROM words 0..2 (low byte of each word
+ * first) into dp->dev_addr and clears the private copy of the address kept
+ * in struct udmf_dev.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE when an EEPROM read reports a USB
+ * error.
+ */
+static int
+udmf_attach_chip(struct usbgem_dev *dp)
+{
+	int	i;
+	uint_t	val;
+	uint8_t	*m;
+	int	err = USB_SUCCESS;
+	struct udmf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__));
+
+	/*
+	 * get mac address from EEPROM
+	 */
+	m = dp->dev_addr.ether_addr_octet;
+	for (i = 0; i < ETHERADDRL; i += 2)  {
+		val = udmf_eeprom_read(dp, i/2, &err);
+		if (err != USB_SUCCESS) {
+			/*
+			 * A failed transfer returns 0, which must not be
+			 * taken as part of the station address.
+			 */
+			goto usberr;
+		}
+		m[i + 0] = (uint8_t)val;
+		m[i + 1] = (uint8_t)(val >> 8);
+	}
+
+	/* invalidate a private cache for mac addr */
+	bzero(lp->mac_addr, sizeof (lp->mac_addr));
+#ifdef CONFIG_VLAN
+	dp->misc_flag = USBGEM_VLAN;
+#endif
+#if DEBUG_LEVEL > 0
+	udmf_eeprom_dump(dp, /* 0x3f + 1 */ 128);
+#endif
+{
+	static uint8_t bcst[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+	DPRINTF(0, (CE_CONT, "!%s: %s: hash of bcast:%x",
+	    dp->name, __func__, usbgem_ether_crc_be(bcst)));
+}
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "%s: %s: usb error detected (%d)",
+	    dp->name, __func__, err);
+	return (USB_FAILURE);
+}
+
+/*
+ * udmf_mii_probe: usbgem mii_probe callback.
+ *
+ * Calls udmf_enable_phy() (defined outside this excerpt) and then defers to
+ * usbgem_mii_probe_default(), whose result is returned unchanged.
+ */
+static int
+udmf_mii_probe(struct usbgem_dev *dp)
+{
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	udmf_enable_phy(dp);
+	return (usbgem_mii_probe_default(dp));
+}
+
+/*
+ * udmf_mii_init: usbgem mii_init callback.
+ *
+ * Calls udmf_enable_phy() (defined outside this excerpt) and always returns
+ * USB_SUCCESS.
+ */
+static int
+udmf_mii_init(struct usbgem_dev *dp)
+{
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+	udmf_enable_phy(dp);
+	return (USB_SUCCESS);
+}
+
+/*
+ * udmfattach: attach(9E) entry point.
+ *
+ * DDI_ATTACH: builds a temporary struct usbgem_conf describing this chip
+ * (list sizes, timeouts, MII policy, MTU and the udmf_* callbacks),
+ * allocates the per-instance struct udmf_dev and passes both to
+ * usbgem_do_attach().  The configuration is freed as soon as that call
+ * returns; the private area is freed here only when the attach failed.
+ *
+ * DDI_RESUME: forwarded to usbgem_resume().
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE for DDI_ATTACH, the result of
+ * usbgem_resume() for DDI_RESUME and DDI_FAILURE for any other command.
+ */
+static int
+udmfattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int			unit;
+	const char		*drv_name;
+	struct usbgem_dev	*dp;
+	struct usbgem_conf	*ugcp;
+	struct udmf_dev		*lp;
+
+	unit = ddi_get_instance(dip);
+	drv_name = ddi_driver_name(dip);
+
+	DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d",
+	    drv_name, unit, __func__, cmd));
+
+	if (cmd == DDI_ATTACH) {
+		/*
+		 * construct usbgem configuration
+		 */
+		ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP);
+
+		/* name */
+		/*
+		 * softmac requires that ppa is the instance number
+		 * of the device, otherwise it hangs in searching the device.
+		 */
+		(void) snprintf(ugcp->usbgc_name, sizeof (ugcp->usbgc_name),
+		    "%s%d", drv_name, unit);
+		ugcp->usbgc_ppa = unit;
+
+		ugcp->usbgc_ifnum = 0;
+		ugcp->usbgc_alt = 0;
+
+		ugcp->usbgc_tx_list_max = 64;
+
+		ugcp->usbgc_rx_header_len = RX_HEADER_SIZE;
+		ugcp->usbgc_rx_list_max = 64;
+
+		/* time out parameters */
+		ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT;
+		ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL;
+#if 1
+		/* flow control */
+		ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE;
+#else
+		/*
+		 * XXX - flow control caused link down frequently under
+		 * heavy traffic
+		 */
+		ugcp->usbgc_flow_control = FLOW_CONTROL_NONE;
+#endif
+		/* MII timeout parameters */
+		ugcp->usbgc_mii_link_watch_interval =
+		    USBGEM_LINK_WATCH_INTERVAL;
+		ugcp->usbgc_mii_an_watch_interval =
+		    USBGEM_LINK_WATCH_INTERVAL/5;
+		ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */
+		ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT;	/* 5 sec */
+		ugcp->usbgc_mii_an_wait = (25*ONESEC)/10;
+		ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT;
+
+		ugcp->usbgc_mii_an_delay = ONESEC/10;
+		ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA;
+		ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET;
+		ugcp->usbgc_mii_dont_reset = B_FALSE;
+		ugcp->usbgc_mii_hw_link_detection = B_TRUE;
+
+		/* I/O methods */
+
+		/* mac operation */
+		ugcp->usbgc_attach_chip = &udmf_attach_chip;
+		ugcp->usbgc_reset_chip = &udmf_reset_chip;
+		ugcp->usbgc_init_chip = &udmf_init_chip;
+		ugcp->usbgc_start_chip = &udmf_start_chip;
+		ugcp->usbgc_stop_chip = &udmf_stop_chip;
+		ugcp->usbgc_multicast_hash = &udmf_mcast_hash;
+
+		ugcp->usbgc_set_rx_filter = &udmf_set_rx_filter;
+		ugcp->usbgc_set_media = &udmf_set_media;
+		ugcp->usbgc_get_stats = &udmf_get_stats;
+		ugcp->usbgc_interrupt = &udmf_interrupt;
+
+		/* packet operation */
+		ugcp->usbgc_tx_make_packet = &udmf_tx_make_packet;
+		ugcp->usbgc_rx_make_packet = &udmf_rx_make_packet;
+
+		/* mii operations */
+		ugcp->usbgc_mii_probe = &udmf_mii_probe;
+		ugcp->usbgc_mii_init = &udmf_mii_init;
+		ugcp->usbgc_mii_config = &usbgem_mii_config_default;
+		ugcp->usbgc_mii_read = &udmf_mii_read;
+		ugcp->usbgc_mii_write = &udmf_mii_write;
+		ugcp->usbgc_mii_addr_min = 1;
+
+		/* mtu */
+		ugcp->usbgc_min_mtu = ETHERMTU;
+		ugcp->usbgc_max_mtu = ETHERMTU;
+		ugcp->usbgc_default_mtu = ETHERMTU;
+
+		/* per-instance private area, zero filled */
+		lp = kmem_zalloc(sizeof (struct udmf_dev), KM_SLEEP);
+		lp->last_nsr = 0;
+
+		ddi_set_driver_private(dip, NULL);
+
+		dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct udmf_dev));
+
+		/* the temporary configuration is released in either case */
+		kmem_free(ugcp, sizeof (*ugcp));
+
+		if (dp != NULL) {
+			return (DDI_SUCCESS);
+		}
+
+		/* attach failed: release the private area allocated above */
+		kmem_free(lp, sizeof (struct udmf_dev));
+		return (DDI_FAILURE);
+	}
+
+	if (cmd == DDI_RESUME) {
+		return (usbgem_resume(dip));
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * udmfdetach: detach(9E) entry point.
+ *
+ * DDI_DETACH maps the result of usbgem_do_detach() onto DDI_SUCCESS or
+ * DDI_FAILURE, DDI_SUSPEND returns the result of usbgem_suspend(), and
+ * every other command fails.
+ */
+static int
+udmfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		if (usbgem_do_detach(dip) == DDI_SUCCESS) {
+			return (DDI_SUCCESS);
+		}
+		return (DDI_FAILURE);
+
+	case DDI_SUSPEND:
+		return (usbgem_suspend(dip));
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/* ======================================================== */
+/*
+ * OS depend (loadable streams driver) routine
+ */
+/* ======================================================== */
+/*
+ * Driver operation tables.  With USBGEM_CONFIG_GLDv3 the dev_ops structure
+ * udmf_ops is generated by the USBGEM_STREAM_OPS() macro; otherwise the
+ * STREAMS tables (module_info, read/write qinit, streamtab), the cb_ops and
+ * the dev_ops are spelled out by hand below.
+ */
+#ifdef USBGEM_CONFIG_GLDv3
+USBGEM_STREAM_OPS(udmf_ops, udmfattach, udmfdetach);
+#else
+/* STREAMS module identification and flow control limits */
+static	struct module_info udmfminfo = {
+	0,			/* mi_idnum */
+	"udmf",			/* mi_idname */
+	0,			/* mi_minpsz */
+	ETHERMTU,		/* mi_maxpsz */
+	ETHERMTU*128,		/* mi_hiwat */
+	1,			/* mi_lowat */
+};
+
+/* read side queue: service, open and close are handled by usbgem */
+static	struct qinit udmfrinit = {
+	(int (*)()) NULL,	/* qi_putp */
+	usbgem_rsrv,		/* qi_srvp */
+	usbgem_open,		/* qi_qopen */
+	usbgem_close,		/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&udmfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+/* write side queue: put and service are handled by usbgem */
+static	struct qinit udmfwinit = {
+	usbgem_wput,		/* qi_putp */
+	usbgem_wsrv,		/* qi_srvp */
+	(int (*)()) NULL,	/* qi_qopen */
+	(int (*)()) NULL,	/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&udmfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+/* ties the read and write queue definitions together; no multiplexing */
+static struct streamtab	udmf_info = {
+	&udmfrinit,	/* st_rdinit */
+	&udmfwinit,	/* st_wrinit */
+	NULL,		/* st_muxrinit */
+	NULL		/* st_muxwrinit */
+};
+
+/* character/block entry points: only the stream table is meaningful */
+static	struct cb_ops cb_udmf_ops = {
+	nulldev,	/* cb_open */
+	nulldev,	/* cb_close */
+	nodev,		/* cb_strategy */
+	nodev,		/* cb_print */
+	nodev,		/* cb_dump */
+	nodev,		/* cb_read */
+	nodev,		/* cb_write */
+	nodev,		/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	&udmf_info,	/* cb_stream */
+	D_NEW|D_MP	/* cb_flag */
+};
+
+/* device operations: attach/detach are local, the rest comes from usbgem */
+static	struct dev_ops udmf_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	usbgem_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	udmfattach,	/* devo_attach */
+	udmfdetach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&cb_udmf_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+        usbgem_power,   /* devo_power */
+#if DEVO_REV >= 4
+	usbgem_quiesce, /* devo_quiesce */
+#endif
+};
+#endif
+
+/*
+ * Module linkage: a single driver module described by modldrv, using the
+ * identification string `ident' and the udmf_ops device operations.
+ * modlinkage is what _init(), _fini() and _info() hand to mod_install(),
+ * mod_remove() and mod_info().
+ */
+static struct modldrv modldrv = {
+	&mod_driverops,	/* Type of module.  This one is a driver */
+	ident,
+	&udmf_ops,	/* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, &modldrv, NULL
+};
+
+/* ======================================================== */
+/*
+ * _init : done
+ */
+/* ======================================================== */
+/*
+ * Module load entry point: registers the driver with usbgem first and then
+ * installs the module.  When mod_install() fails the usbgem registration
+ * is undone again.  Returns the status of the step that failed, or the
+ * result of mod_install().
+ */
+int
+_init(void)
+{
+	int	rc;
+
+	DPRINTF(2, (CE_CONT, "!udmf: _init: called"));
+
+	rc = usbgem_mod_init(&udmf_ops, "udmf");
+	if (rc == DDI_SUCCESS) {
+		rc = mod_install(&modlinkage);
+		if (rc != DDI_SUCCESS) {
+			/* roll back the usbgem registration */
+			usbgem_mod_fini(&udmf_ops);
+		}
+	}
+	return (rc);
+}
+
+/*
+ * _fini : done
+ */
+/*
+ * Module unload entry point: removes the module and, only when that
+ * succeeded, undoes the usbgem registration made in _init().  Returns the
+ * result of mod_remove().
+ */
+int
+_fini(void)
+{
+	int	rc;
+
+	DPRINTF(2, (CE_CONT, "!udmf: _fini: called"));
+
+	rc = mod_remove(&modlinkage);
+	if (rc != DDI_SUCCESS) {
+		/* still in use: leave everything registered */
+		return (rc);
+	}
+
+	usbgem_mod_fini(&udmf_ops);
+	return (rc);
+}
+
+/*
+ * Module information entry point: returns whatever mod_info() reports for
+ * this module's linkage structure.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/upf/adm8511reg.h b/usr/src/uts/common/io/upf/adm8511reg.h
new file mode 100644
index 0000000000..68a2207bb5
--- /dev/null
+++ b/usr/src/uts/common/io/upf/adm8511reg.h
@@ -0,0 +1,205 @@
+/*
+ * @(#)adm8511reg.h	1.1 09/06/20
+ * Register definitions of ADMtek ADM8511 Fast Ethernet to USB controller.
+ * Coded by Masayuki Murayama(KHF04453@nifty.ne.jp)
+ * This file is public domain.
+ */
+
+/*
+ * Register offsets.  The trailing comment gives the size of each register;
+ * B and W presumably stand for byte (8 bit) and word (16 bit) -- TODO
+ * confirm against the data sheet.  Array sizes are spelled out in bytes.
+ */
+#define	EC0		0x00	/* B */
+#define	EC1		0x01	/* B */
+#define	EC2		0x02	/* B */
+#define	MA		0x08	/* 8byte array */
+#define	EID		0x10	/* B */
+#define	PAUSETIMER	0x18	/* B pause timer */
+#define	RPNBFC		0x1a	/* B */
+#define	ORFBFC		0x1b	/* B */
+#define	EP1C		0x1c	/* B */
+#define	RXFC		0x1d	/* B */
+#define	BIST		0x1e	/* B */
+#define	EEOFFSET	0x20	/* B */
+#define	EEDATA		0x21	/* W */
+#define	EECTRL		0x23	/* B */
+#define	PHYA		0x25	/* B */
+#define	PHYD		0x26 	/* W */
+#define	PHYAC		0x28	/* B */
+#define	USBSTAT		0x2a	/* B */
+#define	ETHTXSTAT	0x2b	/* W */
+#define	ETHRXSTAT	0x2d	/* B */
+#define	LOSTCNT		0x2e	/* W */
+#define	WF0MASK		0x30	/* 16byte array */
+#define	WF0OFFSET	0x40	/* W */
+#define	WF0CRC		0x41	/* W */
+#define	WF1MASK		0x48	/* 16byte array */
+#define	WF1OFFSET	0x58	/* W */
+#define	WF1CRC		0x59	/* W */
+#define	WF2MASK		0x60	/* 16byte array */
+#define	WF2OFFSET	0x70	/* W */
+#define	WF2CRC		0x71	/* W */
+#define	WCTRL		0x78	/* B */
+#define	WSTAT		0x7a	/* B */
+#define	IPHYC		0x7b	/* B */
+#define	GPIO54		0x7c	/* B */
+#define	GPIO10		0x7e	/* B */
+#define	GPIO32		0x7f	/* B */
+#define	TEST		0x80	/* B */
+#define	TM		0x81	/* B */
+#define	RPN		0x82	/* B */
+
+/*
+ * Ethernet control registers 0..2.
+ *
+ * The *_BITS strings are decode strings for the %b conversion of
+ * cmn_err(9F): the first character selects the output base ("\020" is 16,
+ * i.e. hexadecimal) and each following entry is a 1-based bit number
+ * followed by the name printed when that bit is set.
+ */
+
+/* Ethernet control register 0: offset 0 */
+#define	EC0_TXE		0x80U
+#define	EC0_RXE		0x40U
+#define	EC0_RXFCE	0x20U
+#define	EC0_WOE		0x10U
+#define	EC0_RXSA	0x08U
+#define	EC0_SBO		0x04U
+#define	EC0_RXMA	0x02U
+#define	EC0_RXCS	0x01U
+
+#define	EC0_BITS	\
+	"\020"	\
+	"\010TXE"	\
+	"\007RXE"	\
+	"\006RXFCE"	\
+	"\005WOE"	\
+	"\004RXSA"	\
+	"\003SBO"	\
+	"\002RXMA"	\
+	"\001RXCS"
+
+/* Ethernet control register 1: offset 1 */
+#define	EC1_FD		0x20U
+#define	EC1_100M	0x10U	/* 0:10Mbps 1:100Mbps */
+#define	EC1_RM		0x08U	/* reset mac */
+
+#define	EC1_BITS	\
+	"\020"	\
+	"\006FD"	\
+	"\005100M"	\
+	"\004RM"
+
+/* Ethernet control register 2: offset 2 */
+#define	EC2_MEPL	0x80U	/* 8515: MTU 0:1528, 1:1638 */
+#define	EC2_RPNC	0x40U
+#define	EC2_LEEPRS	0x20U
+#define	EC2_EEPRW	0x10U
+#define	EC2_LB		0x08U
+#define	EC2_PROM	0x04U
+#define	EC2_RXBP	0x02U
+#define	EC2_EP3RC	0x01U
+
+/*
+ * NOTE(review): bit 8 is labelled "MEPS" below while the mask above is
+ * named EC2_MEPL -- confirm which spelling is intended.
+ */
+#define	EC2_BITS	\
+	"\020"	\
+	"\010MEPS"	\
+	"\007RPNC"	\
+	"\006LEEPRS"	\
+	"\005EEPRW"	\
+	"\004LB"	\
+	"\003PROM"	\
+	"\002RXBP"	\
+	"\001EP3RC"
+
+/*
+ * Receive Packet Number based Flow Control register: offset 0x1a
+ * (RPNBFC_PN is a field mask inside this register, not a register offset.)
+ */
+#define	RPNBFC_PN	0x7eU	/* packet number field */
+#define		RPNBFC_PN_SHIFT	1
+#define	RPNBFC_FCP	0x01U	/* enable rx flow control */
+
+/* Occupied Receive FIFO based Flow Control register: offset 0x1b */
+#define	ORFBFC_RXS	0x7eU	/* rx FIFO size field */
+#define		ORFBFC_RXS_SHIFT	1
+#define		ORFBFC_RXS_UNIT	1024U
+#define	ORFBFC_FCRXS	0x01U	/* enable rx flow control */
+
+/* EP1 control register: offset 0x1c */
+#define	EP1C_EP1S0E	0x80U	/* send 0 enable */
+#define	EP1C_ITMA	0x60U	/* internal test mode A */
+#define	EP1C_ITMB	0x1fU	/* internal test mode B */
+
+/* cmn_err(9F) %b decode string for EP1C; only bit 8 has a name */
+#define	EP1C_BITS	\
+	"\020"	\
+	"\010EP1S0E"
+
+/* Rx FIFO Control register: offset 0x1d */
+#define	RXFC_EXT_SRAM	0x02	/* enable external 32k sram */
+#define	RXFC_RX32PKT	0x01	/* max 32 packet */
+
+/* EEPROM offset register: offset 0x20 */
+#define	EEOFFSET_MASK	0x3f	/* eeprom offset address in word */
+
+/* EEPROM access control register: offset 0x23 */
+#define	EECTRL_DONE	0x04
+#define	EECTRL_RD	0x02
+#define	EECTRL_WR	0x01
+
+/* cmn_err(9F) %b decode string for EECTRL */
+#define	EECTRL_BITS	\
+	"\020"	\
+	"\003DONE"	\
+	"\002RD"	\
+	"\001WR"
+
+/* PHY access control register (PHYAC): offset 0x28 */
+#define	PHYAC_DO	0x80U	/* Done */
+#define	PHYAC_RDPHY	0x40U	/* read phy */
+#define	PHYAC_WRPHY	0x20U	/* write phy */
+#define	PHYAC_PHYRA	0x1fU	/* PHY register address */
+
+/* cmn_err(9F) %b decode string for the PHYAC register */
+#define	PHYCTRL_BITS	\
+	"\020"	\
+	"\010DO"	\
+	"\007RDPHY"	\
+	"\006WRPHY"
+
+/* Internal PHY control register (IPHYC): offset 0x7b */
+#define	IPHYC_EPHY	0x02
+#define	IPHYC_PHYR	0x01
+
+/* cmn_err(9F) %b decode string for IPHYC */
+#define	IPHYC_BITS	\
+	"\020"	\
+	"\002EPHY"	\
+	"\001PHYR"
+
+/*
+ * GPIO registers.  Each register carries two pins with three bits per pin;
+ * the OE/O/I suffixes presumably mean output enable, output value and
+ * input value -- TODO confirm against the data sheet.
+ */
+
+/* GPIO54 register: offset 0x7c */
+#define	GPIO54_5OE	0x20
+#define	GPIO54_5O	0x10
+#define	GPIO54_5I	0x08
+#define	GPIO54_4OE	0x04
+#define	GPIO54_4O	0x02
+#define	GPIO54_4I	0x01
+
+/* GPIO10 register: offset 0x7e */
+#define	GPIO10_1OE	0x20
+#define	GPIO10_1O	0x10
+#define	GPIO10_1I	0x08
+#define	GPIO10_0OE	0x04
+#define	GPIO10_0O	0x02
+#define	GPIO10_0I	0x01
+
+/* GPIO32 register: offset 0x7f */
+#define	GPIO32_3OE	0x20
+#define	GPIO32_3O	0x10
+#define	GPIO32_3I	0x08
+#define	GPIO32_2OE	0x04
+#define	GPIO32_2O	0x02
+#define	GPIO32_2I	0x01
+
+/* rx status at the end of received packets */
+/* byte 0 and 1 is packet length in little endian */
+/* byte 2 is receive status */
+#define	RSR_DRIBBLE	0x10
+#define	RSR_CRC		0x08
+#define	RSR_RUNT	0x04
+#define	RSR_LONG	0x02
+#define	RSR_MULTI	0x01
+
+/*
+ * Mask tested by the receive path to decide whether a packet is dropped.
+ * NOTE(review): RSR_MULTI is part of this mask.  If bit 0 merely marks a
+ * multicast frame rather than an error, packets with that bit set would be
+ * discarded -- confirm against the data sheet.
+ */
+#define	RSR_ERRORS	\
+	(RSR_DRIBBLE | RSR_CRC | RSR_RUNT | RSR_LONG | RSR_MULTI)
+
+/* cmn_err(9F) %b decode string for the rx status byte */
+#define	RSR_BITS	\
+	"\020"	\
+	"\005DRIBBLE"	\
+	"\004CRC"	\
+	"\003RUNT"	\
+	"\002LONG"	\
+	"\001MULTI"
+/* byte 3 is reserved */
+
+/* TEST register: offset 80 */
diff --git a/usr/src/uts/common/io/upf/upf_usbgem.c b/usr/src/uts/common/io/upf/upf_usbgem.c
new file mode 100644
index 0000000000..5614803158
--- /dev/null
+++ b/usr/src/uts/common/io/upf/upf_usbgem.c
@@ -0,0 +1,1213 @@
+/*
+ * upf_usbgem.c : ADMtek an986/adm8511/adm8513/adm8515 USB to
+ * Fast Ethernet Driver for Solaris
+ */
+
+/*
+ * Copyright (c) 2004-2011 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#pragma ident   "%W% %E%"
+
+/*
+ *  Changelog:
+ */
+
+/*
+ * TODO
+ */
+/* ======================================================= */
+
+/*
+ * Solaris system header files and macros
+ */
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/byteorder.h>
+
+/* ethernet stuff */
+#include <sys/ethernet.h>
+
+/* interface card depend stuff */
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strlog.h>
+#include <sys/usb/usba.h>
+#include "usbgem.h"
+
+/* hardware stuff */
+#include "usbgem_mii.h"
+#include "adm8511reg.h"
+
+char	ident[] = "pegasus usbnic driver v" VERSION;
+
+/*
+ * Useful macros
+ */
+#define	CHECK_AND_JUMP(val, label)	\
+	if ((val) != USB_SUCCESS) { goto label; }
+
+/*
+ * Debugging
+ */
+#ifdef DEBUG_LEVEL
+static int upf_debug = DEBUG_LEVEL;
+#define	DPRINTF(n, args)	if (upf_debug > (n)) cmn_err args
+#else
+#define	DPRINTF(n, args)
+#endif
+
+/*
+ * Our configuration for ADMtek Pegasus/PegasusII
+ */
+/* timeouts */
+/* one second (1000000 microseconds) converted by drv_usectohz() */
+#define	ONESEC		(drv_usectohz(1*1000000))
+
+/*
+ * Local device definitions
+ *
+ * Per-instance private state of the upf driver, reached through
+ * dp->private.
+ */
+struct upf_dev {
+	/*
+	 * Misc HW information
+	 */
+	/* software copy of the EC0..EC2 register values written to the chip */
+	uint8_t		ec[3];
+	/* address last written to the EID register; zeroed on chip reset */
+	uint8_t		mac_addr[ETHERADDRL];
+	/* one of the CHIP_* values below */
+	int		chip_type;
+#define	CHIP_AN986	1	/* avoid 0 */
+#define	CHIP_ADM8511	2	/* including adm8515 */
+#define	CHIP_ADM8513	3
+	/* not referenced in this part of the file -- TODO confirm its use */
+	boolean_t	phy_init_done;
+	/* bit 0 of byte 5 of the previous interrupt message */
+	uint8_t		last_link_state;
+
+	uint16_t	vid;	/* vendor id */
+	uint16_t	pid;	/* product id */
+};
+
+/*
+ * private functions
+ */
+
+/* mii operations */
+static uint16_t upf_mii_read(struct usbgem_dev *, uint_t, int *errp);
+static void upf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp);
+
+/* nic operations */
+static int upf_attach_chip(struct usbgem_dev *);
+static int upf_reset_chip(struct usbgem_dev *);
+static int upf_init_chip(struct usbgem_dev *);
+static int upf_start_chip(struct usbgem_dev *);
+static int upf_stop_chip(struct usbgem_dev *);
+static int upf_set_media(struct usbgem_dev *);
+static int upf_set_rx_filter(struct usbgem_dev *);
+static int upf_get_stats(struct usbgem_dev *);
+
+/* packet operations */
+static mblk_t *upf_tx_make_packet(struct usbgem_dev *, mblk_t *);
+static mblk_t *upf_rx_make_packet(struct usbgem_dev *, mblk_t *);
+
+/* interrupt handler */
+static void upf_interrupt(struct usbgem_dev *, mblk_t *);
+
+/* =============================================================== */
+/*
+ * I/O functions
+ */
+/* =============================================================== */
+/*
+ * Register access macros.  All of them issue a vendor specific control
+ * request addressed to the device, store the status of the transfer in
+ * *(errp) and jump to `label' when that status is not USB_SUCCESS.  `p' is
+ * the register offset and is passed as wIndex.
+ */
+#define	UPF_REQ_GET_REGISTER	0xf0
+#define	UPF_REQ_SET_REGISTER	0xf1
+/*
+ * OUTB: write the single byte `v', carried in wValue with no data buffer.
+ * NOTE(review): unlike the other macros this one ends in a semicolon, so
+ * "OUTB(...);" expands to an extra empty statement and the macro cannot be
+ * used as the unbraced body of an if that is followed by else.
+ */
+#define	OUTB(dp, p, v, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		| USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	UPF_REQ_SET_REGISTER,	\
+	/* wValue */	(v),	\
+	/* wIndex */	(p),	\
+	/* wLength */	1,	\
+	/* buf */	NULL,	\
+	/* size */	0)) != USB_SUCCESS) goto label;
+
+/* OUTW: write the two byte value `v' through usbgem_ctrl_out_val() */
+#define	OUTW(dp, p, v, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		| USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	UPF_REQ_SET_REGISTER,	\
+	/* wValue */	0,	\
+	/* wIndex */	(p),	\
+	/* wLength */	2,	\
+	/* value */	(v))) != USB_SUCCESS) goto label
+
+/* OUTS: write `len' bytes from `buf' starting at register `p' */
+#define	OUTS(dp, p, buf, len, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		| USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */	UPF_REQ_SET_REGISTER,	\
+	/* wValue */	0,	\
+	/* wIndex */	(p),	\
+	/* wLength */	(len),	\
+	/* buf */	(buf),	\
+	/* size */	(len))) != USB_SUCCESS) goto label
+
+/* INB: read one byte into *(vp) through usbgem_ctrl_in_val() */
+#define	INB(dp, p, vp, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		| USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ UPF_REQ_GET_REGISTER,	\
+	/* wValue */	0,	\
+	/* wIndex */	(p),	\
+	/* wLength */	1,	\
+	/* valuep */	(vp))) != USB_SUCCESS) goto label
+
+/* INW: read two bytes into *(vp) through usbgem_ctrl_in_val() */
+#define	INW(dp, p, vp, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		| USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ UPF_REQ_GET_REGISTER,	\
+	/* wValue */	0,	\
+	/* wIndex */	(p),	\
+	/* wLength */	2,	\
+	/* valuep */	(vp))) != USB_SUCCESS) goto label
+
+/* INS: read `len' bytes starting at register `p' into `buf' */
+#define	INS(dp, p, buf, len, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ UPF_REQ_GET_REGISTER,	\
+	/* wValue */	0,	\
+	/* wIndex */	(p),	\
+	/* wLength */	(len),	\
+	/* buf */	(buf),	\
+	/* size */	(len))) != USB_SUCCESS) goto label
+
+/* =============================================================== */
+/*
+ * Hardware manupilation
+ */
+/* =============================================================== */
+/*
+ * upf_reset_chip: reset the MAC.
+ *
+ * Clears the cached station address and the EC1 shadow, writes EC1_RM to
+ * EC1 and polls EC1 up to 1000 times, 10us apart, until the EC1_RM bit
+ * reads back as zero.  The value read at that point becomes the new EC1
+ * shadow.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE after a timeout or a USB error.
+ */
+static int
+upf_reset_chip(struct usbgem_dev *dp)
+{
+	struct upf_dev	*lp = dp->private;
+	int		tries = 1000;
+	int		err;
+	uint8_t		ec1;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* forget what was programmed before the reset */
+	bzero(lp->mac_addr, sizeof (lp->mac_addr));
+	lp->ec[1] = 0;
+
+	OUTB(dp, EC1, EC1_RM, &err, usberr);
+
+	while (tries-- > 0) {
+		INB(dp, EC1, &ec1, &err, usberr);
+		if (ec1 & EC1_RM) {
+			/* reset still in progress */
+			drv_usecwait(10);
+			continue;
+		}
+		lp->ec[1] = ec1;
+		return (USB_SUCCESS);
+	}
+
+	/* time out */
+	cmn_err(CE_WARN, "!%s: failed to reset: timeout", dp->name);
+	return (USB_FAILURE);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * Setup an986/adm8511/adm8513/adm8515
+ */
+/*
+ * upf_init_chip: program the registers that do not depend on the receive
+ * filter: EC0 (with EC0_RXSA and EC0_RXCS added), EC2 (read back from the
+ * chip, with EC2_RXBP and EC2_EP3RC added), the multicast hash (cleared),
+ * the pause timer and, except on AN986, the rx FIFO control register.
+ * Finishes by calling upf_set_media().
+ *
+ * Returns USB_SUCCESS, or the USB error code of the transfer that failed.
+ */
+static int
+upf_init_chip(struct usbgem_dev *dp)
+{
+	uint64_t	zero64 = 0;
+	int	err = USB_SUCCESS;
+	struct upf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* ethernet control register 0 */
+	lp->ec[0] |= EC0_RXSA | EC0_RXCS;
+	OUTB(dp, EC0, lp->ec[0], &err, usberr);
+
+	/* ethernet control reg1: will be set later in set_rx_filter() */
+
+	/*
+	 * ethernet control register 2: start from the value in the chip;
+	 * the promiscuous bit is updated later in upf_set_rx_filter()
+	 */
+	INB(dp, EC2, &lp->ec[2], &err, usberr);
+	lp->ec[2] |= EC2_RXBP | EC2_EP3RC;
+#ifdef CONFIG_VLAN
+	if (dp->misc_flag & USBGEM_VLAN) {
+		lp->ec[2] |= EC2_MEPL;
+	}
+#endif
+	OUTB(dp, EC2, lp->ec[2], &err, usberr);
+
+	/* Multicast address hash: clear all 8 bytes */
+	OUTS(dp, MA, &zero64, 8, &err, usberr);
+
+	/* Ethernet ID : will be set later in upf_set_rx_filter() */
+
+	/* PAUSE timer */
+	OUTB(dp, PAUSETIMER, 0x1f, &err, usberr);
+
+	/* receive packet number based pause control:set in upf_set_media() */
+
+	/* occupied receive FIFO based pause control:set in upf_set_media() */
+
+	/* EP1 control: default */
+
+	/* Rx FIFO control */
+	if (lp->chip_type != CHIP_AN986) {
+		/* use 24K internal sram, 16pkts in fifo */
+		OUTB(dp, RXFC, 0, &err, usberr);
+	}
+
+	/* BIST control: do nothing */
+	err = upf_set_media(dp);
+	CHECK_AND_JUMP(err, usberr);
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__));
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+/*
+ * upf_start_chip: set EC0_TXE and EC0_RXE in the EC0 shadow and write the
+ * shadow to the chip.  Returns USB_SUCCESS or the USB error code of the
+ * failed transfer.
+ */
+static int
+upf_start_chip(struct usbgem_dev *dp)
+{
+	struct upf_dev	*lp = dp->private;
+	int		err = USB_SUCCESS;
+	uint8_t		ec0;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* the shadow is updated even when the transfer below fails */
+	ec0 = lp->ec[0] | EC0_TXE | EC0_RXE;
+	lp->ec[0] = ec0;
+	OUTB(dp, EC0, ec0, &err, usberr);
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+/*
+ * upf_stop_chip: clear EC0_TXE and EC0_RXE in the EC0 shadow and write the
+ * shadow to the chip.  Returns USB_SUCCESS or the USB error code of the
+ * failed transfer.
+ */
+static int
+upf_stop_chip(struct usbgem_dev *dp)
+{
+	struct upf_dev	*lp = dp->private;
+	int		err;
+	uint8_t		ec0;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* the shadow is updated even when the transfer below fails */
+	ec0 = lp->ec[0] & ~(EC0_TXE | EC0_RXE);
+	lp->ec[0] = ec0;
+	OUTB(dp, EC0, ec0, &err, usberr);
+
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+/*
+ * upf_get_stats: statistics callback.  Nothing is read from the device;
+ * the function always returns USB_SUCCESS.
+ */
+static int
+upf_get_stats(struct usbgem_dev *dp)
+{
+	/* do nothing */
+	return (USB_SUCCESS);
+}
+
+/*
+ * upf_mcast_hash: multicast hash callback.  Returns the low six bits of
+ * the little endian ethernet CRC of `addr', i.e. an index into a 64 entry
+ * hash table.
+ */
+static uint_t
+upf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr)
+{
+	uint_t	crc;
+
+	crc = usbgem_ether_crc_le(addr);
+
+	/* 64 = 2^6 entries, so keep six bits */
+	return (crc & 0x3f);
+}
+
+/*
+ * upf_set_rx_filter: program the receive mode and the station address.
+ *
+ * EC0_RXMA and EC2_PROM are recomputed from dp->rxmode: promiscuous mode
+ * sets both, and all-multicast mode or a non-empty multicast list sets
+ * EC0_RXMA (the hash table is not used).  The station address is written
+ * to EID only when it differs from the cached copy, and the cache is
+ * updated only after that write succeeded, so a failed write is retried on
+ * the next call.  Finally EC0..EC2 are written from the shadow.
+ *
+ * Returns USB_SUCCESS or the USB error code of the failed transfer.
+ */
+static int
+upf_set_rx_filter(struct usbgem_dev *dp)
+{
+	int		err;
+#if DEBUG_LEVEL > 0
+	uint8_t		reg0;
+	uint8_t		reg1;
+	uint8_t		reg2;
+#endif
+	struct upf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called, rxmode:%b",
+	    dp->name, __func__, dp->rxmode, RXMODE_BITS));
+
+	/* reset rx mode */
+	lp->ec[0] &= ~EC0_RXMA;
+	lp->ec[2] &= ~EC2_PROM;
+
+	if (dp->rxmode & RXMODE_PROMISC) {
+		/* promiscuous mode implies all multicast and all physical */
+		lp->ec[0] |= EC0_RXMA;
+		lp->ec[2] |= EC2_PROM;
+	} else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 0) {
+		/* XXX - multicast hash table didn't work */
+		/* accept all multicast packets */
+		lp->ec[0] |= EC0_RXMA;
+	}
+
+	if (bcmp(dp->cur_addr.ether_addr_octet,
+	    lp->mac_addr, ETHERADDRL) != 0) {
+
+		/* need to update mac address: hardware first, cache after */
+		OUTS(dp, EID,
+		    dp->cur_addr.ether_addr_octet, ETHERADDRL, &err, usberr);
+		bcopy(dp->cur_addr.ether_addr_octet,
+		    lp->mac_addr, ETHERADDRL);
+	}
+
+	/* update rx mode */
+	OUTS(dp, EC0, lp->ec, 3, &err, usberr);
+
+#if DEBUG_LEVEL > 0
+	INB(dp, EC0, &reg0, &err, usberr);
+	INB(dp, EC1, &reg1, &err, usberr);
+	INB(dp, EC2, &reg2, &err, usberr);
+
+	cmn_err(CE_CONT, "!%s: %s: returned, ec:%b %b %b",
+	    dp->name, __func__,
+	    reg0, EC0_BITS, reg1, EC1_BITS, reg2, EC2_BITS);
+#endif
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (err);
+}
+
+/*
+ * upf_set_media: program duplex, speed and flow control.
+ *
+ * EC1_FD and EC1_100M are derived from dp->full_duplex and dp->speed,
+ * EC0_RXFCE from dp->flow_control.  For the tx pause modes the two flow
+ * control threshold registers ORFBFC and RPNBFC are programmed (with
+ * different values for AN986 and the later chips); otherwise both are
+ * cleared.  Finally EC0 and EC1 are written from the shadow.
+ *
+ * Returns USB_SUCCESS or the USB error code of the failed transfer.
+ */
+static int
+upf_set_media(struct usbgem_dev *dp)
+{
+	int	err;
+	struct upf_dev	*lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	lp->ec[1] &= ~(EC1_FD | EC1_100M);
+
+	/* select duplex */
+	if (dp->full_duplex) {
+		lp->ec[1] |= EC1_FD;
+	}
+
+	/* select speed */
+	if (dp->speed == USBGEM_SPD_100) {
+		lp->ec[1] |= EC1_100M;
+	}
+
+	/* rx flow control */
+	switch (dp->flow_control) {
+	case FLOW_CONTROL_SYMMETRIC:
+	case FLOW_CONTROL_RX_PAUSE:
+		lp->ec[0] |= EC0_RXFCE;
+		break;
+
+	default:
+		lp->ec[0] &= ~EC0_RXFCE;
+		break;
+	}
+
+	/* tx flow control */
+	switch (dp->flow_control) {
+	case FLOW_CONTROL_SYMMETRIC:
+	case FLOW_CONTROL_TX_PAUSE:
+		if (lp->chip_type != CHIP_AN986) {
+			/* pegasus II has internal 24k fifo */
+			OUTB(dp, ORFBFC,
+			    (12 << ORFBFC_RXS_SHIFT) |  ORFBFC_FCRXS,
+			    &err, usberr);
+
+			/*
+			 * 16 packets can be stored in rx fifo.
+			 * The register offset is RPNBFC; RPNBFC_PN is only
+			 * the mask of the packet number field inside it.
+			 */
+			OUTB(dp, RPNBFC,
+			    (8 << RPNBFC_PN_SHIFT) |  RPNBFC_FCP,
+			    &err, usberr);
+		} else {
+			/* an986 has external 32k fifo */
+			OUTB(dp, ORFBFC,
+			    (16 << ORFBFC_RXS_SHIFT) |  ORFBFC_FCRXS,
+			    &err, usberr);
+
+			/* AN986 fails to link up when RPNBFC is enabled */
+			OUTB(dp, RPNBFC, 0, &err, usberr);
+		}
+		break;
+
+	default:
+		OUTB(dp, ORFBFC, 0, &err, usberr);
+		OUTB(dp, RPNBFC, 0, &err, usberr);
+		break;
+	}
+
+	/* update ether control registers */
+	OUTS(dp, EC0, lp->ec, 2, &err, usberr);
+	DPRINTF(0, (CE_CONT, "!%s: %s: returned, ec0:%b, ec1:%b",
+	    dp->name, __func__, lp->ec[0], EC0_BITS, lp->ec[1], EC1_BITS));
+
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "%s: %s: failed to write ec1", dp->name, __func__);
+	return (err);
+}
+
+/*
+ * send/receive packet check
+ */
+/*
+ * upf_tx_make_packet: build the USB payload for one outgoing frame.
+ *
+ * A new message block is allocated that holds a two byte little endian
+ * length header followed by the frame data gathered from the whole chain
+ * `mp'; the frame is padded with zeros up to ETHERMIN.  When the resulting
+ * size is a multiple of 64 bytes, two more zero bytes are appended.
+ *
+ * Returns the new message block, or NULL when allocb() fails.  `mp' itself
+ * is neither modified nor freed here.
+ */
+static mblk_t *
+upf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	size_t		pktlen;
+	size_t		fraglen;
+	int		buflen;
+	mblk_t		*nmp;
+	mblk_t		*frag;
+	uint8_t		*dst;
+
+	DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* short frames are padded to the ethernet minimum */
+	pktlen = msgdsize(mp);
+	if (pktlen < ETHERMIN) {
+		pktlen = ETHERMIN;
+	}
+
+	/* room for the length header in front of the frame */
+	buflen = pktlen + sizeof (uint16_t);
+
+	/* avoid usb controller bug */
+	if ((buflen & 0x3f) == 0) {
+		/* add a header for additional 0-length usb message */
+		buflen += sizeof (uint16_t);
+	}
+
+	nmp = allocb(buflen, 0);
+	if (nmp == NULL) {
+		return (NULL);
+	}
+
+	dst = nmp->b_rptr;
+	nmp->b_wptr = dst + buflen;
+
+	/* the nic requires a two byte header of the packet size */
+	*dst++ = (uint8_t)pktlen;
+	*dst++ = (uint8_t)(pktlen >> 8);
+
+	/* gather the payload from every block of the chain */
+	for (frag = mp; frag != NULL; frag = frag->b_cont) {
+		fraglen = frag->b_wptr - frag->b_rptr;
+		if (fraglen == 0) {
+			continue;
+		}
+		bcopy(frag->b_rptr, dst, fraglen);
+		dst += fraglen;
+	}
+
+	/* clear ethernet pads and additional usb header if we have */
+	if (dst < nmp->b_wptr) {
+		bzero(dst, nmp->b_wptr - dst);
+	}
+
+	return (nmp);
+}
+
+/*
+ * upf_dump_packet: debugging aid that logs `n' bytes starting at `bp' as
+ * two digit hex values, eight per line.  Only bytes inside [bp, bp + n)
+ * are read; the last line is shorter when `n' is not a multiple of eight.
+ */
+static void
+upf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n)
+{
+	char	line[8 * 3 + 1];	/* "xx " per byte plus terminator */
+	int	i;
+	int	j;
+	int	cnt;
+
+	for (i = 0; i < n; i += 8, bp += 8) {
+		/* number of bytes left for this line */
+		cnt = (n - i < 8) ? (n - i) : 8;
+
+		for (j = 0; j < cnt; j++) {
+			(void) snprintf(&line[j * 3], 4, "%02x ", bp[j]);
+		}
+		/* drop the separator behind the last byte */
+		line[cnt * 3 - 1] = '\0';
+
+		cmn_err(CE_CONT, "%s", line);
+	}
+}
+
+/*
+ * upf_rx_make_packet: validate one received USB message and trim it to the
+ * ethernet frame it carries.
+ *
+ * The last four bytes of the message are a status trailer.  On ADM8513 the
+ * frame length is taken from a two byte header in front of the frame, which
+ * is stripped; on the other chips it is taken from the first two trailer
+ * bytes and includes the FCS and the trailer itself.  Only the low 12 bits
+ * of the length field are used.
+ *
+ * NOTE(review): the status byte is read from the last byte of the trailer
+ * (offset 3), while the comment in adm8511reg.h describes byte 2 as the
+ * receive status and byte 3 as reserved -- confirm which one is right.
+ *
+ * Returns `mp' with b_rptr/b_wptr delimiting the frame, or NULL when the
+ * message is dropped (the error counters in dp->stats are updated).  A
+ * dropped message is not freed here.
+ */
+static mblk_t *
+upf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	uint8_t		*p;
+	uint8_t		*end;
+	uint_t		raw;
+	uint_t		overhead;
+	uint_t		len;
+	uint_t		size;
+	uint8_t		rsr;
+	struct upf_dev	*lp = dp->private;
+
+	ASSERT(mp != NULL);
+
+	/* number of bytes actually received in this message block */
+	end = mp->b_wptr;
+	size = (uint_t)(end - mp->b_rptr);
+
+#ifdef DEBUG_LEVEL
+	len = msgdsize(mp);
+	DPRINTF(2, (CE_CONT, "!%s: time:%ld %s: len:%d cont:%p",
+	    dp->name, (long)ddi_get_lbolt(), __func__, len,
+	    (void *)mp->b_cont));
+
+	if (upf_debug > 3) {
+		/* dump only what this block really holds */
+		upf_dump_packet(dp, mp->b_rptr, (int)size);
+	}
+#endif
+	/* the status trailer must be present before it can be examined */
+	if (size < 4) {
+		dp->stats.runt++;
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+
+	/* get the length of Rx packet */
+	p = end - 4;
+	rsr = p[3];
+	if (lp->chip_type == CHIP_ADM8513) {
+		/* As Rx packets from ADM8513 have two byte header, remove it */
+		p = mp->b_rptr;
+		raw = ((p[1] << 8) | p[0]) & 0x0fff;
+		overhead = 0;
+		mp->b_rptr += 2;
+	} else {
+		raw = ((p[1] << 8) | p[0]) & 0x0fff;
+		overhead = ETHERFCSL + 4;
+	}
+
+	/* do not let the unsigned subtraction wrap around */
+	len = (raw >= overhead) ? (raw - overhead) : 0;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d",
+	    dp->name, __func__, rsr, RSR_BITS, len));
+
+	/* check if error happen */
+	if (rsr & RSR_ERRORS) {
+		DPRINTF(0, (CE_CONT, "!%s: rsr:%b", dp->name, rsr, RSR_BITS));
+		if (rsr & (RSR_CRC | RSR_DRIBBLE)) {
+			dp->stats.frame++;
+		}
+		if (rsr & RSR_LONG) {
+			dp->stats.frame_too_long++;
+		}
+		if (rsr & RSR_RUNT) {
+			dp->stats.runt++;
+		}
+
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+
+	/* the length field is smaller than the FCS and trailer it counts */
+	if (raw < overhead) {
+		dp->stats.runt++;
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+
+	/* the frame cannot be longer than the data that was received */
+	if (len > (uint_t)(end - mp->b_rptr)) {
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+#ifndef CONFIG_VLAN
+	/* check packet size */
+	if (len > ETHERMAX) {
+		/* too long */
+		dp->stats.frame_too_long++;
+		dp->stats.errrcv++;
+		return (NULL);
+	} else if (len < ETHERMIN) {
+		dp->stats.runt++;
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+#endif
+	/* remove trailing crc and rx status fields */
+	mp->b_wptr = mp->b_rptr + len;
+	ASSERT(mp->b_next == NULL);
+	return (mp);
+}
+
+/*
+ * Device dependent interrupt handler.
+ *
+ * mp holds the interrupt data handed over by usbgem.  Bit 0 of byte 5 is
+ * remembered as the link state; whenever it differs from the value seen
+ * last time, usbgem_mii_update_link() is called.  Packets too short to
+ * contain byte 5 are ignored.
+ */
+static void
+upf_interrupt(struct usbgem_dev *dp, mblk_t *mp)
+{
+	uint8_t	*bp;
+	size_t	size;
+	struct upf_dev	*lp = dp->private;
+
+	bp = mp->b_rptr;
+	size = (size_t)(mp->b_wptr - mp->b_rptr);
+
+	/* the link bit lives in byte 5: do not read beyond a short packet */
+	if (size < 6) {
+		DPRINTF(0, (CE_CONT, "!%s: %s: short packet, size:%d",
+		    dp->name, __func__, (int)size));
+		return;
+	}
+
+	/* only the six bytes known to exist are dumped */
+	DPRINTF(2, (CE_CONT,
+	    "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x",
+	    dp->name, __func__, (int)size,
+	    bp[0], bp[1], bp[2], bp[3], bp[4], bp[5]));
+
+	if ((lp->last_link_state ^ bp[5]) & 1) {
+		DPRINTF(1, (CE_CONT, "!%s:%s link status changed:",
+		    dp->name, __func__));
+		usbgem_mii_update_link(dp);
+	}
+
+	lp->last_link_state = bp[5] & 1;
+}
+
+/*
+ * MII Interfaces
+ */
+/*
+ * Read PHY register `index' of the PHY at dp->mii_phy_addr.
+ *
+ * The PHY address and a read command are written to PHYA/PHYAC, then
+ * PHYAC is polled up to 100 times, 10us apart, for PHYAC_DO before the
+ * data is fetched from PHYD.
+ *
+ * Returns the register value with *errp == USB_SUCCESS; on a timeout or
+ * a USB error returns 0 with *errp set to the failure code.
+ */
+static uint16_t
+upf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp)
+{
+	uint16_t	data;
+	uint8_t		ctl;
+	int		spin;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d",
+	    dp->name, __func__, index));
+	ASSERT(index >= 0 && index < 32);
+
+	*errp = USB_SUCCESS;
+
+	/* select the PHY, then kick off the read transaction */
+	OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr);
+	OUTB(dp, PHYAC, index | PHYAC_RDPHY, errp, usberr);
+
+	for (spin = 100; spin > 0; spin--) {
+		INB(dp, PHYAC, &ctl, errp, usberr);
+		if ((ctl & PHYAC_DO) == 0) {
+			/* not finished yet */
+			drv_usecwait(10);
+			continue;
+		}
+
+		/* transaction completed: pick up the data */
+		INW(dp, PHYD, &data, errp, usberr);
+		DPRINTF(4, (CE_CONT, "!%s: %s: return %04x",
+		    dp->name, __func__, data));
+		return (data);
+	}
+
+	/* the done bit never showed up */
+	cmn_err(CE_WARN, "!%s: %s: timeout detected", dp->name, __func__);
+	*errp = USB_FAILURE;
+	return (0);
+
+usberr:
+	cmn_err(CE_CONT,
+	    "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp);
+	return (0);
+}
+
+/*
+ * Write `val' to PHY register `index' of the PHY at dp->mii_phy_addr.
+ *
+ * The data goes to PHYD first, then the PHY address and a write command
+ * are issued and PHYAC is polled up to 100 times, 10us apart, for
+ * PHYAC_DO.
+ *
+ * *errp is USB_SUCCESS on completion, USB_FAILURE on a timeout, or the
+ * code of the failing USB request.
+ */
+static void
+upf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp)
+{
+	uint8_t		ctl;
+	int		spin;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s called index:%d val:0x%04x",
+	    dp->name, __func__, index, val));
+	ASSERT(index >= 0 && index < 32);
+
+	*errp = USB_SUCCESS;
+
+	/* data first, then the address, then the command that starts it */
+	OUTW(dp, PHYD, val, errp, usberr);
+	OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr);
+	OUTB(dp, PHYAC, index | PHYAC_WRPHY, errp, usberr);
+
+	for (spin = 100; spin > 0; spin--) {
+		INB(dp, PHYAC, &ctl, errp, usberr);
+		if (ctl & PHYAC_DO) {
+			/* write completed */
+			return;
+		}
+		drv_usecwait(10);
+	}
+
+	/* the done bit never showed up */
+	cmn_err(CE_WARN, "!%s: %s: timeout detected", dp->name, __func__);
+	*errp = USB_FAILURE;
+	return;
+
+usberr:
+	cmn_err(CE_CONT,
+	    "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp);
+}
+
+
+/*
+ * Enable the PHY and work out which chip we are driving.
+ *
+ * If IPHYC reads back non-zero after IPHYC_EPHY has been set, the chip
+ * has an internal PHY: it is reset and register 0x83 is probed to tell
+ * ADM8513 from ADM8511/ADM8515.  Otherwise the chip is treated as AN986
+ * and its external PHY is reset through the GPIO pins.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE if any register access failed.
+ */
+static int
+upf_enable_phy(struct usbgem_dev *dp)
+{
+	struct upf_dev	*lp = dp->private;
+	uint8_t		reg;
+	int		err;
+
+	/* first, try to enable the internal phy with its reset bit clear */
+	INB(dp, IPHYC, &reg, &err, usberr);
+	reg |= IPHYC_EPHY;
+	reg &= ~IPHYC_PHYR;
+	OUTB(dp, IPHYC, reg, &err, usberr);
+
+	/* read it back to see whether the setting stuck */
+	INB(dp, IPHYC, &reg, &err, usberr);
+	DPRINTF(0, (CE_CONT, "!%s: %s: IPHYC: %b",
+	    dp->name, __func__, reg, IPHYC_BITS));
+
+	if (reg == 0) {
+		/*
+		 * It should be AN986 which doesn't have an internal PHY.
+		 * We need to setup gpio ports in AN986, which are
+		 * connected to external PHY control pins.
+		 */
+		lp->chip_type = CHIP_AN986;
+
+		/* reset external phy: output port#0 L, port#1 L */
+		OUTB(dp, GPIO10, GPIO10_0O | GPIO10_0OE, &err, usberr);
+
+		/* output port#0 H, port#1 L */
+		OUTB(dp, GPIO10,
+		    GPIO10_0O | GPIO10_0OE | GPIO10_1OE, &err, usberr);
+
+		/* hw link detection doesn't work correctly */
+		dp->ugc.usbgc_mii_hw_link_detection = B_FALSE;
+
+		return (USB_SUCCESS);
+	}
+
+	/* pulse the reset bit of the internal phy, then let it settle */
+	OUTB(dp, IPHYC, reg | IPHYC_PHYR, &err, usberr);
+	OUTB(dp, IPHYC, reg, &err, usberr);
+	delay(drv_usectohz(10000));
+
+	/* identify the chip generation: does register 0x83 hold a value? */
+	OUTB(dp, 0x83, 0xa5, &err, usberr);
+	INB(dp, 0x83, &reg, &err, usberr);
+	if (reg != 0xa5) {
+		/* adm8511 or adm8515 */
+		lp->chip_type = CHIP_ADM8511;
+	} else {
+		lp->chip_type = CHIP_ADM8513;
+	}
+	dp->ugc.usbgc_mii_hw_link_detection = B_TRUE;
+
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * MII probe callback: make sure the PHY has been enabled (once per
+ * attach/resume cycle), then run the generic usbgem PHY probe.
+ *
+ * Returns USB_FAILURE if the PHY could not be enabled; phy_init_done is
+ * then left clear so that the next call tries again.  Otherwise returns
+ * whatever usbgem_mii_probe_default() returns.
+ */
+static int
+upf_mii_probe(struct usbgem_dev *dp)
+{
+	struct upf_dev	*lp = dp->private;
+
+	if (!lp->phy_init_done) {
+		if (upf_enable_phy(dp) != USB_SUCCESS) {
+			return (USB_FAILURE);
+		}
+		lp->phy_init_done = B_TRUE;
+	}
+
+	return (usbgem_mii_probe_default(dp));
+}
+
+/*
+ * MII init callback.
+ *
+ * Enables the PHY unless upf_mii_probe() has just done so, clears
+ * phy_init_done so that the next probe/init enables it again, and, for
+ * AN986 based products of the listed vendors, sets bit 2 (0x4) of PHY
+ * register 0x1b.
+ *
+ * Returns USB_SUCCESS, or the error of the first failing step.
+ */
+static int
+upf_mii_init(struct usbgem_dev *dp)
+{
+	uint16_t	val;
+	int		err = USB_SUCCESS;
+	struct upf_dev	*lp = dp->private;
+
+	if (!lp->phy_init_done) {
+		err = upf_enable_phy(dp);
+		if (err != USB_SUCCESS) {
+			return (err);
+		}
+	}
+	lp->phy_init_done = B_FALSE;
+
+	if (lp->chip_type == CHIP_AN986 &&
+	    (lp->vid == 0x0db7 /* elecom */ ||
+	    lp->vid == 0x066b /* linksys */ ||
+	    lp->vid == 0x077b /* linksys */ ||
+	    lp->vid == 0x2001 /* dlink */)) {
+		/* special treatment for Linksys products */
+		val = upf_mii_read(dp, 0x1b, &err);
+		if (err != USB_SUCCESS) {
+			/*
+			 * upf_mii_write() resets err, which would hide this
+			 * failure and write a value built from a bogus 0.
+			 */
+			return (err);
+		}
+		upf_mii_write(dp, 0x1b, val | 0x4, &err);
+	}
+	return (err);
+}
+
+/* ======================================================== */
+/*
+ * OS dependent (device driver DKI) routines
+ */
+/* ======================================================== */
+/*
+ * Read the 16 bit EEPROM word at offset `index'.
+ *
+ * EECTRL is cleared, the offset is written to EEOFFSET and a read is
+ * started with EECTRL_RD; EECTRL is then polled up to 100 times, 10us
+ * apart, for EECTRL_DONE before EEDATA is fetched.
+ *
+ * Returns the word with *errp == USB_SUCCESS; on a timeout or a USB
+ * error returns 0 with *errp set to the failure code.
+ */
+static uint16_t
+upf_read_eeprom(struct usbgem_dev *dp, int index, int *errp)
+{
+	uint16_t	word;
+	uint8_t		ctl;
+	int		spin;
+
+	*errp = USB_SUCCESS;
+
+	OUTB(dp, EECTRL, 0, errp, usberr);
+
+	/* select the word, then start the read */
+	OUTB(dp, EEOFFSET, index, errp, usberr);
+	OUTB(dp, EECTRL, EECTRL_RD, errp, usberr);
+
+	for (spin = 100; spin > 0; spin--) {
+		INB(dp, EECTRL, &ctl, errp, usberr);
+		if ((ctl & EECTRL_DONE) == 0) {
+			drv_usecwait(10);
+			continue;
+		}
+		INW(dp, EEDATA, &word, errp, usberr);
+		return (word);
+	}
+
+	/* time out */
+	*errp = USB_FAILURE;
+	return (0);
+
+usberr:
+	cmn_err(CE_CONT,
+	    "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp);
+	return (0);
+}
+
+/*
+ * Log the first `size' 16 bit words of the EEPROM, four words per line.
+ * Each line is prefixed with the byte offset of its first word.  Read
+ * errors are not reported; a failed read simply shows up as 0.
+ */
+static void
+upf_eeprom_dump(struct usbgem_dev *dp, int size)
+{
+	uint16_t	w[4];
+	int		base;
+	int		k;
+	int		err;
+
+	cmn_err(CE_CONT, "!%s: %s dump:", dp->name, __func__);
+
+	for (base = 0; base < size; base += 4) {
+		/* fetch one line worth of words */
+		for (k = 0; k < 4; k++) {
+			w[k] = upf_read_eeprom(dp, base + k, &err);
+		}
+		cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x",
+		    base*2, w[0], w[1], w[2], w[3]);
+	}
+}
+
+/*
+ * Chip attach callback: read the factory mac address from EEPROM words
+ * 0..2 (low byte of each word first) into dp->dev_addr and set up
+ * dp->misc_flag.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE if an EEPROM read failed.
+ */
+static int
+upf_attach_chip(struct usbgem_dev *dp)
+{
+	uint8_t		*mac = dp->dev_addr.ether_addr_octet;
+	uint16_t	word;
+	int		err;
+	int		n;
+
+	/*
+	 * Read mac address from EEPROM
+	 */
+	for (n = 0; n < 3; n++) {
+		word = upf_read_eeprom(dp, n, &err);
+		if (err != USB_SUCCESS) {
+			cmn_err(CE_WARN, "!%s: %s: usb error detected",
+			    dp->name, __func__);
+			return (USB_FAILURE);
+		}
+		mac[2 * n] = (uint8_t)word;
+		mac[2 * n + 1] = (uint8_t)(word >> 8);
+	}
+
+	DPRINTF(0, (CE_CONT,
+	    "%s: %s: mac: %02x:%02x:%02x:%02x:%02x:%02x",
+	    dp->name, __func__,
+	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]));
+
+#ifdef CONFIG_VLAN
+	dp->misc_flag = USBGEM_VLAN;
+#else
+	dp->misc_flag = 0;
+#endif
+#if DEBUG_LEVEL > 3
+	upf_eeprom_dump(dp, 0x80);
+#endif
+	return (USB_SUCCESS);
+}
+
+/*
+ * attach(9E) entry point.
+ *
+ * DDI_ATTACH: fill in a temporary usbgem_conf with the upf callbacks and
+ * tunables, allocate the per-device upf_dev, record the USB vendor and
+ * product ids in it and hand both to usbgem_do_attach().
+ * DDI_RESUME: clear phy_init_done so that the PHY gets enabled again and
+ * let usbgem_resume() do the rest.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE for DDI_ATTACH, the result of
+ * usbgem_resume() for DDI_RESUME and DDI_FAILURE for any other command.
+ */
+static int
+upfattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int			unit;
+	const char		*drv_name;
+	struct usbgem_dev	*dp;
+	struct usbgem_conf	*ugcp;
+	struct upf_dev		*lp;
+
+	unit = ddi_get_instance(dip);
+	drv_name = ddi_driver_name(dip);
+
+	DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d",
+	    drv_name, unit, __func__, cmd));
+
+	if (cmd == DDI_RESUME) {
+		dp = USBGEM_GET_DEV(dip);
+		lp = dp->private;
+		lp->phy_init_done = B_FALSE;
+
+		return (usbgem_resume(dip));
+	}
+	if (cmd != DDI_ATTACH) {
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * construct usbgem configuration
+	 */
+	ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP);
+
+	/*
+	 * name
+	 * NOTE(review): unbounded; relies on usbgc_name being large enough
+	 * for the driver name plus the instance number.
+	 */
+	sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit);
+	ugcp->usbgc_ppa = unit;
+
+	ugcp->usbgc_ifnum = 0;
+	ugcp->usbgc_alt = 0;
+
+	ugcp->usbgc_tx_list_max = 16;
+
+	ugcp->usbgc_rx_header_len = 4;
+	ugcp->usbgc_rx_list_max = 64;
+
+	/* time out parameters */
+	ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT;
+	ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL;
+
+	/*
+	 * flow control: this used to be set to FLOW_CONTROL_NONE and then
+	 * immediately overwritten; only the effective value is kept.
+	 */
+	ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE;
+
+	/* MII timeout parameters */
+	ugcp->usbgc_mii_link_watch_interval = ONESEC;
+	ugcp->usbgc_mii_an_watch_interval = ONESEC/5;
+	ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */
+	ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT;	/* 5 sec */
+	ugcp->usbgc_mii_an_wait = MII_AN_TIMEOUT/2;
+	ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT;
+	ugcp->usbgc_mii_an_delay = ONESEC/10;
+
+	ugcp->usbgc_mii_linkdown_action = MII_ACTION_RESET;
+	ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET;
+	ugcp->usbgc_mii_dont_reset = B_FALSE;
+
+	/* I/O methods */
+
+	/* mac operation */
+	ugcp->usbgc_attach_chip = &upf_attach_chip;
+	ugcp->usbgc_reset_chip = &upf_reset_chip;
+	ugcp->usbgc_init_chip = &upf_init_chip;
+	ugcp->usbgc_start_chip = &upf_start_chip;
+	ugcp->usbgc_stop_chip = &upf_stop_chip;
+	ugcp->usbgc_multicast_hash = &upf_mcast_hash;
+
+	ugcp->usbgc_set_rx_filter = &upf_set_rx_filter;
+	ugcp->usbgc_set_media = &upf_set_media;
+	ugcp->usbgc_get_stats = &upf_get_stats;
+	ugcp->usbgc_interrupt = &upf_interrupt;
+
+	/* packet operation */
+	ugcp->usbgc_tx_make_packet = &upf_tx_make_packet;
+	ugcp->usbgc_rx_make_packet = &upf_rx_make_packet;
+
+	/* mii operations */
+	ugcp->usbgc_mii_probe = &upf_mii_probe;
+	ugcp->usbgc_mii_init = &upf_mii_init;
+	ugcp->usbgc_mii_config = &usbgem_mii_config_default;
+	ugcp->usbgc_mii_read = &upf_mii_read;
+	ugcp->usbgc_mii_write = &upf_mii_write;
+
+	/* mtu */
+	ugcp->usbgc_min_mtu = ETHERMTU;
+	ugcp->usbgc_max_mtu = ETHERMTU;
+	ugcp->usbgc_default_mtu = ETHERMTU;
+
+	/* per-device state; vid/pid are -1 when the property is missing */
+	lp = kmem_zalloc(sizeof (struct upf_dev), KM_SLEEP);
+
+	lp->vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, "usb-vendor-id", -1);
+	lp->pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, "usb-product-id", -1);
+
+	dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct upf_dev));
+
+	/* the configuration is not needed once usbgem_do_attach() returns */
+	kmem_free(ugcp, sizeof (*ugcp));
+
+	if (dp != NULL) {
+		return (DDI_SUCCESS);
+	}
+
+	/* attach failed: release the private area we allocated above */
+	kmem_free(lp, sizeof (struct upf_dev));
+	return (DDI_FAILURE);
+}
+
+/*
+ * detach(9E) entry point.
+ *
+ * DDI_DETACH maps the result of usbgem_do_detach() onto DDI_SUCCESS or
+ * DDI_FAILURE, DDI_SUSPEND returns the result of usbgem_suspend(), and
+ * every other command fails.
+ */
+static int
+upfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		if (usbgem_do_detach(dip) == DDI_SUCCESS) {
+			return (DDI_SUCCESS);
+		}
+		return (DDI_FAILURE);
+
+	case DDI_SUSPEND:
+		return (usbgem_suspend(dip));
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/* ======================================================== */
+/*
+ * OS dependent (loadable streams driver) routines
+ */
+/* ======================================================== */
+#ifdef USBGEM_CONFIG_GLDv3	/* upf_ops is expected to come from the usbgem macro below */
+USBGEM_STREAM_OPS(upf_ops, upfattach, upfdetach);
+#else	/* !USBGEM_CONFIG_GLDv3: hand-built STREAMS driver tables */
+static	struct module_info upfminfo = {	/* shared by the read and write queues */
+	0,			/* mi_idnum */
+	"upf",			/* mi_idname */
+	0,			/* mi_minpsz */
+	ETHERMTU,		/* mi_maxpsz */
+	32*1024,		/* mi_hiwat */
+	1,			/* mi_lowat */
+};
+
+static	struct qinit upfrinit = {	/* read side: service, open and close only */
+	(int (*)()) NULL,	/* qi_putp */
+	usbgem_rsrv,		/* qi_srvp */
+	usbgem_open,		/* qi_qopen */
+	usbgem_close,		/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&upfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+static	struct qinit upfwinit = {	/* write side: put and service only */
+	usbgem_wput,		/* qi_putp */
+	usbgem_wsrv,		/* qi_srvp */
+	(int (*)()) NULL,	/* qi_qopen */
+	(int (*)()) NULL,	/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&upfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+static struct streamtab	upf_info = {	/* ties both queues together; no mux side */
+	&upfrinit,	/* st_rdinit */
+	&upfwinit,	/* st_wrinit */
+	NULL,		/* st_muxrinit */
+	NULL		/* st_muxwrinit */
+};
+
+static	struct cb_ops cb_upf_ops = {	/* only cb_stream and cb_prop_op do real work */
+	nulldev,	/* cb_open */
+	nulldev,	/* cb_close */
+	nodev,		/* cb_strategy */
+	nodev,		/* cb_print */
+	nodev,		/* cb_dump */
+	nodev,		/* cb_read */
+	nodev,		/* cb_write */
+	nodev,		/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	&upf_info,	/* cb_stream */
+	D_MP		/* cb_flag */
+};
+
+static	struct dev_ops upf_ops = {	/* referenced by modldrv, _init() and _fini() */
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	usbgem_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	upfattach,	/* devo_attach */
+	upfdetach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&cb_upf_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	usbgem_power,	/* devo_power */
+#if DEVO_REV >= 4	/* devo_quiesce is only supplied for DEVO_REV 4 and up */
+	usbgem_quiesce,	/* devo_quiesce */
+#endif
+
+};
+#endif	/* USBGEM_CONFIG_GLDv3 */
+static struct modldrv modldrv = {	/* linkage record describing this driver */
+	&mod_driverops,	/* Type of module.  This one is a driver */
+	ident,		/* description string defined earlier in this file */
+	&upf_ops,	/* driver ops */
+};
+
+static struct modlinkage modlinkage = {	/* passed to mod_install/mod_remove/mod_info */
+	MODREV_1, &modldrv, NULL
+};
+
+/* ======================================================== */
+/*
+ * _init: register upf_ops with usbgem, then install the module.  The
+ * usbgem registration is undone again if mod_install() fails.
+ */
+/* ======================================================== */
+int
+_init(void)
+{
+	int	rv;
+
+	DPRINTF(2, (CE_CONT, "!upf: _init: called"));
+
+	if ((rv = usbgem_mod_init(&upf_ops, "upf")) != DDI_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = mod_install(&modlinkage)) == DDI_SUCCESS) {
+		return (rv);
+	}
+
+	/* installation failed: back out the usbgem registration */
+	usbgem_mod_fini(&upf_ops);
+	return (rv);
+}
+
+/*
+ * _fini: remove the module; the usbgem registration is only dropped
+ * once mod_remove() has succeeded.
+ */
+int
+_fini(void)
+{
+	int	rv;
+
+	DPRINTF(2, (CE_CONT, "!upf: _fini: called"));
+
+	rv = mod_remove(&modlinkage);
+	if (rv != DDI_SUCCESS) {
+		return (rv);
+	}
+
+	usbgem_mod_fini(&upf_ops);
+	return (rv);
+}
+
+/*
+ * _info: hand the request straight to mod_info() for our modlinkage.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	int	rv;
+
+	rv = mod_info(&modlinkage, modinfop);
+	return (rv);
+}
diff --git a/usr/src/uts/common/io/urf/rtl8150reg.h b/usr/src/uts/common/io/urf/rtl8150reg.h
new file mode 100644
index 0000000000..7cba53356e
--- /dev/null
+++ b/usr/src/uts/common/io/urf/rtl8150reg.h
@@ -0,0 +1,218 @@
+/*
+ * @(#)rtl8150reg.h	1.1 04/09/16
+ * Macro definitions for Realtek 8150 USB to fast ethernet controller
+ * based on Realtek RTL8150 data sheet
+ * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com)
+ */
+
+/*
+ * Register offsets
+ */
+#define	IDR	0x0120	/* Base of ID registers */
+#define	MAR	0x0126	/* Base of multicast registers */
+#define	CR	0x012e	/* Command register */
+#define	TCR	0x012f	/* Transmit Configuration register */
+#define	RCR	0x0130	/* Receive Configuration register */
+#define	TSR	0x0132	/* Transmit Status register */
+#define	RSR	0x0133	/* Receive Status register */
+#define	CON0	0x0135	/* Configuration register 0 */
+#define	CON1	0x0136	/* Configuration register 1 */
+#define	MSR	0x0137	/* Media Status register */
+#define	PHYADD	0x0138	/* PHY address register */
+#define	PHYDAT	0x0139	/* PHY data register */
+#define	PHYCNT	0x013b	/* PHY control register */
+#define	GPPC	0x013d	/* General purpose pin control */
+#define	WAKECNT	0x013e	/* Wake up event control */
+#define	BMCR	0x0140	/* Basic Mode Control register */
+#define	BMSR	0x0142	/* Basic Mode Status register */
+#define	ANAR	0x0144	/* Auto Negotiation Advertisement register */
+#define	ANLP	0x0146	/* Auto Negotiation Link Partner register */
+#define	ANER	0x0148	/* Auto Negotiation Expansion register */
+#define	NWAYT	0x014a	/* Nway test register */
+#define	CSCR	0x014c	/* CS configuration register */
+#define	CRC0	0x014e	/* Power management register for wakeup frame0 */
+#define	CRC1	0x0150	/* Power management register for wakeup frame1 */
+#define	CRC2	0x0152	/* Power management register for wakeup frame2 */
+#define	CRC3	0x0154	/* Power management register for wakeup frame3 */
+#define	CRC4	0x0156	/* Power management register for wakeup frame4 */
+#define	BYTEMASK0 0x0158	/* Power management wakeup frame0 bytemask */
+#define	BYTEMASK1 0x0160	/* Power management wakeup frame1 bytemask */
+#define	BYTEMASK2 0x0168	/* Power management wakeup frame2 bytemask */
+#define	BYTEMASK3 0x0170	/* Power management wakeup frame3 bytemask */
+#define	BYTEMASK4 0x0178	/* Power management wakeup frame4 bytemask */
+#define	PHY1	0x0180	/* PHY parameter 1 */
+#define	PHY2	0x0184	/* PHY parameter 2 */
+#define	TW1	0x0186	/* Twister parameter 1 */
+
+/*
+ * Bit field definitions.  Each *_BITS string is a bit-name table in the
+ * layout taken by the %b conversion of cmn_err().
+ */
+/* CR : Command register (uint8_t) */
+#define	CR_WEPROM	0x20	/* EEPROM write enable */
+#define	CR_SOFT_RST	0x10	/* Reset */
+#define	CR_RE		0x08	/* Ethernet receive enable */
+#define	CR_TE		0x04	/* Ethernet transmit enable */
+#define	CR_EP3CLREN	0x02	/* clear performance counter after EP3 */
+#define	CR_AUTOLOAD	0x01	/* autoload contents of 93c46 */
+
+#define	CR_BITS	"\020\006WEPROM\005SOFT_RST\004RE\003TE\002EP3CLREN\001AUTOLOAD"
+
+/* TCR: Transmit Configuration register */
+#define	TCR_TXRR	0xc0	/* Tx retry count */
+#define	TCR_TXRR_SHIFT		6
+#define	TCR_IFG		0x18	/* Interframe Gap */
+#define	TCR_IFG_SHIFT		3
+#define	TCR_IFG_802_3		(3 << TCR_IFG_SHIFT)	/* 802.3 standard */
+#define	TCR_NOCRC	0x01	/* Inhibit Appending CRC */
+	
+#define	TCR_BITS	"\020\001NOCRC"
+
+/* RCR: Receive Configuration register */
+#define	RCR_TAIL	0x0080	/* Rx header forward to host in CRC field */
+#define	RCR_AER		0x0040	/* Accept Error packet */
+#define	RCR_AR		0x0020	/* Accept runt */
+#define	RCR_AM		0x0010	/* Accept multicast */
+#define	RCR_AB		0x0008	/* Accept broadcast */
+#define	RCR_AD		0x0004	/* Accept physical match */
+#define	RCR_AAM		0x0002	/* Accept all Multicast */
+#define	RCR_AAP		0x0001	/* Accept all physical */
+
+#define	RCR_ACCEPT_MODE		\
+	(RCR_AER | RCR_AR | RCR_AM | RCR_AB | RCR_AD | RCR_AAM | RCR_AAP)
+
+#define	RCR_BITS	\
+	"\020\010TAIL\007AER\006AR\005AM\004AB\003AD\002AAM\001AAP"
+
+/* TSR: Transmit Status register */
+
+#define	TSR_ECOL	0x20	/* excessive collision indication */
+#define	TSR_LCOL	0x10	/* late collision indication */
+#define	TSR_LOSS_CRS	0x08	/* loss of carrier indication */
+#define	TSR_JBR		0x04	/* jabber time out indication */
+#define	TSR_BUF_EMPTY	0x02	/* Tx buffer is empty */
+#define	TSR_BUF_FULL	0x01	/* Tx buffer is full */
+
+#define	TSR_BITS	\
+	"\020"		\
+	"\006ECOL"	\
+	"\005LCOL"	\
+	"\004LOSS_CRS"	\
+	"\003JBR"	\
+	"\002BUF_EMPTY"	\
+	"\001BUF_FULL"
+
+/* RSR: Receive status register in Rx packet field */
+#define	RSR_WEVENT	0x80	/* Wakeup event indication */
+#define	RSR_RX_BUF_FULL	0x40	/* Receive buffer full indication */
+#define	RSR_LKCHG	0x20	/* Link change indication */
+#define	RSR_RUNT	0x10	/* short packet indication */
+#define	RSR_LONG	0x08	/* Long packet indication */
+#define	RSR_CRC		0x04	/* CRC error indication */
+#define	RSR_FAE		0x02	/* Frame alignment error */
+#define	RSR_ROK		0x01	/* Receive OK indication */
+
+#define	RSR_ERRS	(RSR_RUNT | RSR_LONG | RSR_CRC | RSR_FAE)
+#define	RSR_BITS	\
+	"\020"		\
+	"\010WEVENT"	\
+	"\007RX_BUF_FULL"	\
+	"\006LKCHG"	\
+	"\005RUNT"	\
+	"\004LONG"	\
+	"\003CRC"	\
+	"\002FAE"	\
+	"\001ROK"
+
+/* CON0: Configuration register 0 */
+
+#define	CON0_SUSLED	0x80
+#define	CON0_PARM_EN	0x40	/* parameter enable */
+#define	CON0_LDPS	0x08
+#define CON0_MSEL	0x04	/* media select 1:MII, 0:auto */
+#define CON0_LEDS	0x03	/* LED pattern */
+
+/* CON1: Configuration register 1 -- NOTE(review): bits below are named CON0_*, confirm */
+#define CON0_BWF	0x40	/* Broadcast wakeup function 1:on 0:off */
+#define CON0_MWF	0x20	/* Multicast wakeup function 1:on 0:off */
+#define CON0_UWF	0x10	/* Unicast wakeup function 1:on 0:off */
+#define CON0_LONGWF1	0x02	/* undocumented here -- TODO see data sheet */
+#define CON0_LONGWF0	0x01	/* undocumented here -- TODO see data sheet */
+
+
+/* MSR : Media Status register */
+#define	MSR_TXFCE	0x80	/* Tx Flow control enable */
+#define	MSR_RXFCE	0x40	/* Rx Flow control enable */
+#define	MSR_DUPLEX	0x10	/* full duplex */
+#define	MSR_SPEED_100	0x08	/* 100Mbps mode */
+#define	MSR_LINK	0x04	/* link status */
+#define	MSR_TXPF	0x02	/* 8150 sends pause packet */
+#define	MSR_RXPF	0x01	/* 8150 is in backoff state */
+
+#define	MSR_BITS	\
+	"\020"		\
+	"\010TXFCE"	\
+	"\007RXFCE"	\
+	"\005DUPLEX"	\
+	"\004SPEED_100"	\
+	"\003LINK"	\
+	"\002TXPF"	\
+	"\001RXPF"
+
+/* PHYADD: MII PHY Address */
+#define	PHYADD_MASK	0x1f
+
+/* PHYCNT: PHY control register */
+#define	PHYCNT_OWN	0x40	/* 8150 owns:1 not owns:0 */
+#define	PHYCNT_RWCR	0x20	/* write:1 read:0 */
+#define	PHYCNT_PHYOFF	0x1f
+
+/* BMCR (almost same with MII_CONTROL register) */
+#define	BMCR_RESET	0x8000	/* PHY reset */
+#define	BMCR_Spd_Set	0x2000	/* 100Mbps */
+#define	BMCR_ANE	0x1000	/* auto negotiation enable */
+#define	BMCR_RSA	0x0200	/* restart auto negotiation */
+#define	BMCR_duplex	0x0100	/* full duplex (as in MII_CONTROL) -- TODO confirm */
+
+/* Basic mode status register */
+/* Auto-negotiation Advertisement register */
+/* Auto-negotiation Link Partner Ability register */
+/* Auto-negotiation Expansion register */
+
+/* Nway test register */
+#define	NWAYT_NWLPBK	0x0080
+#define	NWAYT_ENNWLE	0x0008
+#define	NWAYT_FLAGABD	0x0004
+#define	NWAYT_FLAGPDF	0x0002
+#define	NWAYT_FLAGLSC	0x0001
+
+/* CS configuration register -- most bits undocumented here, see data sheet */
+#define	CS_TESTFUN	0x8000	/* */
+#define	CS_LD		0x0200	/* */
+#define	CS_HEARTBEAT	0x0100	/* */
+#define	CS_JBEN		0x0080	/* */
+#define	CS_F_LINK100	0x0040	/* */
+#define	CS_F_CONNECT	0x0020	/* */
+#define	CS_CON_STATUS	0x0008	/* */
+#define	CS_CON_STATUS_EN 0x0004	/* */
+#define	CS_PASS_SCR	0x0001	/* bypass scramble function */
+
+/*
+ * Header format of rx packet: four flag bits above a 12 bit byte count
+ */
+#define	RXHD_MULT	0x8000	/* multicast packet */
+#define	RXHD_PHYS	0x4000	/* physical match packet */
+#define	RXHD_RUNT	0x2000	/* too short */
+#define	RXHD_VALID	0x1000	/* packet is ok */
+#define	RXHD_BYTECNT	0x0fff	/* rx byte count */
+
+#define	RXHD_BITS	\
+	"\020"		\
+	"\020MULT"	\
+	"\017PHYS"	\
+	"\016RUNT"	\
+	"\015VALID"
+/*
+ * Offsets to EEPROM contents
+ */
+#define	URF_EEPROM_BASE		0x1200
+#define	EPROM_EthernetID	0x0002
diff --git a/usr/src/uts/common/io/urf/urf_usbgem.c b/usr/src/uts/common/io/urf/urf_usbgem.c
new file mode 100644
index 0000000000..f61c8e3502
--- /dev/null
+++ b/usr/src/uts/common/io/urf/urf_usbgem.c
@@ -0,0 +1,1039 @@
+/*
+ * urf_usbgem.c : Realtek RTL8150 USB to Fast Ethernet Driver for Solaris
+ *
+ * Copyright (c) 2003-2012 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#pragma ident   "%W% %E%"
+
+/*
+ *  Changelog:
+ */
+
+/*
+ * TODO
+ */
+/* ======================================================= */
+
+/*
+ * Solaris system header files and macros
+ */
+
+/* minimum kernel headers for drivers */
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/byteorder.h>
+
+/* ethernet stuff */
+#include <sys/ethernet.h>
+
+/* interface card depend stuff */
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strlog.h>
+#include <sys/usb/usba.h>
+#include "usbgem.h"
+#include "usbgem_mii.h"
+#include "rtl8150reg.h"
+
+char	ident[] = "rtl8150 usbnic driver v" VERSION;
+
+/*
+ * Useful macros
+ */
+#define	ROUNDUP2(x, y)	(((x)+(y)-1) & ~((y)-1))
+#define	CHECK_AND_JUMP(err, label)	if (err != USB_SUCCESS) goto label
+
+/*
+ * Debugging
+ */
+#ifdef DEBUG_LEVEL
+static int urf_debug = DEBUG_LEVEL;
+#define	DPRINTF(n, args)	if (urf_debug > (n)) cmn_err args
+#else
+#define	DPRINTF(n, args)
+#endif
+
+/*
+ * Our configuration for rtl8150
+ */
+/* timeouts */
+#define	ONESEC			(drv_usectohz(1*1000000))
+
+/*
+ * Local device definitions
+ */
+struct chip_info {	/* one entry per supported chip variant */
+	int		flags;	/* per-chip flags -- TODO confirm encoding */
+	char		*name;	/* chip name string */
+	int		type;	/* chip type code -- TODO confirm values */
+};
+
+#define	CHIPTABLESIZE	(sizeof (chiptbl_8150) / sizeof (struct chip_info))	/* entries in chiptbl_8150 */
+
+struct urf_dev {	/* driver private state, reached through dp->private */
+	/*
+	 * Misc HW information
+	 */
+	struct chip_info	*chip;	/* presumably the matching chip table entry */
+	uint8_t			cr;	/* value last written to CR */
+	uint8_t			tsr;	/* NOTE(review): looks like a TSR copy -- confirm */
+	uint16_t		rcr;	/* value last written to RCR */
+	uint8_t			txok_cnt;	/* NOTE(review): use not visible here */
+};
+
+/*
+ * private functions
+ */
+
+/* mii operations */
+static uint16_t  urf_mii_read(struct usbgem_dev *, uint_t, int *errp);
+static void urf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp);
+
+/* nic operations */
+static int urf_attach_chip(struct usbgem_dev *);
+static int urf_reset_chip(struct usbgem_dev *);
+static int urf_init_chip(struct usbgem_dev *);
+static int urf_start_chip(struct usbgem_dev *);
+static int urf_stop_chip(struct usbgem_dev *);
+static int urf_set_media(struct usbgem_dev *);
+static int urf_set_rx_filter(struct usbgem_dev *);
+static int urf_get_stats(struct usbgem_dev *);
+
+/* packet operations */
+static mblk_t *urf_tx_make_packet(struct usbgem_dev *, mblk_t *);
+static mblk_t *urf_rx_make_packet(struct usbgem_dev *, mblk_t *);
+
+/* =============================================================== */
+/*
+ * I/O functions
+ */
+/* =============================================================== */
+#define	OUTB(dp, p, v, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ USB_REQ_SET_ADDRESS,	\
+	/* wValue */   (p),	\
+	/* wIndex */   0,	\
+	/* wLength */  1,	\
+	/* value */   (v))) != USB_SUCCESS) goto label
+
+#define	OUTW(dp, p, v, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ USB_REQ_SET_ADDRESS,	\
+	/* wValue */   (p),	\
+	/* wIndex */   0,	\
+	/* wLength */  2,	\
+	/* value */   (v))) != USB_SUCCESS) goto label
+
+#define	OUTS(dp, p, buf, len, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_out((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ USB_REQ_SET_ADDRESS,	\
+	/* wValue */   (p),	\
+	/* wIndex */   0,	\
+	/* wLength */  (len),	\
+	/* value */    (buf),	\
+	/* size */     (len))) != USB_SUCCESS) goto label
+
+#define	IN(dp, p, vp, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in_val((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ USB_REQ_SET_ADDRESS,	\
+	/* wValue */  (p),	\
+	/* wIndex */  0,	\
+	/* wLength */ sizeof ((*vp)),	\
+	/* valuep */  (vp))) != USB_SUCCESS) goto label
+
+#define	INS(dp, p, buf, len, errp, label)	\
+	if ((*(errp) = usbgem_ctrl_in((dp), 	\
+	/* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST	\
+		    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,	\
+	/* bRequest */ USB_REQ_SET_ADDRESS,	\
+	/* wValue */   (p),	\
+	/* wIndex */   0,	\
+	/* wLength */  (len),	\
+	/* valuep */  (buf),	\
+	/* size   */  (len))) != USB_SUCCESS) goto label
+
+/* =============================================================== */
+/*
+ * variables
+ */
+/* =============================================================== */
+static int urf_ppa = 0;
+
+/* =============================================================== */
+/*
+ * Hardware manipulation
+ */
+/* =============================================================== */
+/*
+ * Soft-reset the chip: clear the cached command register value, write
+ * CR_SOFT_RST and poll CR up to 100 times until the bit reads back as
+ * zero.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE on a timeout or a USB error.
+ */
+static int
+urf_reset_chip(struct usbgem_dev *dp)
+{
+	struct urf_dev	*lp = dp->private;
+	uint8_t		cmd;
+	int		err;
+	int		tries;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	lp->cr = 0;
+	OUTB(dp, CR, lp->cr | CR_SOFT_RST, &err, usberr);
+
+	/* the reset bit reads as zero once the reset has completed */
+	for (tries = 100; tries > 0; tries--) {
+		IN(dp, CR, &cmd, &err, usberr);
+		if (!(cmd & CR_SOFT_RST)) {
+			return (USB_SUCCESS);
+		}
+	}
+
+	/* time out */
+	cmn_err(CE_WARN, "%s: failed to reset: timeout", dp->name);
+	return (USB_FAILURE);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * Setup rtl8150: enable the transmitter and receiver, program the
+ * transmit configuration and leave the receive filter closed.  The mac
+ * address and the multicast table are loaded later by
+ * urf_set_rx_filter().
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE if a register write failed.
+ */
+static int
+urf_init_chip(struct usbgem_dev *dp)
+{
+	struct urf_dev	*lp = dp->private;
+	int		err;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* ID and multicast registers: set later by urf_set_rx_filter */
+
+	/* Command register : Enable Tx and Rx before writing TCR and RCR */
+	lp->cr = lp->cr | CR_RE | CR_TE;
+	OUTB(dp, CR, lp->cr, &err, usberr);
+
+	/* Transmit configuration register : */
+	OUTB(dp, TCR, TCR_IFG_802_3, &err, usberr);
+
+	/* Receive configuration register :  disable rx filter */
+	lp->rcr = RCR_TAIL | RCR_AER | RCR_AR;
+	OUTW(dp, RCR, lp->rcr, &err, usberr);
+#ifdef notdef
+	/* Media status register */
+	err = urf_set_media(dp);
+	CHECK_AND_JUMP(err, usberr);
+#endif
+	/* Configuration register 0: no need to change */
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__));
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * Start callback: there is nothing to do here, as Tx and Rx are already
+ * enabled by urf_init_chip().  Always returns USB_SUCCESS.
+ */
+static int
+urf_start_chip(struct usbgem_dev *dp)
+{
+	return (USB_SUCCESS);
+}
+
+/*
+ * Stop callback: stopping is done by resetting the chip, so this
+ * returns whatever urf_reset_chip() returns.
+ */
+static int
+urf_stop_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	rv = urf_reset_chip(dp);
+	return (rv);
+}
+
+/*
+ * Statistics callback: nothing is read from the chip here.  Always
+ * returns USB_SUCCESS.
+ */
+static int
+urf_get_stats(struct usbgem_dev *dp)
+{
+	return (USB_SUCCESS);
+}
+
+/*
+ * Multicast hash callback: returns usbgem_ether_crc_be() of the address.
+ * urf_set_rx_filter() uses the top six bits of the value as the index
+ * into the 64 bit multicast hash table.
+ */
+static uint_t
+urf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr)
+{
+	uint_t	crc;
+
+	crc = usbgem_ether_crc_be(addr);
+	return (crc);
+}
+
+/*
+ * Program the receive filter from dp->rxmode and the multicast list.
+ *
+ * Broadcast and physical-match frames are always accepted.  Promiscuous
+ * mode adds all-physical and all-multicast; RXMODE_ALLMULTI, or more
+ * than 32 multicast addresses, selects all-multicast; otherwise a 64 bit
+ * hash table is built from the top six bits of each address hash.  The
+ * mac address is (re)written to IDR as well.
+ *
+ * The accept bits kept in lp->rcr are replaced, not merely or-ed in, so
+ * that leaving promiscuous or all-multicast mode really narrows the
+ * filter again.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE if a register access failed.
+ */
+static int
+urf_set_rx_filter(struct usbgem_dev *dp)
+{
+	int		i;
+	uint16_t	mode;
+	uint8_t		mhash[8];
+	int		err;
+#if DEBUG_LEVEL > 2
+	uint16_t	rcr;
+#endif
+	struct urf_dev	*lp = dp->private;
+	/* every RCR bit that selects which frames are accepted */
+	const uint16_t	filter_bits =
+	    RCR_AB | RCR_AD | RCR_AAM | RCR_AAP | RCR_AM;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x",
+	    dp->name, __func__, dp->rxmode));
+
+	if (lp->rcr & filter_bits) {
+		/* receive all packets while we change rx filter */
+		OUTW(dp, RCR, lp->rcr | RCR_AAM | RCR_AAP, &err, usberr);
+	}
+
+	mode = RCR_AB	/* accept broadcast */
+	    | RCR_AD;	/* accept physical match  */
+	bzero(mhash, sizeof (mhash));
+
+	if (dp->rxmode & RXMODE_PROMISC) {
+		/* promiscuous mode implies all multicast and all physical */
+		mode |= RCR_AAM | RCR_AAP;
+	} else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 64/2) {
+		/* accept all multicast packets */
+		mode |= RCR_AAM;
+	} else if (dp->mc_count > 0) {
+		/*
+		 * make hash table to select interesting
+		 * multicast address only.
+		 */
+		mode |= RCR_AM;
+		for (i = 0; i < dp->mc_count; i++) {
+			uint_t	h;
+			/* hash table is 64 = 2^6 bit width */
+			h = dp->mc_list[i].hash >> (32 - 6);
+			mhash[h / 8] |= 1 << (h % 8);
+		}
+	}
+
+	/* drop the accept bits of the previous mode before adding ours */
+	lp->rcr = (uint16_t)((lp->rcr & ~filter_bits) | mode);
+
+	/* set mac address */
+	OUTS(dp, IDR, dp->cur_addr.ether_addr_octet, ETHERADDRL, &err, usberr);
+
+	/* set multicast hash table */
+	if (mode & RCR_AM) {
+		/* need to set up multicast hash table */
+		OUTS(dp, MAR, mhash, sizeof (mhash), &err, usberr);
+	}
+
+	OUTW(dp, RCR, lp->rcr, &err, usberr);
+
+#if DEBUG_LEVEL > 2
+	IN(dp, RCR, &rcr, &err, usberr);
+	cmn_err(CE_CONT, "!%s: %s: rcr:%b returned",
+	    dp->name, __func__, rcr, RCR_BITS);
+#endif
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * urf_set_media: apply the current media settings to the chip.
+ *
+ * Duplex and speed need no programming here; only the tx/rx flow control
+ * enable bits in MSR are brought in line with dp->flow_control.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE when a register access fails.
+ */
+static int
+urf_set_media(struct usbgem_dev *dp)
+{
+	struct urf_dev	*lp = dp->private;
+	int		err;
+	uint8_t		cur_msr;
+	uint8_t		want_msr;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* duplex and speed: nothing to select */
+
+	/* fetch the flow control bits currently in effect */
+	IN(dp, MSR, &cur_msr, &err, usberr);
+
+	/* start from "no pause in either direction" */
+	want_msr = cur_msr & ~(MSR_TXFCE | MSR_RXFCE);
+
+	if (dp->flow_control == FLOW_CONTROL_SYMMETRIC ||
+	    dp->flow_control == FLOW_CONTROL_TX_PAUSE) {
+		want_msr |= MSR_TXFCE;
+	}
+	if (dp->flow_control == FLOW_CONTROL_SYMMETRIC ||
+	    dp->flow_control == FLOW_CONTROL_RX_PAUSE) {
+		want_msr |= MSR_RXFCE;
+	}
+
+	/* write the register only when the setting really changes */
+	if (want_msr != cur_msr) {
+		OUTB(dp, MSR, want_msr, &err, usberr);
+	}
+	DPRINTF(2, (CE_CONT, "!%s: %s: returned", dp->name, __func__));
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__);
+	return (USB_FAILURE);
+}
+
+/*
+ * send/receive packet check
+ */
+/*
+ * urf_tx_make_packet: prepare a message for the bulk-out pipe.
+ *
+ * A message is handed back untouched when it is a single mblk of at
+ * least ETHERMIN bytes whose length is not a multiple of 64.  Otherwise
+ * its contents are gathered into a newly allocated contiguous mblk,
+ * zero padded to ETHERMIN, and made one byte longer if the length would
+ * be a multiple of 64.  NULL is returned when the allocation fails; the
+ * original message is never freed here.
+ */
+static mblk_t *
+urf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	size_t		pktlen;
+	size_t		fraglen;
+	mblk_t		*copy;
+	mblk_t		*frag;
+	uint8_t		*dst;
+
+	pktlen = msgdsize(mp);
+
+	if (pktlen >= ETHERMIN && mp->b_cont == NULL &&
+	    (pktlen & 0x3f) != 0) {
+		/* the message can be sent as it is */
+		return (mp);
+	}
+
+	/* the message must be re-allocated */
+	pktlen = max(pktlen, ETHERMIN);
+	if ((pktlen & 0x3f) == 0) {
+		/* workaround for buggy USB hba */
+		pktlen++;
+	}
+
+	copy = allocb(pktlen, 0);
+	if (copy == NULL) {
+		return (NULL);
+	}
+	copy->b_wptr = copy->b_rptr + pktlen;
+	dst = copy->b_rptr;
+
+	/* gather every fragment of the original message */
+	for (frag = mp; frag != NULL; frag = frag->b_cont) {
+		fraglen = frag->b_wptr - frag->b_rptr;
+		bcopy(frag->b_rptr, dst, fraglen);
+		dst += fraglen;
+	}
+
+	/* clear the padding, if any is left behind the payload */
+	if (dst < copy->b_wptr) {
+		bzero(dst, copy->b_wptr - dst);
+	}
+
+	return (copy);
+}
+
+/*
+ * urf_dump_packet: debug helper which prints n bytes starting at bp,
+ * eight bytes per line, through cmn_err().
+ *
+ * Each row is staged in a zeroed local buffer so that a final partial
+ * row never reads beyond bp + n; the missing bytes are shown as 00.
+ */
+static void
+urf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n)
+{
+	int	i;
+	int	cnt;
+	uint8_t	row[8];
+
+	for (i = 0; i < n; i += 8, bp += 8) {
+		/* number of valid bytes left for this row */
+		cnt = n - i;
+		if (cnt > (int)sizeof (row)) {
+			cnt = (int)sizeof (row);
+		}
+		bzero(row, sizeof (row));
+		bcopy(bp, row, cnt);
+
+		cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x",
+		    row[0], row[1], row[2], row[3],
+		    row[4], row[5], row[6], row[7]);
+	}
+}
+
+/*
+ * urf_rx_make_packet: validate a buffer received from the bulk-in pipe.
+ *
+ * The Rx header is read from the last four bytes of the buffer.  The
+ * packet is rejected (NULL is returned and counters in dp->stats are
+ * incremented) when the message is shorter than ETHERMIN + ETHERFCSL or
+ * when RXHD_VALID is not set in the header.  Otherwise the trailing
+ * ETHERFCSL bytes are trimmed and mp is returned.
+ * The message is not freed here when it is rejected.
+ */
+static mblk_t *
+urf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp)
+{
+	uint8_t		*p;
+	uint16_t	rxhd;
+	uint_t		len;
+
+	ASSERT(mp != NULL);
+	len = msgdsize(mp);
+#ifdef DEBUG_LEVEL
+	DPRINTF(2, (CE_CONT, "!%s: time:%d %s: len:%d cont:%p",
+	    dp->name, ddi_get_lbolt(), __func__, len, mp->b_cont));
+
+	if (urf_debug > 2) {
+		/*
+		 * NOTE(review): at least 6 bytes are dumped even when the
+		 * message is shorter than that - confirm this is intended.
+		 */
+		urf_dump_packet(dp, mp->b_rptr, max(6, len));
+	}
+#endif
+	if (len < ETHERMIN + ETHERFCSL) {
+		/* Too short */
+		dp->stats.runt++;
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+
+	/* get Rx header which is placed at tail of the packet. */
+	p = mp->b_wptr - 4;
+	/* the header is assembled low byte first */
+	rxhd = (p[1] << 8) | p[0];
+	/* byte count reported by the chip; only used for debug output */
+	len = rxhd & RXHD_BYTECNT;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d",
+	    dp->name, __func__, rxhd, RXHD_BITS, len));
+
+	/* check if an error happened */
+	if ((rxhd & (RXHD_VALID)) == 0) {
+		DPRINTF(-1, (CE_CONT, "!%s: %s: rxhd:%b",
+		    dp->name, __func__, rxhd, RXHD_BITS));
+		if (rxhd & RXHD_RUNT) {
+			dp->stats.runt++;
+		}
+
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+#ifdef notdef
+	/* check packet size */
+	if (len > ETHERMAX + ETHERFCSL) {
+		/* too long */
+		dp->stats.frame_too_long++;
+		dp->stats.errrcv++;
+		return (NULL);
+	} else if (len < ETHERMIN + ETHERFCSL) {
+		dp->stats.runt++;
+		dp->stats.errrcv++;
+		return (NULL);
+	}
+#endif
+	/* remove trailing crc field */
+	mp->b_wptr -= ETHERFCSL;
+	return (mp);
+}
+
+/*
+ * MII Interfaces
+ */
+/*
+ * urf_mii_read: read an MII register through the chip's own registers.
+ *
+ * index is translated to one of BMCR, BMSR, ANAR, ANLP or ANER; any
+ * other index yields 0 with *errp left at USB_SUCCESS.  For MII_STATUS
+ * the value is patched: MII_STATUS_MFPRMBLSUPR is always set and
+ * MII_STATUS_LINKUP mirrors MSR_LINK.
+ *
+ * Returns the register value, or 0 when a register access fails (the
+ * error code is then left in *errp).
+ */
+static uint16_t
+urf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp)
+{
+	int		reg;
+	uint16_t	val;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d",
+	    dp->name, __func__, index));
+
+	*errp = USB_SUCCESS;
+
+	/* translate the MII register number into a chip register */
+	switch (index) {
+	case MII_CONTROL:
+		reg = BMCR;
+		break;
+
+	case MII_STATUS:
+		reg = BMSR;
+		break;
+
+	case MII_AN_ADVERT:
+		reg = ANAR;
+		break;
+
+	case MII_AN_LPABLE:
+		reg = ANLP;
+		break;
+
+	case MII_AN_EXPANSION:
+		reg = ANER;
+		break;
+
+	default:
+		/* registers without a counterpart read as zero */
+		return (0);
+	}
+
+	IN(dp, reg, &val, errp, usberr);
+
+	if (index == MII_STATUS) {
+		uint8_t	msr;
+		/*
+		 * Fix MII status register as it doesn't have LINKUP and
+		 * MFPRMBLSUPR bits.
+		 */
+		IN(dp, MSR, &msr, errp, usberr);
+
+		val |= (MII_STATUS_MFPRMBLSUPR | MII_STATUS_LINKUP);
+		if ((msr & MSR_LINK) == 0) {
+			val &= ~MII_STATUS_LINKUP;
+		}
+	}
+
+	return (val);
+
+usberr:
+	cmn_err(CE_CONT,
+	    "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp);
+
+	return (0);
+}
+
+/*
+ * urf_mii_write: write an MII register through the chip's own registers.
+ *
+ * index is translated to one of BMCR, BMSR, ANAR, ANLP or ANER; a write
+ * to any other index is silently ignored with *errp left at USB_SUCCESS.
+ * The status of the register access is reported only through *errp.
+ */
+static void
+urf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp)
+{
+	int	reg;
+
+	DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__));
+
+	*errp = USB_SUCCESS;
+
+	/* translate the MII register number into a chip register */
+	switch (index) {
+	case MII_CONTROL:
+		reg = BMCR;
+		break;
+
+	case MII_STATUS:
+		reg = BMSR;
+		break;
+
+	case MII_AN_ADVERT:
+		reg = ANAR;
+		break;
+
+	case MII_AN_LPABLE:
+		reg = ANLP;
+		break;
+
+	case MII_AN_EXPANSION:
+		reg = ANER;
+		break;
+
+	default:
+		return;
+	}
+
+	OUTW(dp, reg, val, errp, usberr);
+	/* success and failure both end here; the caller inspects *errp */
+usberr:
+	;
+}
+
+/* ======================================================== */
+/*
+ * OS-dependent (device driver DKI) routines
+ */
+/* ======================================================== */
+/*
+ * urf_eeprom_dump: debug helper which prints `size' bytes of the eeprom
+ * area starting at URF_EEPROM_BASE, four 16-bit words per line.
+ * The dump stops silently at the first failing register access.
+ */
+static void
+urf_eeprom_dump(struct usbgem_dev *dp, int size)
+{
+	int		i;
+	int		err;
+	uint16_t	w0, w1, w2, w3;
+
+	cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name);
+	/* each iteration covers 8 bytes, read as four words */
+	for (i = URF_EEPROM_BASE; i < size + URF_EEPROM_BASE; i += 8) {
+		IN(dp, i + 0, &w0, &err, usberr);
+		IN(dp, i + 2, &w1, &err, usberr);
+		IN(dp, i + 4, &w2, &err, usberr);
+		IN(dp, i + 6, &w3, &err, usberr);
+		/* the printed offset is relative to the eeprom base */
+		cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x",
+		    i - URF_EEPROM_BASE, w0, w1, w2, w3);
+	}
+usberr:
+	;
+}
+
+/*
+ * urf_attach_chip: one-time chip setup performed while attaching.
+ *
+ * 1. Bring the PAUSE bit (0x04) of eeprom offset 9 in line with the
+ *    configured flow control, rewriting the eeprom word only when the
+ *    bit actually changes.
+ * 2. Trigger CR_AUTOLOAD and poll CR (up to 100 reads) until the chip
+ *    clears the bit.
+ * 3. Read the factory mac address from the ID registers into
+ *    dp->dev_addr.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE on a register access error or
+ * when the autoload does not complete.
+ */
+static int
+urf_attach_chip(struct usbgem_dev *dp)
+{
+	int		i;
+	uint8_t		old;
+	uint_t		new;
+	uint8_t		reg;
+	int		err;
+	struct urf_dev	*lp = dp->private;
+
+	/*
+	 * setup flow control bit in eeprom
+	 */
+	IN(dp, URF_EEPROM_BASE + 9, &old, &err, usberr);
+
+	DPRINTF(0, (CE_CONT, "!%s: eeprom offset 9: %02x", dp->name, old));
+
+	if (dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE) {
+		/* enable PAUSE bit */
+		new = old | 0x04;
+	} else {
+		/* clear PAUSE bit */
+		new = old & ~0x04;
+	}
+	if (new != old) {
+		/*
+		 * make eeprom writable
+		 * NOTE(review): if one of the accesses below fails, the
+		 * jump to usberr skips the write which clears CR_WEPROM
+		 * again - confirm whether that matters.
+		 */
+		OUTB(dp, CR, lp->cr | CR_WEPROM, &err, usberr);
+
+		/* eeprom allows only word access for writing */
+		IN(dp, URF_EEPROM_BASE + 8, &reg, &err, usberr);
+		/* offset 9 becomes the high byte, offset 8 the low byte */
+		new = (new << 8) | reg;
+
+		OUTW(dp, URF_EEPROM_BASE + 8, new, &err, usberr);
+
+		/* make eeprom non-writable */
+		OUTB(dp, CR, lp->cr, &err, usberr);
+	}
+
+	/*
+	 * load EEPROM contents into nic
+	 */
+	OUTB(dp, CR, lp->cr | CR_AUTOLOAD, &err, usberr);
+	CHECK_AND_JUMP(err, usberr);
+
+	/* wait until the chip clears CR_AUTOLOAD by itself */
+	for (i = 0; i < 100; i++) {
+		IN(dp, CR, &reg, &err, usberr);
+		if ((reg & CR_AUTOLOAD) == 0) {
+			goto autoload_done;
+		}
+	}
+	/* timeout */
+	cmn_err(CE_WARN, "%s: %s: failed to autoload: timeout",
+	    dp->name, __func__);
+	goto usberr;
+
+autoload_done:
+	/*
+	 * mac address in EEPROM has loaded to ID registers.
+	 */
+	INS(dp, IDR, dp->dev_addr.ether_addr_octet, ETHERADDRL, &err, usberr);
+
+	/* no need to scan phy */
+	dp->mii_phy_addr = -1;
+
+#if DEBUG_LEVEL > 2
+	urf_eeprom_dump(dp, 0x80);
+#endif
+
+#ifdef CONFIG_VLAN
+	dp->misc_flag = USBGEM_VLAN;
+#endif
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "%s: urf_attach_chip: usb error detected", dp->name);
+	return (USB_FAILURE);
+}
+
+/*
+ * urfattach: attach(9E) entry point of the urf driver.
+ *
+ * DDI_ATTACH: build a usbgem configuration describing this chip
+ * (limits, timeouts, chip/packet/MII methods), allocate the private
+ * soft state and hand both to usbgem_do_attach().  urf_ppa is advanced
+ * on success; the private state is released on failure.
+ * DDI_RESUME: delegated to usbgem_resume().
+ * Any other command fails.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+urfattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int			unit;
+	const char		*drv_name;
+	struct usbgem_dev	*dp;
+	struct usbgem_conf	*ugcp;
+	struct urf_dev		*lp;
+
+	unit =  ddi_get_instance(dip);
+	drv_name = ddi_driver_name(dip);
+
+	/* the arguments must follow the order of the conversions */
+	DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d",
+	    drv_name, unit, __func__, cmd));
+
+	if (cmd == DDI_ATTACH) {
+		/*
+		 * Check if the chip is supported.
+		 */
+
+		/*
+		 * Check the chip if it is really realtek rtl8150
+		 */
+
+		/*
+		 * construct usbgem configuration
+		 */
+		ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP);
+
+		/* name */
+		sprintf(ugcp->usbgc_name,
+		    "%s%d(ppa=%d)", drv_name, unit, urf_ppa);
+#ifdef USBGEM_CONFIG_GLDv3
+		ugcp->usbgc_ppa = urf_ppa;
+#else
+		ugcp->usbgc_ppa = unit;
+#endif
+		ugcp->usbgc_ifnum = 0;
+		ugcp->usbgc_alt = 0;
+
+		ugcp->usbgc_tx_list_max = 16;
+
+		/* the rx status partially replaces FCS */
+		ugcp->usbgc_rx_header_len = 0;
+		ugcp->usbgc_rx_list_max = 64;
+
+		/* time out parameters */
+		ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT;
+		ugcp->usbgc_tx_timeout_interval = ONESEC;
+
+		/* flow control */
+		ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE;
+
+		/* MII timeout parameters */
+		ugcp->usbgc_mii_link_watch_interval = ONESEC;
+		ugcp->usbgc_mii_an_watch_interval = ONESEC/5;
+		ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */
+		ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT;	/* 5 sec */
+		ugcp->usbgc_mii_an_wait = (25*ONESEC)/10;
+		ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT;
+
+		ugcp->usbgc_mii_an_delay = ONESEC/10;
+		ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA;
+		ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET;
+		ugcp->usbgc_mii_dont_reset = B_FALSE;
+
+		/* I/O methods */
+
+		/* mac operation */
+		ugcp->usbgc_attach_chip = &urf_attach_chip;
+		ugcp->usbgc_reset_chip = &urf_reset_chip;
+		ugcp->usbgc_init_chip = &urf_init_chip;
+		ugcp->usbgc_start_chip = &urf_start_chip;
+		ugcp->usbgc_stop_chip = &urf_stop_chip;
+		ugcp->usbgc_multicast_hash = &urf_mcast_hash;
+
+		ugcp->usbgc_set_rx_filter = &urf_set_rx_filter;
+		ugcp->usbgc_set_media = &urf_set_media;
+		ugcp->usbgc_get_stats = &urf_get_stats;
+#ifdef notdef
+		ugcp->usbgc_interrupt = &urf_interrupt;
+#else
+		ugcp->usbgc_interrupt = NULL;
+#endif
+		/* packet operation */
+		ugcp->usbgc_tx_make_packet = &urf_tx_make_packet;
+		ugcp->usbgc_rx_make_packet = &urf_rx_make_packet;
+
+		/* mii operations */
+		ugcp->usbgc_mii_probe = &usbgem_mii_probe_default;
+		ugcp->usbgc_mii_init = &usbgem_mii_init_default;
+		ugcp->usbgc_mii_config = &usbgem_mii_config_default;
+		ugcp->usbgc_mii_read = &urf_mii_read;
+		ugcp->usbgc_mii_write = &urf_mii_write;
+
+		/* mtu */
+		ugcp->usbgc_min_mtu = ETHERMTU;
+		ugcp->usbgc_max_mtu = ETHERMTU;
+		ugcp->usbgc_default_mtu = ETHERMTU;
+
+		lp = kmem_zalloc(sizeof (struct urf_dev), KM_SLEEP);
+		/*
+		 * No chip table lookup is performed above, so there is no
+		 * chip description to record.  Store NULL explicitly rather
+		 * than the value of a never-assigned local pointer.
+		 */
+		lp->chip = NULL;
+
+		ddi_set_driver_private(dip, NULL);
+
+		dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct urf_dev));
+
+		/* the configuration is not needed after the attach call */
+		kmem_free(ugcp, sizeof (*ugcp));
+
+		if (dp != NULL) {
+			urf_ppa++;
+			return (DDI_SUCCESS);
+		}
+
+		/* attach failed: release the private soft state */
+		kmem_free(lp, sizeof (struct urf_dev));
+		return (DDI_FAILURE);
+	}
+	if (cmd == DDI_RESUME) {
+		return (usbgem_resume(dip));
+	}
+	return (DDI_FAILURE);
+}
+
+/*
+ * urfdetach: detach(9E) entry point of the urf driver.
+ *
+ * DDI_DETACH is delegated to usbgem_do_detach() and gives back one ppa
+ * number on success; DDI_SUSPEND is delegated to usbgem_suspend().
+ * Any other command fails.
+ */
+static int
+urfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		if (usbgem_do_detach(dip) != DDI_SUCCESS) {
+			return (DDI_FAILURE);
+		}
+		urf_ppa--;
+		return (DDI_SUCCESS);
+
+	case DDI_SUSPEND:
+		return (usbgem_suspend(dip));
+
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/* ======================================================== */
+/*
+ * OS-dependent (loadable streams driver) routines
+ */
+/* ======================================================== */
+#ifdef USBGEM_CONFIG_GLDv3
+USBGEM_STREAM_OPS(urf_ops, urfattach, urfdetach);
+#else
+/* STREAMS module information referenced by both queue init tables */
+static	struct module_info urfminfo = {
+	0,			/* mi_idnum */
+	"urf",			/* mi_idname */
+	0,			/* mi_minpsz */
+	ETHERMTU,		/* mi_maxpsz */
+	ETHERMTU*128,		/* mi_hiwat */
+	1,			/* mi_lowat */
+};
+
+/* read side queue: no put procedure; service, open and close by usbgem */
+static	struct qinit urfrinit = {
+	(int (*)()) NULL,	/* qi_putp */
+	usbgem_rsrv,		/* qi_srvp */
+	usbgem_open,		/* qi_qopen */
+	usbgem_close,		/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&urfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+/* write side queue: put and service procedures by usbgem */
+static	struct qinit urfwinit = {
+	usbgem_wput,		/* qi_putp */
+	usbgem_wsrv,		/* qi_srvp */
+	(int (*)()) NULL,	/* qi_qopen */
+	(int (*)()) NULL,	/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&urfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+/* stream table tying the read and write queue init tables together */
+static struct streamtab	urf_info = {
+	&urfrinit,	/* st_rdinit */
+	&urfwinit,	/* st_wrinit */
+	NULL,		/* st_muxrinit */
+	NULL		/* st_muxwrinit */
+};
+
+/*
+ * character/block entry points: only the stream table and the property
+ * operation are provided, every other entry is a stub.
+ */
+static	struct cb_ops cb_urf_ops = {
+	nulldev,	/* cb_open */
+	nulldev,	/* cb_close */
+	nodev,		/* cb_strategy */
+	nodev,		/* cb_print */
+	nodev,		/* cb_dump */
+	nodev,		/* cb_read */
+	nodev,		/* cb_write */
+	nodev,		/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	&urf_info,	/* cb_stream */
+	D_NEW|D_MP	/* cb_flag */
+};
+
+/*
+ * device operations for the non-GLDv3 build: attach/detach are local,
+ * getinfo, power and quiesce are provided by usbgem.
+ */
+static	struct dev_ops urf_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	usbgem_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	urfattach,	/* devo_attach */
+	urfdetach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&cb_urf_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	usbgem_power,	/* devo_power */
+#if DEVO_REV >= 4
+	usbgem_quiesce, /* devo_quiesce */
+#endif
+
+};
+#endif
+
+/* module linkage information describing this module as a device driver */
+static struct modldrv modldrv = {
+	&mod_driverops,	/* Type of module.  This one is a driver */
+	ident,
+	&urf_ops,	/* driver ops */
+};
+
+/* linkage passed to mod_install(), mod_remove() and mod_info() */
+static struct modlinkage modlinkage = {
+	MODREV_1, &modldrv, NULL
+};
+
+/* ======================================================== */
+/*
+ * _init : done
+ */
+/* ======================================================== */
+/*
+ * Module load entry point: register the driver ops with usbgem first,
+ * then install the module; the registration is undone again when the
+ * installation fails.
+ */
+int
+_init(void)
+{
+	int	rv;
+
+	DPRINTF(2, (CE_CONT, "!urf: _init: called"));
+
+	rv = usbgem_mod_init(&urf_ops, "urf");
+	if (rv == DDI_SUCCESS) {
+		rv = mod_install(&modlinkage);
+		if (rv != DDI_SUCCESS) {
+			usbgem_mod_fini(&urf_ops);
+		}
+	}
+	return (rv);
+}
+
+/*
+ * _fini : done
+ */
+/*
+ * Module unload entry point: the usbgem registration is released only
+ * after the module has been removed successfully.
+ */
+int
+_fini(void)
+{
+	int	rv;
+
+	DPRINTF(2, (CE_CONT, "!urf: _fini: called"));
+
+	rv = mod_remove(&modlinkage);
+	if (rv != DDI_SUCCESS) {
+		return (rv);
+	}
+	usbgem_mod_fini(&urf_ops);
+	return (rv);
+}
+
+/* Module information entry point: answered by mod_info() */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/usbgem/usbgem.c b/usr/src/uts/common/io/usbgem/usbgem.c
new file mode 100644
index 0000000000..a42f7119ef
--- /dev/null
+++ b/usr/src/uts/common/io/usbgem/usbgem.c
@@ -0,0 +1,6389 @@
+/*
+ * usbgem.c: General USB to Fast Ethernet mac driver framework
+ *
+ * Copyright (c) 2002-2012 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#pragma	ident	"@(#)usbgem.c 1.6     12/02/09"
+
+/*
+ * Change log
+ */
+
+/*
+ * TODO:
+ * 	implement DELAYED_START
+ */
+
+/*
+ * System Header files.
+ */
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/vtrace.h>
+#include <sys/ethernet.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#ifndef USBGEM_CONFIG_GLDv3
+#include <sys/dlpi.h>
+#include <sys/strsubr.h>
+#endif
+#include <sys/stream.h>		/* required for MBLK* */
+#include <sys/strsun.h>		/* required for mionack() */
+#include <sys/byteorder.h>
+
+#include <sys/usb/usba.h>
+#ifdef USBGEM_CONFIG_GLDv3
+#include <inet/common.h>
+#include <inet/led.h>
+#include <inet/mi.h>
+#include <inet/nd.h>
+#endif
+
+/* supplement definitions */
+extern const char *usb_str_cr(usb_cr_t);
+
+#ifndef USBGEM_CONFIG_GLDv3
+#pragma weak	gld_linkstate
+#endif
+#include <sys/note.h>
+
+#include "usbgem_mii.h"
+#include "usbgem.h"
+
+#ifdef MODULE
+char	ident[] = "usb general ethernet mac driver v" VERSION;
+#else
+extern char	ident[];
+#endif
+
+/* Debugging support */
+#ifdef USBGEM_DEBUG_LEVEL
+static int usbgem_debug = USBGEM_DEBUG_LEVEL;
+#define	DPRINTF(n, args)	if (usbgem_debug > (n)) cmn_err args
+#else
+#define	DPRINTF(n, args)
+#endif
+
+/*
+ * Useful macros and typedefs
+ */
+#define	ROUNDUP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
+#define	DEFAULT_PIPE(dp)	((dp)->reg_data->dev_default_ph)
+#define	VTAG_SIZE	4
+#define	BOOLEAN(x)	((x) != 0)
+/*
+ * configuration parameters
+ */
+#define	USBDRV_MAJOR_VER	2
+#define	USBDRV_MINOR_VER	0
+
+#define	ETHERHEADERL	(sizeof (struct ether_header))
+#define	MAXPKTLEN(dp)	((dp)->mtu + ETHERHEADERL)
+#define	MAXPKTBUF(dp)	((dp)->mtu + ETHERHEADERL + ETHERFCSL)
+
+#define	WATCH_INTERVAL_FAST	drv_usectohz(100*1000)
+
+#define	STOP_GRACEFUL	B_TRUE
+
+/*
+ * Private functions
+ */
+static int usbgem_open_pipes(struct usbgem_dev *dp);
+static int usbgem_close_pipes(struct usbgem_dev *dp);
+static void usbgem_intr_cb(usb_pipe_handle_t, usb_intr_req_t *);
+static void usbgem_bulkin_cb(usb_pipe_handle_t, usb_bulk_req_t *);
+static void usbgem_bulkout_cb(usb_pipe_handle_t, usb_bulk_req_t *);
+
+static int usbgem_mii_start(struct usbgem_dev *);
+static void usbgem_mii_stop(struct usbgem_dev *);
+
+/* local buffer management */
+static int usbgem_init_rx_buf(struct usbgem_dev *);
+
+/* internal mac interfaces */
+static void usbgem_tx_timeout(struct usbgem_dev *);
+static void usbgem_mii_link_watcher(struct usbgem_dev *);
+static int usbgem_mac_init(struct usbgem_dev *);
+static int usbgem_mac_start(struct usbgem_dev *);
+static int usbgem_mac_stop(struct usbgem_dev *, int, boolean_t);
+static void usbgem_mac_ioctl(struct usbgem_dev *, queue_t *, mblk_t *);
+
+/* speed values by speed index - presumably in Mbps; TODO confirm */
+int usbgem_speed_value[] = {10, 100, 1000};
+
+static int usbgem_ctrl_retry = 5;
+
+/* usb event support */
+static int usbgem_disconnect_cb(dev_info_t *dip);
+static int usbgem_reconnect_cb(dev_info_t *dip);
+int usbgem_suspend(dev_info_t *dip);
+int usbgem_resume(dev_info_t *dip);
+
+/* ethernet broadcast address, used to classify outgoing packets */
+static uint8_t usbgem_bcastaddr[] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+#ifdef MODULE
+extern struct mod_ops mod_miscops;
+
+/* linkage information used when usbgem is built as a module by itself */
+static struct modlmisc modlmisc = {
+	&mod_miscops,
+	"usbgem v" VERSION,
+};
+
+/* linkage passed to mod_install(), mod_remove() and mod_info() */
+static struct modlinkage modlinkage = {
+	MODREV_1, &modlmisc, NULL
+};
+
+/*
+ * _init : done
+ */
+/* Module load entry point: installs the module and reports the result */
+int
+_init(void)
+{
+	DPRINTF(2, (CE_CONT, "!usbgem: _init: called"));
+
+	return (mod_install(&modlinkage));
+}
+
+/*
+ * _fini : done
+ */
+/* Module unload entry point: removes the module and reports the result */
+int
+_fini(void)
+{
+	DPRINTF(2, (CE_CONT, "!usbgem: _fini: called"));
+
+	return (mod_remove(&modlinkage));
+}
+
+/* Module information entry point: answered by mod_info() */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+#endif /* MODULE */
+
+/* ============================================================== */
+/*
+ * Ether CRC calculation utilities
+ */
+/* ============================================================== */
+/*
+ * Ether CRC calculation according to 21143 data sheet
+ */
+#define	CRC32_POLY_LE	0xedb88320
+/*
+ * usbgem_ether_crc_le: CRC of an ETHERADDRL byte ethernet address,
+ * computed with the reflected polynomial CRC32_POLY_LE.
+ * The register starts at all ones, every byte is fed lsb first and no
+ * final inversion is applied.
+ */
+uint32_t
+usbgem_ether_crc_le(const uint8_t *addr)
+{
+	uint32_t	crc = 0xffffffff;
+	uint_t		octet;
+	int		byte_ix;
+	int		nbits;
+
+	for (byte_ix = 0; byte_ix < ETHERADDRL; byte_ix++) {
+		octet = addr[byte_ix];
+		for (nbits = 8; nbits > 0; nbits--) {
+			if ((crc ^ octet) & 1) {
+				crc = (crc >> 1) ^ CRC32_POLY_LE;
+			} else {
+				crc >>= 1;
+			}
+			octet >>= 1;
+		}
+	}
+	return (crc);
+}
+
+#define	CRC32_POLY_BE	0x04c11db7
+/*
+ * usbgem_ether_crc_be: CRC of an ETHERADDRL byte ethernet address,
+ * computed with the polynomial CRC32_POLY_BE while shifting the
+ * register to the left.
+ * The register starts at all ones, every byte is fed lsb first and no
+ * final inversion is applied.
+ */
+uint32_t
+usbgem_ether_crc_be(const uint8_t *addr)
+{
+	uint32_t	crc = 0xffffffff;
+	uint_t		octet;
+	int		byte_ix;
+	int		nbits;
+
+	for (byte_ix = 0; byte_ix < ETHERADDRL; byte_ix++) {
+		octet = addr[byte_ix];
+		for (nbits = 8; nbits > 0; nbits--) {
+			if (((crc >> 31) ^ octet) & 1) {
+				crc = (crc << 1) ^ CRC32_POLY_BE;
+			} else {
+				crc <<= 1;
+			}
+			octet >>= 1;
+		}
+	}
+	return (crc);
+}
+
+/*
+ * usbgem_prop_get_int: look up an integer property of the device.
+ *
+ * prop_template is a printf-style template which receives dp->name as
+ * its only argument and yields the property name.  The name is built
+ * with a bounded format so that a long template or device name cannot
+ * overrun the local buffer; an over-long name is truncated, which
+ * simply makes the lookup fall back to def_val.
+ *
+ * Returns the property value, or def_val when the property is absent.
+ */
+int
+usbgem_prop_get_int(struct usbgem_dev *dp, char *prop_template, int def_val)
+{
+	char	propname[32];
+
+	(void) snprintf(propname, sizeof (propname),
+	    prop_template, dp->name);
+
+	return (ddi_prop_get_int(DDI_DEV_T_ANY, dp->dip,
+	    DDI_PROP_DONTPASS, propname, def_val));
+}
+
+/*
+ * usbgem_population: number of bits set in x (0 .. 32).
+ *
+ * Works entirely on the unsigned argument, which avoids shifting a
+ * signed 1 into the sign bit when bit 31 is examined.
+ */
+static int
+usbgem_population(uint32_t x)
+{
+	int	cnt;
+
+	for (cnt = 0; x != 0; cnt++) {
+		/* clear the lowest bit which is still set */
+		x &= x - 1;
+	}
+	return (cnt);
+}
+
+/*
+ * usbgem_timestamp_nz: current lbolt value which is never zero; a zero
+ * reading is reported as 1, so that 0 stays free for use as a marker.
+ */
+static clock_t
+usbgem_timestamp_nz()
+{
+	clock_t	ticks = ddi_get_lbolt();
+
+	if (ticks == 0) {
+		return ((clock_t)1);
+	}
+	return (ticks);
+}
+
+#ifdef USBGEM_DEBUG_LEVEL
+#ifdef USBGEM_DEBUG_VLAN
+#ifdef notdef
+#include <netinet/in.h>
+#endif
+/*
+ * usbgem_dump_packet: debug helper which logs a one line summary of a
+ * packet: ethernet addresses, optional vlan tag, ethertype, message
+ * length(s) and, for IPv4, the addresses, protocol and the tcp/udp
+ * length and checksum.  With check_cksum set, the tcp/udp checksum is
+ * additionally verified with ip_cksum() and reported as "ok" or "ng".
+ *
+ * Only the leading sizeof (buf) bytes of the message are examined; they
+ * are first copied into a local buffer so the headers can be parsed
+ * regardless of how the message is fragmented.
+ *
+ * NOTE(review): buf is not cleared, so for messages shorter than the
+ * headers being parsed the summary is built from stale bytes, and msg
+ * is filled with unbounded sprintf() calls - confirm this is acceptable
+ * for debug-only code.
+ */
+static void
+usbgem_dump_packet(struct usbgem_dev *dp, char *title, mblk_t *mp,
+    boolean_t check_cksum)
+{
+	char	msg[180];
+	uint8_t	buf[18+20+20];
+	uint8_t	*p;
+	size_t	offset;
+	uint_t	ethertype;
+	uint_t	proto;
+	uint_t	ipproto = 0;
+	uint_t	iplen;
+	uint_t	iphlen;
+	uint_t	tcplen;
+	uint_t	udplen;
+	uint_t	cksum;
+	int	rest;
+	int	len;
+	char	*bp;
+	mblk_t	*tp;
+	extern uint_t	ip_cksum(mblk_t *, int, uint32_t);
+
+	msg[0] = 0;
+	bp = msg;
+
+	/* gather the leading bytes of the message into buf */
+	rest = sizeof (buf);
+	offset = 0;
+	for (tp = mp; tp; tp = tp->b_cont) {
+		len = tp->b_wptr - tp->b_rptr;
+		len = min(rest, len);
+		bcopy(tp->b_rptr, &buf[offset], len);
+		rest -= len;
+		offset += len;
+		if (rest == 0) {
+			break;
+		}
+	}
+
+	offset = 0;
+	p = &buf[offset];
+
+	/* ethernet address: printed as source -> destination */
+	sprintf(bp,
+	    "ether: %02x:%02x:%02x:%02x:%02x:%02x"
+	    " -> %02x:%02x:%02x:%02x:%02x:%02x",
+	    p[6], p[7], p[8], p[9], p[10], p[11],
+	    p[0], p[1], p[2], p[3], p[4], p[5]);
+	bp = &msg[strlen(msg)];
+
+	/* vlan tag and ethertype */
+	ethertype = GET_ETHERTYPE(p);
+	if (ethertype == VTAG_TPID) {
+		sprintf(bp, " vtag:0x%04x", GET_NET16(&p[14]));
+		bp = &msg[strlen(msg)];
+
+		/* skip the tag; the real ethertype follows it */
+		offset += VTAG_SIZE;
+		p = &buf[offset];
+		ethertype = GET_ETHERTYPE(p);
+	}
+	sprintf(bp, " type:%04x", ethertype);
+	bp = &msg[strlen(msg)];
+
+	/* ethernet packet length, with the length of each fragment */
+	sprintf(bp, " mblklen:%d", msgdsize(mp));
+	bp = &msg[strlen(msg)];
+	if (mp->b_cont) {
+		sprintf(bp, "(");
+		bp = &msg[strlen(msg)];
+		for (tp = mp; tp; tp = tp->b_cont) {
+			if (tp == mp) {
+				sprintf(bp, "%d", tp->b_wptr - tp->b_rptr);
+			} else {
+				sprintf(bp, "+%d", tp->b_wptr - tp->b_rptr);
+			}
+			bp = &msg[strlen(msg)];
+		}
+		sprintf(bp, ")");
+		bp = &msg[strlen(msg)];
+	}
+
+	/* everything below applies to IPv4 packets only */
+	if (ethertype != ETHERTYPE_IP) {
+		goto x;
+	}
+
+	/* ip address */
+	offset += sizeof (struct ether_header);
+	p = &buf[offset];
+	ipproto = p[9];
+	iplen = GET_NET16(&p[2]);
+	sprintf(bp, ", ip: %d.%d.%d.%d -> %d.%d.%d.%d proto:%d iplen:%d",
+	    p[12], p[13], p[14], p[15],
+	    p[16], p[17], p[18], p[19],
+	    ipproto, iplen);
+	bp = (void *)&msg[strlen(msg)];
+
+	/* ip header length in bytes */
+	iphlen = (p[0] & 0xf) * 4;
+
+	/* cksum for pseudo header: both addresses and the protocol */
+	cksum = *(uint16_t *)&p[12];
+	cksum += *(uint16_t *)&p[14];
+	cksum += *(uint16_t *)&p[16];
+	cksum += *(uint16_t *)&p[18];
+	cksum += BE_16(ipproto);
+
+	/* tcp or udp protocol header */
+	offset += iphlen;
+	p = &buf[offset];
+	if (ipproto == IPPROTO_TCP) {
+		tcplen = iplen - iphlen;
+		sprintf(bp, ", tcp: len:%d cksum:%x",
+		    tcplen, GET_NET16(&p[16]));
+		bp = (void *)&msg[strlen(msg)];
+
+		if (check_cksum) {
+			cksum += BE_16(tcplen);
+			cksum = (uint16_t)ip_cksum(mp, offset, cksum);
+			sprintf(bp, " (%s)",
+			    (cksum == 0 || cksum == 0xffff) ? "ok" : "ng");
+			bp = (void *)&msg[strlen(msg)];
+		}
+	} else if (ipproto == IPPROTO_UDP) {
+		udplen = GET_NET16(&p[4]);
+		sprintf(bp, ", udp: len:%d cksum:%x",
+		    udplen, GET_NET16(&p[6]));
+		bp = (void *)&msg[strlen(msg)];
+
+		/* a udp checksum field of zero is not verified */
+		if (GET_NET16(&p[6]) && check_cksum) {
+			cksum += *(uint16_t *)&p[4];
+			cksum = (uint16_t)ip_cksum(mp, offset, cksum);
+			sprintf(bp, " (%s)",
+			    (cksum == 0 || cksum == 0xffff) ? "ok" : "ng");
+			bp = (void *)&msg[strlen(msg)];
+		}
+	}
+x:
+	cmn_err(CE_CONT, "!%s: %s: %s", dp->name, title, msg);
+}
+#endif /* USBGEM_DEBUG_VLAN */
+#endif /* USBGEM_DEBUG_LEVEL */
+
+#ifdef GEM_GCC_RUNTIME
+/*
+ * gcc3 runtime routines
+ */
+#pragma weak memcmp
+/*
+ * memcmp: compare the first n bytes of s1 and s2 as unsigned bytes.
+ * Returns the difference of the first pair of bytes which differ, or
+ * zero when all n bytes are equal.
+ */
+int
+memcmp(const void *s1, const void *s2, size_t n)
+{
+	const uint8_t	*a = s1;
+	const uint8_t	*b = s2;
+	int		diff;
+
+	while (n-- > 0) {
+		diff = (int)*a++ - (int)*b++;
+		if (diff != 0) {
+			return (diff);
+		}
+	}
+	return (0);
+}
+
+#pragma weak memset
+/*
+ * memset: fill the first n bytes of s with the byte value c and
+ * return s.  Zero fill is delegated to bzero().
+ */
+void *
+memset(void *s, int c, size_t n)
+{
+	uint8_t	*dst;
+	uint8_t	*end;
+
+	if ((c & 0xff) == 0) {
+		bzero(s, n);
+		return (s);
+	}
+
+	dst = s;
+	end = dst + n;
+	while (dst < end) {
+		*dst++ = c;
+	}
+	return (s);
+}
+
+#pragma weak _memcpy = memcpy
+#pragma weak memcpy
+/*
+ * memcpy: copy n bytes from s2 to s1 with bcopy() and return s1.
+ * Note that bcopy() takes its arguments in (source, destination) order.
+ */
+void *
+memcpy(void *s1, const void *s2, size_t n)
+{
+	bcopy(s2, s1, n);
+	return (s1);
+}
+#endif /* GEM_GCC_RUNTIME */
+/* ============================================================== */
+/*
+ * hardware operations
+ */
+/* ============================================================== */
+/*
+ * usbgem_hal_reset_chip: run the chip specific reset method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_reset_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_reset_chip(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_init_chip: run the chip specific initialization method
+ * while holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_init_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_init_chip(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_attach_chip: run the chip specific attach method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_attach_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_attach_chip(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_set_rx_filter: run the chip specific rx filter method
+ * while holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_set_rx_filter(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_set_rx_filter(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_set_media: run the chip specific media setup method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_set_media(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_set_media(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_start_chip: run the chip specific start method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_start_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_start_chip(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_stop_chip: run the chip specific stop method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_stop_chip(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_stop_chip(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+/*
+ * usbgem_hal_get_stats: run the chip specific statistics method while
+ * holding hal_op_lock and hand its result back to the caller.
+ */
+static int
+usbgem_hal_get_stats(struct usbgem_dev *dp)
+{
+	int	rv;
+
+	sema_p(&dp->hal_op_lock);
+	rv = dp->ugc.usbgc_get_stats(dp);
+	sema_v(&dp->hal_op_lock);
+
+	return (rv);
+}
+
+
+/* ============================================================== */
+/*
+ * USB pipe management
+ */
+/* ============================================================== */
+/*
+ * usbgem_rx_start_unit: post one receive request on the bulk-in pipe.
+ *
+ * A fresh mblk of dp->rx_buf_len bytes is attached to req, the request
+ * is set up so that usbgem_bulkin_cb() is called for both normal and
+ * exceptional completion, and it is submitted without blocking.
+ *
+ * Returns B_TRUE when the request has been queued, B_FALSE otherwise.
+ * When the submission fails, req (together with its mblk) is freed
+ * here.
+ * NOTE(review): when allocb() fails, req is NOT freed here although the
+ * function still returns B_FALSE; usbgem_init_rx_buf() does not free it
+ * either, which looks like a leak - confirm against all callers.
+ */
+static boolean_t
+usbgem_rx_start_unit(struct usbgem_dev *dp, usb_bulk_req_t *req)
+{
+	mblk_t	*mp;
+	int	err;
+	usb_flags_t	flags;
+
+	ASSERT(req);
+
+	mp = allocb(dp->rx_buf_len, BPRI_MED);
+	if (mp == NULL) {
+		cmn_err(CE_WARN, "!%s: %s: failed to allocate mblk",
+		    dp->name, __func__);
+		goto err;
+	}
+
+	req->bulk_len = dp->rx_buf_len;
+	req->bulk_data = mp;
+	req->bulk_client_private = (usb_opaque_t)dp;
+	req->bulk_timeout = 0;
+	/* received packets are usually shorter than the buffer */
+	req->bulk_attributes = USB_ATTRS_SHORT_XFER_OK;
+	req->bulk_cb = usbgem_bulkin_cb;
+	req->bulk_exc_cb = usbgem_bulkin_cb;
+	req->bulk_completion_reason = 0;
+	req->bulk_cb_flags = 0;
+
+	/* no USB_FLAGS_SLEEP: completion is reported by the callback */
+	flags = 0;
+	err = usb_pipe_bulk_xfer(dp->bulkin_pipe, req, flags);
+
+	if (err != USB_SUCCESS) {
+		cmn_err(CE_WARN, "%s: failed to bulk_xfer for rx, err:%d",
+		    dp->name, err);
+
+		/* free req and mp */
+		usb_free_bulk_req(req);
+		goto err;
+	}
+	return (B_TRUE);
+err:
+	return (B_FALSE);
+}
+
+/* ============================================================== */
+/*
+ * Rx/Tx buffer management
+ */
+/* ============================================================== */
+/*
+ * usbgem_init_rx_buf: fill the bulk-in pipe with receive requests.
+ *
+ * usbgc_rx_list_max requests are allocated and posted one by one, and
+ * rx_busy_cnt is advanced under rxlock for each request posted.
+ * Returns USB_SUCCESS, or USB_FAILURE as soon as one request cannot be
+ * allocated or posted; requests already posted are left in place.
+ */
+static int
+usbgem_init_rx_buf(struct usbgem_dev *dp)
+{
+	int		posted;
+	usb_bulk_req_t	*rxreq;
+
+	ASSERT(dp->mac_state == MAC_STATE_ONLINE);
+
+	posted = 0;
+	while (posted < dp->ugc.usbgc_rx_list_max) {
+		rxreq = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP);
+		if (rxreq == NULL) {
+			cmn_err(CE_WARN,
+			    "!%s: %s: failed to allocate bulkreq for rx",
+			    dp->name, __func__);
+			return (USB_FAILURE);
+		}
+		if (!usbgem_rx_start_unit(dp, rxreq)) {
+			return (USB_FAILURE);
+		}
+
+		/* account for the request which is now in flight */
+		mutex_enter(&dp->rxlock);
+		dp->rx_busy_cnt++;
+		mutex_exit(&dp->rxlock);
+
+		posted++;
+	}
+	return (USB_SUCCESS);
+}
+
+/* ============================================================== */
+/*
+ * memory resource management
+ */
+/* ============================================================== */
+/*
+ * usbgem_free_memory: release every request on the tx free list.
+ *
+ * The list is linked through bulk_client_private.  bulk_data is
+ * cleared before each request is freed.  Always returns USB_SUCCESS.
+ */
+static int
+usbgem_free_memory(struct usbgem_dev *dp)
+{
+	usb_bulk_req_t	*txreq;
+
+	/* pop the head of the list until the list is empty */
+	for (txreq = dp->tx_free_list; txreq != NULL;
+	    txreq = dp->tx_free_list) {
+		dp->tx_free_list =
+		    (usb_bulk_req_t *)txreq->bulk_client_private;
+		txreq->bulk_data = NULL;
+		usb_free_bulk_req(txreq);
+	}
+	return (USB_SUCCESS);
+}
+
+/*
+ * usbgem_alloc_memory: build the tx free list.
+ *
+ * usbgc_tx_list_max bulk requests are allocated and pushed onto
+ * dp->tx_free_list, which is linked through bulk_client_private.
+ * Returns USB_SUCCESS, or USB_FAILURE after releasing the requests
+ * obtained so far when an allocation fails.
+ */
+static int
+usbgem_alloc_memory(struct usbgem_dev *dp)
+{
+	int		n;
+	usb_bulk_req_t	*txreq;
+
+	/* start with an empty list */
+	dp->tx_free_list = NULL;
+
+	for (n = 0; n < dp->ugc.usbgc_tx_list_max; n++) {
+		txreq = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP);
+		if (txreq != NULL) {
+			/* push the new request onto the tx free list */
+			txreq->bulk_client_private =
+			    (usb_opaque_t)dp->tx_free_list;
+			dp->tx_free_list = txreq;
+			continue;
+		}
+
+		cmn_err(CE_WARN,
+		    "%s:%s failed to allocate tx requests",
+		    dp->name, __func__);
+
+		/* give back what has been allocated so far */
+		(void) usbgem_free_memory(dp);
+		return (USB_FAILURE);
+	}
+
+	return (USB_SUCCESS);
+}
+
+/* ========================================================== */
+/*
+ * Start transmission.
+ * Return zero on success,
+ */
+/* ========================================================== */
+
+#ifdef TXTIMEOUT_TEST
+static int usbgem_send_cnt = 0;
+#endif
+
+/*
+ * usbgem_send_common is used only to send data packets onto the ethernet line.
+ */
+static mblk_t *
+usbgem_send_common(struct usbgem_dev *dp, mblk_t *mp, uint32_t flags)
+{
+	int		err;
+	mblk_t		*new;
+	usb_bulk_req_t	*req;
+	int		mcast;
+	int		bcast;
+	int		len;
+	boolean_t	intr;
+	usb_flags_t	usb_flags = 0;
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_pipe_state_t	p_state;
+#endif
+	DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	intr = (flags & 1) != 0;
+	len = msgdsize(mp);
+	bcast = 0;
+	mcast = 0;
+	if (mp->b_rptr[0] & 1) {
+		if (bcmp(mp->b_rptr, &usbgem_bcastaddr, ETHERADDRL) == 0) {
+			bcast = 1;
+		} else {
+			mcast = 1;
+		}
+	}
+	new = (*dp->ugc.usbgc_tx_make_packet)(dp, mp);
+	if (new == NULL) {
+		/*
+		 * no memory resource. we don't stop downstream,
+		 * we just discard the packet.
+		 */
+		DPRINTF(0, (CE_CONT, "!%s: %s: no memory",
+		    dp->name, __func__));
+		freemsg(mp);
+
+		mutex_enter(&dp->txlock);
+		dp->stats.noxmtbuf++;
+		dp->stats.errxmt++;
+		mutex_exit(&dp->txlock);
+
+		return (NULL);
+	}
+
+	ASSERT(new->b_cont == NULL);
+
+	mutex_enter(&dp->txlock);
+	if (dp->tx_free_list == NULL) {
+		/*
+		 * no tx free slot
+		 */
+		ASSERT(dp->tx_busy_cnt == dp->ugc.usbgc_tx_list_max);
+		mutex_exit(&dp->txlock);
+
+		DPRINTF(4, (CE_CONT, "!%s: %s: no free slot",
+		    dp->name, __func__));
+		if (new && new != mp) {
+			/* free reallocated message */
+			freemsg(new);
+		}
+		return (mp);
+	}
+	req = dp->tx_free_list;
+	dp->tx_free_list = (usb_bulk_req_t *)req->bulk_client_private;
+	dp->tx_busy_cnt++;
+	
+	if (dp->tx_free_list == NULL) {
+		intr = B_TRUE;
+	}
+	if (intr) {
+		dp->tx_intr_pended++;
+	}
+	DB_TCI(new) = intr;
+#ifdef USBGEM_DEBUG_LEVEL
+	new->b_datap->db_cksum32 = dp->tx_seq_num;
+	dp->tx_seq_num++;
+#endif
+	dp->stats.obytes += len;
+	dp->stats.opackets++;
+	if (bcast | mcast) {
+		dp->stats.obcast += bcast;
+		dp->stats.omcast += mcast;
+	}
+	mutex_exit(&dp->txlock);
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: sending", dp->name, __func__));
+
+	req->bulk_len = (long)new->b_wptr - (long)new->b_rptr;
+	req->bulk_data = new;
+	req->bulk_client_private = (usb_opaque_t)dp;
+	req->bulk_timeout = dp->bulkout_timeout;	/* in second */
+	req->bulk_attributes = 0;
+	req->bulk_cb = usbgem_bulkout_cb;
+	req->bulk_exc_cb = usbgem_bulkout_cb;
+	req->bulk_completion_reason = 0;
+	req->bulk_cb_flags = 0;
+
+	if (intr) {
+		usb_flags = USB_FLAGS_SLEEP;
+	}
+	if ((err = usb_pipe_bulk_xfer(dp->bulkout_pipe, req, usb_flags))
+	    != USB_SUCCESS) {
+
+		/* failed to transfer the packet, discard it. */
+		freemsg(new);
+		req->bulk_data = NULL;
+
+		/* recycle the request block */
+		mutex_enter(&dp->txlock);
+		dp->tx_busy_cnt--;
+		req->bulk_client_private = (usb_opaque_t)dp->tx_free_list;
+		dp->tx_free_list = req;
+		mutex_exit(&dp->txlock);
+
+		cmn_err(CE_NOTE,
+		    "%s: %s: usb_pipe_bulk_xfer: failed: err:%d",
+		    dp->name, __func__, err);
+
+		/* we use another flag to indicate error state. */
+		if (dp->fatal_error == (clock_t)0) {
+			dp->fatal_error = usbgem_timestamp_nz();
+		}
+	} else {
+		/* record the start time */
+		dp->tx_start_time = ddi_get_lbolt();
+	}
+
+	if (err == USB_SUCCESS && (usb_flags & USB_FLAGS_SLEEP)) {
+		usbgem_bulkout_cb(dp->bulkout_pipe, req);
+	}
+
+	if (new != mp) {
+		freemsg(mp);
+	}
+	return (NULL);
+}
+
+/*
+ * Stop, reset and bring the nic back to the state recorded in
+ * dp->nic_state.
+ *
+ * The mac is stopped if it is online, the chip is reset, and then the
+ * nic is restored step by step: initialization, rx filter, media mode
+ * and finally the mac itself, each step only as far as nic_state and
+ * the current link state allow.
+ *
+ * Must not be called for a disconnected device (asserted).
+ * Returns USB_SUCCESS or USB_FAILURE.
+ */
+int
+usbgem_restart_nic(struct usbgem_dev *dp)
+{
+	int	ret;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	ASSERT(dp->mac_state != MAC_STATE_DISCONNECTED);
+
+	/*
+	 * ensure to stop the nic
+	 */
+	if (dp->mac_state == MAC_STATE_ONLINE) {
+		(void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL);
+	}
+
+	/* now the nic become quiescent, reset the chip */
+	if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN, "%s: %s: failed to reset chip",
+		    dp->name, __func__);
+		goto err;
+	}
+
+	/*
+	 * restore the nic state step by step
+	 */
+	if (dp->nic_state < NIC_STATE_INITIALIZED) {
+		goto done;
+	}
+
+	if (usbgem_mac_init(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN, "%s: %s: failed to initialize chip",
+		    dp->name, __func__);
+		goto err;
+	}
+
+	/* setup mac address and enable rx filter */
+	sema_p(&dp->rxfilter_lock);
+	dp->rxmode |= RXMODE_ENABLE;
+	ret = usbgem_hal_set_rx_filter(dp);
+	sema_v(&dp->rxfilter_lock);
+	if (ret != USB_SUCCESS) {
+		goto err;
+	}
+
+	/*
+	 * update the link state asynchronously
+	 */
+	cv_signal(&dp->link_watcher_wait_cv);
+
+	/*
+	 * XXX - a panic happened because of linkdown.
+	 * We must check mii_state here, because the link can be down just
+	 * before the restart event happen. If the link is down now,
+	 * usbgem_mac_start() will be called from usbgem_mii_link_check()
+	 * when the link become up later.
+	 */
+	if (dp->mii_state == MII_STATE_LINKUP) {
+		if (usbgem_hal_set_media(dp) != USB_SUCCESS) {
+			goto err;
+		}
+		if (dp->nic_state < NIC_STATE_ONLINE) {
+			goto done;
+		}
+
+		(void) usbgem_mac_start(dp);
+
+	}
+done:
+	return (USB_SUCCESS);
+err:
+#ifdef GEM_CONFIG_FMA
+	ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+	return (USB_FAILURE);
+}
+
+/*
+ * Body of the tx watcher thread.
+ *
+ * Once per tx_watcher_interval (or earlier when tx_watcher_cv is
+ * signalled) the thread checks whether the nic must be restarted:
+ * either a fatal error was recorded at least usbgc_tx_timeout ticks
+ * ago, or a transmission has been outstanding for that long while the
+ * mac is online and the link is up.  The loop ends when
+ * dp->tx_watcher_stop is set.
+ */
+static void
+usbgem_tx_timeout(struct usbgem_dev *dp)
+{
+	uint_t	rwlock;
+	clock_t	now;
+
+	for (; ; ) {
+		/* whether we timed out or were signalled does not matter */
+		mutex_enter(&dp->tx_watcher_lock);
+		(void) cv_timedwait(&dp->tx_watcher_cv, &dp->tx_watcher_lock,
+		    dp->tx_watcher_interval + ddi_get_lbolt());
+		mutex_exit(&dp->tx_watcher_lock);
+
+		if (dp->tx_watcher_stop) {
+			break;
+		}
+
+		now = ddi_get_lbolt();
+
+		/* check as a reader first; retry as a writer to restart */
+		rwlock = RW_READER;
+again:
+		rw_enter(&dp->dev_state_lock, rwlock);
+
+		if ((dp->mac_state != MAC_STATE_DISCONNECTED &&
+		    dp->fatal_error &&
+		    now - dp->fatal_error >= dp->ugc.usbgc_tx_timeout) ||
+		    (dp->mac_state == MAC_STATE_ONLINE &&
+		    dp->mii_state == MII_STATE_LINKUP &&
+		    dp->tx_busy_cnt != 0 &&
+		    now - dp->tx_start_time >= dp->ugc.usbgc_tx_timeout)) {
+			if (rwlock == RW_READER) {
+				/*
+				 * Upgrade dev_state_lock from shared mode
+				 * to exclusive mode to restart nic
+				 */
+				rwlock = RW_WRITER;
+				rw_exit(&dp->dev_state_lock);
+				goto again;
+			}
+			cmn_err(CE_WARN, "%s: %s: restarting the nic:"
+			    " fatal_error:%ld nic_state:%d"
+			    " mac_state:%d starttime:%ld",
+			    dp->name, __func__,
+			    dp->fatal_error ? now - dp->fatal_error: 0,
+			    dp->nic_state, dp->mac_state,
+			    dp->tx_busy_cnt ? now - dp->tx_start_time : 0);
+
+			(void) usbgem_restart_nic(dp);
+		}
+
+		rw_exit(&dp->dev_state_lock);
+	}
+}
+
+/*
+ * Create the tx watcher thread running usbgem_tx_timeout().
+ *
+ * The stop flag is cleared and the polling interval is set to one
+ * second before the thread is created.  The id of the new thread is
+ * kept in dp->tx_watcher_did for usbgem_tx_watcher_stop().
+ * Returns USB_SUCCESS, or USB_FAILURE when the thread was not created.
+ */
+static int
+usbgem_tx_watcher_start(struct usbgem_dev *dp)
+{
+	kthread_t	*wdth;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* arm the watcher: clear the stop flag, poll once per second */
+	dp->tx_watcher_stop = 0;
+	dp->tx_watcher_interval = drv_usectohz(1000*1000);
+
+	wdth = thread_create(NULL, 0, usbgem_tx_timeout, dp, 0, &p0,
+	    TS_RUN, minclsyspri);
+	if (wdth == NULL) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to create a tx_watcher thread",
+		    dp->name, __func__);
+		return (USB_FAILURE);
+	}
+	dp->tx_watcher_did = wdth->t_did;
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * Stop the tx watcher thread, if one was started, and wait for it
+ * to exit.
+ */
+static void
+usbgem_tx_watcher_stop(struct usbgem_dev *dp)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+	if (dp->tx_watcher_did) {
+		/* Ensure timer routine stopped */
+		dp->tx_watcher_stop = 1;
+		cv_signal(&dp->tx_watcher_cv);
+		thread_join(dp->tx_watcher_did);
+		/* tx_watcher_did is a thread id, not a pointer */
+		dp->tx_watcher_did = 0;
+	}
+}
+
+/* ================================================================== */
+/*
+ * Callback handlers
+ */
+/* ================================================================== */
+/*
+ * Completion callback for bulk-in (receive) requests.
+ *
+ * On a successful completion while the nic is online, the received
+ * data is converted into a chain of ethernet packets by the
+ * chip-specific rx_make_packet routine and passed upstream.  The rx
+ * statistics are updated, and the request is either resubmitted to
+ * receive the next packets or freed when reception is no longer
+ * active.
+ */
+static void
+usbgem_bulkin_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req)
+{
+	mblk_t	*newmp;
+	mblk_t	*mp;
+	mblk_t	*tp;
+	int	len = 0;
+	int	pkts = 0;
+	int	bcast = 0;
+	int	mcast = 0;
+	struct usbgem_dev	*dp;
+
+	dp = (struct usbgem_dev *)req->bulk_client_private;
+	/* take the data away from the request; we own the message now */
+	mp = req->bulk_data;
+	req->bulk_data = NULL;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: mp:%p, cr:%s(%d)",
+	    dp->name, __func__, mp,
+	    usb_str_cr(req->bulk_completion_reason),
+	    req->bulk_completion_reason));
+
+	/*
+	 * we cannot acquire dev_state_lock because the routine
+	 * must be executed during usbgem_mac_stop() to avoid
+	 * dead lock.
+	 * we use a simple membar operation to get the state correctly.
+	 */
+	membar_consumer();
+
+	if (req->bulk_completion_reason == USB_CR_OK &&
+	    dp->nic_state == NIC_STATE_ONLINE) {
+		newmp = (*dp->ugc.usbgc_rx_make_packet)(dp, mp);
+
+		if (newmp != mp) {
+			/* the message has been reallocated, free old one */
+			freemsg(mp);
+		}
+
+		/* the message may includes one or more ethernet packets */
+		for (tp = newmp; tp; tp = tp->b_next) {
+			len += tp->b_wptr - tp->b_rptr;
+			pkts++;
+			if (tp->b_rptr[0] & 1) {
+				if (bcmp(tp->b_rptr, &usbgem_bcastaddr,
+				    ETHERADDRL) == 0) {
+					bcast++;
+				} else {
+					mcast++;
+				}
+			}
+		}
+
+		/* send up if it is a valid packet */
+#ifdef USBGEM_CONFIG_GLDv3
+		mac_rx(dp->mh, NULL, newmp);
+#else
+		while (newmp) {
+			tp = newmp;
+			newmp = newmp->b_next;
+			tp->b_next = NULL;
+			gld_recv(dp->macinfo, tp);
+		}
+#endif
+	} else {
+		/* failed transfer or nic not online: drop the data */
+		freemsg(mp);
+		len = 0;
+	}
+
+	mutex_enter(&dp->rxlock);
+	/* update rx_active */
+	if (dp->rx_active) {
+		dp->rx_active = dp->mac_state == MAC_STATE_ONLINE;
+	}
+
+	dp->stats.rbytes += len;
+	dp->stats.rpackets += pkts;
+	if (bcast | mcast) {
+		dp->stats.rbcast += bcast;
+		dp->stats.rmcast += mcast;
+	}
+	mutex_exit(&dp->rxlock);
+
+	if (dp->rx_active) {
+		/* prepare to receive the next packets */
+		if (usbgem_rx_start_unit(dp, req)) {
+			/* we succeeded; the request stays in flight */
+			goto done;
+		}
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to fill next rx packet",
+		    dp->name, __func__);
+		/*
+		 * we use another flag to indicate error state.
+		 * if we acquire dev_state_lock for RW_WRITER here,
+		 * usbgem_mac_stop() may hang.
+		 */
+		if (dp->fatal_error == (clock_t)0) {
+			dp->fatal_error = usbgem_timestamp_nz();
+		}
+	} else {
+		/* no need to prepare the next packets */
+		usb_free_bulk_req(req);
+	}
+
+	/* this request is out of service: account for it */
+	mutex_enter(&dp->rxlock);
+	dp->rx_active = B_FALSE;
+	dp->rx_busy_cnt--;
+	if (dp->rx_busy_cnt == 0) {
+		/* wake up someone waits for me */
+		cv_broadcast(&dp->rx_drain_cv);
+	}
+	mutex_exit(&dp->rxlock);
+done:
+	;
+}
+
+/*
+ * Completion callback for bulk-out (transmit) requests; also installed
+ * as the exception callback.
+ *
+ * The transmitted message is freed, a failed completion is recorded in
+ * dp->fatal_error, and the request goes back to the tx free list.
+ * When the last pending completion notification has arrived, the upper
+ * layer is told that transmission may be rescheduled.
+ */
+static void
+usbgem_bulkout_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req)
+{
+	boolean_t	intr;
+	boolean_t	tx_sched;
+	struct usbgem_dev	*dp;
+
+	dp = (struct usbgem_dev *)req->bulk_client_private;
+	tx_sched = B_FALSE;
+
+	DPRINTF(2, (CE_CONT,
+	    "!%s: %s: cr:%s(%d) cb_flags:0x%x busy:%d",
+	    dp->name, __func__,
+	    usb_str_cr(req->bulk_completion_reason),
+	    req->bulk_completion_reason,
+	    req->bulk_cb_flags,
+	    dp->tx_busy_cnt));
+
+	/* we have finished to transfer the packet into tx fifo */
+	intr = DB_TCI(req->bulk_data);
+	freemsg(req->bulk_data);
+
+	/* remember the time of the first failure only */
+	if (req->bulk_completion_reason != USB_CR_OK &&
+	    dp->fatal_error == (clock_t)0) {
+		dp->fatal_error = usbgem_timestamp_nz();
+	}
+
+	mutex_enter(&dp->txlock);
+
+	if (intr) {
+		ASSERT(dp->tx_intr_pended > 0);
+		/* find the last interrupt we have scheduled */
+		if (--(dp->tx_intr_pended) == 0) {
+			tx_sched = B_TRUE;
+		}
+	}
+
+	/* put the request back on the head of the tx free list */
+	ASSERT(dp->tx_busy_cnt > 0);
+	req->bulk_client_private = (usb_opaque_t)dp->tx_free_list;
+	dp->tx_free_list = req;
+	dp->tx_busy_cnt--;
+
+#ifdef CONFIG_TX_LIMITER
+	if (tx_sched) {
+		dp->tx_max_packets =
+		    min(dp->tx_max_packets + 1, dp->ugc.usbgc_tx_list_max);
+	}
+#endif
+	/* wake up whoever waits for the tx side to drain */
+	if (dp->mac_state != MAC_STATE_ONLINE && dp->tx_busy_cnt == 0) {
+		cv_broadcast(&dp->tx_drain_cv);
+	}
+
+	mutex_exit(&dp->txlock);
+
+	if (tx_sched) {
+#ifdef USBGEM_CONFIG_GLDv3
+		mac_tx_update(dp->mh);
+#else
+		gld_sched(dp->macinfo);
+#endif
+	}
+}
+
+/*
+ * Completion callback for interrupt pipe requests.
+ * Every completion is counted; the data is passed to the chip-specific
+ * interrupt handler only when the request completed with USB_CR_OK.
+ * The request is always freed.
+ */
+static void
+usbgem_intr_cb(usb_pipe_handle_t ph, usb_intr_req_t *req)
+{
+	struct usbgem_dev	*udp;
+
+	udp = (struct usbgem_dev *)req->intr_client_private;
+
+	/* count the completion regardless of its result */
+	++udp->stats.intr;
+
+	if (req->intr_completion_reason == USB_CR_OK) {
+		(*udp->ugc.usbgc_interrupt)(udp, req->intr_data);
+	}
+
+	/* release the request together with its data */
+	usb_free_intr_req(req);
+}
+
+/* ======================================================================== */
+/*
+ * MII support routines
+ */
+/* ======================================================================== */
+/*
+ * Pick the speed and duplex mode to use without auto-negotiation:
+ * the fastest mode enabled in the anadv_* settings wins.
+ */
+static void
+usbgem_choose_forcedmode(struct usbgem_dev *dp)
+{
+	/* any 1000M mode: duplex follows the 1000M full duplex setting */
+	if (dp->anadv_1000fdx || dp->anadv_1000hdx) {
+		dp->speed = USBGEM_SPD_1000;
+		dp->full_duplex = dp->anadv_1000fdx;
+		return;
+	}
+
+	/* 100M full duplex and 100T4 are both handled as full duplex */
+	if (dp->anadv_100fdx || dp->anadv_100t4) {
+		dp->speed = USBGEM_SPD_100;
+		dp->full_duplex = B_TRUE;
+		return;
+	}
+
+	if (dp->anadv_100hdx) {
+		dp->speed = USBGEM_SPD_100;
+		dp->full_duplex = B_FALSE;
+		return;
+	}
+
+	/* nothing faster is enabled: fall back to 10M */
+	dp->speed = USBGEM_SPD_10;
+	dp->full_duplex = dp->anadv_10fdx;
+}
+
+/*
+ * Read MII register 'reg' through the chip-specific routine while
+ * holding hal_op_lock.  The result of the access is stored in *errp by
+ * that routine; the register value is returned.
+ */
+static uint16_t
+usbgem_mii_read(struct usbgem_dev *dp, uint_t reg, int *errp)
+{
+	uint16_t	data;
+
+	sema_p(&dp->hal_op_lock);
+	data = (*dp->ugc.usbgc_mii_read)(dp, reg, errp);
+	sema_v(&dp->hal_op_lock);
+
+	return (data);
+}
+
+/*
+ * Write 'val' to MII register 'reg' through the chip-specific routine
+ * while holding hal_op_lock.  The result of the access is stored in
+ * *errp by that routine.
+ */
+static void
+usbgem_mii_write(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp)
+{
+	sema_p(&dp->hal_op_lock);
+	dp->ugc.usbgc_mii_write(dp, reg, val, errp);
+	sema_v(&dp->hal_op_lock);
+}
+
+/*
+ * Probe the PHY with the chip-specific routine and return its result.
+ */
+static int
+usbgem_mii_probe(struct usbgem_dev *dp)
+{
+	return ((*dp->ugc.usbgc_mii_probe)(dp));
+}
+
+/*
+ * Initialize the PHY with the chip-specific routine and return its
+ * result.
+ */
+static int
+usbgem_mii_init(struct usbgem_dev *dp)
+{
+	return ((*dp->ugc.usbgc_mii_init)(dp));
+}
+
+/*
+ * Reduce the flow control bits of an MII ability word to a 2-bit
+ * value: bit 0 is set when MII_ABILITY_PAUSE is set, bit 1 when
+ * MII_ABILITY_ASM_DIR is set.  The result (0..3) is used as an index
+ * into usbgem_fc_result[][].
+ */
+#define	fc_cap_decode(x)	\
+	((((x) & MII_ABILITY_PAUSE) != 0 ? 1 : 0) |	\
+	(((x) & MII_ABILITY_ASM_DIR) != 0 ? 2 : 0))
+
+/*
+ * Default PHY configuration routine.
+ *
+ * Programs the auto-negotiation advertisement register from the
+ * anadv_* settings in dp and, when the PHY reports extended status,
+ * the MII_1000TC register as well.
+ *
+ * Returns USB_SUCCESS on success, USB_FAILURE when dp->mii_status has
+ * no technology ability bits, or *errp when an MII register access
+ * failed.
+ */
+int
+usbgem_mii_config_default(struct usbgem_dev *dp, int *errp)
+{
+	uint16_t	bmsr;
+	uint16_t	reg;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * The advertisement is built from the saved status register.
+	 */
+	bmsr = dp->mii_status;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: MII_STATUS reg:%b",
+	    dp->name, __func__, bmsr, MII_STATUS_BITS));
+
+	if ((bmsr & MII_STATUS_ABILITY_TECH) == 0) {
+		/* a PHY without any technology ability is not usable */
+		cmn_err(CE_WARN, "!%s: wrong ability bits: mii_status:%b",
+		    dp->name, bmsr, MII_STATUS_BITS);
+		return (USB_FAILURE);
+	}
+
+	/* keep the bits of the advert reg which are not ability bits */
+	reg = usbgem_mii_read(dp, MII_AN_ADVERT, errp) & ~MII_ABILITY_ALL;
+	if (*errp != USB_SUCCESS) {
+		return (*errp);
+	}
+
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %s: 100T4:%d 100F:%d 100H:%d 10F:%d 10H:%d",
+	    dp->name, __func__,
+	    dp->anadv_100t4, dp->anadv_100fdx, dp->anadv_100hdx,
+	    dp->anadv_10fdx, dp->anadv_10hdx));
+
+	/* technology bits */
+	if (dp->anadv_100t4) {
+		reg |= MII_ABILITY_100BASE_T4;
+	}
+	if (dp->anadv_100fdx) {
+		reg |= MII_ABILITY_100BASE_TX_FD;
+	}
+	if (dp->anadv_100hdx) {
+		reg |= MII_ABILITY_100BASE_TX;
+	}
+	if (dp->anadv_10fdx) {
+		reg |= MII_ABILITY_10BASE_T_FD;
+	}
+	if (dp->anadv_10hdx) {
+		reg |= MII_ABILITY_10BASE_T;
+	}
+
+	/* flow control capabilities */
+	if (dp->anadv_pause) {
+		reg |= MII_ABILITY_PAUSE;
+	}
+	if (dp->anadv_asmpause) {
+		reg |= MII_ABILITY_ASM_DIR;
+	}
+
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %s: setting MII_AN_ADVERT reg:%b, pause:%d, asmpause:%d",
+	    dp->name, __func__, reg, MII_ABILITY_BITS,
+	    dp->anadv_pause, dp->anadv_asmpause));
+
+	usbgem_mii_write(dp, MII_AN_ADVERT, reg, errp);
+	if (*errp != USB_SUCCESS) {
+		return (*errp);
+	}
+
+	if ((dp->mii_status & MII_STATUS_XSTATUS) == 0) {
+		/* no extended status, nothing more to configure */
+		return (USB_SUCCESS);
+	}
+
+	/*
+	 * 1000Base-T GMII support
+	 */
+	if (dp->anadv_autoneg) {
+		reg = 0;
+		if (dp->anadv_1000fdx) {
+			reg |= MII_1000TC_ADV_FULL;
+		}
+		if (dp->anadv_1000hdx) {
+			reg |= MII_1000TC_ADV_HALF;
+		}
+		if (dp->anadv_1000t_ms == 1) {
+			/* slave */
+			reg |= MII_1000TC_CFG_EN;
+		} else if (dp->anadv_1000t_ms == 2) {
+			/* master */
+			reg |= MII_1000TC_CFG_EN | MII_1000TC_CFG_VAL;
+		}
+		/* any other value means auto: nothing to add */
+	} else {
+		/* enable manual configuration */
+		reg = MII_1000TC_CFG_EN;
+		if (dp->anadv_1000t_ms == 2) {
+			reg |= MII_1000TC_CFG_VAL;
+		}
+	}
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %s: setting MII_1000TC reg:%b",
+	    dp->name, __func__, reg, MII_1000TC_BITS));
+
+	usbgem_mii_write(dp, MII_1000TC, reg, errp);
+	if (*errp != USB_SUCCESS) {
+		return (*errp);
+	}
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * Text describing the flow control mode in the "Link up" message.
+ * Indexed by dp->flow_control.
+ * NOTE(review): the order presumably matches the FLOW_CONTROL_NONE,
+ * _SYMMETRIC, _TX_PAUSE and _RX_PAUSE values - confirm against their
+ * definitions.
+ */
+static char *usbgem_fc_type[] = {
+	"without",
+	"with symmetric",
+	"with tx",
+	"with rx",
+};
+
+/*
+ * Report a link state change to the upper layer.
+ * Both variants are used as a single statement followed by a
+ * semicolon.  The GLD variants are wrapped in do { } while (0) so that
+ * they stay a single statement inside an unbraced if/else, and they do
+ * nothing when gld_linkstate is not available.
+ */
+#ifdef USBGEM_CONFIG_GLDv3
+#define	USBGEM_LINKUP(dp)	mac_link_update((dp)->mh, LINK_STATE_UP)
+#define	USBGEM_LINKDOWN(dp)	mac_link_update((dp)->mh, LINK_STATE_DOWN)
+#else
+#define	USBGEM_LINKUP(dp)	\
+	do {	\
+		if (gld_linkstate) {	\
+			gld_linkstate((dp)->macinfo, GLD_LINKSTATE_UP);	\
+		}	\
+	} while (0)
+#define	USBGEM_LINKDOWN(dp)	\
+	do {	\
+		if (gld_linkstate) {	\
+			gld_linkstate((dp)->macinfo, GLD_LINKSTATE_DOWN);	\
+		}	\
+	} while (0)
+#endif
+
+/*
+ * Flow control mode resulting from auto-negotiation.
+ * The first index is fc_cap_decode() of our advertisement, the second
+ * is fc_cap_decode() of the link partner ability: 0 is neither bit,
+ * 1 is PAUSE only, 2 is ASM_DIR only and 3 is both bits.
+ */
+static uint8_t usbgem_fc_result[4 /* my cap */][4 /* lp cap */] = {
+/*	 none	symm	tx	rx/symm */
+/* none */
+	{FLOW_CONTROL_NONE,
+		FLOW_CONTROL_NONE,
+			FLOW_CONTROL_NONE,
+				FLOW_CONTROL_NONE},
+/* sym */
+	{FLOW_CONTROL_NONE,
+		FLOW_CONTROL_SYMMETRIC,
+			FLOW_CONTROL_NONE,
+				FLOW_CONTROL_SYMMETRIC},
+/* tx */
+	{FLOW_CONTROL_NONE,
+		FLOW_CONTROL_NONE,
+			FLOW_CONTROL_NONE,
+				FLOW_CONTROL_TX_PAUSE},
+/* rx/symm */
+	{FLOW_CONTROL_NONE,
+		FLOW_CONTROL_SYMMETRIC,
+			FLOW_CONTROL_RX_PAUSE,
+				FLOW_CONTROL_SYMMETRIC},
+};
+
+/*
+ * Run one step of the MII link state machine.
+ *
+ * Depending on dp->mii_state the PHY is reset, configured,
+ * auto-negotiation is started or evaluated, the media mode is set up,
+ * or the link status is polled.  dp->mii_interval is set to the delay
+ * wanted before the next call (0 means no periodic call).
+ *
+ * dev_state_lock is taken as a writer unless the link is currently up,
+ * in which case it is taken as a reader and upgraded only when a link
+ * down is detected.
+ *
+ * *oldstatep receives the mii state found on entry and *newstatep the
+ * state on return.  The return value is B_TRUE when the link came up
+ * while the nic is online, when the nic was restarted because the link
+ * went down, or when a USB error occurred.
+ */
+static boolean_t
+usbgem_mii_link_check(struct usbgem_dev *dp, int *oldstatep, int *newstatep)
+{
+	boolean_t	tx_sched = B_FALSE;
+	uint16_t	status;
+	uint16_t	advert;
+	uint16_t	lpable;
+	uint16_t	exp;
+	uint16_t	ctl1000;
+	uint16_t	stat1000;
+	uint16_t	val;
+	clock_t		now;
+	clock_t		diff;
+	int		linkdown_action;
+	boolean_t	fix_phy = B_FALSE;
+	int		err;
+	uint_t		rwlock;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: time:%ld state:%d",
+	    dp->name, __func__, ddi_get_lbolt(), dp->mii_state));
+
+	if (dp->mii_state != MII_STATE_LINKUP) {
+		rwlock = RW_WRITER;
+	} else {
+		rwlock = RW_READER;
+	}
+again:
+	rw_enter(&dp->dev_state_lock, rwlock);
+
+	/* save old mii state */
+	*oldstatep = dp->mii_state;
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		/* stop periodic execution of the link watcher */
+		dp->mii_interval = 0;
+		tx_sched = B_FALSE;
+		goto next;
+	}
+
+	now = ddi_get_lbolt();
+	diff = now - dp->mii_last_check;
+	dp->mii_last_check = now;
+
+	/*
+	 * For NWAM, don't show linkdown state right
+	 * when the device is attached.
+	 */
+	if (dp->linkup_delay > 0) {
+		if (dp->linkup_delay > diff) {
+			dp->linkup_delay -= diff;
+		} else {
+			/* link up timeout */
+			dp->linkup_delay = -1;
+		}
+	}
+
+next_nowait:
+	switch (dp->mii_state) {
+	case MII_STATE_UNKNOWN:
+		goto reset_phy;
+
+	case MII_STATE_RESETTING:
+		dp->mii_timer -= diff;
+		if (dp->mii_timer > 0) {
+			/* don't read phy registers in resetting */
+			dp->mii_interval = WATCH_INTERVAL_FAST;
+			goto next;
+		}
+
+		val = usbgem_mii_read(dp, MII_CONTROL, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		if (val & MII_CONTROL_RESET) {
+			cmn_err(CE_NOTE,
+			    "!%s: time:%ld resetting phy not complete."
+			    " mii_control:0x%b",
+			    dp->name, ddi_get_lbolt(),
+			    val, MII_CONTROL_BITS);
+		}
+
+		/* ensure neither isolated nor pwrdown nor auto-nego mode */
+		usbgem_mii_write(dp, MII_CONTROL, 0, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+#if USBGEM_DEBUG_LEVEL > 10
+		val = usbgem_mii_read(dp, MII_CONTROL, &err);
+		cmn_err(CE_CONT, "!%s: readback control %b",
+		    dp->name, val, MII_CONTROL_BITS);
+#endif
+		/* As resetting PHY has completed, configure PHY registers */
+		if ((*dp->ugc.usbgc_mii_config)(dp, &err) != USB_SUCCESS) {
+			/* we failed to configure PHY */
+			goto usberr;
+		}
+
+		/* prepare for forced mode */
+		usbgem_choose_forcedmode(dp);
+
+		dp->mii_lpable = 0;
+		dp->mii_advert = 0;
+		dp->mii_exp = 0;
+		dp->mii_ctl1000 = 0;
+		dp->mii_stat1000 = 0;
+
+		dp->flow_control = FLOW_CONTROL_NONE;
+
+		if (!dp->anadv_autoneg) {
+			/* skip auto-negotiation phase */
+			dp->mii_state = MII_STATE_MEDIA_SETUP;
+			dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout;
+			goto next_nowait;
+		}
+
+		/* issue an auto-negotiation command */
+		goto autonego;
+
+	case MII_STATE_AUTONEGOTIATING:
+		/*
+		 * Autonegotiation in progress
+		 */
+		dp->mii_timer -= diff;
+		if (dp->mii_timer -
+		    (dp->ugc.usbgc_mii_an_timeout - dp->ugc.usbgc_mii_an_wait)
+		    > 0) {
+			/* wait for minimum time (2.3 - 2.5 sec) */
+			dp->mii_interval = WATCH_INTERVAL_FAST;
+			goto next;
+		}
+
+		/* read PHY status */
+		status = usbgem_mii_read(dp, MII_STATUS, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		DPRINTF(4, (CE_CONT,
+		    "!%s: %s: called: mii_state:%d MII_STATUS reg:%b",
+		    dp->name, __func__, dp->mii_state,
+		    status, MII_STATUS_BITS));
+
+		if (status & MII_STATUS_REMFAULT) {
+			/*
+			 * The link partner told me something wrong happened.
+			 * What do we do ?
+			 */
+			cmn_err(CE_CONT,
+			    "!%s: auto-negotiation failed: remote fault",
+			    dp->name);
+			goto autonego;
+		}
+
+		if ((status & MII_STATUS_ANDONE) == 0) {
+			if (dp->mii_timer <= 0) {
+				/*
+				 * Auto-negotiation has been timed out,
+				 * Reset PHY and try again.
+				 */
+				if (!dp->mii_supress_msg) {
+					cmn_err(CE_WARN,
+					    "!%s: auto-negotiation failed:"
+					    " timeout",
+					    dp->name);
+					dp->mii_supress_msg = B_TRUE;
+				}
+				goto autonego;
+			}
+			/*
+			 * Auto-negotiation is in progress. Wait for a while.
+			 */
+			dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval;
+			goto next;
+		}
+
+		/*
+		 * Auto-negotiation has been completed. Let's go to AN_DONE.
+		 */
+		dp->mii_state = MII_STATE_AN_DONE;
+		dp->mii_supress_msg = B_FALSE;
+		DPRINTF(0, (CE_CONT,
+		    "!%s: auto-negotiation completed, MII_STATUS:%b",
+		    dp->name, status, MII_STATUS_BITS));
+
+		if (dp->ugc.usbgc_mii_an_delay > 0) {
+			dp->mii_timer = dp->ugc.usbgc_mii_an_delay;
+			dp->mii_interval = drv_usectohz(20*1000);
+			goto next;
+		}
+
+		dp->mii_timer = 0;
+		diff = 0;
+		goto next_nowait;
+
+	case MII_STATE_AN_DONE:
+		/*
+		 * Auto-negotiation has done. Now we can set up media.
+		 */
+		dp->mii_timer -= diff;
+		if (dp->mii_timer > 0) {
+			/* wait for a while */
+			dp->mii_interval = WATCH_INTERVAL_FAST;
+			goto next;
+		}
+
+		/*
+		 * Setup speed and duplex mode according with
+		 * the result of auto negotiation.
+		 */
+
+		/*
+		 * Read registers required to determine current
+		 * duplex mode and media speed.
+		 */
+		if (dp->ugc.usbgc_mii_an_delay > 0) {
+			/* the 'status' variable is not initialized yet */
+			status = usbgem_mii_read(dp, MII_STATUS, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+		}
+		advert = usbgem_mii_read(dp, MII_AN_ADVERT, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		lpable = usbgem_mii_read(dp, MII_AN_LPABLE, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		exp = usbgem_mii_read(dp, MII_AN_EXPANSION, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		if (exp == 0xffff) {
+			/* some phys don't have exp register */
+			exp = 0;
+		}
+
+		ctl1000 = 0;
+		stat1000 = 0;
+		if (dp->mii_status & MII_STATUS_XSTATUS) {
+			ctl1000 = usbgem_mii_read(dp, MII_1000TC, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+			stat1000 = usbgem_mii_read(dp, MII_1000TS, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+		}
+		dp->mii_lpable = lpable;
+		dp->mii_advert = advert;
+		dp->mii_exp = exp;
+		dp->mii_ctl1000 = ctl1000;
+		dp->mii_stat1000 = stat1000;
+
+		cmn_err(CE_CONT,
+		    "!%s: auto-negotiation done: "
+		    "status:%b, advert:%b, lpable:%b, exp:%b",
+		    dp->name,
+		    status, MII_STATUS_BITS,
+		    advert, MII_ABILITY_BITS,
+		    lpable, MII_ABILITY_BITS,
+		    exp, MII_AN_EXP_BITS);
+
+		DPRINTF(0, (CE_CONT, "!%s: MII_STATUS:%b",
+		    dp->name, status, MII_STATUS_BITS));
+
+		if (dp->mii_status & MII_STATUS_XSTATUS) {
+			cmn_err(CE_CONT,
+			    "! MII_1000TC reg:%b, MII_1000TS reg:%b",
+			    ctl1000, MII_1000TC_BITS,
+			    stat1000, MII_1000TS_BITS);
+		}
+
+		if (usbgem_population(lpable) <= 1 &&
+		    (exp & MII_AN_EXP_LPCANAN) == 0) {
+			if ((advert & MII_ABILITY_TECH) != lpable) {
+				cmn_err(CE_WARN,
+				    "!%s: but the link partner doesn't seem"
+				    " to have auto-negotiation capability."
+				    " please check the link configuration.",
+				    dp->name);
+			}
+			/*
+			 * it should be a result of parallel detection,
+			 * which cannot detect duplex mode.
+			 */
+			if ((advert & lpable) == 0 &&
+			    lpable & MII_ABILITY_10BASE_T) {
+				/* no common technology, try 10M half mode */
+				lpable |= advert & MII_ABILITY_10BASE_T;
+				fix_phy = B_TRUE;
+			}
+		} else if (lpable == 0) {
+			cmn_err(CE_WARN, "!%s: wrong lpable.", dp->name);
+			goto reset_phy;
+		}
+		/*
+		 * configure current link mode according to AN priority.
+		 */
+		val = advert & lpable;
+		if ((ctl1000 & MII_1000TC_ADV_FULL) &&
+		    (stat1000 & MII_1000TS_LP_FULL)) {
+			/* 1000BaseT & full duplex */
+			dp->speed = USBGEM_SPD_1000;
+			dp->full_duplex = B_TRUE;
+		} else if ((ctl1000 & MII_1000TC_ADV_HALF) &&
+		    (stat1000 & MII_1000TS_LP_HALF)) {
+			/* 1000BaseT & half duplex */
+			dp->speed = USBGEM_SPD_1000;
+			dp->full_duplex = B_FALSE;
+		} else if ((val & MII_ABILITY_100BASE_TX_FD)) {
+			/* 100BaseTx & fullduplex */
+			dp->speed = USBGEM_SPD_100;
+			dp->full_duplex = B_TRUE;
+		} else if ((val & MII_ABILITY_100BASE_T4)) {
+			/* 100BaseT4, handled as 100M full duplex */
+			dp->speed = USBGEM_SPD_100;
+			dp->full_duplex = B_TRUE;
+		} else if ((val & MII_ABILITY_100BASE_TX)) {
+			/* 100BaseTx & half duplex */
+			dp->speed = USBGEM_SPD_100;
+			dp->full_duplex = B_FALSE;
+		} else if ((val & MII_ABILITY_10BASE_T_FD)) {
+			/* 10BaseT & full duplex */
+			dp->speed = USBGEM_SPD_10;
+			dp->full_duplex = B_TRUE;
+		} else if ((val & MII_ABILITY_10BASE_T)) {
+			/* 10BaseT & half duplex */
+			dp->speed = USBGEM_SPD_10;
+			dp->full_duplex = B_FALSE;
+		} else {
+			/*
+			 * the link partner doesn't seem to have
+			 * auto-negotiation capability and our PHY
+			 * could not report current mode correctly.
+			 * We guess current mode by mii_control register.
+			 */
+			val = usbgem_mii_read(dp, MII_CONTROL, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+
+			/* select 100m half or 10m half */
+			dp->speed = (val & MII_CONTROL_100MB) ?
+			    USBGEM_SPD_100 : USBGEM_SPD_10;
+			dp->full_duplex = B_FALSE;
+			fix_phy = B_TRUE;
+
+			cmn_err(CE_NOTE,
+			    "!%s: auto-negotiation done but "
+			    "common ability not found.\n"
+			    "PHY state: control:%b advert:%b lpable:%b\n"
+			    "guessing %d Mbps %s duplex mode",
+			    dp->name,
+			    val, MII_CONTROL_BITS,
+			    advert, MII_ABILITY_BITS,
+			    lpable, MII_ABILITY_BITS,
+			    usbgem_speed_value[dp->speed],
+			    dp->full_duplex ? "full" : "half");
+		}
+
+		/* flow control is negotiated for full duplex links only */
+		if (dp->full_duplex) {
+			dp->flow_control =
+			    usbgem_fc_result[fc_cap_decode(advert)]
+			    [fc_cap_decode(lpable)];
+		} else {
+			dp->flow_control = FLOW_CONTROL_NONE;
+		}
+		dp->mii_state = MII_STATE_MEDIA_SETUP;
+		dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout;
+		goto next_nowait;
+
+	case MII_STATE_MEDIA_SETUP:
+		DPRINTF(2, (CE_CONT, "!%s: setup midia mode", dp->name));
+
+		/* assume the link state is down */
+		dp->mii_state = MII_STATE_LINKDOWN;
+		dp->mii_supress_msg = B_FALSE;
+
+		/* use short interval */
+		dp->mii_interval = WATCH_INTERVAL_FAST;
+
+		if ((!dp->anadv_autoneg) ||
+		    dp->ugc.usbgc_mii_an_oneshot || fix_phy) {
+
+			/*
+			 * write the result of auto negotiation back.
+			 */
+			val = usbgem_mii_read(dp, MII_CONTROL, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+			val &= ~(MII_CONTROL_SPEED | MII_CONTROL_FDUPLEX |
+			    MII_CONTROL_ANE   | MII_CONTROL_RSAN);
+
+			if (dp->full_duplex) {
+				val |= MII_CONTROL_FDUPLEX;
+			}
+
+			switch (dp->speed) {
+			case USBGEM_SPD_1000:
+				val |= MII_CONTROL_1000MB;
+				break;
+
+			case USBGEM_SPD_100:
+				val |= MII_CONTROL_100MB;
+				break;
+
+			default:
+				cmn_err(CE_WARN, "%s: unknown speed:%d",
+				    dp->name, dp->speed);
+				/* FALLTHROUGH */
+
+			case USBGEM_SPD_10:
+				/* for USBGEM_SPD_10, do nothing */
+				break;
+			}
+
+			if (dp->mii_status & MII_STATUS_XSTATUS) {
+				usbgem_mii_write(dp,
+				    MII_1000TC, MII_1000TC_CFG_EN, &err);
+				if (err != USB_SUCCESS) {
+					goto usberr;
+				}
+			}
+			usbgem_mii_write(dp, MII_CONTROL, val, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+		}
+		/*
+		 * XXX -- nic state should be one of
+		 * NIC_STATE_DISCONNECTED
+		 * NIC_STATE_STOPPED
+		 * NIC_STATE_INITIALIZED
+		 * NIC_STATE_ONLINE
+		 */
+		if (dp->nic_state >= NIC_STATE_INITIALIZED) {
+			/* notify the result of autonegotiation to mac */
+			if (usbgem_hal_set_media(dp) != USB_SUCCESS) {
+				goto usberr;
+			}
+		}
+		goto next_nowait;
+
+	case MII_STATE_LINKDOWN:
+		status = usbgem_mii_read(dp, MII_STATUS, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		if (status & MII_STATUS_LINKUP) {
+			/*
+			 * Link is going up
+			 */
+			dp->mii_state = MII_STATE_LINKUP;
+			dp->mii_supress_msg = B_FALSE;
+
+			DPRINTF(0, (CE_CONT,
+			    "!%s: link up detected: status:%b",
+			    dp->name, status, MII_STATUS_BITS));
+
+			/*
+			 * MII_CONTROL_100MB and  MII_CONTROL_FDUPLEX are
+			 * ignored when MII_CONTROL_ANE is set.
+			 */
+			cmn_err(CE_CONT,
+			    "!%s: Link up: %d Mbps %s duplex %s flow control",
+			    dp->name,
+			    usbgem_speed_value[dp->speed],
+			    dp->full_duplex ? "full" : "half",
+			    usbgem_fc_type[dp->flow_control]);
+
+			dp->mii_interval =
+			    dp->ugc.usbgc_mii_link_watch_interval;
+
+			if (dp->ugc.usbgc_mii_hw_link_detection &&
+			    dp->nic_state == NIC_STATE_ONLINE) {
+				dp->mii_interval = 0;
+			}
+
+			if (dp->nic_state == NIC_STATE_ONLINE) {
+				if (dp->mac_state == MAC_STATE_INITIALIZED) {
+					(void) usbgem_mac_start(dp);
+				}
+				tx_sched = B_TRUE;
+			}
+
+			goto next;
+		}
+
+		dp->mii_supress_msg = B_TRUE;
+		if (dp->anadv_autoneg) {
+			dp->mii_timer -= diff;
+			if (dp->mii_timer <= 0) {
+				/*
+				 * the link down timer expired.
+				 * need to restart auto-negotiation.
+				 */
+				linkdown_action =
+				    dp->ugc.usbgc_mii_linkdown_timeout_action;
+				goto restart_autonego;
+			}
+		}
+		/* don't change mii_state */
+		goto next;
+
+	case MII_STATE_LINKUP:
+		if (rwlock == RW_READER) {
+			/* first pass, read mii status */
+			status = usbgem_mii_read(dp, MII_STATUS, &err);
+			if (err != USB_SUCCESS) {
+				goto usberr;
+			}
+		}
+		/* on the second pass 'status' still holds the first read */
+		if ((status & MII_STATUS_LINKUP) == 0) {
+			/*
+			 * Link is going down
+			 */
+			cmn_err(CE_NOTE,
+			    "!%s: link down detected: status:%b",
+			    dp->name, status, MII_STATUS_BITS);
+			/*
+			 * Acquire exclusive lock to change mii_state
+			 */
+			if (rwlock == RW_READER) {
+				rwlock = RW_WRITER;
+				rw_exit(&dp->dev_state_lock);
+				goto again;
+			}
+
+			dp->mii_state = MII_STATE_LINKDOWN;
+			dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout;
+
+			/*
+			 * As we may change the state of the device,
+			 * let us acquire exclusive lock for the state.
+			 */
+			if (dp->nic_state == NIC_STATE_ONLINE &&
+			    dp->mac_state == MAC_STATE_ONLINE &&
+			    dp->ugc.usbgc_mii_stop_mac_on_linkdown) {
+				(void) usbgem_restart_nic(dp);
+				/* drain tx */
+				tx_sched = B_TRUE;
+			}
+
+			if (dp->anadv_autoneg) {
+				/* need to restart auto-negotiation */
+				linkdown_action =
+				    dp->ugc.usbgc_mii_linkdown_action;
+				goto restart_autonego;
+			}
+			/*
+			 * don't use hw link down detection until the link
+			 * status become stable for a while.
+			 */
+			dp->mii_interval =
+			    dp->ugc.usbgc_mii_link_watch_interval;
+
+			goto next;
+		}
+
+		/*
+		 * still link up, no need to change mii_state
+		 */
+		if (dp->ugc.usbgc_mii_hw_link_detection &&
+		    dp->nic_state == NIC_STATE_ONLINE) {
+			/*
+			 * no need to check link status periodically
+			 * if nic can generate interrupts when link go down.
+			 */
+			dp->mii_interval = 0;
+		}
+		goto next;
+	}
+	/* NOTREACHED */
+	cmn_err(CE_PANIC, "!%s: %s: not reached", dp->name, __func__);
+
+	/*
+	 * Actions for new state.
+	 */
+restart_autonego:
+	switch (linkdown_action) {
+	case MII_ACTION_RESET:
+		if (!dp->mii_supress_msg) {
+			cmn_err(CE_CONT, "!%s: resetting PHY", dp->name);
+		}
+		dp->mii_supress_msg = B_TRUE;
+		goto reset_phy;
+
+	case MII_ACTION_NONE:
+		dp->mii_supress_msg = B_TRUE;
+		if (dp->ugc.usbgc_mii_an_oneshot) {
+			goto autonego;
+		}
+		/* PHY will restart autonego automatically */
+		dp->mii_state = MII_STATE_AUTONEGOTIATING;
+		dp->mii_timer = dp->ugc.usbgc_mii_an_timeout;
+		dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval;
+		goto next;
+
+	case MII_ACTION_RSA:
+		if (!dp->mii_supress_msg) {
+			cmn_err(CE_CONT, "!%s: restarting auto-negotiation",
+			    dp->name);
+		}
+		dp->mii_supress_msg = B_TRUE;
+		goto autonego;
+
+	default:
+		/* report the action we actually switched on */
+		cmn_err(CE_PANIC, "!%s: unknowm linkdown action: %d",
+		    dp->name, linkdown_action);
+		dp->mii_supress_msg = B_TRUE;
+	}
+	/* NOTREACHED */
+
+reset_phy:
+	if (!dp->mii_supress_msg) {
+		cmn_err(CE_CONT, "!%s: resetting PHY", dp->name);
+	}
+	dp->mii_state = MII_STATE_RESETTING;
+	dp->mii_timer = dp->ugc.usbgc_mii_reset_timeout;
+	if (!dp->ugc.usbgc_mii_dont_reset) {
+		usbgem_mii_write(dp, MII_CONTROL, MII_CONTROL_RESET, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+	}
+	dp->mii_interval = WATCH_INTERVAL_FAST;
+	goto next;
+
+autonego:
+	if (!dp->mii_supress_msg) {
+		cmn_err(CE_CONT, "!%s: auto-negotiation started", dp->name);
+	}
+	dp->mii_state = MII_STATE_AUTONEGOTIATING;
+	dp->mii_timer = dp->ugc.usbgc_mii_an_timeout;
+
+	/* start/restart autoneg */
+	val = usbgem_mii_read(dp, MII_CONTROL, &err) &
+	    ~(MII_CONTROL_ISOLATE | MII_CONTROL_PWRDN | MII_CONTROL_RESET);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+	if (val & MII_CONTROL_ANE) {
+		val |= MII_CONTROL_RSAN;
+	}
+	usbgem_mii_write(dp, MII_CONTROL,
+	    val | dp->ugc.usbgc_mii_an_cmd | MII_CONTROL_ANE, &err);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+
+	dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval;
+	goto next;
+
+usberr:
+	/* start over from an unknown PHY state on the next call */
+	dp->mii_state = MII_STATE_UNKNOWN;
+	dp->mii_interval = dp->ugc.usbgc_mii_link_watch_interval;
+	tx_sched = B_TRUE;
+
+next:
+	*newstatep = dp->mii_state;
+	rw_exit(&dp->dev_state_lock);
+	return (tx_sched);
+}
+
+/*
+ * usbgem_mii_link_watcher: main loop of the link watcher thread.
+ *
+ * Each iteration sleeps on link_watcher_wait_cv (for mii_interval ticks, or
+ * until signalled when mii_interval is 0), runs usbgem_mii_link_check() and
+ * then kicks the transmit side and reports link state transitions to the
+ * upper layer.  The loop ends, and the thread exits, once link_watcher_stop
+ * is seen set after a wakeup.
+ */
+static void
+usbgem_mii_link_watcher(struct usbgem_dev *dp)
+{
+	int		old_mii_state;
+	int		new_mii_state;
+	boolean_t	tx_sched;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	for (; ; ) {
+
+		mutex_enter(&dp->link_watcher_lock);
+		if (dp->mii_interval) {
+			/* periodic polling: wake up after mii_interval ticks */
+			(void) cv_timedwait(&dp->link_watcher_wait_cv,
+			    &dp->link_watcher_lock,
+			    dp->mii_interval + ddi_get_lbolt());
+		} else {
+			/* no polling interval: sleep until signalled */
+			cv_wait(&dp->link_watcher_wait_cv,
+			    &dp->link_watcher_lock);
+		}
+		mutex_exit(&dp->link_watcher_lock);
+
+		/*
+		 * NOTE(review): the stop flag is tested after the lock has
+		 * been dropped and only after a wakeup -- confirm that a
+		 * stop request cannot be missed.
+		 */
+		if (dp->link_watcher_stop) {
+			break;
+		}
+
+		/* we block callbacks from disconnect/suspend and restart */
+		tx_sched = usbgem_mii_link_check(dp,
+		    &old_mii_state, &new_mii_state);
+
+		/*
+		 * gld v2 notifier functions are not able to
+		 * be called with any locks in this layer.
+		 */
+		if (tx_sched) {
+			/* kick potentially stopped downstream */
+#ifdef USBGEM_CONFIG_GLDv3
+			mac_tx_update(dp->mh);
+#else
+			gld_sched(dp->macinfo);
+#endif
+		}
+
+		if (old_mii_state != new_mii_state) {
+			/* notify new mii link state */
+			if (new_mii_state == MII_STATE_LINKUP) {
+				/* link is up: the initial grace period is over */
+				dp->linkup_delay = 0;
+				USBGEM_LINKUP(dp);
+			} else if (dp->linkup_delay <= 0) {
+				/* report link down only when no delay is pending */
+				USBGEM_LINKDOWN(dp);
+			}
+		} else if (dp->linkup_delay < 0) {
+			/* first linkup timeout */
+			dp->linkup_delay = 0;
+			USBGEM_LINKDOWN(dp);
+		}
+	}
+
+	thread_exit();
+}
+
+/*
+ * usbgem_mii_update_link: wake up a thread sleeping on
+ * dp->link_watcher_wait_cv so that the link state is re-evaluated without
+ * waiting for the polling interval to expire.
+ */
+void
+usbgem_mii_update_link(struct usbgem_dev *dp)
+{
+	cv_signal(&dp->link_watcher_wait_cv);
+}
+
+/*
+ * usbgem_mii_probe_default: locate the PHY and record its capabilities.
+ *
+ * A non-zero address preset in dp->mii_phy_addr is tried first.  If that
+ * PHY does not answer, addresses from usbgc_mii_addr_min up to 31 are
+ * scanned, first by reading MII_STATUS only and then again after clearing
+ * MII_CONTROL.  Once a PHY answers, its status, id and extended status are
+ * stored in dp, and the pause bits of the advertisement register are
+ * test-written to find out which flow control modes the PHY can advertise.
+ *
+ * Returns USB_SUCCESS when a PHY was found and probed, USB_FAILURE when no
+ * PHY answered or a checked register access failed.
+ */
+int
+usbgem_mii_probe_default(struct usbgem_dev *dp)
+{
+	int		phy;
+	uint16_t	status;
+	int		err;
+	uint16_t	adv;
+	uint16_t	adv_org;
+
+	DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * Scan PHY
+	 */
+	dp->mii_status = 0;
+
+	/* Try default phy first */
+	if (dp->mii_phy_addr) {
+		status = usbgem_mii_read(dp, MII_STATUS, &err);
+		if (err != USB_SUCCESS) {
+			goto usberr;
+		}
+		/* all-ones or all-zeros means nothing answered */
+		if (status != 0xffff && status != 0x0000) {
+			goto PHY_found;
+		}
+
+		if (dp->mii_phy_addr < 0) {
+			/* a negative address is not scanned for; give up */
+			cmn_err(CE_NOTE,
+		    "!%s: failed to probe default internal and/or non-MII PHY",
+			    dp->name);
+			return (USB_FAILURE);
+		}
+
+		cmn_err(CE_NOTE,
+		    "!%s: failed to probe default MII PHY at %d",
+		    dp->name, dp->mii_phy_addr);
+	}
+
+	/* Try all possible address */
+	for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; phy++) {
+		dp->mii_phy_addr = phy;
+		status = usbgem_mii_read(dp, MII_STATUS, &err);
+		if (err != USB_SUCCESS) {
+			DPRINTF(0, (CE_CONT,
+			    "!%s: %s: mii_read(status) failed",
+			    dp->name, __func__));
+			goto usberr;
+		}
+
+		if (status != 0xffff && status != 0x0000) {
+			usbgem_mii_write(dp, MII_CONTROL, 0, &err);
+			if (err != USB_SUCCESS) {
+				DPRINTF(0, (CE_CONT,
+				    "!%s: %s: mii_write(control) failed",
+				    dp->name, __func__));
+				goto usberr;
+			}
+			goto PHY_found;
+		}
+	}
+
+	/* Second pass: clear the control register before reading status */
+	for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; phy++) {
+		dp->mii_phy_addr = phy;
+		usbgem_mii_write(dp, MII_CONTROL, 0, &err);
+		if (err != USB_SUCCESS) {
+			DPRINTF(0, (CE_CONT,
+			    "!%s: %s: mii_write(control) failed",
+			    dp->name, __func__));
+			goto usberr;
+		}
+		status = usbgem_mii_read(dp, MII_STATUS, &err);
+		if (err != USB_SUCCESS) {
+			DPRINTF(0, (CE_CONT,
+			    "!%s: %s: mii_read(status) failed",
+			    dp->name, __func__));
+			goto usberr;
+		}
+
+		if (status != 0xffff && status != 0) {
+			goto PHY_found;
+		}
+	}
+
+	cmn_err(CE_NOTE, "!%s: no MII PHY found", dp->name);
+	return (USB_FAILURE);
+
+PHY_found:
+	dp->mii_status = status;
+	dp->mii_status_ro = ~status;
+	/*
+	 * Widen before shifting: the 16-bit register value would otherwise
+	 * be promoted to a signed int and overflow it when bit 15 is set.
+	 */
+	dp->mii_phy_id =
+	    (uint32_t)usbgem_mii_read(dp, MII_PHYIDH, &err) << 16;
+	if (err != USB_SUCCESS) {
+		DPRINTF(0, (CE_CONT,
+		    "!%s: %s: mii_read(PHYIDH) failed",
+		    dp->name, __func__));
+		goto usberr;
+	}
+	dp->mii_phy_id |= usbgem_mii_read(dp, MII_PHYIDL, &err);
+	if (err != USB_SUCCESS) {
+		DPRINTF(0, (CE_CONT,
+		    "!%s: %s: mii_read(PHYIDL) failed",
+		    dp->name, __func__));
+		goto usberr;
+	}
+
+	if (dp->mii_phy_addr < 0) {
+		cmn_err(CE_CONT, "!%s: using internal/non-MII PHY(0x%08x)",
+		    dp->name, dp->mii_phy_id);
+	} else {
+		cmn_err(CE_CONT, "!%s: MII PHY (0x%08x) found at %d",
+		    dp->name, dp->mii_phy_id, dp->mii_phy_addr);
+	}
+
+	/* informational dump only; read errors are not checked here */
+	cmn_err(CE_CONT,
+	    "!%s: PHY control:%b, status:%b, advert:%b, lpar:%b, exp:%b",
+	    dp->name,
+	    usbgem_mii_read(dp, MII_CONTROL, &err), MII_CONTROL_BITS,
+	    status, MII_STATUS_BITS,
+	    usbgem_mii_read(dp, MII_AN_ADVERT, &err), MII_ABILITY_BITS,
+	    usbgem_mii_read(dp, MII_AN_LPABLE, &err), MII_ABILITY_BITS,
+	    usbgem_mii_read(dp, MII_AN_EXPANSION, &err), MII_AN_EXP_BITS);
+
+	dp->mii_xstatus = 0;
+	if (status & MII_STATUS_XSTATUS) {
+		dp->mii_xstatus = usbgem_mii_read(dp, MII_XSTATUS, &err);
+
+		cmn_err(CE_CONT, "!%s: xstatus:%b",
+		    dp->name, dp->mii_xstatus, MII_XSTATUS_BITS);
+	}
+	dp->mii_xstatus_ro = ~dp->mii_xstatus;
+
+	/* check if the phy can advertise pause abilities */
+	adv_org = usbgem_mii_read(dp, MII_AN_ADVERT, &err);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+
+	/* write only the pause bits and see which of them stick */
+	usbgem_mii_write(dp, MII_AN_ADVERT,
+	    MII_ABILITY_PAUSE | MII_ABILITY_ASM_DIR, &err);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+
+	adv = usbgem_mii_read(dp, MII_AN_ADVERT, &err);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+
+	if ((adv & MII_ABILITY_PAUSE) == 0) {
+		dp->ugc.usbgc_flow_control &= ~1;
+	}
+
+	if ((adv & MII_ABILITY_ASM_DIR) == 0) {
+		dp->ugc.usbgc_flow_control &= ~2;
+	}
+
+	/* put the original advertisement back */
+	usbgem_mii_write(dp, MII_AN_ADVERT, adv_org, &err);
+	if (err != USB_SUCCESS) {
+		goto usberr;
+	}
+	return (USB_SUCCESS);
+
+usberr:
+	return (USB_FAILURE);
+}
+
+/*
+ * usbgem_mii_init_default: default PHY initialization routine.
+ * It does nothing and always returns USB_SUCCESS.
+ */
+int
+usbgem_mii_init_default(struct usbgem_dev *dp)
+{
+	/* EMPTY */
+	return (USB_SUCCESS);
+}
+
+/*
+ * usbgem_mii_start: reset the link state bookkeeping and create the link
+ * watcher thread.
+ *
+ * Returns USB_SUCCESS once the thread exists and its id has been saved in
+ * dp->link_watcher_did, otherwise USB_FAILURE.
+ */
+static int
+usbgem_mii_start(struct usbgem_dev *dp)
+{
+	kthread_t	*lwth;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* make a first call of usbgem_mii_link_check() */
+	dp->link_watcher_stop = 0;
+	dp->mii_state = MII_STATE_UNKNOWN;
+	dp->mii_interval = drv_usectohz(1000*1000); /* 1sec */
+	dp->mii_last_check = ddi_get_lbolt();
+	dp->linkup_delay = 600 * drv_usectohz(1000*1000); /* 10 minutes */
+
+	lwth = thread_create(NULL, 0, usbgem_mii_link_watcher, dp, 0, &p0,
+	    TS_RUN, minclsyspri);
+	if (lwth == NULL) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to create a link watcher thread",
+		    dp->name, __func__);
+		return (USB_FAILURE);
+	}
+	/* remember the thread id; usbgem_mii_stop() joins on it */
+	dp->link_watcher_did = lwth->t_did;
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * usbgem_mii_stop: ask the link watcher thread to terminate and wait for
+ * it to exit.
+ */
+static void
+usbgem_mii_stop(struct usbgem_dev *dp)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* Ensure timer routine stopped */
+	/*
+	 * NOTE(review): the flag is set and the cv signalled without
+	 * holding link_watcher_lock -- confirm the watcher cannot go to
+	 * sleep after the signal has already been sent.
+	 */
+	dp->link_watcher_stop = 1;
+	cv_signal(&dp->link_watcher_wait_cv);
+	thread_join(dp->link_watcher_did);
+}
+
+/* ============================================================== */
+/*
+ * internal mac register operation interface
+ */
+/* ============================================================== */
+/*
+ * usbgem_mac_init: cold start
+ *
+ * Clears the fatal error timestamp and the tx/rx bookkeeping, then asks the
+ * chip specific layer to initialize the hardware.  On success mac_state
+ * becomes MAC_STATE_INITIALIZED.  A disconnected device is left alone and
+ * reported as success.  Returns the status of the chip initialization.
+ */
+static int
+usbgem_mac_init(struct usbgem_dev *dp)
+{
+	int	ret;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* nothing to do for a device that has gone away */
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		return (USB_SUCCESS);
+	}
+
+	ASSERT(dp->mac_state == MAC_STATE_STOPPED);
+
+	/* forget any fatal error recorded earlier */
+	dp->fatal_error = (clock_t)0;
+
+	/* transmit side: no request in flight, full send window */
+	mutex_enter(&dp->txlock);
+	dp->tx_busy_cnt = 0;
+	dp->tx_max_packets = dp->ugc.usbgc_tx_list_max;
+	mutex_exit(&dp->txlock);
+
+	/* receive side: no request in flight */
+	mutex_enter(&dp->rxlock);
+	dp->rx_busy_cnt = 0;
+	mutex_exit(&dp->rxlock);
+
+	ret = usbgem_hal_init_chip(dp);
+	if (ret != USB_SUCCESS) {
+		/* mac_state is left untouched on failure */
+		return (ret);
+	}
+
+	dp->mac_state = MAC_STATE_INITIALIZED;
+	return (ret);
+}
+
+/*
+ * usbgem_mac_start: warm start
+ *
+ * Moves an initialized mac to MAC_STATE_ONLINE, starts the chip, starts
+ * polling of the interrupt pipe when one is configured and queues the
+ * initial receive requests.
+ *
+ * Returns USB_SUCCESS on success, or when the device is disconnected (in
+ * which case nothing is done).  On any failure USB_FAILURE is returned and
+ * dp->fatal_error is set to a timestamp unless one is already recorded.
+ */
+static int
+usbgem_mac_start(struct usbgem_dev *dp)
+{
+	int	err;
+	usb_flags_t	flags = 0;
+	usb_intr_req_t	*req;
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_pipe_state_t	p_state;
+#endif
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		/* do nothing but don't return failure */
+		return (USB_SUCCESS);
+	}
+
+	if (dp->mac_state != MAC_STATE_INITIALIZED) {
+		/*
+		 * NOTE(review): the original comment here says "don't
+		 * return failure", yet this path returns USB_FAILURE via
+		 * label x, where the ASSERT on MAC_STATE_ONLINE cannot
+		 * hold -- confirm the intended behaviour.
+		 */
+		DPRINTF(0, (CE_CONT,
+		    "!%s: %s: mac_state(%d) is not MAC_STATE_INITIALIZED",
+		    dp->name, __func__, dp->mac_state));
+		goto x;
+	}
+
+	dp->mac_state = MAC_STATE_ONLINE;
+
+	if (usbgem_hal_start_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_NOTE,
+		    "!%s: %s: usb error was detected during start_chip",
+		    dp->name, __func__);
+		goto x;
+	}
+
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_pipe_get_state(dp->intr_pipe, &p_state, 0);
+	ASSERT(p_state == USB_PIPE_STATE_IDLE);
+#endif /* USBGEM_DEBUG_LEVEL */
+
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+
+		/* make a request for interrupt */
+
+		req = usb_alloc_intr_req(dp->dip, 0, USB_FLAGS_SLEEP);
+		if (req == NULL) {
+			cmn_err(CE_WARN, "!%s: %s: failed to allocate intreq",
+			    dp->name, __func__);
+			goto x;
+		}
+		req->intr_data = NULL;
+		req->intr_client_private = (usb_opaque_t)dp;
+		req->intr_timeout = 0;
+		req->intr_attributes =
+		    USB_ATTRS_SHORT_XFER_OK | USB_ATTRS_AUTOCLEARING;
+		req->intr_len = dp->ep_intr->wMaxPacketSize;
+		/* the same handler serves normal and exception callbacks */
+		req->intr_cb = usbgem_intr_cb;
+		req->intr_exc_cb = usbgem_intr_cb;
+		req->intr_completion_reason = 0;
+		req->intr_cb_flags = 0;
+
+		err = usb_pipe_intr_xfer(dp->intr_pipe, req, flags);
+		if (err != USB_SUCCESS) {
+			cmn_err(CE_WARN,
+			    "%s: err:%d failed to start polling of intr pipe",
+			    dp->name, err);
+			/* a rejected request is still ours; don't leak it */
+			usb_free_intr_req(req);
+			goto x;
+		}
+	}
+
+	/* kick to receive the first packet */
+	if (usbgem_init_rx_buf(dp) != USB_SUCCESS) {
+		goto err_stop_intr;
+	}
+	dp->rx_active = B_TRUE;
+
+	return (USB_SUCCESS);
+
+err_stop_intr:
+	/* stop the interrupt pipe */
+	DPRINTF(0, (CE_CONT, "!%s: %s: FAULURE", dp->name, __func__));
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+		usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP);
+	}
+x:
+	ASSERT(dp->mac_state == MAC_STATE_ONLINE);
+	/* we use another flag to indicate error state. */
+	if (dp->fatal_error == (clock_t)0) {
+		dp->fatal_error = usbgem_timestamp_nz();
+	}
+	return (USB_FAILURE);
+}
+
+/*
+ * usbgem_mac_stop: stop the mac and drain all outstanding usb requests.
+ *
+ * new_state must be MAC_STATE_STOPPED or MAC_STATE_DISCONNECTED and is
+ * stored into dp->mac_state.  The chip itself is stopped only when moving
+ * to MAC_STATE_STOPPED or when graceful is set.  The function returns
+ * after both rx_busy_cnt and tx_busy_cnt have dropped to zero, and always
+ * returns USB_SUCCESS.
+ */
+static int
+usbgem_mac_stop(struct usbgem_dev *dp, int new_state, boolean_t graceful)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * we must have writer lock for dev_state_lock
+	 */
+	ASSERT(new_state == MAC_STATE_STOPPED
+	    || new_state == MAC_STATE_DISCONNECTED);
+
+	/* stop polling interrupt pipe */
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+		usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP);
+	}
+
+	if (new_state == MAC_STATE_STOPPED || graceful) {
+		/* stop the nic hardware completely */
+		if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) {
+			/* fall back to a reset when the stop failed */
+			(void) usbgem_hal_reset_chip(dp);
+		}
+	}
+
+	/* stop preparing new rx packets and sending new packets */
+	dp->mac_state = new_state;
+
+	/* other processors must get mac_state correctly after here */
+	membar_producer();
+
+	/* cancel all requests we have sent */
+	usb_pipe_reset(dp->dip, dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	usb_pipe_reset(dp->dip, dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %s: rx_busy_cnt:%d tx_busy_cnt:%d",
+	    dp->name, __func__, dp->rx_busy_cnt, dp->tx_busy_cnt));
+
+	/*
+	 * Here all rx packets have been cancelled and their callback
+	 * functions have been executed, because we called usb_pipe_reset
+	 * synchronously.
+	 * So actually we just ensure rx_busy_cnt == 0.
+	 */
+	mutex_enter(&dp->rxlock);
+	while (dp->rx_busy_cnt > 0) {
+		cv_wait(&dp->rx_drain_cv, &dp->rxlock);
+	}
+	mutex_exit(&dp->rxlock);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: rx_busy_cnt is %d now",
+	    dp->name, __func__, dp->rx_busy_cnt));
+
+	/* likewise wait until no transmit request is in flight */
+	mutex_enter(&dp->txlock);
+	while (dp->tx_busy_cnt > 0) {
+		cv_wait(&dp->tx_drain_cv, &dp->txlock);
+	}
+	mutex_exit(&dp->txlock);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: tx_busy_cnt is %d now",
+	    dp->name, __func__, dp->tx_busy_cnt));
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * usbgem_add_multicast: register the multicast address ep.
+ *
+ * The address is appended to dp->mc_list while fewer than USBGEM_MAXMC
+ * addresses have been requested; beyond that only the request counter
+ * grows and RXMODE_MULTI_OVF is set in dp->rxmode.  Unless the device is
+ * disconnected, the rx filter of the hardware is then reprogrammed.
+ *
+ * Returns the status of the rx filter update, or USB_SUCCESS when the
+ * device is disconnected and the hardware is not touched.
+ */
+static int
+usbgem_add_multicast(struct usbgem_dev *dp, const uint8_t *ep)
+{
+	int	cnt;
+	/* stays USB_SUCCESS when the hardware update is skipped */
+	int	err = USB_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+	if (dp->mc_count_req++ < USBGEM_MAXMC) {
+		/* append the new address at the end of the mclist */
+		cnt = dp->mc_count;
+		bcopy(ep, dp->mc_list[cnt].addr.ether_addr_octet,
+		    ETHERADDRL);
+		if (dp->ugc.usbgc_multicast_hash) {
+			dp->mc_list[cnt].hash =
+			    (*dp->ugc.usbgc_multicast_hash)(dp, ep);
+		}
+		dp->mc_count = cnt + 1;
+	}
+
+	if (dp->mc_count_req != dp->mc_count) {
+		/* multicast address list overflow */
+		dp->rxmode |= RXMODE_MULTI_OVF;
+	} else {
+		dp->rxmode &= ~RXMODE_MULTI_OVF;
+	}
+
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		/* tell new multicast list to the hardware */
+		err = usbgem_hal_set_rx_filter(dp);
+	}
+	sema_v(&dp->rxfilter_lock);
+
+	return (err);
+}
+
+/*
+ * usbgem_remove_multicast: unregister the multicast address ep.
+ *
+ * The request counter is decremented and, when the address is present in
+ * dp->mc_list, its entry is removed by moving the following entries
+ * forward.  RXMODE_MULTI_OVF is kept in sync with whether the two counters
+ * differ.  Unless the device is disconnected, the rx filter of the
+ * hardware is then reprogrammed.
+ *
+ * Returns the status of the rx filter update, or USB_SUCCESS when the
+ * device is disconnected and the hardware is not touched.
+ */
+static int
+usbgem_remove_multicast(struct usbgem_dev *dp, const uint8_t *ep)
+{
+	size_t		len;
+	int		i;
+	int		cnt;
+	/* stays USB_SUCCESS when the hardware update is skipped */
+	int		err = USB_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+	dp->mc_count_req--;
+	cnt = dp->mc_count;
+	for (i = 0; i < cnt; i++) {
+		/* bcmp() is non-zero when the addresses differ */
+		if (bcmp(ep, &dp->mc_list[i].addr, ETHERADDRL)) {
+			continue;
+		}
+		/* shrink the mclist by copying forward */
+		len = (cnt - (i + 1)) * sizeof (*dp->mc_list);
+		if (len > 0) {
+			bcopy(&dp->mc_list[i+1], &dp->mc_list[i], len);
+		}
+		dp->mc_count--;
+		break;
+	}
+
+	if (dp->mc_count_req != dp->mc_count) {
+		/* multicast address list overflow */
+		dp->rxmode |= RXMODE_MULTI_OVF;
+	} else {
+		dp->rxmode &= ~RXMODE_MULTI_OVF;
+	}
+
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		/* tell new multicast list to the hardware */
+		err = usbgem_hal_set_rx_filter(dp);
+	}
+	sema_v(&dp->rxfilter_lock);
+
+	return (err);
+}
+
+
+/* ============================================================== */
+/*
+ * ioctl
+ */
+/* ============================================================== */
+/*
+ * Disposition of an ioctl request: tells what kind of reply is to be sent
+ * and whether a restart is required first.  IOC_INVAL is -1 and the
+ * remaining values count up from 0.
+ */
+enum ioc_reply {
+	IOC_INVAL = -1,				/* bad, NAK with EINVAL	*/
+	IOC_DONE,				/* OK, reply sent	*/
+	IOC_ACK,				/* OK, just send ACK	*/
+	IOC_REPLY,				/* OK, just send reply	*/
+	IOC_RESTART_ACK,			/* OK, restart & ACK	*/
+	IOC_RESTART_REPLY			/* OK, restart & reply	*/
+};
+
+
+#ifdef USBGEM_CONFIG_MAC_PROP
+/*
+ * usbgem_get_def_val: store the default value of property pr_num in pr_val.
+ *
+ * Defaults are derived from the PHY status words saved in dp (mii_status,
+ * mii_xstatus) and from the configured flow control mode.  Capability and
+ * autoneg values are written as a single uint8_t; the flow control value
+ * is written as a link_flowctrl_t.
+ *
+ * Returns 0 on success, EINVAL when pr_val is too small for a flow control
+ * value and ENOTSUP for any other property.
+ */
+static int
+usbgem_get_def_val(struct usbgem_dev *dp, mac_prop_id_t pr_num,
+    uint_t pr_valsize, void *pr_val)
+{
+	link_flowctrl_t fl;
+	int err = 0;
+
+	ASSERT(pr_valsize > 0);
+	switch (pr_num) {
+	case MAC_PROP_AUTONEG:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG);
+		break;
+
+	case MAC_PROP_FLOWCTRL:
+		if (pr_valsize < sizeof (link_flowctrl_t)) {
+			return (EINVAL);
+		}
+		switch (dp->ugc.usbgc_flow_control) {
+		case FLOW_CONTROL_NONE:
+			fl = LINK_FLOWCTRL_NONE;
+			break;
+		case FLOW_CONTROL_SYMMETRIC:
+			fl = LINK_FLOWCTRL_BI;
+			break;
+		case FLOW_CONTROL_TX_PAUSE:
+			fl = LINK_FLOWCTRL_TX;
+			break;
+		case FLOW_CONTROL_RX_PAUSE:
+			fl = LINK_FLOWCTRL_RX;
+			break;
+		default:
+			/* never hand back an uninitialized value */
+			fl = LINK_FLOWCTRL_NONE;
+			break;
+		}
+		bcopy(&fl, pr_val, sizeof (fl));
+		break;
+
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_EN_1000FDX_CAP:
+		*(uint8_t *)pr_val =
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD);
+		break;
+
+	case MAC_PROP_ADV_1000HDX_CAP:
+	case MAC_PROP_EN_1000HDX_CAP:
+		*(uint8_t *)pr_val =
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASET) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX);
+		break;
+
+	case MAC_PROP_ADV_100T4_CAP:
+	case MAC_PROP_EN_100T4_CAP:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4);
+		break;
+
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_EN_100FDX_CAP:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD);
+		break;
+
+	case MAC_PROP_ADV_100HDX_CAP:
+	case MAC_PROP_EN_100HDX_CAP:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX);
+		break;
+
+	case MAC_PROP_ADV_10FDX_CAP:
+	case MAC_PROP_EN_10FDX_CAP:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_10_FD);
+		break;
+
+	case MAC_PROP_ADV_10HDX_CAP:
+	case MAC_PROP_EN_10HDX_CAP:
+		*(uint8_t *)pr_val =
+		    BOOLEAN(dp->mii_status & MII_STATUS_10);
+		break;
+
+	default:
+		err = ENOTSUP;
+		break;
+	}
+	return (err);
+}
+
+#ifdef MAC_VERSION_V1
+/*
+ * usbgem_m_propinfo: describe property pr_num through the handle prh.
+ *
+ * Status-like and advertised-capability properties are marked read-only.
+ * For each "enable" property a default taken from the saved PHY status is
+ * supplied when the corresponding bit is clear in the *_ro mask; otherwise
+ * the property is marked read-only.  Flow control gets a default derived
+ * from the configured mode and MTU gets its valid range.
+ */
+static void
+usbgem_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    mac_prop_info_handle_t prh)
+{
+	struct usbgem_dev *dp = arg;
+	link_flowctrl_t fl;
+
+	/*
+	 * By default permissions are read/write unless specified
+	 * otherwise by the driver.
+	 */
+
+	switch (pr_num) {
+	case MAC_PROP_DUPLEX:
+	case MAC_PROP_SPEED:
+	case MAC_PROP_STATUS:
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_ADV_1000HDX_CAP:
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_ADV_100HDX_CAP:
+	case MAC_PROP_ADV_10FDX_CAP:
+	case MAC_PROP_ADV_10HDX_CAP:
+	case MAC_PROP_ADV_100T4_CAP:
+	case MAC_PROP_EN_100T4_CAP:
+		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		break;
+
+	case MAC_PROP_EN_1000FDX_CAP:
+		/* writable when either 1000M full duplex flavour is */
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(
+			    dp->mii_xstatus & MII_XSTATUS_1000BASET_FD));
+		} else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD)
+		    == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(
+			    dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_EN_1000HDX_CAP:
+		/* writable when either 1000M half duplex flavour is */
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(
+			    dp->mii_xstatus & MII_XSTATUS_1000BASET));
+		} else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(
+			    dp->mii_xstatus & MII_XSTATUS_1000BASEX));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_EN_100FDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_EN_100HDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_EN_10FDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(dp->mii_status & MII_STATUS_10_FD));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_EN_10HDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_10) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(dp->mii_status & MII_STATUS_10));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_AUTONEG:
+		if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) {
+			mac_prop_info_set_default_uint8(prh,
+			    BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG));
+		} else {
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
+		}
+		break;
+
+	case MAC_PROP_FLOWCTRL:
+		switch (dp->ugc.usbgc_flow_control) {
+		case FLOW_CONTROL_NONE:
+			fl = LINK_FLOWCTRL_NONE;
+			break;
+		case FLOW_CONTROL_SYMMETRIC:
+			fl = LINK_FLOWCTRL_BI;
+			break;
+		case FLOW_CONTROL_TX_PAUSE:
+			fl = LINK_FLOWCTRL_TX;
+			break;
+		case FLOW_CONTROL_RX_PAUSE:
+			fl = LINK_FLOWCTRL_RX;
+			break;
+		default:
+			/* never report an uninitialized default */
+			fl = LINK_FLOWCTRL_NONE;
+			break;
+		}
+		mac_prop_info_set_default_link_flowctrl(prh, fl);
+		break;
+
+	case MAC_PROP_MTU:
+		mac_prop_info_set_range_uint32(prh,
+		    dp->ugc.usbgc_min_mtu, dp->ugc.usbgc_max_mtu);
+		break;
+
+	case MAC_PROP_PRIVATE:
+		break;
+	}
+}
+#endif
+
+/*
+ * usbgem_m_setprop: set the value of property pr_num from pr_val.
+ *
+ * The "enable" capability properties and autoneg take a uint8_t, flow
+ * control takes a link_flowctrl_t.  A capability can be changed only when
+ * its bit is clear in the matching *_ro mask.  When a value actually
+ * changes, the forced mode is re-chosen, mii_state is reset to
+ * MII_STATE_UNKNOWN and the link watcher is signalled so that the PHY gets
+ * reprogrammed.  The MTU can only be "set" to its current value.
+ *
+ * Returns 0 on success, EINVAL for a bad value and ENOTSUP for read-only
+ * or unknown properties.
+ */
+static int
+usbgem_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, const void *pr_val)
+{
+	struct usbgem_dev *dp = arg;
+	int err = 0;
+	boolean_t	update = B_FALSE;
+	link_flowctrl_t flowctrl;
+	uint32_t new_mtu;
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	switch (pr_num) {
+	case MAC_PROP_EN_1000FDX_CAP:
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0 ||
+		    (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD) == 0) {
+			if (dp->anadv_1000fdx != *(uint8_t *)pr_val) {
+				dp->anadv_1000fdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_EN_1000HDX_CAP:
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0 ||
+		    (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) {
+			if (dp->anadv_1000hdx != *(uint8_t *)pr_val) {
+				dp->anadv_1000hdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_EN_100FDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) {
+			if (dp->anadv_100fdx != *(uint8_t *)pr_val) {
+				dp->anadv_100fdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_EN_100HDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) {
+			if (dp->anadv_100hdx != *(uint8_t *)pr_val) {
+				dp->anadv_100hdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_EN_10FDX_CAP:
+		if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) {
+			if (dp->anadv_10fdx != *(uint8_t *)pr_val) {
+				dp->anadv_10fdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_EN_10HDX_CAP:
+		/* half duplex is governed by MII_STATUS_10, not _10_FD */
+		if ((dp->mii_status_ro & MII_STATUS_10) == 0) {
+			if (dp->anadv_10hdx != *(uint8_t *)pr_val) {
+				dp->anadv_10hdx = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_AUTONEG:
+		if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) {
+			if (dp->anadv_autoneg != *(uint8_t *)pr_val) {
+				dp->anadv_autoneg = *(uint8_t *)pr_val;
+				update = B_TRUE;
+			}
+		} else {
+			err = ENOTSUP;
+		}
+		break;
+
+	case MAC_PROP_FLOWCTRL:
+		bcopy(pr_val, &flowctrl, sizeof (flowctrl));
+
+		switch (flowctrl) {
+		default:
+			err = EINVAL;
+			break;
+
+		case LINK_FLOWCTRL_NONE:
+			if (dp->flow_control != FLOW_CONTROL_NONE) {
+				dp->flow_control = FLOW_CONTROL_NONE;
+				update = B_TRUE;
+			}
+			break;
+
+		case LINK_FLOWCTRL_RX:
+			if (dp->flow_control != FLOW_CONTROL_RX_PAUSE) {
+				dp->flow_control = FLOW_CONTROL_RX_PAUSE;
+				update = B_TRUE;
+			}
+			break;
+
+		case LINK_FLOWCTRL_TX:
+			if (dp->flow_control != FLOW_CONTROL_TX_PAUSE) {
+				dp->flow_control = FLOW_CONTROL_TX_PAUSE;
+				update = B_TRUE;
+			}
+			break;
+
+		case LINK_FLOWCTRL_BI:
+			if (dp->flow_control != FLOW_CONTROL_SYMMETRIC) {
+				dp->flow_control = FLOW_CONTROL_SYMMETRIC;
+				update = B_TRUE;
+			}
+			break;
+		}
+		break;
+
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_ADV_1000HDX_CAP:
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_ADV_100HDX_CAP:
+	case MAC_PROP_ADV_10FDX_CAP:
+	case MAC_PROP_ADV_10HDX_CAP:
+	case MAC_PROP_STATUS:
+	case MAC_PROP_SPEED:
+	case MAC_PROP_DUPLEX:
+		err = ENOTSUP; /* read-only prop. Can't set this. */
+		break;
+
+	case MAC_PROP_MTU:
+		/* changing the mtu is not supported; accept a no-op only */
+		bcopy(pr_val, &new_mtu, sizeof (new_mtu));
+		if (new_mtu != dp->mtu) {
+			err = EINVAL;
+		}
+		break;
+
+	case MAC_PROP_PRIVATE:
+		err = ENOTSUP;
+		break;
+
+	default:
+		err = ENOTSUP;
+		break;
+	}
+
+	if (update) {
+		/* sync with PHY */
+		usbgem_choose_forcedmode(dp);
+		dp->mii_state = MII_STATE_UNKNOWN;
+		cv_signal(&dp->link_watcher_wait_cv);
+	}
+	rw_exit(&dp->dev_state_lock);
+	return (err);
+}
+
+/*
+ * usbgem_m_getprop: read the current value of property pr_num into pr_val.
+ *
+ * pr_val is zeroed first.  Without MAC_VERSION_V1 the function also
+ * reports the access permission through perm, serves default values when
+ * MAC_PROP_DEFAULT is set in pr_flags, and reports the possible MTU range.
+ *
+ * Returns 0 on success, EINVAL when pr_valsize is zero or too small for
+ * the property, and ENOTSUP for unsupported properties.
+ */
+static int
+#ifdef MAC_VERSION_V1
+usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, void *pr_val)
+#else
+usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
+#endif
+{
+	struct usbgem_dev *dp = arg;
+	int err = 0;
+	link_flowctrl_t flowctrl;
+	uint64_t tmp = 0;
+
+	if (pr_valsize == 0) {
+		return (EINVAL);
+	}
+#ifndef MAC_VERSION_V1
+	/* read/write unless a case below says otherwise */
+	*perm = MAC_PROP_PERM_RW;
+#endif
+	bzero(pr_val, pr_valsize);
+#ifndef MAC_VERSION_V1
+	if ((pr_flags & MAC_PROP_DEFAULT) && (pr_num != MAC_PROP_PRIVATE)) {
+		return (usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val));
+	}
+#endif
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	switch (pr_num) {
+	case MAC_PROP_DUPLEX:
+#ifndef MAC_VERSION_V1
+		*perm = MAC_PROP_PERM_READ;
+#endif
+		if (pr_valsize >= sizeof (link_duplex_t)) {
+			if (dp->mii_state != MII_STATE_LINKUP) {
+				*(link_duplex_t *)pr_val = LINK_DUPLEX_UNKNOWN;
+			} else if (dp->full_duplex) {
+				*(link_duplex_t *)pr_val = LINK_DUPLEX_FULL;
+			} else {
+				*(link_duplex_t *)pr_val = LINK_DUPLEX_HALF;
+			}
+		} else {
+			err = EINVAL;
+		}
+		break;
+	case MAC_PROP_SPEED:
+#ifndef MAC_VERSION_V1
+		*perm = MAC_PROP_PERM_READ;
+#endif
+		if (pr_valsize >= sizeof (uint64_t)) {
+			/* speed is reported in bits per second */
+			switch (dp->speed) {
+			case USBGEM_SPD_1000:
+				tmp = 1000000000;
+				break;
+			case USBGEM_SPD_100:
+				tmp = 100000000;
+				break;
+			case USBGEM_SPD_10:
+				tmp = 10000000;
+				break;
+			default:
+				tmp = 0;
+			}
+			bcopy(&tmp, pr_val, sizeof (tmp));
+		} else {
+			err = EINVAL;
+		}
+		break;
+
+	case MAC_PROP_AUTONEG:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_CANAUTONEG) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_autoneg;
+		break;
+
+	case MAC_PROP_FLOWCTRL:
+		if (pr_valsize >= sizeof (link_flowctrl_t)) {
+			switch (dp->flow_control) {
+			case FLOW_CONTROL_NONE:
+				flowctrl = LINK_FLOWCTRL_NONE;
+				break;
+			case FLOW_CONTROL_RX_PAUSE:
+				flowctrl = LINK_FLOWCTRL_RX;
+				break;
+			case FLOW_CONTROL_TX_PAUSE:
+				flowctrl = LINK_FLOWCTRL_TX;
+				break;
+			case FLOW_CONTROL_SYMMETRIC:
+				flowctrl = LINK_FLOWCTRL_BI;
+				break;
+			default:
+				/* never hand back an uninitialized value */
+				flowctrl = LINK_FLOWCTRL_NONE;
+				break;
+			}
+			bcopy(&flowctrl, pr_val, sizeof (flowctrl));
+		} else {
+			err = EINVAL;
+		}
+		break;
+
+	case MAC_PROP_ADV_1000FDX_CAP:
+	case MAC_PROP_ADV_1000HDX_CAP:
+	case MAC_PROP_ADV_100FDX_CAP:
+	case MAC_PROP_ADV_100HDX_CAP:
+	case MAC_PROP_ADV_10FDX_CAP:
+	case MAC_PROP_ADV_10HDX_CAP:
+	case MAC_PROP_ADV_100T4_CAP:
+		/* these are answered from the saved PHY status words */
+		err = usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val);
+		break;
+
+	case MAC_PROP_EN_1000FDX_CAP:
+#ifndef MAC_VERSION_V1
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) &&
+		    (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD)) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_1000fdx;
+		break;
+
+	case MAC_PROP_EN_1000HDX_CAP:
+#ifndef MAC_VERSION_V1
+		if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) &&
+		    (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX)) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_1000hdx;
+		break;
+
+	case MAC_PROP_EN_100FDX_CAP:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_100_BASEX_FD) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_100fdx;
+		break;
+
+	case MAC_PROP_EN_100HDX_CAP:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_100_BASEX) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_100hdx;
+		break;
+
+	case MAC_PROP_EN_10FDX_CAP:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_10_FD) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_10fdx;
+		break;
+
+	case MAC_PROP_EN_10HDX_CAP:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_10) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_10hdx;
+		break;
+
+	case MAC_PROP_EN_100T4_CAP:
+#ifndef MAC_VERSION_V1
+		if (dp->mii_status_ro & MII_STATUS_100_BASE_T4) {
+			*perm = MAC_PROP_PERM_READ;
+		}
+#endif
+		*(uint8_t *)pr_val = dp->anadv_100t4;
+		break;
+
+	case MAC_PROP_PRIVATE:
+		err = ENOTSUP;
+		break;
+
+#ifndef MAC_VERSION_V1
+	case MAC_PROP_MTU: {
+		mac_propval_range_t range;
+		/* only the possible range can be queried here */
+		if (!(pr_flags & MAC_PROP_POSSIBLE)) {
+			err = ENOTSUP;
+			break;
+		}
+		if (pr_valsize < sizeof (mac_propval_range_t)) {
+			err = EINVAL;
+			break;
+		}
+		range.mpr_count = 1;
+		range.mpr_type = MAC_PROPVAL_UINT32;
+		range.range_uint32[0].mpur_min = ETHERMTU;
+		range.range_uint32[0].mpur_max = dp->mtu;
+		bcopy(&range, pr_val, sizeof (range));
+		break;
+	}
+#endif
+	default:
+		err = ENOTSUP;
+		break;
+	}
+
+	rw_exit(&dp->dev_state_lock);
+	return (err);
+}
+#endif /* USBGEM_CONFIG_MAC_PROP */
+
+#ifdef USBGEM_CONFIG_ND
+/* ============================================================== */
+/*
+ * ND interface
+ */
+/* ============================================================== */
+/*
+ * Item numbers of the ND parameters.  usbgem_param_get() switches on these
+ * values to pick the value to report.  PARAM_COUNT is last, so it equals
+ * the number of items defined before it.
+ */
+enum {
+	/* abilities taken from the saved PHY status / configuration */
+	PARAM_AUTONEG_CAP,
+	PARAM_PAUSE_CAP,
+	PARAM_ASYM_PAUSE_CAP,
+	PARAM_1000FDX_CAP,
+	PARAM_1000HDX_CAP,
+	PARAM_100T4_CAP,
+	PARAM_100FDX_CAP,
+	PARAM_100HDX_CAP,
+	PARAM_10FDX_CAP,
+	PARAM_10HDX_CAP,
+
+	/* values held in the dp->anadv_* fields */
+	PARAM_ADV_AUTONEG_CAP,
+	PARAM_ADV_PAUSE_CAP,
+	PARAM_ADV_ASYM_PAUSE_CAP,
+	PARAM_ADV_1000FDX_CAP,
+	PARAM_ADV_1000HDX_CAP,
+	PARAM_ADV_100T4_CAP,
+	PARAM_ADV_100FDX_CAP,
+	PARAM_ADV_100HDX_CAP,
+	PARAM_ADV_10FDX_CAP,
+	PARAM_ADV_10HDX_CAP,
+	PARAM_ADV_1000T_MS,
+
+	/* link partner abilities */
+	PARAM_LP_AUTONEG_CAP,
+	PARAM_LP_PAUSE_CAP,
+	PARAM_LP_ASYM_PAUSE_CAP,
+	PARAM_LP_1000FDX_CAP,
+	PARAM_LP_1000HDX_CAP,
+	PARAM_LP_100T4_CAP,
+	PARAM_LP_100FDX_CAP,
+	PARAM_LP_100HDX_CAP,
+	PARAM_LP_10FDX_CAP,
+	PARAM_LP_10HDX_CAP,
+
+	/* presumably the current link state -- TODO confirm against users */
+	PARAM_LINK_STATUS,
+	PARAM_LINK_SPEED,
+	PARAM_LINK_DUPLEX,
+
+	PARAM_LINK_AUTONEG,
+	PARAM_LINK_RX_PAUSE,
+	PARAM_LINK_TX_PAUSE,
+
+	PARAM_LOOP_MODE,
+	PARAM_MSI_CNT,
+#ifdef DEBUG_RESUME
+	PARAM_RESUME_TEST,
+#endif
+
+	PARAM_COUNT
+};
+
+/*
+ * Argument passed to the ND get handler (usbgem_param_get), which reads
+ * both fields from its caddr_t argument: the device and the PARAM_* item
+ * being accessed.
+ */
+struct usbgem_nd_arg {
+	struct usbgem_dev	*dp;	/* device the parameter belongs to */
+	int		item;		/* PARAM_* item number */
+};
+
+/*
+ * usbgem_param_get: ndd "get" callback shared by all parameters.
+ *
+ * arg points to the struct usbgem_nd_arg registered for the parameter; its
+ * `item' selects which value is reported.  The value is formatted as a
+ * decimal number into mp with mi_mpprintf().  Always returns 0.
+ *
+ * For an item without a case below a warning is logged and 0 is reported;
+ * val is initialized so that no indeterminate value is ever formatted.
+ */
+static int
+usbgem_param_get(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *credp)
+{
+	struct usbgem_dev	*dp = ((struct usbgem_nd_arg *)(void *)arg)->dp;
+	int		item = ((struct usbgem_nd_arg *)(void *)arg)->item;
+	long		val = 0;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called, item:%d",
+	    dp->name, __func__, item));
+
+	switch (item) {
+	case PARAM_AUTONEG_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG);
+		DPRINTF(1, (CE_CONT, "autoneg_cap:%ld", val));
+		break;
+
+	case PARAM_PAUSE_CAP:
+		val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE;
+		break;
+
+	case PARAM_ASYM_PAUSE_CAP:
+		val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC;
+		break;
+
+	case PARAM_1000FDX_CAP:
+		val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD);
+		break;
+
+	case PARAM_1000HDX_CAP:
+		val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX);
+		break;
+
+	case PARAM_100T4_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4);
+		break;
+
+	case PARAM_100FDX_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD);
+		break;
+
+	case PARAM_100HDX_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX);
+		break;
+
+	case PARAM_10FDX_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD);
+		break;
+
+	case PARAM_10HDX_CAP:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_10);
+		break;
+
+	case PARAM_ADV_AUTONEG_CAP:
+		val = dp->anadv_autoneg;
+		break;
+
+	case PARAM_ADV_PAUSE_CAP:
+		val = dp->anadv_pause;
+		break;
+
+	case PARAM_ADV_ASYM_PAUSE_CAP:
+		val = dp->anadv_asmpause;
+		break;
+
+	case PARAM_ADV_1000FDX_CAP:
+		val = dp->anadv_1000fdx;
+		break;
+
+	case PARAM_ADV_1000HDX_CAP:
+		val = dp->anadv_1000hdx;
+		break;
+
+	case PARAM_ADV_100T4_CAP:
+		val = dp->anadv_100t4;
+		break;
+
+	case PARAM_ADV_100FDX_CAP:
+		val = dp->anadv_100fdx;
+		break;
+
+	case PARAM_ADV_100HDX_CAP:
+		val = dp->anadv_100hdx;
+		break;
+
+	case PARAM_ADV_10FDX_CAP:
+		val = dp->anadv_10fdx;
+		break;
+
+	case PARAM_ADV_10HDX_CAP:
+		val = dp->anadv_10hdx;
+		break;
+
+	case PARAM_ADV_1000T_MS:
+		val = dp->anadv_1000t_ms;
+		break;
+
+	case PARAM_LP_AUTONEG_CAP:
+		val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN);
+		break;
+
+	case PARAM_LP_PAUSE_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE);
+		break;
+
+	case PARAM_LP_ASYM_PAUSE_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR);
+		break;
+
+	case PARAM_LP_1000FDX_CAP:
+		val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL);
+		break;
+
+	case PARAM_LP_1000HDX_CAP:
+		val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF);
+		break;
+
+	case PARAM_LP_100T4_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4);
+		break;
+
+	case PARAM_LP_100FDX_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD);
+		break;
+
+	case PARAM_LP_100HDX_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX);
+		break;
+
+	case PARAM_LP_10FDX_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD);
+		break;
+
+	case PARAM_LP_10HDX_CAP:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T);
+		break;
+
+	case PARAM_LINK_STATUS:
+		val = (dp->mii_state == MII_STATE_LINKUP);
+		break;
+
+	case PARAM_LINK_SPEED:
+		val = usbgem_speed_value[dp->speed];
+		break;
+
+	case PARAM_LINK_DUPLEX:
+		/* 0: link down, 1: half duplex, 2: full duplex */
+		val = 0;
+		if (dp->mii_state == MII_STATE_LINKUP) {
+			val = dp->full_duplex ? 2 : 1;
+		}
+		break;
+
+	case PARAM_LINK_AUTONEG:
+		val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN);
+		break;
+
+	case PARAM_LINK_RX_PAUSE:
+		val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) ||
+		    (dp->flow_control == FLOW_CONTROL_RX_PAUSE);
+		break;
+
+	case PARAM_LINK_TX_PAUSE:
+		val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) ||
+		    (dp->flow_control == FLOW_CONTROL_TX_PAUSE);
+		break;
+
+#ifdef DEBUG_RESUME
+	case PARAM_RESUME_TEST:
+		val = 0;
+		break;
+#endif
+	default:
+		/* val keeps its initial 0, so the reply is still defined */
+		cmn_err(CE_WARN, "%s: unimplemented ndd control (%d)",
+		    dp->name, item);
+		break;
+	}
+
+	(void) mi_mpprintf(mp, "%ld", val);
+
+	return (0);
+}
+
+/*
+ * usbgem_param_set: ndd "set" callback shared by the writable parameters.
+ *
+ * value is parsed as a decimal number.  Each adv_* item accepts 0 or 1
+ * (adv_1000t_ms also accepts 2), and a non-zero value is refused unless the
+ * matching ability bit is present.  On success the anadv_* field is
+ * updated, the forced mode is re-evaluated and the link state is reset to
+ * MII_STATE_UNKNOWN.
+ *
+ * Returns 0 on success, EINVAL for an unparsable or unacceptable value.
+ */
+static int
+usbgem_param_set(queue_t *q,
+    mblk_t *mp, char *value, caddr_t arg, cred_t *credp)
+{
+	struct usbgem_nd_arg	*nd_arg = (struct usbgem_nd_arg *)(void *)arg;
+	struct usbgem_dev	*dp = nd_arg->dp;
+	int		item = nd_arg->item;
+	long		val;
+	char		*endp;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* reject conversion errors and strings without any digits */
+	if (ddi_strtol(value, &endp, 10, &val) != 0 || endp == value) {
+		return (EINVAL);
+	}
+
+	switch (item) {
+	case PARAM_ADV_AUTONEG_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_CANAUTONEG) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_autoneg = (int)val;
+		break;
+
+	case PARAM_ADV_PAUSE_CAP:
+		if (val != 0 && (val != 1 ||
+		    dp->ugc.usbgc_flow_control == FLOW_CONTROL_NONE)) {
+			return (EINVAL);
+		}
+		dp->anadv_pause = (int)val;
+		break;
+
+	case PARAM_ADV_ASYM_PAUSE_CAP:
+		if (val != 0 && (val != 1 ||
+		    dp->ugc.usbgc_flow_control <= FLOW_CONTROL_SYMMETRIC)) {
+			return (EINVAL);
+		}
+		dp->anadv_asmpause = (int)val;
+		break;
+
+	case PARAM_ADV_1000FDX_CAP:
+		if (val != 0 && (val != 1 || (dp->mii_xstatus &
+		    (MII_XSTATUS_1000BASET_FD |
+		    MII_XSTATUS_1000BASEX_FD)) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_1000fdx = (int)val;
+		break;
+
+	case PARAM_ADV_1000HDX_CAP:
+		if (val != 0 && (val != 1 || (dp->mii_xstatus &
+		    (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASEX)) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_1000hdx = (int)val;
+		break;
+
+	case PARAM_ADV_100T4_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_100_BASE_T4) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_100t4 = (int)val;
+		break;
+
+	case PARAM_ADV_100FDX_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_100_BASEX_FD) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_100fdx = (int)val;
+		break;
+
+	case PARAM_ADV_100HDX_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_100_BASEX) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_100hdx = (int)val;
+		break;
+
+	case PARAM_ADV_10FDX_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_10_FD) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_10fdx = (int)val;
+		break;
+
+	case PARAM_ADV_10HDX_CAP:
+		if (val != 0 && (val != 1 ||
+		    (dp->mii_status & MII_STATUS_10) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_10hdx = (int)val;
+		break;
+
+	case PARAM_ADV_1000T_MS:
+		/* three-valued setting: 0, 1 or 2 */
+		if (val != 0 && ((val != 1 && val != 2) ||
+		    (dp->mii_xstatus &
+		    (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASET_FD)) == 0)) {
+			return (EINVAL);
+		}
+		dp->anadv_1000t_ms = (int)val;
+		break;
+
+#ifdef DEBUG_RESUME
+	case PARAM_RESUME_TEST:
+		mutex_exit(&dp->xmitlock);
+		mutex_exit(&dp->intrlock);
+		gem_suspend(dp->dip);
+		gem_resume(dp->dip);
+		mutex_enter(&dp->intrlock);
+		mutex_enter(&dp->xmitlock);
+		break;
+#endif
+	}
+
+	/* bring the forced-mode selection in line with the new settings */
+	usbgem_choose_forcedmode(dp);
+
+	dp->mii_state = MII_STATE_UNKNOWN;
+	if (dp->ugc.usbgc_mii_hw_link_detection) {
+		/* the link watcher may be sleeping; wake it up */
+		cv_signal(&dp->link_watcher_wait_cv);
+	}
+
+	return (0);
+}
+
+/*
+ * usbgem_nd_load: register one ndd parameter.
+ *
+ * Fills slot `item' of the device's usbgem_nd_arg array and hands it to
+ * nd_load() as the callback cookie, together with the get function gf and
+ * the set function sf.  The nd_load() result is ignored.
+ */
+static void
+usbgem_nd_load(struct usbgem_dev *dp,
+    char *name, ndgetf_t gf, ndsetf_t sf, int item)
+{
+	struct usbgem_nd_arg	*slots;
+	struct usbgem_nd_arg	*slot;
+
+	ASSERT(item >= 0);
+	ASSERT(item < PARAM_COUNT);
+
+	slots = (struct usbgem_nd_arg *)(void *)dp->nd_arg_p;
+	slot = &slots[item];
+	slot->dp = dp;
+	slot->item = item;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: name:%s, item:%d",
+	    dp->name, __func__, name, item));
+	(void) nd_load(&dp->nd_data_p, name, gf, sf, (caddr_t)slot);
+}
+
+/*
+ * usbgem_nd_setup: allocate the per-device ndd argument array and register
+ * every parameter with usbgem_nd_load().
+ *
+ * Capability, link-partner and link-state parameters are registered with a
+ * NULL set function.  The adv_* parameters get usbgem_param_set only when
+ * the condition passed to SETFUNC() is non-zero.  The array is released by
+ * usbgem_nd_cleanup().
+ */
+static void
+usbgem_nd_setup(struct usbgem_dev *dp)
+{
+	DPRINTF(1, (CE_CONT, "!%s: %s: called, mii_status:0x%b",
+	    dp->name, __func__, dp->mii_status, MII_STATUS_BITS));
+
+	ASSERT(dp->nd_arg_p == NULL);
+
+	/* one zeroed slot per PARAM_* index; KM_SLEEP is used */
+	dp->nd_arg_p =
+	    kmem_zalloc(sizeof (struct usbgem_nd_arg) * PARAM_COUNT, KM_SLEEP);
+
+/* yields the set callback when x is non-zero, otherwise NULL */
+#define	SETFUNC(x)	((x) ? usbgem_param_set : NULL)
+
+	/* Abilities of the local device */
+	usbgem_nd_load(dp, "autoneg_cap",
+	    usbgem_param_get, NULL, PARAM_AUTONEG_CAP);
+	usbgem_nd_load(dp, "pause_cap",
+	    usbgem_param_get, NULL, PARAM_PAUSE_CAP);
+	usbgem_nd_load(dp, "asym_pause_cap",
+	    usbgem_param_get, NULL, PARAM_ASYM_PAUSE_CAP);
+	usbgem_nd_load(dp, "1000fdx_cap",
+	    usbgem_param_get, NULL, PARAM_1000FDX_CAP);
+	usbgem_nd_load(dp, "1000hdx_cap",
+	    usbgem_param_get, NULL, PARAM_1000HDX_CAP);
+	usbgem_nd_load(dp, "100T4_cap",
+	    usbgem_param_get, NULL, PARAM_100T4_CAP);
+	usbgem_nd_load(dp, "100fdx_cap",
+	    usbgem_param_get, NULL, PARAM_100FDX_CAP);
+	usbgem_nd_load(dp, "100hdx_cap",
+	    usbgem_param_get, NULL, PARAM_100HDX_CAP);
+	usbgem_nd_load(dp, "10fdx_cap",
+	    usbgem_param_get, NULL, PARAM_10FDX_CAP);
+	usbgem_nd_load(dp, "10hdx_cap",
+	    usbgem_param_get, NULL, PARAM_10HDX_CAP);
+
+	/* Our advertised capabilities */
+	usbgem_nd_load(dp, "adv_autoneg_cap", usbgem_param_get,
+	    SETFUNC(dp->mii_status & MII_STATUS_CANAUTONEG),
+	    PARAM_ADV_AUTONEG_CAP);
+	/*
+	 * NOTE(review): the two tests below look at bits 0 and 1 of
+	 * usbgc_flow_control, while usbgem_param_set() compares the same
+	 * field against the FLOW_CONTROL_* constants - confirm they agree.
+	 */
+	usbgem_nd_load(dp, "adv_pause_cap", usbgem_param_get,
+	    SETFUNC(dp->ugc.usbgc_flow_control & 1),
+	    PARAM_ADV_PAUSE_CAP);
+	usbgem_nd_load(dp, "adv_asym_pause_cap", usbgem_param_get,
+	    SETFUNC(dp->ugc.usbgc_flow_control & 2),
+	    PARAM_ADV_ASYM_PAUSE_CAP);
+	usbgem_nd_load(dp, "adv_1000fdx_cap", usbgem_param_get,
+	    SETFUNC(dp->mii_xstatus &
+	    (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD)),
+	    PARAM_ADV_1000FDX_CAP);
+	usbgem_nd_load(dp, "adv_1000hdx_cap", usbgem_param_get,
+	    SETFUNC(dp->mii_xstatus &
+	    (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET)),
+	    PARAM_ADV_1000HDX_CAP);
+	/* the 10/100 items are writable only when mii_advert_ro is zero */
+	usbgem_nd_load(dp, "adv_100T4_cap", usbgem_param_get,
+	    SETFUNC((dp->mii_status & MII_STATUS_100_BASE_T4) &&
+	    !dp->mii_advert_ro),
+	    PARAM_ADV_100T4_CAP);
+	usbgem_nd_load(dp, "adv_100fdx_cap", usbgem_param_get,
+	    SETFUNC((dp->mii_status & MII_STATUS_100_BASEX_FD) &&
+	    !dp->mii_advert_ro),
+	    PARAM_ADV_100FDX_CAP);
+	usbgem_nd_load(dp, "adv_100hdx_cap", usbgem_param_get,
+	    SETFUNC((dp->mii_status & MII_STATUS_100_BASEX) &&
+	    !dp->mii_advert_ro),
+	    PARAM_ADV_100HDX_CAP);
+	usbgem_nd_load(dp, "adv_10fdx_cap", usbgem_param_get,
+	    SETFUNC((dp->mii_status & MII_STATUS_10_FD) &&
+	    !dp->mii_advert_ro),
+	    PARAM_ADV_10FDX_CAP);
+	usbgem_nd_load(dp, "adv_10hdx_cap", usbgem_param_get,
+	    SETFUNC((dp->mii_status & MII_STATUS_10) &&
+	    !dp->mii_advert_ro),
+	    PARAM_ADV_10HDX_CAP);
+	usbgem_nd_load(dp, "adv_1000t_ms", usbgem_param_get,
+	    SETFUNC(dp->mii_xstatus &
+	    (MII_XSTATUS_1000BASET_FD | MII_XSTATUS_1000BASET)),
+	    PARAM_ADV_1000T_MS);
+
+
+	/* Partner's advertised capabilities */
+	usbgem_nd_load(dp, "lp_autoneg_cap",
+	    usbgem_param_get, NULL, PARAM_LP_AUTONEG_CAP);
+	usbgem_nd_load(dp, "lp_pause_cap",
+	    usbgem_param_get, NULL, PARAM_LP_PAUSE_CAP);
+	usbgem_nd_load(dp, "lp_asym_pause_cap",
+	    usbgem_param_get, NULL, PARAM_LP_ASYM_PAUSE_CAP);
+	usbgem_nd_load(dp, "lp_1000fdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_1000FDX_CAP);
+	usbgem_nd_load(dp, "lp_1000hdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_1000HDX_CAP);
+	usbgem_nd_load(dp, "lp_100T4_cap",
+	    usbgem_param_get, NULL, PARAM_LP_100T4_CAP);
+	usbgem_nd_load(dp, "lp_100fdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_100FDX_CAP);
+	usbgem_nd_load(dp, "lp_100hdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_100HDX_CAP);
+	usbgem_nd_load(dp, "lp_10fdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_10FDX_CAP);
+	usbgem_nd_load(dp, "lp_10hdx_cap",
+	    usbgem_param_get, NULL, PARAM_LP_10HDX_CAP);
+
+	/* Current operating modes */
+	usbgem_nd_load(dp, "link_status",
+	    usbgem_param_get, NULL, PARAM_LINK_STATUS);
+	usbgem_nd_load(dp, "link_speed",
+	    usbgem_param_get, NULL, PARAM_LINK_SPEED);
+	usbgem_nd_load(dp, "link_duplex",
+	    usbgem_param_get, NULL, PARAM_LINK_DUPLEX);
+	usbgem_nd_load(dp, "link_autoneg",
+	    usbgem_param_get, NULL, PARAM_LINK_AUTONEG);
+	usbgem_nd_load(dp, "link_rx_pause",
+	    usbgem_param_get, NULL, PARAM_LINK_RX_PAUSE);
+	usbgem_nd_load(dp, "link_tx_pause",
+	    usbgem_param_get, NULL, PARAM_LINK_TX_PAUSE);
+#ifdef DEBUG_RESUME
+	usbgem_nd_load(dp, "resume_test",
+	    usbgem_param_get, usbgem_param_set, PARAM_RESUME_TEST);
+#endif
+#undef	SETFUNC
+}
+
+/*
+ * usbgem_nd_ioctl: service an ND_GET or ND_SET ioctl through nd_getset().
+ *
+ * Returns IOC_INVAL when nd_getset() fails or the command is neither
+ * ND_GET nor ND_SET, IOC_REPLY for a successful get or a set that left
+ * ioc_error non-zero, and IOC_RESTART_REPLY for a clean set.
+ */
+static
+enum ioc_reply
+usbgem_nd_ioctl(struct usbgem_dev *dp,
+    queue_t *wq, mblk_t *mp, struct iocblk *iocp)
+{
+	boolean_t	done;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	if (iocp->ioc_cmd == ND_GET) {
+		done = nd_getset(wq, dp->nd_data_p, mp);
+		DPRINTF(1, (CE_CONT,
+		    "%s: get %s", dp->name, done ? "OK" : "FAIL"));
+		return (done ? IOC_REPLY : IOC_INVAL);
+	}
+
+	if (iocp->ioc_cmd == ND_SET) {
+		done = nd_getset(wq, dp->nd_data_p, mp);
+
+		DPRINTF(1, (CE_CONT, "%s: set %s err %d",
+		    dp->name, done ? "OK" : "FAIL", iocp->ioc_error));
+
+		if (!done) {
+			return (IOC_INVAL);
+		}
+
+		/* a set that reported an error does not ask for a restart */
+		return (iocp->ioc_error ? IOC_REPLY : IOC_RESTART_REPLY);
+	}
+
+	cmn_err(CE_WARN, "%s: invalid cmd 0x%x", dp->name, iocp->ioc_cmd);
+
+	return (IOC_INVAL);
+}
+
+/*
+ * usbgem_nd_cleanup: undo usbgem_nd_setup() - release the ndd parameter
+ * table and then the argument array it refers to.
+ */
+static void
+usbgem_nd_cleanup(struct usbgem_dev *dp)
+{
+	const size_t	arg_size = sizeof (struct usbgem_nd_arg) * PARAM_COUNT;
+
+	ASSERT(dp->nd_data_p != NULL);
+	ASSERT(dp->nd_arg_p != NULL);
+
+	/* the table holds pointers into nd_arg_p, so it is freed first */
+	nd_free(&dp->nd_data_p);
+
+	kmem_free(dp->nd_arg_p, arg_size);
+	dp->nd_arg_p = NULL;
+}
+#endif /* USBGEM_CONFIG_ND */
+
+/*
+ * usbgem_mac_ioctl: common ioctl handler.
+ *
+ * With USBGEM_CONFIG_ND, ND_GET and ND_SET are passed to usbgem_nd_ioctl()
+ * and the reply is chosen from the status it returns; every other command
+ * is NAKed with EINVAL.  Without USBGEM_CONFIG_ND every ioctl is NAKed with
+ * EINVAL.  In all cases mp is consumed by the reply.
+ */
+static void
+usbgem_mac_ioctl(struct usbgem_dev *dp, queue_t *wq, mblk_t *mp)
+{
+	struct iocblk	*iocp;
+	enum ioc_reply	status;
+	int		cmd;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * Pick up the command and clear any stale error code.
+	 */
+	iocp = (void *)mp->b_rptr;
+	iocp->ioc_error = 0;
+	cmd = iocp->ioc_cmd;
+
+	DPRINTF(1, (CE_CONT, "%s: %s cmd:0x%x", dp->name, __func__, cmd));
+
+#ifdef USBGEM_CONFIG_ND
+	switch (cmd) {
+	default:
+		/* reachable: any command other than ND_GET/ND_SET */
+		status = IOC_INVAL;
+		break;
+
+	case ND_GET:
+	case ND_SET:
+		status = usbgem_nd_ioctl(dp, wq, mp, iocp);
+		break;
+	}
+
+	/*
+	 * Finally, decide how to reply
+	 */
+	switch (status) {
+	default:
+	case IOC_INVAL:
+		/*
+		 * Error, reply with a NAK and EINVAL or the specified error
+		 */
+		miocnak(wq, mp, 0, iocp->ioc_error == 0 ?
+		    EINVAL : iocp->ioc_error);
+		break;
+
+	case IOC_DONE:
+		/*
+		 * OK, reply already sent
+		 */
+		break;
+
+	case IOC_RESTART_ACK:
+	case IOC_ACK:
+		/*
+		 * OK, reply with an ACK
+		 */
+		miocack(wq, mp, 0, 0);
+		break;
+
+	case IOC_RESTART_REPLY:
+	case IOC_REPLY:
+		/*
+		 * OK, send prepared reply as ACK or NAK
+		 */
+		mp->b_datap->db_type =
+		    iocp->ioc_error == 0 ? M_IOCACK : M_IOCNAK;
+		qreply(wq, mp);
+		break;
+	}
+#else
+	miocnak(wq, mp, 0, EINVAL);
+#endif /* USBGEM_CONFIG_ND */
+}
+
+/*
+ * Fallback transceiver type codes returned by usbgem_mac_xcvr_inuse(),
+ * defined here only when SYS_MAC_H is not defined.
+ * NOTE(review): presumably these mirror the values in the system mac
+ * header - confirm the guard macro name and the values against it.
+ */
+#ifndef SYS_MAC_H
+#define	XCVR_UNDEFINED	0
+#define	XCVR_NONE	1
+#define	XCVR_10		2
+#define	XCVR_100T4	3
+#define	XCVR_100X	4
+#define	XCVR_100T2	5
+#define	XCVR_1000X	6
+#define	XCVR_1000T	7
+#endif
+/*
+ * usbgem_mac_xcvr_inuse: derive an XCVR_* transceiver code from the cached
+ * MII status registers.
+ *
+ * When the extended-status bit is set only the 1000Mbps abilities in
+ * mii_xstatus are considered; otherwise the 10/100Mbps abilities in
+ * mii_status are checked from fastest to slowest.  XCVR_UNDEFINED is
+ * returned when nothing matches.
+ */
+static int
+usbgem_mac_xcvr_inuse(struct usbgem_dev *dp)
+{
+	if (dp->mii_status & MII_STATUS_XSTATUS) {
+		if (dp->mii_xstatus &
+		    (MII_XSTATUS_1000BASET_FD | MII_XSTATUS_1000BASET)) {
+			return (XCVR_1000T);
+		}
+		if (dp->mii_xstatus &
+		    (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASEX)) {
+			return (XCVR_1000X);
+		}
+		return (XCVR_UNDEFINED);
+	}
+
+	if (dp->mii_status & MII_STATUS_100_BASE_T4) {
+		return (XCVR_100T4);
+	}
+	if (dp->mii_status &
+	    (MII_STATUS_100_BASEX_FD | MII_STATUS_100_BASEX)) {
+		return (XCVR_100X);
+	}
+	if (dp->mii_status &
+	    (MII_STATUS_100_BASE_T2_FD | MII_STATUS_100_BASE_T2)) {
+		return (XCVR_100T2);
+	}
+	if (dp->mii_status & (MII_STATUS_10_FD | MII_STATUS_10)) {
+		return (XCVR_10);
+	}
+
+	return (XCVR_UNDEFINED);
+}
+
+#ifdef USBGEM_CONFIG_GLDv3
+/* ============================================================== */
+/*
+ * GLDv3 interface
+ */
+/* ============================================================== */
+/* forward declarations of the entry points listed in gem_m_callbacks */
+static int	usbgem_m_getstat(void *, uint_t, uint64_t *);
+static int	usbgem_m_start(void *);
+static void	usbgem_m_stop(void *);
+static int	usbgem_m_setpromisc(void *, boolean_t);
+static int	usbgem_m_multicst(void *, boolean_t, const uint8_t *);
+static int	usbgem_m_unicst(void *, const uint8_t *);
+static mblk_t	*usbgem_m_tx(void *, mblk_t *);
+static void	usbgem_m_ioctl(void *, queue_t *, mblk_t *);
+/*
+ * NOTE(review): this guard tests GEM_CONFIG_MAC_PROP, whereas the callback
+ * table below tests USBGEM_CONFIG_MAC_PROP - confirm which macro is meant.
+ */
+#ifdef GEM_CONFIG_MAC_PROP
+static int	usbgem_m_setprop(void *, const char *, mac_prop_id_t,
+    uint_t, const void *);
+#ifdef MAC_VERSION_V1
+static int	usbgem_m_getprop(void *, const char *, mac_prop_id_t,
+    uint_t, void *);
+#else
+static int	usbgem_m_getprop(void *, const char *, mac_prop_id_t,
+    uint_t, uint_t, void *, uint_t *);
+#endif
+#endif
+
+/*
+ * Base set of optional callbacks advertised in gem_m_callbacks.
+ * Both branches currently expand to the same value (MC_IOCTL).
+ */
+#ifdef _SYS_MAC_PROVIDER_H
+#define	GEM_M_CALLBACK_FLAGS	(MC_IOCTL)
+#else
+#define	GEM_M_CALLBACK_FLAGS	(MC_IOCTL)
+#endif
+
+/*
+ * GLDv3 callback vector handed to the mac framework by usbgem_gld3_init().
+ * The initializers are positional, so the conditional entries must line up
+ * with the flags in the first member.
+ *
+ * The property-info entry is emitted only together with the set/get
+ * property entries: MC_PROPINFO is advertised only under
+ * USBGEM_CONFIG_MAC_PROP, and without the four preceding initializers the
+ * entry would fall into an earlier slot of the structure.
+ */
+static mac_callbacks_t gem_m_callbacks = {
+#ifdef USBGEM_CONFIG_MAC_PROP
+#ifdef MAC_VERSION_V1
+	GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP | MC_PROPINFO,
+#else
+	GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP,
+#endif
+#else
+	GEM_M_CALLBACK_FLAGS,
+#endif
+	usbgem_m_getstat,
+	usbgem_m_start,
+	usbgem_m_stop,
+	usbgem_m_setpromisc,
+	usbgem_m_multicst,
+	usbgem_m_unicst,
+	usbgem_m_tx,
+#ifdef _SYS_MAC_PROVIDER_H
+#ifdef MAC_VERSION_V1
+	NULL,
+#endif
+#else
+	NULL,	/* m_resources */
+#endif
+	usbgem_m_ioctl,
+	NULL, /* m_getcapab */
+#ifdef USBGEM_CONFIG_MAC_PROP
+	NULL,	/* unused slot - presumably open; TODO confirm */
+	NULL,	/* unused slot - presumably close; TODO confirm */
+	usbgem_m_setprop,
+	usbgem_m_getprop,
+#ifdef MAC_VERSION_V1
+	usbgem_m_propinfo,
+#endif
+#endif /* USBGEM_CONFIG_MAC_PROP */
+};
+
+/*
+ * usbgem_m_start: GLDv3 start entry point.
+ *
+ * Marks the nic online, then (unless the device is disconnected)
+ * initializes the mac, resets the multicast counters, programs the rx
+ * filter with the factory address and, if the link is already up, sets the
+ * media mode and starts the mac.
+ *
+ * Returns 0 on success or when the device is disconnected, EIO when any
+ * hardware step fails.
+ */
+static int
+usbgem_m_start(void *arg)
+{
+	struct usbgem_dev *dp = arg;
+	int	filter_ret;
+	int	err = 0;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	dp->nic_state = NIC_STATE_ONLINE;
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		/* nothing to program; report success */
+		goto done;
+	}
+	if (usbgem_mac_init(dp) != USB_SUCCESS) {
+		err = EIO;
+		goto done;
+	}
+
+	/* initialize rx filter state */
+	sema_p(&dp->rxfilter_lock);
+	dp->mc_count = 0;
+	dp->mc_count_req = 0;
+
+	bcopy(dp->dev_addr.ether_addr_octet,
+	    dp->cur_addr.ether_addr_octet, ETHERADDRL);
+	dp->rxmode |= RXMODE_ENABLE;
+
+	filter_ret = usbgem_hal_set_rx_filter(dp);
+	sema_v(&dp->rxfilter_lock);
+
+	if (filter_ret != USB_SUCCESS) {
+		err = EIO;
+		goto done;
+	}
+
+	/* set up the media mode and start the mac if the link is already up */
+	if (dp->mii_state == MII_STATE_LINKUP &&
+	    (usbgem_hal_set_media(dp) != USB_SUCCESS ||
+	    usbgem_mac_start(dp) != USB_SUCCESS)) {
+		err = EIO;
+	}
+
+done:
+	rw_exit(&dp->dev_state_lock);
+	return (err);
+}
+
+/*
+ * usbgem_m_stop: GLDv3 stop entry point.
+ *
+ * Runs in two phases: first the receive filter is disabled while holding
+ * dev_state_lock as reader, then the lock is retaken as writer to mark the
+ * nic stopped and stop the mac.  Hardware is not touched when the device
+ * is disconnected.  Failures of the hardware calls are ignored.
+ */
+static void
+usbgem_m_stop(void *arg)
+{
+	struct usbgem_dev	*dp = arg;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* stop rx gracefully */
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	sema_p(&dp->rxfilter_lock);
+	dp->rxmode &= ~RXMODE_ENABLE;
+
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		(void) usbgem_hal_set_rx_filter(dp);
+	}
+	sema_v(&dp->rxfilter_lock);
+	rw_exit(&dp->dev_state_lock);
+
+	/* make the nic state inactive */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	dp->nic_state = NIC_STATE_STOPPED;
+
+	/* stop mac completely */
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		(void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL);
+	}
+	rw_exit(&dp->dev_state_lock);
+}
+
+/*
+ * usbgem_m_multicst: GLDv3 multicast entry point.
+ *
+ * Adds (add != 0) or removes the multicast address ep while holding
+ * dev_state_lock as reader.  Returns 0 on success, EIO otherwise.
+ */
+static int
+usbgem_m_multicst(void *arg, boolean_t add, const uint8_t *ep)
+{
+	struct usbgem_dev	*dp = arg;
+	int	status;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	status = add ?
+	    usbgem_add_multicast(dp, ep) : usbgem_remove_multicast(dp, ep);
+	rw_exit(&dp->dev_state_lock);
+
+	if (status == USB_SUCCESS) {
+		return (0);
+	}
+
+#ifdef GEM_CONFIG_FMA
+	ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+	return (EIO);
+}
+
+/*
+ * usbgem_m_setpromisc: GLDv3 promiscuous-mode entry point.
+ *
+ * Updates RXMODE_PROMISC in dp->rxmode and, unless the device is
+ * disconnected, reprograms the rx filter.  Returns 0 on success, EIO when
+ * the filter update fails.
+ */
+static int
+usbgem_m_setpromisc(void *arg, boolean_t on)
+{
+	struct usbgem_dev	*dp = arg;
+	int	err = 0;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	sema_p(&dp->rxfilter_lock);
+
+	if (!on) {
+		dp->rxmode &= ~RXMODE_PROMISC;
+	} else {
+		dp->rxmode |= RXMODE_PROMISC;
+	}
+
+	/* the software state is kept even when the hardware is gone */
+	if (dp->mac_state != MAC_STATE_DISCONNECTED &&
+	    usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) {
+		err = EIO;
+	}
+
+	sema_v(&dp->rxfilter_lock);
+	rw_exit(&dp->dev_state_lock);
+
+#ifdef GEM_CONFIG_FMA
+	if (err != 0) {
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+	}
+#endif
+	return (err);
+}
+
+/*
+ * usbgem_m_getstat: GLDv3 statistics entry point.
+ *
+ * Refreshes the counters through usbgem_hal_get_stats() and stores the
+ * value of statistic `stat' in *valp.
+ *
+ * Returns 0 on success and ENOTSUP (with *valp set to 0) for an unknown
+ * statistic; with GEM_CONFIG_FMA, EIO when the counters cannot be read.
+ * When the device is disconnected the hardware is not queried and 0 is
+ * reported with *valp set to 0, so the caller never reads an indeterminate
+ * value.
+ */
+static int
+usbgem_m_getstat(void *arg, uint_t stat, uint64_t *valp)
+{
+	int	ret;
+	uint64_t	val;
+	struct usbgem_dev	*dp = arg;
+	struct usbgem_stats	*gstp = &dp->stats;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		rw_exit(&dp->dev_state_lock);
+		*valp = 0;
+		return (0);
+	}
+	ret = usbgem_hal_get_stats(dp);
+	rw_exit(&dp->dev_state_lock);
+
+#ifdef GEM_CONFIG_FMA
+	if (ret != USB_SUCCESS) {
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+		return (EIO);
+	}
+#endif
+
+	switch (stat) {
+	case MAC_STAT_IFSPEED:
+		/* usbgem_speed_value[] is scaled by 10^6 here */
+		val = usbgem_speed_value[dp->speed] *1000000ull;
+		break;
+
+	case MAC_STAT_MULTIRCV:
+		val = gstp->rmcast;
+		break;
+
+	case MAC_STAT_BRDCSTRCV:
+		val = gstp->rbcast;
+		break;
+
+	case MAC_STAT_MULTIXMT:
+		val = gstp->omcast;
+		break;
+
+	case MAC_STAT_BRDCSTXMT:
+		val = gstp->obcast;
+		break;
+
+	case MAC_STAT_NORCVBUF:
+		val = gstp->norcvbuf + gstp->missed;
+		break;
+
+	case MAC_STAT_IERRORS:
+		val = gstp->errrcv;
+		break;
+
+	case MAC_STAT_NOXMTBUF:
+		val = gstp->noxmtbuf;
+		break;
+
+	case MAC_STAT_OERRORS:
+		val = gstp->errxmt;
+		break;
+
+	case MAC_STAT_COLLISIONS:
+		val = gstp->collisions;
+		break;
+
+	case MAC_STAT_RBYTES:
+		val = gstp->rbytes;
+		break;
+
+	case MAC_STAT_IPACKETS:
+		val = gstp->rpackets;
+		break;
+
+	case MAC_STAT_OBYTES:
+		val = gstp->obytes;
+		break;
+
+	case MAC_STAT_OPACKETS:
+		val = gstp->opackets;
+		break;
+
+	case MAC_STAT_UNDERFLOWS:
+		val = gstp->underflow;
+		break;
+
+	case MAC_STAT_OVERFLOWS:
+		val = gstp->overflow;
+		break;
+
+	case ETHER_STAT_ALIGN_ERRORS:
+		val = gstp->frame;
+		break;
+
+	case ETHER_STAT_FCS_ERRORS:
+		val = gstp->crc;
+		break;
+
+	case ETHER_STAT_FIRST_COLLISIONS:
+		val = gstp->first_coll;
+		break;
+
+	case ETHER_STAT_MULTI_COLLISIONS:
+		val = gstp->multi_coll;
+		break;
+
+	case ETHER_STAT_SQE_ERRORS:
+		val = gstp->sqe;
+		break;
+
+	case ETHER_STAT_DEFER_XMTS:
+		val = gstp->defer;
+		break;
+
+	case ETHER_STAT_TX_LATE_COLLISIONS:
+		val = gstp->xmtlatecoll;
+		break;
+
+	case ETHER_STAT_EX_COLLISIONS:
+		val = gstp->excoll;
+		break;
+
+	case ETHER_STAT_MACXMT_ERRORS:
+		val = gstp->xmit_internal_err;
+		break;
+
+	case ETHER_STAT_CARRIER_ERRORS:
+		val = gstp->nocarrier;
+		break;
+
+	case ETHER_STAT_TOOLONG_ERRORS:
+		val = gstp->frame_too_long;
+		break;
+
+	case ETHER_STAT_MACRCV_ERRORS:
+		val = gstp->rcv_internal_err;
+		break;
+
+	case ETHER_STAT_XCVR_ADDR:
+		val = dp->mii_phy_addr;
+		break;
+
+	case ETHER_STAT_XCVR_ID:
+		val = dp->mii_phy_id;
+		break;
+
+	case ETHER_STAT_XCVR_INUSE:
+		val = usbgem_mac_xcvr_inuse(dp);
+		break;
+
+	case ETHER_STAT_CAP_1000FDX:
+		val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD);
+		break;
+
+	case ETHER_STAT_CAP_1000HDX:
+		val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) ||
+		    (dp->mii_xstatus & MII_XSTATUS_1000BASEX);
+		break;
+
+	case ETHER_STAT_CAP_100FDX:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD);
+		break;
+
+	case ETHER_STAT_CAP_100HDX:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX);
+		break;
+
+	case ETHER_STAT_CAP_10FDX:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD);
+		break;
+
+	case ETHER_STAT_CAP_10HDX:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_10);
+		break;
+
+	case ETHER_STAT_CAP_ASMPAUSE:
+		val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC;
+		break;
+
+	case ETHER_STAT_CAP_PAUSE:
+		val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE;
+		break;
+
+	case ETHER_STAT_CAP_AUTONEG:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG);
+		break;
+
+	case ETHER_STAT_ADV_CAP_1000FDX:
+		val = dp->anadv_1000fdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_1000HDX:
+		val = dp->anadv_1000hdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_100FDX:
+		val = dp->anadv_100fdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_100HDX:
+		val = dp->anadv_100hdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_10FDX:
+		val = dp->anadv_10fdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_10HDX:
+		val = dp->anadv_10hdx;
+		break;
+
+	case ETHER_STAT_ADV_CAP_ASMPAUSE:
+		val = dp->anadv_asmpause;
+		break;
+
+	case ETHER_STAT_ADV_CAP_PAUSE:
+		val = dp->anadv_pause;
+		break;
+
+	case ETHER_STAT_ADV_CAP_AUTONEG:
+		val = dp->anadv_autoneg;
+		break;
+
+	case ETHER_STAT_LP_CAP_1000FDX:
+		val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL);
+		break;
+
+	case ETHER_STAT_LP_CAP_1000HDX:
+		val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF);
+		break;
+
+	case ETHER_STAT_LP_CAP_100FDX:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD);
+		break;
+
+	case ETHER_STAT_LP_CAP_100HDX:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX);
+		break;
+
+	case ETHER_STAT_LP_CAP_10FDX:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD);
+		break;
+
+	case ETHER_STAT_LP_CAP_10HDX:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T);
+		break;
+
+	case ETHER_STAT_LP_CAP_ASMPAUSE:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR);
+		break;
+
+	case ETHER_STAT_LP_CAP_PAUSE:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE);
+		break;
+
+	case ETHER_STAT_LP_CAP_AUTONEG:
+		val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN);
+		break;
+
+	case ETHER_STAT_LINK_ASMPAUSE:
+		val = BOOLEAN(dp->flow_control & 2);
+		break;
+
+	case ETHER_STAT_LINK_PAUSE:
+		val = BOOLEAN(dp->flow_control & 1);
+		break;
+
+	case ETHER_STAT_LINK_AUTONEG:
+		val = dp->anadv_autoneg &&
+		    BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN);
+		break;
+
+	case ETHER_STAT_LINK_DUPLEX:
+		/* 0: link down, 1: half duplex, 2: full duplex */
+		val = (dp->mii_state == MII_STATE_LINKUP) ?
+		    (dp->full_duplex ? 2 : 1) : 0;
+		break;
+
+	case ETHER_STAT_TOOSHORT_ERRORS:
+		val = gstp->runt;
+		break;
+#ifdef NEVER	/* it doesn't make sense */
+	case ETHER_STAT_CAP_REMFAULT:
+		val = B_TRUE;
+		break;
+
+	case ETHER_STAT_ADV_REMFAULT:
+		val = dp->anadv_remfault;
+		break;
+#endif
+	case ETHER_STAT_LP_REMFAULT:
+		val = BOOLEAN(dp->mii_lpable & MII_AN_ADVERT_REMFAULT);
+		break;
+
+	case ETHER_STAT_JABBER_ERRORS:
+		val = gstp->jabber;
+		break;
+
+	case ETHER_STAT_CAP_100T4:
+		val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4);
+		break;
+
+	case ETHER_STAT_ADV_CAP_100T4:
+		val = dp->anadv_100t4;
+		break;
+
+	case ETHER_STAT_LP_CAP_100T4:
+		val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4);
+		break;
+
+	default:
+#if GEM_DEBUG_LEVEL > 2
+		cmn_err(CE_WARN,
+		    "%s: unrecognized parameter value = %d",
+		    __func__, stat);
+#endif
+		*valp = 0;
+		return (ENOTSUP);
+	}
+
+	*valp = val;
+
+	return (0);
+}
+
+/*
+ * usbgem_m_unicst: GLDv3 unicast-address entry point.
+ *
+ * Records mac as the current station address, enables reception in
+ * dp->rxmode and, unless the device is disconnected, reprograms the rx
+ * filter.  Returns 0 on success, EIO when the filter update fails.
+ */
+static int
+usbgem_m_unicst(void *arg, const uint8_t *mac)
+{
+	struct usbgem_dev	*dp = arg;
+	int	err = 0;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	sema_p(&dp->rxfilter_lock);
+
+	bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL);
+	dp->rxmode |= RXMODE_ENABLE;
+
+	/* the new address is kept even when the hardware is gone */
+	if (dp->mac_state != MAC_STATE_DISCONNECTED &&
+	    usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) {
+		err = EIO;
+	}
+
+	sema_v(&dp->rxfilter_lock);
+	rw_exit(&dp->dev_state_lock);
+
+#ifdef GEM_CONFIG_FMA
+	if (err != 0) {
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+	}
+#endif
+	return (err);
+}
+
+/*
+ * usbgem_m_tx is used only for sending data packets onto the ethernet wire.
+ *
+ * mp_head is a chain of packets linked through b_next.  While the link is
+ * down or the mac is not online the whole chain is freed and NULL is
+ * returned.  Otherwise up to dp->tx_max_packets packets are handed to
+ * usbgem_send_common() one by one; the first packet it refuses is
+ * re-linked to the rest of the chain, and the unsent remainder (or NULL)
+ * is returned to the caller.
+ */
+static mblk_t *
+usbgem_m_tx(void *arg, mblk_t *mp_head)
+{
+	int	limit;
+	mblk_t	*mp;
+	mblk_t	*nmp;
+	struct usbgem_dev	*dp = arg;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	mp = mp_head;
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+
+	if (dp->mii_state != MII_STATE_LINKUP ||
+	    dp->mac_state != MAC_STATE_ONLINE) {
+		/* some nics hate to send packets while the link is down */
+		for (; mp; mp = nmp) {
+			nmp = mp->b_next;
+			mp->b_next = NULL;
+			freemsg(mp);
+		}
+		goto x;
+	}
+
+	ASSERT(dp->nic_state == NIC_STATE_ONLINE);
+
+	limit = dp->tx_max_packets;
+	for (; limit-- && mp; mp = nmp) {
+		nmp = mp->b_next;
+		mp->b_next = NULL;
+		/*
+		 * The last argument is 1 only for the final packet of this
+		 * batch when more packets remain queued behind it.
+		 */
+		if (usbgem_send_common(dp, mp,
+		    (limit == 0 && nmp) ? 1 : 0)) {
+			/* not accepted: put it back in front of the rest */
+			mp->b_next = nmp;
+			break;
+		}
+	}
+#ifdef CONFIG_TX_LIMITER
+	if (mp == mp_head) {
+		/* no packets were sent, decrease allocation limit */
+		mutex_enter(&dp->txlock);
+		dp->tx_max_packets = max(dp->tx_max_packets - 1, 1);
+		mutex_exit(&dp->txlock);
+	}
+#endif
+x:
+	rw_exit(&dp->dev_state_lock);
+
+	return (mp);
+}
+
+/*
+ * usbgem_m_ioctl: GLDv3 ioctl entry point; runs usbgem_mac_ioctl() while
+ * holding dev_state_lock as reader.
+ */
+static void
+usbgem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
+{
+	struct usbgem_dev	*dp = (struct usbgem_dev *)arg;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called",
+	    dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	usbgem_mac_ioctl(dp, wq, mp);
+	rw_exit(&dp->dev_state_lock);
+}
+
+/*
+ * usbgem_gld3_init: fill in the mac_register_t for this device.
+ *
+ * The maximum SDU is the device mtu, and a margin of VTAG_SIZE is
+ * requested only when the USBGEM_VLAN flag is set; m_margin is otherwise
+ * left as the caller provided it.
+ */
+static void
+usbgem_gld3_init(struct usbgem_dev *dp, mac_register_t *macp)
+{
+	/* identity of the driver instance */
+	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	macp->m_driver = dp;
+	macp->m_dip = dp->dip;
+	macp->m_callbacks = &gem_m_callbacks;
+
+	/* addressing and frame size limits */
+	macp->m_src_addr = dp->dev_addr.ether_addr_octet;
+	macp->m_min_sdu = 0;
+	macp->m_max_sdu = dp->mtu;
+
+	if ((dp->misc_flag & USBGEM_VLAN) != 0) {
+		macp->m_margin = VTAG_SIZE;
+	}
+}
+#else
+/* ============================================================== */
+/*
+ * GLDv2 interface
+ */
+/* ============================================================== */
+/* forward declarations of the GLDv2 entry points defined below */
+static int usbgem_gld_reset(gld_mac_info_t *);
+static int usbgem_gld_start(gld_mac_info_t *);
+static int usbgem_gld_stop(gld_mac_info_t *);
+static int usbgem_gld_set_mac_address(gld_mac_info_t *, uint8_t *);
+static int usbgem_gld_set_multicast(gld_mac_info_t *, uint8_t *, int);
+static int usbgem_gld_set_promiscuous(gld_mac_info_t *, int);
+static int usbgem_gld_get_stats(gld_mac_info_t *, struct gld_stats *);
+static int usbgem_gld_send(gld_mac_info_t *, mblk_t *);
+static int usbgem_gld_send_tagged(gld_mac_info_t *, mblk_t *, uint32_t);
+
+/*
+ * usbgem_gld_reset: GLDv2 reset entry point.
+ *
+ * Re-initializes the mac under the writer lock.  On success the nic is
+ * marked initialized and, if the link is up and the device is still
+ * connected, the media mode is programmed (its result is ignored).
+ *
+ * Returns GLD_SUCCESS, or GLD_FAILURE when usbgem_mac_init() fails.
+ */
+static int
+usbgem_gld_reset(gld_mac_info_t *macinfo)
+{
+	struct usbgem_dev	*dp = (struct usbgem_dev *)macinfo->gldm_private;
+	int	rc = GLD_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	if (usbgem_mac_init(dp) == USB_SUCCESS) {
+		dp->nic_state = NIC_STATE_INITIALIZED;
+
+		/* set up the media mode if the link is already up */
+		if (dp->mii_state == MII_STATE_LINKUP &&
+		    dp->mac_state != MAC_STATE_DISCONNECTED) {
+			(void) usbgem_hal_set_media(dp);
+		}
+	} else {
+		rc = GLD_FAILURE;
+	}
+	rw_exit(&dp->dev_state_lock);
+
+	return (rc);
+}
+
+/*
+ * usbgem_gld_start: GLDv2 start entry point.
+ *
+ * Marks the nic online and, if the link is already up, starts the mac.
+ * Returns GLD_SUCCESS, or GLD_FAILURE when usbgem_mac_start() fails.
+ */
+static int
+usbgem_gld_start(gld_mac_info_t *macinfo)
+{
+	struct usbgem_dev *dp = (struct usbgem_dev *)macinfo->gldm_private;
+	int	rc = GLD_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	dp->nic_state = NIC_STATE_ONLINE;
+
+	/*
+	 * XXX - don't call gld_linkstate() here,
+	 * otherwise it causes a recursive mutex call.
+	 */
+	if (dp->mii_state == MII_STATE_LINKUP &&
+	    usbgem_mac_start(dp) != USB_SUCCESS) {
+		rc = GLD_FAILURE;
+	}
+
+	rw_exit(&dp->dev_state_lock);
+
+	return (rc);
+}
+
+/*
+ * usbgem_gld_stop: GLDv2 stop entry point.
+ *
+ * First disables reception under the reader lock, then retakes the lock as
+ * writer to mark the nic stopped and stop the mac.  Hardware is left alone
+ * when the device is disconnected.
+ *
+ * Returns GLD_SUCCESS, or GLD_FAILURE when usbgem_mac_stop() fails.
+ */
+static int
+usbgem_gld_stop(gld_mac_info_t *macinfo)
+{
+	struct usbgem_dev	*dp = (struct usbgem_dev *)macinfo->gldm_private;
+	int	rc = GLD_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* phase 1: try to stop rx gracefully */
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	sema_p(&dp->rxfilter_lock);
+	dp->rxmode &= ~RXMODE_ENABLE;
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		(void) usbgem_hal_set_rx_filter(dp);
+	}
+	sema_v(&dp->rxfilter_lock);
+	rw_exit(&dp->dev_state_lock);
+
+	/* phase 2: make the nic state inactive and stop the mac */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	dp->nic_state = NIC_STATE_STOPPED;
+	if (dp->mac_state != MAC_STATE_DISCONNECTED &&
+	    usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL) !=
+	    USB_SUCCESS) {
+		rc = GLD_FAILURE;
+	}
+	rw_exit(&dp->dev_state_lock);
+
+	return (rc);
+}
+
+/*
+ * GLD multicast entry point (installed as gldm_set_multicast).
+ *
+ * Adds the multicast address `ep' when flag is GLD_MULTI_ENABLE and removes
+ * it for any other flag value, with the device state lock held as reader.
+ *
+ * Returns GLD_SUCCESS, or GLD_FAILURE when the add/remove helper fails.
+ */
+static int
+usbgem_gld_set_multicast(gld_mac_info_t *macinfo, uint8_t *ep, int flag)
+{
+	struct usbgem_dev	*dp;
+	int			ret;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	ret = (flag == GLD_MULTI_ENABLE) ?
+	    usbgem_add_multicast(dp, ep) : usbgem_remove_multicast(dp, ep);
+	rw_exit(&dp->dev_state_lock);
+
+	if (ret == USB_SUCCESS) {
+		return (GLD_SUCCESS);
+	}
+
+#ifdef GEM_CONFIG_FMA
+	ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+	return (GLD_FAILURE);
+}
+
+/*
+ * GLD promiscuous-mode entry point (installed as gldm_set_promiscuous).
+ *
+ * Translates the GLD promiscuous level into rxmode bits while holding the
+ * rx filter semaphore:
+ *   GLD_MAC_PROMISC_NONE  - clear RXMODE_PROMISC and RXMODE_ALLMULTI_REQ
+ *   GLD_MAC_PROMISC_MULTI - set RXMODE_ALLMULTI_REQ
+ *   GLD_MAC_PROMISC_PHYS  - set RXMODE_PROMISC
+ * Any other value leaves rxmode untouched and does not touch the hardware.
+ * The rx filter is reprogrammed only when the MAC is not in the
+ * disconnected state.  Always returns GLD_SUCCESS.
+ */
+static int
+usbgem_gld_set_promiscuous(gld_mac_info_t *macinfo, int flag)
+{
+	struct usbgem_dev	*dp;
+	boolean_t		changed = B_TRUE;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+
+	switch (flag) {
+	case GLD_MAC_PROMISC_NONE:
+		dp->rxmode &= ~(RXMODE_PROMISC | RXMODE_ALLMULTI_REQ);
+		break;
+
+	case GLD_MAC_PROMISC_MULTI:
+		dp->rxmode |= RXMODE_ALLMULTI_REQ;
+		break;
+
+	case GLD_MAC_PROMISC_PHYS:
+		dp->rxmode |= RXMODE_PROMISC;
+		break;
+
+	default:
+		/* unknown level: mode unchanged */
+		changed = B_FALSE;
+		break;
+	}
+
+	if (changed && dp->mac_state != MAC_STATE_DISCONNECTED) {
+		(void) usbgem_hal_set_rx_filter(dp);
+	}
+
+	sema_v(&dp->rxfilter_lock);
+
+	return (GLD_SUCCESS);
+}
+
+/*
+ * GLD set-MAC-address entry point (installed as gldm_set_mac_addr).
+ *
+ * Under the rx filter semaphore, records `mac' as the current station
+ * address, sets RXMODE_ENABLE, and reprograms the rx filter unless the MAC
+ * is in the disconnected state.  Always returns GLD_SUCCESS.
+ */
+static int
+usbgem_gld_set_mac_address(gld_mac_info_t *macinfo, uint8_t *mac)
+{
+	struct usbgem_dev	*dp;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+
+	dp->rxmode |= RXMODE_ENABLE;
+	bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL);
+
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		(void) usbgem_hal_set_rx_filter(dp);
+	}
+
+	sema_v(&dp->rxfilter_lock);
+
+	return (GLD_SUCCESS);
+}
+
+/*
+ * GLD statistics entry point (installed as gldm_get_stats).
+ *
+ * Invokes the chip-specific usbgc_get_stats hook and, if that succeeds,
+ * copies the counters kept in dp->stats plus the current speed and duplex
+ * into the caller-supplied gld_stats structure.
+ *
+ * Returns GLD_SUCCESS, or GLD_FAILURE when the hook fails.  This is a GLD
+ * callback, so it reports failure with the GLD status code like the other
+ * usbgem_gld_*() routines rather than with a USB framework code.
+ */
+static	int
+usbgem_gld_get_stats(gld_mac_info_t *macinfo, struct gld_stats *gs)
+{
+	struct usbgem_dev	*dp;
+	struct usbgem_stats	*vs;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	if ((*dp->ugc.usbgc_get_stats)(dp) != USB_SUCCESS) {
+#ifdef GEM_CONFIG_FMA
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+		return (GLD_FAILURE);
+	}
+
+	vs = &dp->stats;
+
+	gs->glds_errxmt = vs->errxmt;
+	gs->glds_errrcv = vs->errrcv;
+	gs->glds_collisions = vs->collisions;
+
+	gs->glds_excoll = vs->excoll;
+	gs->glds_defer = vs->defer;
+	gs->glds_frame = vs->frame;
+	gs->glds_crc = vs->crc;
+
+	gs->glds_overflow = vs->overflow; /* fifo err,underrun,rbufovf */
+	gs->glds_underflow = vs->underflow;
+	gs->glds_short = vs->runt;
+	gs->glds_missed = vs->missed; /* missed pkts while rbuf ovf */
+	gs->glds_xmtlatecoll = vs->xmtlatecoll;
+	gs->glds_nocarrier = vs->nocarrier;
+	gs->glds_norcvbuf = vs->norcvbuf;	/* OS resource exhaustion */
+	gs->glds_intr = vs->intr;
+
+	/* all before here must be kept in place for v0 compatibility */
+	gs->glds_speed = usbgem_speed_value[dp->speed] * 1000000;
+	gs->glds_media = GLDM_PHYMII;
+	gs->glds_duplex = dp->full_duplex ? GLD_DUPLEX_FULL : GLD_DUPLEX_HALF;
+
+	/* gs->glds_media_specific */
+	gs->glds_dot3_first_coll = vs->first_coll;
+	gs->glds_dot3_multi_coll = vs->multi_coll;
+	gs->glds_dot3_sqe_error = 0;
+	gs->glds_dot3_mac_xmt_error = 0;
+	gs->glds_dot3_mac_rcv_error = 0;
+	gs->glds_dot3_frame_too_long = vs->frame_too_long;
+
+	return (GLD_SUCCESS);
+}
+
+/*
+ * GLD ioctl entry point (installed as gldm_ioctl).
+ *
+ * Hands the message straight to usbgem_mac_ioctl() and always reports
+ * GLD_SUCCESS to GLD.
+ */
+static int
+usbgem_gld_ioctl(gld_mac_info_t *macinfo, queue_t *wq, mblk_t *mp)
+{
+	struct usbgem_dev	*dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	usbgem_mac_ioctl(dp, wq, mp);
+
+	return (GLD_SUCCESS);
+}
+
+/*
+ * usbgem_gld_send is used only for sending data packets onto the ethernet
+ * wire (installed as gldm_send).
+ *
+ * Runs with the device state lock held as reader.  While the link is down
+ * the packet is freed and GLD_SUCCESS is returned, i.e. it is dropped
+ * silently.  Otherwise the packet is passed to usbgem_send_common(); a
+ * NULL result maps to GLD_SUCCESS and anything else to GLD_NORESOURCES.
+ */
+static int
+usbgem_gld_send(gld_mac_info_t *macinfo, mblk_t *mp)
+{
+	struct usbgem_dev	*dp;
+	int			rv;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	/* nic state must be online or suspended */
+	rw_enter(&dp->dev_state_lock, RW_READER);
+
+	ASSERT(dp->nic_state == NIC_STATE_ONLINE);
+	ASSERT(mp->b_next == NULL);
+
+	if (dp->mii_state == MII_STATE_LINKUP) {
+		rv = (usbgem_send_common(dp, mp, 0) == NULL) ?
+		    GLD_SUCCESS : GLD_NORESOURCES;
+		rw_exit(&dp->dev_state_lock);
+
+		return (rv);
+	}
+
+	/*
+	 * Some nics hate to send packets while the link is down, so the
+	 * untransmitted packet is discarded silently.
+	 */
+	rw_exit(&dp->dev_state_lock);
+
+	freemsg(mp);
+#ifdef GEM_CONFIG_FMA
+	/* FIXME - should we ignore the error? */
+	ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+	return (GLD_SUCCESS);
+}
+
+/*
+ * usbgem_gld_send_tagged is the VLAN-tagged counterpart of
+ * usbgem_gld_send.
+ *
+ * While the link is down the packet is freed and GLD_SUCCESS is returned.
+ * Otherwise the packet goes to usbgem_send_common() with flags of 0; the
+ * tag in `vtag' is only turned into send flags when `notyet' is defined,
+ * and even then the computed flags are not passed on.
+ *
+ * NOTE(review): unlike usbgem_gld_send() this routine does not take
+ * dp->dev_state_lock -- confirm whether that is intentional.
+ */
+static int
+usbgem_gld_send_tagged(gld_mac_info_t *macinfo, mblk_t *mp, uint32_t vtag)
+{
+	struct usbgem_dev	*dp;
+#ifdef notyet
+	uint32_t		flags;
+#endif
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	if (dp->mii_state == MII_STATE_LINKUP) {
+#ifdef notyet
+		flags = GLD_VTAG_TCI(vtag) << GEM_SEND_VTAG_SHIFT;
+#endif
+		if (usbgem_send_common(dp, mp, 0) == NULL) {
+			return (GLD_SUCCESS);
+		}
+		return (GLD_NORESOURCES);
+	}
+
+	/*
+	 * Some nics hate to send packets while the link is down;
+	 * discard the untransmitted packet silently.
+	 */
+	freemsg(mp);
+#ifdef GEM_CONFIG_FMA
+	/* FIXME - should we ignore the error? */
+	ddi_fm_service_impact(dp->dip, DDI_SERVICE_UNAFFECTED);
+#endif
+	return (GLD_SUCCESS);
+}
+
+/*
+ * Fill in the GLD mac info structure for this instance: back pointers,
+ * the driver's GLD entry points, and the static link-layer parameters
+ * (ethernet type, address length, max packet size taken from dp->mtu,
+ * instance number as ppa, factory and broadcast addresses).
+ */
+static void
+usbgem_gld_init(struct usbgem_dev *dp, gld_mac_info_t *macinfo, char *ident)
+{
+	/* identity and back pointers */
+	macinfo->gldm_private = (caddr_t)dp;
+	macinfo->gldm_devinfo = dp->dip;
+	macinfo->gldm_ident = ident;
+	macinfo->gldm_ppa = ddi_get_instance(dp->dip);
+
+	/* entry points called by GLD */
+	macinfo->gldm_reset = usbgem_gld_reset;
+	macinfo->gldm_start = usbgem_gld_start;
+	macinfo->gldm_stop = usbgem_gld_stop;
+	macinfo->gldm_send = usbgem_gld_send;
+	macinfo->gldm_ioctl = usbgem_gld_ioctl;
+	macinfo->gldm_get_stats = usbgem_gld_get_stats;
+	macinfo->gldm_set_mac_addr = usbgem_gld_set_mac_address;
+	macinfo->gldm_set_multicast = usbgem_gld_set_multicast;
+	macinfo->gldm_set_promiscuous = usbgem_gld_set_promiscuous;
+	macinfo->gldm_intr = NULL;
+	macinfo->gldm_mctl = NULL;
+
+	/* link-layer parameters */
+	macinfo->gldm_type = DL_ETHER;
+	macinfo->gldm_minpkt = 0;
+	macinfo->gldm_maxpkt = dp->mtu;
+	macinfo->gldm_addrlen = ETHERADDRL;
+	macinfo->gldm_saplen = -2;
+#ifdef GLD_CAP_LINKSTATE
+	macinfo->gldm_capabilities = GLD_CAP_LINKSTATE;
+#endif
+	macinfo->gldm_vendor_addr = dp->dev_addr.ether_addr_octet;
+	macinfo->gldm_broadcast_addr = usbgem_bcastaddr;
+}
+#endif /* USBGEM_CONFIG_GLDv3 */
+
+
+/* ======================================================================== */
+/*
+ * .conf interface
+ */
+/* ======================================================================== */
+/*
+ * Build a temporary, locally administered ethernet address in mac[0..5].
+ *
+ * The address is derived from the numeric value of hw_serial, XORed (in
+ * bits 32 and up) with a key made from the bytes of the driver name and
+ * the instance number, so the same instance on the same host gets the
+ * same address every time (useful for DHCP).  The first octet is always
+ * 0x02.  A note is logged because the address is not a real factory one.
+ */
+void
+usbgem_generate_macaddr(struct usbgem_dev *dp, uint8_t *mac)
+{
+	extern char	hw_serial[];
+	char		*serial = &hw_serial[0];
+	uint64_t	seed;
+	uint64_t	namekey = 0;
+	int		n;
+
+	cmn_err(CE_NOTE,
+	    "!%s: using temp ether address,"
+	    " do not use this for long time",
+	    dp->name);
+
+	/* prefer a fixed address for DHCP */
+	seed = stoi(&serial);
+
+	/* fold the NUL-terminated driver name into a small key */
+	for (n = 0; n < USBGEM_NAME_LEN && dp->name[n] != 0; n++) {
+		namekey ^= dp->name[n];
+	}
+	namekey ^= ddi_get_instance(dp->dip);
+	seed ^= namekey << 32;
+
+	/* generate a local address, most significant byte first */
+	mac[0] = 0x02;
+	mac[1] = (uint8_t)(seed >> 32);
+	mac[2] = (uint8_t)(seed >> 24);
+	mac[3] = (uint8_t)(seed >> 16);
+	mac[4] = (uint8_t)(seed >> 8);
+	mac[5] = (uint8_t)seed;
+}
+
+/*
+ * Convert one ASCII hexadecimal digit to its numeric value.
+ * Returns -1 when `c' is not a hexadecimal digit.
+ */
+static int
+usbgem_hex_digit(int c)
+{
+	if (c >= 'a' && c <= 'f') {
+		return (c - 'a' + 10);
+	}
+	if (c >= 'A' && c <= 'F') {
+		return (c - 'A' + 10);
+	}
+	if (c >= '0' && c <= '9') {
+		return (c - '0');
+	}
+	return (-1);
+}
+
+/*
+ * Read the station address from the "mac-addr" property in the .conf file.
+ *
+ * The value must have exactly the form "xx:xx:xx:xx:xx:xx" (six pairs of
+ * hex digits separated by colons).  An all-zero address asks for a
+ * generated temporary address instead.  On success the address is stored
+ * in dp->dev_addr and B_TRUE is returned; B_FALSE is returned when the
+ * property is absent or malformed (the latter is also logged).
+ */
+boolean_t
+usbgem_get_mac_addr_conf(struct usbgem_dev *dp)
+{
+	char		*valstr;
+	char		*cp;
+	uint8_t		mac[ETHERADDRL];
+	uint8_t		anybits = 0;
+	int		hi;
+	int		lo;
+	int		octet;
+
+	DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * Get ethernet address from .conf file
+	 */
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dp->dip,
+	    DDI_PROP_DONTPASS, "mac-addr", &valstr) != DDI_PROP_SUCCESS) {
+		return (B_FALSE);
+	}
+
+	/* the length check keeps the parser below within the string */
+	if (strlen(valstr) != ETHERADDRL*3-1) {
+		goto syntax_err;
+	}
+
+	cp = valstr;
+	for (octet = 0; octet < ETHERADDRL; octet++) {
+		/* every octet but the first is preceded by a colon */
+		if (octet > 0 && *cp++ != ':') {
+			goto syntax_err;
+		}
+
+		if ((hi = usbgem_hex_digit(*cp++)) < 0) {
+			goto syntax_err;
+		}
+		if ((lo = usbgem_hex_digit(*cp++)) < 0) {
+			goto syntax_err;
+		}
+
+		mac[octet] = (uint8_t)((hi << 4) | lo);
+		anybits |= mac[octet];
+	}
+
+	/* 00:00:00:00:00:00 means "make one up" */
+	if (anybits == 0) {
+		usbgem_generate_macaddr(dp, mac);
+	}
+	bcopy(mac, dp->dev_addr.ether_addr_octet, ETHERADDRL);
+
+	ddi_prop_free(valstr);
+	return (B_TRUE);
+
+syntax_err:
+	cmn_err(CE_CONT,
+	    "!%s: read mac addr: trying .conf: syntax err %s",
+	    dp->name, valstr);
+	ddi_prop_free(valstr);
+
+	return (B_FALSE);
+}
+
+/*
+ * Load tunables for this instance from driver properties (.conf file).
+ *
+ * Reads, in order:
+ *  - the advertised link capabilities (adv_*_cap, adv_1000t_ms);
+ *  - an optional forced "full-duplex" setting, which disables
+ *    autonegotiation and clears the advertisements of the other duplex;
+ *  - an optional forced "speed" (10/100/1000), which disables
+ *    autonegotiation and clears the advertisements of the other speeds;
+ *  - the pause advertisements (adv_pause, adv_asmpause);
+ *  - mtu, tx/rx thresholds and max DMA sizes, each defaulting to the value
+ *    already stored in *dp;
+ *  - polling parameters, when GEM_CONFIG_POLLING is defined.
+ *
+ * The order matters: the forced duplex/speed settings deliberately
+ * override the capability flags read before them.
+ */
+static void
+usbgem_read_conf(struct usbgem_dev *dp)
+{
+	int	val;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * Get media mode information from .conf file
+	 */
+	dp->anadv_autoneg = usbgem_prop_get_int(dp, "adv_autoneg_cap", 1) != 0;
+	dp->anadv_1000fdx = usbgem_prop_get_int(dp, "adv_1000fdx_cap", 1) != 0;
+	dp->anadv_1000hdx = usbgem_prop_get_int(dp, "adv_1000hdx_cap", 1) != 0;
+	dp->anadv_100t4 = usbgem_prop_get_int(dp, "adv_100T4_cap", 1) != 0;
+	dp->anadv_100fdx = usbgem_prop_get_int(dp, "adv_100fdx_cap", 1) != 0;
+	dp->anadv_100hdx = usbgem_prop_get_int(dp, "adv_100hdx_cap", 1) != 0;
+	dp->anadv_10fdx = usbgem_prop_get_int(dp, "adv_10fdx_cap", 1) != 0;
+	dp->anadv_10hdx = usbgem_prop_get_int(dp, "adv_10hdx_cap", 1) != 0;
+	dp->anadv_1000t_ms = usbgem_prop_get_int(dp, "adv_1000t_ms", 0);
+
+	/*
+	 * A "full-duplex" property forces the duplex mode: autonegotiation
+	 * is turned off and only the modes of the chosen duplex stay
+	 * advertised.
+	 */
+	if ((ddi_prop_exists(DDI_DEV_T_ANY, dp->dip,
+	    DDI_PROP_DONTPASS, "full-duplex"))) {
+		dp->full_duplex =
+		    usbgem_prop_get_int(dp, "full-duplex", 1) != 0;
+		dp->anadv_autoneg = B_FALSE;
+		if (dp->full_duplex) {
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+		} else {
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+		}
+	}
+
+	/*
+	 * A positive "speed" property forces the link speed in the same
+	 * way; an unsupported value is reported and autonegotiation is
+	 * switched back on.
+	 */
+	if ((val = usbgem_prop_get_int(dp, "speed", 0)) > 0) {
+		dp->anadv_autoneg = B_FALSE;
+		switch (val) {
+		case 1000:
+			dp->speed = USBGEM_SPD_1000;
+			dp->anadv_100t4 = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+			break;
+		case 100:
+			dp->speed = USBGEM_SPD_100;
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+			break;
+		case 10:
+			dp->speed = USBGEM_SPD_10;
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_100t4 = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			break;
+		default:
+			cmn_err(CE_WARN,
+			    "!%s: property %s: illegal value:%d",
+			    dp->name, "speed", val);
+			dp->anadv_autoneg = B_TRUE;
+			break;
+		}
+	}
+	/*
+	 * Flow control: bit 0 of val comes from "adv_pause" and bit 1 from
+	 * "adv_asmpause", each defaulting to the matching bit of the chip's
+	 * usbgc_flow_control.  A combined value outside
+	 * FLOW_CONTROL_NONE..FLOW_CONTROL_RX_PAUSE is reported (but still
+	 * used below); an in-range value is capped at usbgc_flow_control.
+	 */
+	val = usbgem_prop_get_int(dp,
+	    "adv_pause", dp->ugc.usbgc_flow_control & 1);
+	val |= usbgem_prop_get_int(dp,
+	    "adv_asmpause", BOOLEAN(dp->ugc.usbgc_flow_control & 2)) << 1;
+	if (val > FLOW_CONTROL_RX_PAUSE || val < FLOW_CONTROL_NONE) {
+		cmn_err(CE_WARN,
+		    "!%s: property %s: illegal value:%d",
+		    dp->name, "flow-control", val);
+	} else {
+		val = min(val, dp->ugc.usbgc_flow_control);
+	}
+	dp->anadv_pause = BOOLEAN(val & 1);
+	dp->anadv_asmpause = BOOLEAN(val & 2);
+
+	/* remaining tunables keep their current value unless overridden */
+	dp->mtu = usbgem_prop_get_int(dp, "mtu", dp->mtu);
+	dp->txthr = usbgem_prop_get_int(dp, "txthr", dp->txthr);
+	dp->rxthr = usbgem_prop_get_int(dp, "rxthr", dp->rxthr);
+	dp->txmaxdma = usbgem_prop_get_int(dp, "txmaxdma", dp->txmaxdma);
+	dp->rxmaxdma = usbgem_prop_get_int(dp, "rxmaxdma", dp->rxmaxdma);
+#ifdef GEM_CONFIG_POLLING
+	/* per-speed polling intervals, only built with GEM_CONFIG_POLLING */
+	dp->poll_pkt_delay =
+	    usbgem_prop_get_int(dp, "pkt_delay", dp->poll_pkt_delay);
+
+	dp->max_poll_interval[GEM_SPD_10] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_10",
+	    dp->max_poll_interval[GEM_SPD_10]);
+	dp->max_poll_interval[GEM_SPD_100] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_100",
+	    dp->max_poll_interval[GEM_SPD_100]);
+	dp->max_poll_interval[GEM_SPD_1000] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_1000",
+	    dp->max_poll_interval[GEM_SPD_1000]);
+
+	dp->min_poll_interval[GEM_SPD_10] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_10",
+	    dp->min_poll_interval[GEM_SPD_10]);
+	dp->min_poll_interval[GEM_SPD_100] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_100",
+	    dp->min_poll_interval[GEM_SPD_100]);
+	dp->min_poll_interval[GEM_SPD_1000] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_1000",
+	    dp->min_poll_interval[GEM_SPD_1000]);
+#endif
+}
+
+/*
+ * usbgem kstat support
+ */
+#ifndef GEM_CONFIG_GLDv3
+/* kstat items based from dmfe driver */
+
+/*
+ * Layout of the named "mii" kstat exported per instance.  The structure
+ * consists solely of kstat_named members, so its size divided by the size
+ * of one member gives the number of statistics (usbgem_kstat_init() relies
+ * on this).  Members are grouped as: transceiver/link state, local
+ * capabilities (cap_*), advertised capabilities (adv_cap_*) and link
+ * partner capabilities (lp_cap_*).
+ */
+struct usbgem_kstat_named {
+	struct kstat_named	ks_xcvr_addr;
+	struct kstat_named	ks_xcvr_id;
+	struct kstat_named	ks_xcvr_inuse;
+	struct kstat_named	ks_link_up;
+	struct kstat_named	ks_link_duplex;	/* 0:unknown, 1:half, 2:full */
+	struct kstat_named	ks_cap_1000fdx;
+	struct kstat_named	ks_cap_1000hdx;
+	struct kstat_named	ks_cap_100fdx;
+	struct kstat_named	ks_cap_100hdx;
+	struct kstat_named	ks_cap_10fdx;
+	struct kstat_named	ks_cap_10hdx;
+#ifdef NEVER
+	struct kstat_named	ks_cap_remfault;
+#endif
+	struct kstat_named	ks_cap_autoneg;
+
+	struct kstat_named	ks_adv_cap_1000fdx;
+	struct kstat_named	ks_adv_cap_1000hdx;
+	struct kstat_named	ks_adv_cap_100fdx;
+	struct kstat_named	ks_adv_cap_100hdx;
+	struct kstat_named	ks_adv_cap_10fdx;
+	struct kstat_named	ks_adv_cap_10hdx;
+#ifdef NEVER
+	struct kstat_named	ks_adv_cap_remfault;
+#endif
+	struct kstat_named	ks_adv_cap_autoneg;
+	struct kstat_named	ks_lp_cap_1000fdx;
+	struct kstat_named	ks_lp_cap_1000hdx;
+	struct kstat_named	ks_lp_cap_100fdx;
+	struct kstat_named	ks_lp_cap_100hdx;
+	struct kstat_named	ks_lp_cap_10fdx;
+	struct kstat_named	ks_lp_cap_10hdx;
+	struct kstat_named	ks_lp_cap_remfault;
+	struct kstat_named	ks_lp_cap_autoneg;
+};
+
+/*
+ * ks_update callback for the "mii" kstat.
+ *
+ * On KSTAT_READ, refreshes every named statistic from the MII state kept
+ * in the soft state; any other request is ignored.  Always returns 0.
+ *
+ * The statistics are registered by usbgem_kstat_init() as 32-bit values
+ * (KSTAT_DATA_INT32 for xcvr_addr, KSTAT_DATA_UINT32 for the rest), so
+ * they are stored through the matching 32-bit union members.  Storing
+ * through value.ul would place the data in the wrong half of the union on
+ * a big-endian LP64 kernel, where consumers read the 32-bit member.
+ */
+static int
+usbgem_kstat_update(kstat_t *ksp, int rw)
+{
+	struct usbgem_kstat_named *knp;
+	struct usbgem_dev *dp = (struct usbgem_dev *)ksp->ks_private;
+
+	if (rw != KSTAT_READ) {
+		return (0);
+	}
+
+	knp = (struct usbgem_kstat_named *)ksp->ks_data;
+
+	/* transceiver and link state */
+	knp->ks_xcvr_addr.value.i32 = dp->mii_phy_addr;
+	knp->ks_xcvr_id.value.ui32 = dp->mii_phy_id;
+	knp->ks_xcvr_inuse.value.ui32 = usbgem_mac_xcvr_inuse(dp);
+	knp->ks_link_up.value.ui32 = dp->mii_state == MII_STATE_LINKUP;
+	knp->ks_link_duplex.value.ui32 =
+	    (dp->mii_state == MII_STATE_LINKUP) ?
+	    (dp->full_duplex ? 2 : 1) : 0;
+
+	/* local capabilities, from the MII status registers */
+	knp->ks_cap_1000fdx.value.ui32 =
+	    (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) ||
+	    (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD);
+	knp->ks_cap_1000hdx.value.ui32 =
+	    (dp->mii_xstatus & MII_XSTATUS_1000BASET) ||
+	    (dp->mii_xstatus & MII_XSTATUS_1000BASEX);
+	knp->ks_cap_100fdx.value.ui32 =
+	    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD);
+	knp->ks_cap_100hdx.value.ui32 =
+	    BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX);
+	knp->ks_cap_10fdx.value.ui32 =
+	    BOOLEAN(dp->mii_status & MII_STATUS_10_FD);
+	knp->ks_cap_10hdx.value.ui32 =
+	    BOOLEAN(dp->mii_status & MII_STATUS_10);
+#ifdef NEVER
+	knp->ks_cap_remfault.value.ui32 = B_TRUE;
+#endif
+	knp->ks_cap_autoneg.value.ui32 =
+	    BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG);
+
+	/* what we currently advertise */
+	knp->ks_adv_cap_1000fdx.value.ui32 = dp->anadv_1000fdx;
+	knp->ks_adv_cap_1000hdx.value.ui32 = dp->anadv_1000hdx;
+	knp->ks_adv_cap_100fdx.value.ui32 = dp->anadv_100fdx;
+	knp->ks_adv_cap_100hdx.value.ui32 = dp->anadv_100hdx;
+	knp->ks_adv_cap_10fdx.value.ui32 = dp->anadv_10fdx;
+	knp->ks_adv_cap_10hdx.value.ui32 = dp->anadv_10hdx;
+#ifdef NEVER
+	knp->ks_adv_cap_remfault.value.ui32 = 0;
+#endif
+	knp->ks_adv_cap_autoneg.value.ui32 = dp->anadv_autoneg;
+
+	/* what the link partner reported */
+	knp->ks_lp_cap_1000fdx.value.ui32 =
+	    BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL);
+	knp->ks_lp_cap_1000hdx.value.ui32 =
+	    BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF);
+	knp->ks_lp_cap_100fdx.value.ui32 =
+	    BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD);
+	knp->ks_lp_cap_100hdx.value.ui32 =
+	    BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX);
+	knp->ks_lp_cap_10fdx.value.ui32 =
+	    BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD);
+	knp->ks_lp_cap_10hdx.value.ui32 =
+	    BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T);
+	knp->ks_lp_cap_remfault.value.ui32 =
+	    BOOLEAN(dp->mii_exp & MII_AN_EXP_PARFAULT);
+	knp->ks_lp_cap_autoneg.value.ui32 =
+	    BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN);
+
+	return (0);
+}
+
+
+/* register one 32-bit unsigned member of struct usbgem_kstat_named */
+#define	USBGEM_KS_U32(member, label)	\
+	kstat_named_init(&knp->member, (label), KSTAT_DATA_UINT32)
+
+/*
+ * Create and install the per-instance "mii" named kstat (class "net").
+ *
+ * One named statistic is registered for every member of
+ * struct usbgem_kstat_named; the values themselves are filled in on demand
+ * by usbgem_kstat_update().  The kstat handle is remembered in dp->ksp.
+ *
+ * Returns USB_SUCCESS, or USB_FAILURE when kstat_create() fails.
+ */
+static int
+usbgem_kstat_init(struct usbgem_dev *dp)
+{
+	struct usbgem_kstat_named	*knp;
+	kstat_t				*ksp;
+
+	ksp = kstat_create(
+	    (char *)ddi_driver_name(dp->dip), ddi_get_instance(dp->dip),
+	    "mii", "net", KSTAT_TYPE_NAMED,
+	    sizeof (*knp) / sizeof (knp->ks_xcvr_addr), 0);
+
+	if (ksp == NULL) {
+		cmn_err(CE_WARN, "%s: %s() for mii failed",
+		    dp->name, __func__);
+		return (USB_FAILURE);
+	}
+
+	knp = (struct usbgem_kstat_named *)ksp->ks_data;
+
+	/* transceiver and link state */
+	kstat_named_init(&knp->ks_xcvr_addr, "xcvr_addr",
+	    KSTAT_DATA_INT32);
+	USBGEM_KS_U32(ks_xcvr_id, "xcvr_id");
+	USBGEM_KS_U32(ks_xcvr_inuse, "xcvr_inuse");
+	USBGEM_KS_U32(ks_link_up, "link_up");
+	USBGEM_KS_U32(ks_link_duplex, "link_duplex");
+
+	/* local capabilities */
+	USBGEM_KS_U32(ks_cap_1000fdx, "cap_1000fdx");
+	USBGEM_KS_U32(ks_cap_1000hdx, "cap_1000hdx");
+	USBGEM_KS_U32(ks_cap_100fdx, "cap_100fdx");
+	USBGEM_KS_U32(ks_cap_100hdx, "cap_100hdx");
+	USBGEM_KS_U32(ks_cap_10fdx, "cap_10fdx");
+	USBGEM_KS_U32(ks_cap_10hdx, "cap_10hdx");
+#ifdef NEVER
+	USBGEM_KS_U32(ks_cap_remfault, "cap_rem_fault");
+#endif
+	USBGEM_KS_U32(ks_cap_autoneg, "cap_autoneg");
+
+	/* advertised capabilities */
+	USBGEM_KS_U32(ks_adv_cap_1000fdx, "adv_cap_1000fdx");
+	USBGEM_KS_U32(ks_adv_cap_1000hdx, "adv_cap_1000hdx");
+	USBGEM_KS_U32(ks_adv_cap_100fdx, "adv_cap_100fdx");
+	USBGEM_KS_U32(ks_adv_cap_100hdx, "adv_cap_100hdx");
+	USBGEM_KS_U32(ks_adv_cap_10fdx, "adv_cap_10fdx");
+	USBGEM_KS_U32(ks_adv_cap_10hdx, "adv_cap_10hdx");
+#ifdef NEVER
+	USBGEM_KS_U32(ks_adv_cap_remfault, "adv_rem_fault");
+#endif
+	USBGEM_KS_U32(ks_adv_cap_autoneg, "adv_cap_autoneg");
+
+	/* link partner capabilities */
+	USBGEM_KS_U32(ks_lp_cap_1000fdx, "lp_cap_1000fdx");
+	USBGEM_KS_U32(ks_lp_cap_1000hdx, "lp_cap_1000hdx");
+	USBGEM_KS_U32(ks_lp_cap_100fdx, "lp_cap_100fdx");
+	USBGEM_KS_U32(ks_lp_cap_100hdx, "lp_cap_100hdx");
+	USBGEM_KS_U32(ks_lp_cap_10fdx, "lp_cap_10fdx");
+	USBGEM_KS_U32(ks_lp_cap_10hdx, "lp_cap_10hdx");
+	USBGEM_KS_U32(ks_lp_cap_remfault, "lp_cap_rem_fault");
+	USBGEM_KS_U32(ks_lp_cap_autoneg, "lp_cap_autoneg");
+
+	/* hook up the update routine before making the kstat visible */
+	ksp->ks_private = (void *) dp;
+	ksp->ks_update = usbgem_kstat_update;
+	dp->ksp = ksp;
+
+	kstat_install(ksp);
+
+	return (USB_SUCCESS);
+}
+#undef	USBGEM_KS_U32
+#endif /* GEM_CONFIG_GLDv3 */
+/* ======================================================================== */
+/*
+ * attach/detach/usb support
+ */
+/* ======================================================================== */
+/*
+ * Issue a control-out (host to device) request on the default pipe.
+ *
+ *   reqt, req, val, ix, len - bmRequestType, bRequest, wValue, wIndex and
+ *                             wLength of the setup packet
+ *   bp, size                - payload; when size > 0, `size' bytes are
+ *                             copied from bp into a freshly allocated
+ *                             message block that is passed to the transfer
+ *
+ * The transfer is attempted up to usbgem_ctrl_retry times and a warning is
+ * logged when the last attempt fails.
+ *
+ * Returns USB_PIPE_ERROR when the MAC is in the disconnected state,
+ * USB_FAILURE when the message block cannot be allocated or when no
+ * attempt is made at all (usbgem_ctrl_retry <= 0), and otherwise the
+ * result of the last usb_pipe_ctrl_xfer_wait() call.
+ */
+int
+usbgem_ctrl_out(struct usbgem_dev *dp,
+	uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+	void *bp, int size)
+{
+	mblk_t			*data;
+	usb_ctrl_setup_t	setup;
+	usb_cr_t		completion_reason;
+	usb_cb_flags_t		cb_flags;
+	int			i;
+	int			ret;
+
+	DPRINTF(4, (CE_CONT, "!%s: %s "
+	    "reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x "
+	    "bp:0x%p nic_state:%d",
+	    dp->name, __func__, reqt, req, val, ix, len, bp, dp->nic_state));
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		return (USB_PIPE_ERROR);
+	}
+
+	data = NULL;
+	if (size > 0) {
+		if ((data = allocb(size, 0)) == NULL) {
+			return (USB_FAILURE);
+		}
+
+		bcopy(bp, data->b_rptr, size);
+		data->b_wptr = data->b_rptr + size;
+	}
+
+	setup.bmRequestType = reqt;
+	setup.bRequest = req;
+	setup.wValue = val;
+	setup.wIndex = ix;
+	setup.wLength = len;
+	setup.attrs = 0;	/* attributes */
+
+	/* never return an indeterminate value if the loop does not run */
+	ret = USB_FAILURE;
+
+	for (i = usbgem_ctrl_retry; i > 0; i--) {
+		completion_reason = 0;
+		cb_flags = 0;
+
+		ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp),
+		    &setup, &data, &completion_reason, &cb_flags, 0);
+
+		if (ret == USB_SUCCESS) {
+			break;
+		}
+		if (i == 1) {
+			/* only the final, unrecoverable failure is logged */
+			cmn_err(CE_WARN,
+			    "!%s: %s failed: "
+			    "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x "
+			    "ret:%d cr:%s(%d), cb_flags:0x%x %s",
+			    dp->name, __func__, reqt, req, val, ix, len,
+			    ret, usb_str_cr(completion_reason),
+			    completion_reason,
+			    cb_flags,
+			    "fatal");
+		}
+	}
+
+	if (data != NULL) {
+		freemsg(data);
+	}
+
+	return (ret);
+}
+
+/*
+ * Issue a control-in (device to host) request on the default pipe.
+ *
+ *   reqt, req, val, ix, len - bmRequestType, bRequest, wValue, wIndex and
+ *                             wLength of the setup packet
+ *   bp, size                - destination buffer and its capacity; at most
+ *                             `size' bytes of the received data are copied
+ *
+ * The transfer is attempted up to usbgem_ctrl_retry times and a warning is
+ * logged when the last attempt fails.
+ *
+ * Returns USB_PIPE_ERROR when the MAC is in the disconnected state,
+ * USB_FAILURE when no attempt is made at all (usbgem_ctrl_retry <= 0), and
+ * otherwise the result of the last usb_pipe_ctrl_xfer_wait() call.
+ */
+int
+usbgem_ctrl_in(struct usbgem_dev *dp,
+	uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+	void *bp, int size)
+{
+	mblk_t			*data;
+	usb_ctrl_setup_t	setup;
+	usb_cr_t		completion_reason;
+	usb_cb_flags_t		cb_flags;
+	int			i;
+	int			ret;
+	int			reclen;
+
+	DPRINTF(4, (CE_CONT,
+	    "!%s: %s:"
+	    " reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x"
+	    " bp:x%p mac_state:%d",
+	    dp->name, __func__, reqt, req, val, ix, len, bp, dp->mac_state));
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		return (USB_PIPE_ERROR);
+	}
+
+	data = NULL;
+
+	setup.bmRequestType = reqt;
+	setup.bRequest = req;
+	setup.wValue = val;
+	setup.wIndex = ix;
+	setup.wLength = len;
+	setup.attrs = USB_ATTRS_AUTOCLEARING;	/* XXX */
+
+	/* never return an indeterminate value if the loop does not run */
+	ret = USB_FAILURE;
+
+	for (i = usbgem_ctrl_retry; i > 0; i--) {
+		completion_reason = 0;
+		cb_flags = 0;
+		ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp), &setup, &data,
+		    &completion_reason, &cb_flags, 0);
+
+		if (ret == USB_SUCCESS) {
+			/*
+			 * Copy only when a data block was handed back and
+			 * the caller supplied room for it, so that neither
+			 * a NULL block is dereferenced nor a non-positive
+			 * size reaches bcopy().
+			 */
+			if (data != NULL && size > 0) {
+				reclen = msgdsize(data);
+				bcopy(data->b_rptr, bp, min(reclen, size));
+			}
+			break;
+		}
+		if (i == 1) {
+			/* only the final, unrecoverable failure is logged */
+			cmn_err(CE_WARN,
+			    "!%s: %s failed: "
+			    "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x "
+			    "ret:%d cr:%s(%d) cb_flags:0x%x %s",
+			    dp->name, __func__,
+			    reqt, req, val, ix, len,
+			    ret, usb_str_cr(completion_reason),
+			    completion_reason,
+			    cb_flags,
+			    "fatal");
+		}
+	}
+
+	if (data) {
+		freemsg(data);
+	}
+
+	return (ret);
+}
+
+/*
+ * Send a small integer `v' to the device with a control-out request.
+ *
+ * The low `len' bytes of v are sent in little-endian order, independent
+ * of the host byte order.  len may be 0..4; a larger length cannot be
+ * represented by the 32-bit value and is rejected with USB_FAILURE
+ * instead of reading past the local buffer.  Otherwise the result of
+ * usbgem_ctrl_out() is returned.
+ */
+int
+usbgem_ctrl_out_val(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    uint32_t v)
+{
+	uint8_t	buf[4] = { 0 };
+	uint_t	i;
+
+	if (len > sizeof (buf)) {
+		return (USB_FAILURE);
+	}
+
+	/* convert to little endian from native byte order */
+	for (i = 0; i < len; i++) {
+		buf[i] = (uint8_t)(v >> (8 * i));
+	}
+
+	return (usbgem_ctrl_out(dp, reqt, req, val, ix, len, buf, len));
+}
+
+/*
+ * Read a small little-endian integer from the device with a control-in
+ * request and store it, in host byte order, through valp.
+ *
+ * valp must point to a uint8_t, uint16_t or uint32_t for len 1, 2 or 4
+ * respectively; for any other len up to 4 nothing is stored.  A len
+ * larger than 4 would overflow the local buffer and is rejected with
+ * USB_FAILURE.  Otherwise the result of usbgem_ctrl_in() is returned, and
+ * *valp is only written on USB_SUCCESS.
+ */
+int
+usbgem_ctrl_in_val(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    void *valp)
+{
+	/* zeroed so that a short read never exposes stack garbage */
+	uint8_t		buf[4] = { 0 };
+	uint32_t	v;
+	int		err;
+
+	if (len > sizeof (buf)) {
+		return (USB_FAILURE);
+	}
+
+	err = usbgem_ctrl_in(dp, reqt, req, val, ix, len, buf, len);
+	if (err != USB_SUCCESS) {
+		return (err);
+	}
+
+	/*
+	 * Assemble in unsigned arithmetic: shifting a promoted (signed)
+	 * byte into the sign bit would be undefined behavior.  Bytes
+	 * beyond `len' are still zero and so do not contribute.
+	 */
+	v = (uint32_t)buf[0] |
+	    ((uint32_t)buf[1] << 8) |
+	    ((uint32_t)buf[2] << 16) |
+	    ((uint32_t)buf[3] << 24);
+
+	switch (len) {
+	case 4:
+		*(uint32_t *)valp = v;
+		break;
+	case 2:
+		*(uint16_t *)valp = (uint16_t)v;
+		break;
+	case 1:
+		*(uint8_t *)valp = (uint8_t)v;
+		break;
+	}
+
+	return (err);
+}
+
+/*
+ * Attach / detach / disconnect / reconnect management
+ */
+/*
+ * Look up the endpoints of the configured interface/alternate setting and
+ * open the corresponding pipes.
+ *
+ * The bulk-in and bulk-out endpoints are mandatory; the interrupt-in
+ * endpoint is optional and dp->ep_intr is set to NULL when it is absent.
+ * Pipes are opened in the order bulk-out, bulk-in, interrupt, each with a
+ * zeroed policy allowing one asynchronous request.
+ *
+ * Returns USB_SUCCESS when all required pipes are open.  On any failure
+ * every pipe handle that is non-NULL is closed and cleared, and
+ * USB_FAILURE is returned.
+ */
+static int
+usbgem_open_pipes(struct usbgem_dev *dp)
+{
+	int			i;
+	int			ret;
+	int			ifnum;
+	int			alt;
+	usb_client_dev_data_t	*reg_data;
+	usb_ep_data_t		*ep_tree_node;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	ifnum = dp->ugc.usbgc_ifnum;
+	alt = dp->ugc.usbgc_alt;
+
+	/* mandatory: bulk-in endpoint */
+	ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt,
+	    0, USB_EP_ATTR_BULK, USB_EP_DIR_IN);
+	if (ep_tree_node == NULL) {
+		cmn_err(CE_WARN, "!%s: %s: ep_bulkin is NULL",
+		    dp->name, __func__);
+		goto err;
+	}
+	dp->ep_bulkin = &ep_tree_node->ep_descr;
+
+	/* mandatory: bulk-out endpoint */
+	ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt,
+	    0, USB_EP_ATTR_BULK, USB_EP_DIR_OUT);
+	if (ep_tree_node == NULL) {
+		cmn_err(CE_WARN, "!%s: %s: ep_bulkout is NULL",
+		    dp->name, __func__);
+		goto err;
+	}
+	dp->ep_bulkout = &ep_tree_node->ep_descr;
+
+	/* optional: interrupt-in endpoint */
+	ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt,
+	    0, USB_EP_ATTR_INTR, USB_EP_DIR_IN);
+	if (ep_tree_node) {
+		dp->ep_intr = &ep_tree_node->ep_descr;
+	} else {
+		/* don't care */
+		DPRINTF(1, (CE_CONT, "!%s: %s: ep_intr is NULL",
+		    dp->name, __func__));
+		dp->ep_intr = NULL;
+	}
+
+	/* XXX -- no need to open default pipe */
+
+	/* open bulk out pipe */
+	bzero(&dp->policy_bulkout, sizeof (usb_pipe_policy_t));
+	dp->policy_bulkout.pp_max_async_reqs = 1;
+
+	if ((ret = usb_pipe_open(dp->dip,
+	    dp->ep_bulkout, &dp->policy_bulkout, USB_FLAGS_SLEEP,
+	    &dp->bulkout_pipe)) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: err:%x: failed to open bulk-out pipe",
+		    dp->name, __func__, ret);
+		/* make sure the cleanup below does not close a bad handle */
+		dp->bulkout_pipe = NULL;
+		goto err;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: bulkout_pipe opened successfully",
+	    dp->name, __func__));
+
+	/* open bulk in pipe */
+	bzero(&dp->policy_bulkin, sizeof (usb_pipe_policy_t));
+	dp->policy_bulkin.pp_max_async_reqs = 1;
+	if ((ret = usb_pipe_open(dp->dip,
+	    dp->ep_bulkin, &dp->policy_bulkin, USB_FLAGS_SLEEP,
+	    &dp->bulkin_pipe)) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: ret:%x failed to open bulk-in pipe",
+		    dp->name, __func__, ret);
+		dp->bulkin_pipe = NULL;
+		goto err;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: bulkin_pipe opened successfully",
+	    dp->name, __func__));
+
+	if (dp->ep_intr) {
+		/* open interrupt pipe */
+		bzero(&dp->policy_interrupt, sizeof (usb_pipe_policy_t));
+		dp->policy_interrupt.pp_max_async_reqs = 1;
+		if ((ret = usb_pipe_open(dp->dip, dp->ep_intr,
+		    &dp->policy_interrupt, USB_FLAGS_SLEEP,
+		    &dp->intr_pipe)) != USB_SUCCESS) {
+			cmn_err(CE_WARN,
+			    "!%s: %s: ret:%x failed to open interrupt pipe",
+			    dp->name, __func__, ret);
+			dp->intr_pipe = NULL;
+			goto err;
+		}
+	}
+	/* note: this message is also printed when there is no intr endpoint */
+	DPRINTF(1, (CE_CONT, "!%s: %s: intr_pipe opened successfully",
+	    dp->name, __func__));
+
+	return (USB_SUCCESS);
+
+err:
+	/* undo whatever was opened so far */
+	if (dp->bulkin_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->bulkin_pipe = NULL;
+	}
+	if (dp->bulkout_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->bulkout_pipe = NULL;
+	}
+	if (dp->intr_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->intr_pipe = NULL;
+	}
+
+	return (USB_FAILURE);
+}
+
+/*
+ * Close the pipes opened by usbgem_open_pipes() and clear their handles.
+ *
+ * The interrupt pipe is closed only if it is open; the bulk-in and
+ * bulk-out pipes are expected to be open (asserted) and are closed in that
+ * order.  Always returns USB_SUCCESS.
+ */
+static int
+usbgem_close_pipes(struct usbgem_dev *dp)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* optional interrupt pipe */
+	if (dp->intr_pipe != NULL) {
+		usb_pipe_close(dp->dip,
+		    dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->intr_pipe = NULL;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: 1", dp->name, __func__));
+
+	/* bulk-in pipe */
+	ASSERT(dp->bulkin_pipe);
+	usb_pipe_close(dp->dip,
+	    dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	dp->bulkin_pipe = NULL;
+	DPRINTF(1, (CE_CONT, "!%s: %s: 2", dp->name, __func__));
+
+	/* bulk-out pipe */
+	ASSERT(dp->bulkout_pipe);
+	usb_pipe_close(dp->dip,
+	    dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	dp->bulkout_pipe = NULL;
+	DPRINTF(1, (CE_CONT, "!%s: %s: 3", dp->name, __func__));
+
+	return (USB_SUCCESS);
+}
+
+/* values for the `graceful' argument of usbgem_freeze_device() */
+#define FREEZE_GRACEFUL		(B_TRUE)
+#define FREEZE_NO_GRACEFUL	(B_FALSE)
+
+/*
+ * Quiesce the device and put the MAC into the disconnected state.
+ *
+ * Stops all nic activity (gracefully or not, as requested) and then
+ * releases the driver's memory resources.  The results of both steps are
+ * ignored; USB_SUCCESS is always returned.
+ */
+static int
+usbgem_freeze_device(struct usbgem_dev *dp, boolean_t graceful)
+{
+	DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__));
+
+	/* stop nic activity */
+	(void) usbgem_mac_stop(dp, MAC_STATE_DISCONNECTED, graceful);
+
+	/*
+	 * All allocated memory resources are released here, because
+	 * freeing usb_bulk_req objects while the usb device is
+	 * disconnected would panic the system.
+	 */
+	(void) usbgem_free_memory(dp);
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * Callback for a USB disconnect (hot removal) event.
+ *
+ * Logs the event and freezes the device, without a graceful stop, while
+ * holding the device state lock as writer.  Returns the result of
+ * usbgem_freeze_device().
+ */
+static int
+usbgem_disconnect_cb(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp = USBGEM_GET_DEV(dip);
+	int			rv;
+
+	cmn_err(CE_NOTE, "!%s: the usb device was disconnected (dp=%p)",
+	    dp->name, dp);
+
+	/* serialize against every other state change */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	rv = usbgem_freeze_device(dp, FREEZE_NO_GRACEFUL);
+	rw_exit(&dp->dev_state_lock);
+
+	return (rv);
+}
+
+/*
+ * Bring a frozen (disconnected) device back into service.
+ *
+ * Re-opens the USB pipes, resets the MAC/MII state, re-allocates the
+ * memory resources, restarts the nic, re-initializes the MII and finally
+ * wakes up the link watcher.
+ *
+ * Returns USB_SUCCESS, or the error from usbgem_open_pipes() or
+ * usbgem_alloc_memory() when one of them fails.  The result of
+ * usbgem_restart_nic() is ignored.
+ */
+static int
+usbgem_recover_device(struct usbgem_dev	*dp)
+{
+	int	rv;
+
+	DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__));
+
+	/* reinitialize the usb connection */
+	usbgem_close_pipes(dp);
+	rv = usbgem_open_pipes(dp);
+	if (rv != USB_SUCCESS) {
+		return (rv);
+	}
+
+	/* initialize nic state */
+	dp->mac_state = MAC_STATE_STOPPED;
+	dp->mii_state = MII_STATE_UNKNOWN;
+
+	/* allocate memory resources again */
+	rv = usbgem_alloc_memory(dp);
+	if (rv != USB_SUCCESS) {
+		return (rv);
+	}
+
+	/* restart nic and recover state */
+	(void) usbgem_restart_nic(dp);
+
+	usbgem_mii_init(dp);
+
+	/* kick potentially stopped house keeping thread */
+	cv_signal(&dp->link_watcher_wait_cv);
+
+	return (rv);
+}
+
+/*
+ * Callback for a USB reconnect event.
+ *
+ * Logs the event and, if the MAC is in the disconnected state, recovers
+ * the device while holding the device state lock as writer.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when the recovery fails.
+ */
+static int
+usbgem_reconnect_cb(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp;
+	int			rv = USB_SUCCESS;
+
+	dp = USBGEM_GET_DEV(dip);
+	DPRINTF(0, (CE_CONT, "!%s: dp=%p", ddi_get_name(dip), dp));
+#ifdef notdef
+	/* check device changes after disconnect */
+	if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1,
+	    USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) {
+		cmn_err(CE_CONT,
+		     "!%s: no or different device installed", dp->name);
+		return (DDI_SUCCESS);
+	}
+#endif
+	cmn_err(CE_NOTE, "%s: the usb device was reconnected", dp->name);
+
+	/* serialize against every other state change */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		rv = usbgem_recover_device(dp);
+	}
+	rw_exit(&dp->dev_state_lock);
+
+	if (rv != USB_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Suspend the device.
+ *
+ * With the device state lock held as writer, freezes the device
+ * gracefully if the MAC is in the disconnected state.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE when freezing fails.
+ *
+ * NOTE(review): the device is frozen only when mac_state is ALREADY
+ * MAC_STATE_DISCONNECTED, while usbgem_resume() recovers only from that
+ * same state.  The test looks inverted (`!=' expected), which would make
+ * suspend a no-op for a connected device -- confirm before changing.
+ */
+int
+usbgem_suspend(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp;
+	int			rv = USB_SUCCESS;
+
+	dp = USBGEM_GET_DEV(dip);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: callded", dp->name, __func__));
+
+	/* serialize against every other state change */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		rv = usbgem_freeze_device(dp, STOP_GRACEFUL);
+	}
+	rw_exit(&dp->dev_state_lock);
+
+	if (rv != USB_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+	return (DDI_SUCCESS);
+}
+
+/*
+ * usbgem_resume: bring the device back after a suspend.
+ * If the mac is marked MAC_STATE_DISCONNECTED, usbgem_recover_device()
+ * is run while holding the device state lock as writer.
+ * Returns DDI_FAILURE only when that recovery fails.
+ */
+int
+usbgem_resume(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp = USBGEM_GET_DEV(dip);
+	int			rc = DDI_SUCCESS;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: callded", dp->name, __func__));
+#ifdef notdef
+	/* check device changes after disconnect */
+	if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1,
+	    USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) {
+		cmn_err(CE_CONT,
+		     "!%s: no or different device installed", dp->name);
+		return (DDI_SUCCESS);
+	}
+#endif
+	/* serialize against other state transitions */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	if (dp->mac_state == MAC_STATE_DISCONNECTED &&
+	    usbgem_recover_device(dp) != USB_SUCCESS) {
+		rc = DDI_FAILURE;
+	}
+	rw_exit(&dp->dev_state_lock);
+
+	return (rc);
+}
+
+/*
+ * Size of the soft state allocation: struct usbgem_dev immediately
+ * followed by the multicast address table (USBGEM_MCALLOC bytes).
+ * The gc argument is not used by the expansion.
+ */
+#define	USBGEM_LOCAL_DATA_SIZE(gc)	\
+	(sizeof (struct usbgem_dev) + USBGEM_MCALLOC)
+
+/*
+ * usbgem_do_attach: common attach routine for usbgem based drivers.
+ *
+ * Allocates the soft state, registers with the usb framework, opens the
+ * pipes, resets and attaches the chip, probes and initializes the MII
+ * phy, registers with the mac layer (GLDv3) or gld (GLDv2), registers
+ * the hotplug callbacks and starts the link and tx watchers.
+ *
+ *   dip    - devinfo node of the device being attached
+ *   gc     - chip specific configuration; copied into dp->ugc
+ *   lp     - chip specific private area, stored in dp->private
+ *   lmsize - size of lp in bytes, stored in dp->priv_size
+ *
+ * Returns the new soft state on success.  On failure everything acquired
+ * here is released and NULL is returned; lp is not freed on that path.
+ */
+struct usbgem_dev *
+usbgem_do_attach(dev_info_t *dip,
+	struct usbgem_conf *gc, void *lp, int lmsize)
+{
+	struct usbgem_dev	*dp;
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_register_t		*macp = NULL;
+#else
+	gld_mac_info_t		*macinfo;
+	void			*tmp;
+#endif
+	int			ret;
+	int			unit;
+
+	unit = ddi_get_instance(dip);
+
+	DPRINTF(2, (CE_CONT, "!usbgem%d: %s: called", unit, __func__));
+
+	/*
+	 * Allocate soft data structure.
+	 * A KM_SLEEP allocation waits for memory instead of returning NULL,
+	 * so there is no failure case to handle here.  (The former NULL
+	 * check was dead code and, in the GLDv2 build, passed the still
+	 * uninitialized macinfo to gld_mac_free().)
+	 */
+	dp = kmem_zalloc(USBGEM_LOCAL_DATA_SIZE(gc), KM_SLEEP);
+#ifdef USBGEM_CONFIG_GLDv3
+	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
+		cmn_err(CE_WARN, "!gem%d: %s: mac_alloc failed",
+		    unit, __func__);
+		/* macp is NULL here, so only dp is released */
+		goto err_free_private;
+	}
+#else
+	macinfo = gld_mac_alloc(dip);
+	dp->macinfo = macinfo;
+#endif
+
+	/* link to private area */
+	dp->private = lp;
+	dp->priv_size = lmsize;
+	/* the multicast table lives right behind the soft state */
+	dp->mc_list = (struct mcast_addr *)&dp[1];
+
+	dp->dip = dip;
+	bcopy(gc->usbgc_name, dp->name, USBGEM_NAME_LEN);
+
+	/*
+	 * register with usb service
+	 */
+	if (usb_client_attach(dip, USBDRV_VERSION, 0) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "%s: %s: usb_client_attach failed",
+		    dp->name, __func__);
+		goto err_free_private;
+	}
+
+	if (usb_get_dev_data(dip, &dp->reg_data,
+	    USB_PARSE_LVL_ALL, 0) != USB_SUCCESS) {
+		dp->reg_data = NULL;
+		goto err_unregister_client;
+	}
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_print_descr_tree(dp->dip, dp->reg_data);
+#endif
+
+	if (usbgem_open_pipes(dp) != USB_SUCCESS) {
+		/* failed to open pipes */
+		cmn_err(CE_WARN, "!%s: %s: failed to open pipes",
+		    dp->name, __func__);
+		goto err_unregister_client;
+	}
+
+	/*
+	 * Initialize mutexs and condition variables
+	 */
+	mutex_init(&dp->rxlock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&dp->txlock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&dp->rx_drain_cv, NULL, CV_DRIVER, NULL);
+	cv_init(&dp->tx_drain_cv, NULL, CV_DRIVER, NULL);
+	rw_init(&dp->dev_state_lock, NULL, RW_DRIVER, NULL);
+	mutex_init(&dp->link_watcher_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&dp->link_watcher_wait_cv, NULL, CV_DRIVER, NULL);
+	sema_init(&dp->hal_op_lock, 1, NULL, SEMA_DRIVER, NULL);
+	sema_init(&dp->rxfilter_lock, 1, NULL, SEMA_DRIVER, NULL);
+
+	/*
+	 * Initialize configuration
+	 */
+	dp->ugc = *gc;
+
+	dp->mtu = ETHERMTU;
+	dp->rxmode = 0;
+	dp->speed = USBGEM_SPD_10;	/* default is 10Mbps */
+	dp->full_duplex = B_FALSE;	/* default is half */
+	dp->flow_control = FLOW_CONTROL_NONE;
+
+	dp->nic_state = NIC_STATE_STOPPED;
+	dp->mac_state = MAC_STATE_STOPPED;
+	dp->mii_state = MII_STATE_UNKNOWN;
+
+	/* performance tuning parameters */
+	dp->txthr = ETHERMAX;		/* tx fifo threshold */
+	dp->txmaxdma = 16*4;		/* tx max dma burst size */
+	dp->rxthr = 128;		/* rx fifo threshold */
+	dp->rxmaxdma = 16*4;		/* rx max dma burst size */
+
+	/*
+	 * Get media mode information from .conf file
+	 */
+	usbgem_read_conf(dp);
+
+	/* rx_buf_len depend on MTU */
+	dp->rx_buf_len = MAXPKTBUF(dp) + dp->ugc.usbgc_rx_header_len;
+
+	/*
+	 * Reset the chip
+	 */
+	if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to reset the usb device",
+		    dp->name, __func__);
+		goto err_destroy_locks;
+	}
+
+	/*
+	 * HW dependent parameter initialization
+	 */
+	if (usbgem_hal_attach_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to attach the usb device",
+		    dp->name, __func__);
+		goto err_destroy_locks;
+	}
+
+	/* allocate resources */
+	if (usbgem_alloc_memory(dp) != USB_SUCCESS) {
+		goto err_destroy_locks;
+	}
+
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %02x:%02x:%02x:%02x:%02x:%02x",
+	    dp->name,
+	    dp->dev_addr.ether_addr_octet[0],
+	    dp->dev_addr.ether_addr_octet[1],
+	    dp->dev_addr.ether_addr_octet[2],
+	    dp->dev_addr.ether_addr_octet[3],
+	    dp->dev_addr.ether_addr_octet[4],
+	    dp->dev_addr.ether_addr_octet[5]));
+
+	/* copy mac address */
+	dp->cur_addr = dp->dev_addr;
+
+	/* pre-calculated tx timeout in second for performance */
+	dp->bulkout_timeout =
+	    dp->ugc.usbgc_tx_timeout / drv_usectohz(1000*1000);
+
+#ifdef USBGEM_CONFIG_GLDv3
+	usbgem_gld3_init(dp, macp);
+#else
+	usbgem_gld_init(dp, macinfo, ident);
+#endif
+
+	/* Probe MII phy (scan phy) */
+	dp->mii_lpable = 0;
+	dp->mii_advert = 0;
+	dp->mii_exp = 0;
+	dp->mii_ctl1000 = 0;
+	dp->mii_stat1000 = 0;
+
+	dp->mii_status_ro = 0;
+	dp->mii_xstatus_ro = 0;
+
+	if (usbgem_mii_probe(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN, "!%s: %s: mii_probe failed",
+		    dp->name, __func__);
+		goto err_free_memory;
+	}
+
+	/* mask unsupported abilities */
+	dp->anadv_autoneg &= BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG);
+	dp->anadv_1000fdx &=
+	    BOOLEAN(dp->mii_xstatus &
+	    (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD));
+	dp->anadv_1000hdx &=
+	    BOOLEAN(dp->mii_xstatus &
+	    (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET));
+	dp->anadv_100t4 &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4);
+	dp->anadv_100fdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD);
+	dp->anadv_100hdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX);
+	dp->anadv_10fdx &= BOOLEAN(dp->mii_status & MII_STATUS_10_FD);
+	dp->anadv_10hdx &= BOOLEAN(dp->mii_status & MII_STATUS_10);
+
+	if (usbgem_mii_init(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN, "!%s: %s: mii_init failed",
+		    dp->name, __func__);
+		goto err_free_memory;
+	}
+
+	/*
+	 * initialize kstats including mii statistics
+	 */
+#ifdef USBGEM_CONFIG_GLDv3
+#ifdef USBGEM_CONFIG_ND
+	usbgem_nd_setup(dp);
+#endif
+#else
+	if (usbgem_kstat_init(dp) != USB_SUCCESS) {
+		goto err_free_memory;
+	}
+#endif
+
+	/*
+	 * Register with the mac layer (GLDv3) or gld (GLDv2).
+	 */
+#ifdef USBGEM_CONFIG_GLDv3
+	if ((ret = mac_register(macp, &dp->mh)) != 0) {
+		cmn_err(CE_WARN, "!%s: mac_register failed, error:%d",
+		    dp->name, ret);
+		goto err_release_stats;
+	}
+	/* macp is no longer needed; clear it so error paths skip mac_free */
+	mac_free(macp);
+	macp = NULL;
+#else
+	/* gld_register will corrupts driver_private */
+	tmp = ddi_get_driver_private(dip);
+	if (gld_register(dip,
+	    (char *)ddi_driver_name(dip), macinfo) != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "!%s: %s: gld_register failed",
+		    dp->name, __func__);
+		ddi_set_driver_private(dip, tmp);
+		goto err_release_stats;
+	}
+	/* restore driver private */
+	ddi_set_driver_private(dip, tmp);
+#endif /* USBGEM_CONFIG_GLDv3 */
+	if (usb_register_hotplug_cbs(dip,
+	    usbgem_suspend, usbgem_resume) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to register hotplug cbs",
+		    dp->name, __func__);
+		goto err_unregister_gld;
+	}
+
+	/* reset mii and start mii link watcher */
+	if (usbgem_mii_start(dp) != USB_SUCCESS) {
+		goto err_unregister_hotplug;
+	}
+
+	/* start tx watchdog watcher */
+	if (usbgem_tx_watcher_start(dp)) {
+		goto err_usbgem_mii_stop;
+	}
+
+	ddi_set_driver_private(dip, (caddr_t)dp);
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: return: success", dp->name, __func__));
+
+	return (dp);
+
+	/*
+	 * Error unwinding: each label undoes one setup step and falls
+	 * through to the labels for the steps that preceded it.
+	 */
+err_usbgem_mii_stop:
+	usbgem_mii_stop(dp);
+
+err_unregister_hotplug:
+	usb_unregister_hotplug_cbs(dip);
+
+err_unregister_gld:
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_unregister(dp->mh);
+#else
+	gld_unregister(macinfo);
+#endif
+
+err_release_stats:
+#ifdef USBGEM_CONFIG_GLDv3
+#ifdef USBGEM_CONFIG_ND
+	/* release NDD resources */
+	usbgem_nd_cleanup(dp);
+#endif
+#else
+	kstat_delete(dp->ksp);
+#endif
+
+err_free_memory:
+	usbgem_free_memory(dp);
+
+err_destroy_locks:
+	cv_destroy(&dp->tx_drain_cv);
+	cv_destroy(&dp->rx_drain_cv);
+	mutex_destroy(&dp->txlock);
+	mutex_destroy(&dp->rxlock);
+	rw_destroy(&dp->dev_state_lock);
+	mutex_destroy(&dp->link_watcher_lock);
+	cv_destroy(&dp->link_watcher_wait_cv);
+	sema_destroy(&dp->hal_op_lock);
+	sema_destroy(&dp->rxfilter_lock);
+
+	/* the pipes were opened just before the locks were initialized */
+	(void) usbgem_close_pipes(dp);
+
+err_unregister_client:
+	usb_client_detach(dp->dip, dp->reg_data);
+
+err_free_private:
+#ifdef USBGEM_CONFIG_GLDv3
+	if (macp) {
+		mac_free(macp);
+	}
+#else
+	gld_mac_free(macinfo);
+#endif
+	kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(gc));
+
+	return (NULL);
+}
+
+/*
+ * usbgem_do_detach: common detach routine for usbgem based drivers.
+ *
+ * Unregisters from the mac layer (GLDv3) or gld (GLDv2) first and gives
+ * up with DDI_FAILURE, leaving everything else intact, if that is
+ * refused.  Otherwise stops the watchers, releases the usb resources,
+ * destroys the locks and frees both the chip private area
+ * (dp->private, dp->priv_size bytes) and the soft state itself.
+ *
+ * Returns DDI_SUCCESS once the soft state has been released.
+ */
+int
+usbgem_do_detach(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp;
+
+	dp = USBGEM_GET_DEV(dip);
+
+#ifdef USBGEM_CONFIG_GLDv3
+	/* unregister with gld v3 */
+	if (mac_unregister(dp->mh) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+#else
+	/* unregister with gld v2 */
+	if (gld_unregister(dp->macinfo) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+#endif
+	/* unregister with hotplug service */
+	usb_unregister_hotplug_cbs(dip);
+
+	/* stop tx watchdog watcher */
+	usbgem_tx_watcher_stop(dp);
+
+	/* stop the link manager */
+	usbgem_mii_stop(dp);
+
+	/* release buffers and pipes, then unregister with usb service */
+	(void) usbgem_free_memory(dp);
+	(void) usbgem_close_pipes(dp);
+	usb_client_detach(dp->dip, dp->reg_data);
+	dp->reg_data = NULL;
+
+	/* unregister with kernel statistics */
+#ifdef USBGEM_CONFIG_GLDv3
+#ifdef USBGEM_CONFIG_ND
+	/* release ndd resources */
+	usbgem_nd_cleanup(dp);
+#endif
+#else
+	/* destroy kstat objects */
+	kstat_delete(dp->ksp);
+#endif
+
+	/*
+	 * release locks and condition variables
+	 * NOTE(review): tx_watcher_cv and tx_watcher_lock are not destroyed
+	 * here; presumably usbgem_tx_watcher_stop() owns them -- confirm.
+	 */
+	mutex_destroy(&dp->txlock);
+	mutex_destroy(&dp->rxlock);
+	cv_destroy(&dp->tx_drain_cv);
+	cv_destroy(&dp->rx_drain_cv);
+	rw_destroy(&dp->dev_state_lock);
+	mutex_destroy(&dp->link_watcher_lock);
+	cv_destroy(&dp->link_watcher_wait_cv);
+	sema_destroy(&dp->hal_op_lock);
+	sema_destroy(&dp->rxfilter_lock);
+
+	/* release basic memory resources */
+#ifndef USBGEM_CONFIG_GLDv3
+	gld_mac_free(dp->macinfo);
+#endif
+	kmem_free((caddr_t)(dp->private), dp->priv_size);
+	/* the size macro ignores its argument, so dp is not read after free */
+	kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(&dp->ugc));
+
+	/* dp is gone; only dip may be used from here on */
+	DPRINTF(2, (CE_CONT, "!%s: %s: return: success",
+	    ddi_driver_name(dip), __func__));
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * usbgem_mod_init: module load time hook.
+ * For GLDv3 builds, checks that the driver name maps to a major number
+ * and then runs mac_init_ops() on the dev_ops; otherwise does nothing.
+ * Returns DDI_FAILURE only when the name has no major number.
+ */
+int
+usbgem_mod_init(struct dev_ops *dop, char *name)
+{
+#ifdef USBGEM_CONFIG_GLDv3
+	if (ddi_name_to_major(name) == DDI_MAJOR_T_NONE) {
+		return (DDI_FAILURE);
+	}
+	mac_init_ops(dop, name);
+#endif
+	return (DDI_SUCCESS);
+}
+
+/*
+ * usbgem_mod_fini: module unload time hook.
+ * For GLDv3 builds, runs mac_fini_ops() on the dev_ops that were given
+ * to usbgem_mod_init(); a no-op otherwise.
+ */
+void
+usbgem_mod_fini(struct dev_ops *dop)
+{
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_fini_ops(dop);
+#endif
+}
+
+/*
+ * usbgem_quiesce: quiesce(9E) entry point.
+ * Stops the chip unless the mac is already stopped or disconnected; if
+ * stopping fails, a chip reset is attempted instead.  The outcome of
+ * either operation is not reported to the caller.
+ */
+int
+usbgem_quiesce(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp = USBGEM_GET_DEV(dip);
+
+	ASSERT(dp != NULL);
+
+	switch (dp->mac_state) {
+	case MAC_STATE_DISCONNECTED:
+	case MAC_STATE_STOPPED:
+		/* the hardware is not running; leave it alone */
+		break;
+
+	default:
+		if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) {
+			(void) usbgem_hal_reset_chip(dp);
+		}
+		break;
+	}
+
+	/* devo_quiesce() must return DDI_SUCCESS always */
+	return (DDI_SUCCESS);
+}
diff --git a/usr/src/uts/common/io/usbgem/usbgem.h b/usr/src/uts/common/io/usbgem/usbgem.h
new file mode 100644
index 0000000000..80b89a260e
--- /dev/null
+++ b/usr/src/uts/common/io/usbgem/usbgem.h
@@ -0,0 +1,428 @@
+/*
+ * usbgem.h: General USB to Ethernet MAC driver framework
+ * @(#)usbgem.h	1.4 12/02/09
+ * (C) Copyright 2003-2009 Masayuki Murayama KHF04453@nifty.ne.jp
+ */
+
+#ifndef __USBGEM_H__
+#define __USBGEM_H__
+
+#pragma	ident	"@(#)usbgem.h	1.4 12/02/09"
+
+#ifdef USBGEM_CONFIG_GLDv3
+#include <sys/mac.h>
+#ifndef MAC_VERSION
+#include <sys/mac_provider.h>
+#endif
+#include <sys/mac_ether.h>
+#else
+#include <sys/gld.h>
+#endif /* GLDv3 */
+
+/*
+ * Useful macros and typedefs
+ */
+#define	USBGEM_NAME_LEN	32
+
+#define USBGEM_TX_TIMEOUT		(drv_usectohz(3*1000000))
+#define USBGEM_TX_TIMEOUT_INTERVAL	(drv_usectohz(1*1000000))
+#define USBGEM_LINK_WATCH_INTERVAL	(drv_usectohz(1*1000000))
+
+/* general return code */
+#define	USBGEM_SUCCESS	0
+#define	USBGEM_FAILURE	1
+
+/* return code of usbgem_tx_done */
+#define	INTR_RESTART_TX	0x80000000U
+
+/*
+ * Statistics counters; one instance is embedded in struct usbgem_dev
+ * as the stats member.  Byte and packet totals are 64 bit, all other
+ * counters are 32 bit.
+ */
+struct usbgem_stats {
+	uint32_t	intr;
+
+	/* counters grouped with the receive side */
+	uint32_t	crc;
+	uint32_t	errrcv;
+	uint32_t	overflow;
+	uint32_t	frame;
+	uint32_t	missed;
+	uint32_t	runt;
+	uint32_t	frame_too_long;
+	uint32_t	norcvbuf;
+	uint32_t	sqe;
+
+	/* counters grouped with the transmit side */
+	uint32_t	collisions;
+	uint32_t	first_coll;
+	uint32_t	multi_coll;
+	uint32_t	excoll;
+	uint32_t	xmit_internal_err;
+	uint32_t	nocarrier;
+	uint32_t	defer;
+	uint32_t	errxmt;
+	uint32_t	underflow;
+	uint32_t	xmtlatecoll;
+	uint32_t	noxmtbuf;
+	uint32_t	jabber;
+
+
+	/* traffic totals (r* presumably received, o* sent -- confirm) */
+	uint64_t	rbytes;
+	uint64_t	obytes;
+	uint64_t	rpackets;
+	uint64_t	opackets;
+	uint32_t	rbcast;
+	uint32_t	obcast;
+	uint32_t	rmcast;
+	uint32_t	omcast;
+	uint32_t	rcv_internal_err;
+};
+
+/*
+ * One entry of the multicast address table (mc_list in struct
+ * usbgem_dev): the ethernet address together with a hash value for it.
+ */
+struct mcast_addr {
+	struct ether_addr	addr;
+	uint32_t		hash;
+};
+
+#define	USBGEM_MAXMC	64
+#define	USBGEM_MCALLOC	(sizeof(struct mcast_addr) * USBGEM_MAXMC)
+
+#define	SLOT(dp, n)	((n) % (dp)->ugc.usbgc_tx_list_max)
+
+/*
+ * mac soft state
+ */
+struct usbgem_dev {
+	dev_info_t	*dip;
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_handle_t	mh;
+#else
+	void		*macinfo;	/* opaque handle for upper layer */
+#endif
+	char		name[USBGEM_NAME_LEN];
+
+	/* pointer to usb private data */
+	usb_client_dev_data_t	*reg_data;
+
+	/* usb handles */
+	usb_pipe_handle_t	default_pipe;
+	usb_pipe_handle_t	bulkin_pipe;
+	usb_pipe_handle_t	bulkout_pipe;
+	usb_pipe_handle_t	intr_pipe;
+
+	/* usb endpoints */
+	usb_ep_descr_t		*ep_default;
+	usb_ep_descr_t		*ep_bulkin;
+	usb_ep_descr_t		*ep_bulkout;
+	usb_ep_descr_t		*ep_intr;
+
+	/* usb policies */
+	usb_pipe_policy_t	policy_default;
+	usb_pipe_policy_t	policy_bulkin;
+	usb_pipe_policy_t	policy_bulkout;
+	usb_pipe_policy_t	policy_interrupt;
+
+	/* MAC address information */
+	struct ether_addr	cur_addr;
+	struct ether_addr	dev_addr;
+
+	/* RX state and resource management */
+	kmutex_t		rxlock;
+	int			rx_busy_cnt;
+	boolean_t		rx_active;
+	kcondvar_t		rx_drain_cv;
+
+	/* RX buffer management */
+	int			rx_buf_len;
+
+	/* TX state and resource management */
+	kmutex_t		txlock;
+	int			tx_busy_cnt;
+	usb_bulk_req_t		*tx_free_list;
+	kcondvar_t		tx_drain_cv;
+	clock_t			tx_start_time;
+	int			bulkout_timeout;	/* in second */
+	int			tx_max_packets;
+	int			tx_seq_num;
+	int			tx_intr_pended;
+
+	/* NIC state from OS view */
+	int			nic_state;
+#define	NIC_STATE_UNKNOWN	0
+#define	NIC_STATE_STOPPED	1
+#define	NIC_STATE_INITIALIZED	2
+#define	NIC_STATE_ONLINE	3
+
+	/* MAC state from hardware view */
+	int			mac_state;
+#define	MAC_STATE_DISCONNECTED	0	/* it includes suspended state too */
+#define	MAC_STATE_STOPPED	1	/* powered up / buf not initialized */
+#define	MAC_STATE_INITIALIZED	2	/* initialized */
+#define	MAC_STATE_ONLINE	3	/* working correctly  */
+#define	MAC_STATE_ERROR		4	/* need to restart nic */
+
+	clock_t			fatal_error;
+
+	/* robustness: timer and watchdog */
+	uint_t			tx_watcher_stop;
+	kt_did_t		tx_watcher_did;
+	kcondvar_t		tx_watcher_cv;
+	kmutex_t		tx_watcher_lock;
+	clock_t			tx_watcher_timeout;
+	clock_t			tx_watcher_interval;
+
+	/* MII management */
+	boolean_t		anadv_autoneg:1;
+	boolean_t		anadv_1000fdx:1;
+	boolean_t		anadv_1000hdx:1;
+	boolean_t		anadv_100t4:1;
+	boolean_t		anadv_100fdx:1;
+	boolean_t		anadv_100hdx:1;
+	boolean_t		anadv_10fdx:1;
+	boolean_t		anadv_10hdx:1;
+	boolean_t		anadv_1000t_ms:2;
+	boolean_t		anadv_pause:1;
+	boolean_t		anadv_asmpause:1;
+	boolean_t		mii_advert_ro:1;
+
+	boolean_t		full_duplex:1;
+	int			speed:3;
+#define		USBGEM_SPD_10	0
+#define		USBGEM_SPD_100	1
+#define		USBGEM_SPD_1000	2
+#define		USBGEM_SPD_NUM	3
+	unsigned int		flow_control:2;
+#define		FLOW_CONTROL_NONE	0
+#define		FLOW_CONTROL_SYMMETRIC	1
+#define		FLOW_CONTROL_TX_PAUSE	2
+#define		FLOW_CONTROL_RX_PAUSE	3
+
+	boolean_t		mii_supress_msg:1;
+
+	uint32_t		mii_phy_id;
+	uint16_t		mii_status;
+	uint16_t		mii_advert;
+	uint16_t		mii_lpable;
+	uint16_t		mii_exp;
+	uint16_t		mii_ctl1000;
+	uint16_t		mii_stat1000;
+	uint16_t		mii_xstatus;
+	int8_t			mii_phy_addr;	/* must be signed */
+
+	uint16_t		mii_status_ro;
+	uint16_t		mii_xstatus_ro;
+
+	int			mii_state;
+#define		MII_STATE_UNKNOWN		0
+#define		MII_STATE_RESETTING		1
+#define		MII_STATE_AUTONEGOTIATING	2
+#define		MII_STATE_AN_DONE		3
+#define		MII_STATE_MEDIA_SETUP		4
+#define		MII_STATE_LINKUP		5
+#define		MII_STATE_LINKDOWN		6
+
+	clock_t			mii_last_check;	/* in tick */
+	clock_t			mii_timer;	/* in tick */
+#define		MII_RESET_TIMEOUT	drv_usectohz(1000*1000)
+#define		MII_AN_TIMEOUT		drv_usectohz(5000*1000)
+#define		MII_LINKDOWN_TIMEOUT	drv_usectohz(10000*1000)
+
+	clock_t			mii_interval;	/* in tick */
+	clock_t			linkup_delay;	/* in tick */
+
+	uint_t			link_watcher_stop;
+	kt_did_t		link_watcher_did;
+	kcondvar_t		link_watcher_wait_cv;
+	kmutex_t		link_watcher_lock;
+
+	krwlock_t		dev_state_lock;	/* mac_state and nic_state */
+	ksema_t			hal_op_lock;	/* serialize hw operations */
+	ksema_t			drv_op_lock;	/* hotplug op lock */
+
+	/* multicast list */
+	ksema_t			rxfilter_lock;
+	int			mc_count;
+	int			mc_count_req;
+	struct mcast_addr	*mc_list;
+	int			rxmode;
+#define		RXMODE_PROMISC		0x01
+#define		RXMODE_ALLMULTI_REQ	0x02
+#define		RXMODE_MULTI_OVF	0x04
+#define		RXMODE_ENABLE		0x08
+#define		RXMODE_ALLMULTI		(RXMODE_ALLMULTI_REQ | RXMODE_MULTI_OVF)
+#define		RXMODE_BITS	\
+			"\020"	\
+			"\004ENABLE"	\
+			"\003MULTI_OVF"	\
+			"\002ALLMULTI_REQ"	\
+			"\001PROMISC"
+
+	/* statistics */
+	struct usbgem_stats		stats;
+
+	/* pointer to local structure */
+	void			*private;
+	int			priv_size;	/* size of *private in bytes */
+
+	/* configuration */
+	struct usbgem_conf {
+		/* name */
+		char		usbgc_name[USBGEM_NAME_LEN];
+		int		usbgc_ppa;
+
+		/* specification on usb */
+		int	usbgc_ifnum;	/* interface number */
+		int	usbgc_alt;	/* alternate */
+
+		/* specification on tx engine */
+		int		usbgc_tx_list_max;
+
+		/* specification on rx engine */
+		int		usbgc_rx_header_len;
+		int		usbgc_rx_list_max;
+
+		/* time out parameters */
+		clock_t		usbgc_tx_timeout;
+		clock_t		usbgc_tx_timeout_interval;
+
+		/* flow control */
+		int		usbgc_flow_control;
+
+		/* MII timeout parameters */
+		clock_t	usbgc_mii_linkdown_timeout;
+		clock_t	usbgc_mii_link_watch_interval;
+		clock_t	usbgc_mii_reset_timeout;
+
+		clock_t	usbgc_mii_an_watch_interval;
+		clock_t	usbgc_mii_an_timeout;
+		clock_t	usbgc_mii_an_wait;
+		clock_t	usbgc_mii_an_delay;
+
+		/* MII configuration */
+		int	usbgc_mii_addr_min;
+		int	usbgc_mii_linkdown_action;
+		int	usbgc_mii_linkdown_timeout_action;
+#define		MII_ACTION_NONE		0
+#define		MII_ACTION_RESET	1
+#define		MII_ACTION_RSA		2
+		boolean_t	usbgc_mii_dont_reset:1;
+		boolean_t	usbgc_mii_an_oneshot:1;
+		boolean_t	usbgc_mii_hw_link_detection:1;
+		boolean_t	usbgc_mii_stop_mac_on_linkdown:1;
+		uint16_t	usbgc_mii_an_cmd;
+
+		/* I/O methods */
+
+		/* mac operation */
+		int	(*usbgc_attach_chip)(struct usbgem_dev *dp);
+		int	(*usbgc_reset_chip)(struct usbgem_dev *dp);
+		int	(*usbgc_init_chip)(struct usbgem_dev *dp);
+		int	(*usbgc_start_chip)(struct usbgem_dev *dp);
+		int	(*usbgc_stop_chip)(struct usbgem_dev *dp);
+		uint32_t (*usbgc_multicast_hash)(struct usbgem_dev *dp,
+		    const uint8_t *);
+		int	(*usbgc_set_rx_filter)(struct usbgem_dev *dp);
+		int	(*usbgc_set_media)(struct usbgem_dev *dp);
+		int	(*usbgc_get_stats)(struct usbgem_dev *dp);
+		void	(*usbgc_interrupt)(struct usbgem_dev *dp, mblk_t *mp);
+
+		/* packet manipulation */
+		mblk_t	*(*usbgc_tx_make_packet)(struct usbgem_dev *dp, mblk_t *mp);
+		mblk_t	*(*usbgc_rx_make_packet)(struct usbgem_dev *dp, mblk_t *mp);
+		/* mii operations */
+		int	(*usbgc_mii_probe)(struct usbgem_dev *dp);
+		int	(*usbgc_mii_init)(struct usbgem_dev *dp);
+		int	(*usbgc_mii_config)(struct usbgem_dev *dp, int *errp);
+		uint16_t (*usbgc_mii_read)(struct usbgem_dev *dp, uint_t reg, int *errp);
+		void	(*usbgc_mii_write)(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp);
+
+		/* jumbo frame */
+		int     usbgc_max_mtu;
+		int     usbgc_default_mtu;
+		int     usbgc_min_mtu;
+	} ugc;
+
+	int	misc_flag;
+#define USBGEM_VLAN	0x0001
+	timeout_id_t	intr_watcher_id;
+
+	/* buffer size */
+	uint_t	mtu;
+
+	/* performance tuning parameters */
+	uint_t	txthr;		/* tx fifo threshold */
+	uint_t	txmaxdma;	/* tx max dma burst size */
+	uint_t	rxthr;		/* rx fifo threshold */
+	uint_t	rxmaxdma;	/* rx max dma burst size */
+
+	/* kstat stuff */
+	kstat_t	*ksp;
+
+	/* ndd stuff */
+	caddr_t	nd_data_p;
+	caddr_t	nd_arg_p;
+
+#ifdef USBGEM_DEBUG_LEVEL
+	int	tx_cnt;
+#endif
+};
+
+/*
+ * Exported functions
+ */
+int usbgem_ctrl_out(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    void *bp, int size);
+
+int usbgem_ctrl_in(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    void *bp, int size);
+
+int usbgem_ctrl_out_val(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    uint32_t v);
+
+int usbgem_ctrl_in_val(struct usbgem_dev *dp,
+    uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len,
+    void *valp);
+
+void usbgem_generate_macaddr(struct usbgem_dev *, uint8_t *);
+boolean_t usbgem_get_mac_addr_conf(struct usbgem_dev *);
+int usbgem_mii_probe_default(struct usbgem_dev *);
+int usbgem_mii_init_default(struct usbgem_dev *);
+int usbgem_mii_config_default(struct usbgem_dev *, int *errp);
+void usbgem_mii_update_link(struct usbgem_dev *);
+void usbgem_restart_tx(struct usbgem_dev *);
+boolean_t usbgem_tx_done(struct usbgem_dev *, int);
+void usbgem_receive(struct usbgem_dev *);
+struct usbgem_dev *usbgem_do_attach(dev_info_t *,
+    struct usbgem_conf *, void *, int);
+int usbgem_do_detach(dev_info_t *);
+
+uint32_t usbgem_ether_crc_le(const uint8_t *addr);
+uint32_t usbgem_ether_crc_be(const uint8_t *addr);
+
+int usbgem_resume(dev_info_t *);
+int usbgem_suspend(dev_info_t *);
+int usbgem_quiesce(dev_info_t *);
+
+#ifdef USBGEM_CONFIG_GLDv3
+#if DEVO_REV < 4
+#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \
+        DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \
+        nodev, NULL, D_MP, NULL)
+#else
+#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \
+	DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \
+	nodev, NULL, D_MP, NULL, usbgem_quiesce)
+#endif
+#else
+#define usbgem_getinfo     gld_getinfo
+#define usbgem_open        gld_open
+#define usbgem_close       gld_close
+#define usbgem_wput        gld_wput
+#define usbgem_wsrv        gld_wsrv
+#define usbgem_rsrv        gld_rsrv
+#define usbgem_power       NULL
+#endif
+int usbgem_mod_init(struct dev_ops *, char *);
+void usbgem_mod_fini(struct dev_ops *);
+
+#define USBGEM_GET_DEV(dip) \
+	((struct usbgem_dev *)(ddi_get_driver_private(dip)))
+
+#endif /* __USBGEM_H__ */
diff --git a/usr/src/uts/common/io/usbgem/usbgem_mii.h b/usr/src/uts/common/io/usbgem/usbgem_mii.h
new file mode 100644
index 0000000000..2b4176a340
--- /dev/null
+++ b/usr/src/uts/common/io/usbgem/usbgem_mii.h
@@ -0,0 +1,242 @@
+/*
+ *  gem_mii.h: mii header for gem
+ *
+ * Copyright (c) 2002-2007 Masayuki Murayama.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#pragma ident	"@(#)gem_mii.h	1.4	07/11/30"
+
+/*
+ * gem_mii.h : MII registers
+ */
+#ifndef _GEM_MII_H_
+#define	_GEM_MII_H_
+
+#ifdef GEM_CONFIG_GLDv3
+#include <sys/miiregs.h>
+#else
+#define	MII_CONTROL 		0
+#define	MII_STATUS 		1
+#define	MII_PHYIDH		2
+#define	MII_PHYIDL		3
+#define	MII_AN_ADVERT		4
+#define	MII_AN_LPABLE		5
+#define	MII_AN_EXPANSION	6
+#define	MII_AN_NXTPGXMIT	7
+#endif /* GEM_CONFIG_GLDv3 */
+
+#define	MII_AN_LPANXT		8
+#define	MII_MS_CONTROL		9
+#define	MII_MS_STATUS		10
+#define	MII_XSTATUS		15
+
+/* for 1000BaseT support */
+#define	MII_1000TC		MII_MS_CONTROL
+#define	MII_1000TS		MII_MS_STATUS
+#ifndef GEM_CONFIG_GLDv3
+#define	MII_CONTROL_RESET	0x8000
+#define	MII_CONTROL_LOOPBACK	0x4000
+#define	MII_CONTROL_100MB	0x2000
+#define	MII_CONTROL_ANE		0x1000
+#define	MII_CONTROL_PWRDN	0x0800
+#define	MII_CONTROL_ISOLATE	0x0400
+#define	MII_CONTROL_RSAN	0x0200
+#define	MII_CONTROL_FDUPLEX	0x0100
+#define	MII_CONTROL_COLTST	0x0080
+#endif /* !GEM_CONFIG_GLDv3 */
+#define	MII_CONTROL_SPEED	0x2040
+
+#define	MII_CONTROL_10MB	0x0000
+#define	MII_CONTROL_1000MB	0x0040
+
+#define	MII_CONTROL_BITS	\
+	"\020"	\
+	"\020RESET"	\
+	"\017LOOPBACK"	\
+	"\016100MB"	\
+	"\015ANE"	\
+	"\014PWRDN"	\
+	"\013ISOLATE"	\
+	"\012RSAN"	\
+	"\011FDUPLEX"	\
+	"\010COLTST"	\
+	"\0071000M"
+#ifndef GEM_CONFIG_GLDv3
+#define	MII_STATUS_100_BASE_T4		0x8000
+#define	MII_STATUS_100_BASEX_FD		0x4000
+#define	MII_STATUS_100_BASEX		0x2000
+#define	MII_STATUS_10_FD		0x1000
+#define	MII_STATUS_10			0x0800
+#define	MII_STATUS_MFPRMBLSUPR		0x0040
+#define	MII_STATUS_ANDONE		0x0020
+#define	MII_STATUS_REMFAULT		0x0010
+#define	MII_STATUS_CANAUTONEG		0x0008
+#define	MII_STATUS_LINKUP		0x0004
+#define	MII_STATUS_JABBERING		0x0002
+#define	MII_STATUS_EXTENDED		0x0001
+#endif /* !GEM_CONFIG_GLDv3 */
+#define	MII_STATUS_XSTATUS		0x0100
+#define	MII_STATUS_100_BASE_T2_FD	0x0400
+#define	MII_STATUS_100_BASE_T2		0x0200
+
+#define	MII_STATUS_ABILITY_TECH	\
+	(MII_STATUS_100_BASE_T4	|	\
+	MII_STATUS_100_BASEX_FD |	\
+	MII_STATUS_100_BASEX |	\
+	MII_STATUS_10 |	\
+	MII_STATUS_10_FD)
+
+
+#define	MII_STATUS_BITS	\
+	"\020"	\
+	"\020100_BASE_T4"	\
+	"\017100_BASEX_FD"	\
+	"\016100_BASEX"	\
+	"\01510_BASE_FD"	\
+	"\01410_BASE"	\
+	"\013100_BASE_T2_FD"	\
+	"\012100_BASE_T2"	\
+	"\011XSTATUS"	\
+	"\007MFPRMBLSUPR"	\
+	"\006ANDONE"	\
+	"\005REMFAULT"	\
+	"\004CANAUTONEG"	\
+	"\003LINKUP"	\
+	"\002JABBERING"	\
+	"\001EXTENDED"
+#ifndef GEM_CONFIG_GLDv3
+#define	MII_AN_ADVERT_NP		0x8000
+#define	MII_AN_ADVERT_REMFAULT		0x2000
+#define	MII_AN_ADVERT_SELECTOR		0x001f
+#endif /* !GEM_CONFIG_GLDv3 */
+
+#define	MII_ABILITY_ASM_DIR		0x0800	/* for annex 28B */
+#ifndef	MII_ABILITY_PAUSE
+#define	MII_ABILITY_PAUSE		0x0400	/* for IEEE 802.3x */
+#endif
+#ifndef GEM_CONFIG_GLDv3
+#define	MII_ABILITY_100BASE_T4		0x0200
+#define	MII_ABILITY_100BASE_TX_FD	0x0100
+#define	MII_ABILITY_100BASE_TX		0x0080
+#define	MII_ABILITY_10BASE_T_FD		0x0040
+#define	MII_ABILITY_10BASE_T		0x0020
+#endif /* !GEM_CONFIG_GLDv3 */
+
+#define	MII_AN_LPABLE_NP	0x8000
+
+#define	MII_ABILITY_TECH	\
+	(MII_ABILITY_100BASE_T4	|	\
+	MII_ABILITY_100BASE_TX_FD |	\
+	MII_ABILITY_100BASE_TX |	\
+	MII_ABILITY_10BASE_T |	\
+	MII_ABILITY_10BASE_T_FD)
+
+#define	MII_ABILITY_ALL	\
+	(MII_AN_ADVERT_REMFAULT |	\
+	MII_ABILITY_ASM_DIR |	\
+	MII_ABILITY_PAUSE |	\
+	MII_ABILITY_TECH)
+
+
+#define	MII_ABILITY_BITS	\
+	"\020"	\
+	"\016REMFAULT"	\
+	"\014ASM_DIR"	\
+	"\013PAUSE"	\
+	"\012100BASE_T4"	\
+	"\011100BASE_TX_FD"	\
+	"\010100BASE_TX"	\
+	"\00710BASE_T_FD"	\
+	"\00610BASE_T"
+#ifndef GEM_CONFIG_GLDv3
+#define	MII_AN_EXP_PARFAULT	0x0010
+#define	MII_AN_EXP_LPCANNXTP	0x0008
+#define	MII_AN_EXP_CANNXTPP	0x0004
+#define	MII_AN_EXP_PAGERCVD 	0x0002
+#define	MII_AN_EXP_LPCANAN 	0x0001
+#endif /* !GEM_CONFIG_GLDv3 */
+
+#define	MII_AN_EXP_BITS	\
+	"\020"	\
+	"\005PARFAULT"	\
+	"\004LPCANNXTP"	\
+	"\003CANNXTPP"	\
+	"\002PAGERCVD"	\
+	"\001LPCANAN"
+
+#define	MII_1000TC_TESTMODE	0xe000
+#define	MII_1000TC_CFG_EN	0x1000
+#define	MII_1000TC_CFG_VAL	0x0800
+#define	MII_1000TC_PORTTYPE	0x0400
+#define	MII_1000TC_ADV_FULL	0x0200
+#define	MII_1000TC_ADV_HALF	0x0100
+
+#define	MII_1000TC_BITS	\
+	"\020"	\
+	"\015CFG_EN"	\
+	"\014CFG_VAL"	\
+	"\013PORTTYPE"	\
+	"\012FULL"	\
+	"\011HALF"
+
+#define	MII_1000TS_CFG_FAULT	0x8000
+#define	MII_1000TS_CFG_MASTER	0x4000
+#define	MII_1000TS_LOCALRXOK	0x2000
+#define	MII_1000TS_REMOTERXOK	0x1000
+#define	MII_1000TS_LP_FULL	0x0800
+#define	MII_1000TS_LP_HALF	0x0400
+
+/*
+ * Bit names of the 1000Base-T status register for the cmn_err(9F) "%b"
+ * conversion: the first character is the output base (\020 = hex), and
+ * each following entry is a 1-origin bit number followed by its name.
+ * The bit numbers must match the MII_1000TS_* masks:
+ * 0x8000 = bit 16 (\020), 0x4000 = 15 (\017), 0x2000 = 14 (\016),
+ * 0x1000 = 13 (\015), 0x0800 = 12 (\014), 0x0400 = 11 (\013).
+ */
+#define	MII_1000TS_BITS	\
+	"\020"	\
+	"\020CFG_FAULT"	\
+	"\017CFG_MASTER"	\
+	"\016CFG_LOCALRXOK"	\
+	"\015CFG_REMOTERXOK"	\
+	"\014LP_FULL"	\
+	"\013LP_HALF"
+
+#define	MII_XSTATUS_1000BASEX_FD	0x8000
+#define	MII_XSTATUS_1000BASEX		0x4000
+#define	MII_XSTATUS_1000BASET_FD	0x2000
+#define	MII_XSTATUS_1000BASET		0x1000
+
+#define	MII_XSTATUS_BITS	\
+	"\020"	\
+	"\0201000BASEX_FD"	\
+	"\0171000BASEX"		\
+	"\0161000BASET_FD"	\
+	"\0151000BASET"
+
+#define	MII_READ_CMD(p, r)	\
+	((6<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18))
+
+#define	MII_WRITE_CMD(p, r, v)	\
+	((5<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18) | (2 << 16) | (v))
+
+#endif /* _GEM_MII_H_ */
diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c
index 0d1132febc..27241894aa 100644
--- a/usr/src/uts/common/io/vioif/vioif.c
+++ b/usr/src/uts/common/io/vioif/vioif.c
@@ -12,6 +12,7 @@
 /*
  * Copyright 2013 Nexenta Inc.  All rights reserved.
  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /* Based on the NetBSD virtio driver by Minoura Makoto. */
@@ -285,6 +286,13 @@ struct vioif_softc {
 	unsigned int		sc_tx_csum:1;
 	unsigned int		sc_tx_tso4:1;
 
+	/*
+	 * For debugging, it is useful to know whether the MAC address we
+	 * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
+	 * was otherwise generated or set from within the guest.
+	 */
+	unsigned int		sc_mac_from_host:1;
+
 	int			sc_mtu;
 	uint8_t			sc_mac[ETHERADDRL];
 	/*
@@ -312,7 +320,10 @@ struct vioif_softc {
 	/* Copying small packets turns out to be faster then mapping them. */
 	unsigned long		sc_rxcopy_thresh;
 	unsigned long		sc_txcopy_thresh;
-	/* Some statistic coming here */
+
+	/*
+	 * Statistics visible through mac:
+	 */
 	uint64_t		sc_ipackets;
 	uint64_t		sc_opackets;
 	uint64_t		sc_rbytes;
@@ -325,6 +336,18 @@ struct vioif_softc {
 	uint64_t		sc_notxbuf;
 	uint64_t		sc_ierrors;
 	uint64_t		sc_oerrors;
+
+	/*
+	 * Internal debugging statistics:
+	 */
+	uint64_t		sc_rxfail_dma_handle;
+	uint64_t		sc_rxfail_dma_buffer;
+	uint64_t		sc_rxfail_dma_bind;
+	uint64_t		sc_rxfail_chain_undersize;
+	uint64_t		sc_rxfail_no_descriptors;
+	uint64_t		sc_txfail_dma_handle;
+	uint64_t		sc_txfail_dma_bind;
+	uint64_t		sc_txfail_indirect_limit;
 };
 
 #define	ETHER_HEADER_LEN		sizeof (struct ether_header)
@@ -474,8 +497,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags)
 
 	if (ddi_dma_alloc_handle(sc->sc_dev, &vioif_mapped_buf_dma_attr,
 	    DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmah)) {
-		dev_err(sc->sc_dev, CE_WARN,
-		    "Can't allocate dma handle for rx buffer");
+		sc->sc_rxfail_dma_handle++;
 		goto exit_handle;
 	}
 
@@ -483,8 +505,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags)
 	    VIOIF_RX_SIZE + sizeof (struct virtio_net_hdr),
 	    &vioif_bufattr, DDI_DMA_STREAMING, DDI_DMA_SLEEP,
 	    NULL, &buf->rb_mapping.vbm_buf, &len, &buf->rb_mapping.vbm_acch)) {
-		dev_err(sc->sc_dev, CE_WARN,
-		    "Can't allocate rx buffer");
+		sc->sc_rxfail_dma_buffer++;
 		goto exit_alloc;
 	}
 	ASSERT(len >= VIOIF_RX_SIZE);
@@ -493,8 +514,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags)
 	    buf->rb_mapping.vbm_buf, len, DDI_DMA_READ | DDI_DMA_STREAMING,
 	    DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmac,
 	    &buf->rb_mapping.vbm_ncookies)) {
-		dev_err(sc->sc_dev, CE_WARN, "Can't bind tx buffer");
-
+		sc->sc_rxfail_dma_bind++;
 		goto exit_bind;
 	}
 
@@ -716,27 +736,24 @@ vioif_add_rx(struct vioif_softc *sc, int kmflag)
 	struct vioif_rx_buf *buf;
 
 	ve = vq_alloc_entry(sc->sc_rx_vq);
-	if (!ve) {
+	if (ve == NULL) {
 		/*
 		 * Out of free descriptors - ring already full.
-		 * It would be better to update sc_norxdescavail
-		 * but MAC does not ask for this info, hence we
-		 * update sc_norecvbuf.
 		 */
+		sc->sc_rxfail_no_descriptors++;
 		sc->sc_norecvbuf++;
 		goto exit_vq;
 	}
 	buf = sc->sc_rxbufs[ve->qe_index];
 
-	if (!buf) {
+	if (buf == NULL) {
 		/* First run, allocate the buffer. */
 		buf = kmem_cache_alloc(sc->sc_rxbuf_cache, kmflag);
 		sc->sc_rxbufs[ve->qe_index] = buf;
 	}
 
 	/* Still nothing? Bye. */
-	if (!buf) {
-		dev_err(sc->sc_dev, CE_WARN, "Can't allocate rx buffer");
+	if (buf == NULL) {
 		sc->sc_norecvbuf++;
 		goto exit_buf;
 	}
@@ -789,20 +806,19 @@ static int
 vioif_populate_rx(struct vioif_softc *sc, int kmflag)
 {
 	int i = 0;
-	int ret;
 
 	for (;;) {
-		ret = vioif_add_rx(sc, kmflag);
-		if (ret)
+		if (vioif_add_rx(sc, kmflag) != DDI_SUCCESS) {
 			/*
 			 * We could not allocate some memory. Try to work with
 			 * what we've got.
 			 */
 			break;
+		}
 		i++;
 	}
 
-	if (i)
+	if (i != 0)
 		virtio_sync_vq(sc->sc_rx_vq);
 
 	return (i);
@@ -823,8 +839,7 @@ vioif_process_rx(struct vioif_softc *sc)
 		ASSERT(buf);
 
 		if (len < sizeof (struct virtio_net_hdr)) {
-			dev_err(sc->sc_dev, CE_WARN, "RX: Cnain too small: %u",
-			    len - (uint32_t)sizeof (struct virtio_net_hdr));
+			sc->sc_rxfail_chain_undersize++;
 			sc->sc_ierrors++;
 			virtio_free_chain(ve);
 			continue;
@@ -838,7 +853,7 @@ vioif_process_rx(struct vioif_softc *sc)
 		 */
 		if (len < sc->sc_rxcopy_thresh) {
 			mp = allocb(len, 0);
-			if (!mp) {
+			if (mp == NULL) {
 				sc->sc_norecvbuf++;
 				sc->sc_ierrors++;
 
@@ -855,7 +870,7 @@ vioif_process_rx(struct vioif_softc *sc)
 			    buf->rb_mapping.vbm_buf +
 			    sizeof (struct virtio_net_hdr) +
 			    VIOIF_IP_ALIGN, len, 0, &buf->rb_frtn);
-			if (!mp) {
+			if (mp == NULL) {
 				sc->sc_norecvbuf++;
 				sc->sc_ierrors++;
 
@@ -901,31 +916,32 @@ vioif_reclaim_used_tx(struct vioif_softc *sc)
 	struct vioif_tx_buf *buf;
 	uint32_t len;
 	mblk_t *mp;
-	int i = 0;
+	unsigned chains = 0;
 
 	while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) {
 		/* We don't chain descriptors for tx, so don't expect any. */
-		ASSERT(!ve->qe_next);
+		ASSERT(ve->qe_next == NULL);
 
 		buf = &sc->sc_txbufs[ve->qe_index];
 		mp = buf->tb_mp;
 		buf->tb_mp = NULL;
 
-		if (mp) {
-			for (i = 0; i < buf->tb_external_num; i++)
+		if (mp != NULL) {
+			for (int i = 0; i < buf->tb_external_num; i++) {
 				(void) ddi_dma_unbind_handle(
 				    buf->tb_external_mapping[i].vbm_dmah);
+			}
 		}
 
 		virtio_free_chain(ve);
 
 		/* External mapping used, mp was not freed in vioif_send() */
-		if (mp)
+		if (mp != NULL)
 			freemsg(mp);
-		i++;
+		chains++;
 	}
 
-	if (sc->sc_tx_stopped && i) {
+	if (sc->sc_tx_stopped != 0 && chains > 0) {
 		sc->sc_tx_stopped = 0;
 		mac_tx_update(sc->sc_mac_handle);
 	}
@@ -962,8 +978,7 @@ vioif_tx_lazy_handle_alloc(struct vioif_softc *sc, struct vioif_tx_buf *buf,
 		    &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL,
 		    &buf->tb_external_mapping[i].vbm_dmah);
 		if (ret != DDI_SUCCESS) {
-			dev_err(sc->sc_dev, CE_WARN,
-			    "Can't allocate dma handle for external tx buffer");
+			sc->sc_txfail_dma_handle++;
 		}
 	}
 
@@ -1017,17 +1032,14 @@ vioif_tx_external(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp,
 		    DDI_DMA_SLEEP, NULL, &dmac, &ncookies);
 
 		if (ret != DDI_SUCCESS) {
+			sc->sc_txfail_dma_bind++;
 			sc->sc_oerrors++;
-			dev_err(sc->sc_dev, CE_NOTE,
-			    "TX: Failed to bind external handle");
 			goto exit_bind;
 		}
 
 		/* Check if we still fit into the indirect table. */
 		if (virtio_ve_indirect_available(ve) < ncookies) {
-			dev_err(sc->sc_dev, CE_NOTE,
-			    "TX: Indirect descriptor table limit reached."
-			    " It took %d fragments.", i);
+			sc->sc_txfail_indirect_limit++;
 			sc->sc_notxbuf++;
 			sc->sc_oerrors++;
 
@@ -1086,7 +1098,7 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp)
 
 	ve = vq_alloc_entry(sc->sc_tx_vq);
 
-	if (!ve) {
+	if (ve == NULL) {
 		sc->sc_notxbuf++;
 		/* Out of free descriptors - try later. */
 		return (B_FALSE);
@@ -1138,9 +1150,9 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp)
 	/* meanwhile update the statistic */
 	if (mp->b_rptr[0] & 0x1) {
 		if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
-				sc->sc_multixmt++;
-			else
-				sc->sc_brdcstxmt++;
+			sc->sc_multixmt++;
+		else
+			sc->sc_brdcstxmt++;
 	}
 
 	/*
@@ -1202,8 +1214,7 @@ vioif_start(void *arg)
 {
 	struct vioif_softc *sc = arg;
 
-	mac_link_update(sc->sc_mac_handle,
-	    vioif_link_state(sc));
+	mac_link_update(sc->sc_mac_handle, vioif_link_state(sc));
 
 	virtio_start_vq_intr(sc->sc_rx_vq);
 
@@ -1404,10 +1415,8 @@ vioif_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 	case MAC_PROP_PRIVATE:
 		bzero(valstr, sizeof (valstr));
 		if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
-
 			value = sc->sc_txcopy_thresh;
-		} else	if (strcmp(pr_name,
-		    vioif_rxcopy_thresh) == 0) {
+		} else if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
 			value = sc->sc_rxcopy_thresh;
 		} else {
 			return;
@@ -1483,7 +1492,6 @@ vioif_show_features(struct vioif_softc *sc, const char *prefix,
 	bufp += virtio_show_features(features, bufp, bufend - bufp);
 	*bufp = '\0';
 
-
 	/* Using '!' to only CE_NOTE this to the system log. */
 	dev_err(sc->sc_dev, CE_NOTE, "!%s Vioif (%b)", buf, features,
 	    VIRTIO_NET_FEATURE_BITS);
@@ -1512,8 +1520,8 @@ vioif_dev_features(struct vioif_softc *sc)
 	    sc->sc_virtio.sc_features);
 
 	if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) {
-		dev_err(sc->sc_dev, CE_NOTE,
-		    "Host does not support RING_INDIRECT_DESC, bye.");
+		dev_err(sc->sc_dev, CE_WARN,
+		    "Host does not support RING_INDIRECT_DESC. Cannot attach.");
 		return (DDI_FAILURE);
 	}
 
@@ -1535,6 +1543,7 @@ vioif_set_mac(struct vioif_softc *sc)
 		virtio_write_device_config_1(&sc->sc_virtio,
 		    VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]);
 	}
+	sc->sc_mac_from_host = 0;
 }
 
 /* Get the mac address out of the hardware, or make up one. */
@@ -1548,8 +1557,7 @@ vioif_get_mac(struct vioif_softc *sc)
 			    &sc->sc_virtio,
 			    VIRTIO_NET_CONFIG_MAC + i);
 		}
-		dev_err(sc->sc_dev, CE_NOTE, "Got MAC address from host: %s",
-		    ether_sprintf((struct ether_addr *)sc->sc_mac));
+		sc->sc_mac_from_host = 1;
 	} else {
 		/* Get a few random bytes */
 		(void) random_get_pseudo_bytes(sc->sc_mac, ETHERADDRL);
@@ -1561,7 +1569,7 @@ vioif_get_mac(struct vioif_softc *sc)
 		vioif_set_mac(sc);
 
 		dev_err(sc->sc_dev, CE_NOTE,
-		    "Generated a random MAC address: %s",
+		    "!Generated a random MAC address: %s",
 		    ether_sprintf((struct ether_addr *)sc->sc_mac));
 	}
 }
@@ -1624,7 +1632,7 @@ vioif_check_features(struct vioif_softc *sc)
 		if (!vioif_has_feature(sc, VIRTIO_NET_F_GUEST_CSUM)) {
 			sc->sc_rx_csum = 0;
 		}
-		cmn_err(CE_NOTE, "Csum enabled.");
+		dev_err(sc->sc_dev, CE_NOTE, "!Csum enabled.");
 
 		if (vioif_has_feature(sc, VIRTIO_NET_F_HOST_TSO4)) {
 
@@ -1638,11 +1646,11 @@ vioif_check_features(struct vioif_softc *sc)
 			 */
 			if (!vioif_has_feature(sc, VIRTIO_NET_F_HOST_ECN)) {
 				dev_err(sc->sc_dev, CE_NOTE,
-				    "TSO4 supported, but not ECN. "
+				    "!TSO4 supported, but not ECN. "
 				    "Not using LSO.");
 				sc->sc_tx_tso4 = 0;
 			} else {
-				cmn_err(CE_NOTE, "LSO enabled");
+				dev_err(sc->sc_dev, CE_NOTE, "!LSO enabled");
 			}
 		}
 	}
@@ -1766,7 +1774,7 @@ vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
 
 	vioif_check_features(sc);
 
-	if (vioif_alloc_mems(sc))
+	if (vioif_alloc_mems(sc) != 0)
 		goto exit_alloc_mems;
 
 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
@@ -1854,7 +1862,7 @@ vioif_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
 		return (DDI_FAILURE);
 	}
 
-	if (sc->sc_rxloan) {
+	if (sc->sc_rxloan > 0) {
 		dev_err(devinfo, CE_WARN, "!Some rx buffers are still upstream,"
 		    " not detaching.");
 		return (DDI_FAILURE);
diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c
new file mode 100644
index 0000000000..e4e700fa12
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/frameio.c
@@ -0,0 +1,464 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * Frame I/O utility functions
+ */
+
+#include <sys/frameio.h>
+
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/inttypes.h>
+
+static kmem_cache_t *frameio_cache;
+
+int
+frameio_init(void)
+{
+	/* Objects hold a header plus FRAMEIO_NVECS_MAX vectors. */
+	const size_t objsz = sizeof (frameio_t) +
+	    sizeof (framevec_t) * FRAMEIO_NVECS_MAX;
+
+	frameio_cache = kmem_cache_create("frameio_cache", objsz, 0,
+	    NULL, NULL, NULL, NULL, NULL, 0);
+	return (frameio_cache == NULL ? 1 : 0);
+}
+
+void
+frameio_fini(void)
+{
+	if (frameio_cache != NULL)	/* set only by frameio_init() */
+		kmem_cache_destroy(frameio_cache);
+}
+
+frameio_t *
+frameio_alloc(int kmflags)	/* kmflags: passed to kmem_cache_alloc() */
+{
+	return (kmem_cache_alloc(frameio_cache, kmflags));
+}
+
+void
+frameio_free(frameio_t *fio)
+{
+	kmem_cache_free(frameio_cache, fio);	/* back into frameio_cache */
+}
+
+/*
+ * Sanity check the framevecs we were handed before anything works with them:
+ * every vector must carry a non-NULL buffer address and a non-zero buffer
+ * length. Returns EINVAL on the first vector that does not, otherwise 0.
+ */
+static int
+frameio_hdr_check_vecs(frameio_t *fio)
+{
+	int idx;
+	for (idx = 0; idx < fio->fio_nvecs; idx++) {
+		const framevec_t *vec = &fio->fio_vecs[idx];
+		if (vec->fv_buf == NULL || vec->fv_buflen == 0)
+			return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * We have to copy in framevec32_t's. To work around the data model issues
+ * without copying memory twice, we first copy the framevec32_t data into the
+ * standard fio_vecs space. We then walk that array backwards, loading each
+ * framevec32_t into a temporary framevec_t before storing it over the
+ * frameio_t's own entry. Going in reverse is essential: a framevec_t is
+ * larger than a framevec32_t, so a forward pass would clobber unread input.
+ */
+static int
+frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr)
+{
+	framevec32_t *src32 = (framevec32_t *)&fio->fio_vecs[0];
+	size_t nbytes = sizeof (framevec32_t) * fio->fio_nvecs;
+	int idx;
+
+	if (ddi_copyin(addr, src32, nbytes, 0) != 0)
+		return (EFAULT);
+
+	for (idx = fio->fio_nvecs - 1; idx >= 0; idx--) {
+		framevec_t wide;
+
+		/* Load all three fields first; the two slots may overlap. */
+		wide.fv_buf = (void *)(uintptr_t)src32[idx].fv_buf;
+		wide.fv_buflen = src32[idx].fv_buflen;
+		wide.fv_actlen = src32[idx].fv_actlen;
+		fio->fio_vecs[idx].fv_buf = wide.fv_buf;
+		fio->fio_vecs[idx].fv_buflen = wide.fv_buflen;
+		fio->fio_vecs[idx].fv_actlen = wide.fv_actlen;
+	}
+
+	return (frameio_hdr_check_vecs(fio));
+}
+
+/*
+ * Copy in a frame I/O header from addr into fio and validate it. fio has room
+ * for max_vecs vectors; a header that describes more of them than that fails
+ * with EOVERFLOW. mode supplies the data model and whether FKIOCTL is set.
+ */
+int
+frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode)
+{
+	const int model = ddi_model_convert_from(mode & FMODELS);
+	const int cpf = (mode & FKIOCTL) != 0 ? FKIOCTL : 0;
+	size_t hdrsz;
+
+	if (model == DDI_MODEL_ILP32)
+		hdrsz = sizeof (frameio32_t);
+	else
+		hdrsz = sizeof (frameio_t);
+
+	/*
+	 * The start of the header is the same in all data models for the
+	 * current version.
+	 */
+	if (ddi_copyin(addr, fio, hdrsz, cpf) != 0)
+		return (EFAULT);
+
+	/* Unknown versions and nonsensical vector counts are invalid. */
+	if (fio->fio_version != FRAMEIO_VERSION_ONE ||
+	    fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0 ||
+	    fio->fio_nvpf == 0 || fio->fio_nvecs % fio->fio_nvpf != 0)
+		return (EINVAL);
+
+	if (fio->fio_nvecs > max_vecs)
+		return (EOVERFLOW);
+
+	/* The vectors immediately follow the header at addr. */
+	addr = (void *)((uintptr_t)addr + hdrsz);
+	if (model != DDI_MODEL_ILP32) {
+		if (ddi_copyin(addr, &fio->fio_vecs[0],
+		    sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0)
+			return (EFAULT);
+
+		return (frameio_hdr_check_vecs(fio));
+	}
+
+	/* An ILP32 request with FKIOCTL set is rejected. */
+	if (cpf != 0)
+		return (EINVAL);
+	return (frameio_hdr_copyin_ilp32(fio, addr));
+}
+
+static mblk_t *
+frameio_allocb(size_t sz)
+{
+	mblk_t *blk;
+
+	blk = allocb(sz, 0);
+	if (blk != NULL) {
+		/* Mark the new block as ordinary data before returning it. */
+		blk->b_datap->db_type = M_DATA;
+	}
+	return (blk);
+}
+
+static int
+framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	mblk_t *mp;
+
+	cpf = cpf != 0 ? FKIOCTL : 0;
+	mp = frameio_allocb(fv->fv_buflen);
+
+	/* Allocation failed, so there is no block to release here. */
+	if (mp == NULL)
+		return (EAGAIN);
+
+	/* Pull fv_buflen bytes from the vector's buffer into the block. */
+	if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, cpf) != 0) {
+		freemsg(mp);
+		return (EFAULT);
+	}
+
+	/* *mpp is written only on success. */
+	mp->b_wptr += fv->fv_buflen;
+	*mpp = mp;
+	return (0);
+}
+
+/*
+ * Read the fio_nvpf vectors starting at fv, which make up a single message,
+ * into one chain of data blocks returned in *mpp. On error *mpp is NULL.
+ */
+static int
+frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	int nparts = fio->fio_nvpf;
+	int part, error;
+	mblk_t *mp;
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	/* Build the frame one vector at a time, appending each block. */
+	for (part = 0; part < nparts; part++) {
+		error = framevec_mblk_read(fv, &mp, cpf);
+		if (error != 0) {
+			/* Drop the partial frame; leave no stale pointer. */
+			freemsg(*mpp);
+			*mpp = NULL;
+			return (error);
+		}
+
+		if (*mpp == NULL)
+			*mpp = mp;
+		else
+			linkb(*mpp, mp);
+		fv++;
+	}
+
+	return (0);
+}
+
+/*
+ * Read data from a series of frameio vectors into a message block chain. A
+ * given frameio request has a number of discrete messages divided into
+ * individual vectors based on fio->fio_nvpf. Each discrete message will be
+ * constructed into a message block chain pointed to by b_next.
+ *
+ * If we get an EAGAIN while trying to construct a given message block what we
+ * return depends on what else we've done so far. If we have successfully
+ * completed at least one message then we free the partial one and return the
+ * completed ones with success. If no messages have been completed we return
+ * EAGAIN. If instead we encounter a different error, say EFAULT, then all of
+ * the fv_actlen values are undefined. nvecs, if non-NULL, gets the frame count.
+ */
+int
+frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf)
+{
+	int error = ENOTSUP;
+	int nframes = fio->fio_nvecs / fio->fio_nvpf;
+	int frame;
+	framevec_t *fv;
+	mblk_t *mp, *bmp = NULL;
+
+	/* Protect against bogus kernel subsystems. */
+	VERIFY(fio->fio_nvecs > 0);
+	VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0);
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	fv = &fio->fio_vecs[0];
+	/* Every frame consumes fio_nvpf vectors, so fv advances with it. */
+	for (frame = 0; frame < nframes; frame++, fv += fio->fio_nvpf) {
+		error = frameio_mblk_read(fio, fv, &mp, cpf);
+		if (error != 0)
+			goto failed;
+
+		if (bmp != NULL)
+			bmp->b_next = mp;
+		else
+			*mpp = mp;
+		bmp = mp;
+	}
+
+	if (nvecs != NULL)
+		*nvecs = nframes;
+	return (0);
+failed:
+	/*
+	 * On EAGAIN the partially built frame was never linked in, so only
+	 * completed frames remain; those are still returned to the caller.
+	 */
+	if (error == EAGAIN) {
+		if (frame != 0)
+			error = 0;
+		if (nvecs != NULL)
+			*nvecs = frame;
+		ASSERT(frame == 0 || *mpp != NULL);
+	} else {
+		for (mp = *mpp; mp != NULL; mp = bmp) {
+			bmp = mp->b_next;
+			freemsg(mp);
+		}
+		if (nvecs != NULL)
+			*nvecs = 0;
+		*mpp = NULL;
+	}
+	return (error);
+}
+
+size_t
+frameio_frame_length(frameio_t *fio, framevec_t *fv)
+{
+	size_t total = 0;
+	int idx = 0;
+
+	/* A frame spans fio_nvpf consecutive vectors starting at fv. */
+	while (idx < fio->fio_nvpf)
+		total += fv[idx++].fv_buflen;
+	return (total);
+}
+
+/*
+ * Copy len bytes of mp, from offset moff, out to offset foff of fv's buffer.
+ */
+static int
+framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff,
+    size_t foff, int cpf)
+{
+	const int flags = (cpf != 0) ? FKIOCTL : 0;
+
+	ASSERT(len <= MBLKL(mp) - moff);
+	ASSERT(len <= fv->fv_buflen - fv->fv_actlen);
+	if (ddi_copyout(mp->b_rptr + moff, fv->fv_buf + foff, len, flags) != 0)
+		return (EFAULT);
+
+	fv->fv_actlen += len;
+	return (0);
+}
+
+/*
+ * Copy message mp into the frame starting at fv (EOVERFLOW if it won't fit).
+ * b_rptr is never advanced, so a failed copy out can simply be retried.
+ */
+static int
+framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf)
+{
+	int err;
+	size_t msize, blksize, len, moff, foff;
+
+	msize = msgsize(mp);
+	if (msize > frameio_frame_length(fio, fv))
+		return (EOVERFLOW);
+
+	moff = 0;
+	foff = 0;
+	blksize = MBLKL(mp);
+	fv->fv_actlen = 0;
+	while (msize != 0) {
+		len = MIN(blksize, fv->fv_buflen - fv->fv_actlen);
+		err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf);
+		if (err != 0)
+			return (err);
+
+		msize -= len;
+		blksize -= len;
+		moff += len;
+		foff += len;
+
+		if (blksize == 0 && msize != 0) {	/* next mblk */
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			moff = 0;
+			blksize = MBLKL(mp);
+		}
+
+		if (fv->fv_buflen == fv->fv_actlen && msize != 0) {
+			fv++;	/* vector full; use the next one */
+			fv->fv_actlen = 0;
+			foff = 0;
+		}
+	}
+
+	return (0);
+}
+
+int
+frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map,
+    mblk_t *mp, int *nwrite, int cpf)
+{
+	int used = 0;		/* vectors consumed so far */
+	int err = 0;
+
+	/* Any mapping other than MAP_BLK_FRAME is rejected. */
+	if (map != MAP_BLK_FRAME)
+		return (EINVAL);
+
+	for (; mp != NULL && used < fio->fio_nvecs; mp = mp->b_next) {
+		err = framevec_map_blk(fio, &fio->fio_vecs[used], mp, cpf);
+		if (err != 0)
+			break;
+		used += fio->fio_nvpf;
+	}
+
+	/* Report the error only if not even one message was written. */
+	if (err != 0 && used == 0) {
+		if (nwrite != NULL)
+			*nwrite = 0;
+		return (err);
+	}
+
+	if (nwrite != NULL)
+		*nwrite = used / fio->fio_nvpf;
+	return (0);
+}
+
+/*
+ * Copy the header and the vectors of the first nframes frames out to addr.
+ */
+int
+frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode)
+{
+	const int model = ddi_model_convert_from(mode & FMODELS);
+	framevec32_t *out32 = (framevec32_t *)&fio->fio_vecs[0];
+	size_t outsz;
+	int idx;
+
+	if (fio->fio_nvecs / fio->fio_nvpf < nframes)
+		return (EINVAL);
+
+	/* From here on fio describes only the frames being reported. */
+	fio->fio_nvecs = nframes * fio->fio_nvpf;
+
+	if (model == DDI_MODEL_NONE) {
+		outsz = sizeof (frameio_t) +
+		    fio->fio_nvecs * sizeof (framevec_t);
+	} else {
+		ASSERT(model == DDI_MODEL_ILP32);
+
+		/* Narrow in place; each write lands on data already read. */
+		for (idx = 0; idx < fio->fio_nvecs; idx++) {
+			const framevec_t *v = &fio->fio_vecs[idx];
+			framevec32_t narrow;
+
+			if (v->fv_buflen > UINT_MAX || v->fv_actlen > UINT_MAX)
+				return (EOVERFLOW);
+			narrow.fv_buf = (caddr32_t)(uintptr_t)v->fv_buf;
+			narrow.fv_buflen = v->fv_buflen;
+			narrow.fv_actlen = v->fv_actlen;
+			out32[idx].fv_buf = narrow.fv_buf;
+			out32[idx].fv_buflen = narrow.fv_buflen;
+			out32[idx].fv_actlen = narrow.fv_actlen;
+		}
+		outsz = sizeof (frameio32_t) +
+		    fio->fio_nvecs * sizeof (framevec32_t);
+	}
+
+	if (ddi_copyout(fio, addr, outsz, mode & FKIOCTL) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+void
+frameio_mark_consumed(frameio_t *fio, int nframes)
+{
+	framevec_t *vec = &fio->fio_vecs[0];
+	int idx;
+	ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes);
+	for (idx = 0; idx < nframes * fio->fio_nvpf; idx++, vec++)
+		vec->fv_actlen = vec->fv_buflen;	/* fully used */
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
new file mode 100644
index 0000000000..2abb6f9464
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -0,0 +1,5800 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * vnd - virtual (machine) networking datapath
+ *
+ * vnd's purpose is to provide a highly performant data path for Layer 2 network
+ * traffic and exist side by side with an active IP netstack, each servicing
+ * different datalinks. vnd provides many of the same capabilities as the
+ * current TCP/IP stack does and some specific to layer two. Specifically:
+ *
+ * 	o Use of the DLD fastpath
+ * 	o Packet capture hooks
+ * 	o Ability to use hardware capabilities
+ * 	o Useful interfaces for handling multiple frames
+ *
+ * The following image shows where vnd fits into today's networking stack:
+ *
+ *             +---------+----------+----------+
+ *             | libdlpi |  libvnd  | libsocket|
+ *             +---------+----------+----------+
+ *             |         ·          ·    VFS   |
+ *             |   VFS   ·    VFS   +----------+
+ *             |         ·          |  sockfs  |
+ *             +---------+----------+----------+
+ *             |         |    VND   |    IP    |
+ *             |         +----------+----------+
+ *             |            DLD/DLS            |
+ *             +-------------------------------+
+ *             |              MAC              |
+ *             +-------------------------------+
+ *             |             GLDv3             |
+ *             +-------------------------------+
+ *
+ * -----------------------------------------
+ * A Tale of Two Devices - DDI Device Basics
+ * -----------------------------------------
+ *
+ * vnd presents itself to userland as a character device; however, it also is a
+ * STREAMS device so that it can interface with dld and the rest of the
+ * networking stack. Users never interface with the STREAMS devices directly and
+ * they are purely an implementation detail of vnd. Opening the STREAMS device
+ * requires kcred and as such userland cannot interact with it or push it onto
+ * the stream head.
+ *
+ * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
+ * clone gets its own minor number; however, minor nodes are not created in the
+ * devices tree for these instances. In this state a user may do two different
+ * things. They may issue ioctls that affect global state or they may issue
+ * ioctls that try to attach it to a given datalink. Once a minor device has
+ * been attached to a datalink, all operations on it are scoped to that context,
+ * therefore subsequent global operations are not permitted.
+ *
+ * A given device can be linked into the /devices and /dev name space via a link
+ * ioctl. That ioctl causes a minor node to be created in /devices and then it
+ * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
+ * to, but simpler than, IP's persistence mechanism.
+ *
+ * ---------------------
+ * Binding to a datalink
+ * ---------------------
+ *
+ * Datalinks are backed by the dld (datalink device) and dls (datalink services)
+ * drivers. These drivers provide a STREAMS device for datalinks on the system
+ * which are exposed through /dev/net. Userland generally manipulates datalinks
+ * through libdlpi. When an IP interface is being plumbed up what actually
+ * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
+ * and then pushes on the ip STREAMS module with an I_PUSH ioctl.  Modules may
+ * then negotiate with dld and dls to obtain access to various capabilities
+ * and fast paths via a series of STREAMS messages.
+ *
+ * In vnd, we do the same thing, but we leave our STREAMS module as an
+ * implementation detail of the system. We don't want users to be able to
+ * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require
+ * kcred to manipulate it. Thus, when a user issues a request to attach a
+ * datalink to a minor instance of the character device, that vnd minor instance
+ * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink.
+ * vnd does that open using the passed in credentials from the ioctl, not kcred.
+ * This ensures that users who don't have permission to open the device
+ * cannot. Once that's been opened, we push on the vnd streams module.
+ *
+ * Once the vnd STREAMS instance has been created for this device, i.e. the
+ * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
+ * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
+ * This association begins the STREAMS device's initialization. We start up an
+ * asynchronous state machine that takes care of all the different aspects of
+ * plumbing up the device with dld and dls and enabling the MAC fast path. We
+ * need to guarantee to consumers of the character device that by the time their
+ * ioctl returns, the data path has been fully initialized.
+ *
+ * The state progression is fairly linear. There are two general steady states.
+ * The first is VNS_S_ONLINE, which means that everything is jacked up and good
+ * to go. The alternative is VNS_S_ZOMBIE, which means that the streams device
+ * encountered an error or we have finished tearing it down and the character
+ * device can clean it up. The following is our state progression and the
+ * meaning of each state:
+ *
+ *                |
+ *                |
+ *                V
+ *        +---------------+
+ *        | VNS_S_INITIAL |                  This is our initial state. Every
+ *        +---------------+                  vnd STREAMS device starts here.
+ *                |                          While in this state, only dlpi
+ *                |                          M_PROTO and M_IOCTL messages can be
+ *                |                          sent or received. All STREAMS based
+ *                |                          data messages are dropped.
+ *                |                          We transition out of this state by
+ *                |                          sending a DL_INFO_REQ to obtain
+ *                |                          information about the underlying
+ *                |                          link.
+ *                v
+ *        +-----------------+
+ *   +--<-| VNS_S_INFO_SENT |                In this state, we verify and
+ *   |    +-----------------+                record information about the
+ *   |            |                          underlying device. If the device is
+ *   |            |                          not suitable, e.g. not of type
+ *   v            |                          DL_ETHER, then we immediately
+ *   |            |                          become a ZOMBIE. To leave this
+ *   |            |                          state we request exclusive active
+ *   |            |                          access to the device via
+ *   v            |                          DL_EXCLUSIVE_REQ.
+ *   |            v
+ *   |    +----------------------+
+ *   +--<-| VNS_S_EXCLUSIVE_SENT |           In this state, we verify whether
+ *   |    +----------------------+           or not we were able to obtain
+ *   |       |             |                 exclusive access to the device. If
+ *   |       |             |                 we were not able to, then we leave,
+ *   v       |             |                 as that means that something like
+ *   |       |             |                 IP is already plumbed up on top of
+ *   |       |             |                 the datalink. We leave this state
+ *   |       |             |                 by progressing through to the
+ *   |       |             |                 appropriate DLPI primitive, either
+ *   v       |             |                 DL_ATTACH_REQ or DL_BIND_REQ
+ *   |       |             |                 depending on the style of the
+ *   |       |             |                 datalink.
+ *   |       |             v
+ *   |       |    +-------------------+
+ *   +------ |--<-| VNS_S_ATTACH_SENT |      In this state, we verify we were
+ *   |       |    +-------------------+      able to perform a standard DLPI
+ *   |       |          |                    attach and if so, go ahead and
+ *   v       |          |                    send a DL_BIND_REQ.
+ *   |       v          v
+ *   |    +-------------------+
+ *   +--<-| VNS_S_BIND_SENT   |              In this state we see the result of
+ *   |    +-------------------+              our attempt to bind to PPA 0 of the
+ *   v             |                         underlying device. Because we're
+ *   |             |                         trying to be a layer two datapath,
+ *   |             |                         the specific attachment point isn't
+ *   |             |                         too important as we're going to
+ *   v             |                         have to enable promiscuous mode. We
+ *   |             |                         transition out of this by sending
+ *   |             |                         our first of four promiscuous mode
+ *   |             |                         requests.
+ *   v             v
+ *   |    +------------------------+
+ *   +--<-| VNS_S_SAP_PROMISC_SENT |         In this state we verify that we
+ *   |    +------------------------+         were able to enable promiscuous
+ *   |             |                         mode at the physical level. We
+ *   |             |                         transition out of this by enabling
+ *   |             |                         multicast and broadcast promiscuous
+ *   v             |                         mode.
+ *   |             v
+ *   |    +--------------------------+
+ *   +--<-| VNS_S_MULTI_PROMISC_SENT |       In this state we verify that we
+ *   |    +--------------------------+       have enabled DL_PROMISC_MULTI and
+ *   v             |                         move onto the third promiscuous
+ *   |             |                         mode request.
+ *   |             v
+ *   |    +----------------------------+
+ *   +--<-| VNS_S_RX_ONLY_PROMISC_SENT |     In this state we verify that we
+ *   |    +----------------------------+     enabled RX_ONLY promiscuous mode.
+ *   |             |                         We specifically do this as we don't
+ *   v             |                         want to receive our own traffic
+ *   |             |                         that we'll send out. We leave this
+ *   |             |                         state by enabling the final flag
+ *   |             |                         DL_PROMISC_FIXUPS.
+ *   |             v
+ *   |    +--------------------------+
+ *   +--<-| VNS_S_FIXUP_PROMISC_SENT |       In this state we verify that we
+ *   |    +--------------------------+       enabled FIXUP promiscuous mode.
+ *   |             |                         We specifically do this as we need
+ *   v             |                         to ensure that traffic which is
+ *   |             |                         received by being looped back to us
+ *   |             |                         correctly has checksums fixed. We
+ *   |             |                         leave this state by requesting the
+ *   |             |                         dld/dls capabilities that we can
+ *   v             |                         process.
+ *   |             v
+ *   |    +--------------------+
+ *   +--<-| VNS_S_CAPAB_Q_SENT |             We loop over the set of
+ *   |    +--------------------+             capabilities that dld advertised
+ *   |             |                         and enable the ones that we
+ *   v             |                         currently support. See the section
+ *   |             |                         later on regarding capabilities
+ *   |             |                         for more information. We leave this
+ *   |             |                         state by sending an enable request.
+ *   v             v
+ *   |    +--------------------+
+ *   +--<-| VNS_S_CAPAB_E_SENT |             Here we finish all capability
+ *   |    +--------------------+             initialization. Once finished, we
+ *   |             |                         transition to the next state. If
+ *   v             |                         the dld fast path is not available,
+ *   |             |                         we become a zombie.
+ *   |             v
+ *   |    +--------------+
+ *   |    | VNS_S_ONLINE |                   This is a vnd STREAMS device's
+ *   |    +--------------+                   steady state. It will normally
+ *   |             |                         reside in this state while it is in
+ *   |             |                         active use. It will only transition
+ *   v             |                         to the next state when the STREAMS
+ *   |             |                         device is closed by the character
+ *   |             |                         device. In this state, all data
+ *   |             |                         flows over the dld fast path.
+ *   |             v
+ *   |    +---------------------+
+ *   +--->| VNS_S_SHUTTING_DOWN |            This vnd state takes care of
+ *   |    +---------------------+            disabling capabilities and
+ *   |             |                         flushing all data. At this point
+ *   |             |                         any additional data that we receive
+ *   |             |                         will be dropped. We leave this
+ *   v             |                         state by trying to remove multicast
+ *   |             |                         promiscuity.
+ *   |             |
+ *   |             v
+ *   |   +---------------------------------+
+ *   +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have
+ *   |   +---------------------------------+ successfully removed multicast
+ *   |             |                         promiscuous mode. If we have
+ *   |             |                         failed, we still carry on but only
+ *   |             |                         warn. We leave this state by trying
+ *   |             |                         to disable SAP level promiscuous
+ *   |             |                         mode.
+ *   |             v
+ *   |   +---------------------------+
+ *   +-->| VNS_S_SAP_PROMISCOFF_SENT |       In this state, we check if we have
+ *   |   +---------------------------+       successfully removed SAP level
+ *   |             |                         promiscuous mode. If we have
+ *   |             |                         failed, we still carry on but only
+ *   |             |                         warn. Note that we don't worry
+ *   |             |                         about either of
+ *   |             |                         DL_PROMISC_FIXUPS or
+ *   |             |                         DL_PROMISC_RX_ONLY. If these are
+ *   |             |                         the only two entries left, then we
+ *   |             |                         should not have anything that MAC
+ *   |             |                         is doing for us at this point,
+ *   |             |                         therefore it's safe for us to
+ *   |             |                         proceed to unbind, which is how we
+ *   |             |                         leave this state via a
+ *   |             v                         DL_UNBIND_REQ.
+ *   |    +-------------------+
+ *   +--->| VNS_S_UNBIND_SENT |              Here, we check how the unbind
+ *   |    +-------------------+              request went. Regardless of its
+ *   |             |                         success, we always transition to
+ *   |             |                         a zombie state.
+ *   |             v
+ *   |    +--------------+
+ *   +--->| VNS_S_ZOMBIE |                   In this state, the vnd STREAMS
+ *        +--------------+                   device is waiting to finish being
+ *                                           reaped. Because we have no more
+ *                                           ways to receive data it should be
+ *                                           safe to destroy all remaining data
+ *                                           structures.
+ *
+ * If the stream association fails for any reason the state machine reaches
+ * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
+ * STREAMS ioctl to the character device. That will fail the user ioctl and
+ * propagate the vnd_errno_t back to userland. If, on the other hand, the
+ * association succeeds, then the vnd STREAMS device will be fully plumbed up
+ * and ready to transmit and receive message blocks. Consumers will be able to
+ * start using the other cbops(9E) entry points once the attach has fully
+ * finished, which will occur after the original user attach ioctl to the
+ * character device returns.
+ *
+ * It's quite important that we end up sending the full series of STREAMS
+ * messages when tearing down. While it's tempting to say that we should just
+ * rely on the STREAMS device being closed to properly ensure that we have no
+ * more additional data, that's not sufficient due to our use of direct
+ * callbacks.  DLS does not ensure that by the time we change the direct
+ * callback (vnd_mac_input) that all callers to it will have been quiesced.
+ * However, it does guarantee that if we disable promiscuous mode ourselves and
+ * we turn off the main data path via DL_UNBIND_REQ that it will work.
+ * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do
+ * it as part of tearing down the STREAMS device. This ensures that we'll
+ * quiesce all data before we destroy our data structures and thus we should
+ * eliminate the race in changing the data function.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * There are several different devices and structures in the vnd driver. There
+ * is a per-netstack component, pieces related to the character device that
+ * consumers see, the internal STREAMS device state, and the data queues
+ * themselves. The following ASCII art picture describes their relationships and
+ * some of the major pieces of data that contain them. These are not exhaustive,
+ * e.g. synchronization primitives are left out.
+ *
+ *  +----------------+     +-----------------+
+ *  | global         |     | global          |
+ *  | device list    |     | netstack list   |
+ *  | vnd_dev_list   |     | vnd_nsd_list    |
+ *  +----------------+     +-----------------+
+ *      |                    |
+ *      |                    v
+ *      |    +-------------------+      +-------------------+
+ *      |    | per-netstack data | ---> | per-netstack data | --> ...
+ *      |    | vnd_pnsd_t        |      | vnd_pnsd_t        |
+ *      |    |                   |      +-------------------+
+ *      |    |                   |
+ *      |    | netstackid_t   ---+----> Netstack ID
+ *      |    | vnd_pnsd_flags_t -+----> Status flags
+ *      |    | zoneid_t       ---+----> Zone ID for this netstack
+ *      |    | hook_family_t  ---+----> VND IPv4 Hooks
+ *      |    | hook_family_t  ---+----> VND IPv6 Hooks
+ *      |    | list_t ----+      |
+ *      |    +------------+------+
+ *      |                 |
+ *      |                 v
+ *      |           +------------------+       +------------------+
+ *      |           | character device |  ---> | character device | -> ...
+ *      +---------->| vnd_dev_t        |       | vnd_dev_t        |
+ *                  |                  |       +------------------+
+ *                  |                  |
+ *                  | minor_t       ---+--> device minor number
+ *                  | ldi_handle_t  ---+--> handle to /dev/net/%datalink
+ *                  | vnd_dev_flags_t -+--> device flags, non blocking, etc.
+ *                  | char[]        ---+--> name if linked
+ *                  | vnd_str_t * -+   |
+ *                  +--------------+---+
+ *                                 |
+ *                                 v
+ *          +-------------------------+
+ *          | STREAMS device          |
+ *          | vnd_str_t               |
+ *          |                         |
+ *          | vnd_str_state_t      ---+---> State machine state
+ *          | gsqueue_t *          ---+---> mblk_t Serialization queue
+ *          | vnd_str_stat_t       ---+---> per-device kstats
+ *          | vnd_str_capab_t      ---+----------------------------+
+ *          | vnd_data_queue_t ---+   |                            |
+ *          | vnd_data_queue_t -+ |   |                            v
+ *          +-------------------+-+---+                  +---------------------+
+ *                              | |                      | Stream capabilities |
+ *                              | |                      | vnd_str_capab_t     |
+ *                              | |                      |                     |
+ *                              | |    supported caps <--+-- vnd_capab_flags_t |
+ *                              | |    dld cap handle <--+-- void *            |
+ *                              | |    direct tx func <--+-- vnd_dld_tx_t      |
+ *                              | |                      +---------------------+
+ *                              | |
+ *             +----------------+ +-------------+
+ *             |                                |
+ *             v                                v
+ *  +-------------------+                  +-------------------+
+ *  | Read data queue   |                  | Write data queue  |
+ *  | vnd_data_queue_t  |                  | vnd_data_queue_t  |
+ *  |                   |                  |                   |
+ *  | size_t        ----+--> Current size  | size_t        ----+--> Current size
+ *  | size_t        ----+--> Max size      | size_t        ----+--> Max size
+ *  | mblk_t *      ----+--> Queue head    | mblk_t *      ----+--> Queue head
+ *  | mblk_t *      ----+--> Queue tail    | mblk_t *      ----+--> Queue tail
+ *  +-------------------+                  +-------------------+
+ *
+ *
+ * Globally, we maintain two lists. One list contains all of the character
+ * device soft states. The other maintains a list of all our netstack soft
+ * states. Each netstack maintains a list of active devices that have been
+ * associated with a datalink in its netstack.
+ *
+ * Recall that a given minor instance of the character device exists in one of
+ * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
+ * or it can be associated with a given datalink. When minor instances are in
+ * the former state, they do not exist in a given vnd_pnsd_t's list of devices.
+ * As part of attaching to a datalink, the given vnd_dev_t will be inserted into
+ * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
+ * vnd_str_t, to be created and associated to a vnd_dev_t.
+ *
+ * The character device, and its vnd_dev_t, is the interface to the rest of the
+ * system. The vnd_dev_t keeps track of various aspects like whether various
+ * operations, such as read, write and the frameio ioctls, are considered
+ * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
+ * keeping track of things like the name of the device, if any, in /dev. The
+ * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual
+ * data queues. However, ioctls that manipulate these properties all go through
+ * the vnd_dev_t to its associated vnd_str_t.
+ *
+ * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
+ * for frames to transmit (write queue) and one for frames received (read
+ * queue). These data queues have a maximum size and attempting to add data
+ * beyond that maximum size will result in data being dropped. The sizes are
+ * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
+ * in those buffers or has a reservation in those buffers while they are in vnd
+ * and waiting to be consumed by the user or by mac.
+ *
+ * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
+ * available, negotiated, and currently active features.
+ *
+ * ----------------------
+ * Data Path and gsqueues
+ * ----------------------
+ *
+ * There's a lot of plumbing in vnd to get to the point where we can send data,
+ * but vnd's bread and butter is the data path, so it's worth diving into it in
+ * more detail. Data enters and exits the system from two ends.
+ *
+ * The first end is the vnd consumer. This comes in the form of read and write
+ * system calls as well as the frame I/O ioctls. The read and write system calls
+ * operate on a single frame at a time. Think of a frame as a single message
+ * that has come in off the wire, which may itself comprise multiple mblk_t's
+ * linked together in the kernel. readv(2) and writev(2) have the same
+ * limitations as read(2) and write(2). We enforce this as the system is
+ * required to fill up every uio(9S) buffer before moving onto the next one.
+ * This means that if you have an MTU sized buffer and two frames come in which
+ * are less than half of the MTU they must fill up the given iovec. Even if we
+ * didn't want to do this, we have no way of informing the supplier of the
+ * iovecs that they were only partially filled or where one frame ends and
+ * another begins.  That's life, as such we have frame I/O which solves this
+ * problem. It allows for multiple frames to be consumed as well as for frames
+ * to be broken down into multiple vector components.
+ *
+ * The second end is the mac direct calls. As part of negotiating capabilities
+ * via dld, we give mac a function of ours to call when packets are received
+ * [vnd_mac_input()] and a callback to indicate that flow has been restored
+ * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
+ * transmit data with. As part of the contract with mac, mac is allowed to flow
+ * control us by returning a cookie to the transmit function. When that happens,
+ * all outbound traffic is halted until our callback function is called and we
+ * can schedule drains.
+ *
+ * It's worth looking at these in further detail. We'll start with the rx path.
+ *
+ *
+ *                                |
+ *                                * . . . packets from gld
+ *                                |
+ *                                v
+ *                         +-------------+
+ *                         |     mac     |
+ *                         +-------------+
+ *                                |
+ *                                v
+ *                         +-------------+
+ *                         |     dld     |
+ *                         +-------------+
+ *                                |
+ *                                * . . . dld direct callback
+ *                                |
+ *                                v
+ *                        +---------------+
+ *                        | vnd_mac_input |
+ *                        +---------------+
+ *                                |
+ *                                v
+ * +---------+             +-------------+
+ * | dropped |<--*---------|  vnd_hooks  |
+ * |   by    |   .         +-------------+
+ * |  hooks  |   . drop probe     |
+ * +---------+     kstat bump     * . . . Do we have free
+ *                                |         buffer space?
+ *                                |
+ *                          no .  |      . yes
+ *                             .  +      .
+ *                         +---*--+------*-------+
+ *                         |                     |
+ *                         * . . drop probe      * . . recv probe
+ *                         |     kstat bump      |     kstat bump
+ *                         v                     |
+ *                      +---------+              * . . fire pollin
+ *                      | freemsg |              v
+ *                      +---------+   +-----------------------+
+ *                                    | vnd_str_t`vns_dq_read |
+ *                                    +-----------------------+
+ *                                             ^ ^
+ *                             +----------+    | |     +---------+
+ *                             | read(9E) |-->-+ +--<--| frameio |
+ *                             +----------+            +---------+
+ *
+ * The rx path is rather linear. Packets come into us from mac. We always run
+ * them through the various hooks, and if they come out of that, we inspect the
+ * read data queue. If there is not enough space for a packet, we drop it.
+ * Otherwise, we append it to the data queue, and fire read notifications
+ * targeting anyone polling or doing blocking I/O on this device. Those
+ * consumers then drain the head of the data queue.
+ *
+ * The tx path is more complicated due to mac flow control. After any call into
+ * mac, we may have to potentially suspend writes and buffer data for an
+ * arbitrary amount of time. As such, we need to carefully track the total
+ * amount of outstanding data so that we don't waste kernel memory. This is
+ * further complicated by the fact that mac will asynchronously tell us when our
+ * flow has been resumed.
+ *
+ * For data to be able to enter the system, it needs to be able to take a
+ * reservation from the write data queue. Once the reservation has been
+ * obtained, we enter the gsqueue so that we can actually append it. We use
+ * gsqueues (serialization queues) to ensure that packets are manipulated in
+ * order as we deal with the draining and appending packets. We also leverage
+ * its worker thread to help us do draining after mac has restored our flow.
+ *
+ * The following image describes the flow:
+ *
+ * +-----------+   +--------------+       +-------------------------+   +------+
+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one()     |-->| Done |
+ * | frameio   |   | write queue? |  .    | +->vnd_squeue_tx_append |   +------+
+ * +-----------+   +--------------+  .    +-------------------------+
+ *                         |   ^     .
+ *                         |   |     . reserve space           from gsqueue
+ *                         |   |                                   |
+ *            queue  . . . *   |       space                       v
+ *             full        |   * . . . avail          +------------------------+
+ *                         v   |                      | vnd_squeue_tx_append() |
+ * +--------+          +------------+                 +------------------------+
+ * | EAGAIN |<--*------| Non-block? |<-+                           |
+ * +--------+   .      +------------+  |                           v
+ *              . yes             v    |     wait          +--------------+
+ *                          no . .*    * . . for           | append chain |
+ *                                +----+     space         | to outgoing  |
+ *                                                         |  mblk chain  |
+ *   from gsqueue                                          +--------------+
+ *       |                                                        |
+ *       |      +-------------------------------------------------+
+ *       |      |
+ *       |      |                            yes . . .
+ *       v      v                                    .
+ *  +-----------------------+    +--------------+    .     +------+
+ *  | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
+ *  +-----------------------+    +--------------+          +------+
+ *                                       |                     |
+ *     +---------------------------------|---------------------+
+ *     |                                 |           tx        |
+ *     |                          no . . *           queue . . *
+ *     | flow controlled .               |           empty     * . fire pollout
+ *     |                 .               v                     |   if mblk_t's
+ *   +-------------+     .      +---------------------+        |   sent
+ *   | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
+ *   | flags       |            +---------------------+                |
+ *   +-------------+    More data       |    |      |      More data   |
+ *                      and limit       ^    v      * . .  and limit   ^
+ *                      not reached . . *    |      |      reached     |
+ *                                      +----+      |                  |
+ *                                                  v                  |
+ *   +----------+          +-------------+    +---------------------------+
+ *   | mac flow |--------->| remove mac  |--->| gsqueue_enter_one() with  |
+ *   | control  |          | block flags |    | vnd_squeue_tx_drain() and |
+ *   | callback |          +-------------+    | GSQUEUE_FILL flag, iff    |
+ *   +----------+                             | not already scheduled     |
+ *                                            +---------------------------+
+ *
+ * The final path taken for a given write(9E)/frameio ioctl depends on whether
+ * or not the vnd_dev_t is non-blocking. That controls the initial path of
+ * trying to take a reservation in the write data queue. If the device is in
+ * non-blocking mode, we'll return EAGAIN when there is not enough space
+ * available, otherwise, the calling thread blocks on the data queue.
+ *
+ * Today when we call into vnd_squeue_tx_drain() we will not try to drain the
+ * entire queue, as that could be quite large and we don't want to necessarily
+ * keep the thread that's doing the drain until it's been finished. Not only
+ * could more data be coming in, but the draining thread could be a userland
+ * thread that has more work to do. We have two limits today. There is an upper
+ * bound on the total amount of data and the total number of mblk_t chains. If
+ * we hit either limit, then we will schedule another drain in the gsqueue and
+ * go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in the
+ * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
+ * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at random
+ * is likely to be just as good as any fancy algorithm we might come up with,
+ * especially as any two devices could have radically different transmit
+ * profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf via
+ * the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system which
+ * is in the form of a barrier. The barrier is an internal mechanism used for
+ * ensuring that a gsqueue is drained for a given device. We use this as part
+ * of tearing down. Specifically we disable the write path so nothing new can be
+ * inserted into the gsqueue and then insert a barrier block. Once the barrier
+ * block comes out of the gsqueue, then we know nothing else in the gsqueue that
+ * could refer to the vnd_str_t, being destroyed, exists.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
+ * Because of that, vnd is also a netstack module. It registers with the
+ * netstack sub-system and receives callbacks every time a netstack is created,
+ * shut down, and destroyed. The netstack callbacks drive the creation and
+ * destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is scoped
+ * to a netstack and known about by a given vnd_pnsd_t. When that netstack is
+ * torn down, we also tear down any vnd devices that are hanging around. When
+ * the netstack is torn down, we know that any zones that are scoped to that
+ * netstack are being shut down and have no processes remaining. This is going
+ * to be the case whether they are shared or exclusive stack zones. We have to
+ * perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down, the first is a
+ * shutdown callback, the second is a destroy callback. When the shutdown
+ * callback is fired we need to prepare for the netstack to go away and ensure
+ * that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shutdown we first
+ * remove the netstack from the global netstack list to ensure that no one new
+ * can come in and find the netstack and get a reference to it. After that, we
+ * notify the neti hooks that they're going away. Once that's all done, we get
+ * to the heart of the matter.
+ *
+ * When shutting down there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However, we
+ * know that no one new will be able to find the vnd_pnsd_t. To account for
+ * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
+ * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
+ * to the netstack's list. If this is set, then they must not append to it.
+ * Once this is set, we know that the netstack's list of devices can never grow,
+ * only shrink.
+ *
+ * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
+ * the container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open, or
+ * linking. The presence of this bit also allows things like the list ioctl and
+ * sdev to know not to consider its existence. At the conclusion of this being
+ * set, we know that no one else should be able to obtain a new reference to the
+ * device.
+ *
+ * Once that has been set for all devices, we go through and remove any existing
+ * links that have been established in sdev. Because doing that may cause the
+ * final reference for the device to be dropped, which still has a reference to
+ * the netstack, we have to restart our walk due to dropped locks. We know that
+ * this walk will eventually complete because the device cannot be relinked and
+ * no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
+ * Once that's finished, the shutdown callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes in
+ * it that have open references have been terminated and reaped. Any threads
+ * that are newly trying to reference it will fail. However, there is one thing
+ * that can halt this that we have no control over, which is the global zone
+ * holding open a reference to the device. In this case the zone halt will hang
+ * in vnd_stack_destroy. Once the last reference is dropped we finish
+ * destroying the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
+ * for both the global and non-global zones. In any given zone we always supply
+ * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
+ * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
+ * if a link was named net0, there would be a /dev/vnd/net0. The global zone can
+ * also see every link for every zone, ala /dev/net, under
+ * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
+ * named net0, the global zone would have /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is to
+ * validate that a given node is still valid. The next is a callback from sdev
+ * to say that it is no longer using the node. The third and final one is from
+ * sdev where it asks us to fill a directory. All of the heavy lifting is done
+ * in directory filling and in validation. We opt not to maintain a reference
+ * on the device while there is an sdev node present. This makes the removal of
+ * nodes much simpler and most of the possible failure modes shouldn't cause any
+ * real problems. For example, the open path has to handle both dev_t's which no
+ * longer exist and which are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
+ * provides these for L3 IP and IPv6 traffic. Each netstack provides these hooks
+ * in a minimal fashion. While we will allow traffic to be filtered through the
+ * hooks, we do not provide means for packet injection or additional inspection
+ * at this time. There are a total of four different events created:
+ *
+ *   o IPv4 physical in
+ *   o IPv4 physical out
+ *   o IPv6 physical in
+ *   o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each major
+ * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
+ * annotated with the protection that its members receive.  The following
+ * annotations are used:
+ *
+ * 	A	Atomics; these values are only modified using atomics.
+ *		Currently this only applies to kstat values.
+ * 	E	Existence; no lock is needed to access this member, it does not
+ *		change while the structure is valid.
+ * 	GL	Global Lock; these members are protected by the global
+ *		vnd_dev_lock.
+ * 	L	Locked; access to the member is controlled by a lock that is in
+ * 		the structure.
+ * 	NSL	netstack lock; this member is protected by the containing
+ * 		netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ *	X	This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
+ * With rare exception, once a reference count is decremented, the consumer
+ * should not assume that the data is valid any more. The only exception to this
+ * is the case where we're removing an extant reference count from a link into
+ * /devices or /dev. Reference counts are obtained on these structures as a part
+ * of looking them up.
+ *
+ * 	# Global Lock Ordering
+ * 	######################
+ *
+ * The following is the order that you must take locks in vnd:
+ *
+ * 1) vnd`vnd_dev_lock
+ * 2) vnd_pnsd_t`vpnd_lock
+ * 3) vnd_dev_t`vdd_lock
+ * 4) vnd_str_t`vns_lock
+ * 5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ *   o You must acquire a lower numbered lock before a higher numbered lock.
+ *   o It is NOT legal to hold two locks of the same level concurrently, eg. you
+ *     can not hold two different vnd_dev_t's vdd_lock at the same time.
+ *   o You may release locks in any order.
+ *   o If you release a lock, you must honor the locking rules before acquiring
+ *     it again.
+ *   o You should not hold any locks when calling any of the rele functions.
+ *
+ * 	# Special Considerations
+ * 	########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold.  Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how you
+ * should interact with it.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If the code path that goes through
+ * requires an attached vnd_dev_t, eg. the data path and tear down path, then it
+ * is always legal to dereference that member without a lock held. When they are
+ * added to the system, they should be done under the vdd_lock and done as part
+ * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the
+ * lifetime of the vnd_dev_t.
+ *
+ * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
+ * always exists as it is a part of the structure. The only time that it's valid
+ * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
+ * set or during tear down. Outside of those paths which are naturally
+ * serialized, there is no explicit locking around the member.
+ *
+ * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
+ * initially set as part of creating the structure, but are set as part of
+ * responding to the association ioctl. Anything in the data path or metadata
+ * path that requires association may assume that they exist, as we do not kick
+ * off the state machine until they're set.
+ *
+ * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
+ * members are designed to be used as part of various operations with the
+ * gsqueues. A lock isn't needed to use them, but to work with them, the
+ * appropriate flag in the vnd_str_t`vns_flags must have been set by the current
+ * thread. Otherwise, it is always fair game to refer to their addresses. Their
+ * contents are ignored by vnd, but some members are manipulated by the gsqueue
+ * subsystem.
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/cred.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+static kmutex_t vnd_dev_lock;
+static list_t vnd_dev_list;	/* Protected by the vnd_dev_lock */
+static list_t vnd_nsd_list;	/* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMS ioctls
+ *
+ * The STREAMS ioctls are internal to vnd. No one should be seeing them; as such
+ * they aren't a part of the header file.
+ */
+#define	VND_STRIOC	(('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given streams instance with a minor instance of
+ * the character device.
+ */
+#define	VND_STRIOC_ASSOCIATE	(VND_STRIOC | 0x1)
+
+typedef struct vnd_strioc_associate {
+	minor_t	vsa_minor;	/* minor device node */
+	netstackid_t vsa_nsid;	/* netstack id */
+	vnd_errno_t vsa_errno;	/* errno */
+} vnd_strioc_associate_t;
+
+typedef enum vnd_strioc_state {
+	VSS_UNKNOWN = 0,
+	VSS_COPYIN = 1,
+	VSS_COPYOUT = 2,
+} vnd_strioc_state_t;
+
+typedef struct vnd_strioc {
+	vnd_strioc_state_t vs_state;
+	caddr_t vs_addr;
+} vnd_strioc_t;
+
+/*
+ * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extant tags. Though
+ * really, overlap is, at the end of the day, inevitable.
+ */
+#define	VND_SQUEUE_TAG_TX_DRAIN		0x42
+#define	VND_SQUEUE_TAG_MAC_FLOW_CONTROL	0x43
+#define	VND_SQUEUE_TAG_VND_WRITE	0x44
+#define	VND_SQUEUE_TAG_ND_FRAMEIO_WRITE	0x45
+#define	VND_SQUEUE_TAG_STRBARRIER	0x46
+
+/*
+ * vnd reserved names. These are names which are reserved by vnd and thus
+ * shouldn't be used by some external program.
+ */
+static char *vnd_reserved_names[] = {
+	"ctl",
+	"zone",
+	NULL
+};
+
+/*
+ * vnd's DTrace probe macros
+ *
+ * DTRACE_VND* are all for a stable provider. We also have an unstable internal
+ * set of probes for reference count manipulation.
+ */
+#define	DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \
+    DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define	DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+    DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+	type4, arg4);
+
+#define	DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3,	\
+    type4, arg4, type5, arg5)						\
+    DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3,	\
+	type4, arg4, type5, arg5);
+
+#define	DTRACE_VND_REFINC(vdp) \
+    DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+#define	DTRACE_VND_REFDEC(vdp) \
+    DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+
+
+/*
+ * Tunables
+ */
+size_t vnd_vdq_default_size = 1024 * 64;	/* 64 KB */
+size_t vnd_vdq_hard_max = 1024 * 1024 * 4;	/* 4 MB */
+
+/*
+ * These numbers are designed as per-device tunables that are applied when a new
+ * vnd device is attached. They're a rough stab at what may be a reasonable
+ * amount of work to do in one burst in an squeue.
+ */
+size_t vnd_flush_burst_size = 1520 * 10;	/* 10 1500 MTU packets */
+size_t vnd_flush_nburst = 10;			/* 10 frames */
+
+/*
+ * Constants related to our sdev plugins
+ */
+#define	VND_SDEV_NAME	"vnd"
+#define	VND_SDEV_ROOT	"/dev/vnd"
+#define	VND_SDEV_ZROOT	"/dev/vnd/zone"
+
+/*
+ * Statistic macros
+ */
+#define	VND_STAT_INC(vsp, field, val) \
+    atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
+#define	VND_LATENCY_1MS		1000000
+#define	VND_LATENCY_10MS	10000000
+#define	VND_LATENCY_100MS	100000000
+#define	VND_LATENCY_1S		1000000000
+#define	VND_LATENCY_10S		10000000000
+
+/*
+ * Constants for vnd hooks
+ */
+static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+#define	IPV4_MCAST_LEN	3
+static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+#define	IPV6_MCAST_LEN	2
+static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/*
+ * vnd internal data structures and types
+ */
+
+struct vnd_str;
+struct vnd_dev;
+struct vnd_pnsd;
+
+/*
+ * As part of opening the device stream we need to properly communicate with our
+ * underlying stream. This is a bit of an asynchronous dance and we need to
+ * properly work with dld to get everything set up. We have to initiate the
+ * conversation with dld and as such we keep track of our state here.
+ */
+typedef enum vnd_str_state {
+	VNS_S_INITIAL = 0,
+	VNS_S_INFO_SENT,
+	VNS_S_EXCLUSIVE_SENT,
+	VNS_S_ATTACH_SENT,
+	VNS_S_BIND_SENT,
+	VNS_S_SAP_PROMISC_SENT,
+	VNS_S_MULTI_PROMISC_SENT,
+	VNS_S_RX_ONLY_PROMISC_SENT,
+	VNS_S_FIXUP_PROMISC_SENT,
+	VNS_S_CAPAB_Q_SENT,
+	VNS_S_CAPAB_E_SENT,
+	VNS_S_ONLINE,
+	VNS_S_SHUTTING_DOWN,
+	VNS_S_MULTICAST_PROMISCOFF_SENT,
+	VNS_S_SAP_PROMISCOFF_SENT,
+	VNS_S_UNBIND_SENT,
+	VNS_S_ZOMBIE
+} vnd_str_state_t;
+
+typedef enum vnd_str_flags {
+	VNS_F_NEED_ZONE = 0x1,
+	VNS_F_TASKQ_DISPATCHED = 0x2,
+	VNS_F_CONDEMNED = 0x4,
+	VNS_F_FLOW_CONTROLLED = 0x8,
+	VNS_F_DRAIN_SCHEDULED = 0x10,
+	VNS_F_BARRIER = 0x20,
+	VNS_F_BARRIER_DONE = 0x40
+} vnd_str_flags_t;
+
+typedef enum vnd_capab_flags {
+	VNS_C_HCKSUM = 0x1,
+	VNS_C_DLD = 0x2,
+	VNS_C_DIRECT = 0x4,
+	VNS_C_HCKSUM_BADVERS = 0x8
+} vnd_capab_flags_t;
+
+/*
+ * Definitions to interact with direct callbacks
+ */
+typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *,
+    mac_header_info_t *);
+typedef uintptr_t vnd_mac_cookie_t;
+/* DLD Direct capability function */
+typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
+/* DLD Direct tx function */
+typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+/* DLD Direct function to set flow control callback */
+typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
+    void *);
+/* DLD Direct function to see if flow controlled still */
+typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t);
+
+/*
+ * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
+ */
+typedef struct vnd_str_capab {
+	vnd_capab_flags_t vsc_flags;
+	t_uscalar_t vsc_hcksum_opts;
+	vnd_dld_cap_t vsc_capab_f;
+	void *vsc_capab_hdl;
+	vnd_dld_tx_t vsc_tx_f;
+	void *vsc_tx_hdl;
+	vnd_dld_set_fcb_t vsc_set_fcb_f;
+	void *vsc_set_fcb_hdl;
+	vnd_dld_is_fc_t vsc_is_fc_f;
+	void *vsc_is_fc_hdl;
+	vnd_mac_cookie_t vsc_fc_cookie;
+	void *vsc_tx_fc_hdl;
+} vnd_str_capab_t;
+
+/*
+ * The vnd_data_queue is a simple construct for storing a series of messages in
+ * a queue.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_data_queue {
+	struct vnd_str *vdq_vns;	/* E */
+	kmutex_t vdq_lock;
+	kcondvar_t vdq_ready;		/* Uses vdq_lock */
+	ssize_t vdq_max;		/* L */
+	ssize_t vdq_cur;		/* L */
+	mblk_t *vdq_head;		/* L */
+	mblk_t *vdq_tail;		/* L */
+} vnd_data_queue_t;
+
+typedef struct vnd_str_stat {
+	kstat_named_t	vks_rbytes;
+	kstat_named_t	vks_rpackets;
+	kstat_named_t	vks_obytes;
+	kstat_named_t	vks_opackets;
+	kstat_named_t	vks_nhookindrops;
+	kstat_named_t	vks_nhookoutdrops;
+	kstat_named_t	vks_ndlpidrops;
+	kstat_named_t	vks_ndataindrops;
+	kstat_named_t	vks_ndataoutdrops;
+	kstat_named_t	vks_tdrops;
+	kstat_named_t	vks_linkname;
+	kstat_named_t	vks_zonename;
+	kstat_named_t	vks_nmacflow;
+	kstat_named_t	vks_tmacflow;
+	kstat_named_t	vks_mac_flow_1ms;
+	kstat_named_t	vks_mac_flow_10ms;
+	kstat_named_t	vks_mac_flow_100ms;
+	kstat_named_t	vks_mac_flow_1s;
+	kstat_named_t	vks_mac_flow_10s;
+} vnd_str_stat_t;
+
+/*
+ * vnd stream structure
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_str {
+	kmutex_t 	vns_lock;
+	kcondvar_t	vns_cancelcv;		/* Uses vns_lock */
+	kcondvar_t	vns_barriercv;		/* Uses vns_lock */
+	kcondvar_t	vns_stcv;		/* Uses vns_lock */
+	vnd_str_state_t	vns_state;		/* L */
+	vnd_str_state_t	vns_laststate;		/* L */
+	vnd_errno_t	vns_errno;		/* L */
+	vnd_str_flags_t	vns_flags;		/* L */
+	vnd_str_capab_t vns_caps;		/* L */
+	taskq_ent_t	vns_tqe;		/* L */
+	vnd_data_queue_t vns_dq_read;		/* E */
+	vnd_data_queue_t vns_dq_write;		/* E */
+	mblk_t		*vns_dlpi_inc;		/* L */
+	queue_t		*vns_rq;		/* E */
+	queue_t		*vns_wq;		/* E */
+	queue_t		*vns_lrq;		/* E */
+	t_uscalar_t	vns_dlpi_style;		/* L */
+	t_uscalar_t	vns_minwrite;		/* L */
+	t_uscalar_t	vns_maxwrite;		/* L */
+	hrtime_t	vns_fclatch;		/* L */
+	hrtime_t	vns_fcupdate;		/* L */
+	kstat_t		*vns_kstat;		/* E */
+	gsqueue_t	*vns_squeue;		/* E */
+	mblk_t		vns_drainblk;		/* E + X */
+	mblk_t		vns_barrierblk;		/* E + X */
+	vnd_str_stat_t	vns_ksdata;		/* A */
+	size_t		vns_nflush;		/* L */
+	size_t 		vns_bsize;		/* L */
+	struct vnd_dev	*vns_dev;		/* E + X */
+	struct vnd_pnsd	*vns_nsd;		/* E + X */
+} vnd_str_t;
+
+typedef enum vnd_dev_flags {
+	VND_D_ATTACH_INFLIGHT =	0x001,
+	VND_D_ATTACHED =	0x002,
+	VND_D_LINK_INFLIGHT =	0x004,
+	VND_D_LINKED =		0x008,
+	VND_D_CONDEMNED =	0x010,
+	VND_D_ZONE_DYING =	0x020,
+	VND_D_OPENED =		0x040
+} vnd_dev_flags_t;
+
+/*
+ * This represents the data associated with a minor device instance.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_dev {
+	kmutex_t	vdd_lock;
+	list_node_t	vdd_link;			/* GL */
+	list_node_t	vdd_nslink;			/* NSL */
+	int		vdd_ref;			/* L */
+	vnd_dev_flags_t	vdd_flags;			/* L */
+	minor_t		vdd_minor;			/* E */
+	dev_t		vdd_devid;			/* E */
+	ldi_ident_t	vdd_ldiid;			/* E */
+	ldi_handle_t	vdd_ldih;			/* X */
+	cred_t		*vdd_cr;			/* X */
+	vnd_str_t	*vdd_str;			/* L */
+	struct pollhead	vdd_ph;				/* E */
+	struct vnd_pnsd *vdd_nsd;			/* E + X */
+	char		vdd_datalink[VND_NAMELEN];	/* L */
+	char		vdd_lname[VND_NAMELEN];		/* L */
+} vnd_dev_t;
+
+typedef enum vnd_pnsd_flags {
+	VND_NS_CONDEMNED = 0x1
+} vnd_pnsd_flags_t;
+
+/*
+ * Per netstack data structure.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_pnsd {
+	list_node_t vpnd_link;	/* protected by global dev lock */
+	zoneid_t vpnd_zid;			/* E */
+	netstackid_t vpnd_nsid;			/* E */
+	boolean_t vpnd_hooked;			/* E */
+	net_handle_t vpnd_neti_v4;		/* E */
+	hook_family_t vpnd_family_v4;		/* E */
+	hook_event_t vpnd_event_in_v4;		/* E */
+	hook_event_t vpnd_event_out_v4;		/* E */
+	hook_event_token_t vpnd_token_in_v4;	/* E */
+	hook_event_token_t vpnd_token_out_v4;	/* E */
+	net_handle_t vpnd_neti_v6;		/* E */
+	hook_family_t vpnd_family_v6;		/* E */
+	hook_event_t vpnd_event_in_v6;		/* E */
+	hook_event_t vpnd_event_out_v6;		/* E */
+	hook_event_token_t vpnd_token_in_v6;	/* E */
+	hook_event_token_t vpnd_token_out_v6;	/* E */
+	kmutex_t vpnd_lock;		/* Protects remaining members */
+	kcondvar_t vpnd_ref_change;		/* Uses vpnd_lock */
+	int vpnd_ref;				/* L */
+	vnd_pnsd_flags_t vpnd_flags;		/* L */
+	list_t vpnd_dev_list;			/* L */
+} vnd_pnsd_t;
+
+static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
+
+/*
+ * Drop function signature.
+ */
+typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
+
+static void
+vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndataindrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_nhookindrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	panic("illegal vnd drop");
+}
+
+static void
+vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+    mac_header_info_t *mhip)
+{
+	mblk_t *mp;
+
+	while (mp_chain != NULL) {
+		mp = mp_chain;
+		mp_chain = mp->b_next;
+		vnd_drop_hook_in(vsp, mp, "stream not associated");
+	}
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup(netstackid_t nsid)
+{
+	vnd_pnsd_t *nsp;
+
+	mutex_enter(&vnd_dev_lock);
+	for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+	    nsp = list_next(&vnd_nsd_list, nsp)) {
+		if (nsp->vpnd_nsid == nsid) {
+			mutex_enter(&nsp->vpnd_lock);
+			VERIFY(nsp->vpnd_ref >= 0);
+			nsp->vpnd_ref++;
+			mutex_exit(&nsp->vpnd_lock);
+			break;
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zid(zoneid_t zid)
+{
+	netstack_t *ns;
+	vnd_pnsd_t *nsp;
+	ns = netstack_find_by_zoneid(zid);
+	if (ns == NULL)
+		return (NULL);
+	nsp = vnd_nsd_lookup(ns->netstack_stackid);
+	netstack_rele(ns);
+	return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zonename(char *zname)
+{
+	zone_t *zonep;
+	vnd_pnsd_t *nsp;
+
+	zonep = zone_find_by_name(zname);
+	if (zonep == NULL)
+		return (NULL);
+
+	nsp = vnd_nsd_lookup_by_zid(zonep->zone_id);
+	zone_rele(zonep);
+	return (nsp);
+}
+
+static void
+vnd_nsd_ref(vnd_pnsd_t *nsp)
+{
+	mutex_enter(&nsp->vpnd_lock);
+	/*
+	 * This can only be used on something that has been obtained through
+	 * some other means. As such, the caller should already have a reference
+	 * before adding another one. This function should not be used as a
+	 * means of creating the initial reference.
+	 */
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref++;
+	mutex_exit(&nsp->vpnd_lock);
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+	mutex_enter(&nsp->vpnd_lock);
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref--;
+	mutex_exit(&nsp->vpnd_lock);
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+	vnd_dev_t *vdp;
+	mutex_enter(&vnd_dev_lock);
+	for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+	    vdp = list_next(&vnd_dev_list, vdp)) {
+		if (vdp->vdd_minor == m) {
+			mutex_enter(&vdp->vdd_lock);
+			VERIFY(vdp->vdd_ref > 0);
+			vdp->vdd_ref++;
+			DTRACE_VND_REFINC(vdp);
+			mutex_exit(&vdp->vdd_lock);
+			break;
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (vdp);
+}
+
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+	/*
+	 * When the STREAM exists we need to go through and make sure
+	 * communication gets torn down. As part of closing the stream, we
+	 * guarantee that nothing else should be able to enter the stream layer
+	 * at this point. That means no one should be able to call
+	 * read(),write() or one of the frameio ioctls.
+	 */
+	if (vdp->vdd_flags & VND_D_ATTACHED) {
+		ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		crfree(vdp->vdd_cr);
+		vdp->vdd_cr = NULL;
+
+		/*
+		 * We have to remove ourselves from our parent's list now. It is
+		 * really quite important that we have already set the condemned
+		 * flag here so that our containing netstack basically knows
+		 * that we're on the way down and knows not to wait for us. It's
+		 * also important that we do that before we put a rele on the
+		 * device as that is the point at which it will check again.
+		 */
+		mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+		list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+		mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+		vnd_nsd_rele(vdp->vdd_nsd);
+		vdp->vdd_nsd = NULL;
+	}
+	ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+	id_free(vnd_minors, vdp->vdd_minor);
+	mutex_destroy(&vdp->vdd_lock);
+	kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref++;
+	DTRACE_VND_REFINC(vdp);
+	mutex_exit(&vdp->vdd_lock);
+}
+
+/*
+ * As part of releasing the hold on this we may tear down a given vnd_dev_t. As
+ * such we need to make sure that we grab the list lock first before grabbing
+ * the vnd_dev_t's lock to ensure proper lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+	mutex_enter(&vnd_dev_lock);
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref--;
+	DTRACE_VND_REFDEC(vdp);
+	if (vdp->vdd_ref > 0) {
+		mutex_exit(&vdp->vdd_lock);
+		mutex_exit(&vnd_dev_lock);
+		return;
+	}
+
+	/*
+	 * Now that we've removed this from the list, we can go ahead and
+	 * drop the list lock. No one else can find this device and reference
+	 * it. As its reference count is zero, it by definition does not have
+	 * any remaining entries in /devices that could lead someone back to
+	 * this.
+	 */
+	vdp->vdd_flags |= VND_D_CONDEMNED;
+	list_remove(&vnd_dev_list, vdp);
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vnd_dev_lock);
+
+	vnd_dev_free(vdp);
+}
+
+/*
+ * Append a message block chain to the tail of a data queue. The caller must
+ * hold vdq_lock. Unless the caller previously reserved space (reserved is
+ * B_TRUE), the message size is charged against the queue here and the message
+ * is handed to dropf with the reason "buffer full" when it does not fit.
+ *
+ * Returns 1 when the message was queued, which tells the caller that anyone
+ * waiting for data may now find some (e.g. the caller should consider a
+ * broadcast), and 0 when the message was dropped.
+ */
+static int
+vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
+    vnd_dropper_f dropf)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+
+	if (reserved == B_FALSE) {
+		const size_t need = msgsize(mp);
+
+		if (vqp->vdq_cur + need > vqp->vdq_max) {
+			dropf(vqp->vdq_vns, mp, "buffer full");
+			return (0);
+		}
+		vqp->vdq_cur += need;
+	}
+
+	/* Link after the current tail, or start a new list when empty. */
+	if (vqp->vdq_head != NULL) {
+		vqp->vdq_tail->b_next = mp;
+	} else {
+		ASSERT(vqp->vdq_tail == NULL);
+		vqp->vdq_head = mp;
+	}
+	vqp->vdq_tail = mp;
+
+	return (1);
+}
+
+/*
+ * Remove the message block chain at the head of a data queue and hand it back
+ * through mpp. The caller must hold vdq_lock. When the queue is empty *mpp is
+ * set to NULL and 0 is returned. Otherwise the message's size is credited
+ * back to the queue and 1 is returned: we have no way of knowing whether that
+ * is enough space for any particular blocked writer, so consumers are always
+ * told that they should consider signalling.
+ */
+static int
+vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
+{
+	mblk_t *first;
+
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(mpp != NULL);
+
+	first = vqp->vdq_head;
+	if (first == NULL) {
+		ASSERT(vqp->vdq_tail == NULL);
+		*mpp = NULL;
+		return (0);
+	}
+
+	vqp->vdq_cur -= msgsize(first);
+	vqp->vdq_head = first->b_next;
+	if (vqp->vdq_head == NULL) {
+		vqp->vdq_tail = NULL;
+		/*
+		 * We can't be certain that the current size is zero here.
+		 * Someone may have taken a reservation of space on the data
+		 * queue, e.g. claimed space but not yet pushed anything.
+		 */
+		ASSERT(vqp->vdq_cur >= 0);
+	} else {
+		ASSERT(vqp->vdq_cur > 0);
+	}
+
+	first->b_next = NULL;
+	*mpp = first;
+	return (1);
+}
+
+/*
+ * Reserve size bytes of space in the queue. On success the queue's current
+ * size is bumped, which entitles the caller to push a message later without
+ * charging for it again. The caller must hold vdq_lock.
+ *
+ * Returns 1 when the reservation was made and 0 when size is zero or the
+ * queue does not have room.
+ */
+static int
+vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size >= 0);
+
+	if (size == 0 || size + vqp->vdq_cur > vqp->vdq_max)
+		return (0);
+
+	vqp->vdq_cur += size;
+	return (1);
+}
+
+/*
+ * Give back size bytes of space previously accounted to the queue. The caller
+ * must hold vdq_lock, and size must be positive and no larger than the
+ * queue's current size.
+ */
+static void
+vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size > 0);
+	ASSERT(size <= vqp->vdq_cur);
+
+	vqp->vdq_cur -= size;
+}
+
+/*
+ * Empty a data queue. Every queued message is unlinked and handed to dropf
+ * with the reason "vnd_dq_flush", and the queue's size accounting is reset to
+ * zero. This takes vdq_lock itself.
+ */
+static void
+vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
+{
+	mblk_t *mp;
+
+	mutex_enter(&vqp->vdq_lock);
+	mp = vqp->vdq_head;
+	while (mp != NULL) {
+		mblk_t *following = mp->b_next;
+
+		mp->b_next = NULL;
+		dropf(vqp->vdq_vns, mp, "vnd_dq_flush");
+		mp = following;
+	}
+	vqp->vdq_cur = 0;
+	vqp->vdq_head = NULL;
+	vqp->vdq_tail = NULL;
+	mutex_exit(&vqp->vdq_lock);
+}
+
+/*
+ * Report whether a data queue currently has no messages on it. The check is
+ * made under vdq_lock, but the answer may be stale by the time the caller
+ * looks at it since the lock is dropped before returning.
+ */
+static boolean_t
+vnd_dq_is_empty(vnd_data_queue_t *vqp)
+{
+	boolean_t empty;
+
+	mutex_enter(&vqp->vdq_lock);
+	empty = (vqp->vdq_head == NULL) ? B_TRUE : B_FALSE;
+	mutex_exit(&vqp->vdq_lock);
+
+	return (empty);
+}
+
+/*
+ * Get a network-order uint16_t from byte offset off of the message chain and
+ * translate it into something the host understands.
+ *
+ * Returns 0 and fills in *out on success. Returns 1 when off is negative or
+ * the message does not contain two bytes starting at off.
+ */
+static int
+vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
+{
+	size_t mpsize;
+	uint8_t *bp;
+
+	/*
+	 * A negative offset would be converted to a huge unsigned value in
+	 * the comparisons below and could walk us off the end of the chain.
+	 */
+	if (off < 0)
+		return (1);
+
+	mpsize = msgsize(mp);
+	/* Make sure both bytes are present somewhere in the chain */
+	if (off + sizeof (uint16_t) > mpsize)
+		return (1);
+
+	/* Walk to the mblk that holds the first byte. */
+	mpsize = MBLKL(mp);
+	while (off >= mpsize) {
+		mp = mp->b_cont;
+		off -= mpsize;
+		mpsize = MBLKL(mp);
+	}
+
+	/*
+	 * Data is in network order. Note the second byte of data might be in
+	 * a later mblk.
+	 */
+	bp = mp->b_rptr + off;
+	*out = *bp << 8;
+	if (off + 1 == mpsize) {
+		/*
+		 * Skip over any zero-length mblks; the size check above
+		 * guarantees that a later mblk holds at least one more byte.
+		 */
+		do {
+			mp = mp->b_cont;
+		} while (MBLKL(mp) == 0);
+		bp = mp->b_rptr;
+	} else {
+		bp++;
+	}
+
+	*out |= *bp;
+	return (0);
+}
+
+/*
+ * Given an mblk chain, find the mblk that contains byte offset off and the
+ * address of that byte. On success *mpp is set to the containing mblk, *offp
+ * to the byte's address, and 0 is returned. Returns 1 when off is at or
+ * beyond the end of the message.
+ */
+static int
+vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp)
+{
+	size_t blen;
+
+	if (off >= msgsize(mp))
+		return (1);
+
+	/* Consume whole mblks until the offset lands inside the current one. */
+	for (blen = MBLKL(mp); off >= blen; blen = MBLKL(mp)) {
+		off -= blen;
+		mp = mp->b_cont;
+	}
+
+	*offp = (uintptr_t)mp->b_rptr + off;
+	*mpp = mp;
+
+	return (0);
+}
+
+/*
+ * Fetch the destination mac address, which is the first ETHERADDRL bytes of
+ * the frame. If those bytes are contiguous in the first mblk_t, *dstpp points
+ * directly into the message. Otherwise the address is gathered into the
+ * caller supplied buffer datap (which must hold ETHERADDRL bytes) and *dstpp
+ * is set to it.
+ *
+ * Returns 0 on success and 1 when the message is too short.
+ */
+static int
+vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap)
+{
+	int i;
+	uint16_t word;
+
+	if (MBLKL(mp) >= ETHERADDRL) {
+		*dstpp = mp->b_rptr;
+		return (0);
+	}
+
+	*dstpp = datap;
+	for (i = 0; i < ETHERADDRL; i += 2) {
+		if (vnd_mbc_getu16(mp, i, &word) != 0)
+			return (1);
+		/*
+		 * vnd_mbc_getu16() hands back a host-order value. Store the
+		 * two bytes individually so that the buffer keeps the wire
+		 * order regardless of host endianness, and so that we never
+		 * do a 16-bit store through a byte pointer that may not be
+		 * suitably aligned.
+		 */
+		datap[i] = (uint8_t)(word >> 8);
+		datap[i + 1] = (uint8_t)(word & 0xff);
+	}
+
+	return (0);
+}
+
+/*
+ * Run the packet through the firewall hooks for its IP version.
+ *
+ * The ethertype is parsed out of the frame (looking past a VLAN tag when
+ * present). Frames that are neither IPv4 nor IPv6, or for which no one is
+ * interested in the corresponding hook event, are passed through untouched.
+ * Otherwise the frame is sanity checked, a hook_pkt_event_t is built that
+ * points at the start of the IP header, and the hooks are run.
+ *
+ * Returns 0 when the caller should keep processing *mpp. Returns 1 when the
+ * packet was dropped: malformed packets are handed to ddrop and *mpp is set
+ * to NULL; packets rejected by the hooks are handed to hdrop and *mpp is left
+ * as the hooks left it.
+ */
+static int
+vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4,
+    hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6,
+    hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop)
+{
+	uint16_t etype;
+	hook_pkt_event_t info;
+	size_t offset, mblen;
+	uint8_t *dstp;
+	uint8_t dstaddr[6];
+	hook_event_t he;
+	hook_event_token_t het;
+	net_handle_t neti;
+
+	/*
+	 * Before we can ask if we're interested we have to do enough work to
+	 * determine the ethertype.
+	 */
+
+	/* Byte 12 is either the VLAN tag or the ethertype */
+	if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) {
+		ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	if (etype == ETHERTYPE_VLAN) {
+		/* Actual ethertype is another four bytes in */
+		if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) {
+			ddrop(vsp, *mpp,
+			    "packet has incomplete ethernet vlan header");
+			*mpp = NULL;
+			return (1);
+		}
+		offset = sizeof (struct ether_vlan_header);
+	} else {
+		offset = sizeof (struct ether_header);
+	}
+
+	/*
+	 * At the moment we only hook on the kinds of things that the IP module
+	 * would normally.
+	 */
+	if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
+		return (0);
+
+	if (etype == ETHERTYPE_IP) {
+		neti = netiv4;
+		he = hev4;
+		het = hetv4;
+	} else {
+		neti = netiv6;
+		he = hev6;
+		het = hetv6;
+	}
+
+	if (!he.he_interested)
+		return (0);
+
+	if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) {
+		ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	/*
+	 * Now that we know we're interested, we have to do some additional
+	 * sanity checking for IPF's sake, ala ip_check_length(). Specifically
+	 * we need to check to make sure that the remaining packet size,
+	 * excluding MAC, is at least the size of an IP header. The ethertype
+	 * reads above already guarantee that mblen is at least offset, so the
+	 * subtraction cannot wrap.
+	 */
+	mblen = msgsize(*mpp);
+	if ((etype == ETHERTYPE_IP &&
+	    mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
+	    (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
+		ddrop(vsp, *mpp, "packet has invalid IP header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	info.hpe_protocol = neti;
+	info.hpe_ifp = (phy_if_t)vsp;
+	info.hpe_ofp = (phy_if_t)vsp;
+	info.hpe_mp = mpp;
+	info.hpe_flags = 0;
+
+	/*
+	 * Classify the frame by its destination MAC address. The multicast
+	 * checks compare the leading bytes of the packet's destination
+	 * against the well-known multicast prefix for the IP version.
+	 */
+	if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
+		info.hpe_flags |= HPE_BROADCAST;
+	else if (etype == ETHERTYPE_IP &&
+	    bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+	else if (etype == ETHERTYPE_IPV6 &&
+	    bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+
+	if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
+	    (uintptr_t *)&info.hpe_hdr) != 0) {
+		ddrop(vsp, *mpp, "packet too small -- "
+		    "unable to find payload");
+		*mpp = NULL;
+		return (1);
+	}
+
+	if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
+		hdrop(vsp, *mpp, "drooped by hooks");
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate a zeroed M_PROTO message of len bytes and stamp it with the DLPI
+ * primitive prim. Returns NULL when the allocation fails.
+ *
+ * This should not be used for DL_INFO_REQ.
+ */
+static mblk_t *
+vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
+{
+	mblk_t *mp = allocb(len, BPRI_MED);
+
+	if (mp != NULL) {
+		mp->b_datap->db_type = M_PROTO;
+		bzero(mp->b_rptr, len);
+		mp->b_wptr = mp->b_rptr + len;
+		/* Every DLPI message starts with its primitive. */
+		((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
+	}
+
+	return (mp);
+}
+
+/*
+ * Append a single message to the tail of the stream's list of incoming DLPI
+ * messages, which is linked through b_next. The caller must hold vns_lock.
+ */
+static void
+vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
+{
+	mblk_t *last;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	ASSERT(mp->b_next == NULL);
+
+	if (vsp->vns_dlpi_inc == NULL) {
+		vsp->vns_dlpi_inc = mp;
+		return;
+	}
+
+	last = vsp->vns_dlpi_inc;
+	while (last->b_next != NULL)
+		last = last->b_next;
+	last->b_next = mp;
+}
+
+/*
+ * Remove and return the message at the head of the stream's list of incoming
+ * DLPI messages, or NULL when the list is empty. The caller must hold
+ * vns_lock.
+ */
+static mblk_t *
+vnd_dlpi_inc_pop(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	mp = vsp->vns_dlpi_inc;
+	if (mp == NULL)
+		return (NULL);
+
+	/* Catch a message that has been linked to itself. */
+	VERIFY(mp->b_next == NULL || mp->b_next != mp);
+	vsp->vns_dlpi_inc = mp->b_next;
+	mp->b_next = NULL;
+
+	return (mp);
+}
+
+/*
+ * Send a DL_INFO_REQ down the stream and move to VNS_S_INFO_SENT. The message
+ * is allocated large enough to hold the larger of the request and its ack.
+ * The caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_sinfo(vnd_str_t *vsp)
+{
+	const size_t len = MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t));
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	mp = allocb(len, BPRI_HI);
+	if (mp == NULL) {
+		vsp->vns_errno = VND_E_NOMEM;
+		return (1);
+	}
+
+	/* Only the request portion of the buffer is considered valid data. */
+	mp->b_datap->db_type = M_PCPROTO;
+	mp->b_wptr = mp->b_rptr + sizeof (dl_info_req_t);
+	((dl_info_req_t *)mp->b_rptr)->dl_primitive = DL_INFO_REQ;
+
+	vsp->vns_state = VNS_S_INFO_SENT;
+	cv_broadcast(&vsp->vns_stcv);
+	putnext(vsp->vns_wq, mp);
+
+	return (0);
+}
+
+/*
+ * Consume the reply to our DL_INFO_REQ. The provider style and the minimum
+ * and maximum SDU sizes are recorded on the stream. The caller must hold
+ * vns_lock and must guarantee that a message is queued.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOTETHER when
+ * the device is not of type DL_ETHER. The message is freed in both cases.
+ */
+static int
+vnd_st_info(vnd_str_t *vsp)
+{
+	dl_info_ack_t *dlia;
+	mblk_t *mp;
+	int ret = 0;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	dlia = (dl_info_ack_t *)mp->b_rptr;
+
+	vsp->vns_dlpi_style = dlia->dl_provider_style;
+	vsp->vns_minwrite = dlia->dl_min_sdu;
+	vsp->vns_maxwrite = dlia->dl_max_sdu;
+
+	if (dlia->dl_mac_type == DL_ETHER) {
+		/*
+		 * Because vnd operates on entire packets, we need to manually
+		 * account for the ethernet header information. We add the size
+		 * of the ether_vlan_header to account for this, regardless of
+		 * whether vlans are in use or not.
+		 */
+		vsp->vns_maxwrite += sizeof (struct ether_vlan_header);
+	} else {
+		/* At this time we only support DL_ETHER devices. */
+		vsp->vns_errno = VND_E_NOTETHER;
+		ret = 1;
+	}
+
+	freemsg(mp);
+	return (ret);
+}
+
+/*
+ * Send a DL_EXCLUSIVE_REQ down the stream and move to VNS_S_EXCLUSIVE_SENT.
+ * The caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_sexclusive(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	/* Size the message for the primitive that is actually being sent. */
+	mp = vnd_dlpi_alloc(sizeof (dl_exclusive_req_t), DL_EXCLUSIVE_REQ);
+	if (mp == NULL) {
+		vsp->vns_errno = VND_E_NOMEM;
+		return (1);
+	}
+
+	vsp->vns_state = VNS_S_EXCLUSIVE_SENT;
+	cv_broadcast(&vsp->vns_stcv);
+	putnext(vsp->vns_wq, mp);
+	return (0);
+}
+
+/*
+ * Consume the reply to our DL_EXCLUSIVE_REQ. The caller must hold vns_lock
+ * and must guarantee that a message is queued.
+ *
+ * Returns 0 when the request was acknowledged with DL_OK_ACK. Returns 1 with
+ * vns_errno set otherwise: VND_E_DLEXCL for a DL_ERROR_ACK, VND_E_DLPIINVAL
+ * when the message is not an ack/nack for DL_EXCLUSIVE_REQ (in which case the
+ * message is handed to vnd_drop_ctl() rather than freed here).
+ */
+static int
+vnd_st_exclusive(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	t_uscalar_t prim, cprim;
+	int nacked;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+	cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+	if (prim == DL_OK_ACK || prim == DL_ERROR_ACK) {
+		if (cprim == DL_EXCLUSIVE_REQ) {
+			nacked = (prim == DL_ERROR_ACK);
+			if (nacked)
+				vsp->vns_errno = VND_E_DLEXCL;
+			freemsg(mp);
+			return (nacked);
+		}
+		vnd_drop_ctl(vsp, mp,
+		    "vnd_st_exclusive: got ack/nack for wrong primitive");
+	} else {
+		vnd_drop_ctl(vsp, mp,
+		    "wrong dlpi primitive for vnd_st_exclusive");
+	}
+
+	vsp->vns_errno = VND_E_DLPIINVAL;
+	return (1);
+}
+
+/*
+ * Send down a DL_ATTACH_REQ for PPA 0 and move to VNS_S_ATTACH_SENT. The
+ * caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_sattach(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_attach_req_t *darp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ);
+	if (mp != NULL) {
+		darp = (dl_attach_req_t *)mp->b_rptr;
+		darp->dl_ppa = 0;
+
+		vsp->vns_state = VNS_S_ATTACH_SENT;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Consume the reply to our DL_ATTACH_REQ. The caller must hold vns_lock and
+ * must guarantee that a message is queued.
+ *
+ * Returns 0 when the request was acknowledged with DL_OK_ACK. Returns 1 with
+ * vns_errno set otherwise: VND_E_ATTACHFAIL for a DL_ERROR_ACK,
+ * VND_E_DLPIINVAL when the message is not an ack/nack for DL_ATTACH_REQ (in
+ * which case the message is handed to vnd_drop_ctl() rather than freed here).
+ */
+static int
+vnd_st_attach(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_ok_ack_t *ackp;
+	t_uscalar_t prim, cprim;
+	int nacked;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	ackp = (dl_ok_ack_t *)mp->b_rptr;
+	prim = ackp->dl_primitive;
+	cprim = ackp->dl_correct_primitive;
+
+	if (prim == DL_OK_ACK || prim == DL_ERROR_ACK) {
+		if (cprim == DL_ATTACH_REQ) {
+			nacked = (prim == DL_ERROR_ACK);
+			if (nacked)
+				vsp->vns_errno = VND_E_ATTACHFAIL;
+			freemsg(mp);
+			return (nacked);
+		}
+		vnd_drop_ctl(vsp, mp,
+		    "vnd_st_attach: Got ack/nack for wrong primitive");
+	} else {
+		vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type");
+	}
+
+	vsp->vns_errno = VND_E_DLPIINVAL;
+	return (1);
+}
+
+/*
+ * Send down a DL_BIND_REQ for SAP 0 in connectionless mode (DL_CLDLS) and
+ * move to VNS_S_BIND_SENT. The caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_sbind(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
+	    DL_BIND_REQ);
+	if (mp != NULL) {
+		((dl_bind_req_t *)(mp->b_rptr))->dl_sap = 0;
+		((dl_bind_req_t *)(mp->b_rptr))->dl_service_mode = DL_CLDLS;
+
+		vsp->vns_state = VNS_S_BIND_SENT;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Consume the reply to our DL_BIND_REQ. The caller must hold vns_lock and
+ * must guarantee that a message is queued.
+ *
+ * Returns 0 for a DL_BIND_ACK. Returns 1 with vns_errno set otherwise:
+ * VND_E_BINDFAIL for a DL_ERROR_ACK, VND_E_DLPIINVAL for any other primitive
+ * (in which case the message is handed to vnd_drop_ctl() rather than freed
+ * here).
+ */
+static int
+vnd_st_bind(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	t_uscalar_t prim;
+	int nacked;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+
+	if (prim == DL_BIND_ACK || prim == DL_ERROR_ACK) {
+		nacked = (prim == DL_ERROR_ACK);
+		if (nacked)
+			vsp->vns_errno = VND_E_BINDFAIL;
+		freemsg(mp);
+		return (nacked);
+	}
+
+	vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind");
+	vsp->vns_errno = VND_E_DLPIINVAL;
+	return (1);
+}
+
+/*
+ * Send down a DL_PROMISCON_REQ for the promiscuous level given by type and
+ * move to the state given by next. The caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ);
+	if (mp != NULL) {
+		((dl_promiscon_req_t *)mp->b_rptr)->dl_level = type;
+
+		vsp->vns_state = next;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Consume the reply to one of our DL_PROMISCON_REQ messages. The caller must
+ * hold vns_lock and must guarantee that a message is queued.
+ *
+ * Returns 0 when the request was acknowledged with DL_OK_ACK. Returns 1 with
+ * vns_errno set otherwise: VND_E_PROMISCFAIL for a DL_ERROR_ACK,
+ * VND_E_DLPIINVAL when the message is not an ack/nack for DL_PROMISCON_REQ
+ * (in which case the message is handed to vnd_drop_ctl() rather than freed
+ * here).
+ */
+static int
+vnd_st_promisc(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_ok_ack_t *ackp;
+	t_uscalar_t prim, cprim;
+	int nacked;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	ackp = (dl_ok_ack_t *)mp->b_rptr;
+	prim = ackp->dl_primitive;
+	cprim = ackp->dl_correct_primitive;
+
+	if (prim == DL_OK_ACK || prim == DL_ERROR_ACK) {
+		if (cprim == DL_PROMISCON_REQ) {
+			nacked = (prim == DL_ERROR_ACK);
+			if (nacked)
+				vsp->vns_errno = VND_E_PROMISCFAIL;
+			freemsg(mp);
+			return (nacked);
+		}
+		vnd_drop_ctl(vsp, mp,
+		    "vnd_st_promisc: Got ack/nack for wrong primitive");
+	} else {
+		vnd_drop_ctl(vsp, mp,
+		    "wrong dlpi primitive for vnd_st_promisc");
+	}
+
+	vsp->vns_errno = VND_E_DLPIINVAL;
+	return (1);
+}
+
+/*
+ * Send down an empty DL_CAPABILITY_REQ to query the device's capabilities
+ * and move to VNS_S_CAPAB_Q_SENT. The caller must hold vns_lock.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_NOMEM when the
+ * message cannot be allocated; the state is left unchanged in that case.
+ */
+static int
+vnd_st_scapabq(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
+	if (mp != NULL) {
+		vsp->vns_state = VNS_S_CAPAB_Q_SENT;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Direct receive callback: take a b_next-linked chain of packets from MAC
+ * and place each one on the stream's read data queue.
+ *
+ * For every packet the MAC header is re-exposed, a VLAN tag is stripped when
+ * the frame carried a real VLAN id, and, when the netstack has hooks enabled,
+ * the packet is run through vnd_hook(). Packets that survive are counted and
+ * pushed onto vns_dq_read; if at least one was queued, readers and pollers
+ * are woken once at the end.
+ */
+static void
+vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+    mac_header_info_t *mhip)
+{
+	int signal = 0;
+	mblk_t *mp;
+	vnd_pnsd_t *nsp;
+
+	ASSERT(vsp != NULL);
+	ASSERT(mp_chain != NULL);
+
+	/* Only touch vsp once the assertions above have had their say. */
+	nsp = vsp->vns_nsd;
+
+	for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+		uint16_t vid;
+		mp_chain = mp->b_next;
+		mp->b_next = NULL;
+
+		/*
+		 * If we were operating in a traditional dlpi context then we
+		 * would have enabled DLIOCRAW and rather than the fast path, we
+		 * would come through dld_str_rx_raw. That function does two
+		 * things that we have to consider doing ourselves. The first is
+		 * that it adjusts the b_rptr back to account for dld bumping us
+		 * past the mac header. It also tries to account for cases where
+		 * mac provides an illusion of the mac header. Fortunately, dld
+		 * only allows the fastpath when the media type is the same as
+		 * the native type. Therefore all we have to do here is adjust
+		 * the b_rptr.
+		 */
+		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
+		mp->b_rptr -= mhip->mhi_hdrsize;
+		vid = VLAN_ID(mhip->mhi_tci);
+		if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
+			/*
+			 * Slide the first 12 bytes of the header forward by
+			 * 4 bytes and advance b_rptr past the bytes that were
+			 * vacated, which drops 4 bytes from the header.
+			 */
+			bcopy(mp->b_rptr, mp->b_rptr + 4, 12);
+			mp->b_rptr += 4;
+		}
+
+		/* A non-zero return means the packet was already dropped. */
+		if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+		    nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
+		    nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
+		    nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
+			continue;
+
+		VND_STAT_INC(vsp, vks_rpackets, 1);
+		VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
+		DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
+		    vnd_str_t *, vsp, mblk_t *, mp);
+		mutex_enter(&vsp->vns_dq_read.vdq_lock);
+		signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
+		    vnd_drop_in);
+		mutex_exit(&vsp->vns_dq_read.vdq_lock);
+
+	}
+
+	if (signal != 0) {
+		cv_broadcast(&vsp->vns_dq_read.vdq_ready);
+		pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
+	}
+
+}
+
+/*
+ * Account for one period of MAC flow control that lasted diff. The event
+ * count and the total time are always bumped, and each latency bucket whose
+ * threshold diff meets or exceeds is incremented as well.
+ */
+static void
+vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff)
+{
+	VND_STAT_INC(vsp, vks_nmacflow, 1);
+	VND_STAT_INC(vsp, vks_tmacflow, diff);
+
+	if (diff >= VND_LATENCY_1MS) {
+		VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
+	}
+	if (diff >= VND_LATENCY_10MS) {
+		VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
+	}
+	if (diff >= VND_LATENCY_100MS) {
+		VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
+	}
+	if (diff >= VND_LATENCY_1S) {
+		VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
+	}
+	if (diff >= VND_LATENCY_10S) {
+		VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
+	}
+}
+
+/*
+ * This is a callback from MAC that indicates that we are allowed to send
+ * packets again. arg is the vnd_str_t that registered the callback. If the
+ * stream is currently marked as flow controlled, that state is cleared and,
+ * unless a drain is already scheduled or a barrier is in place, a drain of
+ * the write side is scheduled on the stream's squeue. Otherwise only the time
+ * of the notification is recorded.
+ */
+static void
+vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
+{
+	vnd_str_t *vsp = arg;
+	hrtime_t now, diff;
+
+	mutex_enter(&vsp->vns_lock);
+	now = gethrtime();
+
+	/*
+	 * Check for the case that we beat vnd_squeue_tx_one to the punch.
+	 * There's also an additional case here that we got notified because
+	 * we're sharing a device that ran out of tx descriptors, even though it
+	 * wasn't because of us.
+	 */
+	if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) {
+		vsp->vns_fcupdate = now;
+		mutex_exit(&vsp->vns_lock);
+		return;
+	}
+
+	ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
+	ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
+	vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
+	vsp->vns_caps.vsc_fc_cookie = NULL;
+	/* Time elapsed since the value latched in vns_fclatch. */
+	diff = now - vsp->vns_fclatch;
+	vsp->vns_fclatch = 0;
+	DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
+	    vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+	/*
+	 * If someone has asked to flush the squeue and thus inserted a barrier,
+	 * then we shouldn't schedule a drain.
+	 */
+	if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) {
+		vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+		gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
+		    vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
+		    VND_SQUEUE_TAG_MAC_FLOW_CONTROL);
+	}
+	mutex_exit(&vsp->vns_lock);
+}
+
+/*
+ * Enter the perimeter through the DLD_CAPAB_PERIM capability of the
+ * underlying device, storing the handle needed to leave it again in *mphp.
+ * The caller must hold vns_lock. Failure to enter is fatal.
+ */
+static void
+vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp)
+{
+	ASSERT(MUTEX_HELD(&vsp->vns_lock));
+	VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+	    DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0);
+}
+
+/*
+ * Leave the perimeter that was entered with vnd_mac_enter(); mph is the
+ * handle that call produced. The caller must hold vns_lock. Failure to exit
+ * is fatal.
+ */
+static void
+vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph)
+{
+	ASSERT(MUTEX_HELD(&vsp->vns_lock));
+	VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+	    DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0);
+}
+
+/*
+ * Enable the DLD_CAPAB_DIRECT capability with rxfunc as the receive callback
+ * and vsp as its argument. On success the transmit, flow control callback
+ * registration and flow control query entry points handed back by the device
+ * are cached in vns_caps, vnd_mac_flow_control() is registered as the flow
+ * control notification callback, and VNS_C_DIRECT is set.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set to VND_E_DIRECTFAIL when
+ * the capability cannot be enabled. The perimeter is entered via
+ * vnd_mac_enter() for the duration, which requires vns_lock to be held.
+ */
+static int
+vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc)
+{
+	int ret;
+	dld_capab_direct_t d;
+	mac_perim_handle_t mph;
+	vnd_str_capab_t *c = &vsp->vns_caps;
+
+	bzero(&d, sizeof (d));
+	d.di_rx_cf = (uintptr_t)rxfunc;
+	d.di_rx_ch = vsp;
+	d.di_flags = DI_DIRECT_RAW;
+
+	vnd_mac_enter(vsp, &mph);
+
+	/*
+	 * If we're coming in here for a second pass, we need to make sure that
+	 * we remove an existing flow control notification callback, otherwise
+	 * we'll create a duplicate that will remain with garbage data.
+	 */
+	if (c->vsc_tx_fc_hdl != NULL) {
+		ASSERT(c->vsc_set_fcb_hdl != NULL);
+		(void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL,
+		    c->vsc_tx_fc_hdl);
+		c->vsc_tx_fc_hdl = NULL;
+	}
+
+	if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl,
+	    DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) {
+		/* Cache the entry points the device handed back to us. */
+		c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df;
+		c->vsc_tx_hdl = d.di_tx_dh;
+		c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df;
+		c->vsc_set_fcb_hdl = d.di_tx_cb_dh;
+		c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df;
+		c->vsc_is_fc_hdl = d.di_tx_fctl_dh;
+		c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl,
+		    vnd_mac_flow_control, vsp);
+		c->vsc_flags |= VNS_C_DIRECT;
+		ret = 0;
+	} else {
+		vsp->vns_errno = VND_E_DIRECTFAIL;
+		ret = 1;
+	}
+	vnd_mac_exit(vsp, mph);
+	return (ret);
+}
+
+/*
+ * Consume the reply to our DL_CAPABILITY_REQ and walk the sub-capabilities
+ * that it advertises. Hardware checksum information is recorded when present
+ * and, when the DLD capability is offered, direct callbacks are enabled with
+ * a receive function that drops all input. The caller must hold vns_lock and
+ * must guarantee that a message is queued.
+ *
+ * Everything read out of the message is bounds checked against the message
+ * and against the remaining sub-capability length before it is used.
+ *
+ * Returns 0 on success. Returns 1 with vns_errno set when the message is
+ * malformed, a capability cannot be used, or direct callbacks did not end up
+ * enabled. The message is freed in all cases.
+ */
+static int
+vnd_st_capabq(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_capability_ack_t *cap;
+	dl_capability_sub_t *subp;
+	dl_capab_hcksum_t *hck;
+	dl_capab_dld_t *dld;
+	unsigned char *rp;
+	int ret = 0;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+
+	/* The fixed portion of the ack has to be there before we read it. */
+	if (MBLKL(mp) < sizeof (dl_capability_ack_t))
+		goto badack;
+
+	rp = mp->b_rptr;
+	cap = (dl_capability_ack_t *)rp;
+	if (cap->dl_sub_length == 0)
+		goto done;
+
+	/*
+	 * Don't try to process something too big, and make sure that the
+	 * region described by the offset and length lies within the message.
+	 */
+	if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp) ||
+	    (size_t)cap->dl_sub_offset + cap->dl_sub_length > MBLKL(mp))
+		goto badack;
+
+	rp += cap->dl_sub_offset;
+
+	while (cap->dl_sub_length > 0) {
+		/* The sub-capability header itself must fit in what's left */
+		if (cap->dl_sub_length < sizeof (dl_capability_sub_t))
+			goto badsub;
+
+		subp = (dl_capability_sub_t *)rp;
+		/* Sanity check something crazy from down below */
+		if (subp->dl_length + sizeof (dl_capability_sub_t) >
+		    cap->dl_sub_length)
+			goto badsub;
+
+		switch (subp->dl_cap) {
+		case DL_CAPAB_HCKSUM:
+			if (subp->dl_length < sizeof (dl_capab_hcksum_t))
+				goto badsub;
+			hck = (dl_capab_hcksum_t *)(rp +
+			    sizeof (dl_capability_sub_t));
+			if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) {
+				vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS;
+				break;
+			}
+			if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) !=
+			    B_TRUE) {
+				vsp->vns_errno = VND_E_CAPABPASS;
+				ret = 1;
+				goto done;
+			}
+			vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM;
+			vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags;
+			break;
+		case DL_CAPAB_DLD:
+			if (subp->dl_length < sizeof (dl_capab_dld_t))
+				goto badsub;
+			dld = (dl_capab_dld_t *)(rp +
+			    sizeof (dl_capability_sub_t));
+			if (dld->dld_version != DLD_CURRENT_VERSION) {
+				vsp->vns_errno = VND_E_DLDBADVERS;
+				ret = 1;
+				goto done;
+			}
+			if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) !=
+			    B_TRUE) {
+				vsp->vns_errno = VND_E_CAPABPASS;
+				ret = 1;
+				goto done;
+			}
+			vsp->vns_caps.vsc_flags |= VNS_C_DLD;
+			vsp->vns_caps.vsc_capab_f =
+			    (vnd_dld_cap_t)dld->dld_capab;
+			vsp->vns_caps.vsc_capab_hdl =
+			    (void *)dld->dld_capab_handle;
+			/*
+			 * At this point in time, we have to set up a direct
+			 * function that drops all input. This validates that
+			 * we'll be able to set up direct input and that we can
+			 * easily switch it later to the real data function
+			 * when we've plumbed everything up.
+			 */
+			if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) {
+				/* vns_errno set by vnd_dld_cap_enable */
+				ret = 1;
+				goto done;
+			}
+			break;
+		default:
+			/* Ignore unsupported cap */
+			break;
+		}
+
+		rp += sizeof (dl_capability_sub_t) + subp->dl_length;
+		cap->dl_sub_length -= sizeof (dl_capability_sub_t) +
+		    subp->dl_length;
+	}
+	goto done;
+
+badack:
+	VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	vsp->vns_errno = VND_E_CAPACKINVAL;
+	ret = 1;
+	goto done;
+
+badsub:
+	VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	vsp->vns_errno = VND_E_SUBCAPINVAL;
+	ret = 1;
+
+done:
+	/* Make sure we enabled direct callbacks */
+	if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) {
+		vsp->vns_errno = VND_E_DIRECTNOTSUP;
+		ret = 1;
+	}
+
+	freemsg(mp);
+	return (ret);
+}
+
+/*
+ * Mark the stream as online and wake anyone waiting on a state change. The
+ * caller must hold vns_lock.
+ */
+static void
+vnd_st_sonline(vnd_str_t *vsp)
+{
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	vsp->vns_state = VNS_S_ONLINE;
+	cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * First step of tearing a stream down: if direct callbacks were enabled,
+ * unregister the flow control callback and disable DLD_CAPAB_DIRECT, all
+ * from within the perimeter. The caller must hold vns_lock.
+ */
+static void
+vnd_st_shutdown(vnd_str_t *vsp)
+{
+	mac_perim_handle_t mph;
+	vnd_str_capab_t *vsc = &vsp->vns_caps;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	/*
+	 * At this point in time we know that there is no one transmitting as
+	 * our final reference has been torn down and that vnd_s_close inserted
+	 * a barrier to validate that everything is flushed.
+	 */
+	if (vsc->vsc_flags & VNS_C_DIRECT) {
+		vnd_mac_enter(vsp, &mph);
+		vsc->vsc_flags &= ~VNS_C_DIRECT;
+		/* Passing a NULL callback removes our registration. */
+		(void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL,
+		    vsc->vsc_tx_fc_hdl);
+		vsc->vsc_tx_fc_hdl = NULL;
+		(void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT,
+		    NULL, DLD_DISABLE);
+		vnd_mac_exit(vsp, mph);
+	}
+}
+
+/*
+ * Send down a DL_PROMISCOFF_REQ for the promiscuous level given by type.
+ * Because this is part of tear down, the stream always moves to the state
+ * given by next, even when the request could not be allocated. The caller
+ * must hold vns_lock.
+ *
+ * Returns B_TRUE when the request was sent and B_FALSE when it could not be
+ * allocated, in which case no reply will arrive and the caller has to drive
+ * the next state itself.
+ */
+static boolean_t
+vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+	boolean_t ret = B_TRUE;
+	mblk_t *mp;
+	dl_promiscoff_req_t *dprp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	/* Size the message for the primitive that is actually being sent. */
+	mp = vnd_dlpi_alloc(sizeof (dl_promiscoff_req_t), DL_PROMISCOFF_REQ);
+	if (mp == NULL) {
+		cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+		    "promiscoff request");
+		ret = B_FALSE;
+	} else {
+		dprp = (dl_promiscoff_req_t *)mp->b_rptr;
+		dprp->dl_level = type;
+		putnext(vsp->vns_wq, mp);
+	}
+
+	vsp->vns_state = next;
+	cv_broadcast(&vsp->vns_stcv);
+	return (ret);
+}
+
+/*
+ * Consume the reply, if any, to one of our DL_PROMISCOFF_REQ messages.
+ * Failures are only logged since tear down keeps going regardless. The
+ * caller must hold vns_lock.
+ *
+ * The message is always consumed: it is either handed to vnd_drop_ctl() when
+ * it is not what we expected, or freed here once it has been examined.
+ */
+static void
+vnd_st_promiscoff(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	t_uscalar_t prim, cprim;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	/*
+	 * Unlike other cases where we guard against the incoming packet being
+	 * NULL, during tear down we try to keep driving and therefore we may
+	 * have gotten here due to an earlier failure, so there's nothing to do.
+	 */
+	mp = vnd_dlpi_inc_pop(vsp);
+	if (mp == NULL)
+		return;
+
+	prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+	cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+	if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+		vnd_drop_ctl(vsp, mp,
+		    "wrong dlpi primitive for vnd_st_promiscoff");
+		return;
+	}
+
+	if (cprim != DL_PROMISCOFF_REQ) {
+		vnd_drop_ctl(vsp, mp,
+		    "vnd_st_promiscoff: Got ack/nack for wrong primitive");
+		return;
+	}
+
+	if (prim == DL_ERROR_ACK) {
+		cmn_err(CE_WARN, "!failed to disable promiscuos mode during "
+		    "vnd teardown");
+	}
+
+	/* We own the ack at this point; don't leak it. */
+	freemsg(mp);
+}
+
+/*
+ * Send down a DL_UNBIND_REQ. Because this is part of tear down, the stream
+ * always moves to VNS_S_UNBIND_SENT, even when the request could not be
+ * allocated.
+ *
+ * Returns B_TRUE when the request was sent and B_FALSE when it could not be
+ * allocated, in which case no reply will arrive and the caller has to drive
+ * the next state itself.
+ */
+static boolean_t
+vnd_st_sunbind(vnd_str_t *vsp)
+{
+	boolean_t sent;
+	mblk_t *mp;
+
+	mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
+	if (mp != NULL) {
+		putnext(vsp->vns_wq, mp);
+		sent = B_TRUE;
+	} else {
+		cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+		    "unbind request");
+		sent = B_FALSE;
+	}
+
+	vsp->vns_state = VNS_S_UNBIND_SENT;
+	cv_broadcast(&vsp->vns_stcv);
+	return (sent);
+}
+
+/*
+ * Consume the reply, if any, to our DL_UNBIND_REQ and move the stream to
+ * VNS_S_ZOMBIE. Failures are only logged since this is the last step of tear
+ * down.
+ *
+ * The message is always consumed: it is either handed to vnd_drop_ctl() when
+ * it is not what we expected, or freed here once it has been examined.
+ */
+static void
+vnd_st_unbind(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	t_uscalar_t prim, cprim;
+
+	/*
+	 * Unlike other cases where we guard against the incoming packet being
+	 * NULL, during tear down we try to keep driving and therefore we may
+	 * have gotten here due to an earlier failure, so there's nothing to do.
+	 */
+	mp = vnd_dlpi_inc_pop(vsp);
+	if (mp == NULL)
+		goto next;
+
+	prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+	cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+	if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+		vnd_drop_ctl(vsp, mp,
+		    "wrong dlpi primitive for vnd_st_unbind");
+		goto next;
+	}
+
+	if (cprim != DL_UNBIND_REQ) {
+		vnd_drop_ctl(vsp, mp,
+		    "vnd_st_unbind: Got ack/nack for wrong primitive");
+		goto next;
+	}
+
+	if (prim == DL_ERROR_ACK) {
+		cmn_err(CE_WARN, "!failed to unbind stream during vnd "
+		    "teardown");
+	}
+
+	/* We own the ack at this point; don't leak it. */
+	freemsg(mp);
+
+next:
+	vsp->vns_state = VNS_S_ZOMBIE;
+	cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Perform state transitions. This is a one way shot down the flow chart
+ * described in the big theory statement.
+ *
+ * Each call takes vns_lock and handles the current state: a *_SENT state
+ * consumes the queued DLPI reply and, on success, sends the next request.
+ * Any failure while bringing the stream up records the failing state in
+ * vns_laststate and moves the stream to VNS_S_ZOMBIE. Nothing is done when no
+ * DLPI message is queued, unless the stream is in VNS_S_INITIAL or
+ * VNS_S_SHUTTING_DOWN, which do not wait on a reply.
+ */
+static void
+vnd_str_state_transition(void *arg)
+{
+	boolean_t died = B_FALSE;
+	vnd_str_t *vsp = arg;
+	mblk_t *mp;
+
+	mutex_enter(&vsp->vns_lock);
+	if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
+	    vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
+		mutex_exit(&vsp->vns_lock);
+		return;
+	}
+
+	/*
+	 * When trying to shut down, or unwinding from a failed enabling, rather
+	 * than immediately entering the ZOMBIE state, we may instead opt to try
+	 * and enter the next state in the progression. This is especially
+	 * important when trying to tear everything down.
+	 */
+loop:
+	DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp,
+	    vnd_str_state_t, vsp->vns_state);
+	switch (vsp->vns_state) {
+	case VNS_S_INITIAL:
+		VERIFY(vsp->vns_dlpi_inc == NULL);
+		if (vnd_st_sinfo(vsp) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_INFO_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_info(vsp) == 0) {
+			if (vnd_st_sexclusive(vsp) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_EXCLUSIVE_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_exclusive(vsp) == 0) {
+			/* Only style 2 providers need an explicit attach. */
+			if (vsp->vns_dlpi_style == DL_STYLE2) {
+				if (vnd_st_sattach(vsp) != 0)
+					died = B_TRUE;
+			} else {
+				if (vnd_st_sbind(vsp) != 0)
+					died = B_TRUE;
+			}
+		} else  {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_ATTACH_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_attach(vsp) == 0) {
+			if (vnd_st_sbind(vsp) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_BIND_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_bind(vsp) == 0) {
+			if (vnd_st_spromisc(vsp, DL_PROMISC_SAP,
+			    VNS_S_SAP_PROMISC_SENT) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_SAP_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) == 0) {
+			if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI,
+			    VNS_S_MULTI_PROMISC_SENT) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_MULTI_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) == 0) {
+			if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY,
+			    VNS_S_RX_ONLY_PROMISC_SENT) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_RX_ONLY_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) == 0) {
+			if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS,
+			    VNS_S_FIXUP_PROMISC_SENT) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_FIXUP_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) == 0) {
+			if (vnd_st_scapabq(vsp) != 0)
+				died = B_TRUE;
+		} else {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_CAPAB_Q_SENT:
+		if (vnd_st_capabq(vsp) != 0)
+			died = B_TRUE;
+		else
+			vnd_st_sonline(vsp);
+		break;
+	/*
+	 * The tear down states below never set died. When a request cannot be
+	 * sent no reply will ever arrive, so we loop to handle the next state
+	 * right away instead of waiting for one.
+	 */
+	case VNS_S_SHUTTING_DOWN:
+		vnd_st_shutdown(vsp);
+		if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI,
+		    VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE)
+			goto loop;
+		break;
+	case VNS_S_MULTICAST_PROMISCOFF_SENT:
+		vnd_st_promiscoff(vsp);
+		if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP,
+		    VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE)
+			goto loop;
+		break;
+	case VNS_S_SAP_PROMISCOFF_SENT:
+		vnd_st_promiscoff(vsp);
+		if (vnd_st_sunbind(vsp) == B_FALSE)
+			goto loop;
+		break;
+	case VNS_S_UNBIND_SENT:
+		vnd_st_unbind(vsp);
+		break;
+	case VNS_S_ZOMBIE:
+		/* Nothing is expected any more; discard whatever shows up. */
+		while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+			vnd_drop_ctl(vsp, mp, "vsp received data as a zombie");
+		break;
+	default:
+		panic("vnd_str_t entered an unknown state");
+	}
+
+	if (died == B_TRUE) {
+		/* Whoever failed must have recorded why in vns_errno. */
+		ASSERT(vsp->vns_errno != VND_E_SUCCESS);
+		vsp->vns_laststate = vsp->vns_state;
+		vsp->vns_state = VNS_S_ZOMBIE;
+		cv_broadcast(&vsp->vns_stcv);
+	}
+
+	mutex_exit(&vsp->vns_lock);
+}
+
+/*
+ * Taskq entry point that drives a stream's state machine; arg is the
+ * vnd_str_t. Every pass performs one vnd_str_state_transition() and then,
+ * under vns_lock, decides whether another pass is needed. We stop once the
+ * stream has been condemned or vns_dlpi_inc is NULL, clearing
+ * VNS_F_TASKQ_DISPATCHED as we do so. A condemned stream additionally gets
+ * vns_cancelcv signalled.
+ */
+static void
+vnd_dlpi_taskq_dispatch(void *arg)
+{
+	vnd_str_t *vsp = arg;
+	boolean_t done = B_FALSE;
+
+	do {
+		vnd_str_state_transition(vsp);
+
+		mutex_enter(&vsp->vns_lock);
+		if ((vsp->vns_flags & VNS_F_CONDEMNED) != 0 ||
+		    vsp->vns_dlpi_inc == NULL) {
+			vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED;
+			done = B_TRUE;
+		}
+		if ((vsp->vns_flags & VNS_F_CONDEMNED) != 0)
+			cv_signal(&vsp->vns_cancelcv);
+		mutex_exit(&vsp->vns_lock);
+	} while (done == B_FALSE);
+}
+
+/*
+ * Placeholder netinfo callbacks. These fill every function slot of the
+ * vnd_neti_info_v4 and vnd_neti_info_v6 tables. Each one ignores all of its
+ * arguments and unconditionally returns -1; the stubs whose return type is
+ * phy_if_t or lif_if_t spell that conversion out with an explicit cast so
+ * that all of them express the sentinel the same way.
+ */
+static int
+vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_getptmue(net_handle_t neti)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+    size_t nelem, net_ifaddr_t type[], void *storage)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+    zoneid_t *zid)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+    uint64_t *flags)
+{
+	return (-1);
+}
+
+static phy_if_t
+vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy)
+{
+	return ((phy_if_t)-1);
+}
+
+static phy_if_t
+vnd_neti_phylookup(net_handle_t neti, const char *name)
+{
+	return ((phy_if_t)-1);
+}
+
+static lif_if_t
+vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+	return ((lif_if_t)-1);
+}
+
+static int
+vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
+{
+	return (-1);
+}
+
+static phy_if_t
+vnd_neti_route(net_handle_t neti, struct sockaddr *address,
+    struct sockaddr *next)
+{
+	return ((phy_if_t)-1);
+}
+
+static int
+vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp)
+{
+	return (-1);
+}
+
+static int
+vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp)
+{
+	return (-1);
+}
+
+/*
+ * Protocol description handed to net_protocol_register() for IPv4 by
+ * vnd_netinfo_init(). The initializer is positional: NETINFO_VERSION, the
+ * NHF_VND_INET identifier, and then the vnd_neti_* callbacks, every one of
+ * which is a stub that returns -1.
+ */
+static net_protocol_t vnd_neti_info_v4 = {
+	NETINFO_VERSION,
+	NHF_VND_INET,
+	vnd_neti_getifname,
+	vnd_neti_getmtu,
+	vnd_neti_getptmue,
+	vnd_neti_getlifaddr,
+	vnd_neti_getlifzone,
+	vnd_neti_getlifflags,
+	vnd_neti_phygetnext,
+	vnd_neti_phylookup,
+	vnd_neti_lifgetnext,
+	vnd_neti_inject,
+	vnd_neti_route,
+	vnd_neti_ispchksum,
+	vnd_neti_isvchksum
+};
+
+/*
+ * Protocol description handed to net_protocol_register() for IPv6 by
+ * vnd_netinfo_init(). Identical to the v4 table apart from the
+ * NHF_VND_INET6 identifier; the callbacks are the same -1-returning stubs.
+ */
+static net_protocol_t vnd_neti_info_v6 = {
+	NETINFO_VERSION,
+	NHF_VND_INET6,
+	vnd_neti_getifname,
+	vnd_neti_getmtu,
+	vnd_neti_getptmue,
+	vnd_neti_getlifaddr,
+	vnd_neti_getlifzone,
+	vnd_neti_getlifflags,
+	vnd_neti_phygetnext,
+	vnd_neti_phylookup,
+	vnd_neti_lifgetnext,
+	vnd_neti_inject,
+	vnd_neti_route,
+	vnd_neti_ispchksum,
+	vnd_neti_isvchksum
+};
+
+
+/*
+ * Register vnd's netinfo and hook state for the netstack described by nsp:
+ * the v4 and v6 protocols, the "vnd_inet" and "vnd_inet6" hook families, and
+ * an NH_PHYSICAL_IN and NH_PHYSICAL_OUT event for each family.
+ *
+ * Returns 0 on success. On failure a CE_NOTE is logged, everything that was
+ * registered before the failing step is torn down again (events are shut down
+ * and unregistered newest first, then both families, then both protocols),
+ * and 1 is returned.
+ */
+static int
+vnd_netinfo_init(vnd_pnsd_t *nsp)
+{
+	nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid,
+	    &vnd_neti_info_v4);
+	ASSERT(nsp->vpnd_neti_v4 != NULL);
+
+	nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid,
+	    &vnd_neti_info_v6);
+	ASSERT(nsp->vpnd_neti_v6 != NULL);
+
+	nsp->vpnd_family_v4.hf_version = HOOK_VERSION;
+	nsp->vpnd_family_v4.hf_name = "vnd_inet";
+
+	if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) {
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	nsp->vpnd_family_v6.hf_version = HOOK_VERSION;
+	nsp->vpnd_family_v6.hf_name = "vnd_inet6";
+
+	if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) {
+		net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	/* Inbound v4 event. */
+	nsp->vpnd_event_in_v4.he_version = HOOK_VERSION;
+	nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN;
+	nsp->vpnd_event_in_v4.he_flags = 0;
+	nsp->vpnd_event_in_v4.he_interested = B_FALSE;
+
+	nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4,
+	    &nsp->vpnd_event_in_v4);
+	if (nsp->vpnd_token_in_v4 == NULL) {
+		net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+		net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	/* Inbound v6 event. */
+	nsp->vpnd_event_in_v6.he_version = HOOK_VERSION;
+	nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN;
+	nsp->vpnd_event_in_v6.he_flags = 0;
+	nsp->vpnd_event_in_v6.he_interested = B_FALSE;
+
+	nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6,
+	    &nsp->vpnd_event_in_v6);
+	if (nsp->vpnd_token_in_v6 == NULL) {
+		net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+		net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	/* Outbound v4 event. */
+	nsp->vpnd_event_out_v4.he_version = HOOK_VERSION;
+	nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT;
+	nsp->vpnd_event_out_v4.he_flags = 0;
+	nsp->vpnd_event_out_v4.he_interested = B_FALSE;
+
+	nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4,
+	    &nsp->vpnd_event_out_v4);
+	if (nsp->vpnd_token_out_v4 == NULL) {
+		net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+		net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+		net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+		net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	/* Outbound v6 event. */
+	nsp->vpnd_event_out_v6.he_version = HOOK_VERSION;
+	nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT;
+	nsp->vpnd_event_out_v6.he_flags = 0;
+	nsp->vpnd_event_out_v6.he_interested = B_FALSE;
+
+	nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6,
+	    &nsp->vpnd_event_out_v6);
+	if (nsp->vpnd_token_out_v6 == NULL) {
+		/*
+		 * Three events are registered at this point: out_v4, in_v6
+		 * and in_v4. Each must be torn down exactly once; the v4
+		 * outbound event is the most recent, so it goes first.
+		 */
+		net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+		net_event_unregister(nsp->vpnd_neti_v4,
+		    &nsp->vpnd_event_out_v4);
+		net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+		net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+		net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+		net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+		net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+		net_protocol_unregister(nsp->vpnd_neti_v4);
+		net_protocol_unregister(nsp->vpnd_neti_v6);
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Shut down the four hook events (v4 in/out, v6 in/out) that
+ * vnd_netinfo_init() registered for nsp. Each net_event_shutdown() call is
+ * required to return 0; anything else trips the VERIFY.
+ */
+static void
+vnd_netinfo_shutdown(vnd_pnsd_t *nsp)
+{
+	int ret;
+
+	ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+	VERIFY(ret == 0);
+	ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+	VERIFY(ret == 0);
+	ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+	VERIFY(ret == 0);
+	ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+	VERIFY(ret == 0);
+}
+
+/*
+ * Undo the registrations made by vnd_netinfo_init() for nsp: unregister the
+ * four hook events first, then the two hook families, and finally the two
+ * protocols. Every step is required to return 0; anything else trips the
+ * VERIFY. This function does not call net_event_shutdown() itself.
+ */
+static void
+vnd_netinfo_fini(vnd_pnsd_t *nsp)
+{
+	int ret;
+
+	/* Events */
+	ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+	VERIFY(ret == 0);
+	ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+	VERIFY(ret == 0);
+	ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+	VERIFY(ret == 0);
+	ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+	VERIFY(ret == 0);
+	/* Families */
+	ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+	VERIFY(ret == 0);
+	ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+	VERIFY(ret == 0);
+	/* Protocols */
+	ret = net_protocol_unregister(nsp->vpnd_neti_v4);
+	VERIFY(ret == 0);
+	ret = net_protocol_unregister(nsp->vpnd_neti_v6);
+	VERIFY(ret == 0);
+}
+
+/*
+ * gsqueue callback for the barrier block that vnd_strbarrier() enqueues; arg
+ * is the vnd_str_t and bmp must be that stream's vns_barrierblk. It marks the
+ * in-flight barrier as complete by setting VNS_F_BARRIER_DONE and wakes
+ * everyone sleeping on vns_barriercv. gsp and dummy are unused.
+ */
+static void
+vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy)
+{
+	vnd_str_t *vsp = arg;
+
+	VERIFY(bmp == &vsp->vns_barrierblk);
+	mutex_enter(&vsp->vns_lock);
+	VERIFY(vsp->vns_flags & VNS_F_BARRIER);
+	VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE));
+	vsp->vns_flags |= VNS_F_BARRIER_DONE;
+
+	/*
+	 * For better or worse, we have to broadcast here as we could have a
+	 * thread that's blocked for completion as well as one that's blocked
+	 * waiting to do a barrier itself.
+	 *
+	 * The broadcast is issued while vns_lock is still held. Once the lock
+	 * is dropped the thread that requested the barrier can observe
+	 * VNS_F_BARRIER_DONE and carry on, so this callback must not touch
+	 * the vnd_str_t after releasing the lock.
+	 */
+	cv_broadcast(&vsp->vns_barriercv);
+	mutex_exit(&vsp->vns_lock);
+}
+
+/*
+ * Data barrier for a stream that is in fastpath mode. The caller blocks until
+ * a marker block pushed through the stream's squeue has been processed, which
+ * ensures that there is nothing else ahead of it in the squeue. Only one
+ * barrier may be outstanding per stream; later callers wait for their turn.
+ */
+static void
+vnd_strbarrier(vnd_str_t *vsp)
+{
+	/* Wait for any barrier already in flight, then claim the slot. */
+	mutex_enter(&vsp->vns_lock);
+	while ((vsp->vns_flags & VNS_F_BARRIER) != 0)
+		cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+	vsp->vns_flags |= VNS_F_BARRIER;
+	mutex_exit(&vsp->vns_lock);
+
+	/* vnd_strbarrier_cb() sets VNS_F_BARRIER_DONE when this is reached. */
+	gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
+	    vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER);
+
+	/* Sleep until the callback has run, then release the slot. */
+	mutex_enter(&vsp->vns_lock);
+	while ((vsp->vns_flags & VNS_F_BARRIER_DONE) == 0)
+		cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+	vsp->vns_flags &= ~(VNS_F_BARRIER | VNS_F_BARRIER_DONE);
+	mutex_exit(&vsp->vns_lock);
+
+	/*
+	 * Other threads may be queued up waiting to start a barrier of their
+	 * own, so wake all of them rather than just one.
+	 */
+	cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * Read-side put procedure. What we do depends on the message type:
+ *
+ *  - M_DATA is dropped.
+ *  - M_PROTO and M_PCPROTO are DLPI control traffic. Messages too short to
+ *    hold a primitive and DL_UNITDATA_REQ/DL_UNITDATA_IND are dropped; all
+ *    others are queued for the state machine, which is kicked off on the
+ *    taskq unless it is already dispatched.
+ *  - Everything else is passed along with putnext().
+ *
+ * Always returns 0.
+ */
+static int
+vnd_s_rput(queue_t *q, mblk_t *mp)
+{
+	vnd_str_t *vsp = q->q_ptr;
+	boolean_t need_dispatch = B_FALSE;
+	t_uscalar_t prim;
+
+	switch (DB_TYPE(mp)) {
+	case M_DATA:
+		vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
+		return (0);
+	case M_PROTO:
+	case M_PCPROTO:
+		break;
+	default:
+		putnext(vsp->vns_rq, mp);
+		return (0);
+	}
+
+	if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+		vnd_drop_ctl(vsp, mp, "PROTO message too short");
+		return (0);
+	}
+
+	prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
+	if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
+		vnd_drop_ctl(vsp, mp,
+		    "recieved an unsupported dlpi DATA req");
+		return (0);
+	}
+
+	/*
+	 * Queue the message up. Only the thread that flips
+	 * VNS_F_TASKQ_DISPATCHED from clear to set issues the dispatch, and
+	 * it does so after dropping vns_lock.
+	 */
+	mutex_enter(&vsp->vns_lock);
+	vnd_dlpi_inc_push(vsp, mp);
+	if ((vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) == 0) {
+		vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+		need_dispatch = B_TRUE;
+	}
+	mutex_exit(&vsp->vns_lock);
+
+	if (need_dispatch == B_TRUE) {
+		taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
+		    vsp, 0, &vsp->vns_tqe);
+	}
+
+	return (0);
+}
+
+/*
+ * Handle an M_IOCTL arriving on the write side. The only request accepted is
+ * a transparent VND_STRIOC_ASSOCIATE issued with kcred; for that we allocate
+ * a vnd_strioc_t to remember the user address and reply with an M_COPYIN for
+ * the vnd_strioc_associate_t. Anything else is answered with an M_IOCNAK:
+ *
+ *   EINVAL - wrong command or not a transparent ioctl
+ *   EPERM  - the ioctl was not issued with kcred
+ *   EAGAIN - there is no b_cont carrying the user address
+ */
+static void
+vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
+{
+	int error = 0;
+	vnd_strioc_t *visp;
+
+	if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
+	    iocp->ioc_count != TRANSPARENT) {
+		error = EINVAL;
+	} else if (iocp->ioc_cr != kcred) {
+		/*
+		 * Every streams ioctl we support must come with kcred. That
+		 * is how we tell a layered open by the kernel apart from a
+		 * user who has done an I_PUSH of the module.
+		 */
+		error = EPERM;
+	} else if (mp->b_cont == NULL) {
+		error = EAGAIN;
+	}
+
+	if (error != 0) {
+		if (mp->b_cont != NULL) {
+			freemsg(mp->b_cont);
+			mp->b_cont = NULL;
+		}
+
+		iocp->ioc_error = error;
+		mp->b_datap->db_type = M_IOCNAK;
+		iocp->ioc_count = 0;
+		qreply(q, mp);
+		return;
+	}
+
+	/* Stash the user address; it is needed again for the copyout. */
+	visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
+	ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
+	visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
+	visp->vs_state = VSS_COPYIN;
+
+	mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
+	qreply(q, mp);
+}
+
+/*
+ * Handle the M_IOCDATA that answers the M_COPYIN issued by vnd_strioctl().
+ * csp is the copyresp at the head of mp and its cp_private is the
+ * vnd_strioc_t allocated by vnd_strioctl().
+ *
+ * Given a well-formed vnd_strioc_associate_t we look up the vnd_dev_t and the
+ * vnd_pnsd_t that it names, kick off the state machine on the taskq and block
+ * until the stream is either VNS_S_ONLINE or VNS_S_ZOMBIE. Once online we
+ * create the kstat, tie the device and stream together, switch the input
+ * function over with vnd_dld_cap_enable() and install the kstat.
+ *
+ * The three early checks below free the vnd_strioc_t and return without a
+ * copyout. Every later outcome, success or failure, ends at the nak label
+ * where the vnd_strioc_associate_t is sent back to the caller with an
+ * M_COPYOUT; the result is reported through its vsa_errno member.
+ */
+static void
+vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+	/* NOTE(review): error is assigned below but never read here. */
+	int error;
+	vnd_str_state_t state;
+	struct copyreq *crp;
+	vnd_strioc_associate_t *vss;
+	vnd_dev_t *vdp = NULL;
+	vnd_pnsd_t *nsp = NULL;
+	char iname[2*VND_NAMELEN];
+	zone_t *zone;
+	vnd_strioc_t *visp;
+
+	visp = (vnd_strioc_t *)csp->cp_private;
+
+	/* If it's not ours, it's not our problem */
+	if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+		if (q->q_next != NULL) {
+			putnext(q, mp);
+		} else {
+			VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+			VND_STAT_INC(vsp, vks_tdrops, 1);
+			vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+		}
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		return;
+	}
+
+	/* The nak is already sent for us */
+	if (csp->cp_rval != 0) {
+		vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		return;
+	}
+
+	/* Data is sitting for us in b_cont */
+	if (mp->b_cont == NULL ||
+	    MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		miocnak(q, mp, 0, EINVAL);
+		return;
+	}
+
+	/* Both lookups return a reference that is released at nak. */
+	vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
+	vdp = vnd_dev_lookup(vss->vsa_minor);
+	if (vdp == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_NODEV;
+		goto nak;
+	}
+
+	nsp = vnd_nsd_lookup(vss->vsa_nsid);
+	if (nsp == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_NONETSTACK;
+		goto nak;
+	}
+
+	/* A stream may only be associated once; NEED_ZONE is our marker. */
+	mutex_enter(&vsp->vns_lock);
+	if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
+		mutex_exit(&vsp->vns_lock);
+		error = EEXIST;
+		vss->vsa_errno = VND_E_ASSOCIATED;
+		goto nak;
+	}
+
+	vsp->vns_nsd = nsp;
+	vsp->vns_flags &= ~VNS_F_NEED_ZONE;
+	vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+	mutex_exit(&vsp->vns_lock);
+
+	taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0,
+	    &vsp->vns_tqe);
+
+
+	/* At this point we need to wait until we have transitioned to ONLINE */
+	mutex_enter(&vsp->vns_lock);
+	while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE)
+		cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+	state = vsp->vns_state;
+	mutex_exit(&vsp->vns_lock);
+
+	if (state == VNS_S_ZOMBIE) {
+		/* NOTE(review): vns_errno is read without vns_lock held. */
+		vss->vsa_errno = vsp->vns_errno;
+		error = EIO;
+		goto nak;
+	}
+
+	/* Lock order: the device's vdd_lock first, then the stream's lock. */
+	mutex_enter(&vdp->vdd_lock);
+	mutex_enter(&vsp->vns_lock);
+	VERIFY(vdp->vdd_str == NULL);
+	/*
+	 * Now initialize the remaining kstat properties and let's go ahead and
+	 * create it.
+	 */
+	(void) snprintf(iname, sizeof (iname), "z%d_%d",
+	    vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor);
+	vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net",
+	    KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+	if (vsp->vns_kstat == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_KSTATCREATE;
+		mutex_exit(&vsp->vns_lock);
+		mutex_exit(&vdp->vdd_lock);
+		goto nak;
+	}
+	vdp->vdd_str = vsp;
+	vsp->vns_dev = vdp;
+
+	/*
+	 * Now, it's time to do the last thing that can fail, changing out the
+	 * input function. After this we know that we can receive data, so we
+	 * should make sure that we're ready.
+	 */
+	if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) {
+		error = EIO;
+		vss->vsa_errno = VND_E_DIRECTFAIL;
+		/* Undo the device/stream linkage made just above. */
+		vdp->vdd_str = NULL;
+		vsp->vns_dev = NULL;
+		mutex_exit(&vsp->vns_lock);
+		mutex_exit(&vdp->vdd_lock);
+		goto nak;
+	}
+
+	zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid);
+	ASSERT(zone != NULL);
+	vsp->vns_kstat->ks_data = &vsp->vns_ksdata;
+	/* Account for zone name */
+	vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1;
+	/* Account for eventual link name */
+	vsp->vns_kstat->ks_data_size += VND_NAMELEN;
+	kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name);
+	kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+	    vdp->vdd_lname);
+	zone_rele(zone);
+	kstat_install(vsp->vns_kstat);
+
+	mutex_exit(&vsp->vns_lock);
+	mutex_exit(&vdp->vdd_lock);
+
+	/*
+	 * Note that the vnd_str_t does not keep a permanent hold on the
+	 * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what
+	 * the nestack goes through to take care of everything.
+	 */
+	vss->vsa_errno = VND_E_SUCCESS;
+nak:
+	/* Reached on success as well; drop the lookup references. */
+	if (vdp != NULL)
+		vnd_dev_rele(vdp);
+	if (nsp != NULL)
+		vnd_nsd_rele(nsp);
+	/*
+	 * Change the copyin request to a copyout. Note that we can't use
+	 * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's
+	 * okay, as the copyin vs. copyout is basically the same.
+	 */
+	DB_TYPE(mp) = M_COPYOUT;
+	visp->vs_state = VSS_COPYOUT;
+	crp = (struct copyreq *)mp->b_rptr;
+	crp->cq_private = (void *)visp;
+	crp->cq_addr = visp->vs_addr;
+	crp->cq_size = sizeof (vnd_strioc_associate_t);
+	qreply(q, mp);
+}
+
+/*
+ * Handle the M_IOCDATA that answers our M_COPYOUT. The vnd_strioc_t carried
+ * in cp_private has done its job by now and is always freed. A successful
+ * VND_STRIOC_ASSOCIATE copyout is acked and a failed one is dropped. A
+ * message for any other command is passed downstream when there is a next
+ * queue and is otherwise counted and dropped.
+ */
+static void
+vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+	ASSERT(csp->cp_private != NULL);
+	kmem_free(csp->cp_private, sizeof (vnd_strioc_t));
+
+	if (csp->cp_cmd == VND_STRIOC_ASSOCIATE) {
+		if (csp->cp_rval == 0) {
+			/* Ack and let's be done with it all */
+			miocack(q, mp, 0, 0);
+		} else {
+			/* The nak is already sent for us */
+			vnd_drop_ctl(vsp, mp, "M_COPYOUT failed");
+		}
+		return;
+	}
+
+	/* Not one of ours. */
+	if (q->q_next == NULL) {
+		VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+		VND_STAT_INC(vsp, vks_tdrops, 1);
+		vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+		return;
+	}
+
+	putnext(q, mp);
+}
+
+/*
+ * Write-side put procedure. M_IOCTL goes to vnd_strioctl(). M_IOCDATA is
+ * routed by the vs_state recorded in the vnd_strioc_t hanging off
+ * cp_private: VSS_COPYIN to vnd_striocdata(), otherwise to vnd_stroutdata().
+ * Any other message type is passed downstream when there is a next queue and
+ * dropped when there is not. Always returns 0.
+ */
+static int
+vnd_s_wput(queue_t *q, mblk_t *mp)
+{
+	vnd_str_t *vsp = q->q_ptr;
+	struct copyresp *crp;
+	vnd_strioc_state_t vstate;
+
+	switch (DB_TYPE(mp)) {
+	case M_IOCTL:
+		vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr);
+		break;
+	case M_IOCDATA:
+		crp = (struct copyresp *)mp->b_rptr;
+		ASSERT(crp->cp_private != NULL);
+		vstate = ((vnd_strioc_t *)crp->cp_private)->vs_state;
+		ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT);
+		if (vstate == VSS_COPYIN)
+			vnd_striocdata(q, vsp, mp, crp);
+		else
+			vnd_stroutdata(q, vsp, mp, crp);
+		break;
+	default:
+		if (q->q_next != NULL)
+			putnext(q, mp);
+		else
+			vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput");
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * STREAMS open entry point for the vnd module. It allocates and initializes
+ * the per-stream vnd_str_t, picks an squeue for it, prepares the named kstat
+ * data and turns the queue's procedures on.
+ *
+ * Returns 0 on success, or:
+ *   EINVAL - the queue already has private data attached
+ *   ENXIO  - this is not a module open (MODOPEN is not set in sflag)
+ *   EPERM  - the open was not performed with kcred
+ */
+static int
+vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp)
+{
+	vnd_str_t *vsp;
+	/*
+	 * The result of random_get_pseudo_bytes() is deliberately ignored
+	 * below, so give rand a defined value in case that call fails to
+	 * fill it in.
+	 */
+	uint_t rand = 0;
+
+	if (q->q_ptr != NULL)
+		return (EINVAL);
+
+	if (!(sflag & MODOPEN))
+		return (ENXIO);
+
+	if (credp != kcred)
+		return (EPERM);
+
+	vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP);
+	bzero(vsp, sizeof (*vsp));
+	mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL);
+	cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL);
+	cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL);
+	vsp->vns_state = VNS_S_INITIAL;
+
+	mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+	/* The default data queue size is read under vnd_dev_lock. */
+	mutex_enter(&vnd_dev_lock);
+	vsp->vns_dq_read.vdq_max = vnd_vdq_default_size;
+	vsp->vns_dq_read.vdq_vns = vsp;
+	vsp->vns_dq_write.vdq_max = vnd_vdq_default_size;
+	vsp->vns_dq_write.vdq_vns = vsp;
+	mutex_exit(&vnd_dev_lock);
+	vsp->vns_rq = q;
+	vsp->vns_wq = WR(q);
+	q->q_ptr = WR(q)->q_ptr = vsp;
+	/* The stream has to be associated (VND_STRIOC_ASSOCIATE) before use. */
+	vsp->vns_flags = VNS_F_NEED_ZONE;
+	vsp->vns_nflush = vnd_flush_nburst;
+	vsp->vns_bsize = vnd_flush_burst_size;
+
+	/* Pick the squeue for this stream using a pseudo-random value. */
+	(void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
+	vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand);
+
+	/*
+	 * We create our kstat and initialize all of its fields now, but we
+	 * don't install it until we actually do the zone association so we can
+	 * get everything.
+	 */
+	kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname",
+	    KSTAT_DATA_STRING);
+	kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename",
+	    KSTAT_DATA_STRING);
+	kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms,
+	    "flowcontrol_100ms", KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s",
+	    KSTAT_DATA_UINT64);
+	qprocson(q);
+	/*
+	 * Now that we've called qprocson, grab the lower module for making sure
+	 * that we don't have any pass through modules.
+	 */
+	vsp->vns_lrq = RD(vsp->vns_wq->q_next);
+
+	return (0);
+}
+
+/*
+ * STREAMS close entry point for the vnd module. It flushes the data queues,
+ * waits for the squeue to drain, drives the state machine from
+ * VNS_S_SHUTTING_DOWN to VNS_S_ZOMBIE, waits for the taskq worker to finish,
+ * and then destroys and frees the vnd_str_t. Always returns 0.
+ */
+static int
+vnd_s_close(queue_t *q, int flag, cred_t *credp)
+{
+	vnd_str_t *vsp;
+	mblk_t *mp;
+
+	VERIFY(WR(q)->q_next != NULL);
+
+	vsp = q->q_ptr;
+	ASSERT(vsp != NULL);
+
+	/*
+	 * We need to transition ourselves down.  This means that we have a few
+	 * important different things to do in the process of tearing down our
+	 * input and output buffers, making sure we've drained the current
+	 * squeue, and disabling the fast path. Before we disable the fast path,
+	 * we should make sure the squeue is drained. Because we're in streams
+	 * close, we know that no packets can come into us from userland, but we
+	 * can receive more. As such, the following is the exact order of things
+	 * that we do:
+	 *
+	 * 1) flush the vns_dq_read
+	 * 2) Insert the drain mblk
+	 * 3) When it's been received, tear down the fast path by kicking
+	 * off the state machine.
+	 * 4) One final flush of both the vns_dq_read,vns_dq_write
+	 */
+
+	vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+	vnd_strbarrier(vsp);
+	mutex_enter(&vsp->vns_lock);
+	vsp->vns_state = VNS_S_SHUTTING_DOWN;
+	/* Start the state machine unless a worker is already dispatched. */
+	if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+		vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+		taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp,
+		    0, &vsp->vns_tqe);
+	}
+	while (vsp->vns_state != VNS_S_ZOMBIE)
+		cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+	mutex_exit(&vsp->vns_lock);
+
+	qprocsoff(q);
+	/*
+	 * Mark the stream condemned and wait until the taskq worker has
+	 * cleared VNS_F_TASKQ_DISPATCHED; it signals vns_cancelcv when it
+	 * sees VNS_F_CONDEMNED.
+	 */
+	mutex_enter(&vsp->vns_lock);
+	vsp->vns_flags |= VNS_F_CONDEMNED;
+	while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)
+		cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock);
+
+	/* Drop any DLPI messages that are still queued. */
+	while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+		vnd_drop_ctl(vsp, mp, "vnd_s_close");
+	mutex_exit(&vsp->vns_lock);
+
+	q->q_ptr = NULL;
+	vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+	vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out);
+	mutex_destroy(&vsp->vns_dq_read.vdq_lock);
+	mutex_destroy(&vsp->vns_dq_write.vdq_lock);
+
+	/* vns_kstat is only non-NULL if association got as far as creating it. */
+	if (vsp->vns_kstat != NULL)
+		kstat_delete(vsp->vns_kstat);
+	mutex_destroy(&vsp->vns_lock);
+	cv_destroy(&vsp->vns_stcv);
+	cv_destroy(&vsp->vns_barriercv);
+	cv_destroy(&vsp->vns_cancelcv);
+	kmem_cache_free(vnd_str_cache, vsp);
+
+	return (0);
+}
+
+/*
+ * Transmit a single message through the stream's vsc_tx_f function and count
+ * it in the opackets and obytes statistics.
+ *
+ * Returns NULL when the caller may keep transmitting: either vsc_tx_f gave
+ * back no cookie, vsc_is_fc_f reported that the cookie is not flow control,
+ * or a flow control update newer than this transmit was already recorded.
+ * Otherwise the stream is marked VNS_F_FLOW_CONTROLLED, the cookie is saved
+ * in vns_caps.vsc_fc_cookie, and the cookie is returned.
+ */
+static vnd_mac_cookie_t
+vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp)
+{
+	hrtime_t txtime;
+	vnd_mac_cookie_t vc;
+
+	VND_STAT_INC(vsp, vks_opackets, 1);
+	VND_STAT_INC(vsp, vks_obytes, msgsize(mp));
+	DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL,
+	    vnd_str_t *, vsp, mblk_t *, mp);
+	/* Actually tx now */
+	txtime = gethrtime();
+	vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl,
+	    mp, 0, MAC_DROP_ON_NO_DESC);
+
+	/*
+	 * We need to check two different conditions before we immediately set
+	 * the flow control lock. The first thing that we need to do is verify
+	 * that this is an instance of hard flow control, so to say. The flow
+	 * control callbacks won't always fire in cases where we still get a
+	 * cookie returned. The explicit check for flow control will guarantee
+	 * us that we'll get a subsequent notification callback.
+	 *
+	 * The second case comes about because we do not hold the
+	 * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow
+	 * control notification already came across for us in a different thread
+	 * calling vnd_mac_flow_control(). To deal with this, we record a
+	 * timestamp every time that we change the flow control state. We grab
+	 * txtime here before we transmit because that guarantees that the
+	 * hrtime_t of the call to vnd_mac_flow_control() will be after txtime.
+	 *
+	 * If the flow control notification beat us to the punch, the value of
+	 * vns_fcupdate will be larger than the value of txtime, and we should
+	 * just record the statistics. However, if we didn't beat it to the
+	 * punch (txtime > vns_fcupdate), then we know that it's safe to wait
+	 * for a notification.
+	 */
+	if (vc != NULL) {
+		hrtime_t diff;
+
+		/* First condition: is this really flow control? */
+		if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl,
+		    vc) == 0)
+			return (NULL);
+		/* Second condition: did a notification already arrive? */
+		mutex_enter(&vsp->vns_lock);
+		diff = vsp->vns_fcupdate - txtime;
+		if (diff > 0) {
+			mutex_exit(&vsp->vns_lock);
+			vnd_mac_flow_control_stat(vsp, diff);
+			return (NULL);
+		}
+		vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
+		vsp->vns_caps.vsc_fc_cookie = vc;
+		vsp->vns_fclatch = txtime;
+		vsp->vns_fcupdate = txtime;
+		DTRACE_VND3(flow__blocked, vnd_str_t *, vsp,
+		    uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc);
+		mutex_exit(&vsp->vns_lock);
+	}
+
+	return (vc);
+}
+
+/*
+ * Push queued outbound data from vns_dq_write to the transmit function. arg
+ * is the vnd_str_t. drain_mp is the stream's vns_drainblk when we are entered
+ * from the squeue and NULL when vnd_squeue_tx_append() calls us directly. gsp
+ * and dummy are unused.
+ *
+ * A single pass transmits at most vns_nflush messages and stops once more
+ * than vns_bsize bytes have gone out or the transmit path reports that it is
+ * blocked. If data remains and we are not blocked, another pass is scheduled
+ * through the squeue using the drain block.
+ */
+static void
+vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
+{
+	vnd_str_t *vsp = arg;
+	vnd_data_queue_t *vqp = &vsp->vns_dq_write;
+	boolean_t blocked = B_FALSE;
+	boolean_t empty;
+	size_t mptot = 0;
+	size_t nflush, bsize;
+	int nmps = 0;
+	mblk_t *mp;
+
+	mutex_enter(&vsp->vns_lock);
+
+	/*
+	 * A non-NULL drain_mp means the squeue delivered the drain block to
+	 * us, so the block is no longer in use and may be scheduled again.
+	 */
+	if (drain_mp != NULL) {
+		VERIFY(drain_mp == &vsp->vns_drainblk);
+		VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED);
+		vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED;
+	}
+
+	/* Nothing to do while flow controlled or under a flush barrier. */
+	if ((vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) != 0) {
+		mutex_exit(&vsp->vns_lock);
+		return;
+	}
+
+	/* Snapshot the burst limits; transmission happens without vns_lock. */
+	nflush = vsp->vns_nflush;
+	bsize = vsp->vns_bsize;
+	mutex_exit(&vsp->vns_lock);
+
+	while (nmps < nflush && mptot <= bsize) {
+		mutex_enter(&vqp->vdq_lock);
+		if (vnd_dq_pop(vqp, &mp) == 0) {
+			mutex_exit(&vqp->vdq_lock);
+			break;
+		}
+		mutex_exit(&vqp->vdq_lock);
+
+		nmps++;
+		mptot += msgsize(mp);
+		if (vnd_squeue_tx_one(vsp, mp) != NULL) {
+			blocked = B_TRUE;
+			break;
+		}
+	}
+
+	empty = vnd_dq_is_empty(vqp);
+
+	/*
+	 * More data is waiting and nothing is holding us back. Unless a
+	 * drain is already scheduled, hand the drain block to the squeue with
+	 * GSQUEUE_FILL so that we get called again.
+	 */
+	if (blocked == B_FALSE && empty == B_FALSE) {
+		mutex_enter(&vsp->vns_lock);
+		if ((vsp->vns_flags & VNS_F_DRAIN_SCHEDULED) == 0) {
+			vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+			gsqueue_enter_one(vsp->vns_squeue,
+			    &vsp->vns_drainblk, vnd_squeue_tx_drain, vsp,
+			    GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
+		}
+		mutex_exit(&vsp->vns_lock);
+	}
+
+	/* Having removed data from the queue, wake writers and pollers. */
+	if (nmps > 0) {
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT);
+	}
+}
+
+/*
+ * squeue callback that queues one outbound message on vns_dq_write and then
+ * runs a drain pass. arg is the vnd_str_t and mp is the message, for which
+ * space in the data queue was reserved earlier. gsp and dummy are unused.
+ */
+static void
+vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
+{
+	vnd_str_t *vsp = arg;
+	vnd_pnsd_t *nsp = vsp->vns_nsd;
+	vnd_data_queue_t *vqp = &vsp->vns_dq_write;
+	/* Taken up front: mp is likely gone if the hooks reject it. */
+	size_t len = msgsize(mp);
+
+	/*
+	 * The packet has to get past the firewall hooks before it may be
+	 * appended. If it doesn't, give back the space that was reserved for
+	 * it. The length was captured above because the message block will
+	 * likely have been freed by the time vnd_hook returns.
+	 */
+	if (nsp->vpnd_hooked) {
+		if (vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+		    nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4,
+		    nsp->vpnd_neti_v6, nsp->vpnd_event_out_v6,
+		    nsp->vpnd_token_out_v6, vnd_drop_hook_out,
+		    vnd_drop_out) != 0) {
+			mutex_enter(&vqp->vdq_lock);
+			vnd_dq_unreserve(vqp, len);
+			mutex_exit(&vqp->vdq_lock);
+			return;
+		}
+	}
+
+	/*
+	 * Space is already reserved, so just append and drain. The squeue
+	 * ensures that no other drain is running at the same time.
+	 */
+	mutex_enter(&vqp->vdq_lock);
+	(void) vnd_dq_push(vqp, mp, B_TRUE, vnd_drop_panic);
+	mutex_exit(&vqp->vdq_lock);
+
+	vnd_squeue_tx_drain(vsp, NULL, NULL, NULL);
+}
+
+/*
+ * We need to see if this is a valid name of sorts for us. That means a few
+ * things. First off, we can't assume that what we've been given has actually
+ * been null terminated. More importantly, that it's a valid name as far as
+ * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We
+ * further constrain ourselves to simply alphanumeric characters and a few
+ * additional ones, ':', '-', and '_'.
+ *
+ * buf is the candidate name and buflen the size of the buffer holding it.
+ * Returns 1 if buf contains a non-empty, null-terminated string made up only
+ * of the permitted characters, and 0 otherwise, which includes the case
+ * where no terminator is found within buflen bytes.
+ */
+static int
+vnd_validate_name(const char *buf, size_t buflen)
+{
+	size_t i;
+
+	/*
+	 * One pass is enough. Every byte ahead of the terminator must be a
+	 * permitted character, and the terminator must show up before we run
+	 * out of buffer. The index is a size_t to match buflen, and bytes are
+	 * examined as unsigned char so that isalnum() is never handed a
+	 * negative value.
+	 */
+	for (i = 0; i < buflen; i++) {
+		unsigned char c = (unsigned char)buf[i];
+
+		if (c == '\0')
+			return (i == 0 ? 0 : 1);
+
+		if (!isalnum(c) && c != ':' && c != '-' && c != '_')
+			return (0);
+	}
+
+	/* Ran off the end of the buffer without finding a terminator. */
+	return (0);
+}
+
+/*
+ * VND_IOC_ATTACH handler: bind the vnd device vdp to a datalink.
+ *
+ * The caller must pass both secpolicy_net_config() and
+ * secpolicy_net_rawaccess(), otherwise EPERM is returned. The
+ * vnd_ioc_attach_t at arg is copied in, the requested name is validated and
+ * the target zone is resolved (only a caller in the global zone may name a
+ * zone other than its own). We then open the corresponding /dev/net path via
+ * LDI, push the "vnd" STREAMS module on it and issue VND_STRIOC_ASSOCIATE so
+ * that the stream is tied to this minor.
+ *
+ * Returns 0 on success. On every failure after the copyin, the structure is
+ * copied back out so the caller can read via_errno; the return value is the
+ * errno-style code (EIO for failures described by via_errno, EFAULT if the
+ * copyout itself fails).
+ */
+static int
+vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag)
+{
+	vnd_ioc_attach_t via;
+	vnd_strioc_associate_t vss;
+	vnd_pnsd_t *nsp;
+	zone_t *zonep;
+	zoneid_t zid;
+	char buf[2*VND_NAMELEN];
+	int ret, rp;
+
+	if (secpolicy_net_config(credp, B_FALSE) != 0)
+		return (EPERM);
+
+	if (secpolicy_net_rawaccess(credp) != 0)
+		return (EPERM);
+
+	if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0)
+		return (EFAULT);
+	via.via_errno = VND_E_SUCCESS;
+
+	if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) {
+		via.via_errno = VND_E_BADNAME;
+		ret = EIO;
+		goto errcopyout;
+	}
+
+	/*
+	 * Only the global zone can request to create a device in a different
+	 * zone.
+	 */
+	zid = crgetzoneid(credp);
+	if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 &&
+	    zid != via.via_zoneid) {
+		via.via_errno = VND_E_PERM;
+		ret = EIO;
+		goto errcopyout;
+	}
+
+	/* A zone id of -1 means "the caller's own zone". */
+	if (via.via_zoneid == -1)
+		via.via_zoneid = zid;
+
+	/*
+	 * Establish the name we'll use now. We want to be extra paranoid about
+	 * the device we're opening so check that now.
+	 */
+	if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) {
+		zonep = zone_find_by_id(via.via_zoneid);
+		if (zonep == NULL) {
+			via.via_errno = VND_E_NOZONE;
+			ret = EIO;
+			goto errcopyout;
+		}
+		/* Measure first so that a truncated path is never opened. */
+		if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name,
+		    via.via_name) >= sizeof (buf)) {
+			zone_rele(zonep);
+			via.via_errno = VND_E_BADNAME;
+			ret = EIO;
+			goto errcopyout;
+		}
+		(void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s",
+		    zonep->zone_name, via.via_name);
+		zone_rele(zonep);
+		zonep = NULL;
+	} else {
+		if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >=
+		    sizeof (buf)) {
+			via.via_errno = VND_E_BADNAME;
+			ret = EIO;
+			goto errcopyout;
+		}
+		(void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name);
+	}
+
+	/*
+	 * If our zone is dying then the netstack will have been removed from
+	 * this list.
+	 */
+	nsp = vnd_nsd_lookup_by_zid(via.via_zoneid);
+	if (nsp == NULL) {
+		via.via_errno = VND_E_NOZONE;
+		ret = EIO;
+		goto errcopyout;
+	}
+
+	/*
+	 * Note we set the attached handle even though we haven't actually
+	 * finished the process of attaching the ldi handle.
+	 */
+	mutex_enter(&vdp->vdd_lock);
+	if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_nsd_rele(nsp);
+		via.via_errno = VND_E_ATTACHED;
+		ret = EIO;
+		goto errcopyout;
+	}
+	vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
+	ASSERT(vdp->vdd_cr == NULL);
+	crhold(credp);
+	vdp->vdd_cr = credp;
+	ASSERT(vdp->vdd_nsd == NULL);
+	vdp->vdd_nsd = nsp;
+	mutex_exit(&vdp->vdd_lock);
+
+	/*
+	 * Place an additional hold on the vnd_pnsd_t as we go through and do
+	 * all of the rest of our work. This will be the hold that we keep for
+	 * as long as this thing is attached.
+	 */
+	vnd_nsd_ref(nsp);
+
+	ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr,
+	    &vdp->vdd_ldih, vdp->vdd_ldiid);
+	if (ret != 0) {
+		/*
+		 * Only ENODEV is translated; for any other failure via_errno
+		 * stays VND_E_SUCCESS and the caller just sees ret.
+		 */
+		if (ret == ENODEV)
+			via.via_errno = VND_E_NODATALINK;
+		goto err;
+	}
+
+	/*
+	 * Unfortunately the I_PUSH interface doesn't allow us a way to detect
+	 * whether or not we're coming in from a layered device. We really want
+	 * to make sure that a normal user can't push on our streams module.
+	 * Currently the only idea I have for this is to make sure that the
+	 * credp is kcred which is really terrible.
+	 */
+	ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL,
+	    kcred, &rp);
+	if (ret != 0) {
+		rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		VERIFY(rp == 0);
+		via.via_errno = VND_E_STRINIT;
+		ret = EIO;
+		goto err;
+	}
+
+	vss.vsa_minor = vdp->vdd_minor;
+	vss.vsa_nsid = nsp->vpnd_nsid;
+
+	/*
+	 * The associate ioctl can fail in two ways: the ioctl itself returns
+	 * an error, or it succeeds but reports a problem through vsa_errno.
+	 */
+	ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss,
+	    FKIOCTL, kcred, &rp);
+	if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) {
+		rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		VERIFY(rp == 0);
+		if (ret == 0) {
+			via.via_errno = vss.vsa_errno;
+			ret = EIO;
+		}
+		goto err;
+	}
+
+	mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+
+	/*
+	 * There's a chance that our netstack was condemned while we've had a
+	 * hold on it. As such we need to check and if so, error out.
+	 */
+	if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) {
+		mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+		rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		VERIFY(rp == 0);
+		ret = EIO;
+		via.via_errno = VND_E_NOZONE;
+		goto err;
+	}
+
+	/*
+	 * Commit: flip INFLIGHT to ATTACHED, remember the datalink name and
+	 * put the device on the netstack's list, all with the netstack lock
+	 * held (vpnd_lock is taken before vdd_lock).
+	 */
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_str != NULL);
+	vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+	vdp->vdd_flags |= VND_D_ATTACHED;
+	(void) strlcpy(vdp->vdd_datalink, via.via_name,
+	    sizeof (vdp->vdd_datalink));
+	list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+	/* Drop the lookup hold; the vnd_nsd_ref() hold stays with the device. */
+	vnd_nsd_rele(nsp);
+
+	return (0);
+
+err:
+	/* Undo everything recorded on vdp while the attach was in flight. */
+	mutex_enter(&vdp->vdd_lock);
+	vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+	crfree(vdp->vdd_cr);
+	vdp->vdd_cr = NULL;
+	vdp->vdd_nsd = NULL;
+	mutex_exit(&vdp->vdd_lock);
+
+	/*
+	 * We have two holds to drop here. One for our original reference and
+	 * one for the hold this operation would have represented.
+	 */
+	vnd_nsd_rele(nsp);
+	vnd_nsd_rele(nsp);
+errcopyout:
+	if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0)
+		ret = EFAULT;
+
+	return (ret);
+}
+
+/*
+ * VND_IOC_LINK handler: give an attached vnd device a name in the file system
+ * name space by creating a minor node called "z<zoneid>:<name>".
+ *
+ * The caller must pass secpolicy_net_config(). The requested name has to
+ * validate, must not be one of vnd_reserved_names and must not already be in
+ * use by another linked device in the same zone. The device has to be attached,
+ * its zone must not be dying, and it must not already be linked (or have a
+ * link in flight).
+ *
+ * Returns 0 on success, in which case an extra reference is taken on vdp to
+ * represent the link. On failure the vnd_ioc_link_t is copied back out with
+ * vil_errno describing the problem and EIO is returned (EFAULT if the copyout
+ * fails).
+ */
+static int
+vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+	int ret = 0;
+	vnd_ioc_link_t vil;
+	char mname[2*VND_NAMELEN];
+	char **c;
+	vnd_dev_t *v;
+	zoneid_t zid;
+
+	/* Not anyone can link something */
+	if (secpolicy_net_config(credp, B_FALSE) != 0)
+		return (EPERM);
+
+	if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0)
+		return (EFAULT);
+
+	if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) {
+		ret = EIO;
+		vil.vil_errno = VND_E_BADNAME;
+		goto errcopyout;
+	}
+
+	/* vnd_reserved_names is a NULL-terminated array of strings. */
+	c = vnd_reserved_names;
+	while (*c != NULL) {
+		if (strcmp(vil.vil_name, *c) == 0) {
+			ret = EIO;
+			vil.vil_errno = VND_E_BADNAME;
+			goto errcopyout;
+		}
+		c++;
+	}
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		ret = EIO;
+		vil.vil_errno = VND_E_NOTATTACHED;
+		goto errcopyout;
+	}
+
+	if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+		mutex_exit(&vdp->vdd_lock);
+		ret = EIO;
+		vil.vil_errno = VND_E_NOZONE;
+		goto errcopyout;
+	}
+
+	if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) {
+		mutex_exit(&vdp->vdd_lock);
+		ret = EIO;
+		vil.vil_errno = VND_E_LINKED;
+		goto errcopyout;
+	}
+
+	/*
+	 * From here on VND_D_LINK_INFLIGHT is set, so every failure has to
+	 * leave through the error label, which clears it again.
+	 */
+	vdp->vdd_flags |= VND_D_LINK_INFLIGHT;
+	zid = vdp->vdd_nsd->vpnd_zid;
+	mutex_exit(&vdp->vdd_lock);
+
+	if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >=
+	    sizeof (mname)) {
+		ret = EIO;
+		vil.vil_errno = VND_E_BADNAME;
+		/*
+		 * This must not skip the INFLIGHT cleanup, otherwise the
+		 * device could never be linked again.
+		 */
+		goto error;
+	}
+
+	/* Refuse a name already used by a linked device in the same zone. */
+	mutex_enter(&vnd_dev_lock);
+	for (v = list_head(&vnd_dev_list); v != NULL;
+	    v = list_next(&vnd_dev_list, v)) {
+		if (!(v->vdd_flags & VND_D_LINKED))
+			continue;
+
+		if (v->vdd_nsd->vpnd_zid == zid &&
+		    strcmp(v->vdd_lname, vil.vil_name) == 0) {
+			mutex_exit(&vnd_dev_lock);
+			ret = EIO;
+			vil.vil_errno = VND_E_LINKEXISTS;
+			goto error;
+		}
+	}
+
+	/*
+	 * We set the name and mark ourselves linked while holding the list
+	 * lock to ensure that no other user can mistakenly find our name.
+	 */
+	(void) snprintf(mname, sizeof (mname), "z%d:%s", zid,
+	    vil.vil_name);
+	mutex_enter(&vdp->vdd_lock);
+
+	/*
+	 * Because we dropped our lock, we need to double check whether or not
+	 * the zone was marked as dying while we were here. If it hasn't, then
+	 * it's safe for us to link it in.
+	 */
+	if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+		mutex_exit(&vdp->vdd_lock);
+		mutex_exit(&vnd_dev_lock);
+		ret = EIO;
+		vil.vil_errno = VND_E_NOZONE;
+		goto error;
+	}
+
+	(void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname));
+	if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor,
+	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
+		ret = EIO;
+		vil.vil_errno = VND_E_MINORNODE;
+	} else {
+		vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+		vdp->vdd_flags |= VND_D_LINKED;
+		kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+		    vdp->vdd_lname);
+		ret = 0;
+	}
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vnd_dev_lock);
+
+	if (ret == 0) {
+		/*
+		 * Add a reference to represent that this device is linked into
+		 * the file system name space to ensure that it doesn't
+		 * disappear.
+		 */
+		vnd_dev_ref(vdp);
+		return (0);
+	}
+
+error:
+	/* Back out the in-flight state and any name we may have recorded. */
+	mutex_enter(&vdp->vdd_lock);
+	vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+	vdp->vdd_lname[0] = '\0';
+	mutex_exit(&vdp->vdd_lock);
+
+errcopyout:
+	if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0)
+		ret = EFAULT;
+	return (ret);
+}
+
+/*
+ * Shared unlink routine, reached from both the unlink ioctl and the netstack
+ * shutdown path. The caller enters with vdp->vdd_lock held; this function
+ * drops that lock on the caller's behalf. Once it returns, the only thing a
+ * caller may still do with vdp is rele it, and then only if the caller has a
+ * hold of its own. Of the two callers, only the ioctl path has such a hold.
+ */
+static void
+vnd_dev_unlink(vnd_dev_t *vdp)
+{
+	char nodename[2*VND_NAMELEN];
+
+	ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+	/* Rebuild the minor node name before the link name is wiped. */
+	(void) snprintf(nodename, sizeof (nodename), "z%d:%s",
+	    vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname);
+	ddi_remove_minor_node(vnd_dip, nodename);
+
+	vdp->vdd_flags &= ~VND_D_LINKED;
+	vdp->vdd_lname[0] = '\0';
+	/* The kstat now reports the (empty) link name. */
+	kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+	    vdp->vdd_lname);
+
+	mutex_exit(&vdp->vdd_lock);
+
+	/* Give back the reference that vnd_ioctl_link took for the link. */
+	vnd_dev_rele(vdp);
+}
+
+/*
+ * VND_IOC_UNLINK handler: remove the link of a linked vnd device.
+ *
+ * The caller must pass secpolicy_net_config() and must either be in the global
+ * zone or in the zone that owns the device. The vnd_ioc_unlink_t is always
+ * copied back out with viu_errno filled in. Returns 0 on success, EIO when
+ * viu_errno describes the failure, EPERM, or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+	vnd_ioc_unlink_t viu;
+	zoneid_t caller_zid;
+	int ret = EIO;
+
+	/* Not anyone can unlink something */
+	if (secpolicy_net_config(credp, B_FALSE) != 0)
+		return (EPERM);
+
+	caller_zid = crgetzoneid(credp);
+
+	if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0)
+		return (EFAULT);
+
+	viu.viu_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_LINKED)) {
+		mutex_exit(&vdp->vdd_lock);
+		viu.viu_errno = VND_E_NOTLINKED;
+	} else {
+		/* A linked device is always an attached one. */
+		VERIFY(vdp->vdd_flags & VND_D_ATTACHED);
+
+		if (caller_zid != GLOBAL_ZONEID &&
+		    caller_zid != vdp->vdd_nsd->vpnd_zid) {
+			mutex_exit(&vdp->vdd_lock);
+			viu.viu_errno = VND_E_PERM;
+		} else {
+			/* vnd_dev_unlink releases the vdp mutex for us */
+			vnd_dev_unlink(vdp);
+			ret = 0;
+		}
+	}
+
+	if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0)
+		return (EFAULT);
+
+	return (ret);
+}
+
+/*
+ * VND_IOC_SETRXBUF handler: set the maximum size of the device's read data
+ * queue to the vib_size supplied by the caller.
+ *
+ * The size may not exceed vnd_vdq_hard_max (VND_E_BUFTOOBIG) and may not be
+ * smaller than the stream's vns_minwrite (VND_E_BUFTOOSMALL); the device must
+ * be attached (VND_E_NOTATTACHED). The vnd_ioc_buf_t is always copied back out
+ * with vib_errno filled in. Returns 0 on success, EIO when vib_errno describes
+ * the failure, or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+	int ret;
+	vnd_ioc_buf_t vib;
+
+	if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	/*
+	 * Start from a clean slate so that the success path does not echo
+	 * back whatever the caller happened to leave in vib_errno.
+	 */
+	vib.vib_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vnd_dev_lock);
+	if (vib.vib_size > vnd_vdq_hard_max) {
+		mutex_exit(&vnd_dev_lock);
+		vib.vib_errno = VND_E_BUFTOOBIG;
+		ret = EIO;
+		goto err;
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_NOTATTACHED;
+		ret = EIO;
+		goto err;
+	}
+
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+		mutex_exit(&vdp->vdd_str->vns_lock);
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_BUFTOOSMALL;
+		ret = EIO;
+		goto err;
+	}
+
+	mutex_exit(&vdp->vdd_str->vns_lock);
+	mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+	vdp->vdd_str->vns_dq_read.vdq_max = vib.vib_size;
+	mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+	mutex_exit(&vdp->vdd_lock);
+	ret = 0;
+
+err:
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	return (ret);
+}
+
+/*
+ * VND_IOC_GETRXBUF handler: report the current maximum size of the device's
+ * read data queue in vib_size.
+ *
+ * Nothing is copied in from the caller. The vnd_ioc_buf_t is always copied out;
+ * if the device is not attached, vib_errno is VND_E_NOTATTACHED and EIO is
+ * returned. Returns 0 on success or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+	int ret;
+	vnd_ioc_buf_t vib;
+
+	/*
+	 * The whole structure goes out to userland, but each path below only
+	 * fills in one member. Zero it first so that we never disclose
+	 * uninitialised kernel stack.
+	 */
+	bzero(&vib, sizeof (vib));
+	vib.vib_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_NOTATTACHED;
+		ret = EIO;
+		goto err;
+	}
+
+	mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+	vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max;
+	mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+	mutex_exit(&vdp->vdd_lock);
+	ret = 0;
+
+err:
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	return (ret);
+}
+
+/*
+ * VND_IOC_GETMAXBUF handler: report the global limit vnd_vdq_hard_max in
+ * vib_size. vdp is not consulted. Returns 0 on success or EFAULT for a bad
+ * user address.
+ */
+static int
+vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+	vnd_ioc_buf_t vib;
+
+	/*
+	 * Only vib_size is meaningful here, but the whole structure is copied
+	 * out. Zero it so that no uninitialised kernel stack is disclosed.
+	 */
+	bzero(&vib, sizeof (vib));
+	vib.vib_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vnd_dev_lock);
+	vib.vib_size = vnd_vdq_hard_max;
+	mutex_exit(&vnd_dev_lock);
+
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+/*
+ * VND_IOC_GETTXBUF handler: report the current maximum size of the device's
+ * write data queue in vib_size.
+ *
+ * Nothing is copied in from the caller. The vnd_ioc_buf_t is always copied out;
+ * if the device is not attached, vib_errno is VND_E_NOTATTACHED and EIO is
+ * returned. Returns 0 on success or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+	int ret;
+	vnd_ioc_buf_t vib;
+
+	/*
+	 * The whole structure goes out to userland, but each path below only
+	 * fills in one member. Zero it first so that we never disclose
+	 * uninitialised kernel stack.
+	 */
+	bzero(&vib, sizeof (vib));
+	vib.vib_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_NOTATTACHED;
+		ret = EIO;
+		goto err;
+	}
+
+	mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+	vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max;
+	mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+	mutex_exit(&vdp->vdd_lock);
+	ret = 0;
+
+err:
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	return (ret);
+}
+
+/*
+ * VND_IOC_SETTXBUF handler: set the maximum size of the device's write data
+ * queue to the vib_size supplied by the caller.
+ *
+ * The size may not exceed vnd_vdq_hard_max (VND_E_BUFTOOBIG) and may not be
+ * smaller than the stream's vns_minwrite (VND_E_BUFTOOSMALL); the device must
+ * be attached (VND_E_NOTATTACHED). The vnd_ioc_buf_t is always copied back out
+ * with vib_errno filled in. Returns 0 on success, EIO when vib_errno describes
+ * the failure, or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+	int ret;
+	vnd_ioc_buf_t vib;
+
+	if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	/*
+	 * Start from a clean slate so that the success path does not echo
+	 * back whatever the caller happened to leave in vib_errno.
+	 */
+	vib.vib_errno = VND_E_SUCCESS;
+
+	mutex_enter(&vnd_dev_lock);
+	if (vib.vib_size > vnd_vdq_hard_max) {
+		mutex_exit(&vnd_dev_lock);
+		vib.vib_errno = VND_E_BUFTOOBIG;
+		ret = EIO;
+		goto err;
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_NOTATTACHED;
+		ret = EIO;
+		goto err;
+	}
+
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+		mutex_exit(&vdp->vdd_str->vns_lock);
+		mutex_exit(&vdp->vdd_lock);
+		vib.vib_errno = VND_E_BUFTOOSMALL;
+		ret = EIO;
+		goto err;
+	}
+	mutex_exit(&vdp->vdd_str->vns_lock);
+
+	mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+	vdp->vdd_str->vns_dq_write.vdq_max = vib.vib_size;
+	mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+	mutex_exit(&vdp->vdd_lock);
+	ret = 0;
+
+err:
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+		return (EFAULT);
+
+	return (ret);
+}
+
+/*
+ * Handler for VND_IOC_GETMINTU and VND_IOC_GETMAXTU: report the stream's
+ * vns_minwrite (min == B_TRUE) or vns_maxwrite (otherwise) in vib_size.
+ *
+ * Unlike the buffer-size ioctls, a device that is not attached is not treated
+ * as a failure of the ioctl itself: vib_errno is set to VND_E_NOTATTACHED and
+ * 0 is still returned. The only error return is EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min)
+{
+	vnd_ioc_buf_t vib;
+
+	/*
+	 * The whole structure is copied out, and in the not-attached case
+	 * vib_size is never assigned. Zero everything so that we never
+	 * disclose uninitialised kernel stack.
+	 */
+	bzero(&vib, sizeof (vib));
+	vib.vib_errno = 0;
+	mutex_enter(&vdp->vdd_lock);
+	if (vdp->vdd_flags & VND_D_ATTACHED) {
+		mutex_enter(&vdp->vdd_str->vns_lock);
+		if (min == B_TRUE)
+			vib.vib_size = vdp->vdd_str->vns_minwrite;
+		else
+			vib.vib_size = vdp->vdd_str->vns_maxwrite;
+		mutex_exit(&vdp->vdd_str->vns_lock);
+	} else {
+		vib.vib_errno = VND_E_NOTATTACHED;
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+/*
+ * VND_IOC_FRAMEIO_READ handler: hand queued messages from the device's read
+ * data queue to userland through a frameio_t described at addr.
+ *
+ * Returns EAGAIN if the frameio_t cannot be allocated without sleeping, ENXIO
+ * if the device is not attached, EWOULDBLOCK if the queue is empty and the
+ * caller asked for non-blocking behaviour (FNONBLOCK or FNDELAY in mode), and
+ * EINTR if cv_wait_sig() reports an interruption while waiting for data.
+ * Errors from the frameio_* helpers are passed through. Returns 0 once the
+ * messages that were written out have been removed from the queue and freed.
+ */
+static int
+vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+	int ret, nonblock, nwrite;
+	frameio_t *fio;
+	vnd_data_queue_t *vqp;
+	mblk_t *mp;
+
+	fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+	if (fio == NULL)
+		return (EAGAIN);
+
+	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr,
+	    mode);
+	if (ret != 0) {
+		frameio_free(fio);
+		return (ret);
+	}
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		frameio_free(fio);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	nonblock = mode & (FNONBLOCK | FNDELAY);
+
+	vqp = &vdp->vdd_str->vns_dq_read;
+	mutex_enter(&vqp->vdq_lock);
+
+	/* Check empty case */
+	if (vqp->vdq_cur == 0) {
+		if (nonblock != 0) {
+			mutex_exit(&vqp->vdq_lock);
+			frameio_free(fio);
+			return (EWOULDBLOCK);
+		}
+		/* Re-test after every wakeup; another reader may have won. */
+		while (vqp->vdq_cur == 0) {
+			if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+				mutex_exit(&vqp->vdq_lock);
+				frameio_free(fio);
+				return (EINTR);
+			}
+		}
+	}
+
+	/*
+	 * The queue lock stays held across the copy to userland so that the
+	 * messages being copied cannot be popped out from under us.
+	 */
+	ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+	    &nwrite, mode & FKIOCTL);
+	if (ret != 0) {
+		mutex_exit(&vqp->vdq_lock);
+		frameio_free(fio);
+		return (ret);
+	}
+
+	ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode);
+	if (ret != 0) {
+		mutex_exit(&vqp->vdq_lock);
+		frameio_free(fio);
+		return (ret);
+	}
+
+	/* Only now drop the nwrite messages that were delivered. */
+	while (nwrite > 0) {
+		(void) vnd_dq_pop(vqp, &mp);
+		freemsg(mp);
+		nwrite--;
+	}
+	mutex_exit(&vqp->vdq_lock);
+	frameio_free(fio);
+
+	return (0);
+}
+
+/*
+ * VND_IOC_FRAMEIO_WRITE handler: take the frames described by the frameio_t at
+ * addr and queue each of them for transmission.
+ *
+ * Every frame has to be between the stream's vns_minwrite and vns_maxwrite
+ * bytes long, otherwise ERANGE is returned. Space for the sum of all frames is
+ * reserved in the write data queue up front; if that is not possible we either
+ * return EAGAIN (FNONBLOCK or FNDELAY in mode) or wait, returning EINTR if
+ * cv_wait_sig() reports an interruption. Returns EAGAIN if the frameio_t
+ * cannot be allocated without sleeping, EINVAL if the header describes zero
+ * vectors per frame, and ENXIO if the device is not attached. Errors from the
+ * frameio_* helpers are passed through.
+ */
+static int
+vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+	frameio_t *fio;
+	int ret, nonblock, nframes, i, nread;
+	size_t maxwrite, minwrite, total, flen;
+	mblk_t *mp_chain, *mp, *nmp;
+	vnd_data_queue_t *vqp;
+
+	fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+	if (fio == NULL)
+		return (EAGAIN);
+
+	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+	if (ret != 0) {
+		frameio_free(fio);
+		return (ret);
+	}
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		frameio_free(fio);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	nonblock = mode & (FNONBLOCK | FNDELAY);
+
+	/*
+	 * Make sure no single frame is larger than we can accept.
+	 */
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	minwrite = vdp->vdd_str->vns_minwrite;
+	maxwrite = vdp->vdd_str->vns_maxwrite;
+	mutex_exit(&vdp->vdd_str->vns_lock);
+
+	/*
+	 * NOTE(review): frameio_hdr_copyin() presumably rejects a header with
+	 * zero vectors per frame already -- confirm. Guard the division below
+	 * regardless.
+	 */
+	if (fio->fio_nvpf == 0) {
+		frameio_free(fio);
+		return (EINVAL);
+	}
+
+	/*
+	 * Frame i starts at vector i * fio_nvpf, so the number of frames is
+	 * the total number of vectors divided by the vectors per frame. With
+	 * the operands the other way around the result is zero whenever there
+	 * is more than one frame, which skips both the size checks and the
+	 * reservation.
+	 */
+	nframes = fio->fio_nvecs / fio->fio_nvpf;
+	total = 0;
+	for (i = 0; i < nframes; i++) {
+		flen = frameio_frame_length(fio,
+		    &fio->fio_vecs[i*fio->fio_nvpf]);
+		if (flen < minwrite || flen > maxwrite) {
+			frameio_free(fio);
+			return (ERANGE);
+		}
+		total += flen;
+	}
+
+	vqp = &vdp->vdd_str->vns_dq_write;
+	mutex_enter(&vqp->vdq_lock);
+	while (vnd_dq_reserve(vqp, total) == 0) {
+		if (nonblock != 0) {
+			frameio_free(fio);
+			mutex_exit(&vqp->vdq_lock);
+			return (EAGAIN);
+		}
+		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+			mutex_exit(&vqp->vdq_lock);
+			frameio_free(fio);
+			return (EINTR);
+		}
+	}
+	mutex_exit(&vqp->vdq_lock);
+
+	/*
+	 * We've reserved our space, let's copyin and go from here.
+	 */
+	ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+	if (ret != 0) {
+		frameio_free(fio);
+		/*
+		 * NOTE(review): the reservation is given back without
+		 * vdq_lock held -- confirm that vnd_dq_unreserve() permits
+		 * that.
+		 */
+		vnd_dq_unreserve(vqp, total);
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vdp->vdd_ph, POLLOUT);
+		return (ret);
+	}
+
+	/* Detach each message from the chain and queue it on its own. */
+	for (mp = mp_chain; mp != NULL; mp = nmp) {
+		nmp = mp->b_next;
+		mp->b_next = NULL;
+		gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+		    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+		    VND_SQUEUE_TAG_VND_WRITE);
+	}
+
+	/*
+	 * Update the frameio structure to indicate that we wrote those frames.
+	 */
+	frameio_mark_consumed(fio, nread);
+	ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+	frameio_free(fio);
+
+	return (ret);
+}
+
+/*
+ * Copy the description of one device out to the user's vnd_ioc_info_t at arg:
+ * a version number (currently 1), the link name (or "<anonymous>" if the
+ * device is not linked), the datalink name and the zone id. The caller must
+ * hold vdp->vdd_lock. Returns 0 on success or EFAULT if any copyout fails.
+ */
+static int
+vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+{
+	char name[sizeof (arg->vii_name)];
+	uint32_t vers = 1;
+	ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+	/*
+	 * Copy all of the members out to userland.
+	 */
+	if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t),
+	    mode & FKIOCTL) != 0)
+		return (EFAULT);
+
+	/*
+	 * Stage the name in a zeroed buffer that is exactly as large as the
+	 * destination. Copying sizeof (arg->vii_name) bytes straight out of
+	 * the "<anonymous>" literal would read past the end of that string
+	 * and hand whatever follows it in kernel memory to userland.
+	 */
+	bzero(name, sizeof (name));
+	if (vdp->vdd_flags & VND_D_LINKED)
+		(void) strlcpy(name, vdp->vdd_lname, sizeof (name));
+	else
+		(void) strlcpy(name, "<anonymous>", sizeof (name));
+	if (ddi_copyout(name, arg->vii_name, sizeof (name),
+	    mode & FKIOCTL) != 0)
+		return (EFAULT);
+
+	/*
+	 * NOTE(review): this assumes vdd_datalink is at least as large as
+	 * vii_datalink -- confirm against the structure definitions.
+	 */
+	if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink,
+	    sizeof (arg->vii_datalink), mode & FKIOCTL) != 0)
+		return (EFAULT);
+
+	if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone,
+	    sizeof (zoneid_t), mode & FKIOCTL) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+/*
+ * VND_IOC_LIST handler: enumerate the vnd devices visible to the caller.
+ *
+ * A device is reported if it is attached, is neither condemned nor in a dying
+ * zone, and either the caller is in the global zone or the device belongs to
+ * the caller's zone. Up to vl_nents entries are written to the user's vl_ents
+ * array; the total number of matching devices, which may be larger, is written
+ * to vl_actents. Returns 0 on success or EFAULT for a bad user address.
+ */
+static int
+vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode)
+{
+	vnd_ioc_list_t vl;
+	vnd_ioc_list32_t vl32;
+	zoneid_t zid;
+	vnd_dev_t *vdp;
+	vnd_ioc_info_t *vip;
+	int found, cancopy, ret;
+
+	/* A 32-bit caller passes a different layout; widen it into vl. */
+	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
+		if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t),
+		    mode & FKIOCTL) != 0)
+			return (EFAULT);
+		vl.vl_nents = vl32.vl_nents;
+		vl.vl_actents = vl32.vl_actents;
+		vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents;
+	} else {
+		if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t),
+		    mode & FKIOCTL) != 0)
+			return (EFAULT);
+	}
+
+	/* cancopy counts down the room left in the user's array. */
+	cancopy = vl.vl_nents;
+	vip = vl.vl_ents;
+	found = 0;
+	zid = crgetzoneid(credp);
+	mutex_enter(&vnd_dev_lock);
+	for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+	    vdp = list_next(&vnd_dev_list, vdp)) {
+		mutex_enter(&vdp->vdd_lock);
+		if (vdp->vdd_flags & VND_D_ATTACHED &&
+		    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) &&
+		    (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) {
+			/* Count every match, even once the array is full. */
+			found++;
+			if (cancopy > 0) {
+				ret = vnd_ioctl_list_copy_info(vdp, vip, mode);
+				if (ret != 0) {
+					mutex_exit(&vdp->vdd_lock);
+					mutex_exit(&vnd_dev_lock);
+					return (ret);
+				}
+				cancopy--;
+				vip++;
+			}
+		}
+		mutex_exit(&vdp->vdd_lock);
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	/*
+	 * NOTE(review): the native layout is used to locate vl_actents even
+	 * for a 32-bit caller; this assumes the member sits at the same offset
+	 * in vnd_ioc_list32_t -- confirm against the structure definitions.
+	 */
+	if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents,
+	    sizeof (uint_t), mode & FKIOCTL) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+
+/*
+ * ioctl(9E) entry point. Looks up the device for the minor, checks that the
+ * file was opened with the access mode the command needs (EBADF otherwise) and
+ * hands off to the per-command handler. Commands carrying the VND_STRIOC bits
+ * and unknown commands get ENOTTY; an unknown minor gets ENXIO.
+ */
+static int
+vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	vnd_dev_t *vdp;
+	minor_t minor;
+	int need, ret;
+
+	minor = getminor(dev);
+	ASSERT(minor != 0);
+
+	/*
+	 * The strioc commands are only meant to arrive through the stream,
+	 * never through the character device.
+	 */
+	if ((cmd & VND_STRIOC) == VND_STRIOC)
+		return (ENOTTY);
+
+	/*
+	 * As with close, a minor that can't be found suggests a programmer
+	 * error somewhere.
+	 */
+	if ((vdp = vnd_dev_lookup(minor)) == NULL)
+		return (ENXIO);
+
+	/* Work out which open mode, if any, the command requires. */
+	switch (cmd) {
+	case VND_IOC_ATTACH:
+	case VND_IOC_LINK:
+	case VND_IOC_UNLINK:
+	case VND_IOC_SETRXBUF:
+	case VND_IOC_SETTXBUF:
+	case VND_IOC_FRAMEIO_WRITE:
+		need = FWRITE;
+		break;
+	case VND_IOC_GETRXBUF:
+	case VND_IOC_GETTXBUF:
+	case VND_IOC_GETMAXBUF:
+	case VND_IOC_GETMINTU:
+	case VND_IOC_GETMAXTU:
+	case VND_IOC_FRAMEIO_READ:
+	case VND_IOC_LIST:
+		need = FREAD;
+		break;
+	default:
+		need = 0;
+		break;
+	}
+
+	if (need != 0 && !(mode & need)) {
+		ret = EBADF;
+		goto done;
+	}
+
+	switch (cmd) {
+	case VND_IOC_ATTACH:
+		ret = vnd_ioctl_attach(vdp, arg, credp, mode);
+		break;
+	case VND_IOC_LINK:
+		ret = vnd_ioctl_link(vdp, arg, credp, mode);
+		break;
+	case VND_IOC_UNLINK:
+		ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
+		break;
+	case VND_IOC_GETRXBUF:
+		ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
+		break;
+	case VND_IOC_SETRXBUF:
+		ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
+		break;
+	case VND_IOC_GETTXBUF:
+		ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
+		break;
+	case VND_IOC_SETTXBUF:
+		ret = vnd_ioctl_settxbuf(vdp, arg, mode);
+		break;
+	case VND_IOC_GETMAXBUF:
+		/* The hard maximum is only disclosed to the global zone. */
+		if (crgetzoneid(credp) != GLOBAL_ZONEID)
+			ret = EPERM;
+		else
+			ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
+		break;
+	case VND_IOC_GETMINTU:
+		ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
+		break;
+	case VND_IOC_GETMAXTU:
+		ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
+		break;
+	case VND_IOC_FRAMEIO_READ:
+		ret = vnd_frameio_read(vdp, arg, mode);
+		break;
+	case VND_IOC_FRAMEIO_WRITE:
+		ret = vnd_frameio_write(vdp, arg, mode);
+		break;
+	case VND_IOC_LIST:
+		ret = vnd_ioctl_list(arg, credp, mode);
+		break;
+	default:
+		ret = ENOTTY;
+		break;
+	}
+
+done:
+	/* Drop the hold that vnd_dev_lookup gave us. */
+	vnd_dev_rele(vdp);
+	return (ret);
+}
+
+/*
+ * open(9E) entry point.
+ *
+ * Opening minor 0 clones: a fresh vnd_dev_t is allocated, given a new minor
+ * from vnd_minors, put on vnd_dev_list, and *devp is rewritten to refer to it.
+ * Opening any other minor looks up an existing device, which must be linked,
+ * must not be in a dying zone, and must belong to the caller's zone unless the
+ * caller is in the global zone; the caller also needs to pass
+ * secpolicy_net_rawaccess().
+ *
+ * FEXCL, FNDELAY and block-device opens are not supported (ENOTSUP).
+ */
+static int
+vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	vnd_dev_t *vdp;
+	minor_t m;
+	zoneid_t zid;
+
+	/*
+	 * FEXCL is rejected here for every minor, so nothing below needs to
+	 * deal with exclusive opens.
+	 */
+	if (flag & (FEXCL | FNDELAY))
+		return (ENOTSUP);
+
+	/*
+	 * otyp carries a single open type rather than a set of bit flags, so
+	 * it has to be compared for equality; a bitwise test can never match
+	 * a type whose value is zero.
+	 */
+	if (otyp == OTYP_BLK)
+		return (ENOTSUP);
+
+	zid = crgetzoneid(credp);
+	m = getminor(*devp);
+
+	/*
+	 * If we have an open of a non-zero instance then we need to look that
+	 * up in our list of entries.
+	 */
+	if (m != 0) {
+
+		/*
+		 * We don't check for rawaccess globally as a user could be
+		 * doing a list ioctl on the control node which doesn't require
+		 * this privilege.
+		 */
+		if (secpolicy_net_rawaccess(credp) != 0)
+			return (EPERM);
+
+
+		vdp = vnd_dev_lookup(m);
+		if (vdp == NULL)
+			return (ENOENT);
+
+		/*
+		 * We need to check to make sure that the user is allowed to
+		 * open this node. At this point it should be a linked handle
+		 * as that's all we're allowed to access.
+		 */
+		mutex_enter(&vdp->vdd_lock);
+		if (!(vdp->vdd_flags & VND_D_LINKED)) {
+			mutex_exit(&vdp->vdd_lock);
+			vnd_dev_rele(vdp);
+			return (ENOENT);
+		}
+
+		if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+			mutex_exit(&vdp->vdd_lock);
+			vnd_dev_rele(vdp);
+			return (ENOENT);
+		}
+
+		if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+			mutex_exit(&vdp->vdd_lock);
+			vnd_dev_rele(vdp);
+			return (ENOENT);
+		}
+
+		/*
+		 * Only the first open takes the reference that vnd_close
+		 * later gives back; further opens share it.
+		 */
+		if (!(vdp->vdd_flags & VND_D_OPENED)) {
+			vdp->vdd_flags |= VND_D_OPENED;
+			vdp->vdd_ref++;
+			DTRACE_VND_REFINC(vdp);
+		}
+		mutex_exit(&vdp->vdd_lock);
+		/* Drop the hold from vnd_dev_lookup. */
+		vnd_dev_rele(vdp);
+
+		return (0);
+	}
+
+	/*
+	 * We need to clone ourselves and set up a new state.
+	 */
+	vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP);
+	bzero(vdp, sizeof (vnd_dev_t));
+
+	if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) {
+		kmem_cache_free(vnd_dev_cache, vdp);
+		return (EINVAL);
+	}
+
+	vdp->vdd_minor = id_alloc(vnd_minors);
+	mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL);
+	list_link_init(&vdp->vdd_link);
+	/* This initial reference is the one that belongs to the open. */
+	vdp->vdd_ref = 1;
+	*devp = makedevice(getmajor(*devp), vdp->vdd_minor);
+	vdp->vdd_devid = *devp;
+	DTRACE_VND_REFINC(vdp);
+	vdp->vdd_flags |= VND_D_OPENED;
+
+	mutex_enter(&vnd_dev_lock);
+	list_insert_head(&vnd_dev_list, vdp);
+	mutex_exit(&vnd_dev_lock);
+
+	return (0);
+}
+
+/*
+ * close(9E) entry point. Clears VND_D_OPENED on the device and gives back the
+ * reference that the open took. Returns ENXIO for minor 0 or for a minor that
+ * cannot be found, otherwise 0.
+ */
+static int
+vnd_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+	vnd_dev_t *vdp;
+	minor_t minor = getminor(dev);
+
+	if (minor == 0)
+		return (ENXIO);
+
+	if ((vdp = vnd_dev_lookup(minor)) == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_flags & VND_D_OPENED);
+	vdp->vdd_flags &= ~VND_D_OPENED;
+	mutex_exit(&vdp->vdd_lock);
+
+	/*
+	 * Two holds go away here: first the one that has been kept since the
+	 * open, then the one we just got from the lookup above.
+	 */
+	vnd_dev_rele(vdp);
+	vnd_dev_rele(vdp);
+
+	return (0);
+}
+
+/*
+ * read(9E) entry point: deliver exactly one queued message to the caller.
+ *
+ * Only a single iovec is accepted (EINVAL otherwise). Returns ENXIO if the
+ * minor is unknown or the device is not attached, EWOULDBLOCK if the read
+ * queue is empty and the caller asked for non-blocking behaviour, EINTR if
+ * cv_wait_sig() reports an interruption while waiting, EOVERFLOW if the
+ * message at the head of the queue does not fit in the caller's buffer (the
+ * message stays queued), and EFAULT if copying the data out fails. On success
+ * the message is removed from the queue and freed.
+ */
+static int
+vnd_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	int nonblock, error = 0;
+	size_t mpsize;
+	vnd_dev_t *vdp;
+	vnd_data_queue_t *vqp;
+	mblk_t *mp = NULL;
+	offset_t u_loffset;
+
+	/*
+	 * If we have more than one uio we refuse to do anything. That's for
+	 * frameio.
+	 */
+	if (uiop->uio_iovcnt > 1)
+		return (EINVAL);
+
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+	nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+	vqp = &vdp->vdd_str->vns_dq_read;
+	mutex_enter(&vqp->vdq_lock);
+
+	/* Check empty case */
+	if (vqp->vdq_cur == 0) {
+		if (nonblock != 0) {
+			error = EWOULDBLOCK;
+			goto err;
+		}
+		/* Re-test after every wakeup; another reader may have won. */
+		while (vqp->vdq_cur == 0) {
+			if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+				error = EINTR;
+				goto err;
+			}
+		}
+	}
+
+	/* Ensure our buffer is big enough */
+	mp = vqp->vdq_head;
+	ASSERT(mp != NULL);
+	mpsize = msgsize(mp);
+	if (mpsize > uiop->uio_resid) {
+		error = EOVERFLOW;
+		goto err;
+	}
+
+	/*
+	 * Walk the b_cont chain and copy each block out. The message is only
+	 * peeked at here; it is not popped until every block has been copied,
+	 * so a failed copy leaves it on the queue.
+	 */
+	u_loffset = uiop->uio_loffset;
+	while (mp != NULL) {
+		if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) {
+			error = EFAULT;
+			uiop->uio_loffset = u_loffset;
+			mp = NULL;
+			goto err;
+		}
+		mpsize -= MBLKL(mp);
+		mp = mp->b_cont;
+	}
+	ASSERT(mpsize == 0);
+	(void) vnd_dq_pop(vqp, &mp);
+	freemsg(mp);
+err:
+	/* Shared exit for success and failure; error is 0 on success. */
+	mutex_exit(&vqp->vdq_lock);
+	vnd_dev_rele(vdp);
+
+	return (error);
+}
+
+/*
+ * write(9E) entry point: queue the caller's buffer for transmission as a
+ * single message.
+ *
+ * Only a single iovec is accepted (EINVAL otherwise). Returns ENXIO if the
+ * minor is unknown or the device is not attached, ERANGE if the amount of data
+ * is outside the stream's vns_minwrite..vns_maxwrite range, EAGAIN if no space
+ * can be reserved in the write queue and the caller asked for non-blocking
+ * behaviour, EINTR if cv_wait_sig() reports an interruption while waiting for
+ * space, and ENOSR if the data could not be turned into a message in its
+ * entirety.
+ */
+static int
+vnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	int nonblock, error;
+	vnd_dev_t *vdp;
+	mblk_t *mp = NULL;
+	ssize_t iosize, origsize;
+	vnd_data_queue_t *vqp;
+
+	if (uiop->uio_iovcnt > 1)
+		return (EINVAL);
+
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+	nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+	/* Check the stream pointer before the first dereference. */
+	VERIFY(vdp->vdd_str != NULL);
+
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite ||
+	    uiop->uio_resid < vdp->vdd_str->vns_minwrite) {
+		mutex_exit(&vdp->vdd_str->vns_lock);
+		vnd_dev_rele(vdp);
+		return (ERANGE);
+	}
+	mutex_exit(&vdp->vdd_str->vns_lock);
+
+	/*
+	 * Reserve space in the data queue if we can. If we can't, block or
+	 * return EAGAIN. If we can, go and squeue_enter.
+	 */
+	vqp = &vdp->vdd_str->vns_dq_write;
+	mutex_enter(&vqp->vdq_lock);
+	while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) {
+		if (nonblock != 0) {
+			mutex_exit(&vqp->vdq_lock);
+			vnd_dev_rele(vdp);
+			return (EAGAIN);
+		}
+		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+			mutex_exit(&vqp->vdq_lock);
+			vnd_dev_rele(vdp);
+			return (EINTR);
+		}
+	}
+	mutex_exit(&vqp->vdq_lock);
+
+	/*
+	 * Now that we've reserved the space, try to allocate kernel space for
+	 * and copy in the block. To take care of all this we use the
+	 * strmakedata subroutine for now.
+	 */
+	origsize = iosize = uiop->uio_resid;
+	error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0,
+	    &mp);
+
+	/*
+	 * strmakedata() will return an error or it may only consume a portion
+	 * of the data.
+	 */
+	if (error != 0 || uiop->uio_resid != 0) {
+		/*
+		 * When only part of the data was consumed, strmakedata()
+		 * succeeded and handed us a message. We are not going to
+		 * send it, so it has to be freed here or it is leaked.
+		 */
+		if (error == 0 && mp != NULL)
+			freemsg(mp);
+		/*
+		 * NOTE(review): the reservation is given back without
+		 * vdq_lock held -- confirm that vnd_dq_unreserve() permits
+		 * that.
+		 */
+		vnd_dq_unreserve(vqp, origsize);
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vdp->vdd_ph, POLLOUT);
+		vnd_dev_rele(vdp);
+		return (ENOSR);
+	}
+
+	gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+	    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+	    VND_SQUEUE_TAG_VND_WRITE);
+
+	vnd_dev_rele(vdp);
+	return (0);
+}
+
+/*
+ * chpoll(9E) entry point. The device is readable (POLLIN / POLLRDNORM) when
+ * the read data queue has a message at its head, and writable (POLLOUT) when
+ * the write data queue's current size differs from its maximum. Returns ENXIO
+ * if the minor is unknown or the device is not attached, otherwise 0.
+ */
+static int
+vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	vnd_data_queue_t *rq, *wq;
+	vnd_dev_t *vdp;
+	int ready = 0;
+
+	if ((vdp = vnd_dev_lookup(getminor(dev))) == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	if ((events & (POLLIN | POLLRDNORM)) != 0) {
+		rq = &vdp->vdd_str->vns_dq_read;
+		mutex_enter(&rq->vdq_lock);
+		/* Report back only the read events that were asked for. */
+		if (rq->vdq_head != NULL)
+			ready |= events & (POLLIN | POLLRDNORM);
+		mutex_exit(&rq->vdq_lock);
+	}
+
+	if ((events & POLLOUT) != 0) {
+		wq = &vdp->vdd_str->vns_dq_write;
+		mutex_enter(&wq->vdq_lock);
+		if (wq->vdq_cur != wq->vdq_max)
+			ready |= POLLOUT;
+		mutex_exit(&wq->vdq_lock);
+	}
+
+	*reventsp = ready;
+
+	/*
+	 * With nothing ready, hand back our pollhead so the caller can wait
+	 * on it, unless some other descriptor has already reported events.
+	 */
+	if (ready == 0 && !anyyet)
+		*phpp = &vdp->vdd_ph;
+
+	vnd_dev_rele(vdp);
+	return (0);
+}
+
+/*
+ * Netstack creation callback. Allocates and initialises the per-netstack
+ * vnd_pnsd_t for stackid, tries to set up the netinfo hooks for it, and puts
+ * it on vnd_nsd_list. The new structure is returned.
+ */
+static void *
+vnd_stack_init(netstackid_t stackid, netstack_t *ns)
+{
+	vnd_pnsd_t *pnsd;
+
+	pnsd = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP);
+	bzero(pnsd, sizeof (*pnsd));
+
+	pnsd->vpnd_flags = 0;
+	pnsd->vpnd_nsid = stackid;
+	pnsd->vpnd_zid = netstackid_to_zoneid(stackid);
+
+	mutex_init(&pnsd->vpnd_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&pnsd->vpnd_dev_list, sizeof (vnd_dev_t),
+	    offsetof(vnd_dev_t, vdd_nslink));
+
+	/* Remember whether the hooks went in so shutdown can undo them. */
+	if (vnd_netinfo_init(pnsd) == 0)
+		pnsd->vpnd_hooked = B_TRUE;
+
+	/* vnd_nsd_list is protected by vnd_dev_lock. */
+	mutex_enter(&vnd_dev_lock);
+	list_insert_tail(&vnd_nsd_list, pnsd);
+	mutex_exit(&vnd_dev_lock);
+
+	return (pnsd);
+}
+
+/*
+ * Netstack shutdown callback; arg is the vnd_pnsd_t returned by
+ * vnd_stack_init. It removes the netstack from vnd_nsd_list, shuts down the
+ * netinfo hooks if they were installed, marks the netstack condemned and each
+ * of its not-yet-condemned devices as being in a dying zone, and finally
+ * forcibly unlinks every device that is still linked.
+ */
+static void
+vnd_stack_shutdown(netstackid_t stackid, void *arg)
+{
+	vnd_pnsd_t *nsp = arg;
+	vnd_dev_t *vdp;
+
+	ASSERT(nsp != NULL);
+	/*
+	 * After shut down no one should be able to find their way to this
+	 * netstack again.
+	 */
+	mutex_enter(&vnd_dev_lock);
+	list_remove(&vnd_nsd_list, nsp);
+	mutex_exit(&vnd_dev_lock);
+
+	/*
+	 * Make sure hooks know that they're going away.
+	 */
+	if (nsp->vpnd_hooked == B_TRUE)
+		vnd_netinfo_shutdown(nsp);
+
+	/*
+	 * Now we need to go through and notify each zone that they are in
+	 * teardown phase.  See the big theory statement section on vnd, zones,
+	 * netstacks, and sdev for more information about this.
+	 */
+	mutex_enter(&nsp->vpnd_lock);
+	nsp->vpnd_flags |= VND_NS_CONDEMNED;
+	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+		mutex_enter(&vdp->vdd_lock);
+		if (!(vdp->vdd_flags & VND_D_CONDEMNED))
+			vdp->vdd_flags |= VND_D_ZONE_DYING;
+		mutex_exit(&vdp->vdd_lock);
+	}
+	mutex_exit(&nsp->vpnd_lock);
+
+	/*
+	 * Next we remove all the links as we know nothing new can be added to
+	 * the list and that none of the extant devices can obtain additional
+	 * links.
+	 */
+restart:
+	mutex_enter(&nsp->vpnd_lock);
+	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+		mutex_enter(&vdp->vdd_lock);
+		/* Skip devices that are going away or have no link. */
+		if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
+		    !(vdp->vdd_flags & VND_D_LINKED)) {
+			mutex_exit(&vdp->vdd_lock);
+			continue;
+		}
+
+		/*
+		 * We drop our lock here and restart afterwards. Note that as
+		 * part of unlinking we end up doing a rele of the vnd_dev_t. If
+		 * this is the final hold on the vnd_dev_t then it might try and
+		 * remove itself. Our locking rules requires not to be holding
+		 * any locks when we call any of the rele functions.
+		 *
+		 * Note that the unlink function requires holders to call into
+		 * it with the vnd_dev_t->vdd_lock held and will take care of it
+		 * for us. Because we don't have a hold on it, we're done at
+		 * this point.
+		 */
+		mutex_exit(&nsp->vpnd_lock);
+		/* Forcibly unlink */
+		vnd_dev_unlink(vdp);
+		/*
+		 * The list may have changed while vpnd_lock was dropped, so
+		 * the walk starts over from the head.
+		 */
+		goto restart;
+	}
+	mutex_exit(&nsp->vpnd_lock);
+}
+
+static void
+vnd_stack_destroy(netstackid_t stackid, void *arg)
+{
+	vnd_pnsd_t *nsp = arg;
+
+	ASSERT(nsp != NULL);
+
+	/*
+	 * Now that we've unlinked everything we just have to hang out for
+	 * it to finish exiting. Now that it's no longer the kernel itself
+	 * that's doing this we just need to wait for our reference count to
+	 * equal zero and then we're free. If the global zone is holding open a
+	 * reference to a vnd device for another zone, that's bad, but there's
+	 * nothing much we can do. See the section on 'vnd, zones, netstacks' in
+	 * the big theory statement for more information.
+	 */
+	mutex_enter(&nsp->vpnd_lock);
+	while (nsp->vpnd_ref != 0)
+		cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
+	mutex_exit(&nsp->vpnd_lock);
+
+	/*
+	 * During shutdown we removed ourselves from the list and now we have no
+	 * more references so we can safely say that there is nothing left and
+	 * destroy everything that we had sitting around.
+	 */
+	if (nsp->vpnd_hooked == B_TRUE)
+		vnd_netinfo_fini(nsp);
+
+	mutex_destroy(&nsp->vpnd_lock);
+	list_destroy(&nsp->vpnd_dev_list);
+	kmem_cache_free(vnd_pnsd_cache, nsp);
+}
+
+/*
+ * Convert a node with a name of the form /dev/vnd/zone/%zonename and
+ * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
+ */
+static vnd_pnsd_t *
+vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
+{
+	enum vtype vt;
+	const char *path = sdev_ctx_path(ctx);
+	char *zstart, *dup;
+	size_t duplen;
+	vnd_pnsd_t *nsp;
+
+	vt = sdev_ctx_vtype(ctx);
+	ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);
+
+	if (vt == VDIR) {
+		zstart = strrchr(path, '/');
+		ASSERT(zstart != NULL);
+		zstart++;
+		return (vnd_nsd_lookup_by_zonename(zstart));
+	}
+
+	ASSERT(vt == VCHR);
+	/* Drop the trailing /%linkname so the last component is %zonename. */
+	dup = strdup(path);
+	duplen = strlen(dup) + 1;
+	zstart = strrchr(dup, '/');
+	ASSERT(zstart != NULL);
+	*zstart = '\0';
+	zstart = strrchr(dup, '/');
+	zstart++;
+	nsp = vnd_nsd_lookup_by_zonename(zstart);
+	kmem_free(dup, duplen);
+
+	return (nsp);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate_dir(sdev_ctx_t ctx)
+{
+	vnd_pnsd_t *nsp;
+
+	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0)
+		return (SDEV_VTOR_VALID);
+
+	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) {
+		ASSERT(getzoneid() == GLOBAL_ZONEID);
+		ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+		return (SDEV_VTOR_VALID);
+	}
+
+	nsp = vnd_sdev_ctx_to_ns(ctx);
+	if (nsp == NULL)
+		return (SDEV_VTOR_INVALID);
+	vnd_nsd_rele(nsp);
+
+	return (SDEV_VTOR_VALID);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate(sdev_ctx_t ctx)
+{
+	enum vtype vt;
+	dev_t dev;
+	vnd_dev_t *vdp;
+
+	vt = sdev_ctx_vtype(ctx);
+	if (vt == VDIR)
+		return (vnd_sdev_validate_dir(ctx));
+	ASSERT(vt == VCHR);
+
+	if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
+		return (SDEV_VTOR_VALID);
+
+	dev = (uintptr_t)sdev_ctx_vtype_data(ctx);
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (SDEV_VTOR_STALE);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_LINKED) ||
+	    (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (SDEV_VTOR_STALE);
+	}
+
+	if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (SDEV_VTOR_STALE);
+	}
+
+	mutex_exit(&vdp->vdd_lock);
+	vnd_dev_rele(vdp);
+	return (SDEV_VTOR_VALID);
+}
+
+/*
+ * This function is a no-op. sdev never has holds on our devices as they can go
+ * away at any time and specfs has to deal with that fact.
+ */
+static void
+vnd_sdev_inactive(sdev_ctx_t ctx)
+{
+}
+
+static int
+vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
+{
+	int ret;
+	vnd_dev_t *vdp;
+
+	mutex_enter(&nsp->vpnd_lock);
+	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+		mutex_enter(&vdp->vdd_lock);
+		if ((vdp->vdd_flags & VND_D_LINKED) &&
+		    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+			ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, S_IFCHR,
+			    vdp->vdd_devid);
+			if (ret != 0 && ret != EEXIST) {
+				mutex_exit(&vdp->vdd_lock);
+				mutex_exit(&nsp->vpnd_lock);
+				/* The hold on nsp belongs to our caller. */
+				return (ret);
+			}
+		}
+		mutex_exit(&vdp->vdd_lock);
+	}
+	mutex_exit(&nsp->vpnd_lock);
+
+	return (0);
+}
+
+static int
+vnd_sdev_filldir_root(sdev_ctx_t ctx)
+{
+	zoneid_t zid;
+	vnd_pnsd_t *nsp;
+	int ret;
+
+	zid = getzoneid();
+	nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
+	ASSERT(nsp != NULL);
+	ret = vnd_sdev_fillzone(nsp, ctx);
+	vnd_nsd_rele(nsp);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Checking the zone id is not sufficient as the global zone could be
+	 * reaching down into a non-global zone's mounted /dev.
+	 */
+	if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
+		ret = sdev_plugin_mkdir(ctx, "zone");
+		if (ret != 0 && ret != EEXIST)
+			return (ret);
+	}
+
+	/*
+	 * Always add a reference to the control node. There's no need to
+	 * reference it since it always exists and is always what we clone from.
+	 */
+	ret = sdev_plugin_mknod(ctx, "ctl", S_IFCHR,
+	    makedevice(ddi_driver_major(vnd_dip), 0));
+	if (ret != 0 && ret != EEXIST)
+		return (ret);
+
+	return (0);
+}
+
+static int
+vnd_sdev_filldir_zroot(sdev_ctx_t ctx)
+{
+	int ret;
+	vnd_pnsd_t *nsp;
+	zone_t *zonep;
+
+	ASSERT(getzoneid() == GLOBAL_ZONEID);
+	ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+
+	mutex_enter(&vnd_dev_lock);
+	for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+	    nsp = list_next(&vnd_nsd_list, nsp)) {
+		mutex_enter(&nsp->vpnd_lock);
+		if (list_is_empty(&nsp->vpnd_dev_list)) {
+			mutex_exit(&nsp->vpnd_lock);
+			continue;
+		}
+		mutex_exit(&nsp->vpnd_lock);
+		zonep = zone_find_by_id(nsp->vpnd_zid);
+		/*
+		 * This zone must be being torn down, so skip it.
+		 */
+		if (zonep == NULL)
+			continue;
+		ret = sdev_plugin_mkdir(ctx, zonep->zone_name);
+		zone_rele(zonep);
+		if (ret != 0 && ret != EEXIST) {
+			mutex_exit(&vnd_dev_lock);
+			return (ret);
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (0);
+}
+
+static int
+vnd_sdev_filldir(sdev_ctx_t ctx)
+{
+	int ret;
+	vnd_pnsd_t *nsp;
+
+	ASSERT(sdev_ctx_vtype(ctx) == VDIR);
+	if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0)
+		return (vnd_sdev_filldir_root(ctx));
+
+	if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0)
+		return (vnd_sdev_filldir_zroot(ctx));
+
+	ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx),
+	    strlen(VND_SDEV_ZROOT)) == 0);
+	nsp = vnd_sdev_ctx_to_ns(ctx);
+	if (nsp == NULL)
+		return (0);
+
+	ret = vnd_sdev_fillzone(nsp, ctx);
+	vnd_nsd_rele(nsp);
+
+	return (ret);
+}
+
+static sdev_plugin_ops_t vnd_sdev_ops = {
+	SDEV_PLUGIN_VERSION,
+	SDEV_PLUGIN_SUBDIR,
+	vnd_sdev_validate,
+	vnd_sdev_filldir,
+	vnd_sdev_inactive
+};
+
+static int
+vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int errp = 0;
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	/*
+	 * Only allow one instance.
+	 */
+	if (vnd_dip != NULL)
+		return (DDI_FAILURE);
+
+	vnd_dip = dip;
+	if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) !=
+	    DDI_SUCCESS) {
+		vnd_dip = NULL;
+		return (DDI_FAILURE);
+	}
+
+	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
+	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
+		ddi_remove_minor_node(vnd_dip, NULL);
+		vnd_dip = NULL;
+		return (DDI_FAILURE);
+	}
+
+	vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops,
+	    &errp);
+	if (vnd_sdev_hdl == NULL) {
+		ddi_remove_minor_node(vnd_dip, NULL);
+		ddi_prop_remove_all(vnd_dip);
+		vnd_dip = NULL;
+		return (DDI_FAILURE);
+	}
+
+	vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_WAIT,
+	    GSQUEUE_DEFAULT_PRIORITY);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	mutex_enter(&vnd_dev_lock);
+	if (!list_is_empty(&vnd_dev_list)) {
+		mutex_exit(&vnd_dev_lock);
+		return (DDI_FAILURE);
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	return (DDI_FAILURE);
+}
+
+static int
+vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+	int error;
+
+	switch (cmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*result = (void *)vnd_dip;
+		error = DDI_SUCCESS;
+		break;
+	case DDI_INFO_DEVT2INSTANCE:
+		*result = (void *)0;
+		error = DDI_SUCCESS;
+		break;
+	default:
+		error = DDI_FAILURE;
+		break;
+	}
+	return (error);
+}
+
+
+
+static void
+vnd_ddi_fini(void)
+{
+	netstack_unregister(NS_VND);
+	if (vnd_taskq != NULL)
+		taskq_destroy(vnd_taskq);
+	if (vnd_str_cache != NULL)
+		kmem_cache_destroy(vnd_str_cache);
+	if (vnd_dev_cache != NULL)
+		kmem_cache_destroy(vnd_dev_cache);
+	if (vnd_pnsd_cache != NULL)
+		kmem_cache_destroy(vnd_pnsd_cache);
+	if (vnd_minors != NULL)
+		id_space_destroy(vnd_minors);
+	if (vnd_list_init != 0) {
+		list_destroy(&vnd_nsd_list);
+		list_destroy(&vnd_dev_list);
+		mutex_destroy(&vnd_dev_lock);
+		vnd_list_init = 0;
+	}
+	frameio_fini();
+}
+
+static int
+vnd_ddi_init(void)
+{
+	if (frameio_init() != 0)
+		return (DDI_FAILURE);
+
+	vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_str_cache == NULL) {
+		frameio_fini();
+		return (DDI_FAILURE);
+	}
+	vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_dev_cache == NULL) {
+		kmem_cache_destroy(vnd_str_cache);
+		frameio_fini();
+		return (DDI_FAILURE);
+	}
+	vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
+	    sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_pnsd_cache == NULL) {
+		kmem_cache_destroy(vnd_dev_cache);
+		kmem_cache_destroy(vnd_str_cache);
+		frameio_fini();
+		return (DDI_FAILURE);
+	}
+
+	vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
+	if (vnd_taskq == NULL) {
+		kmem_cache_destroy(vnd_pnsd_cache);
+		kmem_cache_destroy(vnd_dev_cache);
+		kmem_cache_destroy(vnd_str_cache);
+		frameio_fini();
+		return (DDI_FAILURE);
+	}
+
+	vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
+	if (vnd_minors == NULL) {
+		taskq_destroy(vnd_taskq);
+		kmem_cache_destroy(vnd_pnsd_cache);
+		kmem_cache_destroy(vnd_dev_cache);
+		kmem_cache_destroy(vnd_str_cache);
+		frameio_fini();
+		return (DDI_FAILURE);
+	}
+
+	mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&vnd_dev_list, sizeof (vnd_dev_t),
+	    offsetof(vnd_dev_t, vdd_link));
+	list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
+	    offsetof(vnd_pnsd_t, vpnd_link));
+	vnd_list_init = 1;
+
+	netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
+	    vnd_stack_destroy);
+
+	return (DDI_SUCCESS);
+}
+
+static struct module_info vnd_minfo = {
+	0,		/* module id */
+	"vnd",		/* module name */
+	1,		/* smallest packet size */
+	INFPSZ,		/* largest packet size (infinite) */
+	1,		/* high watermark */
+	0		/* low watermark */
+};
+
+static struct qinit vnd_r_qinit = {
+	vnd_s_rput,
+	NULL,
+	vnd_s_open,
+	vnd_s_close,
+	NULL,
+	&vnd_minfo,
+	NULL
+};
+
+static struct qinit vnd_w_qinit = {
+	vnd_s_wput,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	&vnd_minfo,
+	NULL
+};
+
+static struct streamtab vnd_strtab = {
+	&vnd_r_qinit,
+	&vnd_w_qinit,
+	NULL,
+	NULL
+};
+
+
+static struct cb_ops vnd_cb_ops = {
+	vnd_open,		/* open */
+	vnd_close,		/* close */
+	nulldev,		/* strategy */
+	nulldev,		/* print */
+	nodev,			/* dump */
+	vnd_read,		/* read */
+	vnd_write,		/* write */
+	vnd_ioctl,		/* ioctl */
+	nodev,			/* devmap */
+	nodev,			/* mmap */
+	nodev,			/* segmap */
+	vnd_chpoll,		/* poll */
+	ddi_prop_op,		/* cb_prop_op */
+	NULL,			/* streamtab  */
+	D_MP			/* Driver compatibility flag */
+};
+
+static struct dev_ops vnd_dev_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* refcnt */
+	vnd_info,		/* get_dev_info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	vnd_attach,		/* attach */
+	vnd_detach,		/* detach */
+	nodev,			/* reset */
+	&vnd_cb_ops,		/* driver operations */
+	NULL,			/* bus operations */
+	nodev,			/* dev power */
+	ddi_quiesce_not_needed	/* quiesce */
+};
+
+static struct modldrv vnd_modldrv = {
+	&mod_driverops,
+	"Virtual Networking Datapath Driver",
+	&vnd_dev_ops
+};
+
+static struct fmodsw vnd_fmodfsw = {
+	"vnd",
+	&vnd_strtab,
+	D_NEW | D_MP
+};
+
+static struct modlstrmod vnd_modlstrmod = {
+	&mod_strmodops,
+	"Virtual Networking Datapath Driver",
+	&vnd_fmodfsw
+};
+
+static struct modlinkage vnd_modlinkage = {
+	MODREV_1,
+	&vnd_modldrv,
+	&vnd_modlstrmod,
+	NULL
+};
+
+int
+_init(void)
+{
+	int error;
+
+	/*
+	 * We need to do all of our global initialization in init as opposed to
+	 * attach and detach. The problem here is that because vnd can be used
+	 * from a stream context while being detached, we can not rely on having
+	 * run attach to create everything, alas. so it goes in _init, just like
+	 * our friend ip.
+	 */
+	if ((error = vnd_ddi_init()) != DDI_SUCCESS)
+		return (error);
+	error = mod_install((&vnd_modlinkage));
+	if (error != 0)
+		vnd_ddi_fini();
+	return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&vnd_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int error;
+
+	error = mod_remove(&vnd_modlinkage);
+	if (error == 0)
+		vnd_ddi_fini();
+	return (error);
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf
new file mode 100644
index 0000000000..65872e1ddf
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+#
+
+name="vnd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index 3cacbe395b..3cb7e7660a 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -53,6 +53,7 @@
 #include <sys/vlan.h>
 #include <sys/vnic.h>
 #include <sys/vnic_impl.h>
+#include <sys/mac_impl.h>
 #include <sys/mac_flow_impl.h>
 #include <inet/ip_impl.h>
 
@@ -369,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
 
 	bzero(vnic, sizeof (*vnic));
 
+	vnic->vn_ls = LINK_STATE_UNKNOWN;
 	vnic->vn_id = vnic_id;
 	vnic->vn_link_id = linkid;
 	vnic->vn_vrid = vrid;
@@ -579,11 +581,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
 	vnic->vn_enabled = B_TRUE;
 
 	if (is_anchor) {
-		mac_link_update(vnic->vn_mh, LINK_STATE_UP);
+		vnic->vn_ls = LINK_STATE_UP;
 	} else {
-		mac_link_update(vnic->vn_mh,
-		    mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE));
+		vnic->vn_ls = mac_client_stat_get(vnic->vn_mch,
+		    MAC_STAT_LINK_STATE);
 	}
+	mac_link_update(vnic->vn_mh, vnic->vn_ls);
 
 	rw_exit(&vnic_lock);
 
@@ -1072,6 +1075,18 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
 		err = mac_maxsdu_update(vn->vn_mh, mtu);
 		break;
 	}
+	case MAC_PROP_VN_PROMISC_FILTERED: {
+		boolean_t filtered;
+
+		if (pr_valsize < sizeof (filtered)) {
+			err = EINVAL;
+			break;
+		}
+
+		bcopy(pr_val, &filtered, sizeof (filtered));
+		mac_set_promisc_filtered(vn->vn_mch, filtered);
+		break;
+	}
 	case MAC_PROP_SECONDARY_ADDRS: {
 		mac_secondary_addr_t msa;
 
@@ -1079,6 +1094,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
 		err = vnic_set_secondary_macs(vn, &msa);
 		break;
 	}
+	case MAC_PROP_PRIVATE: {
+		long val, i;
+		const char *v;
+
+		if (vn->vn_link_id != DATALINK_INVALID_LINKID ||
+		    strcmp(pr_name, "_linkstate") != 0) {
+			err = ENOTSUP;
+			break;
+		}
+		/* The value must be NUL-terminated within pr_valsize bytes. */
+		for (v = pr_val, i = 0; i < pr_valsize; i++, v++) {
+			if (*v == '\0')
+				break;
+		}
+		if (i == pr_valsize) {
+			err = EINVAL;
+			break;
+		}
+		/* val is only written when ddi_strtol() succeeds. */
+		if (ddi_strtol(pr_val, (char **)NULL, 0, &val) != 0 ||
+		    (val != LINK_STATE_UP && val != LINK_STATE_DOWN)) {
+			err = EINVAL;
+			break;
+		}
+		vn->vn_ls = val;
+		mac_link_update(vn->vn_mh, vn->vn_ls);
+		break;
+	}
 	default:
 		err = ENOTSUP;
 		break;
@@ -1093,11 +1136,29 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 {
 	vnic_t		*vn = arg;
 	int 		ret = 0;
+	boolean_t	out;
 
 	switch (pr_num) {
+	case MAC_PROP_VN_PROMISC_FILTERED:
+		out = mac_get_promisc_filtered(vn->vn_mch);
+		ASSERT(pr_valsize >= sizeof (boolean_t));
+		bcopy(&out, pr_val, sizeof (boolean_t));
+		break;
 	case MAC_PROP_SECONDARY_ADDRS:
 		ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val);
 		break;
+	case MAC_PROP_PRIVATE:
+		if (vn->vn_link_id != DATALINK_INVALID_LINKID) {
+			ret = EINVAL;
+			break;
+		}
+
+		if (strcmp(pr_name, "_linkstate") != 0) {
+			ret = EINVAL;
+			break;
+		}
+		(void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls);
+		break;
 	default:
 		ret = ENOTSUP;
 		break;
@@ -1107,7 +1168,8 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 }
 
 /* ARGSUSED */
-static void vnic_m_propinfo(void *m_driver, const char *pr_name,
+static void
+vnic_m_propinfo(void *m_driver, const char *pr_name,
     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
 {
 	vnic_t		*vn = m_driver;
@@ -1150,6 +1212,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name,
 			mac_perim_exit(mph);
 		}
 		break;
+	case MAC_PROP_PRIVATE:
+		if (vn->vn_link_id != DATALINK_INVALID_LINKID)
+			break;
+
+		if (strcmp(pr_name, "_linkstate") == 0) {
+			char buf[16];
+
+			mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+			(void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls);
+			mac_prop_info_set_default_str(prh, buf);
+		}
+		break;
 	}
 }
 
@@ -1222,8 +1296,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type)
 		break;
 
 	case MAC_NOTE_LINK:
-		mac_link_update(vnic->vn_mh,
-		    mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE));
+		vnic->vn_ls = mac_client_stat_get(vnic->vn_mch,
+		    MAC_STAT_LINK_STATE);
+		mac_link_update(vnic->vn_mh, vnic->vn_ls);
 		break;
 
 	default:
diff --git a/usr/src/uts/common/io/vscan/vscan_svc.c b/usr/src/uts/common/io/vscan/vscan_svc.c
index a9817f571f..92eb0901c2 100644
--- a/usr/src/uts/common/io/vscan/vscan_svc.c
+++ b/usr/src/uts/common/io/vscan/vscan_svc.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 #include <sys/stat.h>
@@ -461,7 +462,7 @@ vscan_svc_scan_file(vnode_t *vp, cred_t *cr, int async)
 	boolean_t allow;
 	clock_t timeout, time_left;
 
-	if ((vp == NULL) || (vp->v_path == NULL) || cr == NULL)
+	if ((vp == NULL) || (vp->v_path == vn_vpath_empty) || cr == NULL)
 		return (0);
 
 	DTRACE_PROBE2(vscan__scan__file, char *, vp->v_path, int, async);
@@ -1080,7 +1081,6 @@ vscan_svc_exempt_file(vnode_t *vp, boolean_t *allow)
 	struct vattr attr;
 
 	ASSERT(vp != NULL);
-	ASSERT(vp->v_path != NULL);
 
 	attr.va_mask = AT_SIZE;
 
diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c
new file mode 100644
index 0000000000..2da310ab8d
--- /dev/null
+++ b/usr/src/uts/common/io/zfd.c
@@ -0,0 +1,1154 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * Zone File Descriptor Driver.
+ *
+ * This driver is derived from the zcons driver which is in turn derived from
+ * the pts/ptm drivers. The purpose is to expose file descriptors within the
+ * zone which are connected to zoneadmd and used for logging or an interactive
+ * connection to a process within the zone.
+ *
+ * Its implementation is straightforward. Each instance of the driver
+ * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd
+ * uses these devices unidirectionally to provide stdin, stdout and stderr to
+ * the process within the zone.
+ *
+ * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd,
+ * using the devctl framework; thus the driver does not need to maintain any
+ * sort of "admin" node.
+ *
+ * The driver shuttles I/O from master side to slave side and back.  In a break
+ * from the pts/ptm semantics, if one side is not open, I/O directed towards
+ * it will simply be discarded. This is so that if zoneadmd is not holding the
+ * master side fd open (i.e. it has died somehow), processes in the zone do not
+ * experience any errors and I/O to the fd does not cause the process to hang.
+ *
+ * The driver can also act as a multiplexer so that data written to the
+ * slave side within the zone is also redirected back to another zfd device
+ * inside the zone for consumption (i.e. it can be read). The intention is
+ * that a logging process within the zone can consume data that is being
+ * written by an application onto the primary stream. This is essentially
+ * a tee off of the primary stream into a log stream. This tee can also be
+ * configured to be flow controlled via an ioctl. Flow control happens on the
+ * primary stream and is used to ensure that the log stream receives all of
+ * the messages off the primary stream when consumption of the data off of
+ * the log stream gets behind. Configuring for flow control implies that the
+ * application writing to the primary stream will be blocked when the log
+ * consumer gets behind. Note that closing the log stream (e.g. when the zone
+ * halts) will cause the loss of all messages queued in the stream.
+ *
+ * The zone's zfd device configuration is driven by zoneadmd and a zone mode.
+ * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat
+ * of a misnomer since its purpose has evolved. The attribute can have a
+ * variety of values, but the lowest two positions are used to control how many
+ * zfd devices are created inside the zone and if the primary stream is a tty.
+ *
+ * Here is a summary of how the 4 modes control what zfd devices are created
+ * and how they're used:
+ *
+ *    t-:  1 stdio zdev  (0) configured as a tty
+ *    --:  3 stdio zdevs (0, 1, 2), not configured as a tty
+ *    tn:  1 stdio zdev  (0) configured as a tty, 1 additional zdev (1)
+ *    -n:  3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4)
+ *
+ * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex
+ * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are
+ * autopushed onto the stream when the slave side is opened. There is only a
+ * single zfd dev (0) needed for the primary stream.
+ *
+ * When the 'n' flag is set, it is assumed that output logging will be done
+ * within the zone itself. In this configuration 1 or 2 additional zfd devices,
+ * depending on tty mode ('t' flag) are created within the zone. An application
+ * can then configure the zfd streams driver into a multiplexer. Output from
+ * the stdout/stderr zfd(s) will be teed into the corresponding logging zfd(s)
+ * within the zone.
+ *
+ * The following is a diagram of how this works for a '-n' configuration:
+ *
+ *
+ *              zoneadmd (for zlogin -I stdout)
+ * GZ:             ^
+ *                 |
+ *     --------------------------
+ *                 ^
+ * NGZ:            |
+ *      app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout)
+ *
+ * There would be a similar path for the app's stderr into zfd4 for the logger
+ * to consume stderr.
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/debug.h>
+#include <sys/devops.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kstr.h>
+#include <sys/modctl.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/stream.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/zfd.h>
+#include <sys/vnode.h>
+#include <sys/fs/snode.h>
+#include <sys/zone.h>
+#include <sys/sdt.h>
+
+static kmutex_t zfd_mux_lock;
+
+static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int zfd_attach(dev_info_t *, ddi_attach_cmd_t);
+static int zfd_detach(dev_info_t *, ddi_detach_cmd_t);
+
+static int zfd_open(queue_t *, dev_t *, int, int, cred_t *);
+static int zfd_close(queue_t *, int, cred_t *);
+static void zfd_wput(queue_t *, mblk_t *);
+static void zfd_rsrv(queue_t *);
+static void zfd_wsrv(queue_t *);
+
+/*
+ * The instance number is encoded in the dev_t in the minor number; the lowest
+ * bit of the minor number is used to track the master vs. slave side of the
+ * fd. The rest of the bits in the minor number are the instance.
+ */
+#define	ZFD_MASTER_MINOR		0
+#define	ZFD_SLAVE_MINOR		1
+
+#define	ZFD_INSTANCE(x)		(getminor((x)) >> 1)
+#define	ZFD_NODE(x)		(getminor((x)) & 0x01)
+
+/*
+ * This macro converts a zfd_state_t pointer to the associated slave minor
+ * node's dev_t.
+ */
+#define	ZFD_STATE_TO_SLAVEDEV(x)	\
+	(makedevice(ddi_driver_major((x)->zfd_devinfo), \
+	(minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR)))
+
+int zfd_debug = 0;
+#define	DBG(a)		if (zfd_debug) cmn_err(CE_NOTE, a)
+#define	DBG1(a, b)	if (zfd_debug) cmn_err(CE_NOTE, a, b)
+
+/*
+ * ZFD Pseudo Terminal Module: stream data structure definitions,
+ * based on zcons.
+ */
+static struct module_info zfd_info = {
+	0x20FD,	/* ZOFD - 8445 */
+	"zfd",
+	0,		/* min packet size */
+	INFPSZ,		/* max packet size - infinity */
+	2048,		/* high water */
+	128		/* low water */
+};
+
+static struct qinit zfd_rinit = {
+	NULL,
+	(int (*)()) zfd_rsrv,
+	zfd_open,
+	zfd_close,
+	NULL,
+	&zfd_info,
+	NULL
+};
+
+static struct qinit zfd_winit = {
+	(int (*)()) zfd_wput,
+	(int (*)()) zfd_wsrv,
+	NULL,
+	NULL,
+	NULL,
+	&zfd_info,
+	NULL
+};
+
+static struct streamtab zfd_tab_info = {
+	&zfd_rinit,
+	&zfd_winit,
+	NULL,
+	NULL
+};
+
+#define	ZFD_CONF_FLAG	(D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL)
+
+/*
+ * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops)
+ */
+DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \
+	nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \
+	ddi_quiesce_not_needed);
+
+/*
+ * Module linkage information for the kernel.
+ */
+
+static struct modldrv modldrv = {
+	&mod_driverops, 	/* Type of module (this is a pseudo driver) */
+	"Zone FD driver",	/* description of module */
+	&zfd_ops		/* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+typedef enum {
+	ZFD_NO_MUX,
+	ZFD_PRIMARY_STREAM,
+	ZFD_LOG_STREAM
+} zfd_mux_type_t;
+
+typedef struct zfd_state {
+	dev_info_t *zfd_devinfo;	/* instance info */
+	queue_t *zfd_master_rdq;	/* GZ read queue */
+	queue_t *zfd_slave_rdq;		/* in-zone read queue */
+	int zfd_state;			/* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */
+	int zfd_tty;			/* ZFD_MAKETTY - strm mods will push */
+	boolean_t zfd_is_flowcon;	/* primary stream flow stopped */
+	boolean_t zfd_allow_flowcon;	/* use flow control */
+	zfd_mux_type_t zfd_muxt;	/* state type: none, primary, log */
+	struct zfd_state *zfd_inst_pri; /* log state's primary ptr */
+	struct zfd_state *zfd_inst_log;	/* primary state's log ptr */
+} zfd_state_t;
+
+#define	ZFD_STATE_MOPEN	0x01
+#define	ZFD_STATE_SOPEN	0x02
+
+static void *zfd_soft_state;
+
+/*
+ * List of STREAMS modules that are autopushed onto a slave instance when it's
+ * opened, but only if the ZFD_MAKETTY ioctl has first been received by the
+ * master.
+ */
+static char *zfd_mods[] = {
+	"ptem",
+	"ldterm",
+	"ttcompat",
+	NULL
+};
+
+int
+_init(void)
+{
+	int err;
+
+	err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), 0);
+	if (err != 0)
+		return (err);
+
+	/* zfd_mux_lock must exist before mod_install() exposes the driver. */
+	mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL);
+	if ((err = mod_install(&modlinkage)) != 0) {
+		mutex_destroy(&zfd_mux_lock);
+		ddi_soft_state_fini(&zfd_soft_state);
+	}
+	return (err);
+}
+
+int
+_fini(void)
+{
+	int err;
+
+	if ((err = mod_remove(&modlinkage)) != 0) {
+		return (err);
+	}
+
+	ddi_soft_state_fini(&zfd_soft_state);
+	mutex_destroy(&zfd_mux_lock);
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+static int
+zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	zfd_state_t *zfds;
+	int instance;
+	char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN];
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	instance = ddi_get_instance(dip);
+	if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	(void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME,
+	    instance);
+	(void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME,
+	    instance);
+
+	/*
+	 * Create the master and slave minor nodes.
+	 */
+	if ((ddi_create_minor_node(dip, slavenm, S_IFCHR,
+	    instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) ||
+	    (ddi_create_minor_node(dip, masternm, S_IFCHR,
+	    instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) {
+		ddi_remove_minor_node(dip, NULL);
+		ddi_soft_state_free(zfd_soft_state, instance);
+		return (DDI_FAILURE);
+	}
+
+	VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL);
+	zfds->zfd_devinfo = dip;
+	zfds->zfd_tty = 0;
+	zfds->zfd_muxt = ZFD_NO_MUX;
+	zfds->zfd_inst_log = NULL;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Detach an instance.  Only DDI_DETACH is handled, and the request is
+ * refused while either the master or the slave side is still marked open.
+ * On success the minor nodes and the soft state are released.
+ */
+static int
+zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	zfd_state_t *sp;
+	int inst;
+
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	inst = ddi_get_instance(dip);
+	sp = ddi_get_soft_state(zfd_soft_state, inst);
+	if (sp == NULL)
+		return (DDI_FAILURE);
+
+	if ((sp->zfd_state & (ZFD_STATE_MOPEN | ZFD_STATE_SOPEN)) != 0) {
+		DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip);
+		return (DDI_FAILURE);
+	}
+
+	ddi_remove_minor_node(dip, NULL);
+	ddi_soft_state_free(zfd_soft_state, inst);
+	return (DDI_SUCCESS);
+}
+
+/*
+ * zfd_getinfo()
+ *	getinfo(9e) entrypoint.
+ *
+ *	DDI_INFO_DEVT2INSTANCE stores the instance number decoded from the
+ *	dev_t in 'arg'; DDI_INFO_DEVT2DEVINFO stores the dev_info pointer
+ *	recorded in that instance's soft state.  Any other command, or an
+ *	instance with no soft state, yields DDI_FAILURE.
+ */
+/*ARGSUSED*/
+static int
+zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	int inst = ZFD_INSTANCE((dev_t)arg);
+	zfd_state_t *sp;
+
+	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
+		*result = (void *)(uintptr_t)inst;
+		return (DDI_SUCCESS);
+	}
+
+	if (infocmd != DDI_INFO_DEVT2DEVINFO)
+		return (DDI_FAILURE);
+
+	sp = ddi_get_soft_state(zfd_soft_state, inst);
+	if (sp == NULL)
+		return (DDI_FAILURE);
+
+	*result = sp->zfd_devinfo;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Map a queue to its counterpart on the other side of the master/slave
+ * pair: a read queue maps to the partner's read queue, a write queue to the
+ * partner's write queue (e.g. the slave's write queue yields the master's
+ * write queue).  NULL is returned when the partner's read queue is not set,
+ * or when qp belongs to neither side.
+ */
+static queue_t *
+zfd_switch(queue_t *qp)
+{
+	zfd_state_t *zfds = qp->q_ptr;
+	queue_t *mrq, *srq;
+
+	ASSERT(zfds != NULL);
+
+	mrq = zfds->zfd_master_rdq;
+	srq = zfds->zfd_slave_rdq;
+
+	/* master side: read queue first, then write queue */
+	if (qp == mrq)
+		return (srq);
+	if (OTHERQ(qp) == mrq && srq != NULL)
+		return (OTHERQ(srq));
+
+	/* slave side: read queue first, then write queue */
+	if (qp == srq)
+		return (mrq);
+	if (OTHERQ(qp) == srq && mrq != NULL)
+		return (OTHERQ(mrq));
+
+	return (NULL);
+}
+
+/*
+ * Debugging aid: name the side ("master" or "slave") that owns this queue.
+ * Either queue of a pair may be passed in.
+ */
+static const char *
+zfd_side(queue_t *qp)
+{
+	zfd_state_t *zfds = qp->q_ptr;
+	queue_t *mrq;
+
+	ASSERT(zfds != NULL);
+
+	mrq = zfds->zfd_master_rdq;
+	if (qp != mrq && OTHERQ(qp) != mrq) {
+		ASSERT(qp == zfds->zfd_slave_rdq ||
+		    OTHERQ(qp) == zfds->zfd_slave_rdq);
+		return ("slave");
+	}
+
+	return ("master");
+}
+
+/*
+ * Open the master side of an instance.  The master may be held open only
+ * once (EBUSY otherwise).  On success the soft state is attached to both
+ * queues, the queue pair is enabled, and an M_SETOPTS message carrying the
+ * stream head water marks is sent upstream.  Returns ENOMEM if that message
+ * cannot be allocated.
+ */
+/*ARGSUSED*/
+static int
+zfd_master_open(zfd_state_t *zfds,
+    queue_t	*rqp,	/* pointer to the read side queue */
+    dev_t	*devp,	/* pointer to stream tail's dev */
+    int		oflag,	/* the user open(2) supplied flags */
+    int		sflag,	/* open state flag */
+    cred_t	*credp)	/* credentials */
+{
+	struct stroptions *opts;
+	mblk_t *optmp;
+
+	/*
+	 * The master side is exclusive; zoneadmd for the zone is expected
+	 * to be its only consumer.
+	 */
+	if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0)
+		return (EBUSY);
+
+	optmp = allocb(sizeof (struct stroptions), BPRI_MED);
+	if (optmp == NULL) {
+		DBG("zfd_master_open(): mop allocation failed\n");
+		return (ENOMEM);
+	}
+
+	zfds->zfd_state |= ZFD_STATE_MOPEN;
+
+	/*
+	 * The soft state is the driver private data for both the read and
+	 * the write queue.
+	 */
+	rqp->q_ptr = zfds;
+	WR(rqp)->q_ptr = zfds;
+	qprocson(rqp);
+
+	/*
+	 * Publish the master read queue only now: once it is visible the
+	 * slave may send messages to the master, and the master cannot
+	 * process them before qprocson() has plumbed it into the STREAM.
+	 */
+	zfds->zfd_master_rdq = rqp;
+
+	/*
+	 * Tell the stream head about the hi/lo water marks, and make this a
+	 * controlling tty unless the opener asked for FNOCTTY.
+	 */
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (struct stroptions);
+	opts = (struct stroptions *)(void *)optmp->b_rptr;
+	opts->so_flags = SO_HIWAT | SO_LOWAT;
+	if ((oflag & FNOCTTY) == 0)
+		opts->so_flags |= SO_ISTTY;
+	opts->so_hiwat = 512;
+	opts->so_lowat = 256;
+	putnext(rqp, optmp);
+
+	return (0);
+}
+
+/*
+ * Open the slave side of an instance.
+ *
+ * The slave may be opened any number of times; only the first open does any
+ * work.  A log stream may only be opened read-only (EINVAL otherwise).  When
+ * the instance has been marked as a tty, an autopush configuration for the
+ * slave minor is installed first.  Returns ENOMEM if the M_SETOPTS message
+ * cannot be allocated and EIO if the autopush configuration cannot be set.
+ */
+/*ARGSUSED*/
+static int
+zfd_slave_open(zfd_state_t *zfds,
+    queue_t	*rqp,	/* pointer to the read side queue */
+    dev_t	*devp,	/* pointer to stream tail's dev */
+    int		oflag,	/* the user open(2) supplied flags */
+    int		sflag,	/* open state flag */
+    cred_t	*credp)	/* credentials */
+{
+	mblk_t *mop;
+	struct stroptions *sop;
+
+	/*
+	 * The slave side can be opened as many times as needed.
+	 */
+	if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+		ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds));
+		return (0);
+	}
+
+	/* A log stream is read-only */
+	if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+	    (oflag & (FREAD | FWRITE)) != FREAD)
+		return (EINVAL);
+
+	/*
+	 * Allocate the M_SETOPTS message before touching the autopush
+	 * configuration.  Were the allocation to fail after SET_AUTOPUSH had
+	 * succeeded, the configuration would be left behind with the slave
+	 * still closed, and zfd_close() (the only place that clears it) would
+	 * never run for this failed open.
+	 */
+	if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+		DBG("zfd_slave_open(): mop allocation failed\n");
+		return (ENOMEM);
+	}
+
+	if (zfds->zfd_tty == 1) {
+		major_t major;
+		minor_t minor;
+		minor_t lastminor;
+		uint_t anchorindex;
+
+		/*
+		 * Set up sad(7D) so that the necessary STREAMS modules will
+		 * be in place.  A wrinkle is that 'ptem' must be anchored
+		 * in place (see streamio(7i)) because we always want the
+		 * fd to have terminal semantics.
+		 */
+		minor =
+		    ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR;
+		major = ddi_driver_major(zfds->zfd_devinfo);
+		lastminor = 0;
+		anchorindex = 1;
+		if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor,
+		    &anchorindex, zfd_mods) != 0) {
+			DBG("zfd_slave_open(): kstr_autopush() failed\n");
+			/* nothing else owns the message yet */
+			freemsg(mop);
+			return (EIO);
+		}
+	}
+
+	zfds->zfd_state |= ZFD_STATE_SOPEN;
+
+	/*
+	 * q_ptr stores driver private data; stash the soft state data on both
+	 * read and write sides of the queue.
+	 */
+	WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+
+	qprocson(rqp);
+
+	/*
+	 * Must follow qprocson(), since we aren't ready to process until then.
+	 */
+	zfds->zfd_slave_rdq = rqp;
+
+	/*
+	 * set up hi/lo water marks on stream head read queue and add
+	 * controlling tty as needed.
+	 */
+	mop->b_datap->db_type = M_SETOPTS;
+	mop->b_wptr += sizeof (struct stroptions);
+	sop = (struct stroptions *)(void *)mop->b_rptr;
+	sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+	sop->so_hiwat = 512;
+	sop->so_lowat = 256;
+	putnext(rqp, mop);
+
+	return (0);
+}
+
+/*
+ * open(9e) entrypoint; checks sflag, and rejects anything unordinary.
+ *
+ * The minor number selects the master or the slave open routine; any other
+ * node type, or an instance without soft state, fails with ENXIO.  After a
+ * successful slave open of a log stream, flow control on the associated
+ * primary instance is released so that the primary stream can flow.
+ */
+static int
+zfd_open(queue_t *rqp,		/* pointer to the read side queue */
+	dev_t   *devp,		/* pointer to stream tail's dev */
+	int	oflag,		/* the user open(2) supplied flags */
+	int	sflag,		/* open state flag */
+	cred_t  *credp)		/* credentials */
+{
+	int instance = ZFD_INSTANCE(*devp);
+	zfd_state_t *zfds, *pri;
+	int ret;
+
+	if (sflag != 0)
+		return (EINVAL);
+
+	zfds = ddi_get_soft_state(zfd_soft_state, instance);
+	if (zfds == NULL)
+		return (ENXIO);
+
+	if (ZFD_NODE(*devp) == ZFD_MASTER_MINOR)
+		return (zfd_master_open(zfds, rqp, devp, oflag, sflag, credp));
+
+	if (ZFD_NODE(*devp) != ZFD_SLAVE_MINOR)
+		return (ENXIO);
+
+	ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp);
+	if (ret != 0 || zfds->zfd_muxt != ZFD_LOG_STREAM)
+		return (ret);
+
+	/*
+	 * The log stream was just opened; if flow control has been enabled
+	 * on the primary, let the primary stream start flowing.
+	 */
+	pri = zfds->zfd_inst_pri;
+	if (pri->zfd_allow_flowcon) {
+		pri->zfd_is_flowcon = B_FALSE;
+		if (pri->zfd_master_rdq != NULL)
+			qenable(RD(pri->zfd_master_rdq));
+	}
+
+	return (ret);
+}
+
+/*
+ * close(9e) entrypoint.
+ *
+ * rqp is the read queue being closed; its q_ptr holds the instance's soft
+ * state.  The queue is matched against the recorded master and slave read
+ * queues to decide which side is going away; if it matches neither, nothing
+ * is done.  Always returns 0.
+ */
+/*ARGSUSED1*/
+static int
+zfd_close(queue_t *rqp, int flag, cred_t *credp)
+{
+	queue_t *wqp;
+	mblk_t	*bp;
+	zfd_state_t *zfds;
+	major_t major;
+	minor_t minor;
+
+	zfds = (zfd_state_t *)rqp->q_ptr;
+
+	if (rqp == zfds->zfd_master_rdq) {
+		DBG("Closing master side");
+
+		/*
+		 * Unpublish the master queue and clear the open flag before
+		 * the queue pair is turned off below.
+		 */
+		zfds->zfd_master_rdq = NULL;
+		zfds->zfd_state &= ~ZFD_STATE_MOPEN;
+
+		/*
+		 * qenable slave side write queue so that it can flush
+		 * its messages as master's read queue is going away
+		 */
+		if (zfds->zfd_slave_rdq != NULL) {
+			qenable(WR(zfds->zfd_slave_rdq));
+		}
+
+		qprocsoff(rqp);
+		WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+	} else if (rqp == zfds->zfd_slave_rdq) {
+
+		DBG("Closing slave side");
+		zfds->zfd_state &= ~ZFD_STATE_SOPEN;
+		zfds->zfd_slave_rdq = NULL;
+
+		/*
+		 * Drain whatever is still queued on the slave's write side:
+		 * hand it to the master if the master is open, otherwise nak
+		 * ioctls and free everything else.
+		 */
+		wqp = WR(rqp);
+		while ((bp = getq(wqp)) != NULL) {
+			if (zfds->zfd_master_rdq != NULL)
+				putnext(zfds->zfd_master_rdq, bp);
+			else if (bp->b_datap->db_type == M_IOCTL)
+				miocnak(wqp, bp, 0, 0);
+			else
+				freemsg(bp);
+		}
+
+		/*
+		 * Qenable master side write queue so that it can flush its
+		 * messages as slaves's read queue is going away.
+		 */
+		if (zfds->zfd_master_rdq != NULL)
+			qenable(WR(zfds->zfd_master_rdq));
+
+		/*
+		 * Qenable primary stream if necessary.
+		 *
+		 * This is the slave of a log stream: with the reader gone,
+		 * drop any flow control held on the primary instance and
+		 * kick its master read queue.
+		 */
+		if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+		    zfds->zfd_inst_pri->zfd_allow_flowcon) {
+			zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE;
+			if (zfds->zfd_inst_pri->zfd_master_rdq != NULL)
+				qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq));
+		}
+
+		qprocsoff(rqp);
+		WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+		if (zfds->zfd_tty == 1) {
+			/*
+			 * Clear the sad configuration so that reopening
+			 * doesn't fail to set up sad configuration.
+			 */
+			major = ddi_driver_major(zfds->zfd_devinfo);
+			minor = ddi_get_instance(zfds->zfd_devinfo) << 1 |
+			    ZFD_SLAVE_MINOR;
+			(void) kstr_autopush(CLR_AUTOPUSH, &major, &minor,
+			    NULL, NULL, NULL);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Process an M_FLUSH message that arrived on write queue qp.
+ *
+ * The first byte of the message holds the FLUSHR/FLUSHW flags and is
+ * rewritten in place as the message is processed:
+ *
+ *  - FLUSHW: this queue's data is flushed and FLUSHW is cleared.  If that
+ *    was the only flag, the message is turned into a FLUSHR and handed to
+ *    the other side, when there is one.  If FLUSHR was set as well, a copy
+ *    (made after FLUSHW was cleared) is handed to the other side.
+ *  - FLUSHR: the message is sent back up this side with qreply().
+ *
+ * The message is consumed on every path: forwarded, replied, or freed.
+ */
+static void
+handle_mflush(queue_t *qp, mblk_t *mp)
+{
+	mblk_t *nmp;
+	DBG1("M_FLUSH on %s side", zfd_side(qp));
+
+	if (*mp->b_rptr & FLUSHW) {
+		DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp));
+		flushq(qp, FLUSHDATA);
+		*mp->b_rptr &= ~FLUSHW;
+		if ((*mp->b_rptr & FLUSHR) == 0) {
+			/*
+			 * FLUSHW only. Change to FLUSHR and putnext other side,
+			 * then we are done.
+			 */
+			*mp->b_rptr |= FLUSHR;
+			if (zfd_switch(RD(qp)) != NULL) {
+				putnext(zfd_switch(RD(qp)), mp);
+				return;
+			}
+			/*
+			 * No other side: fall through with FLUSHR now set, so
+			 * the message is turned around below.
+			 */
+		} else if ((zfd_switch(RD(qp)) != NULL) &&
+		    (nmp = copyb(mp)) != NULL) {
+			/*
+			 * It is a FLUSHRW; we copy the mblk and send
+			 * it to the other side, since we still need to use
+			 * the mblk in FLUSHR processing, below.
+			 */
+			putnext(zfd_switch(RD(qp)), nmp);
+		}
+	}
+
+	if (*mp->b_rptr & FLUSHR) {
+		DBG("qreply(qp) turning FLUSHR around\n");
+		qreply(qp, mp);
+		return;
+	}
+	/* neither flag left to act on */
+	freemsg(mp);
+}
+
+/*
+ * Evaluate the various conditionals to determine if we're teeing into a log
+ * stream and if the primary stream should be flow controlled. This function
+ * can set the zfd_is_flowcon flag as a side effect.
+ *
+ * When teeing with flow control, we always queue the teed msg here and if
+ * the queue is getting full, we set zfd_is_flowcon. The primary stream will
+ * always queue when zfd_is_flowcon and will also not be served when
+ * zfd_is_flowcon is set. This causes backpressure on the primary stream
+ * until the teed queue can drain.
+ *
+ * zfds is the instance the message was written to, type is the message's
+ * db_type and mp the message itself.  mp is never consumed here; only a
+ * duplicate is passed to the log stream.  Nothing is done unless zfds is a
+ * primary stream with a log stream attached and the message is M_DATA.
+ */
+static void
+zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp)
+{
+	queue_t *log_rdq;
+	queue_t *log_qp;
+	zfd_state_t *log_zfds;
+	mblk_t *lmp;
+
+	if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM)
+		return;
+
+	if (type != M_DATA)
+		return;
+
+	log_zfds = zfds->zfd_inst_log;
+	if (log_zfds == NULL)
+		return;
+
+	ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM);
+
+	/*
+	 * The zfd_slave_rdq is null until the log dev is opened in the zone.
+	 * zfd_slave_open() sets ZFD_STATE_SOPEN before it assigns
+	 * zfd_slave_rdq, so the flag alone does not guarantee a usable queue.
+	 * Read the pointer once and treat a NULL queue like a log stream
+	 * that is not open, rather than dereferencing it.
+	 */
+	log_rdq = log_zfds->zfd_slave_rdq;
+	if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0 || log_rdq == NULL) {
+		if (zfds->zfd_allow_flowcon)
+			zfds->zfd_is_flowcon = B_TRUE;
+		return;
+	}
+
+	log_qp = RD(log_rdq);
+	DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds);
+
+	if (!zfds->zfd_allow_flowcon) {
+		/*
+		 * We're not supposed to tee with flow control and the tee is
+		 * full so we skip teeing into the log stream.
+		 */
+		if ((log_qp->q_flag & QFULL) != 0)
+			return;
+	}
+
+	/*
+	 * Tee the message into the log stream.
+	 */
+	lmp = dupmsg(mp);
+	if (lmp == NULL) {
+		/* could not duplicate; hold the primary back if allowed */
+		if (zfds->zfd_allow_flowcon)
+			zfds->zfd_is_flowcon = B_TRUE;
+		return;
+	}
+
+	/*
+	 * Send the duplicate straight up only when nothing is already queued
+	 * ahead of it; otherwise queue it behind the earlier messages.
+	 */
+	if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) {
+		putnext(log_qp, lmp);
+	} else {
+		if (putq(log_qp, lmp) == 0) {
+			/* The logger queue is full, free the msg. */
+			freemsg(lmp);
+		}
+		/*
+		 * If we're supposed to tee with flow control and the tee is
+		 * over the high water mark then we want the primary stream to
+		 * stop flowing. We'll stop queueing the primary stream after
+		 * the log stream has drained.
+		 */
+		if (zfds->zfd_allow_flowcon &&
+		    log_qp->q_count > log_qp->q_hiwat) {
+			zfds->zfd_is_flowcon = B_TRUE;
+		}
+	}
+}
+
+/*
+ * wput(9E) is symmetric for master and slave sides, so this handles both
+ * without splitting the codepath.  (The only exception to this is the
+ * processing of zfd ioctls, which is restricted to the master side.)
+ *
+ * zfd_wput() looks at the other side; if there is no process holding that
+ * side open, it frees the message.  This prevents processes from hanging
+ * if no one is holding open the fd.  Otherwise, it putnext's high
+ * priority messages, putnext's normal messages if possible, and otherwise
+ * enqueues the messages; in the case that something is enqueued, wsrv(9E)
+ * will take care of eventually shuttling I/O to the other side.
+ *
+ * When configured as a multiplexer, then anything written to the stream
+ * from inside the zone is also teed off to the corresponding log stream
+ * for consumption within the zone (i.e. the log stream can be read, but never
+ * written to, by an application inside the zone).
+ *
+ * qp is the write queue the message arrived on (its q_ptr is the instance's
+ * soft state) and mp is the message.  mp is consumed on every path: acked,
+ * nak'ed, freed, forwarded with putnext(), or queued on qp.
+ */
+static void
+zfd_wput(queue_t *qp, mblk_t *mp)
+{
+	unsigned char type = mp->b_datap->db_type;
+	zfd_state_t *zfds;
+	struct iocblk *iocbp;
+	boolean_t must_queue = B_FALSE;
+
+	ASSERT(qp->q_ptr);
+
+	DBG1("entering zfd_wput, %s side", zfd_side(qp));
+
+	/*
+	 * Process zfd ioctl messages if qp is the master side's write queue.
+	 *
+	 * NOTE(review): the code below only tests the message type; nothing
+	 * here compares qp against the master's write queue, so as written
+	 * these ioctls are also handled when they arrive on the slave side.
+	 * Confirm whether a master-side check was intended.
+	 */
+	zfds = (zfd_state_t *)qp->q_ptr;
+
+	if (type == M_IOCTL) {
+		iocbp = (struct iocblk *)(void *)mp->b_rptr;
+
+		/*
+		 * NOTE(review): several failure paths below pass an errno
+		 * value (EINVAL, ENOTTY) as the last argument of miocack().
+		 * Per miocack(9F) that argument is the ioctl return value,
+		 * not an error; miocnak() is what this file uses elsewhere
+		 * for negative acknowledgement.  Confirm what userland
+		 * consumers expect before changing it.
+		 */
+		switch (iocbp->ioc_cmd) {
+		case ZFD_MAKETTY:
+			/* takes effect on the next first open of the slave */
+			zfds->zfd_tty = 1;
+			miocack(qp, mp, 0, 0);
+			return;
+		case ZFD_EOF:
+			/* send M_HANGUP up the slave side, if it is open */
+			if (zfds->zfd_slave_rdq != NULL)
+				(void) putnextctl(zfds->zfd_slave_rdq,
+				    M_HANGUP);
+			miocack(qp, mp, 0, 0);
+			return;
+		case ZFD_HAS_SLAVE:
+			/* result is 0 when the slave is open, else ENOTTY */
+			if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+				miocack(qp, mp, 0, 0);
+			} else {
+				miocack(qp, mp, 0, ENOTTY);
+			}
+			return;
+		case ZFD_MUX: {
+			/*
+			 * Setup the multiplexer configuration for the two
+			 * streams.
+			 *
+			 * We expect to be called on the stream that will
+			 * become the log stream and be passed one data block
+			 * with the minor number of the slave side of the
+			 * primary stream.
+			 */
+			int to;
+			int instance;
+			zfd_state_t *prim_zfds;
+
+			if (iocbp->ioc_count != TRANSPARENT ||
+			    mp->b_cont == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			/* Get the primary slave minor device number */
+			to = *(int *)mp->b_cont->b_rptr;
+			instance = ZFD_INSTANCE(to);
+
+			if ((prim_zfds = ddi_get_soft_state(zfd_soft_state,
+			    instance)) == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			/*
+			 * Disallow changing primary/log once set.
+			 *
+			 * zfd_mux_lock makes the check and the linking of the
+			 * two instances a single step.
+			 * NOTE(review): nothing rejects prim_zfds == zfds
+			 * (an instance muxed to itself) - confirm callers
+			 * never pass their own minor.
+			 */
+			mutex_enter(&zfd_mux_lock);
+			if (zfds->zfd_muxt != ZFD_NO_MUX ||
+			    prim_zfds->zfd_muxt != ZFD_NO_MUX) {
+				mutex_exit(&zfd_mux_lock);
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			zfds->zfd_muxt = ZFD_LOG_STREAM;
+			zfds->zfd_inst_pri = prim_zfds;
+			prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM;
+			prim_zfds->zfd_inst_log = zfds;
+			mutex_exit(&zfd_mux_lock);
+			DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds,
+			    void *, zfds);
+
+			miocack(qp, mp, 0, 0);
+			return;
+			}
+		case ZFD_MUX_FLOWCON: {
+			/*
+			 * We expect this ioctl to be issued against the
+			 * log stream. We don't use the primary stream since
+			 * there can be other streams modules pushed onto that
+			 * stream which would interfere with the ioctl.
+			 */
+			int val;
+			zfd_state_t *prim_zfds;
+
+			if (iocbp->ioc_count != TRANSPARENT ||
+			    mp->b_cont == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			if (zfds->zfd_muxt != ZFD_LOG_STREAM) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+			prim_zfds = zfds->zfd_inst_pri;
+
+			/* Get the flow control setting */
+			val = *(int *)mp->b_cont->b_rptr;
+			if (val != 0 && val != 1) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			/* turning flow control off also releases it */
+			prim_zfds->zfd_allow_flowcon = (boolean_t)val;
+			if (!prim_zfds->zfd_allow_flowcon)
+				prim_zfds->zfd_is_flowcon = B_FALSE;
+
+			DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds);
+			miocack(qp, mp, 0, 0);
+			return;
+			}
+		default:
+			/* not a zfd ioctl; treated like any other message */
+			break;
+		}
+	}
+
+	/* if on the write side, may need to tee */
+	if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) {
+		/* tee output to any attached log stream */
+		zfd_tee_handler(zfds, type, mp);
+
+		/* high-priority msgs are not subject to flow control */
+		if (zfds->zfd_is_flowcon && type == M_DATA)
+			must_queue = B_TRUE;
+	}
+
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG1("wput to %s side (no one listening)", zfd_side(qp));
+		switch (type) {
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		case M_IOCTL:
+			miocnak(qp, mp, 0, 0);
+			break;
+		default:
+			freemsg(mp);
+			break;
+		}
+		return;
+	}
+
+	if (type >= QPCTL) {
+		DBG1("(hipri) wput, %s side", zfd_side(qp));
+		switch (type) {
+		case M_READ:		/* supposedly from ldterm? */
+			DBG("zfd_wput: tossing M_READ\n");
+			freemsg(mp);
+			break;
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		default:
+			/*
+			 * Put this to the other side.
+			 */
+			ASSERT(zfd_switch(RD(qp)) != NULL);
+			putnext(zfd_switch(RD(qp)), mp);
+			break;
+		}
+		DBG1("done (hipri) wput, %s side", zfd_side(qp));
+		return;
+	}
+
+	/*
+	 * If the primary stream has been stopped for flow control then
+	 * enqueue the msg, otherwise only putnext if there isn't already
+	 * something in the queue. If we don't do this then things would wind
+	 * up out of order.
+	 */
+	if (!must_queue && qp->q_first == NULL &&
+	    bcanputnext(RD(zfd_switch(qp)), mp->b_band)) {
+		putnext(RD(zfd_switch(qp)), mp);
+	} else {
+		/*
+		 * zfd_wsrv expects msgs queued on the primary queue. Those
+		 * will be handled by zfd_wsrv after zfd_rsrv performs the
+		 * qenable on the proper queue.
+		 */
+		(void) putq(qp, mp);
+	}
+
+	DBG1("done wput, %s side", zfd_side(qp));
+}
+
+/*
+ * Read server
+ *
+ * Primary (or un-muxed) stream:
+ * rsrv(9E) is symmetric for master and slave.  All that happens is that the
+ * partner's write queue is enabled, which makes the partner push whatever it
+ * has queued on its write side towards this queue's read side.
+ *
+ * Log stream (with its slave read queue set):
+ * The messages teed off to the log stream were queued internally, so they
+ * are passed along from here.  Once the log queue has dropped below its low
+ * water mark, flow control on the primary is released.  If the log device
+ * is no longer open, the queued messages are discarded instead.
+ */
+static void
+zfd_rsrv(queue_t *qp)
+{
+	zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+	zfd_state_t *pzfds;
+	queue_t *log_qp, *peer;
+	mblk_t *mp;
+
+	if (zfds->zfd_muxt != ZFD_LOG_STREAM || zfds->zfd_slave_rdq == NULL) {
+		/*
+		 * Care must be taken here, as either of the master or slave
+		 * side qptr could be NULL.
+		 */
+		ASSERT(qp == zfds->zfd_master_rdq ||
+		    qp == zfds->zfd_slave_rdq);
+		peer = zfd_switch(qp);
+		if (peer == NULL) {
+			DBG("zfd_rsrv: other side isn't listening\n");
+			return;
+		}
+		qenable(WR(peer));
+		return;
+	}
+
+	/*
+	 * log stream server
+	 */
+	log_qp = RD(zfds->zfd_slave_rdq);
+
+	if ((zfds->zfd_state & ZFD_STATE_SOPEN) == 0) {
+		/* No longer open, drain the queue */
+		for (mp = getq(qp); mp != NULL; mp = getq(qp))
+			freemsg(mp);
+		flushq(qp, FLUSHALL);
+		return;
+	}
+
+	pzfds = zfds->zfd_inst_pri;
+
+	/* pass messages up until the stream above pushes back */
+	for (mp = getq(qp); mp != NULL; mp = getq(qp)) {
+		if (!bcanputnext(log_qp, mp->b_band)) {
+			(void) putbq(log_qp, mp);
+			break;
+		}
+		putnext(log_qp, mp);
+	}
+
+	if (log_qp->q_count < log_qp->q_lowat) {
+		DTRACE_PROBE(zfd__flow__on);
+		pzfds->zfd_is_flowcon = B_FALSE;
+		if (pzfds->zfd_master_rdq != NULL)
+			qenable(RD(pzfds->zfd_master_rdq));
+	}
+}
+
+/*
+ * Write server
+ *
+ * Symmetric for master and slave, so one code path serves both.
+ *
+ * Messages queued on this write queue are passed to the other side with
+ * putnext() for as long as that side accepts them and the instance is not
+ * flow controlled; whatever cannot be sent stays on this queue.  If the
+ * other side has no read queue, everything queued here is discarded
+ * (ioctls are nak'ed).
+ */
+static void
+zfd_wsrv(queue_t *qp)
+{
+	zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+	queue_t *peer_rq;
+	mblk_t *mp;
+
+	ASSERT(zfds != NULL);
+
+	/*
+	 * Partner has no read queue, so take the data, and throw it away.
+	 */
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG("zfd_wsrv: other side isn't listening");
+		for (mp = getq(qp); mp != NULL; mp = getq(qp)) {
+			if (mp->b_datap->db_type != M_IOCTL) {
+				freemsg(mp);
+				continue;
+			}
+			miocnak(qp, mp, 0, 0);
+		}
+		flushq(qp, FLUSHALL);
+		return;
+	}
+
+	peer_rq = RD(zfd_switch(qp));
+
+	/*
+	 * Move messages across until flow control kicks in, the queue runs
+	 * dry, or the other side cannot take any more.
+	 */
+	for (;;) {
+		if (zfds->zfd_is_flowcon)
+			break;
+
+		mp = getq(qp);
+		if (mp == NULL)
+			break;
+
+		/*
+		 * Due to the way zfd_wput is implemented, we should never
+		 * see a high priority control message here.
+		 */
+		ASSERT(mp->b_datap->db_type < QPCTL);
+
+		if (!bcanputnext(peer_rq, mp->b_band)) {
+			(void) putbq(qp, mp);
+			break;
+		}
+		putnext(peer_rq, mp);
+	}
+}
diff --git a/usr/src/uts/common/mapfiles/README b/usr/src/uts/common/mapfiles/README
new file mode 100644
index 0000000000..5b65771325
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/README
@@ -0,0 +1,68 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+Kernel Module Build Time Symbol Verification
+--------------------------------------------
+
+Historically, kernel modules have all been built as relocatable objects.
+They are not dynamic objects and dependency information is always noted
+in individual makefiles. Along with this, there has never been any
+verification of the symbols that are being used. This means that it's
+possible for a kernel module author to refer to a symbol that doesn't
+exist and not find out until they try to install the module.
+
+To help find these problems at build time, we provide an opt-in system
+for modules to use, leveraging the link-editor's '-z defs' option.  This
+option ensures that there are no undefined symbols at link-edit time.
+To supply these definitions we supply a series of mapfiles in this
+directory.
+
+These mapfiles are not the traditional versioning mapfiles like those in
+usr/src/lib/README.mapfiles! Please review the following differences
+closely:
+
+* These mapfiles do not declare any versions!
+* These mapfiles do not use the 'SYMBOL_VERSION' directive, instead they
+  use the 'SYMBOL_SCOPE' directive.
+* These mapfiles do not hide symbols! Library mapfiles always have
+  something to catch all local symbols. That should *never* be used
+  here. These mapfiles should not affect visibility.
+* All symbols in these mapfiles should be marked 'EXTERN' to indicate
+  that they are not provided by the kernel module but by another.
+* These mapfiles do not declare what is or isn't a public interface,
+  though they are often grouped around interfaces, to make it easier for
+  a driver author to get this right.
+
+Mapfiles are organized based on kernel module. For example the GLDv3
+device driver interface is provided by the 'mac' module and thus is
+found in the file 'mac.mapfile'. The DDI is currently in the 'ddi'
+mapfile. Functions that are found in genunix and unix that aren't in
+the DDI should not be put in that mapfile.
+
+Note, the existing files may not be complete. These are intended to only
+have the public interfaces provided by modules and thus should not
+include every symbol in them. As the need arises, add new symbols or
+modules as appropriate.
+
+To opt a module into this, first declare a series of MAPFILES that they
+should check against in the module. This should be a series of one or
+more files, for example:
+
+MAPFILES += ddi mac
+
+Next, you should add an include of Makefile.mapfile right before you
+include Makefile.targ. You can do this with the following line:
+
+include $(UTSBASE)/Makefile.mapfile
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
new file mode 100644
index 0000000000..25aa8ab045
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -0,0 +1,190 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING:  STOP NOW.  DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/mapfiles/README
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains core functions provided by the DDI and also items
+# required as part of the platform's runtime ABI (think compiler
+# functions).
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+    global:
+	__divdi3			{ FLAGS = EXTERN };
+	__stack_chk_fail		{ FLAGS = EXTERN };
+	__stack_chk_guard		{ FLAGS = EXTERN };
+	allocb				{ FLAGS = EXTERN };
+	assfail				{ FLAGS = EXTERN };
+	assfail3			{ FLAGS = EXTERN };
+	atomic_dec_32_nv		{ FLAGS = EXTERN };
+	bcmp				{ FLAGS = EXTERN };
+	bcopy				{ FLAGS = EXTERN };
+	bzero				{ FLAGS = EXTERN };
+	cmn_err				{ FLAGS = EXTERN };
+	cv_broadcast			{ FLAGS = EXTERN };
+	cv_destroy			{ FLAGS = EXTERN };
+	cv_init				{ FLAGS = EXTERN };
+	cv_reltimedwait			{ FLAGS = EXTERN };
+	ddi_cb_register			{ FLAGS = EXTERN };
+	ddi_cb_unregister		{ FLAGS = EXTERN };
+	ddi_dev_regsize			{ FLAGS = EXTERN };
+	ddi_dma_addr_bind_handle	{ FLAGS = EXTERN };
+	ddi_dma_alloc_handle		{ FLAGS = EXTERN };
+	ddi_dma_free_handle		{ FLAGS = EXTERN };
+	ddi_dma_mem_alloc		{ FLAGS = EXTERN };
+	ddi_dma_mem_free		{ FLAGS = EXTERN };
+	ddi_dma_nextcookie		{ FLAGS = EXTERN };
+	ddi_dma_sync			{ FLAGS = EXTERN };
+	ddi_dma_unbind_handle		{ FLAGS = EXTERN };
+	ddi_fm_acc_err_clear		{ FLAGS = EXTERN };
+	ddi_fm_acc_err_get		{ FLAGS = EXTERN };
+	ddi_fm_dma_err_get		{ FLAGS = EXTERN };
+	ddi_fm_ereport_post		{ FLAGS = EXTERN };
+	ddi_fm_fini			{ FLAGS = EXTERN };
+	ddi_fm_handler_register		{ FLAGS = EXTERN };
+	ddi_fm_handler_unregister	{ FLAGS = EXTERN };
+	ddi_fm_init			{ FLAGS = EXTERN };
+	ddi_fm_service_impact		{ FLAGS = EXTERN };
+	ddi_get_driver_private		{ FLAGS = EXTERN };
+	ddi_get_instance		{ FLAGS = EXTERN };
+	ddi_get_lbolt			{ FLAGS = EXTERN };
+	ddi_get_lbolt64			{ FLAGS = EXTERN };
+	ddi_get_name			{ FLAGS = EXTERN };
+	ddi_get_parent			{ FLAGS = EXTERN };
+	ddi_get16			{ FLAGS = EXTERN };
+	ddi_get32			{ FLAGS = EXTERN };
+	ddi_get64			{ FLAGS = EXTERN };
+	ddi_intr_add_handler		{ FLAGS = EXTERN };
+	ddi_intr_alloc			{ FLAGS = EXTERN };
+	ddi_intr_block_disable		{ FLAGS = EXTERN };
+	ddi_intr_block_enable		{ FLAGS = EXTERN };
+	ddi_intr_disable		{ FLAGS = EXTERN };
+	ddi_intr_enable			{ FLAGS = EXTERN };
+	ddi_intr_free			{ FLAGS = EXTERN };
+	ddi_intr_get_cap		{ FLAGS = EXTERN };
+	ddi_intr_get_navail		{ FLAGS = EXTERN };
+	ddi_intr_get_nintrs		{ FLAGS = EXTERN };
+	ddi_intr_get_pri		{ FLAGS = EXTERN };
+	ddi_intr_get_supported_types	{ FLAGS = EXTERN };
+	ddi_intr_remove_handler		{ FLAGS = EXTERN };
+	ddi_periodic_add		{ FLAGS = EXTERN };
+	ddi_periodic_delete		{ FLAGS = EXTERN };
+	ddi_power			{ FLAGS = EXTERN };
+	ddi_prop_free			{ FLAGS = EXTERN };
+	ddi_prop_get_int		{ FLAGS = EXTERN };
+	ddi_prop_lookup_int_array	{ FLAGS = EXTERN };
+	ddi_prop_op			{ FLAGS = EXTERN };
+	ddi_prop_remove_all		{ FLAGS = EXTERN };
+	ddi_prop_update_int_array	{ FLAGS = EXTERN };
+	ddi_prop_update_string		{ FLAGS = EXTERN };
+	ddi_ptob			{ FLAGS = EXTERN };
+	ddi_put16			{ FLAGS = EXTERN };
+	ddi_put32			{ FLAGS = EXTERN };
+	ddi_quiesce_not_supported	{ FLAGS = EXTERN };
+	ddi_regs_map_free		{ FLAGS = EXTERN };
+	ddi_regs_map_setup		{ FLAGS = EXTERN };
+	ddi_set_driver_private		{ FLAGS = EXTERN };
+	ddi_strtol			{ FLAGS = EXTERN };
+	ddi_taskq_create		{ FLAGS = EXTERN };
+	ddi_taskq_destroy		{ FLAGS = EXTERN };
+	ddi_taskq_dispatch		{ FLAGS = EXTERN };
+	delay				{ FLAGS = EXTERN };
+	desballoc			{ FLAGS = EXTERN };
+	dev_err				{ FLAGS = EXTERN };
+	drv_usectohz			{ FLAGS = EXTERN };
+	drv_usecwait			{ FLAGS = EXTERN };
+	fm_ena_generate			{ FLAGS = EXTERN };
+	freeb				{ FLAGS = EXTERN };
+	freemsg				{ FLAGS = EXTERN };
+	freemsgchain			{ FLAGS = EXTERN };
+	gethrtime			{ FLAGS = EXTERN };
+	kmem_alloc			{ FLAGS = EXTERN };
+	kmem_free			{ FLAGS = EXTERN };
+	kmem_zalloc			{ FLAGS = EXTERN };
+	kstat_create			{ FLAGS = EXTERN };
+	kstat_delete			{ FLAGS = EXTERN };
+	kstat_install			{ FLAGS = EXTERN };
+	kstat_named_init		{ FLAGS = EXTERN };
+	list_create			{ FLAGS = EXTERN };
+	list_destroy			{ FLAGS = EXTERN };
+	list_head			{ FLAGS = EXTERN };
+	list_insert_tail		{ FLAGS = EXTERN };
+	list_next			{ FLAGS = EXTERN };
+	list_remove			{ FLAGS = EXTERN };
+	memcpy				{ FLAGS = EXTERN };
+	memset				{ FLAGS = EXTERN };
+	miocack				{ FLAGS = EXTERN };
+	miocnak				{ FLAGS = EXTERN };
+	mod_driverops			{ FLAGS = EXTERN };
+	mod_info			{ FLAGS = EXTERN };
+	mod_install			{ FLAGS = EXTERN };
+	mod_remove			{ FLAGS = EXTERN };
+	msgpullup			{ FLAGS = EXTERN };
+	msgsize				{ FLAGS = EXTERN };
+	mutex_destroy			{ FLAGS = EXTERN };
+	mutex_enter			{ FLAGS = EXTERN };
+	mutex_exit			{ FLAGS = EXTERN };
+	mutex_init			{ FLAGS = EXTERN };
+	mutex_owned			{ FLAGS = EXTERN };
+	mutex_tryenter			{ FLAGS = EXTERN };
+	nochpoll			{ FLAGS = EXTERN };
+	nodev				{ FLAGS = EXTERN };
+	nulldev				{ FLAGS = EXTERN };
+	panic				{ FLAGS = EXTERN };
+	pci_config_get16		{ FLAGS = EXTERN };
+	pci_config_get32		{ FLAGS = EXTERN };
+	pci_config_get64		{ FLAGS = EXTERN };
+	pci_config_get8			{ FLAGS = EXTERN };
+	pci_config_put16		{ FLAGS = EXTERN };
+	pci_config_put32		{ FLAGS = EXTERN };
+	pci_config_put64		{ FLAGS = EXTERN };
+	pci_config_put8			{ FLAGS = EXTERN };
+	pci_config_setup		{ FLAGS = EXTERN };
+	pci_config_teardown		{ FLAGS = EXTERN };
+	pci_ereport_post		{ FLAGS = EXTERN };
+	pci_ereport_setup		{ FLAGS = EXTERN };
+	pci_ereport_teardown		{ FLAGS = EXTERN };
+	pci_lcap_locate			{ FLAGS = EXTERN };
+	qreply				{ FLAGS = EXTERN };
+	rw_destroy			{ FLAGS = EXTERN };
+	rw_enter			{ FLAGS = EXTERN };
+	rw_exit				{ FLAGS = EXTERN };
+	rw_init				{ FLAGS = EXTERN };
+	snprintf			{ FLAGS = EXTERN };
+	sprintf				{ FLAGS = EXTERN };
+	strcat				{ FLAGS = EXTERN };
+	strcmp				{ FLAGS = EXTERN };
+	strcpy				{ FLAGS = EXTERN };
+	strlen				{ FLAGS = EXTERN };
+	timeout				{ FLAGS = EXTERN };
+	untimeout			{ FLAGS = EXTERN };
+	vsnprintf			{ FLAGS = EXTERN };
+	vsprintf			{ FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/dtrace.mapfile.awk b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk
new file mode 100644
index 0000000000..b8a7e2d372
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk
@@ -0,0 +1,34 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# This script is designed to assemble a mapfile for DTrace probes.
+#
+BEGIN {
+	print "#"
+	print "# This file is autogenerated by dtrace.mapfile.awk"
+	print "#"
+	print "$mapfile_version 2"
+	print "SYMBOL_SCOPE {"
+	print "    global:"
+}
+
+/__dtrace_probe_/ {
+	printf "\t%s\t{ FLAGS = EXTERN };\n", $1
+}
+
+END {
+	print "};"
+}
diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile
new file mode 100644
index 0000000000..6bddb3c7ef
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/kernel.mapfile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING:  STOP NOW.  DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains functions provided by the kernel that various
+# modules use. This is a combination of things in both unix and genunix.
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+    global:
+	bt_getlowbit			{ FLAGS = EXTERN };
+	servicing_interrupt		{ FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/mac.mapfile b/usr/src/uts/common/mapfiles/mac.mapfile
new file mode 100644
index 0000000000..30462f80d5
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/mac.mapfile
@@ -0,0 +1,55 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING:  STOP NOW.  DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+    global:
+	mac_alloc				{ FLAGS = EXTERN };
+	mac_fini_ops				{ FLAGS = EXTERN };
+	mac_free				{ FLAGS = EXTERN };
+	mac_hcksum_get				{ FLAGS = EXTERN };
+	mac_hcksum_set				{ FLAGS = EXTERN };
+	mac_init_ops				{ FLAGS = EXTERN };
+	mac_link_update				{ FLAGS = EXTERN };
+	mac_lso_get				{ FLAGS = EXTERN };
+	mac_maxsdu_update			{ FLAGS = EXTERN };
+	mac_prop_info_set_default_link_flowctrl	{ FLAGS = EXTERN };
+	mac_prop_info_set_default_str		{ FLAGS = EXTERN };
+	mac_prop_info_set_default_uint8		{ FLAGS = EXTERN };
+	mac_prop_info_set_perm			{ FLAGS = EXTERN };
+	mac_prop_info_set_range_uint32		{ FLAGS = EXTERN };
+	mac_ring_intr_set			{ FLAGS = EXTERN };
+	mac_register				{ FLAGS = EXTERN };
+	mac_rx					{ FLAGS = EXTERN };
+	mac_rx_ring				{ FLAGS = EXTERN };
+	mac_tx_ring_update			{ FLAGS = EXTERN };
+	mac_tx_update				{ FLAGS = EXTERN };
+	mac_unregister				{ FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/mapfiles/random.mapfile b/usr/src/uts/common/mapfiles/random.mapfile
new file mode 100644
index 0000000000..d3d8bc89fa
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/random.mapfile
@@ -0,0 +1,37 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING:  STOP NOW.  DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+#	usr/src/uts/common/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_SCOPE {
+    global:
+	random_get_bytes			{ FLAGS = EXTERN };
+	random_get_blocking_bytes		{ FLAGS = EXTERN };
+	random_get_pseudo_bytes			{ FLAGS = EXTERN };
+};
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index d530b7f36e..7927cf5e24 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -3,6 +3,7 @@
  * Use is subject to license terms.
  *
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 /*
  * Copyright (c) 1982, 1986 Regents of the University of California.
@@ -225,6 +226,7 @@ typedef	uint16_t	sa_family_t;
 #define	IPPORT_SLP		427
 #define	IPPORT_MIP		434
 #define	IPPORT_SMB		445		/* a.k.a. microsoft-ds */
+#define	IPPORT_VXLAN		4789
 
 /*
  * Internet Key Exchange (IKE) ports
@@ -268,6 +270,11 @@ typedef	uint16_t	sa_family_t;
 #define	IPPORT_RESERVED		1024
 #define	IPPORT_USERRESERVED	5000
 
+#ifdef _KERNEL
+#define	IPPORT_DYNAMIC_MIN	49152
+#define	IPPORT_DYNAMIC_MAX	65535
+#endif
+
 /*
  * Link numbers
  */
diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h
index c65a9bad3a..74cff75d43 100644
--- a/usr/src/uts/common/netinet/udp.h
+++ b/usr/src/uts/common/netinet/udp.h
@@ -1,6 +1,7 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*
@@ -17,9 +18,6 @@
 #ifndef	_NETINET_UDP_H
 #define	_NETINET_UDP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-/* udp.h 1.7 88/08/19 SMI; from UCB 7.1 6/5/86	*/
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -36,6 +34,16 @@ struct udphdr {
 #define	UDP_EXCLBIND		0x0101		/* for internal use only */
 #define	UDP_RCVHDR		0x0102		/* for internal use only */
 #define	UDP_NAT_T_ENDPOINT	0x0103		/* for internal use only */
+#define	UDP_SRCPORT_HASH	0x0104		/* for internal use only */
+#define	UDP_SND_TO_CONNECTED	0x0105		/* for internal use only */
+
+/*
+ * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to
+ * handle UDP_SRCPORT_HASH.
+ */
+#define	UDP_HASH_DISABLE	0x0000		/* for internal use only */
+#define	UDP_HASH_VXLAN		0x0001		/* for internal use only */
+
 /*
  * Following option in UDP_ namespace required to be exposed through
  * <xti.h> (It also requires exposing options not implemented). The options
diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c
index 0af67f5d98..02901d023d 100644
--- a/usr/src/uts/common/os/brand.c
+++ b/usr/src/uts/common/os/brand.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/kmem.h>
@@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops  = {
 };
 #else /* !__sparcv9 */
 struct brand_mach_ops native_mach_ops  = {
-		NULL, NULL, NULL, NULL
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL
 };
 #endif /* !__sparcv9 */
 
@@ -53,7 +54,8 @@ brand_t native_brand = {
 		BRAND_VER_1,
 		"native",
 		NULL,
-		&native_mach_ops
+		&native_mach_ops,
+		0
 };
 
 /*
@@ -310,46 +312,112 @@ brand_unregister_zone(struct brand *bp)
 	mutex_exit(&brand_list_lock);
 }
 
-void
-brand_setbrand(proc_t *p)
+int
+brand_setbrand(proc_t *p, boolean_t lwps_ok)
 {
 	brand_t *bp = p->p_zone->zone_brand;
+	void *brand_data = NULL;
 
-	ASSERT(bp != NULL);
-	ASSERT(p->p_brand == &native_brand);
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+	VERIFY(bp != NULL);
 
 	/*
-	 * We should only be called from exec(), when we know the process
-	 * is single-threaded.
+	 * Process branding occurs during fork() and exec().  When it happens
+	 * during fork(), the LWP count will always be 0 since branding is
+	 * performed as part of getproc(), before LWPs have been associated.
+	 * The same is not true during exec(), where a multi-LWP process may
+	 * undergo branding just prior to gexec(). This is to ensure
+	 * exec-related brand hooks are available.  While it may seem
+	 * complicated to brand a multi-LWP process, the two possible outcomes
+	 * simplify things:
+	 *
+	 * 1. The exec() succeeds:  LWPs besides the caller will be killed and
+	 *    any further branding will occur in a single-LWP context.
+	 * 2. The exec() fails: The process will be promptly unbranded since
+	 *    the hooks are no longer needed.
+	 *
+	 * To prevent inconsistent brand state from being encountered during
+	 * the exec(), LWPs beyond the caller which are associated with this
+	 * process must be held temporarily.  They will be released either when
+	 * they are killed in the exec() success, or when the brand is cleared
+	 * after exec() failure.
 	 */
-	ASSERT(p->p_tlist == p->p_tlist->t_forw);
+	if (lwps_ok) {
+		/*
+		 * We've been called from an exec() context, where tolerating
+		 * the existence of multiple LWPs during branding is necessary.
+		 */
+		VERIFY(p == curproc);
+		VERIFY(p->p_tlist != NULL);
 
+		if (p->p_tlist != p->p_tlist->t_forw) {
+			/*
+			 * Multiple LWPs are present.  Hold all but the caller.
+			 */
+			if (!holdlwps(SHOLDFORK1)) {
+				return (-1);
+			}
+		}
+	} else {
+		/*
+		 * Processes branded during fork() should not have LWPs at all.
+		 */
+		VERIFY(p->p_tlist == NULL);
+	}
+
+	if (bp->b_data_size > 0) {
+		brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP);
+	}
+
+	mutex_enter(&p->p_lock);
+	ASSERT(!PROC_IS_BRANDED(p));
 	p->p_brand = bp;
+	p->p_brand_data = brand_data;
 	ASSERT(PROC_IS_BRANDED(p));
 	BROP(p)->b_setbrand(p);
+	mutex_exit(&p->p_lock);
+	return (0);
 }
 
 void
-brand_clearbrand(proc_t *p, boolean_t no_lwps)
+brand_clearbrand(proc_t *p, boolean_t lwps_ok)
 {
 	brand_t *bp = p->p_zone->zone_brand;
-	klwp_t *lwp = NULL;
-	ASSERT(bp != NULL);
-	ASSERT(!no_lwps || (p->p_tlist == NULL));
+	void *brand_data;
 
-	/*
-	 * If called from exec_common() or proc_exit(),
-	 * we know the process is single-threaded.
-	 * If called from fork_fail, p_tlist is NULL.
-	 */
-	if (!no_lwps) {
-		ASSERT(p->p_tlist == p->p_tlist->t_forw);
-		lwp = p->p_tlist->t_lwp;
-	}
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+	VERIFY(bp != NULL);
+	VERIFY(PROC_IS_BRANDED(p));
 
-	ASSERT(PROC_IS_BRANDED(p));
-	BROP(p)->b_proc_exit(p, lwp);
+	mutex_enter(&p->p_lock);
 	p->p_brand = &native_brand;
+	brand_data = p->p_brand_data;
+	p->p_brand_data = NULL;
+
+	if (lwps_ok) {
+		VERIFY(p == curproc);
+		/*
+		 * A process with multiple LWPs is being de-branded after
+		 * failing an exec.  The other LWPs were held as part of the
+		 * procedure, so they must be resumed now.
+		 */
+		if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) {
+			continuelwps(p);
+		}
+	} else {
+		/*
+		 * While clearing the brand, it's ok for one LWP to be present.
+		 * This happens when a native binary is executed inside a
+		 * branded zone, since the brand will be removed during the
+		 * course of a successful exec.
+		 */
+		VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw);
+	}
+	mutex_exit(&p->p_lock);
+
+	if (brand_data != NULL) {
+		kmem_free(brand_data, bp->b_data_size);
+	}
 }
 
 #if defined(__sparcv9)
@@ -483,7 +551,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
 		return (ENOSYS);
 
 	/* For all other operations this must be a branded process. */
-	if (p->p_brand == &native_brand)
+	if (!PROC_IS_BRANDED(p))
 		return (ENOSYS);
 
 	ASSERT(p->p_brand == pbrand);
@@ -601,15 +669,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp)
 int
 brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
     intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file,
-    cred_t *cred, int brand_action, struct brand *pbrand, char *bname,
-    char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32)
+    cred_t *cred, int *brand_action, struct brand *pbrand, char *bname,
+    char *brandlib, char *brandlib32)
 {
 
 	vnode_t		*nvp;
 	Ehdr		ehdr;
 	Addr		uphdr_vaddr;
 	intptr_t	voffset;
-	int		interp;
+	char		*interp;
 	int		i, err;
 	struct execenv	env;
 	struct execenv	origenv;
@@ -619,7 +687,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 	klwp_t		*lwp = ttolwp(curthread);
 	brand_proc_data_t	*spd;
 	brand_elf_data_t sed, *sedp;
-	char		*linker;
 	uintptr_t	lddata; /* lddata of executable's linker */
 
 	ASSERT(curproc->p_brand == pbrand);
@@ -636,12 +703,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 	 */
 	if (args->to_model == DATAMODEL_NATIVE) {
 		args->emulator = brandlib;
-		linker = brandlinker;
 	}
 #if defined(_LP64)
 	else {
 		args->emulator = brandlib32;
-		linker = brandlinker32;
 	}
 #endif  /* _LP64 */
 
@@ -725,7 +790,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 	if (args->to_model == DATAMODEL_NATIVE) {
 		err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
 		    &voffset, exec_file, &interp, &env.ex_bssbase,
-		    &env.ex_brkbase, &env.ex_brksize, NULL);
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
 	}
 #if defined(_LP64)
 	else {
@@ -733,7 +798,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 		Elf32_Addr uphdr_vaddr32;
 		err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
 		    &voffset, exec_file, &interp, &env.ex_bssbase,
-		    &env.ex_brkbase, &env.ex_brksize, NULL);
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
 		Ehdr32to64(&ehdr32, &ehdr);
 
 		if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -744,6 +809,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 #endif  /* _LP64 */
 	if (err != 0) {
 		restoreexecenv(&origenv, &orig_sigaltstack);
+
+		if (interp != NULL)
+			kmem_free(interp, MAXPATHLEN);
+
 		return (err);
 	}
 
@@ -761,7 +830,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 	sedp->sed_phent = ehdr.e_phentsize;
 	sedp->sed_phnum = ehdr.e_phnum;
 
-	if (interp) {
+	if (interp != NULL) {
 		if (ehdr.e_type == ET_DYN) {
 			/*
 			 * This is a shared object executable, so we
@@ -777,16 +846,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 		 * it in and store relevant information about it in the
 		 * aux vector, where the brand library can find it.
 		 */
-		if ((err = lookupname(linker, UIO_SYSSPACE,
+		if ((err = lookupname(interp, UIO_SYSSPACE,
 		    FOLLOW, NULLVPP, &nvp)) != 0) {
-			uprintf("%s: not found.", brandlinker);
+			uprintf("%s: not found.", interp);
 			restoreexecenv(&origenv, &orig_sigaltstack);
+			kmem_free(interp, MAXPATHLEN);
 			return (err);
 		}
+
+		kmem_free(interp, MAXPATHLEN);
+
 		if (args->to_model == DATAMODEL_NATIVE) {
 			err = mapexec_brand(nvp, args, &ehdr,
 			    &uphdr_vaddr, &voffset, exec_file, &interp,
-			    NULL, NULL, NULL, &lddata);
+			    NULL, NULL, NULL, &lddata, NULL);
 		}
 #if defined(_LP64)
 		else {
@@ -794,7 +867,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 			Elf32_Addr uphdr_vaddr32;
 			err = mapexec32_brand(nvp, args, &ehdr32,
 			    &uphdr_vaddr32, &voffset, exec_file, &interp,
-			    NULL, NULL, NULL, &lddata);
+			    NULL, NULL, NULL, &lddata, NULL);
 			Ehdr32to64(&ehdr32, &ehdr);
 
 			if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -934,9 +1007,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
 
 	/*
 	 * Third, the /proc aux vectors set up by elfexec() point to
-	 * brand emulation library and it's linker.  Copy these to the
+	 * brand emulation library and its linker.  Copy these to the
 	 * /proc brand specific aux vector, and update the regular
-	 * /proc aux vectors to point to the executable (and it's
+	 * /proc aux vectors to point to the executable (and its
 	 * linker).  This will enable debuggers to access the
 	 * executable via the usual /proc or elf notes aux vectors.
 	 *
@@ -1078,55 +1151,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)
 }
 
 /*ARGSUSED*/
-int
+void
 brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)
 {
 	ASSERT(l->lwp_procp->p_brand == pbrand);
 	ASSERT(l->lwp_procp->p_brand_data != NULL);
 	ASSERT(l->lwp_brand == NULL);
 	l->lwp_brand = (void *)-1;
-	return (0);
 }
 
 /*ARGSUSED*/
 void
 brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)
 {
-	proc_t  *p = l->lwp_procp;
-
 	ASSERT(l->lwp_procp->p_brand == pbrand);
 	ASSERT(l->lwp_procp->p_brand_data != NULL);
 	ASSERT(l->lwp_brand != NULL);
-
-	/*
-	 * We should never be called for the last thread in a process.
-	 * (That case is handled by brand_solaris_proc_exit().)
-	 * Therefore this lwp must be exiting from a multi-threaded
-	 * process.
-	 */
-	ASSERT(p->p_tlist != p->p_tlist->t_forw);
-
-	l->lwp_brand = NULL;
 }
 
 /*ARGSUSED*/
 void
-brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand)
+brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)
 {
 	ASSERT(p->p_brand == pbrand);
 	ASSERT(p->p_brand_data != NULL);
 
-	/*
-	 * When called from proc_exit(), we know that process is
-	 * single-threaded and free our lwp brand data.
-	 * otherwise just free p_brand_data and return.
-	 */
-	if (l != NULL) {
-		ASSERT(p->p_tlist == p->p_tlist->t_forw);
-		ASSERT(p->p_tlist->t_lwp == l);
-		(void) brand_solaris_freelwp(l, pbrand);
-	}
-
 	/* upon exit, free our proc brand data */
 	kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));
 	p->p_brand_data = NULL;
@@ -1145,5 +1194,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)
 	ASSERT(p->p_tlist == p->p_tlist->t_forw);
 
 	p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP);
-	(void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);
 }
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index 805813037d..1280c8a1b6 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
  */
 
 #include <sys/timer.h>
@@ -41,6 +41,9 @@
 
 static clock_backend_t clock_highres;
 
+/* minimum non-privileged interval (200us) */
+long clock_highres_interval_min = 200000;
+
 /*ARGSUSED*/
 static int
 clock_highres_settime(timespec_t *ts)
@@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)
 static int
 clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))
 {
-	/*
-	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny
-	 * service; only allow privileged users to create such timers.
-	 * Sites that do not wish to have this restriction should
-	 * give users the "proc_clock_highres" privilege.
-	 */
-	if (secpolicy_clock_highres(CRED()) != 0) {
-		it->it_arg = NULL;
-		return (EPERM);
-	}
-
 	it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);
 	it->it_fire = fire;
 
@@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,
 	cpu_t *cpu;
 	cpupart_t *cpupart;
 	int pset;
+	boolean_t value_need_clamp = B_FALSE;
+	boolean_t intval_need_clamp = B_FALSE;
+	cred_t *cr = CRED();
+	struct itimerspec clamped;
+
+	/*
+	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny
+	 * service; only allow privileged users to create such timers.
+	 * Non-privileged users (those without the "proc_clock_highres"
+	 * privilege) can create timers with lower resolution but if they
+	 * attempt to use a very low time value (< 200us) then their
+	 * timer will be clamped at 200us.
+	 */
+	if (when->it_value.tv_sec == 0 &&
+	    when->it_value.tv_nsec > 0 &&
+	    when->it_value.tv_nsec < clock_highres_interval_min)
+		value_need_clamp = B_TRUE;
+
+	if (when->it_interval.tv_sec == 0 &&
+	    when->it_interval.tv_nsec > 0 &&
+	    when->it_interval.tv_nsec < clock_highres_interval_min)
+		intval_need_clamp = B_TRUE;
+
+	if ((value_need_clamp || intval_need_clamp) &&
+	    secpolicy_clock_highres(cr) != 0) {
+		clamped.it_value.tv_sec = when->it_value.tv_sec;
+		clamped.it_interval.tv_sec = when->it_interval.tv_sec;
+
+		if (value_need_clamp) {
+			clamped.it_value.tv_nsec = clock_highres_interval_min;
+		} else {
+			clamped.it_value.tv_nsec = when->it_value.tv_nsec;
+		}
+
+		if (intval_need_clamp) {
+			clamped.it_interval.tv_nsec =
+			    clock_highres_interval_min;
+		} else {
+			clamped.it_interval.tv_nsec = when->it_interval.tv_nsec;
+		}
+
+		when = &clamped;
+	}
 
 	cyctime.cyt_when = ts2hrt(&when->it_value);
 	cyctime.cyt_interval = ts2hrt(&when->it_interval);
diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c
index 249066674e..9ea08f5535 100644
--- a/usr/src/uts/common/os/contract.c
+++ b/usr/src/uts/common/os/contract.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -287,7 +288,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
 	avl_index_t where;
 	klwp_t *curlwp = ttolwp(curthread);
 
-	ASSERT(author == curproc);
+	/*
+	 * It's possible that author is not curproc if the zone is creating
+	 * a new process as a child of zsched.
+	 */
 
 	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 733fd03a92..b0098946b3 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -727,6 +727,14 @@ crgetzoneid(const cred_t *cr)
 	    cr->cr_zone->zone_id);
 }
 
+zoneid_t
+crgetzonedid(const cred_t *cr)
+{
+	return (cr->cr_zone == NULL ?
+	    (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) :
+	    cr->cr_zone->zone_did);
+}
+
 projid_t
 crgetprojid(const cred_t *cr)
 {
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index c3c0481e7f..a4b35dcb5b 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
 
 	/* Log callback errors */
 	if (ret != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
+		cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",
 		    ddi_driver_name(req_p->ireq_dip),
 		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);
 	}
diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c
index 781c564233..38d5f1ab18 100644
--- a/usr/src/uts/common/os/dumpsubr.c
+++ b/usr/src/uts/common/os/dumpsubr.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -342,6 +343,7 @@ typedef struct dumpsync {
 	uint_t neednl;			/* will need to print a newline */
 	uint_t percent;			/* dump progress */
 	uint_t percent_done;		/* dump progress reported */
+	int sec_done;			/* dump progress last report time */
 	cqueue_t freebufq;		/* free kmem bufs for writing */
 	cqueue_t mainq;			/* input for main task */
 	cqueue_t helperq;		/* input for helpers */
@@ -2285,7 +2287,7 @@ dumpsys_main_task(void *arg)
 	cbuf_t *cp;
 	pgcnt_t baseoff, pfnoff;
 	pfn_t base, pfn;
-	int sec, i, dumpserial;
+	int i, dumpserial;
 
 	/*
 	 * Fall back to serial mode if there are no helpers.
@@ -2311,13 +2313,20 @@ dumpsys_main_task(void *arg)
 
 	dump_init_memlist_walker(&mlw);
 
-	/* CONSTCOND */
-	while (1) {
+	for (;;) {
+		int sec = (gethrtime() - ds->start) / NANOSEC;
 
-		if (ds->percent > ds->percent_done) {
+		/*
+		 * Render a simple progress display on the system console to
+		 * make clear to the operator that the system has not hung.
+		 * Emit an update when dump progress has advanced by one
+		 * percent, or when no update has been drawn in the last
+		 * second.
+		 */
+		if (ds->percent > ds->percent_done || sec > ds->sec_done) {
+			ds->sec_done = sec;
 			ds->percent_done = ds->percent;
-			sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000;
-			uprintf("^\r%2d:%02d %3d%% done",
+			uprintf("^\rdumping: %2d:%02d %3d%% done",
 			    sec / 60, sec % 60, ds->percent);
 			ds->neednl = 1;
 		}
@@ -2501,8 +2510,7 @@ dumpsys_main_task(void *arg)
 			break;
 
 		} /* end switch */
-
-	} /* end while(1) */
+	}
 }
 
 #ifdef	COLLECT_METRICS
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index 172fce8d89..d46b8538a9 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -26,7 +26,7 @@
 /*	Copyright (c) 1988 AT&T	*/
 /*	  All Rights Reserved  	*/
 /*
- * Copyright 2014, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/types.h>
@@ -69,6 +69,7 @@
 #include <sys/sdt.h>
 #include <sys/brand.h>
 #include <sys/klpd.h>
+#include <sys/random.h>
 
 #include <c2/audit.h>
 
@@ -97,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */
 #endif
 
 #define	PSUIDFLAGS		(SNOCD|SUGID)
+#define	RANDOM_LEN	16	/* 16 bytes for AT_RANDOM aux entry */
 
 /*
  * exece() - system call wrapper around exec_common()
@@ -297,14 +299,43 @@ exec_common(const char *fname, const char **argp, const char **envp,
 	ua.argp = argp;
 	ua.envp = envp;
 
-	/* If necessary, brand this process before we start the exec. */
-	if (brandme)
-		brand_setbrand(p);
+	/* If necessary, brand this process/lwp before we start the exec. */
+	if (brandme) {
+		void *brand_data = NULL;
+
+		/*
+		 * Process branding may fail if multiple LWPs are present and
+		 * holdlwps() cannot complete successfully.
+		 */
+		error = brand_setbrand(p, B_TRUE);
+
+		if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) {
+			brand_data = BROP(p)->b_lwpdata_alloc(p);
+			if (brand_data == NULL) {
+				error = 1;
+			}
+		}
+
+		if (error == 0) {
+			mutex_enter(&p->p_lock);
+			BROP(p)->b_initlwp(lwp, brand_data);
+			mutex_exit(&p->p_lock);
+		} else {
+			VN_RELE(vp);
+			if (dir != NULL) {
+				VN_RELE(dir);
+			}
+			pn_free(&resolvepn);
+			goto fail;
+		}
+	}
 
 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
-	    exec_file, p->p_cred, brand_action)) != 0) {
-		if (brandme)
-			brand_clearbrand(p, B_FALSE);
+	    exec_file, p->p_cred, &brand_action)) != 0) {
+		if (brandme) {
+			BROP(p)->b_freelwp(lwp);
+			brand_clearbrand(p, B_TRUE);
+		}
 		VN_RELE(vp);
 		if (dir != NULL)
 			VN_RELE(dir);
@@ -336,7 +367,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
 	/*
 	 * Clear contract template state
 	 */
-	lwp_ctmpl_clear(lwp);
+	lwp_ctmpl_clear(lwp, B_TRUE);
 
 	/*
 	 * Save the directory in which we found the executable for expanding
@@ -360,6 +391,8 @@ exec_common(const char *fname, const char **argp, const char **envp,
 	 * pending held signals remain held, so don't clear t_hold.
 	 */
 	mutex_enter(&p->p_lock);
+	DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+	    uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
 	lwp->lwp_oldcontext = 0;
 	lwp->lwp_ustack = 0;
 	lwp->lwp_old_stk_ctl = 0;
@@ -419,8 +452,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
 
 	/* Unbrand ourself if necessary. */
-	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
+	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) {
+		BROP(p)->b_freelwp(lwp);
 		brand_clearbrand(p, B_FALSE);
+	}
 
 	setregs(&args);
 
@@ -544,7 +579,7 @@ gexec(
 	long *execsz,
 	caddr_t exec_file,
 	struct cred *cred,
-	int brand_action)
+	int *brand_action)
 {
 	struct vnode *vp, *execvp = NULL;
 	proc_t *pp = ttoproc(curthread);
@@ -858,8 +893,14 @@ gexec(
 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
 				args->traceinval = 1;
 		}
-		if (pp->p_proc_flag & P_PR_PTRACE)
+
+		/*
+		 * If legacy ptrace is enabled, generate the SIGTRAP.
+		 */
+		if (pp->p_proc_flag & P_PR_PTRACE) {
 			psignal(pp, SIGTRAP);
+		}
+
 		if (args->traceinval)
 			prinvalidate(&pp->p_user);
 	}
@@ -1517,6 +1558,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
 	return (0);
 }
 
+/*
+ * Add a fixed size byte array to the stack (only from kernel space).
+ */
+static int
+stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len)
+{
+	/* Record where this array starts, as an offset from stk_base. */
+	if (STK_AVAIL(args) < sizeof (int))
+		return (E2BIG);
+	*--args->stk_offp = args->stk_strp - args->stk_base;
+
+	/* Copy exactly len raw bytes; nothing is appended after them. */
+	if (len > STK_AVAIL(args))
+		return (E2BIG);
+	bcopy(sp, args->stk_strp, len);
+
+	args->stk_strp += len;
+
+	return (0);
+}
+
 static int
 stk_getptr(uarg_t *args, char *src, char **dst)
 {
@@ -1553,6 +1615,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
 	size_t size, pad;
 	char *argv = (char *)uap->argp;
 	char *envp = (char *)uap->envp;
+	uint8_t rdata[RANDOM_LEN];
 
 	/*
 	 * Copy interpreter's name and argument to argv[0] and argv[1].
@@ -1635,8 +1698,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
 	args->ne = args->na - argc;
 
 	/*
-	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
-	 * AT_SUN_EMULATOR strings to the stack.
+	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
+	 * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as the
+	 * AT_RANDOM array, to the stack.
 	 */
 	if (auxvpp != NULL && *auxvpp != NULL) {
 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
@@ -1649,6 +1713,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
 		if (args->emulator != NULL &&
 		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
 			return (error);
+
+		/*
+		 * For the AT_RANDOM aux vector we provide 16 bytes of random
+		 * data.
+		 */
+		(void) random_get_pseudo_bytes(rdata, sizeof (rdata));
+
+		if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0)
+			return (error);
+
+		if (args->brand_nroot != NULL &&
+		    (error = stk_add(args, args->brand_nroot,
+		    UIO_SYSSPACE)) != 0)
+			return (error);
 	}
 
 	/*
@@ -1755,7 +1833,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
 	/*
 	 * Fill in the aux vector now that we know the user stack addresses
 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
-	 * AT_SUN_EMULATOR strings.
+	 * AT_SUN_EMULATOR strings, plus AT_RANDOM and AT_SUN_BRAND_NROOT.
 	 */
 	if (auxvpp != NULL && *auxvpp != NULL) {
 		if (args->to_model == DATAMODEL_NATIVE) {
@@ -1768,6 +1846,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
 			if (args->emulator != NULL)
 				ADDAUX(*a,
 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
+			ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp])
+			if (args->brand_nroot != NULL) {
+				ADDAUX(*a,
+				    AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp])
+			}
 		} else {
 			auxv32_t **a = (auxv32_t **)auxvpp;
 			ADDAUX(*a,
@@ -1780,6 +1863,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
 			if (args->emulator != NULL)
 				ADDAUX(*a, AT_SUN_EMULATOR,
 				    (int)(uintptr_t)&ustrp[*--offp])
+			ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp])
+			if (args->brand_nroot != NULL) {
+				ADDAUX(*a, AT_SUN_BRAND_NROOT,
+				    (int)(uintptr_t)&ustrp[*--offp])
+			}
 		}
 	}
 
@@ -1868,6 +1956,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
 		usrstack = (char *)USRSTACK32;
 	}
 
+	if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack)
+		usrstack = (char *)args->maxstack;
+
 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
 
 #if defined(__sparc)
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index f0c0983a3a..0e213deb21 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -230,7 +230,7 @@ restart_init(int what, int why)
 		siginfofree(lwp->lwp_curinfo);
 		lwp->lwp_curinfo = NULL;
 	}
-	lwp_ctmpl_clear(lwp);
+	lwp_ctmpl_clear(lwp, B_FALSE);
 
 	/*
 	 * Reset both the process root directory and the current working
@@ -366,19 +366,6 @@ proc_exit(int why, int what)
 	}
 	mutex_exit(&p->p_lock);
 
-	DTRACE_PROC(lwp__exit);
-	DTRACE_PROC1(exit, int, why);
-
-	/*
-	 * Will perform any brand specific proc exit processing, since this
-	 * is always the last lwp, will also perform lwp_exit and free brand
-	 * data
-	 */
-	if (PROC_IS_BRANDED(p)) {
-		lwp_detach_brand_hdlrs(lwp);
-		brand_clearbrand(p, B_FALSE);
-	}
-
 	/*
 	 * Don't let init exit unless zone_start_init() failed its exec, or
 	 * we are shutting down the zone or the machine.
@@ -390,12 +377,35 @@ proc_exit(int why, int what)
 		if (z->zone_boot_err == 0 &&
 		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
 		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
-			if (z->zone_restart_init == B_TRUE) {
-				if (restart_init(what, why) == 0)
-					return (0);
+
+			/*
+			 * If the init process should be restarted, the
+			 * "zone_restart_init" member will be set.  Some init
+			 * programs in branded zones do not tolerate a restart
+			 * in the traditional manner; setting the
+			 * "zone_reboot_on_init_exit" member will cause the
+			 * entire zone to be rebooted instead.  If neither of
+			 * these flags is set the zone will shut down.
+			 */
+			if (z->zone_reboot_on_init_exit == B_TRUE &&
+			    z->zone_restart_init == B_TRUE) {
+				/*
+				 * Trigger a zone reboot and continue
+				 * with exit processing.
+				 */
+				z->zone_init_status = wstat(why, what);
+				(void) zone_kadmin(A_REBOOT, 0, NULL,
+				    zone_kcred());
+
 			} else {
+				if (z->zone_restart_init == B_TRUE) {
+					if (restart_init(what, why) == 0)
+						return (0);
+				}
+
+				z->zone_init_status = wstat(why, what);
 				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
-				    CRED());
+				    zone_kcred());
 			}
 		}
 
@@ -407,6 +417,32 @@ proc_exit(int why, int what)
 		z->zone_proc_initpid = -1;
 	}
 
+	/*
+	 * Delay firing probes (and performing brand cleanup) until after the
+	 * zone_proc_initpid check. Cases which result in zone shutdown or
+	 * restart via zone_kadmin eventually result in a call back to
+	 * proc_exit.
+	 */
+	DTRACE_PROC(lwp__exit);
+	DTRACE_PROC1(exit, int, why);
+
+	/*
+	 * Will perform any brand specific proc exit processing. Since this
+	 * is always the last lwp, will also perform lwp exit/free and proc
+	 * exit. Brand data will be freed when the process is reaped.
+	 */
+	if (PROC_IS_BRANDED(p)) {
+		BROP(p)->b_lwpexit(lwp);
+		BROP(p)->b_proc_exit(p);
+		/*
+		 * To ensure that b_proc_exit has access to brand-specific data
+		 * contained by the one remaining lwp, call the freelwp hook as
+		 * the last part of this clean-up process.
+		 */
+		BROP(p)->b_freelwp(lwp);
+		lwp_detach_brand_hdlrs(lwp);
+	}
+
 	lwp_pcb_exit();
 
 	/*
@@ -658,10 +694,22 @@ proc_exit(int why, int what)
 	if ((q = p->p_child) != NULL && p != proc_init) {
 		struct proc	*np;
 		struct proc	*initp = proc_init;
+		pid_t		zone_initpid = 1;
+		struct proc	*zoneinitp = NULL;
 		boolean_t	setzonetop = B_FALSE;
 
-		if (!INGLOBALZONE(curproc))
-			setzonetop = B_TRUE;
+		if (!INGLOBALZONE(curproc)) {
+			zone_initpid = curproc->p_zone->zone_proc_initpid;
+
+			ASSERT(MUTEX_HELD(&pidlock));
+			zoneinitp = prfind(zone_initpid);
+			if (zoneinitp != NULL) {
+				initp = zoneinitp;
+			} else {
+				zone_initpid = 1;
+				setzonetop = B_TRUE;
+			}
+		}
 
 		pgdetach(p);
 
@@ -673,7 +721,8 @@ proc_exit(int why, int what)
 			 */
 			delete_ns(q->p_parent, q);
 
-			q->p_ppid = 1;
+			q->p_ppid = zone_initpid;
+
 			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
 			if (setzonetop) {
 				mutex_enter(&q->p_lock);
@@ -847,8 +896,50 @@ proc_exit(int why, int what)
 
 	mutex_exit(&p->p_lock);
 	if (!evaporate) {
-		p->p_pidflag &= ~CLDPEND;
-		sigcld(p, sqp);
+		/*
+		 * The brand specific code only happens when the brand has a
+		 * function to call in place of sigcld and the parent of the
+		 * exiting process is not the global zone init. If the parent
+		 * is the global zone init, then the process was reparented,
+		 * and we don't want brand code delivering possibly strange
+		 * signals to init. Also, init is not branded, so any brand
+		 * specific exit data will not be picked up by init anyway.
+		 */
+		if (PROC_IS_BRANDED(p) &&
+		    BROP(p)->b_exit_with_sig != NULL &&
+		    p->p_ppid != 1) {
+			/*
+			 * The code for _fini that could unload the brand_t
+			 * blocks until the count of zones using the module
+			 * reaches zero. Zones decrement the refcount on their
+			 * brands only after all user tasks in that zone have
+			 * exited and been waited on. The decrement on the
+			 * brand's refcount happens in zone_destroy(). That
+			 * depends on zone_shutdown() having been completed.
+			 * zone_shutdown() includes a call to zone_empty(),
+			 * where the zone waits for itself to reach the state
+			 * ZONE_IS_EMPTY. This state is only set in either
+			 * zone_shutdown(), when there are no user processes as
+			 * the zone enters this function, or in
+			 * zone_task_rele(). zone_task_rele() is called from
+			 * code triggered by waiting on processes, not by the
+			 * processes exiting through proc_exit().  This means
+			 * all the branded processes that could exist for a
+			 * specific brand_t must exit and get reaped before the
+			 * refcount on the brand_t can reach 0. _fini will
+			 * never unload the corresponding brand module before
+			 * proc_exit finishes execution for all processes
+			 * branded with a particular brand_t, which makes the
+			 * operation below safe to do. Brands that wish to use
+			 * this mechanism must wait in _fini as described
+			 * above.
+			 */
+			BROP(p)->b_exit_with_sig(p, sqp);
+		} else {
+			p->p_pidflag &= ~CLDPEND;
+			sigcld(p, sqp);
+		}
+
 	} else {
 		/*
 		 * Do what sigcld() would do if the disposition
@@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
 int
 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 {
-	int found;
 	proc_t *cp, *pp;
-	int proc_gone;
 	int waitflag = !(options & WNOWAIT);
+	boolean_t have_brand_helper = B_FALSE;
 
 	/*
 	 * Obsolete flag, defined here only for binary compatibility
@@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 	pp = ttoproc(curthread);
 
 	/*
-	 * lock parent mutex so that sibling chain can be searched.
+	 * Anytime you are looking for a process, you take pidlock to prevent
+	 * things from changing as you look.
 	 */
 	mutex_enter(&pidlock);
 
@@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 		return (ECHILD);
 	}
 
-	while (pp->p_child != NULL) {
+	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
+		have_brand_helper = B_TRUE;
+	}
+
+	while (pp->p_child != NULL || have_brand_helper) {
+		boolean_t brand_wants_wait = B_FALSE;
+		int proc_gone = 0;
+		int found = 0;
 
-		proc_gone = 0;
+		/*
+		 * Give the brand a chance to return synthetic results from
+		 * this waitid() call before we do the real thing.
+		 */
+		if (have_brand_helper) {
+			int ret;
 
+			if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
+			    &brand_wants_wait, &ret) == 0) {
+				mutex_exit(&pidlock);
+				return (ret);
+			}
+
+			if (pp->p_child == NULL) {
+				goto no_real_children;
+			}
+		}
+
+		/*
+		 * Look for interesting children in the newstate list.
+		 */
+		VERIFY(pp->p_child != NULL);
 		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
 			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
 				continue;
@@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 				continue;
 			if (idtype == P_PGID && id != cp->p_pgrp)
 				continue;
+			if (PROC_IS_BRANDED(pp)) {
+				if (BROP(pp)->b_wait_filter != NULL &&
+				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+					continue;
+			}
 
 			switch (cp->p_wcode) {
 
@@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 		 * Wow! None of the threads on the p_sibling_ns list were
 		 * interesting threads. Check all the kids!
 		 */
-		found = 0;
 		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
 			if (idtype == P_PID && id != cp->p_pid)
 				continue;
 			if (idtype == P_PGID && id != cp->p_pgrp)
 				continue;
+			if (PROC_IS_BRANDED(pp)) {
+				if (BROP(pp)->b_wait_filter != NULL &&
+				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+					continue;
+			}
 
 			switch (cp->p_wcode) {
 			case CLD_TRAPPED:
@@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 				break;
 		}
 
+no_real_children:
 		/*
 		 * If we found no interesting processes at all,
 		 * break out and return ECHILD.
 		 */
-		if (found + proc_gone == 0)
+		if (!brand_wants_wait && (found + proc_gone == 0))
 			break;
 
 		if (options & WNOHANG) {
@@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 		 * change state while we wait, we don't wait at all.
 		 * Get out with ECHILD according to SVID.
 		 */
-		if (found == proc_gone)
+		if (!brand_wants_wait && (found == proc_gone))
 			break;
 
 		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
@@ -1226,6 +1354,12 @@ freeproc(proc_t *p)
 		p->p_killsqp = NULL;
 	}
 
+	/* Clear any remaining brand data */
+	if (PROC_IS_BRANDED(p)) {
+		brand_clearbrand(p, B_FALSE);
+	}
+
+
 	prfree(p);	/* inform /proc */
 
 	/*
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index 76eddd4e50..bfee77130d 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -852,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
 	 */
 	cfip->fi_nfiles = nfiles = flist_minsize(pfip);
 
-	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
+	cfip->fi_list = nfiles == 0 ? NULL :
+	    kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
 
 	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
 	    fd++, pufp++, cufp++) {
diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c
index fe3a362fa7..d5ba123894 100644
--- a/usr/src/uts/common/os/fork.c
+++ b/usr/src/uts/common/os/fork.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);
 static int getproc(proc_t **, pid_t, uint_t);
 #define	GETPROC_USER	0x0
 #define	GETPROC_KERNEL	0x1
+#define	GETPROC_ZSCHED	0x2
 
 static void fork_fail(proc_t *);
 static void forklwp_fail(proc_t *);
@@ -696,7 +697,7 @@ fork_fail(proc_t *cp)
 	if (PTOU(curproc)->u_cwd)
 		refstr_rele(PTOU(curproc)->u_cwd);
 	if (PROC_IS_BRANDED(cp)) {
-		brand_clearbrand(cp, B_TRUE);
+		brand_clearbrand(cp, B_FALSE);
 	}
 }
 
@@ -745,7 +746,7 @@ forklwp_fail(proc_t *p)
 			kmem_free(t->t_door, sizeof (door_data_t));
 			t->t_door = NULL;
 		}
-		lwp_ctmpl_clear(ttolwp(t));
+		lwp_ctmpl_clear(ttolwp(t), B_FALSE);
 
 		/*
 		 * Remove the thread from the all threads list.
@@ -782,6 +783,9 @@ extern struct as kas;
 
 /*
  * fork a kernel process.
+ *
+ * Passing a pid argument of -1 indicates that the new process should be
+ * launched as a child of 'zsched' within the zone.
  */
 int
 newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
@@ -800,6 +804,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 		rctl_set_t *init_set;
 
 		ASSERT(pid != 1);
+		ASSERT(pid >= 0);
 
 		if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 			return (EAGAIN);
@@ -843,8 +848,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 		rctl_set_t *init_set;
 		task_t *tk, *tk_old;
 		klwp_t *lwp;
+		boolean_t pzsched = B_FALSE;
+		int flag = GETPROC_USER;
+
+		/* Handle a new user-level process as child of zsched. */
+		if (pid < 0) {
+			VERIFY(curzone != global_zone);
+			flag = GETPROC_ZSCHED;
+			pzsched = B_TRUE;
+			pid = 0;
+		}
 
-		if (getproc(&p, pid, GETPROC_USER) < 0)
+		if (getproc(&p, pid, flag) < 0)
 			return (EAGAIN);
 		/*
 		 * init creates a new task, distinct from the task
@@ -902,7 +917,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 		}
 		t = lwptot(lwp);
 
-		ctp = contract_process_fork(sys_process_tmpl, p, curproc,
+		ctp = contract_process_fork(sys_process_tmpl, p,
+		    (pzsched ? curproc->p_zone->zone_zsched : curproc),
 		    B_FALSE);
 		ASSERT(ctp != NULL);
 		if (ct != NULL)
@@ -943,7 +959,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 		return (-1);	/* no point in starting new processes */
 
-	pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+	if (flags & GETPROC_ZSCHED) {
+		pp = curproc->p_zone->zone_zsched;
+	} else {
+		pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+	}
 	task = pp->p_task;
 	proj = task->tk_proj;
 	zone = pp->p_zone;
@@ -1004,6 +1024,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
 	cp->p_t1_lgrpid = LGRP_NONE;
 	cp->p_tr_lgrpid = LGRP_NONE;
 
+	/* Default to native brand initially */
+	cp->p_brand = &native_brand;
+
 	if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
 		if (nproc == v.v_proc) {
 			CPU_STATS_ADDQ(CPU, sys, procovf, 1);
@@ -1071,9 +1094,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
 	cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
 	cp->p_sessp = pp->p_sessp;
 	sess_hold(pp);
-	cp->p_brand = pp->p_brand;
-	if (PROC_IS_BRANDED(pp))
-		BROP(pp)->b_copy_procdata(cp, pp);
 	cp->p_bssbase = pp->p_bssbase;
 	cp->p_brkbase = pp->p_brkbase;
 	cp->p_brksize = pp->p_brksize;
@@ -1153,6 +1173,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
 	mutex_exit(&cp->p_lock);
 	mutex_exit(&pidlock);
 
+	if (PROC_IS_BRANDED(pp)) {
+		/*
+		 * The only reason why process branding should fail is when
+		 * the procedure is complicated by multiple LWPs on the scene.
+		 * With an LWP count of 0, this newly allocated process has no
+		 * reason to fail branding.
+		 */
+		VERIFY0(brand_setbrand(cp, B_FALSE));
+
+		BROP(pp)->b_copy_procdata(cp, pp);
+	}
+
 	avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
 	    offsetof(contract_t, ct_ctlist));
 
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c
index f5e92cfd94..0c4c0bcad6 100644
--- a/usr/src/uts/common/os/grow.c
+++ b/usr/src/uts/common/os/grow.c
@@ -19,7 +19,10 @@
  * CDDL HEADER END
  */
 
-/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */
+/*
+ * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
 
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
@@ -52,6 +55,7 @@
 #include <sys/fcntl.h>
 #include <sys/lwpchan_impl.h>
 #include <sys/nbmlock.h>
+#include <sys/brand.h>
 
 #include <vm/hat.h>
 #include <vm/as.h>
@@ -522,6 +526,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
 	return (0);
 }
 
+caddr_t
+map_userlimit(proc_t *pp, struct as *as, int flags)
+{
+	if (flags & _MAP_LOW32) {
+		if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
+			return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
+		} else {
+			return ((caddr_t)_userlimit32);
+		}
+	}
+
+	return (as->a_userlimit);
+}
+
 
 /*
  * Used for MAP_ANON - fast way to get anonymous pages
@@ -537,8 +555,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
 		return (EACCES);
 
 	if ((flags & MAP_FIXED) != 0) {
-		caddr_t userlimit;
-
 		/*
 		 * Use the user address.  First verify that
 		 * the address to be used is page aligned.
@@ -547,9 +563,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 			return (EINVAL);
 
-		userlimit = flags & _MAP_LOW32 ?
-		    (caddr_t)USERLIMIT32 : as->a_userlimit;
-		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+		switch (valid_usr_range(*addrp, len, uprot, as,
+		    map_userlimit(as->a_proc, as, flags))) {
 		case RANGE_OKAY:
 			break;
 		case RANGE_BADPROT:
@@ -717,8 +732,6 @@ smmap_common(caddr_t *addrp, size_t len,
 	 * If the user specified an address, do some simple checks here
 	 */
 	if ((flags & MAP_FIXED) != 0) {
-		caddr_t userlimit;
-
 		/*
 		 * Use the user address.  First verify that
 		 * the address to be used is page aligned.
@@ -726,10 +739,8 @@ smmap_common(caddr_t *addrp, size_t len,
 		 */
 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 			return (EINVAL);
-
-		userlimit = flags & _MAP_LOW32 ?
-		    (caddr_t)USERLIMIT32 : as->a_userlimit;
-		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+		switch (valid_usr_range(*addrp, len, uprot, as,
+		    map_userlimit(curproc, as, flags))) {
 		case RANGE_OKAY:
 			break;
 		case RANGE_BADPROT:
diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c
deleted file mode 100644
index 2dad0cb940..0000000000
--- a/usr/src/uts/common/os/id_space.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/id_space.h>
-#include <sys/debug.h>
-
-/*
- * ID Spaces
- *
- *   The id_space_t provides a simple implementation of a managed range of
- *   integer identifiers using a vmem arena.  An ID space guarantees that the
- *   next identifer returned by an allocation is larger than the previous one,
- *   unless there are no larger slots remaining in the range.  In this case,
- *   the ID space will return the first available slot in the lower part of the
- *   range (viewing the previous identifier as a partitioning element).  If no
- *   slots are available, id_alloc()/id_allocff() will sleep until an
- *   identifier becomes available.  Accordingly, id_space allocations must be
- *   initiated from contexts where sleeping is acceptable.  id_alloc_nosleep()/
- *   id_allocff_nosleep() will return -1 if no slots are available or if the
- *   system is low on memory.  If id_alloc_nosleep() fails, callers should
- *   not try to extend the ID space.  This is to avoid making a possible
- *   low-memory situation worse.
- *
- *   As an ID space is designed for representing a range of id_t's, there
- *   is a preexisting maximal range: [0, MAXUID].  ID space requests outside
- *   that range will fail on a DEBUG kernel.  The id_allocff*() functions
- *   return the first available id, and should be used when there is benefit
- *   to having a compact allocated range.
- *
- *   (Presently, the id_space_t abstraction supports only direct allocations; ID
- *   reservation, in which an ID is allocated but placed in a internal
- *   dictionary for later use, should be added when a consuming subsystem
- *   arrives.)
- */
-
-#define	ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1))
-#define	ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1))
-
-/*
- * Create an arena to represent the range [low, high).
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_space_t *
-id_space_create(const char *name, id_t low, id_t high)
-{
-	ASSERT(low >= 0);
-	ASSERT(low < high);
-
-	return (vmem_create(name, ID_TO_ADDR(low), high - low, 1,
-	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER));
-}
-
-/*
- * Destroy a previously created ID space.
- * No restrictions on caller's context.
- */
-void
-id_space_destroy(id_space_t *isp)
-{
-	vmem_destroy(isp);
-}
-
-void
-id_space_extend(id_space_t *isp, id_t low, id_t high)
-{
-	(void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP);
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_alloc(id_space_t *isp)
-{
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_alloc_nosleep(id_space_t *isp)
-{
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_allocff(id_space_t *isp)
-{
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_allocff_nosleep(id_space_t *isp)
-{
-	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate a specific identifier if possible, returning the id if
- * successful, or -1 on failure.
- */
-id_t
-id_alloc_specific_nosleep(id_space_t *isp, id_t id)
-{
-	void *minaddr = ID_TO_ADDR(id);
-	void *maxaddr = ID_TO_ADDR(id + 1);
-
-	/*
-	 * Note that even though we're vmem_free()ing this later, it
-	 * should be OK, since there's no quantum cache.
-	 */
-	return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0,
-	    minaddr, maxaddr, VM_NOSLEEP)));
-}
-
-/*
- * Free a previously allocated ID.
- * No restrictions on caller's context.
- */
-void
-id_free(id_space_t *isp, id_t id)
-{
-	vmem_free(isp, ID_TO_ADDR(id), 1);
-}
diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c
index 9381019cd1..6a6f5d84ef 100644
--- a/usr/src/uts/common/os/ipc.c
+++ b/usr/src/uts/common/os/ipc.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
@@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
 	    (IPC_ZONE_USAGE(perm, service) == 0)));
 }
 
+/*
+ * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID.
+ */
+void
+ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm)
+{
+	ASSERT(service->ipcs_count > 0);
+	ASSERT(MUTEX_HELD(&service->ipcs_lock));
+
+	ipc_remove(service, perm);
+	mutex_exit(&service->ipcs_lock);
+
+	/* perform any per-service removal actions */
+	service->ipcs_rmid(perm);
+
+	ipc_rele(service, perm);
+}
 
 /*
  * Common code to perform an IPC_RMID.  Returns an errno value on
@@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
 	/*
 	 * Nothing can fail from this point on.
 	 */
-	ipc_remove(service, perm);
-	mutex_exit(&service->ipcs_lock);
-
-	/* perform any per-service removal actions */
-	service->ipcs_rmid(perm);
-
-	ipc_rele(service, perm);
+	ipc_rmsvc(service, perm);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index cc53c2fb76..734fa910e4 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -158,10 +159,22 @@
  *               find known objects and is about to free it, or
  *            c) the client has freed the object.
  *            In all these cases (a, b, and c) kmem frees the new object (the
- *            unused copy destination) and searches for the old object in the
- *            magazine layer. If found, the object is removed from the magazine
- *            layer and freed to the slab layer so it will no longer hold the
- *            slab hostage.
+ *            unused copy destination).  In the first case, the object is in
+ *            use and the correct action is that for LATER; in the latter two
+ *            cases, we know that the object is either freed or about to be
+ *            freed, in which case it is either already in a magazine or about
+ *            to be in one.  In these cases, we know that the object will either
+ *            be reallocated and reused, or it will end up in a full magazine
+ *            that will be reaped (thereby liberating the slab).  Because it
+ *            is prohibitively expensive to differentiate these cases, and
+ *            because the defrag code is executed when we're low on memory
+ *            (thereby biasing the system to reclaim full magazines) we treat
+ *            all DONT_KNOW cases as LATER and rely on cache reaping to
+ *            generally clean up full magazines.  While we take the same action
+ *            for these cases, we maintain their semantic distinction:  if
+ *            defragmentation is not occurring, it is useful to know if this
+ *            is due to objects in use (LATER) or objects in an unknown state
+ *            of transition (DONT_KNOW).
  *
  * 2.3 Object States
  *
@@ -284,10 +297,10 @@
  * view of the slab layer, making it a candidate for the move callback. Most
  * objects unrecognized by the client in the move callback fall into this
  * category and are cheaply distinguished from known objects by the test
- * described earlier. Since recognition is cheap for the client, and searching
- * magazines is expensive for kmem, kmem defers searching until the client first
- * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
- * elsewhere does what it can to avoid bothering the client unnecessarily.
+ * described earlier. Because searching magazines is prohibitively expensive
+ * for kmem, clients that do not mark freed objects (and therefore return
+ * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
+ * efficacy reduced.
  *
  * Invalidating the designated pointer member before freeing the object marks
  * the object to be avoided in the callback, and conversely, assigning a valid
@@ -997,6 +1010,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
 size_t kmem_content_log_size;	/* content log size [2% of memory] */
 size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */
 size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */
+size_t kmem_zerosized_log_size;	/* zero-sized log [4 pages per CPU] */
 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
 size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */
 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
@@ -1004,6 +1018,14 @@ int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */
 size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */
 size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */
 
+#ifdef DEBUG
+int kmem_warn_zerosized = 1;	/* whether to warn on zero-sized KM_SLEEP */
+#else
+int kmem_warn_zerosized = 0;	/* whether to warn on zero-sized KM_SLEEP */
+#endif
+
+int kmem_panic_zerosized = 0;	/* whether to panic on zero-sized KM_SLEEP */
+
 #ifdef _LP64
 size_t	kmem_max_cached = KMEM_BIG_MAXBUF;	/* maximum kmem_alloc cache */
 #else
@@ -1037,21 +1059,7 @@ static vmem_t		*kmem_default_arena;
 static vmem_t		*kmem_firewall_va_arena;
 static vmem_t		*kmem_firewall_arena;
 
-/*
- * Define KMEM_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef	DEBUG
-#define	KMEM_STATS
-#endif	/* DEBUG */
-
-#ifdef	KMEM_STATS
-#define	KMEM_STAT_ADD(stat)			((stat)++)
-#define	KMEM_STAT_COND_ADD(cond, stat)		((void) (!(cond) || (stat)++))
-#else
-#define	KMEM_STAT_ADD(stat)			/* nothing */
-#define	KMEM_STAT_COND_ADD(cond, stat)		/* nothing */
-#endif	/* KMEM_STATS */
+static int		kmem_zerosized;		/* # of zero-sized allocs */
 
 /*
  * kmem slab consolidator thresholds (tunables)
@@ -1070,47 +1078,6 @@ size_t kmem_reclaim_max_slabs = 1;
  */
 size_t kmem_reclaim_scan_range = 12;
 
-#ifdef	KMEM_STATS
-static struct {
-	uint64_t kms_callbacks;
-	uint64_t kms_yes;
-	uint64_t kms_no;
-	uint64_t kms_later;
-	uint64_t kms_dont_need;
-	uint64_t kms_dont_know;
-	uint64_t kms_hunt_found_mag;
-	uint64_t kms_hunt_found_slab;
-	uint64_t kms_hunt_alloc_fail;
-	uint64_t kms_hunt_lucky;
-	uint64_t kms_notify;
-	uint64_t kms_notify_callbacks;
-	uint64_t kms_disbelief;
-	uint64_t kms_already_pending;
-	uint64_t kms_callback_alloc_fail;
-	uint64_t kms_callback_taskq_fail;
-	uint64_t kms_endscan_slab_dead;
-	uint64_t kms_endscan_slab_destroyed;
-	uint64_t kms_endscan_nomem;
-	uint64_t kms_endscan_refcnt_changed;
-	uint64_t kms_endscan_nomove_changed;
-	uint64_t kms_endscan_freelist;
-	uint64_t kms_avl_update;
-	uint64_t kms_avl_noupdate;
-	uint64_t kms_no_longer_reclaimable;
-	uint64_t kms_notify_no_longer_reclaimable;
-	uint64_t kms_notify_slab_dead;
-	uint64_t kms_notify_slab_destroyed;
-	uint64_t kms_alloc_fail;
-	uint64_t kms_constructor_fail;
-	uint64_t kms_dead_slabs_freed;
-	uint64_t kms_defrags;
-	uint64_t kms_scans;
-	uint64_t kms_scan_depot_ws_reaps;
-	uint64_t kms_debug_reaps;
-	uint64_t kms_debug_scans;
-} kmem_move_stats;
-#endif	/* KMEM_STATS */
-
 /* consolidator knobs */
 static boolean_t kmem_move_noreap;
 static boolean_t kmem_move_blocked;
@@ -1141,6 +1108,7 @@ kmem_log_header_t	*kmem_transaction_log;
 kmem_log_header_t	*kmem_content_log;
 kmem_log_header_t	*kmem_failure_log;
 kmem_log_header_t	*kmem_slab_log;
+kmem_log_header_t	*kmem_zerosized_log;
 
 static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
 
@@ -1921,15 +1889,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf)
 		cp->cache_complete_slab_count--;
 		avl_add(&cp->cache_partial_slabs, sp);
 	} else {
-#ifdef	DEBUG
-		if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
-			KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
-		} else {
-			KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
-		}
-#else
 		(void) avl_update_gt(&cp->cache_partial_slabs, sp);
-#endif
 	}
 
 	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
@@ -2941,8 +2901,33 @@ kmem_alloc(size_t size, int kmflag)
 		/* fall through to kmem_cache_alloc() */
 
 	} else {
-		if (size == 0)
+		if (size == 0) {
+			if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
+				return (NULL);
+
+			/*
+			 * If this is a sleeping allocation or one that has
+			 * been specified to panic on allocation failure, we
+			 * consider it to be deprecated behavior to allocate
+			 * 0 bytes.  If we have been configured to panic under
+			 * this condition, we panic; if to warn, we warn -- and
+			 * regardless, we log to the kmem_zerosized_log that
+			 * this condition has occurred (which gives us
+			 * enough information to be able to debug it).
+			 */
+			if (kmem_panic && kmem_panic_zerosized)
+				panic("attempted to kmem_alloc() size of 0");
+
+			if (kmem_warn_zerosized) {
+				cmn_err(CE_WARN, "kmem_alloc(): sleeping "
+				    "allocation with size of 0; "
+				    "see kmem_zerosized_log for details");
+			}
+
+			kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
+
 			return (NULL);
+		}
 
 		buf = vmem_alloc(kmem_oversize_arena, size,
 		    kmflag & KM_VMFLAGS);
@@ -3556,7 +3541,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw)
 		kmcp->kmc_move_later.value.ui64		= kd->kmd_later;
 		kmcp->kmc_move_dont_need.value.ui64	= kd->kmd_dont_need;
 		kmcp->kmc_move_dont_know.value.ui64	= kd->kmd_dont_know;
-		kmcp->kmc_move_hunt_found.value.ui64	= kd->kmd_hunt_found;
+		kmcp->kmc_move_hunt_found.value.ui64	= 0;
 		kmcp->kmc_move_slabs_freed.value.ui64	= kd->kmd_slabs_freed;
 		kmcp->kmc_defrag.value.ui64		= kd->kmd_defrags;
 		kmcp->kmc_scan.value.ui64		= kd->kmd_scans;
@@ -4127,7 +4112,8 @@ kmem_cache_destroy(kmem_cache_t *cp)
 
 	if (kmem_taskq != NULL)
 		taskq_wait(kmem_taskq);
-	if (kmem_move_taskq != NULL)
+
+	if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
 		taskq_wait(kmem_move_taskq);
 
 	kmem_cache_magazine_purge(cp);
@@ -4465,8 +4451,8 @@ kmem_init(void)
 	}
 
 	kmem_failure_log = kmem_log_init(kmem_failure_log_size);
-
 	kmem_slab_log = kmem_log_init(kmem_slab_log_size);
+	kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
 
 	/*
 	 * Initialize STREAMS message caches so allocb() is available.
@@ -4654,94 +4640,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)
 	    (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
 }
 
-static void *
-kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
-    void *tbuf)
-{
-	int i;		/* magazine round index */
-
-	for (i = 0; i < n; i++) {
-		if (buf == m->mag_round[i]) {
-			if (cp->cache_flags & KMF_BUFTAG) {
-				(void) kmem_cache_free_debug(cp, tbuf,
-				    caller());
-			}
-			m->mag_round[i] = tbuf;
-			return (buf);
-		}
-	}
-
-	return (NULL);
-}
-
-/*
- * Hunt the magazine layer for the given buffer. If found, the buffer is
- * removed from the magazine layer and returned, otherwise NULL is returned.
- * The state of the returned buffer is freed and constructed.
- */
-static void *
-kmem_hunt_mags(kmem_cache_t *cp, void *buf)
-{
-	kmem_cpu_cache_t *ccp;
-	kmem_magazine_t	*m;
-	int cpu_seqid;
-	int n;		/* magazine rounds */
-	void *tbuf;	/* temporary swap buffer */
-
-	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
-
-	/*
-	 * Allocated a buffer to swap with the one we hope to pull out of a
-	 * magazine when found.
-	 */
-	tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
-	if (tbuf == NULL) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
-		return (NULL);
-	}
-	if (tbuf == buf) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
-		if (cp->cache_flags & KMF_BUFTAG) {
-			(void) kmem_cache_free_debug(cp, buf, caller());
-		}
-		return (buf);
-	}
-
-	/* Hunt the depot. */
-	mutex_enter(&cp->cache_depot_lock);
-	n = cp->cache_magtype->mt_magsize;
-	for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
-		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
-			mutex_exit(&cp->cache_depot_lock);
-			return (buf);
-		}
-	}
-	mutex_exit(&cp->cache_depot_lock);
-
-	/* Hunt the per-CPU magazines. */
-	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
-		ccp = &cp->cache_cpu[cpu_seqid];
-
-		mutex_enter(&ccp->cc_lock);
-		m = ccp->cc_loaded;
-		n = ccp->cc_rounds;
-		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
-			mutex_exit(&ccp->cc_lock);
-			return (buf);
-		}
-		m = ccp->cc_ploaded;
-		n = ccp->cc_prounds;
-		if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
-			mutex_exit(&ccp->cc_lock);
-			return (buf);
-		}
-		mutex_exit(&ccp->cc_lock);
-	}
-
-	kmem_cache_free(cp, tbuf);
-	return (NULL);
-}
-
 /*
  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
  * or when the buffer is freed.
@@ -4805,7 +4703,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
  * NO		kmem frees the new buffer, marks the slab of the old buffer
  *              non-reclaimable to avoid bothering the client again
  * LATER	kmem frees the new buffer, increments slab_later_count
- * DONT_KNOW	kmem frees the new buffer, searches mags for the old buffer
+ * DONT_KNOW	kmem frees the new buffer
  * DONT_NEED	kmem frees both the old buffer and the new buffer
  *
  * The pending callback argument now being processed contains both of the
@@ -4839,19 +4737,14 @@ kmem_move_buffer(kmem_move_t *callback)
 	 * another buffer on the same slab.
 	 */
 	if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
-		KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
-		    kmem_move_stats.kms_notify_no_longer_reclaimable);
 		kmem_slab_free(cp, callback->kmm_to_buf);
 		kmem_move_end(cp, callback);
 		return;
 	}
 
 	/*
-	 * Hunting magazines is expensive, so we'll wait to do that until the
-	 * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
-	 * is cheap, so we might as well do that here in case we can avoid
-	 * bothering the client.
+	 * Checking the slab layer is easy, so we might as well do that here
+	 * in case we can avoid bothering the client.
 	 */
 	mutex_enter(&cp->cache_lock);
 	free_on_slab = (kmem_slab_allocated(cp, sp,
@@ -4859,7 +4752,6 @@ kmem_move_buffer(kmem_move_t *callback)
 	mutex_exit(&cp->cache_lock);
 
 	if (free_on_slab) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
 		kmem_slab_free(cp, callback->kmm_to_buf);
 		kmem_move_end(cp, callback);
 		return;
@@ -4871,7 +4763,6 @@ kmem_move_buffer(kmem_move_t *callback)
 		 */
 		if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
 		    KM_NOSLEEP, 1, caller()) != 0) {
-			KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
 			kmem_move_end(cp, callback);
 			return;
 		}
@@ -4879,15 +4770,11 @@ kmem_move_buffer(kmem_move_t *callback)
 	    cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
 	    KM_NOSLEEP) != 0) {
 		atomic_inc_64(&cp->cache_alloc_fail);
-		KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
 		kmem_slab_free(cp, callback->kmm_to_buf);
 		kmem_move_end(cp, callback);
 		return;
 	}
 
-	KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
-	KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
-	    kmem_move_stats.kms_notify_callbacks);
 	cp->cache_defrag->kmd_callbacks++;
 	cp->cache_defrag->kmd_thread = curthread;
 	cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
@@ -4905,7 +4792,6 @@ kmem_move_buffer(kmem_move_t *callback)
 	cp->cache_defrag->kmd_to_buf = NULL;
 
 	if (response == KMEM_CBRC_YES) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_yes);
 		cp->cache_defrag->kmd_yes++;
 		kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
 		/* slab safe to access until kmem_move_end() */
@@ -4920,14 +4806,12 @@ kmem_move_buffer(kmem_move_t *callback)
 
 	switch (response) {
 	case KMEM_CBRC_NO:
-		KMEM_STAT_ADD(kmem_move_stats.kms_no);
 		cp->cache_defrag->kmd_no++;
 		mutex_enter(&cp->cache_lock);
 		kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
 		mutex_exit(&cp->cache_lock);
 		break;
 	case KMEM_CBRC_LATER:
-		KMEM_STAT_ADD(kmem_move_stats.kms_later);
 		cp->cache_defrag->kmd_later++;
 		mutex_enter(&cp->cache_lock);
 		if (!KMEM_SLAB_IS_PARTIAL(sp)) {
@@ -4936,7 +4820,6 @@ kmem_move_buffer(kmem_move_t *callback)
 		}
 
 		if (++sp->slab_later_count >= KMEM_DISBELIEF) {
-			KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
 			kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
 		} else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
 			sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
@@ -4945,7 +4828,6 @@ kmem_move_buffer(kmem_move_t *callback)
 		mutex_exit(&cp->cache_lock);
 		break;
 	case KMEM_CBRC_DONT_NEED:
-		KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
 		cp->cache_defrag->kmd_dont_need++;
 		kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
 		if (sp->slab_refcnt == 0)
@@ -4955,19 +4837,21 @@ kmem_move_buffer(kmem_move_t *callback)
 		mutex_exit(&cp->cache_lock);
 		break;
 	case KMEM_CBRC_DONT_KNOW:
-		KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);
+		/*
+		 * If we don't know if we can move this buffer or not, we'll
+		 * just assume that we can't:  if the buffer is in fact free,
+		 * then it is sitting in one of the per-CPU magazines or in
+		 * a full magazine in the depot layer.  Either way, because
+		 * defrag is induced in the same logic that reaps a cache,
+		 * it's likely that full magazines will be returned to the
+		 * system soon (thereby accomplishing what we're trying to
+		 * accomplish here: return those magazines to their slabs).
+		 * Given this, any work that we might do now to locate a buffer
+		 * in a magazine is wasted (and expensive!) work; we bump
+		 * a counter in this case and otherwise assume that we can't
+		 * move it.
+		 */
 		cp->cache_defrag->kmd_dont_know++;
-		if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
-			KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
-			cp->cache_defrag->kmd_hunt_found++;
-			kmem_slab_free_constructed(cp, callback->kmm_from_buf,
-			    B_TRUE);
-			if (sp->slab_refcnt == 0)
-				cp->cache_defrag->kmd_slabs_freed++;
-			mutex_enter(&cp->cache_lock);
-			kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
-			mutex_exit(&cp->cache_lock);
-		}
 		break;
 	default:
 		panic("'%s' (%p) unexpected move callback response %d\n",
@@ -4992,10 +4876,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
 	ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
 
 	callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
-	if (callback == NULL) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
+
+	if (callback == NULL)
 		return (B_FALSE);
-	}
 
 	callback->kmm_from_slab = sp;
 	callback->kmm_from_buf = buf;
@@ -5020,7 +4903,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
 			pending->kmm_flags |= KMM_DESPERATE;
 		}
 		mutex_exit(&cp->cache_lock);
-		KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
 		kmem_cache_free(kmem_move_cache, callback);
 		return (B_TRUE);
 	}
@@ -5034,7 +4916,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
 
 	if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
 	    callback, TQ_NOSLEEP)) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
 		mutex_enter(&cp->cache_lock);
 		avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
 		mutex_exit(&cp->cache_lock);
@@ -5080,7 +4961,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
 			cp->cache_slab_destroy++;
 			mutex_exit(&cp->cache_lock);
 			kmem_slab_destroy(cp, sp);
-			KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
 			mutex_enter(&cp->cache_lock);
 		}
 	}
@@ -5225,8 +5105,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
 					 * pending move completes.
 					 */
 					list_insert_head(deadlist, sp);
-					KMEM_STAT_ADD(kmem_move_stats.
-					    kms_endscan_slab_dead);
 					return (-1);
 				}
 
@@ -5241,10 +5119,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
 				cp->cache_slab_destroy++;
 				mutex_exit(&cp->cache_lock);
 				kmem_slab_destroy(cp, sp);
-				KMEM_STAT_ADD(kmem_move_stats.
-				    kms_dead_slabs_freed);
-				KMEM_STAT_ADD(kmem_move_stats.
-				    kms_endscan_slab_destroyed);
 				mutex_enter(&cp->cache_lock);
 				/*
 				 * Since we can't pick up the scan where we left
@@ -5260,8 +5134,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
 				 * for the request and say nothing about the
 				 * number of reclaimable slabs.
 				 */
-				KMEM_STAT_COND_ADD(s < max_slabs,
-				    kmem_move_stats.kms_endscan_nomem);
 				return (-1);
 			}
 
@@ -5277,16 +5149,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
 				 * destination buffer on the same slab. In that
 				 * case, we're not interested in counting it.
 				 */
-				KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
-				    (s < max_slabs),
-				    kmem_move_stats.kms_endscan_refcnt_changed);
 				return (-1);
 			}
-			if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
-				KMEM_STAT_COND_ADD(s < max_slabs,
-				    kmem_move_stats.kms_endscan_nomove_changed);
+			if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
 				return (-1);
-			}
 
 			/*
 			 * Generating a move request allocates a destination
@@ -5313,11 +5179,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
 	}
 end_scan:
 
-	KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
-	    (s < max_slabs) &&
-	    (sp == avl_first(&cp->cache_partial_slabs)),
-	    kmem_move_stats.kms_endscan_freelist);
-
 	return (s);
 }
 
@@ -5377,8 +5238,6 @@ kmem_cache_move_notify_task(void *arg)
 			    &cp->cache_defrag->kmd_moves_pending)) {
 				list_insert_head(deadlist, sp);
 				mutex_exit(&cp->cache_lock);
-				KMEM_STAT_ADD(kmem_move_stats.
-				    kms_notify_slab_dead);
 				return;
 			}
 
@@ -5386,9 +5245,6 @@ kmem_cache_move_notify_task(void *arg)
 			cp->cache_slab_destroy++;
 			mutex_exit(&cp->cache_lock);
 			kmem_slab_destroy(cp, sp);
-			KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
-			KMEM_STAT_ADD(kmem_move_stats.
-			    kms_notify_slab_destroyed);
 			return;
 		}
 	} else {
@@ -5402,7 +5258,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
 {
 	kmem_move_notify_args_t *args;
 
-	KMEM_STAT_ADD(kmem_move_stats.kms_notify);
 	args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
 	if (args != NULL) {
 		args->kmna_cache = cp;
@@ -5425,7 +5280,6 @@ kmem_cache_defrag(kmem_cache_t *cp)
 	n = avl_numnodes(&cp->cache_partial_slabs);
 	if (n > 1) {
 		/* kmem_move_buffers() drops and reacquires cache_lock */
-		KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
 		cp->cache_defrag->kmd_defrags++;
 		(void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
 	}
@@ -5524,7 +5378,6 @@ kmem_cache_scan(kmem_cache_t *cp)
 		 *
 		 * kmem_move_buffers() drops and reacquires cache_lock.
 		 */
-		KMEM_STAT_ADD(kmem_move_stats.kms_scans);
 		kmd->kmd_scans++;
 		slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
 		    kmem_reclaim_max_slabs, 0);
@@ -5565,12 +5418,9 @@ kmem_cache_scan(kmem_cache_t *cp)
 			if (!kmem_move_noreap &&
 			    ((debug_rand % kmem_mtb_reap) == 0)) {
 				mutex_exit(&cp->cache_lock);
-				KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
 				kmem_cache_reap(cp);
 				return;
 			} else if ((debug_rand % kmem_mtb_move) == 0) {
-				KMEM_STAT_ADD(kmem_move_stats.kms_scans);
-				KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
 				kmd->kmd_scans++;
 				(void) kmem_move_buffers(cp,
 				    kmem_reclaim_scan_range, 1, KMM_DEBUG);
@@ -5581,8 +5431,6 @@ kmem_cache_scan(kmem_cache_t *cp)
 
 	mutex_exit(&cp->cache_lock);
 
-	if (reap) {
-		KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
+	if (reap)
 		kmem_depot_ws_reap(cp);
-	}
 }
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 149f5f8a88..cbc4fa0000 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2013 Gary Mills
  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -249,8 +250,7 @@ log_init(void)
 	 */
 	printf("\rSunOS Release %s Version %s %u-bit\n",
 	    utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
-	printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. "
-	    "All rights reserved.\n");
+	printf("Copyright (c) 2010-2016, Joyent Inc. All rights reserved.\n");
 #ifdef DEBUG
 	printf("DEBUG enabled\n");
 #endif
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index feb8e76c42..a7de7b513f 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #include <sys/param.h>
@@ -57,6 +57,8 @@
 #include <sys/lgrp.h>
 #include <sys/rctl.h>
 #include <sys/contract_impl.h>
+#include <sys/contract/process.h>
+#include <sys/contract/process_impl.h>
 #include <sys/cpc_impl.h>
 #include <sys/sdt.h>
 #include <sys/cmn_err.h>
@@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
 	ret_tidhash_t *ret_tidhash = NULL;
 	int i;
 	int rctlfail = 0;
-	boolean_t branded = 0;
+	void *brand_data = NULL;
 	struct ctxop *ctx = NULL;
 
 	ASSERT(cid != sysdccid);	/* system threads must start in SYS */
@@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
 	 */
 	lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
 
+	/*
+	 * If necessary, speculatively allocate lwp brand data.  This is done
+	 * ahead of time so p_lock need not be dropped during lwp branding.
+	 */
+	if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) {
+		if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) {
+			mutex_enter(&p->p_lock);
+			err = 1;
+			atomic_inc_32(&p->p_zone->zone_ffmisc);
+			goto error;
+		}
+	}
+
 	mutex_enter(&p->p_lock);
 grow:
 	/*
@@ -630,18 +645,6 @@ grow:
 		} while (lwp_hash_lookup(p, t->t_tid) != NULL);
 	}
 
-	/*
-	 * If this is a branded process, let the brand do any necessary lwp
-	 * initialization.
-	 */
-	if (PROC_IS_BRANDED(p)) {
-		if (BROP(p)->b_initlwp(lwp)) {
-			err = 1;
-			atomic_inc_32(&p->p_zone->zone_ffmisc);
-			goto error;
-		}
-		branded = 1;
-	}
 
 	if (t->t_tid == 1) {
 		kpreempt_disable();
@@ -654,7 +657,6 @@ grow:
 		}
 	}
 
-	p->p_lwpcnt++;
 	t->t_waitfor = -1;
 
 	/*
@@ -696,8 +698,27 @@ grow:
 	t->t_post_sys = 1;
 
 	/*
+	 * Perform lwp branding
+	 *
+	 * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be
+	 * continuously held between when the tidhash is sized and when the lwp
+	 * is inserted into it.  Operations requiring p->p_lock to be
+	 * temporarily dropped can be performed in b_initlwp_post.
+	 */
+	if (PROC_IS_BRANDED(p)) {
+		BROP(p)->b_initlwp(lwp, brand_data);
+		/*
+		 * The b_initlwp hook is expected to consume any preallocated
+		 * brand_data in a way that prepares it for deallocation by the
+		 * b_freelwp hook.
+		 */
+		brand_data = NULL;
+	}
+
+	/*
 	 * Insert the new thread into the list of all threads.
 	 */
+	p->p_lwpcnt++;
 	if ((tx = p->p_tlist) == NULL) {
 		t->t_back = t;
 		t->t_forw = t;
@@ -718,6 +739,13 @@ grow:
 	lep->le_start = t->t_start;
 	lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1);
 
+	/*
+	 * Complete lwp branding
+	 */
+	if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) {
+		BROP(p)->b_initlwp_post(lwp);
+	}
+
 	if (state == TS_RUN) {
 		/*
 		 * We set the new lwp running immediately.
@@ -753,8 +781,9 @@ error:
 		if (cid != NOCLASS && bufp != NULL)
 			CL_FREE(cid, bufp);
 
-		if (branded)
-			BROP(p)->b_freelwp(lwp);
+		if (brand_data != NULL) {
+			BROP(p)->b_lwpdata_free(brand_data);
+		}
 
 		mutex_exit(&p->p_lock);
 		t->t_state = TS_FREE;
@@ -827,8 +856,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
 	int i;
 
 	for (i = 0; i < ct_ntypes; i++) {
-		dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]);
+		ct_template_t *tmpl = src->lwp_ct_active[i];
+
+		/*
+		 * If the process contract template is set up to be preserved
+		 * across exec, then if we're forking, perform an implicit
+		 * template_clear now. This ensures that future children of
+		 * this child will remain in the same contract unless they're
+		 * explicitly setup differently. We know we're forking if the
+		 * two LWPs belong to different processes.
+		 */
+		if (i == CTT_PROCESS && tmpl != NULL) {
+			ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+			if (dst->lwp_procp != src->lwp_procp &&
+			    (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+				tmpl = NULL;
+		}
+
+		dst->lwp_ct_active[i] = ctmpl_dup(tmpl);
 		dst->lwp_ct_latest[i] = NULL;
+
 	}
 }
 
@@ -836,21 +884,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
  * Clear an LWP's contract template state.
  */
 void
-lwp_ctmpl_clear(klwp_t *lwp)
+lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)
 {
 	ct_template_t *tmpl;
 	int i;
 
 	for (i = 0; i < ct_ntypes; i++) {
-		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
-			ctmpl_free(tmpl);
-			lwp->lwp_ct_active[i] = NULL;
-		}
-
 		if (lwp->lwp_ct_latest[i] != NULL) {
 			contract_rele(lwp->lwp_ct_latest[i]);
 			lwp->lwp_ct_latest[i] = NULL;
 		}
+
+		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
+			/*
+			 * If we're exec-ing a new program and the process
+			 * contract template is set up to be preserved across
+			 * exec, then don't clear it.
+			 */
+			if (is_exec && i == CTT_PROCESS) {
+				ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+				if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+					continue;
+			}
+
+			ctmpl_free(tmpl);
+			lwp->lwp_ct_active[i] = NULL;
+		}
 	}
 }
 
@@ -891,13 +951,6 @@ lwp_exit(void)
 	if (t->t_upimutex != NULL)
 		upimutex_cleanup();
 
-	/*
-	 * Perform any brand specific exit processing, then release any
-	 * brand data associated with the lwp
-	 */
-	if (PROC_IS_BRANDED(p))
-		BROP(p)->b_lwpexit(lwp);
-
 	lwp_pcb_exit();
 
 	mutex_enter(&p->p_lock);
@@ -941,6 +994,18 @@ lwp_exit(void)
 	DTRACE_PROC(lwp__exit);
 
 	/*
+	 * Perform any brand specific exit processing, then release any
+	 * brand data associated with the lwp
+	 */
+	if (PROC_IS_BRANDED(p)) {
+		mutex_exit(&p->p_lock);
+		BROP(p)->b_lwpexit(lwp);
+		BROP(p)->b_freelwp(lwp);
+		mutex_enter(&p->p_lock);
+		prbarrier(p);
+	}
+
+	/*
 	 * If the lwp is a detached lwp or if the process is exiting,
 	 * remove (lwp_hash_out()) the lwp from the lwp directory.
 	 * Otherwise null out the lwp's le_thread pointer in the lwp
@@ -1101,7 +1166,7 @@ lwp_cleanup(void)
 	}
 	kpreempt_enable();
 
-	lwp_ctmpl_clear(ttolwp(t));
+	lwp_ctmpl_clear(ttolwp(t), B_FALSE);
 }
 
 int
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index 7afc1cfe00..dda0b3e4a6 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -27,7 +27,7 @@
 /*	  All Rights Reserved  	*/
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args)
 	int error = 0, count = 0;
 	proc_t *p = ttoproc(curthread);
 	klwp_t *lwp = ttolwp(curthread);
-	int brand_action;
+	int brand_action = EBA_NONE;
 
 	if (args == NULL)
 		args = "";
@@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args)
 	 */
 	sigemptyset(&curthread->t_hold);
 
-	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
+	/*
+	 * Only instruct exec_common to brand the process if necessary.  It is
+	 * possible that the init process is already properly branded due to the
+	 * proc_exit -> restart_init -> exec_init call chain.
+	 */
+	if (ZONE_IS_BRANDED(p->p_zone) &&
+	    p->p_brand != p->p_zone->zone_brand) {
+		brand_action = EBA_BRAND;
+	}
 again:
 	error = exec_common((const char *)(uintptr_t)exec_fnamep,
 	    (const char **)(uintptr_t)uap, NULL, brand_action);
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index b6b5446d71..596c855a45 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -1360,10 +1360,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
 			}
 			if (num_segs++ == 0) {
 				/*
-				 * The p_vaddr of the first PT_LOAD segment
-				 * must either be NULL or within the first
-				 * page in order to be interpreted.
-				 * Otherwise, its an invalid file.
+				 * While ELF doesn't specify the meaning of
+				 * p_vaddr for PT_LOAD segments in ET_DYN
+				 * objects, we mandate that it is either NULL or
+				 * (to accommodate some historical binaries)
+				 * within the first page.  (Note that there
+				 * exist non-native ET_DYN objects that violate
+				 * this constraint that we nonetheless must be
+				 * able to execute; see the ET_DYN handling in
+				 * mapelfexec() for details.)
 				 */
 				if (e_type == ET_DYN &&
 				    ((caddr_t)((uintptr_t)vaddr &
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index b555bb82b7..eba6147fab 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -112,6 +113,18 @@ pid_lookup(pid_t pid)
 	return (pidp);
 }
 
+struct pid *
+pid_find(pid_t pid)
+{
+	struct pid *pidp;
+
+	mutex_enter(&pidlinklock);
+	pidp = pid_lookup(pid);
+	mutex_exit(&pidlinklock);
+
+	return (pidp);
+}
+
 void
 pid_setmin(void)
 {
@@ -522,6 +535,20 @@ sprunlock(proc_t *p)
 	THREAD_KPRI_RELEASE();
 }
 
+/*
+ * Undo effects of sprlock but without dropping p->p_lock
+ */
+void
+sprunprlock(proc_t *p)
+{
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+	ASSERT(MUTEX_HELD(&p->p_lock));
+
+	cv_signal(&pr_pid_cv[p->p_slot]);
+	p->p_proc_flag &= ~P_PR_LOCK;
+	THREAD_KPRI_RELEASE();
+}
+
 void
 pid_init(void)
 {
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index 07bc2920da..d2bdb4ce37 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -55,6 +55,7 @@
 #include <sys/mntent.h>
 #include <sys/contract_impl.h>
 #include <sys/dld_ioc.h>
+#include <sys/brand.h>
 
 /*
  * There are two possible layers of privilege routines and two possible
@@ -1243,6 +1244,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)
 void
 secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
 {
+	proc_t *p = curproc;
+
+	/*
+	 * Allow the brand to override this behaviour.
+	 */
+	if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) {
+		/*
+		 * This brand hook will return 0 if handling is complete, or
+		 * some other value if the brand would like us to fall back to
+		 * the usual behaviour.
+		 */
+		if (BROP(p)->b_setid_clear(vap, cr) == 0) {
+			return;
+		}
+	}
+
 	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
 	    secpolicy_vnode_setid_retain(cr,
 	    (vap->va_mode & S_ISUID) != 0 &&
@@ -2078,6 +2095,13 @@ secpolicy_meminfo(const cred_t *cr)
 }
 
 int
+secpolicy_fs_import(const cred_t *cr)
+{
+	return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL));
+}
+
+
+int
 secpolicy_pfexec_register(const cred_t *cr)
 {
 	return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL));
@@ -2581,3 +2605,11 @@ secpolicy_ppp_config(const cred_t *cr)
 		return (secpolicy_net_config(cr, B_FALSE));
 	return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));
 }
+
+int
+secpolicy_hyprlofs_control(const cred_t *cr)
+{
+	if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL))
+		return (EPERM);
+	return (0);
+}
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index a3cdaccc2a..cc1c5e03a6 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP
 	Allows a process to perform privileged mappings through a
 	graphics device.
 
+privilege PRIV_HYPRLOFS_CONTROL
+
+	Allows a process to manage hyprlofs entries.
+
 privilege PRIV_IPC_DAC_READ
 
 	Allows a process to read a System V IPC
@@ -372,6 +376,10 @@ privilege PRIV_SYS_DEVICES
 	Allows a process to open the real console device directly.
 	Allows a process to open devices that have been exclusively opened.
 
+privilege PRIV_SYS_FS_IMPORT
+
+	Allows a process to import a potentially untrusted file system.
+
 privilege PRIV_SYS_IPC_CONFIG
 
 	Allows a process to increase the size of a System V IPC Message
diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c
index 7bd3dd963f..d89f62bea7 100644
--- a/usr/src/uts/common/os/project.c
+++ b/usr/src/uts/common/os/project.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #include <sys/project.h>
@@ -659,6 +660,14 @@ static rctl_ops_t project_tasks_ops = {
  */
 
 /*ARGSUSED*/
+static rctl_qty_t
+project_shmmax_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_task->tk_proj->kpj_data.kpd_shmmax);
+}
+
+/*ARGSUSED*/
 static int
 project_shmmax_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
     rctl_val_t *rval, rctl_qty_t inc, uint_t flags)
@@ -675,7 +684,7 @@ project_shmmax_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
 
 static rctl_ops_t project_shmmax_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	project_shmmax_usage,
 	rcop_no_set,
 	project_shmmax_test
 };
@@ -685,6 +694,14 @@ static rctl_ops_t project_shmmax_ops = {
  */
 
 /*ARGSUSED*/
+static rctl_qty_t
+project_shmmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_shmmni);
+}
+
+/*ARGSUSED*/
 static int
 project_shmmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
     rctl_val_t *rval, rctl_qty_t inc, uint_t flags)
@@ -701,7 +718,7 @@ project_shmmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
 
 static rctl_ops_t project_shmmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	project_shmmni_usage,
 	rcop_no_set,
 	project_shmmni_test
 };
@@ -711,6 +728,14 @@ static rctl_ops_t project_shmmni_ops = {
  */
 
 /*ARGSUSED*/
+static rctl_qty_t
+project_semmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_semmni);
+}
+
+/*ARGSUSED*/
 static int
 project_semmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
     rctl_val_t *rval, rctl_qty_t inc, uint_t flags)
@@ -727,7 +752,7 @@ project_semmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
 
 static rctl_ops_t project_semmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	project_semmni_usage,
 	rcop_no_set,
 	project_semmni_test
 };
@@ -737,6 +762,14 @@ static rctl_ops_t project_semmni_ops = {
  */
 
 /*ARGSUSED*/
+static rctl_qty_t
+project_msgmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_msgmni);
+}
+
+/*ARGSUSED*/
 static int
 project_msgmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
     rctl_val_t *rval, rctl_qty_t inc, uint_t flags)
@@ -753,7 +786,7 @@ project_msgmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e,
 
 static rctl_ops_t project_msgmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	project_msgmni_usage,
 	rcop_no_set,
 	project_msgmni_test
 };
diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c
index c1d6569f11..15e77d39f7 100644
--- a/usr/src/uts/common/os/sched.c
+++ b/usr/src/uts/common/os/sched.c
@@ -27,6 +27,10 @@
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved	*/
 
+/*
+ * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
+ */
+
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/sysmacros.h>
@@ -646,16 +650,17 @@ top:
 		klwp_t *lwp = ttolwp(tp);
 
 		/*
-		 * Swapout eligible lwps (specified by the scheduling
-		 * class) which don't have TS_DONT_SWAP set.  Set the
-		 * "intent to swap" flag (TS_SWAPENQ) on threads
-		 * which have TS_DONT_SWAP set so that they can be
+		 * Swapout eligible lwps (specified by the scheduling class)
+		 * which don't have TS_DONT_SWAP set.  Set the "intent to swap"
+		 * flag (TS_SWAPENQ) on threads which either have TS_DONT_SWAP
+		 * set or are currently on a split stack so that they can be
 		 * swapped if and when they reach a safe point.
 		 */
 		thread_lock(tp);
 		thread_pri = CL_SWAPOUT(tp, swapflags);
 		if (thread_pri != -1) {
-			if (tp->t_schedflag & TS_DONT_SWAP) {
+			if ((tp->t_schedflag & TS_DONT_SWAP) ||
+			    (tp->t_flag & T_SPLITSTK)) {
 				tp->t_schedflag |= TS_SWAPENQ;
 				tp->t_trapret = 1;
 				aston(tp);
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index bacc595f78..5deae96d73 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
 		size_t	share_size;
 		struct	shm_data ssd;
 		uintptr_t align_hint;
+		long	curprot;
 
 		/*
 		 * Pick a share pagesize to use, if (!isspt(sp)).
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
 			}
 		}
 
+		curprot = sp->shm_opts & SHM_PROT_MASK;
 		if (!isspt(sp)) {
 			error = sptcreate(size, &segspt, sp->shm_amp, prot,
 			    flags, share_szc);
@@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
 			}
 			sp->shm_sptinfo->sptas = segspt->s_as;
 			sp->shm_sptseg = segspt;
-			sp->shm_sptprot = prot;
-		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
+			sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot;
+		} else if ((prot & curprot) != curprot) {
 			/*
 			 * Ensure we're attaching to an ISM segment with
 			 * fewer or equal permissions than what we're
@@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)
 		}
 		break;
 
+	/* Stage segment for removal, but don't remove until last detach */
+	case SHM_RMID:
+		if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0)
+			break;
+
+		/*
+		 * If attached, just mark it as a pending remove, otherwise
+		 * we must perform the normal ipc_rmid now.
+		 */
+		if ((sp->shm_perm.ipc_ref - 1) > 0) {
+			sp->shm_opts |= SHM_RM_PENDING;
+		} else {
+			mutex_exit(lock);
+			return (ipc_rmid(shm_svc, shmid, cr));
+		}
+		break;
+
 	default:
 		error = EINVAL;
 		break;
@@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)
 		sp->shm_ismattch--;
 	sp->shm_dtime = gethrestime_sec();
 	sp->shm_lpid = pp->p_pid;
+	if ((sp->shm_opts & SHM_RM_PENDING) != 0 &&
+	    sp->shm_perm.ipc_ref == 2) {
+		/*
+		 * If this is the last detach of the segment across the whole
+		 * system, we can now perform the delayed IPC_RMID.
+		 * The ipc_ref count has one for the original 'get' and one for
+		 * each 'attach' (see 'stat' handling in shmctl).
+		 */
+		sp->shm_opts &= ~SHM_RM_PENDING;
+		mutex_enter(&shm_svc->ipcs_lock);
+		ipc_rmsvc(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */
+		ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock));
+		ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0);
+
+		/* Lock was dropped, need to retake it for following rele. */
+		(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
+	}
 	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */
 
 	kmem_free(sap, sizeof (segacct_t));
diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c
index 453b1f22d4..5ef12f3ae4 100644
--- a/usr/src/uts/common/os/sig.c
+++ b/usr/src/uts/common/os/sig.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -60,6 +60,7 @@
 #include <sys/cyclic.h>
 #include <sys/dtrace.h>
 #include <sys/sdt.h>
+#include <sys/brand.h>
 #include <sys/signalfd.h>
 
 const k_sigset_t nullsmask = {0, 0, 0};
@@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)
 }
 
 /*
+ * Return true if the signal can safely be ignored.
+ * That is, if the signal is included in the p_ignore mask and ignoring it is
+ * not forbidden by any process branding.
+ */
+static int
+sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+	return (sigismember(&p->p_ignore, sig) &&	/* sig in ignore mask */
+	    !(PROC_IS_BRANDED(p) &&			/* allowed by brand */
+	    BROP(p)->b_sig_ignorable != NULL &&
+	    BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE));
+
+}
+
+/*
  * Return true if the signal can safely be discarded on generation.
  * That is, if there is no need for the signal on the receiving end.
  * The answer is true if the process is a zombie or
@@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)
  *	the signal is not being accepted via sigwait()
  */
 static int
-sig_discardable(proc_t *p, int sig)
+sig_discardable(proc_t *p, kthread_t *tp, int sig)
 {
 	kthread_t *t = p->p_tlist;
+	klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;
 
 	return (t == NULL ||		/* if zombie or ... */
-	    (sigismember(&p->p_ignore, sig) &&	/* signal is ignored */
+	    (sig_ignorable(p, lwp, sig) &&		/* signal is ignored */
 	    t->t_forw == t &&			/* and single-threaded */
 	    !tracing(p, sig) &&			/* and no /proc tracing */
 	    !signal_is_blocked(t, sig) &&	/* and signal not blocked */
@@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)
 		    !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {
 			ttoproc(t)->p_stopsig = 0;
 			t->t_dtrace_stop = 0;
-			t->t_schedflag |= TS_XSTART | TS_PSTART;
+			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
 			setrun_locked(t);
 		} else if (t != curthread && t->t_state == TS_ONPROC) {
 			aston(t);	/* make it do issig promptly */
@@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)
 		}
 	}
 
-	if (sig_discardable(p, sig)) {
+	if (sig_discardable(p, t, sig)) {
 		DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,
 		    proc_t *, p, int, sig);
 		return;
@@ -497,7 +514,7 @@ issig_justlooking(void)
 			if (sigismember(&set, sig) &&
 			    (tracing(p, sig) ||
 			    sigismember(&t->t_sigwait, sig) ||
-			    !sigismember(&p->p_ignore, sig))) {
+			    !sig_ignorable(p, lwp, sig))) {
 				/*
 				 * Don't promote a signal that will stop
 				 * the process when lwp_nostop is set.
@@ -623,6 +640,21 @@ issig_forreal(void)
 		}
 
 		/*
+		 * Allow the brand the chance to alter (or suppress) delivery
+		 * of this signal.
+		 */
+		if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) {
+			/*
+			 * The brand hook will return 0 if it would like
+			 * us to drive on, or -1 if we should restart
+			 * the loop to check other conditions.
+			 */
+			if (BROP(p)->b_issig_stop(p, lwp) != 0) {
+				continue;
+			}
+		}
+
+		/*
 		 * Honor requested stop before dealing with the
 		 * current signal; a debugger may change it.
 		 * Do not want to go back to loop here since this is a special
@@ -656,7 +688,7 @@ issig_forreal(void)
 			lwp->lwp_cursig = 0;
 			lwp->lwp_extsig = 0;
 			if (sigismember(&t->t_sigwait, sig) ||
-			    (!sigismember(&p->p_ignore, sig) &&
+			    (!sig_ignorable(p, lwp, sig) &&
 			    !isjobstop(sig))) {
 				if (p->p_flag & (SEXITLWPS|SKILLED)) {
 					sig = SIGKILL;
@@ -708,7 +740,7 @@ issig_forreal(void)
 				toproc = 0;
 				if (tracing(p, sig) ||
 				    sigismember(&t->t_sigwait, sig) ||
-				    !sigismember(&p->p_ignore, sig)) {
+				    !sig_ignorable(p, lwp, sig)) {
 					if (sigismember(&t->t_extsig, sig))
 						ext = 1;
 					break;
@@ -722,7 +754,7 @@ issig_forreal(void)
 				toproc = 1;
 				if (tracing(p, sig) ||
 				    sigismember(&t->t_sigwait, sig) ||
-				    !sigismember(&p->p_ignore, sig)) {
+				    !sig_ignorable(p, lwp, sig)) {
 					if (sigismember(&p->p_extsig, sig))
 						ext = 1;
 					break;
@@ -954,6 +986,16 @@ stop(int why, int what)
 		}
 		break;
 
+	case PR_BRAND:
+		/*
+		 * We have been stopped by the brand code for a brand-private
+		 * reason.  This is an asynchronous stop affecting only this
+		 * LWP.
+		 */
+		VERIFY(PROC_IS_BRANDED(p));
+		flags &= ~TS_BSTART;
+		break;
+
 	default:	/* /proc stop */
 		flags &= ~TS_PSTART;
 		/*
@@ -1065,7 +1107,7 @@ stop(int why, int what)
 		}
 	}
 
-	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) {
+	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {
 		/*
 		 * Do process-level notification when all lwps are
 		 * either stopped on events of interest to /proc
@@ -1171,6 +1213,13 @@ stop(int why, int what)
 	if (why == PR_CHECKPOINT)
 		del_one_utstop();
 
+	/*
+	 * Allow the brand to post notification of this stop condition.
+	 */
+	if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) {
+		BROP(p)->b_stop_notify(p, lwp, why, what);
+	}
+
 	thread_lock(t);
 	ASSERT((t->t_schedflag & TS_ALLSTART) == 0);
 	t->t_schedflag |= flags;
@@ -1192,7 +1241,7 @@ stop(int why, int what)
 		    (p->p_flag & (SEXITLWPS|SKILLED))) {
 			p->p_stopsig = 0;
 			thread_lock(t);
-			t->t_schedflag |= TS_XSTART | TS_PSTART;
+			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
 			setrun_locked(t);
 			thread_unlock_nopreempt(t);
 		} else if (why == PR_JOBCONTROL) {
@@ -1327,7 +1376,7 @@ psig(void)
 	 * this signal from pending to current (we dropped p->p_lock).
 	 * This can happen only in a multi-threaded process.
 	 */
-	if (sigismember(&p->p_ignore, sig) ||
+	if (sig_ignorable(p, lwp, sig) ||
 	    (func == SIG_DFL && sigismember(&stopdefault, sig))) {
 		lwp->lwp_cursig = 0;
 		lwp->lwp_extsig = 0;
@@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)
 			/*
 			 * This can only happen when the parent is init.
 			 * (See call to sigcld(q, NULL) in exit().)
-			 * Use KM_NOSLEEP to avoid deadlock.
+			 * Use KM_NOSLEEP to avoid deadlock. The child proc's
+			 * initpid can be 1 for zlogin.
 			 */
-			ASSERT(pp == proc_init);
+			ASSERT(pp->p_pidp->pid_id ==
+			    cp->p_zone->zone_proc_initpid ||
+			    pp->p_pidp->pid_id == 1);
 			winfo(cp, &info, 0);
 			sigaddq(pp, NULL, &info, KM_NOSLEEP);
 		} else {
@@ -1804,6 +1856,15 @@ sigcld_repost()
 
 	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 	mutex_enter(&pidlock);
+	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) {
+		/*
+		 * Allow the brand to inject synthetic SIGCLD signals.
+		 */
+		if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) {
+			mutex_exit(&pidlock);
+			return;
+		}
+	}
 	for (cp = pp->p_child; cp; cp = cp->p_sibling) {
 		if (cp->p_pidflag & CLDPEND) {
 			post_sigcld(cp, sqp);
@@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)
 	ASSERT(MUTEX_HELD(&p->p_lock));
 	ASSERT(sig >= 1 && sig < NSIG);
 
-	if (sig_discardable(p, sig))
+	if (sig_discardable(p, t, sig))
 		siginfofree(sigqp);
 	else
 		sigaddqins(p, t, sigqp);
@@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)
 	 * blocking the signal (it *could* change it's mind while
 	 * the signal is pending) then don't bother creating one.
 	 */
-	if (!sig_discardable(p, sig) &&
+	if (!sig_discardable(p, t, sig) &&
 	    (sigismember(&p->p_siginfo, sig) ||
 	    (curproc->p_ct_process != p->p_ct_process) ||
 	    (sig == SIGCLD && SI_FROMKERNEL(infop))) &&
diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c
index 6084676b17..6dc7230bed 100644
--- a/usr/src/uts/common/os/smb_subr.c
+++ b/usr/src/uts/common/os/smb_subr.c
@@ -25,7 +25,9 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
+ */
 
 #include <sys/smbios_impl.h>
 #include <sys/cmn_err.h>
@@ -43,13 +45,13 @@ smb_strerror(int err)
 void *
 smb_alloc(size_t len)
 {
-	return (kmem_alloc(len, KM_SLEEP));
+	return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);
 }
 
 void *
 smb_zalloc(size_t len)
 {
-	return (kmem_zalloc(len, KM_SLEEP));
+	return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);
 }
 
 void
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 62f94729cf..0a1406e0cd 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -24,7 +24,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -77,6 +77,7 @@
 #include <sys/policy.h>
 #include <sys/dld.h>
 #include <sys/zone.h>
+#include <sys/limits.h>
 #include <c2/audit.h>
 
 /*
@@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 		 * (registered in sd_wakeq).
 		 */
 		struiod_t uiod;
+		struct iovec buf[IOV_MAX_STACK];
+		int iovlen = 0;
 
 		if (first)
 			stp->sd_wakeq &= ~RSLEEP;
 
-		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
-		    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+		if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+			iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+			uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+		} else {
+			uiod.d_iov = buf;
+		}
+
+		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
 		uiod.d_mp = 0;
 		/*
 		 * Mark that a thread is in rwnext on the read side
@@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 			if ((bp = uiod.d_mp) != NULL) {
 				*errorp = 0;
 				ASSERT(MUTEX_HELD(&stp->sd_lock));
+				if (iovlen != 0)
+					kmem_free(uiod.d_iov, iovlen);
 				return (bp);
 			}
 			error = 0;
@@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 		} else {
 			*errorp = error;
 			ASSERT(MUTEX_HELD(&stp->sd_lock));
+			if (iovlen != 0)
+				kmem_free(uiod.d_iov, iovlen);
 			return (NULL);
 		}
+
+		if (iovlen != 0)
+			kmem_free(uiod.d_iov, iovlen);
+
 		/*
 		 * Try a getq in case a rwnext() generated mblk
 		 * has bubbled up via strrput().
@@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
     int b_flag, int pri, int flags)
 {
 	struiod_t uiod;
+	struct iovec buf[IOV_MAX_STACK];
+	int iovlen = 0;
 	mblk_t *mp;
 	queue_t *wqp = stp->sd_wrq;
 	int error = 0;
@@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
 	mp->b_flag |= b_flag;
 	mp->b_band = (uchar_t)pri;
 
-	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
-	    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+	if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+		iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+		uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP);
+	} else {
+		uiod.d_iov = buf;
+	}
+
+	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
 	uiod.d_uio.uio_offset = 0;
 	uiod.d_mp = mp;
 	error = rwnext(wqp, &uiod);
 	if (! uiod.d_mp) {
 		uioskip(uiop, *iosize);
+		if (iovlen != 0)
+			kmem_free(uiod.d_iov, iovlen);
 		return (error);
 	}
 	ASSERT(mp == uiod.d_mp);
@@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
 		error = 0;
 	} else {
 		freemsg(mp);
+		if (iovlen != 0)
+			kmem_free(uiod.d_iov, iovlen);
 		return (error);
 	}
 	/* Have to check canput before consuming data from the uio */
 	if (pri == 0) {
 		if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
 			freemsg(mp);
+			if (iovlen != 0)
+				kmem_free(uiod.d_iov, iovlen);
 			return (EWOULDBLOCK);
 		}
 	} else {
 		if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
 			freemsg(mp);
+			if (iovlen != 0)
+				kmem_free(uiod.d_iov, iovlen);
 			return (EWOULDBLOCK);
 		}
 	}
@@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
 	/* Copyin data from the uio */
 	if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
 		freemsg(mp);
+		if (iovlen != 0)
+			kmem_free(uiod.d_iov, iovlen);
 		return (error);
 	}
 	uioskip(uiop, *iosize);
@@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
 		putnext(wqp, mp);
 		stream_runservice(stp);
 	}
+	if (iovlen != 0)
+		kmem_free(uiod.d_iov, iovlen);
 	return (0);
 }
 
@@ -3178,6 +3215,7 @@ job_control_type(int cmd)
 	case JAGENT:	/* Obsolete */
 	case JTRUN:	/* Obsolete */
 	case JXTPROTO:	/* Obsolete */
+	case TIOCSETLD:
 		return (JCSETP);
 	}
 
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index 0d1bb6a8a1..aa44ccf788 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -1093,18 +1093,20 @@ char **syscallnames;
 
 systrace_sysent_t *systrace_sysent;
 void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
-    uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
 /*ARGSUSED*/
 void
 systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
-    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+    uintptr_t arg6, uintptr_t arg7)
 {}
 
 /*ARGSUSED*/
 int64_t
 dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
-    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+    uintptr_t arg7)
 {
 	systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];
 	dtrace_id_t id;
@@ -1112,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
 	proc_t *p;
 
 	if ((id = sy->stsy_entry) != DTRACE_IDNONE)
-		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5,
+		    arg6, arg7);
 
 	/*
 	 * We want to explicitly allow DTrace consumers to stop a process
@@ -1126,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
 	}
 	mutex_exit(&p->p_lock);
 
-	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5,
+	    arg6, arg7);
 
 	if (ttolwp(curthread)->lwp_errno != 0)
 		rval = -1;
 
 	if ((id = sy->stsy_return) != DTRACE_IDNONE)
 		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
-		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0);
+		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);
 
 	return (rval);
 }
@@ -1145,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32;
 /*ARGSUSED*/
 int64_t
 dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
-    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+    uintptr_t arg7)
 {
 	systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];
 	dtrace_id_t id;
@@ -1153,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
 	proc_t *p;
 
 	if ((id = sy->stsy_entry) != DTRACE_IDNONE)
-		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+		    arg7);
 
 	/*
 	 * We want to explicitly allow DTrace consumers to stop a process
@@ -1167,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
 	}
 	mutex_exit(&p->p_lock);
 
-	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+	    arg7);
 
 	if (ttolwp(curthread)->lwp_errno != 0)
 		rval = -1;
 
 	if ((id = sy->stsy_return) != DTRACE_IDNONE)
 		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
-		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0);
+		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);
 
 	return (rval);
 }
@@ -1202,5 +1209,5 @@ dtrace_systrace_rtt(void)
 	}
 
 	if ((id = sy->stsy_return) != DTRACE_IDNONE)
-		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0);
+		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);
 }
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index a554f8c3f3..0a6fe0ef96 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -1618,7 +1618,7 @@ vmem_destroy(vmem_t *vmp)
 
 	leaked = vmem_size(vmp, VMEM_ALLOC);
 	if (leaked != 0)
-		cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
+		cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",
 		    vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
 		    "identifiers" : "bytes");
 
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index c997f8fd8d..e86fe138e3 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
  */
 
 /*
@@ -250,6 +250,8 @@
 #include <sys/cpucaps.h>
 #include <vm/seg.h>
 #include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
 
 /*
  * This constant specifies the number of seconds that threads waiting for
@@ -370,8 +372,12 @@ static char *zone_ref_subsys_names[] = {
 rctl_hndl_t rc_zone_cpu_shares;
 rctl_hndl_t rc_zone_locked_mem;
 rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
 rctl_hndl_t rc_zone_max_lofi;
 rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
 rctl_hndl_t rc_zone_nlwps;
 rctl_hndl_t rc_zone_nprocs;
 rctl_hndl_t rc_zone_shmmax;
@@ -417,8 +423,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
  * Version 5 alters the zone_boot system call, and converts its old
  *     bootargs parameter to be set by the zone_setattr API instead.
  * Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
  */
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
 
 /*
  * Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1377,6 +1384,114 @@ static rctl_ops_t zone_cpu_cap_ops = {
 
 /*ARGSUSED*/
 static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	zone_t *zone = e->rcep_p.zone;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+	if (zone == NULL)
+		return (0);
+
+	return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+	rcop_no_action,
+	zone_cpu_base_get,
+	zone_cpu_base_set,
+	rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time CPU(s) can be
+ * bursting for the zone.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	zone_t *zone = e->rcep_p.zone;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+	if (zone == NULL)
+		return (0);
+
+	return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+	rcop_no_action,
+	zone_cpu_burst_time_get,
+	zone_cpu_burst_time_set,
+	rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_zone->zone_zfs_io_pri);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	zone_t *zone = e->rcep_p.zone;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+	if (zone == NULL)
+		return (0);
+
+	/*
+	 * Set the zone's ZFS I/O priority to the new value.
+	 */
+	zone->zone_zfs_io_pri = nv;
+	return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+	rcop_no_action,
+	zone_zfs_io_pri_get,
+	zone_zfs_io_pri_set,
+	rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
 zone_lwps_usage(rctl_t *r, proc_t *p)
 {
 	rctl_qty_t nlwps;
@@ -1486,6 +1601,14 @@ static rctl_ops_t zone_procs_ops = {
 };
 
 /*ARGSUSED*/
+static rctl_qty_t
+zone_shmmax_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_zone->zone_shmmax);
+}
+
+/*ARGSUSED*/
 static int
 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
     rctl_qty_t incr, uint_t flags)
@@ -1501,12 +1624,20 @@ zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
 
 static rctl_ops_t zone_shmmax_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	zone_shmmax_usage,
 	rcop_no_set,
 	zone_shmmax_test
 };
 
 /*ARGSUSED*/
+static rctl_qty_t
+zone_shmmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_zone->zone_ipc.ipcq_shmmni);
+}
+
+/*ARGSUSED*/
 static int
 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
     rctl_qty_t incr, uint_t flags)
@@ -1522,12 +1653,20 @@ zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
 
 static rctl_ops_t zone_shmmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	zone_shmmni_usage,
 	rcop_no_set,
 	zone_shmmni_test
 };
 
 /*ARGSUSED*/
+static rctl_qty_t
+zone_semmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_zone->zone_ipc.ipcq_semmni);
+}
+
+/*ARGSUSED*/
 static int
 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
     rctl_qty_t incr, uint_t flags)
@@ -1543,12 +1682,20 @@ zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
 
 static rctl_ops_t zone_semmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	zone_semmni_usage,
 	rcop_no_set,
 	zone_semmni_test
 };
 
 /*ARGSUSED*/
+static rctl_qty_t
+zone_msgmni_usage(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (p->p_zone->zone_ipc.ipcq_msgmni);
+}
+
+/*ARGSUSED*/
 static int
 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
     rctl_qty_t incr, uint_t flags)
@@ -1564,7 +1711,7 @@ zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
 
 static rctl_ops_t zone_msgmni_ops = {
 	rcop_no_action,
-	rcop_no_usage,
+	zone_msgmni_usage,
 	rcop_no_set,
 	zone_msgmni_test
 };
@@ -1671,6 +1818,39 @@ static rctl_ops_t zone_max_swap_ops = {
 
 /*ARGSUSED*/
 static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+	rctl_qty_t q;
+	zone_t *z = p->p_zone;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	/* No additional lock because not enforced in the kernel */
+	q = z->zone_phys_mem;
+	return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_ZONE);
+	if (e->rcep_p.zone == NULL)
+		return (0);
+	e->rcep_p.zone->zone_phys_mem_ctl = nv;
+	return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+	rcop_no_action,
+	zone_phys_mem_usage,
+	zone_phys_mem_set,
+	rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
 {
 	rctl_qty_t q;
@@ -1764,6 +1944,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
 }
 
 static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+	zone_t *zone = ksp->ks_private;
+	zone_kstat_t *zk = ksp->ks_data;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+	zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+	return (0);
+}
+
+static int
 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
 {
 	zone_t *zone = ksp->ks_private;
@@ -1792,7 +1986,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
 }
 
 static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
     int (*updatefunc) (kstat_t *, int))
 {
 	kstat_t *ksp;
@@ -1817,6 +2011,160 @@ zone_kstat_create_common(zone_t *zone, char *name,
 	return (ksp);
 }
 
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+	zone_t *zone = ksp->ks_private;
+	zone_vfs_kstat_t *zvp = ksp->ks_data;
+	kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	/*
+	 * Extract the VFS statistics from the kstat_io_t structure used by
+	 * kstat_runq_enter() and related functions.  Since the slow ops
+	 * counters are updated directly by the VFS layer, there's no need to
+	 * copy those statistics here.
+	 *
+	 * Note that kstat_runq_enter() and the related functions use
+	 * gethrtime_unscaled(), so scale the time here.
+	 */
+	zvp->zv_nread.value.ui64 = kiop->nread;
+	zvp->zv_reads.value.ui64 = kiop->reads;
+	zvp->zv_rtime.value.ui64 = kiop->rtime;
+	zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+	zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+	zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+	zvp->zv_writes.value.ui64 = kiop->writes;
+	zvp->zv_wtime.value.ui64 = kiop->wtime;
+	zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+	zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+	scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+	scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+	scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+	scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+	return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+	kstat_t *ksp;
+	zone_vfs_kstat_t *zvp;
+
+	if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+	    zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+	    sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+		return (NULL);
+
+	if (zone->zone_id != GLOBAL_ZONEID)
+		kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+	zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+	ksp->ks_data_size += strlen(zone->zone_name) + 1;
+	ksp->ks_lock = &zone->zone_vfs_lock;
+	zone->zone_vfs_stats = zvp;
+
+	/* The kstat "name" field is not large enough for a full zonename */
+	kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+	kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+	kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+	kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+	ksp->ks_update = zone_vfs_kstat_update;
+	ksp->ks_private = zone;
+
+	kstat_install(ksp);
+	return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+	zone_t *zone = ksp->ks_private;
+	zone_zfs_kstat_t *zzp = ksp->ks_data;
+	kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	/*
+	 * Extract the ZFS statistics from the kstat_io_t structure used by
+	 * kstat_runq_enter() and related functions.  Since the I/O throttle
+	 * counters are updated directly by the ZFS layer, there's no need to
+	 * copy those statistics here.
+	 *
+	 * Note that kstat_runq_enter() and the related functions use
+	 * gethrtime_unscaled(), so scale the time here.
+	 */
+	zzp->zz_nread.value.ui64 = kiop->nread;
+	zzp->zz_reads.value.ui64 = kiop->reads;
+	zzp->zz_rtime.value.ui64 = kiop->rtime;
+	zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+	zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+	zzp->zz_writes.value.ui64 = kiop->writes;
+
+	scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+	scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+	return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+	kstat_t *ksp;
+	zone_zfs_kstat_t *zzp;
+
+	if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+	    zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+	    sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+		return (NULL);
+
+	if (zone->zone_id != GLOBAL_ZONEID)
+		kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+	zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+	ksp->ks_data_size += strlen(zone->zone_name) + 1;
+	ksp->ks_lock = &zone->zone_zfs_lock;
+	zone->zone_zfs_stats = zzp;
+
+	/* The kstat "name" field is not large enough for a full zonename */
+	kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+	kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+	kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+	kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+	ksp->ks_update = zone_zfs_kstat_update;
+	ksp->ks_private = zone;
+
+	kstat_install(ksp);
+	return (ksp);
+}
 
 static int
 zone_mcap_kstat_update(kstat_t *ksp, int rw)
@@ -1827,11 +2175,19 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
+	zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+	zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+	zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+	zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+	zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+	zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
 	zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
 	zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
 	zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
 	zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
 	zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+	zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+	zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
 
 	return (0);
 }
@@ -1859,12 +2215,22 @@ zone_mcap_kstat_create(zone_t *zone)
 	/* The kstat "name" field is not large enough for a full zonename */
 	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
 	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+	kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
 	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
 	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
 	kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
 	kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
 	kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
 	    KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+	    KSTAT_DATA_UINT64);
 
 	ksp->ks_update = zone_mcap_kstat_update;
 	ksp->ks_private = zone;
@@ -1960,13 +2326,25 @@ zone_misc_kstat_create(zone_t *zone)
 static void
 zone_kstat_create(zone_t *zone)
 {
-	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+	zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
 	    "lockedmem", zone_lockedmem_kstat_update);
-	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+	zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
 	    "swapresv", zone_swapresv_kstat_update);
-	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+	zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+	    "physicalmem", zone_physmem_kstat_update);
+	zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
 	    "nprocs", zone_nprocs_kstat_update);
 
+	if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+		zone->zone_vfs_stats = kmem_zalloc(
+		    sizeof (zone_vfs_kstat_t), KM_SLEEP);
+	}
+
+	if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+		zone->zone_zfs_stats = kmem_zalloc(
+		    sizeof (zone_zfs_kstat_t), KM_SLEEP);
+	}
+
 	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
 		zone->zone_mcap_stats = kmem_zalloc(
 		    sizeof (zone_mcap_kstat_t), KM_SLEEP);
@@ -1998,8 +2376,15 @@ zone_kstat_delete(zone_t *zone)
 	    sizeof (zone_kstat_t));
 	zone_kstat_delete_common(&zone->zone_swapresv_kstat,
 	    sizeof (zone_kstat_t));
+	zone_kstat_delete_common(&zone->zone_physmem_kstat,
+	    sizeof (zone_kstat_t));
 	zone_kstat_delete_common(&zone->zone_nprocs_kstat,
 	    sizeof (zone_kstat_t));
+
+	zone_kstat_delete_common(&zone->zone_vfs_ksp,
+	    sizeof (zone_vfs_kstat_t));
+	zone_kstat_delete_common(&zone->zone_zfs_ksp,
+	    sizeof (zone_zfs_kstat_t));
 	zone_kstat_delete_common(&zone->zone_mcap_ksp,
 	    sizeof (zone_mcap_kstat_t));
 	zone_kstat_delete_common(&zone->zone_misc_ksp,
@@ -2037,6 +2422,8 @@ zone_zsd_init(void)
 	zone0.zone_locked_mem_ctl = UINT64_MAX;
 	ASSERT(zone0.zone_max_swap == 0);
 	zone0.zone_max_swap_ctl = UINT64_MAX;
+	zone0.zone_phys_mem = 0;
+	zone0.zone_phys_mem_ctl = UINT64_MAX;
 	zone0.zone_max_lofi = 0;
 	zone0.zone_max_lofi_ctl = UINT64_MAX;
 	zone0.zone_shmmax = 0;
@@ -2060,8 +2447,9 @@ zone_zsd_init(void)
 	zone0.zone_initname = initname;
 	zone0.zone_lockedmem_kstat = NULL;
 	zone0.zone_swapresv_kstat = NULL;
+	zone0.zone_physmem_kstat = NULL;
 	zone0.zone_nprocs_kstat = NULL;
-
+	zone0.zone_zfs_io_pri = 1;
 	zone0.zone_stime = 0;
 	zone0.zone_utime = 0;
 	zone0.zone_wtime = 0;
@@ -2172,6 +2560,21 @@ zone_init(void)
 	    RCTL_GLOBAL_INFINITE,
 	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
 
+	rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+	    MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+	rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+	    INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+	rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+	    16384, 16384, &zone_zfs_io_pri_ops);
+
 	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 	    INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2213,6 +2616,20 @@ zone_init(void)
 	rde = rctl_dict_lookup("zone.cpu-shares");
 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
 
+	/*
+	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
+	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'.
+	 */
+	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+	bzero(dval, sizeof (rctl_val_t));
+	dval->rcv_value = 1;
+	dval->rcv_privilege = RCPRIV_PRIVILEGED;
+	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+	dval->rcv_action_recip_pid = -1;
+
+	rde = rctl_dict_lookup("zone.zfs-io-priority");
+	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
 	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2223,6 +2640,11 @@ zone_init(void)
 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
 	    &zone_max_swap_ops);
 
+	rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+	    &zone_phys_mem_ops);
+
 	rc_zone_max_lofi = rctl_register("zone.max-lofi",
 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2244,6 +2666,8 @@ zone_init(void)
 	zone0.zone_ntasks = 1;
 	mutex_exit(&p0.p_lock);
 	zone0.zone_restart_init = B_TRUE;
+	zone0.zone_reboot_on_init_exit = B_FALSE;
+	zone0.zone_init_status = -1;
 	zone0.zone_brand = &native_brand;
 	rctl_prealloc_destroy(gp);
 	/*
@@ -2323,6 +2747,8 @@ zone_init(void)
 static void
 zone_free(zone_t *zone)
 {
+	zone_dl_t *zdl;
+
 	ASSERT(zone != global_zone);
 	ASSERT(zone->zone_ntasks == 0);
 	ASSERT(zone->zone_nlwps == 0);
@@ -2351,6 +2777,19 @@ zone_free(zone_t *zone)
 	list_destroy(&zone->zone_ref_list);
 	zone_free_zsd(zone);
 	zone_free_datasets(zone);
+
+	/*
+	 * While dlmgmtd should have removed all of these, it could have left
+	 * something behind or crashed, in which case it's not safe for us to
+	 * assume that the list is empty, as list_destroy() will ASSERT. We
+	 * clean up for our userland comrades which may have crashed, or worse,
+	 * been disabled by SMF.
+	 */
+	while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+		if (zdl->zdl_net != NULL)
+			nvlist_free(zdl->zdl_net);
+		kmem_free(zdl, sizeof (zone_dl_t));
+	}
 	list_destroy(&zone->zone_dl_list);
 
 	if (zone->zone_rootvp != NULL)
@@ -2395,12 +2834,18 @@ zone_free(zone_t *zone)
 static void
 zone_status_set(zone_t *zone, zone_status_t status)
 {
+	timestruc_t now;
+	uint64_t t;
 
 	nvlist_t *nvl = NULL;
 	ASSERT(MUTEX_HELD(&zone_status_lock));
 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
 	    status >= zone_status_get(zone));
 
+	/* Time since Jan 1 1970, in nanoseconds as consumers expect */
+	gethrestime(&now);
+	t = (now.tv_sec * NANOSEC) + now.tv_nsec;
+
 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
@@ -2408,7 +2853,7 @@ zone_status_set(zone_t *zone, zone_status_t status)
 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
 	    zone_status_table[zone->zone_status]) ||
 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
-	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
 #ifdef DEBUG
@@ -2486,9 +2931,14 @@ zone_set_brand(zone_t *zone, const char *brand)
 		return (EINVAL);
 	}
 
-	/* set up the brand specific data */
+	/*
+	 * Set up the brand specific data.
+	 * Note that it's possible that the hook has to drop the
+	 * zone_status_lock and reacquire it before returning so we can't
+	 * assume the lock has been held the entire time.
+	 */
 	zone->zone_brand = bp;
-	ZBROP(zone)->b_init_brand_data(zone);
+	ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
 
 	mutex_exit(&zone_status_lock);
 	return (0);
@@ -2534,14 +2984,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
 	return (0);
 }
 
+/*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats.  Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to increment the kstats.  We always simply increment nover
+ * every time that setattr is called and we always add in the input value
+ * to zone_mcap_pagedout every time that is called.
+ */
+/*ARGSUSED*/
+static int
+zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
+{
+	zone->zone_mcap_nover++;
+
+	return (0);
+}
+
 static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
 {
-	uint64_t mcap;
-	int err = 0;
+	uint64_t pageout;
+	int err;
 
-	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
-		zone->zone_phys_mcap = mcap;
+	if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+		zone->zone_mcap_pagedout += pageout;
+
+	return (err);
+}
+
+/*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * to throttle page faults.  This is normally 0 but can be set to a non-0 value
+ * by the user-land memory capping code when the zone is over its physical
+ * memory cap.
+ */
+static int
+zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+{
+	uint32_t dusec;
+	int err;
+
+	if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+		zone->zone_pg_flt_delay = dusec;
+
+	return (err);
+}
+
+/*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+static int
+zone_set_rss(zone_t *zone, const uint64_t *prss)
+{
+	uint64_t rss;
+	int err;
+
+	if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+		zone->zone_phys_mem = rss;
 
 	return (err);
 }
@@ -2953,6 +3454,12 @@ getzoneid(void)
 	return (curproc->p_zone->zone_id);
 }
 
+zoneid_t
+getzonedid(void)
+{
+	return (curproc->p_zone->zone_did);
+}
+
 /*
  * Internal versions of zone_find_by_*().  These don't zone_hold() or
  * check the validity of a zone's state.
@@ -3696,6 +4203,17 @@ zone_start_init(void)
 	 */
 	z->zone_proc_initpid = p->p_pid;
 
+	if (z->zone_setup_app_contract == B_TRUE) {
+		/*
+		 * Normally a process cannot modify its own contract, but we're
+		 * just starting the zone's init process and its contract is
+		 * always initialized from the sys_process_tmpl template, so
+		 * this is the simplest way to setup init's contract to kill
+		 * the process if any other process in the contract exits.
+		 */
+		p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+	}
+
 	/*
 	 * We maintain zone_boot_err so that we can return the cause of the
 	 * failure back to the caller of the zone_boot syscall.
@@ -3724,9 +4242,54 @@ zone_start_init(void)
 			lwp_exit();
 		}
 	} else {
+		id_t cid = curthread->t_cid;
+
 		if (zone_status_get(z) == ZONE_IS_BOOTING)
 			zone_status_set(z, ZONE_IS_RUNNING);
 		mutex_exit(&zone_status_lock);
+
+		mutex_enter(&class_lock);
+		ASSERT(cid < loaded_classes);
+		if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+		    z->zone_fixed_hipri) {
+			/*
+			 * If the zone is using FX then by default all
+			 * processes start at the lowest priority and stay
+			 * there. We provide a mechanism for the zone to
+			 * indicate that it should run at "high priority". In
+			 * this case we set up init to run at the highest FX
+			 * priority (which is one level higher than the
+			 * non-fixed scheduling classes can use).
+			 */
+			pcparms_t pcparms;
+
+			pcparms.pc_cid = cid;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+			    FXMAXUPRI;
+			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+			    FX_DOUPRILIM | FX_DOUPRI;
+
+			mutex_enter(&pidlock);
+			mutex_enter(&curproc->p_lock);
+
+			(void) parmsset(&pcparms, curthread);
+
+			mutex_exit(&curproc->p_lock);
+			mutex_exit(&pidlock);
+		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+			/*
+			 * zsched always starts the init lwp at priority
+			 * minclsyspri - 1. This priority gets set in t_pri and
+			 * is invalid for RT, but RT never uses t_pri. However
+			 * t_pri is used by procfs, so we always see processes
+			 * within an RT zone with an invalid priority value.
+			 * We fix that up now.
+			 */
+			curthread->t_pri = RTGPPRIO0;
+		}
+		mutex_exit(&class_lock);
+
 		/* cause the process to return to userland. */
 		lwp_rtt();
 	}
@@ -3768,6 +4331,7 @@ zsched(void *arg)
 	PTOU(pp)->u_argc = 0;
 	PTOU(pp)->u_argv = NULL;
 	PTOU(pp)->u_envp = NULL;
+	PTOU(pp)->u_commpagep = NULL;
 	closeall(P_FINFO(pp));
 
 	/*
@@ -4210,8 +4774,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
 
 		error = EINVAL;
 		name = nvpair_name(nvp);
-		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
-		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+		if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+		    strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+		    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
 			goto out;
 		}
 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
@@ -4329,7 +4894,7 @@ zone_create(const char *zone_name, const char *zone_root,
     caddr_t rctlbuf, size_t rctlbufsz,
     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
     int match, uint32_t doi, const bslabel_t *label,
-    int flags)
+    int flags, zoneid_t zone_did)
 {
 	struct zsched_arg zarg;
 	nvlist_t *rctls = NULL;
@@ -4352,6 +4917,7 @@ zone_create(const char *zone_name, const char *zone_root,
 
 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
 	zoneid = zone->zone_id = id_alloc(zoneid_space);
+	zone->zone_did = zone_did;
 	zone->zone_status = ZONE_IS_UNINITIALIZED;
 	zone->zone_pool = pool_default;
 	zone->zone_pool_mod = gethrtime();
@@ -4359,6 +4925,8 @@ zone_create(const char *zone_name, const char *zone_root,
 	zone->zone_ncpus = 0;
 	zone->zone_ncpus_online = 0;
 	zone->zone_restart_init = B_TRUE;
+	zone->zone_reboot_on_init_exit = B_FALSE;
+	zone->zone_init_status = -1;
 	zone->zone_brand = &native_brand;
 	zone->zone_initname = NULL;
 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -4420,10 +4988,14 @@ zone_create(const char *zone_name, const char *zone_root,
 	zone->zone_locked_mem_ctl = UINT64_MAX;
 	zone->zone_max_swap = 0;
 	zone->zone_max_swap_ctl = UINT64_MAX;
+	zone->zone_phys_mem = 0;
+	zone->zone_phys_mem_ctl = UINT64_MAX;
 	zone->zone_max_lofi = 0;
 	zone->zone_max_lofi_ctl = UINT64_MAX;
-	zone0.zone_lockedmem_kstat = NULL;
-	zone0.zone_swapresv_kstat = NULL;
+	zone->zone_lockedmem_kstat = NULL;
+	zone->zone_swapresv_kstat = NULL;
+	zone->zone_physmem_kstat = NULL;
+	zone->zone_zfs_io_pri = 1;
 
 	/*
 	 * Zsched initializes the rctls.
@@ -4578,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root,
 	/*
 	 * The process, task, and project rctls are probably wrong;
 	 * we need an interface to get the default values of all rctls,
-	 * and initialize zsched appropriately.  I'm not sure that that
-	 * makes much of a difference, though.
+	 * and initialize zsched appropriately. However, we allow zoneadmd
+	 * to pass down both zone and project rctls for the zone's init.
 	 */
 	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
 	if (error != 0) {
@@ -4718,6 +5290,7 @@ zone_boot(zoneid_t zoneid)
 static int
 zone_empty(zone_t *zone)
 {
+	int cnt = 0;
 	int waitstatus;
 
 	/*
@@ -4728,7 +5301,16 @@ zone_empty(zone_t *zone)
 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 	while ((waitstatus = zone_status_timedwait_sig(zone,
 	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
-		killall(zone->zone_id);
+		boolean_t force = B_FALSE;
+
+		/* Every 30 seconds, try harder */
+		if (cnt++ >= 30) {
+			cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+			    zone->zone_id);
+			force = B_TRUE;
+			cnt = 0;
+		}
+		killall(zone->zone_id, force);
 	}
 	/*
 	 * return EINTR if we were signaled
@@ -5479,14 +6061,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 				error = EFAULT;
 		}
 		break;
-	case ZONE_ATTR_PHYS_MCAP:
-		size = sizeof (zone->zone_phys_mcap);
-		if (bufsize > size)
-			bufsize = size;
-		if (buf != NULL &&
-		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
-			error = EFAULT;
-		break;
 	case ZONE_ATTR_SCHED_CLASS:
 		mutex_enter(&class_lock);
 
@@ -5541,6 +6115,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		}
 		kmem_free(zbuf, bufsize);
 		break;
+	case ZONE_ATTR_DID:
+		size = sizeof (zoneid_t);
+		if (bufsize > size)
+			bufsize = size;
+
+		if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+			error = EFAULT;
+		break;
+	case ZONE_ATTR_SCHED_FIXEDHI:
+		size = sizeof (boolean_t);
+		if (bufsize > size)
+			bufsize = size;
+
+		if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+		    bufsize) != 0)
+			error = EFAULT;
+		break;
 	default:
 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
 			size = bufsize;
@@ -5572,10 +6163,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		return (set_errno(EPERM));
 
 	/*
-	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
-	 * global zone.
+	 * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
+	 * attributes can be set on the global zone.
 	 */
-	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+	if (zoneid == GLOBAL_ZONEID &&
+	    attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
 		return (set_errno(EINVAL));
 	}
 
@@ -5592,7 +6184,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	 * non-global zones.
 	 */
 	zone_status = zone_status_get(zone);
-	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+	if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
+	    attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
+	    zone_status > ZONE_IS_READY) {
 		err = EINVAL;
 		goto done;
 	}
@@ -5614,8 +6208,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 	case ZONE_ATTR_FS_ALLOWED:
 		err = zone_set_fs_allowed(zone, (const char *)buf);
 		break;
-	case ZONE_ATTR_PHYS_MCAP:
-		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+	case ZONE_ATTR_PMCAP_NOVER:
+		err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
+		break;
+	case ZONE_ATTR_PMCAP_PAGEOUT:
+		err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+		break;
+	case ZONE_ATTR_PG_FLT_DELAY:
+		err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+		break;
+	case ZONE_ATTR_RSS:
+		err = zone_set_rss(zone, (const uint64_t *)buf);
 		break;
 	case ZONE_ATTR_SCHED_CLASS:
 		err = zone_set_sched_class(zone, (const char *)buf);
@@ -5644,6 +6247,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		err = zone_set_network(zoneid, zbuf);
 		kmem_free(zbuf, bufsize);
 		break;
+	case ZONE_ATTR_APP_SVC_CT:
+		if (bufsize != sizeof (boolean_t)) {
+			err = EINVAL;
+		} else {
+			zone->zone_setup_app_contract = (boolean_t)buf;
+			err = 0;
+		}
+		break;
+	case ZONE_ATTR_SCHED_FIXEDHI:
+		if (bufsize != sizeof (boolean_t)) {
+			err = EINVAL;
+		} else {
+			zone->zone_fixed_hipri = (boolean_t)buf;
+			err = 0;
+		}
+		break;
 	default:
 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -6336,6 +6955,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
 			zs.doi = zs32.doi;
 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
 			zs.flags = zs32.flags;
+			zs.zoneid = zs32.zoneid;
 #else
 			panic("get_udatamodel() returned bogus result\n");
 #endif
@@ -6346,7 +6966,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
 		    zs.extended_error, zs.match, zs.doi,
-		    zs.label, zs.flags));
+		    zs.label, zs.flags, zs.zoneid));
 	case ZONE_BOOT:
 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
 	case ZONE_DESTROY:
@@ -6447,6 +7067,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)
 	bcopy(zone->zone_name, zone_name, zone_namelen);
 	zoneid = zone->zone_id;
 	uniqid = zone->zone_uniqid;
+	arg.status = zone->zone_init_status;
 	/*
 	 * zoneadmd may be down, but at least we can empty out the zone.
 	 * We can ignore the return value of zone_empty() since we're called
@@ -6624,7 +7245,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
 	 * later.
 	 */
-	killall(zone->zone_id);
+	killall(zone->zone_id, B_FALSE);
 	/*
 	 * Now, create the thread to contact zoneadmd and do the rest of the
 	 * work.  This thread can't be created in our zone otherwise
@@ -6687,16 +7308,15 @@ zone_shutdown_global(void)
 }
 
 /*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
  * The 'write' parameter is set to 1 if the dataset is also writable.
  */
 int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
 {
 	static int zfstype = -1;
 	zone_dataset_t *zd;
 	size_t len;
-	zone_t *zone = curproc->p_zone;
 	const char *name = NULL;
 	vfs_t *vfsp = NULL;
 
@@ -6764,7 +7384,8 @@ zone_dataset_visible(const char *dataset, int *write)
 	vfs_list_read_lock();
 	vfsp = zone->zone_vfslist;
 	do {
-		ASSERT(vfsp);
+		if (vfsp == NULL)
+			break;
 		if (vfsp->vfs_fstype == zfstype) {
 			name = refstr_value(vfsp->vfs_resource);
 
@@ -6801,6 +7422,18 @@ zone_dataset_visible(const char *dataset, int *write)
 }
 
 /*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+	zone_t *zone = curproc->p_zone;
+
+	return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
+/*
  * zone_find_by_any_path() -
  *
  * kernel-private routine similar to zone_find_by_path(), but which
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/refhash/refhash.c
index 8f96c2d9f1..e2de00597e 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c
+++ b/usr/src/uts/common/refhash/refhash.c
@@ -10,16 +10,18 @@
  */
 
 /*
- * Copyright 2014 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
+#include <sys/refhash.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/kmem.h>
 #include <sys/list.h>
 #include <sys/ddi.h>
 
+#define	RHL_F_DEAD	0x01
+
 #ifdef lint
 extern refhash_link_t *obj_to_link(refhash_t *, void *);
 extern void *link_to_obj(refhash_t *, refhash_link_t *);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index d5dd20bff9..052a28a5e2 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -23,6 +23,7 @@
 # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright 2014, Joyent, Inc. All rights reserved.
 # Copyright 2013 Garrett D'Amore <garrett@damore.org>
+# Copyright 2015, Joyent, Inc. All rights reserved.
 # Copyright 2013 Saso Kiselkov. All rights reserved.
 # Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com>
 # Copyright 2016 Nexenta Systems, Inc.
@@ -250,6 +251,7 @@ CHKHDRS=			\
 	flock.h			\
 	flock_impl.h		\
 	fork.h			\
+	frameio.h		\
 	fss.h			\
 	fsspriocntl.h		\
 	fsid.h			\
@@ -275,6 +277,7 @@ CHKHDRS=			\
 	idmap.h 		\
 	ieeefp.h		\
 	id_space.h		\
+	inotify.h		\
 	instance.h		\
 	int_const.h		\
 	int_fmtio.h		\
@@ -343,6 +346,7 @@ CHKHDRS=			\
 	lgrp.h			\
 	lgrp_user.h		\
 	libc_kernel.h		\
+	limits.h		\
 	link.h			\
 	list.h			\
 	list_impl.h		\
@@ -424,6 +428,9 @@ CHKHDRS=			\
 	ontrap.h		\
 	open.h			\
 	openpromio.h		\
+	overlay.h		\
+	overlay_common.h	\
+	overlay_target.h	\
 	panic.h			\
 	param.h			\
 	pathconf.h		\
@@ -646,6 +653,8 @@ CHKHDRS=			\
 	vmem.h			\
 	vmem_impl.h		\
 	vmsystm.h		\
+	vnd.h			\
+	vnd_errno.h		\
 	vnic.h			\
 	vnic_impl.h		\
 	vnode.h			\
@@ -657,12 +666,14 @@ CHKHDRS=			\
 	vuid_queue.h		\
 	vuid_state.h		\
 	vuid_store.h		\
+	vxlan.h			\
 	wait.h			\
 	waitq.h			\
 	wanboot_impl.h		\
 	watchpoint.h		\
 	winlockio.h		\
 	zcons.h			\
+	zfd.h			\
 	zone.h			\
 	xti_inet.h		\
 	xti_osi.h		\
@@ -856,13 +867,14 @@ FSHDRS=				\
 	autofs.h		\
 	decomp.h		\
 	dv_node.h		\
-	sdev_impl.h		\
 	fifonode.h		\
 	hsfs_isospec.h		\
 	hsfs_node.h		\
 	hsfs_rrip.h		\
 	hsfs_spec.h		\
 	hsfs_susp.h		\
+	hyprlofs.h		\
+	hyprlofs_info.h		\
 	lofs_info.h		\
 	lofs_node.h		\
 	mntdata.h		\
@@ -872,6 +884,8 @@ FSHDRS=				\
 	pc_label.h		\
 	pc_node.h		\
 	pxfs_ki.h		\
+	sdev_impl.h		\
+	sdev_plugin.h		\
 	snode.h			\
 	swapnode.h		\
 	tmp.h			\
@@ -996,6 +1010,7 @@ SATAGENHDRS=		\
 
 SYSEVENTHDRS=		\
 	ap_driver.h     \
+	datalink.h	\
 	dev.h		\
 	domain.h        \
 	dr.h            \
diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h
index 547c9cc241..a4c0409304 100644
--- a/usr/src/uts/common/sys/aggr_impl.h
+++ b/usr/src/uts/common/sys/aggr_impl.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 OmniTI Computer Consulting, Inc  All rights reserved.
  */
 
 #ifndef	_SYS_AGGR_IMPL_H
@@ -308,6 +309,8 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *);
 extern void aggr_port_init_callbacks(aggr_port_t *);
 
 extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *,
+    boolean_t);
 
 extern void aggr_tx_ring_update(void *, uintptr_t);
 extern void aggr_tx_notify_thread(void *);
diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h
index 3a2e705850..48b94e2951 100644
--- a/usr/src/uts/common/sys/auxv.h
+++ b/usr/src/uts/common/sys/auxv.h
@@ -29,7 +29,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012, Joyent, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #ifndef	_SYS_AUXV_H
@@ -78,6 +78,9 @@ typedef struct {
 #define	AT_FLAGS	8	/* processor flags */
 #define	AT_ENTRY	9	/* a.out entry point */
 
+/* First introduced on Linux */
+#define	AT_RANDOM	25	/* address of 16 random bytes */
+
 /*
  * These relate to the original PPC ABI document; Linux reused
  * the values for other things (see below), so disambiguation of
@@ -90,19 +93,18 @@ typedef struct {
  * These are the values from LSB 1.3, the first five are also described
  * in the draft amd64 ABI.
  *
- * At the time of writing, Solaris doesn't place any of these values into
- * the aux vector, except AT_CLKTCK which is placed on the aux vector for
- * lx branded processes; also, we do similar things via AT_SUN_ values.
+ * At the time of writing, illumos doesn't place any of these values into the
+ * aux vector, except where noted. We do similar things via AT_SUN_ values.
  *
  * AT_NOTELF		10	program is not ELF?
- * AT_UID		11	real user id
- * AT_EUID		12	effective user id
- * AT_GID		13	real group id
- * AT_EGID		14	effective group id
+ * AT_UID		11	real user id (provided in LX)
+ * AT_EUID		12	effective user id (provided in LX)
+ * AT_GID		13	real group id (provided in LX)
+ * AT_EGID		14	effective group id (provided in LX)
  *
  * AT_PLATFORM		15
  * AT_HWCAP		16
- * AT_CLKTCK		17	c.f. _SC_CLK_TCK
+ * AT_CLKTCK		17	c.f. _SC_CLK_TCK (provided in LX)
  * AT_FPUCW		18
  *
  * AT_DCACHEBSIZE	19	(moved from 10)
@@ -110,6 +112,16 @@ typedef struct {
  * AT_UCACHEBSIZE	21	(moved from 12)
  *
  * AT_IGNOREPPC		22
+ *
+ * On Linux:
+ * AT_* values 18 through 22 are reserved
+ * AT_SECURE		23	secure mode boolean (provided in LX)
+ * AT_BASE_PLATFORM	24	string identifying real platform, may
+ *				differ from AT_PLATFORM.
+ * AT_HWCAP2		26	extension of AT_HWCAP
+ * AT_EXECFN		31	filename of program
+ * AT_SYSINFO		32
+ * AT_SYSINFO_EHDR	33	The vDSO location
  */
 
 /*
@@ -186,6 +198,13 @@ extern uint_t getisax(uint32_t *, uint_t);
 #define	AT_SUN_BRAND_AUX1	2020
 #define	AT_SUN_BRAND_AUX2	2021
 #define	AT_SUN_BRAND_AUX3	2022
+#define	AT_SUN_BRAND_AUX4	2025
+#define	AT_SUN_BRAND_NROOT	2024
+
+/*
+ * Aux vector for comm page
+ */
+#define	AT_SUN_COMMPAGE		2026
 
 /*
  * Note that 2023 is reserved for the AT_SUN_HWCAP2 word defined above.
diff --git a/usr/src/uts/common/sys/auxv_386.h b/usr/src/uts/common/sys/auxv_386.h
index ec4c8b0f19..a3256a464f 100644
--- a/usr/src/uts/common/sys/auxv_386.h
+++ b/usr/src/uts/common/sys/auxv_386.h
@@ -89,10 +89,12 @@ extern "C" {
 #define	AV_386_2_BMI2		0x00008 /* BMI2 insns */
 #define	AV_386_2_FMA		0x00010	/* FMA insns */
 #define	AV_386_2_AVX2		0x00020	/* AVX2 insns */
+#define	AV_386_2_ADX		0x00040	/* ADX insns */
+#define	AV_386_2_RDSEED		0x00080	/* RDSEED insn */
 
 #define	FMT_AV_386_2							\
 	"\020"								\
-	"\06avx2\05fma\04bmi2\03bmi1\02rdrand\01f16c"
+	"\10rdseed\07adx\06avx2\05fma\04bmi2\03bmi1\02rdrand\01f16c"
 
 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h
index badc3faff8..d88d5683a7 100644
--- a/usr/src/uts/common/sys/brand.h
+++ b/usr/src/uts/common/sys/brand.h
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #ifndef _SYS_BRAND_H
@@ -102,29 +103,101 @@ struct brand_mach_ops;
 struct intpdata;
 struct execa;
 
+/*
+ * Common structure to define hooks for brand operation.
+ *
+ * Required Fields:
+ * b_init_brand_data - Setup zone brand data during zone_setbrand
+ * b_free_brand_data - Free zone brand data during zone_destroy
+ * b_brandsys - Syscall handler for brandsys
+ * b_setbrand - Initialize process brand data
+ * b_getattr - Get brand-custom zone attribute
+ * b_setattr - Set brand-custom zone attribute
+ * b_copy_procdata - Copy process brand data during fork
+ * b_proc_exit - Perform process brand exit processing
+ * b_exec - Reset branded process state on exec
+ * b_lwp_setrval - Set return code for forked child
+ * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock)
+ * b_forklwp - Copy lwp brand data during fork
+ * b_freelwp - Free lwp brand data
+ * b_lwpexit - Perform lwp-specific brand exit processing
+ * b_elfexec - Load and execute ELF binary
+ * b_sigset_native_to_brand - Convert sigset native->brand
+ * b_sigset_brand_to_native - Convert sigset brand->native
+ * b_nsig - Maximum signal number
+ * b_sendsig - Update process state after sendsig
+ *
+ * Optional Fields:
+ * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp
+ * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur
+ *                  during lwp creation before b_initlwp could be called.
+ * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock)
+ * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior
+ * b_psig_to_proc - Custom additional behavior during psig
+ * b_wait_filter - Filter processes from being matched by waitid
+ * b_native_exec - Provide interpreter path prefix for executables
+ * b_ptrace_exectrap - Custom behavior for legacy ptrace traps
+ * b_map32limit - Specify alternate limit for MAP_32BIT mappings
+ * b_stop_notify - Hook process stop events
+ * b_waitid_helper - Generate synthetic results for waitid
+ * b_sigcld_repost - Post synthetic SIGCLD signals
+ * b_issig_stop - Alter/suppress signal delivery during issig
+ * b_sig_ignorable - Disallow discarding of signals
+ * b_savecontext - Alter context during savecontext
+ * b_restorecontext - Alter context during restorecontext
+ * b_sendsig_stack - Override stack used for signal delivery
+ * b_setid_clear - Override setid_clear behavior
+ * b_pagefault - Trap pagefault events
+ * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all)
+ */
 struct brand_ops {
-	void	(*b_init_brand_data)(zone_t *);
+	void	(*b_init_brand_data)(zone_t *, kmutex_t *);
 	void	(*b_free_brand_data)(zone_t *);
 	int	(*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
-		uintptr_t, uintptr_t, uintptr_t);
+		uintptr_t, uintptr_t);
 	void	(*b_setbrand)(struct proc *);
 	int	(*b_getattr)(zone_t *, int, void *, size_t *);
 	int	(*b_setattr)(zone_t *, int, void *, size_t);
 	void	(*b_copy_procdata)(struct proc *, struct proc *);
-	void	(*b_proc_exit)(struct proc *, klwp_t *);
+	void	(*b_proc_exit)(struct proc *);
 	void	(*b_exec)();
 	void	(*b_lwp_setrval)(klwp_t *, int, int);
-	int	(*b_initlwp)(klwp_t *);
+	void	*(*b_lwpdata_alloc)(struct proc *);
+	void	(*b_lwpdata_free)(void *);
+	void	(*b_initlwp)(klwp_t *, void *);
+	void	(*b_initlwp_post)(klwp_t *);
 	void	(*b_forklwp)(klwp_t *, klwp_t *);
 	void	(*b_freelwp)(klwp_t *);
 	void	(*b_lwpexit)(klwp_t *);
 	int	(*b_elfexec)(struct vnode *vp, struct execa *uap,
 	    struct uarg *args, struct intpdata *idata, int level,
 	    long *execsz, int setid, caddr_t exec_file,
-	    struct cred *cred, int brand_action);
+	    struct cred *cred, int *brand_action);
 	void	(*b_sigset_native_to_brand)(sigset_t *);
 	void	(*b_sigset_brand_to_native)(sigset_t *);
+	void	(*b_sigfd_translate)(k_siginfo_t *);
 	int	b_nsig;
+	void	(*b_exit_with_sig)(proc_t *, sigqueue_t *);
+	boolean_t (*b_wait_filter)(proc_t *, proc_t *);
+	boolean_t (*b_native_exec)(uint8_t, const char **);
+	uint32_t (*b_map32limit)(proc_t *);
+	void	(*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t);
+	int	(*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int,
+	    boolean_t *, int *);
+	int	(*b_sigcld_repost)(proc_t *, sigqueue_t *);
+	int	(*b_issig_stop)(proc_t *, klwp_t *);
+	boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int);
+	void	(*b_savecontext)(ucontext_t *);
+#if defined(_SYSCALL32_IMPL)
+	void	(*b_savecontext32)(ucontext32_t *);
+#endif
+	void	(*b_restorecontext)(ucontext_t *);
+	caddr_t	(*b_sendsig_stack)(int);
+	void	(*b_sendsig)(int);
+	int	(*b_setid_clear)(vattr_t *vap, cred_t *cr);
+	int	(*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type,
+	    enum seg_rw);
+	boolean_t b_intp_parse_arg;
 };
 
 /*
@@ -135,6 +208,7 @@ typedef struct brand {
 	char    		*b_name;
 	struct brand_ops	*b_ops;
 	struct brand_mach_ops	*b_machops;
+	size_t			b_data_size;
 } brand_t;
 
 extern brand_t native_brand;
@@ -165,7 +239,7 @@ extern brand_t	*brand_register_zone(struct brand_attr *);
 extern brand_t	*brand_find_name(char *);
 extern void	brand_unregister_zone(brand_t *);
 extern int	brand_zone_count(brand_t *);
-extern void	brand_setbrand(proc_t *);
+extern int	brand_setbrand(proc_t *, boolean_t);
 extern void	brand_clearbrand(proc_t *, boolean_t);
 
 /*
@@ -178,17 +252,16 @@ extern int	brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t,
 extern void	brand_solaris_copy_procdata(proc_t *, proc_t *,
 		    struct brand *);
 extern int	brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *,
-		    intpdata_t *, int, long *, int, caddr_t, cred_t *, int,
-		    struct brand *, char *, char *, char *, char *, char *);
+		    intpdata_t *, int, long *, int, caddr_t, cred_t *, int *,
+		    struct brand *, char *, char *, char *);
 extern void	brand_solaris_exec(struct brand *);
 extern int	brand_solaris_fini(char **, struct modlinkage *,
 		    struct brand *);
 extern void	brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *);
 extern void	brand_solaris_freelwp(klwp_t *, struct brand *);
-extern int	brand_solaris_initlwp(klwp_t *, struct brand *);
+extern void	brand_solaris_initlwp(klwp_t *, struct brand *);
 extern void	brand_solaris_lwpexit(klwp_t *, struct brand *);
-extern void	brand_solaris_proc_exit(struct proc *, klwp_t *,
-		    struct brand *);
+extern void	brand_solaris_proc_exit(struct proc *, struct brand *);
 extern void	brand_solaris_setbrand(proc_t *, struct brand *);
 
 #if defined(_SYSCALL32)
diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h
index a9191aed7c..cb8a6012fc 100644
--- a/usr/src/uts/common/sys/buf.h
+++ b/usr/src/uts/common/sys/buf.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -186,6 +187,7 @@ struct biostats {
 #define	B_STARTED	0x2000000	/* io:::start probe called for buf */
 #define	B_ABRWRITE	0x4000000	/* Application based recovery active */
 #define	B_PAGE_NOWAIT	0x8000000	/* Skip the page if it is locked */
+#define	B_INVALCURONLY	0x10000000	/* invalidate only for curproc */
 
 /*
  * There is some confusion over the meaning of B_FREE and B_INVAL and what
@@ -198,6 +200,12 @@ struct biostats {
  * between the sole use of these two flags.  In both cases, IO will be done
  * if the page is not yet committed to storage.
  *
+ * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
+ * intended to be used in conjunction with B_INVAL.  B_INVALCURONLY has no
+ * meaning on its own.  When both B_INVALCURONLY and B_INVAL are set, then
+ * the mapping for the page is only invalidated for the current process.
+ * In this case, the page is not destroyed unless this was the final mapping.
+ *
  * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
  * should be used.
  *
diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h
index 21cf94dcf9..2c70d7c9f1 100644
--- a/usr/src/uts/common/sys/contract/process.h
+++ b/usr/src/uts/common/sys/contract/process.h
@@ -21,13 +21,12 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_CONTRACT_PROCESS_H
 #define	_SYS_CONTRACT_PROCESS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/contract.h>
 #include <sys/time.h>
 
@@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t;
 #define	CT_PR_NOORPHAN	0x2	/* kill when contract is abandoned */
 #define	CT_PR_PGRPONLY	0x4	/* only kill process group on fatal errors */
 #define	CT_PR_REGENT	0x8	/* automatically detach inherited contracts */
-#define	CT_PR_ALLPARAM	0xf
+#define	CT_PR_KEEP_EXEC	0x10	/* preserve template across exec */
+#define	CT_PR_ALLPARAM	0x1f
 
 /*
  * ctr_ev_* flags
diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h
index 6063ff4380..6bc042108c 100644
--- a/usr/src/uts/common/sys/cpucaps.h
+++ b/usr/src/uts/common/sys/cpucaps.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_CPUCAPS_H
@@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *);
  */
 extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
 extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t);
 
 /*
  * Get current CPU usage for a project/zone.
  */
 extern rctl_qty_t cpucaps_project_get(kproject_t *);
 extern rctl_qty_t cpucaps_zone_get(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_base(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);
 
 /*
  * Scheduling class hooks into CPU caps framework.
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
index 95afd21827..2cd4ed644d 100644
--- a/usr/src/uts/common/sys/cpucaps_impl.h
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_CPUCAPS_IMPL_H
@@ -66,8 +67,12 @@ typedef struct cpucap {
 	waitq_t		cap_waitq;	/* waitq for capped threads	*/
 	kstat_t		*cap_kstat;	/* cpucaps specific kstat	*/
 	int64_t		cap_gen;	/* zone cap specific 		*/
+	hrtime_t	cap_chk_value;	/* effective CPU usage cap	*/
 	hrtime_t	cap_value;	/* scaled CPU usage cap		*/
 	hrtime_t	cap_usage;	/* current CPU usage		*/
+	hrtime_t	cap_base;	/* base CPU for burst		*/
+	u_longlong_t	cap_burst_limit; /* max secs (in tics) for a burst */
+	u_longlong_t	cap_bursting;	/* # of ticks currently bursting */
 	disp_lock_t	cap_usagelock;	/* protects cap_usage above	*/
 	/*
 	 * Per cap statistics.
@@ -75,6 +80,7 @@ typedef struct cpucap {
 	hrtime_t	cap_maxusage;	/* maximum cap usage		*/
 	u_longlong_t	cap_below;	/* # of ticks spend below the cap */
 	u_longlong_t	cap_above;	/* # of ticks spend above the cap */
+	u_longlong_t	cap_above_base;	/* # of ticks spent above the base */
 } cpucap_t;
 
 /*
diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h
index 5056f9a511..914f132dc0 100644
--- a/usr/src/uts/common/sys/cred.h
+++ b/usr/src/uts/common/sys/cred.h
@@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *);
 extern gid_t crgetrgid(const cred_t *);
 extern gid_t crgetsgid(const cred_t *);
 extern zoneid_t crgetzoneid(const cred_t *);
+extern zoneid_t crgetzonedid(const cred_t *);
 extern projid_t crgetprojid(const cred_t *);
 
 extern cred_t *crgetmapped(const cred_t *);
diff --git a/usr/src/uts/common/sys/ctf_api.h b/usr/src/uts/common/sys/ctf_api.h
index 04d73c3181..bc99f67d3f 100644
--- a/usr/src/uts/common/sys/ctf_api.h
+++ b/usr/src/uts/common/sys/ctf_api.h
@@ -24,7 +24,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 /*
@@ -60,6 +60,65 @@ extern "C" {
 typedef struct ctf_file ctf_file_t;
 typedef long ctf_id_t;
 
+#define	ECTF_BASE	1000	/* base value for libctf errnos */
+
+enum {
+	ECTF_FMT = ECTF_BASE,	/* file is not in CTF or ELF format */
+	ECTF_ELFVERS,		/* ELF version is more recent than libctf */
+	ECTF_CTFVERS,		/* CTF version is more recent than libctf */
+	ECTF_ENDIAN,		/* data is different endian-ness than lib */
+	ECTF_SYMTAB,		/* symbol table uses invalid entry size */
+	ECTF_SYMBAD,		/* symbol table data buffer invalid */
+	ECTF_STRBAD,		/* string table data buffer invalid */
+	ECTF_CORRUPT,		/* file data corruption detected */
+	ECTF_NOCTFDATA,		/* ELF file does not contain CTF data */
+	ECTF_NOCTFBUF,		/* buffer does not contain CTF data */
+	ECTF_NOSYMTAB,		/* symbol table data is not available */
+	ECTF_NOPARENT,		/* parent CTF container is not available */
+	ECTF_DMODEL,		/* data model mismatch */
+	ECTF_MMAP,		/* failed to mmap a data section */
+	ECTF_ZMISSING,		/* decompression library not installed */
+	ECTF_ZINIT,		/* failed to initialize decompression library */
+	ECTF_ZALLOC,		/* failed to allocate decompression buffer */
+	ECTF_DECOMPRESS,	/* failed to decompress CTF data */
+	ECTF_STRTAB,		/* string table for this string is missing */
+	ECTF_BADNAME,		/* string offset is corrupt w.r.t. strtab */
+	ECTF_BADID,		/* invalid type ID number */
+	ECTF_NOTSOU,		/* type is not a struct or union */
+	ECTF_NOTENUM,		/* type is not an enum */
+	ECTF_NOTSUE,		/* type is not a struct, union, or enum */
+	ECTF_NOTINTFP,		/* type is not an integer or float */
+	ECTF_NOTARRAY,		/* type is not an array */
+	ECTF_NOTREF,		/* type does not reference another type */
+	ECTF_NAMELEN,		/* buffer is too small to hold type name */
+	ECTF_NOTYPE,		/* no type found corresponding to name */
+	ECTF_SYNTAX,		/* syntax error in type name */
+	ECTF_NOTFUNC,		/* symtab entry does not refer to a function */
+	ECTF_NOFUNCDAT,		/* no func info available for function */
+	ECTF_NOTDATA,		/* symtab entry does not refer to a data obj */
+	ECTF_NOTYPEDAT,		/* no type info available for object */
+	ECTF_NOLABEL,		/* no label found corresponding to name */
+	ECTF_NOLABELDATA,	/* file does not contain any labels */
+	ECTF_NOTSUP,		/* feature not supported */
+	ECTF_NOENUMNAM,		/* enum element name not found */
+	ECTF_NOMEMBNAM,		/* member name not found */
+	ECTF_RDONLY,		/* CTF container is read-only */
+	ECTF_DTFULL,		/* CTF type is full (no more members allowed) */
+	ECTF_FULL,		/* CTF container is full */
+	ECTF_DUPMEMBER,		/* duplicate member name definition */
+	ECTF_CONFLICT,		/* conflicting type definition present */
+	ECTF_REFERENCED,	/* type has outstanding references */
+	ECTF_NOTDYN,		/* type is not a dynamic type */
+	ECTF_ELF,		/* elf library failure */
+	ECTF_MCHILD,		/* cannot merge child container */
+	ECTF_LABELEXISTS,	/* label already exists */
+	ECTF_LCONFLICT,		/* merged labels conflict */
+	ECTF_ZLIB,		/* zlib library failure */
+	ECTF_CONVBKERR,		/* CTF conversion backend error */
+	ECTF_CONVNOCSRC,	/* No C source to convert from */
+	ECTF_NOCONVBKEND	/* No applicable conversion backend */
+};
+
 /*
  * If the debugger needs to provide the CTF library with a set of raw buffers
  * for use as the CTF data, symbol table, and string table, it can do so by
@@ -143,19 +202,24 @@ typedef struct ctf_lblinfo {
 typedef int ctf_visit_f(const char *, ctf_id_t, ulong_t, int, void *);
 typedef int ctf_member_f(const char *, ctf_id_t, ulong_t, void *);
 typedef int ctf_enum_f(const char *, int, void *);
-typedef int ctf_type_f(ctf_id_t, void *);
+typedef int ctf_type_f(ctf_id_t, boolean_t, void *);
 typedef int ctf_label_f(const char *, const ctf_lblinfo_t *, void *);
+typedef int ctf_function_f(const char *, ulong_t, ctf_funcinfo_t *, void *);
+typedef int ctf_object_f(const char *, ctf_id_t, ulong_t, void *);
+typedef int ctf_string_f(const char *, void *);
 
 extern ctf_file_t *ctf_bufopen(const ctf_sect_t *, const ctf_sect_t *,
     const ctf_sect_t *, int *);
 extern ctf_file_t *ctf_fdopen(int, int *);
 extern ctf_file_t *ctf_open(const char *, int *);
 extern ctf_file_t *ctf_create(int *);
+extern ctf_file_t *ctf_fdcreate(int, int *);
 extern ctf_file_t *ctf_dup(ctf_file_t *);
 extern void ctf_close(ctf_file_t *);
 
 extern ctf_file_t *ctf_parent_file(ctf_file_t *);
 extern const char *ctf_parent_name(ctf_file_t *);
+extern const char *ctf_parent_label(ctf_file_t *);
 
 extern int ctf_import(ctf_file_t *, ctf_file_t *);
 extern int ctf_setmodel(ctf_file_t *, int);
@@ -165,15 +229,20 @@ extern void ctf_setspecific(ctf_file_t *, void *);
 extern void *ctf_getspecific(ctf_file_t *);
 
 extern int ctf_errno(ctf_file_t *);
+extern uint_t ctf_flags(ctf_file_t *);
 extern const char *ctf_errmsg(int);
 extern int ctf_version(int);
 
 extern int ctf_func_info(ctf_file_t *, ulong_t, ctf_funcinfo_t *);
+extern int ctf_func_info_by_id(ctf_file_t *, ctf_id_t, ctf_funcinfo_t *);
 extern int ctf_func_args(ctf_file_t *, ulong_t, uint_t, ctf_id_t *);
+extern int ctf_func_args_by_id(ctf_file_t *, ctf_id_t, uint_t, ctf_id_t *);
 
 extern ctf_id_t ctf_lookup_by_name(ctf_file_t *, const char *);
 extern ctf_id_t ctf_lookup_by_symbol(ctf_file_t *, ulong_t);
 
+extern char *ctf_symbol_name(ctf_file_t *, ulong_t, char *, size_t);
+
 extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t);
 extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t);
 extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t);
@@ -182,6 +251,7 @@ extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t,
 extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t);
 extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t);
 extern int ctf_type_kind(ctf_file_t *, ctf_id_t);
+extern const char *ctf_kind_name(ctf_file_t *, int);
 extern ctf_id_t ctf_type_reference(ctf_file_t *, ctf_id_t);
 extern ctf_id_t ctf_type_pointer(ctf_file_t *, ctf_id_t);
 extern int ctf_type_encoding(ctf_file_t *, ctf_id_t, ctf_encoding_t *);
@@ -201,37 +271,50 @@ extern int ctf_label_info(ctf_file_t *, const char *, ctf_lblinfo_t *);
 
 extern int ctf_member_iter(ctf_file_t *, ctf_id_t, ctf_member_f *, void *);
 extern int ctf_enum_iter(ctf_file_t *, ctf_id_t, ctf_enum_f *, void *);
-extern int ctf_type_iter(ctf_file_t *, ctf_type_f *, void *);
+extern int ctf_type_iter(ctf_file_t *, boolean_t, ctf_type_f *, void *);
 extern int ctf_label_iter(ctf_file_t *, ctf_label_f *, void *);
+extern int ctf_function_iter(ctf_file_t *, ctf_function_f *, void *);
+extern int ctf_object_iter(ctf_file_t *, ctf_object_f *, void *);
+extern int ctf_string_iter(ctf_file_t *, ctf_string_f *, void *);
 
 extern ctf_id_t ctf_add_array(ctf_file_t *, uint_t, const ctf_arinfo_t *);
-extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, const char *, ctf_id_t);
 extern ctf_id_t ctf_add_enum(ctf_file_t *, uint_t, const char *);
 extern ctf_id_t ctf_add_float(ctf_file_t *, uint_t,
     const char *, const ctf_encoding_t *);
 extern ctf_id_t ctf_add_forward(ctf_file_t *, uint_t, const char *, uint_t);
-extern ctf_id_t ctf_add_function(ctf_file_t *, uint_t,
-    const ctf_funcinfo_t *, const ctf_id_t *);
+extern ctf_id_t ctf_add_funcptr(ctf_file_t *, uint_t, const ctf_funcinfo_t *,
+    const ctf_id_t *);
 extern ctf_id_t ctf_add_integer(ctf_file_t *, uint_t,
     const char *, const ctf_encoding_t *);
-extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, const char *, ctf_id_t);
 extern ctf_id_t ctf_add_type(ctf_file_t *, ctf_file_t *, ctf_id_t);
 extern ctf_id_t ctf_add_typedef(ctf_file_t *, uint_t, const char *, ctf_id_t);
-extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, const char *, ctf_id_t);
 extern ctf_id_t ctf_add_struct(ctf_file_t *, uint_t, const char *);
 extern ctf_id_t ctf_add_union(ctf_file_t *, uint_t, const char *);
-extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, ctf_id_t);
+extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, const char *, ctf_id_t);
 
 extern int ctf_add_enumerator(ctf_file_t *, ctf_id_t, const char *, int);
-extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t);
+extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t,
+    ulong_t);
+
+
+extern int ctf_add_function(ctf_file_t *, ulong_t, const ctf_funcinfo_t *,
+    const ctf_id_t *);
+extern int ctf_add_object(ctf_file_t *, ulong_t, ctf_id_t);
+extern int ctf_add_label(ctf_file_t *, const char *, ctf_id_t, uint_t);
 
 extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *);
+extern int ctf_set_root(ctf_file_t *, ctf_id_t, const boolean_t);
+extern int ctf_set_size(ctf_file_t *, ctf_id_t, const ulong_t);
 
 extern int ctf_delete_type(ctf_file_t *, ctf_id_t);
 
 extern int ctf_update(ctf_file_t *);
 extern int ctf_discard(ctf_file_t *);
 extern int ctf_write(ctf_file_t *, int);
+extern void ctf_dataptr(ctf_file_t *, const void **, size_t *);
 
 #ifdef _KERNEL
 
diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h
index f5c990e7c0..2178ad1f0d 100644
--- a/usr/src/uts/common/sys/dktp/dadk.h
+++ b/usr/src/uts/common/sys/dktp/dadk.h
@@ -65,6 +65,8 @@ struct	dadk {
 	kstat_t		*dad_errstats;	/* error stats			*/
 	kmutex_t	dad_cmd_mutex;
 	int		dad_cmd_count;
+	uint32_t	dad_err_cnt;	/* number of recent errors	*/
+	hrtime_t	dad_last_log;	/* time of last error log	*/
 };
 
 #define	DAD_SECSIZ	dad_phyg.g_secsiz
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index fb2a0749d3..4cd93be56e 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_DLD_H
@@ -191,6 +192,7 @@ typedef struct dld_ioc_rename {
 	datalink_id_t	dir_linkid1;
 	datalink_id_t	dir_linkid2;
 	char		dir_link[MAXLINKNAMELEN];
+	boolean_t	dir_zoneinit;
 } dld_ioc_rename_t;
 
 /*
@@ -203,6 +205,7 @@ typedef struct dld_ioc_rename {
 typedef struct dld_ioc_zid {
 	zoneid_t	diz_zid;
 	datalink_id_t	diz_linkid;
+	boolean_t	diz_transient;
 } dld_ioc_zid_t;
 
 /*
@@ -350,6 +353,7 @@ typedef struct dld_hwgrpinfo {
  */
 typedef	int	(*dld_capab_func_t)(void *, uint_t, void *, uint_t);
 
+#define	DI_DIRECT_RAW	0x1
 /*
  * Direct Tx/Rx capability.
  */
@@ -374,6 +378,9 @@ typedef struct dld_capab_direct_s {
 	/* flow control "can I put on a ring" callback */
 	uintptr_t	di_tx_fctl_df; /* canput-like callback */
 	void		*di_tx_fctl_dh;
+
+	/* flags that control our behavior */
+	uint_t		di_flags;
 } dld_capab_direct_t;
 
 /*
diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h
index a76a927e59..81708aad38 100644
--- a/usr/src/uts/common/sys/dld_impl.h
+++ b/usr/src/uts/common/sys/dld_impl.h
@@ -53,7 +53,8 @@ typedef enum {
 typedef enum {
 	DLD_UNINITIALIZED,
 	DLD_PASSIVE,
-	DLD_ACTIVE
+	DLD_ACTIVE,
+	DLD_EXCLUSIVE
 } dld_passivestate_t;
 
 /*
@@ -256,6 +257,8 @@ extern void		dld_str_rx_unitdata(void *, mac_resource_handle_t,
 extern void		dld_str_notify_ind(dld_str_t *);
 extern mac_tx_cookie_t	str_mdata_fastpath_put(dld_str_t *, mblk_t *,
     uintptr_t, uint16_t);
+extern mac_tx_cookie_t	str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *,
+    uintptr_t, uint16_t);
 extern int		dld_flow_ctl_callb(dld_str_t *, uint64_t,
     int (*func)(), void *);
 
diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h
index 2f519a8eda..093a4dc0c3 100644
--- a/usr/src/uts/common/sys/dld_ioc.h
+++ b/usr/src/uts/common/sys/dld_ioc.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_DLD_IOC_H
@@ -59,6 +60,7 @@ extern "C" {
 #define	IPTUN_IOC	0x454A
 #define	BRIDGE_IOC	0xB81D
 #define	IBPART_IOC	0x6171
+#define	OVERLAY_IOC	0x2005
 
 /* GLDv3 modules use these macros to generate unique ioctl commands */
 #define	DLDIOC(cmdid)		DLD_IOC_CMD(DLD_IOC, (cmdid))
@@ -68,6 +70,7 @@ extern "C" {
 #define	IPTUNIOC(cmdid)		DLD_IOC_CMD(IPTUN_IOC, (cmdid))
 #define	BRIDGEIOC(cmdid)	DLD_IOC_CMD(BRIDGE_IOC, (cmdid))
 #define	IBPARTIOC(cmdid)	DLD_IOC_CMD(IBPART_IOC, (cmdid))
+#define	OVERLAYIOC(cmdid)	DLD_IOC_CMD(OVERLAY_IOC, (cmdid))
 
 #ifdef _KERNEL
 
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 5bc2bd41c5..34f1c17236 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -107,6 +107,7 @@ typedef struct dl_ipnetinfo {
 #define	DL_PASSIVE_REQ		0x114	/* Allow access to aggregated link */
 #define	DL_INTR_MODE_REQ	0x115	/* Request Rx processing in INTR mode */
 #define	DL_NOTIFY_CONF		0x116	/* Notification from upstream */
+#define	DL_EXCLUSIVE_REQ	0x117	/* Make bind active */
 
 /*
  * Primitives used for Connectionless Service
@@ -388,6 +389,8 @@ typedef struct dl_ipnetinfo {
 #define	DL_PROMISC_PHYS		0x01	/* promiscuous mode at phys level */
 #define	DL_PROMISC_SAP		0x02	/* promiscuous mode at sap level */
 #define	DL_PROMISC_MULTI	0x03	/* promiscuous mode for multicast */
+#define	DL_PROMISC_RX_ONLY	0x04	/* above only enabled for rx */
+#define	DL_PROMISC_FIXUPS	0x05	/* above will be fixed up */
 
 /*
  * DLPI notification codes for DL_NOTIFY_REQ primitives.
@@ -1107,6 +1110,13 @@ typedef struct {
 } dl_intr_mode_req_t;
 
 /*
+ * DL_EXCLUSIVE_REQ, M_PROTO type
+ */
+typedef struct {
+	t_uscalar_t	dl_primitive;
+} dl_exclusive_req_t;
+
+/*
  *	CONNECTION-ORIENTED SERVICE PRIMITIVES
  */
 
@@ -1528,6 +1538,7 @@ union DL_primitives {
 	dl_control_ack_t	control_ack;
 	dl_passive_req_t	passive_req;
 	dl_intr_mode_req_t	intr_mode_req;
+	dl_exclusive_req_t	exclusive_req;
 };
 
 #define	DL_INFO_REQ_SIZE	sizeof (dl_info_req_t)
@@ -1596,6 +1607,7 @@ union DL_primitives {
 #define	DL_CONTROL_ACK_SIZE	sizeof (dl_control_ack_t)
 #define	DL_PASSIVE_REQ_SIZE	sizeof (dl_passive_req_t)
 #define	DL_INTR_MODE_REQ_SIZE	sizeof (dl_intr_mode_req_t)
+#define	DL_EXCLUSIVE_REQ_SIZE	sizeof (dl_exclusive_req_t)
 
 #ifdef	_KERNEL
 /*
diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h
index 6bd2bbe35a..81f9e2abac 100644
--- a/usr/src/uts/common/sys/dls.h
+++ b/usr/src/uts/common/sys/dls.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_DLS_H
@@ -85,6 +86,8 @@ typedef struct dls_link_s	dls_link_t;
 #define	DLS_PROMISC_SAP		0x00000001
 #define	DLS_PROMISC_MULTI	0x00000002
 #define	DLS_PROMISC_PHYS	0x00000004
+#define	DLS_PROMISC_RX_ONLY	0x00000008
+#define	DLS_PROMISC_FIXUPS	0x00000010
 
 extern int	dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *);
 extern void	dls_close(dld_str_t *);
@@ -106,11 +109,13 @@ extern void	str_notify(void *, mac_notify_type_t);
 
 extern int		dls_devnet_open(const char *,
 			    dls_dl_handle_t *, dev_t *);
+extern int		dls_devnet_open_in_zone(const char *,
+			    dls_dl_handle_t *, dev_t *, zoneid_t);
 extern void		dls_devnet_close(dls_dl_handle_t);
 extern boolean_t	dls_devnet_rebuild();
 
 extern int		dls_devnet_rename(datalink_id_t, datalink_id_t,
-			    const char *);
+			    const char *, boolean_t);
 extern int		dls_devnet_create(mac_handle_t, datalink_id_t,
 			    zoneid_t);
 extern int		dls_devnet_destroy(mac_handle_t, datalink_id_t *,
@@ -122,12 +127,13 @@ extern int		dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *);
 extern void		dls_devnet_rele(dls_dl_handle_t);
 extern void		dls_devnet_prop_task_wait(dls_dl_handle_t);
 
+extern const char	*dls_devnet_link(dls_dl_handle_t);
 extern const char	*dls_devnet_mac(dls_dl_handle_t);
 extern uint16_t		dls_devnet_vid(dls_dl_handle_t);
 extern datalink_id_t	dls_devnet_linkid(dls_dl_handle_t);
 extern int		dls_devnet_dev2linkid(dev_t, datalink_id_t *);
 extern int		dls_devnet_phydev(datalink_id_t, dev_t *);
-extern int		dls_devnet_setzid(dls_dl_handle_t, zoneid_t);
+extern int		dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t);
 extern zoneid_t		dls_devnet_getzid(dls_dl_handle_t);
 extern zoneid_t		dls_devnet_getownerzid(dls_dl_handle_t);
 extern boolean_t	dls_devnet_islinkvisible(datalink_id_t, zoneid_t);
@@ -141,6 +147,8 @@ extern int		dls_mgmt_update(const char *, uint32_t, boolean_t,
 extern int		dls_mgmt_get_linkinfo(datalink_id_t, char *,
 			    datalink_class_t *, uint32_t *, uint32_t *);
 extern int		dls_mgmt_get_linkid(const char *, datalink_id_t *);
+extern int		dls_mgmt_get_linkid_in_zone(const char *,
+    datalink_id_t *, zoneid_t);
 extern datalink_id_t	dls_mgmt_get_next(datalink_id_t, datalink_class_t,
 			    datalink_media_t, uint32_t);
 extern int		dls_devnet_macname2linkid(const char *,
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 60f51c47b5..329f8dd08e 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_DLS_IMPL_H
@@ -46,11 +47,12 @@ typedef struct dls_multicst_addr_s {
 } dls_multicst_addr_t;
 
 struct dls_link_s {				/* Protected by */
-	char			dl_name[MAXNAMELEN];	/* SL */
+	char			dl_name[MAXNAMELEN];	/* RO */
 	uint_t			dl_ddi_instance;	/* SL */
 	mac_handle_t		dl_mh;			/* SL */
 	mac_client_handle_t	dl_mch;			/* SL */
 	mac_unicast_handle_t	dl_mah;			/* SL */
+	mac_notify_handle_t	dl_mnh;			/* SL */
 	const mac_info_t	*dl_mip;		/* SL */
 	uint_t			dl_ref;			/* SL */
 	mod_hash_t		*dl_str_hash;		/* SL, modhash lock */
@@ -61,6 +63,7 @@ struct dls_link_s {				/* Protected by */
 	uint_t			dl_zone_ref;
 	link_tagmode_t		dl_tagmode;		/* atomic */
 	uint_t			dl_nonip_cnt;		/* SL */
+	uint_t			dl_exclusive;		/* SL */
 };
 
 typedef struct dls_head_s {
@@ -96,13 +99,16 @@ extern void		dls_create_str_kstats(dld_str_t *);
 extern int		dls_stat_update(kstat_t *, dls_link_t *, int);
 extern int		dls_stat_create(const char *, int, const char *,
 			    zoneid_t, int (*)(struct kstat *, int), void *,
-			    kstat_t **);
+			    kstat_t **, zoneid_t);
+extern void	dls_stat_delete(kstat_t *);
 
 extern int		dls_devnet_open_by_dev(dev_t, dls_link_t **,
 			    dls_dl_handle_t *);
 extern int		dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *,
 			    dls_link_t **);
 extern void		dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *);
+extern int		dls_devnet_hold_tmp_by_link(dls_link_t *,
+			    dls_dl_handle_t *);
 
 extern void		dls_init(void);
 extern int		dls_fini(void);
@@ -126,6 +132,7 @@ extern void		dls_mgmt_init(void);
 extern void		dls_mgmt_fini(void);
 
 extern int		dls_mgmt_get_phydev(datalink_id_t, dev_t *);
+extern int		dls_exclusive_set(dld_str_t *, boolean_t);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h
index b4032c24d6..214e225ac9 100644
--- a/usr/src/uts/common/sys/dls_mgmt.h
+++ b/usr/src/uts/common/sys/dls_mgmt.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef	_DLS_MGMT_H
@@ -46,13 +47,15 @@ typedef enum {
 	DATALINK_CLASS_SIMNET		= 0x20,
 	DATALINK_CLASS_BRIDGE		= 0x40,
 	DATALINK_CLASS_IPTUN		= 0x80,
-	DATALINK_CLASS_PART		= 0x100
+	DATALINK_CLASS_PART		= 0x100,
+	DATALINK_CLASS_OVERLAY		= 0x200
 } datalink_class_t;
 
 #define	DATALINK_CLASS_ALL	(DATALINK_CLASS_PHYS |	\
 	DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \
 	DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \
-	DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART)
+	DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \
+	DATALINK_CLASS_OVERLAY)
 
 /*
  * A combination of flags and media.
@@ -165,6 +168,7 @@ typedef struct dlmgmt_door_getname {
 typedef struct dlmgmt_door_getlinkid {
 	int			ld_cmd;
 	char			ld_link[MAXLINKNAMELEN];
+	zoneid_t		ld_zoneid;
 } dlmgmt_door_getlinkid_t;
 
 typedef struct dlmgmt_door_getnext_s {
@@ -225,6 +229,7 @@ typedef struct dlmgmt_getattr_retval_s {
 	char			lr_attrval[MAXLINKATTRVALLEN];
 } dlmgmt_getattr_retval_t;
 
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h
index dd1eecc70d..b88d215336 100644
--- a/usr/src/uts/common/sys/elf.h
+++ b/usr/src/uts/common/sys/elf.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright 2012 DEY Storage Systems, Inc.  All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
@@ -348,6 +348,11 @@ typedef struct {
 #define	PT_GNU_STACK	0x6474e551	/* Indicates stack executability */
 #define	PT_GNU_RELRO	0x6474e552	/* Read-only after relocation */
 
+/*
+ * Linux specific program headers not even used by Linux (!!)
+ */
+#define	PT_PAX_FLAGS	0x65041580	/* PaX flags (see below) */
+
 #define	PT_LOSUNW	0x6ffffffa
 #define	PT_SUNWBSS	0x6ffffffa	/* Sun Specific segment (unused) */
 #define	PT_SUNWSTACK	0x6ffffffb	/* describes the stack segment */
@@ -363,6 +368,45 @@ typedef struct {
 #define	PF_W		0x2
 #define	PF_X		0x1
 
+/*
+ * PaX is a regrettable series of never-integrated Linux patches for a
+ * facility to provide additional protections on memory pages for purposes of
+ * increasing security, and for allowing binaries to demand (or refuse) those
+ * protections via the PT_PAX_FLAGS program header.  (Portents of its
+ * rudderless existence, "PaX" is a term of indefinite origin written by an
+ * unknown group of people.)  This facility is unfortunate in any number of
+ * ways, and was largely obviated by the broad adoption of non-executable
+ * stacks at any rate -- but it lives on in binaries that continue to mark
+ * themselves to explicitly refuse the (never-integrated, now-obviated)
+ * facility.  One might cringe that PaX overloads the meaning of the p_flags
+ * to specify protections, but that is the least of its transgressions:
+ * instead of using one p_type constant to explicitly enable a series of
+ * protections and another to explicitly disable others, it insists on
+ * conflating both actions into PT_PAX_FLAGS.  The resulting doubling of
+ * constant definitions (two constant definitions for every protection instead
+ * of merely one) assures that the values can't even fit in the eight
+ * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for
+ * such filthy functionality.  And were all of this not enough, there is one
+ * final nomenclature insult to be added to this semantic injury:  the
+ * constants for the p_flags don't even embed "_PAX_" in their name -- despite
+ * the fact that this is their only purpose!  We resist the temptation to
+ * right this final wrong here; we grit our teeth and provide exactly the
+ * Linux definitions -- or rather, what would have been the Linux definitions
+ * had this belching jalopy ever been permitted to crash itself into mainline.
+ */
+#define	PF_PAGEEXEC	0x00000010	/* PaX: enable PAGEEXEC */
+#define	PF_NOPAGEEXEC	0x00000020	/* PaX: disable PAGEEXEC */
+#define	PF_SEGMEXEC	0x00000040	/* PaX: enable SEGMEXEC */
+#define	PF_NOSEGMEXEC	0x00000080	/* PaX: disable SEGMEXEC */
+#define	PF_MPROTECT	0x00000100	/* PaX: enable MPROTECT */
+#define	PF_NOMPROTECT	0x00000200	/* PaX: disable MPROTECT */
+#define	PF_RANDEXEC	0x00000400	/* PaX: enable RANDEXEC */
+#define	PF_NORANDEXEC	0x00000800	/* PaX: disable RANDEXEC */
+#define	PF_EMUTRAMP	0x00001000	/* PaX: enable EMUTRAMP */
+#define	PF_NOEMUTRAMP	0x00002000	/* PaX: disable EMUTRAMP */
+#define	PF_RANDMMAP	0x00004000	/* PaX: enable RANDMMAP */
+#define	PF_NORANDMMAP	0x00008000	/* PaX: disable RANDMMAP */
+
 #define	PF_MASKOS	0x0ff00000	/* OS specific values */
 #define	PF_MASKPROC	0xf0000000	/* processor specific values */
 
diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h
index b5e2c58be5..b2db3f2987 100644
--- a/usr/src/uts/common/sys/exec.h
+++ b/usr/src/uts/common/sys/exec.h
@@ -26,6 +26,10 @@
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
+/*
+ * Copyright 2016, Joyent, Inc.
+ */
+
 #ifndef _SYS_EXEC_H
 #define	_SYS_EXEC_H
 
@@ -102,10 +106,14 @@ typedef struct uarg {
 	vnode_t	*ex_vp;
 	char	*emulator;
 	char	*brandname;
+	const char *brand_nroot;
 	char	*auxp_auxflags; /* addr of auxflags auxv on the user stack */
 	char	*auxp_brand; /* address of first brand auxv on user stack */
 	cred_t	*pfcred;
 	boolean_t scrubenv;
+	uintptr_t maxstack;
+	boolean_t stk_prot_override;
+	uintptr_t commpage;
 } uarg_t;
 
 /*
@@ -175,7 +183,7 @@ struct execsw {
 	int	(*exec_func)(struct vnode *vp, struct execa *uap,
 		    struct uarg *args, struct intpdata *idata, int level,
 		    long *execsz, int setid, caddr_t exec_file,
-		    struct cred *cred, int brand_action);
+		    struct cred *cred, int *brand_action);
 	int	(*exec_core)(struct vnode *vp, struct proc *p,
 		    struct cred *cred, rlim64_t rlimit, int sig,
 		    core_content_t content);
@@ -213,7 +221,7 @@ extern int exec_common(const char *fname, const char **argp,
     const char **envp, int brand_action);
 extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args,
     struct intpdata *idata, int level, long *execsz, caddr_t exec_file,
-    struct cred *cred, int brand_action);
+    struct cred *cred, int *brand_action);
 extern struct execsw *allocate_execsw(char *name, char *magic,
     size_t magic_size);
 extern struct execsw *findexecsw(char *magic);
@@ -238,16 +246,22 @@ extern void exec_set_sp(size_t);
  * when compiling the 32-bit compatability elf code in the elfexec module.
  */
 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
-    long *, int, caddr_t, cred_t *, int);
+    long *, int, caddr_t, cred_t *, int *);
 extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *,
-    intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *);
+    intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *,
+    uintptr_t *, uintptr_t *);
+extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, int *, caddr_t *,
+    ssize_t *);
 #endif /* !_ELF32_COMPAT */
 
 #if defined(_LP64)
 extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
-    long *, int, caddr_t, cred_t *, int);
+    long *, int, caddr_t, cred_t *, int *);
 extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *,
-    intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *);
+    intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *,
+    uintptr_t *, uintptr_t *);
+extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, int *, caddr_t *,
+    ssize_t *);
 #endif  /* _LP64 */
 
 /*
diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h
new file mode 100644
index 0000000000..54e6dbeedf
--- /dev/null
+++ b/usr/src/uts/common/sys/frameio.h
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_FRAMEIO_H
+#define	_SYS_FRAMEIO_H
+
+/*
+ * Frame I/O definitions
+ */
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+/* Kernel only headers */
+#include <sys/stream.h>
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * An individual frame vector component. Collections of these are used to make
+ * ioctls.
+ */
+typedef struct framevec {
+	void	*fv_buf;	/* Buffer with data */
+	size_t	fv_buflen;	/* Size of the buffer */
+	size_t	fv_actlen;	/* Amount of buffer consumed, ignore on error */
+} framevec_t;
+
+/*
+ * The base unit used with frameio.
+ */
+typedef struct frameio {
+	uint_t	fio_version;	/* Should always be FRAMEIO_CURRENT_VERSION */
+	uint_t	fio_nvpf;	/* How many vectors make up one frame */
+	uint_t	fio_nvecs;	/* The total number of vectors */
+	framevec_t fio_vecs[];	/* C99 flexible array member */
+} frameio_t;
+
+
+#define	FRAMEIO_VERSION_ONE	1
+#define	FRAMEIO_CURRENT_VERSION	FRAMEIO_VERSION_ONE
+
+#define	FRAMEIO_NVECS_MAX	32
+
+/*
+ * Definitions for kernel modules to include as helpers. These are consolidation
+ * private.
+ */
+#ifdef _KERNEL
+
+/*
+ * 32-bit versions for 64-bit kernels
+ */
+typedef struct framevec32 {
+	caddr32_t fv_buf;
+	size32_t fv_buflen;
+	size32_t fv_actlen;
+} framevec32_t;
+
+typedef struct frameio32 {
+	uint_t fio_version;
+	uint_t fio_vecspframe;
+	uint_t fio_nvecs;
+	framevec32_t fio_vecs[];
+} frameio32_t;
+
+/*
+ * Describe the different ways that vectors should map to frames.
+ */
+typedef enum frameio_write_mblk_map {
+	MAP_BLK_FRAME
+} frameio_write_mblk_map_t;
+
+int frameio_init(void);
+void frameio_fini(void);
+frameio_t *frameio_alloc(int);
+void frameio_free(frameio_t *);
+int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t);
+int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int);
+int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *,
+    int *, int);
+int frameio_hdr_copyout(frameio_t *, int, void *, uint_t);
+size_t frameio_frame_length(frameio_t *, framevec_t *);
+void frameio_mark_consumed(frameio_t *, int);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FRAMEIO_H */
diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h
new file mode 100644
index 0000000000..b8c4149df2
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012, Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef	_SYS_FS_HYPRLOFS_H
+#define	_SYS_FS_HYPRLOFS_H
+
+#include <sys/param.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * hyprlofs ioctl numbers.
+ */
+#define	HYPRLOFS_IOC	('H' << 8)
+
+#define	HYPRLOFS_ADD_ENTRIES	(HYPRLOFS_IOC | 1)
+#define	HYPRLOFS_RM_ENTRIES	(HYPRLOFS_IOC | 2)
+#define	HYPRLOFS_RM_ALL		(HYPRLOFS_IOC | 3)
+#define	HYPRLOFS_GET_ENTRIES	(HYPRLOFS_IOC | 4)
+
+typedef struct {
+	char	*hle_path;
+	uint_t	hle_plen;
+	char	*hle_name;
+	uint_t	hle_nlen;
+} hyprlofs_entry_t;
+
+typedef struct {
+	hyprlofs_entry_t	*hle_entries;
+	uint_t			hle_len;
+} hyprlofs_entries_t;
+
+typedef struct {
+	char		hce_path[MAXPATHLEN];
+	char		hce_name[MAXPATHLEN];
+} hyprlofs_curr_entry_t;
+
+typedef struct {
+	hyprlofs_curr_entry_t	*hce_entries;
+	uint_t			hce_cnt;
+} hyprlofs_curr_entries_t;
+
+#ifdef _KERNEL
+typedef struct {
+	caddr32_t	hle_path;
+	uint_t		hle_plen;
+	caddr32_t	hle_name;
+	uint_t		hle_nlen;
+} hyprlofs_entry32_t;
+
+typedef struct {
+	caddr32_t	hle_entries;
+	uint_t		hle_len;
+} hyprlofs_entries32_t;
+
+typedef struct {
+	caddr32_t	hce_entries;
+	uint_t		hce_cnt;
+} hyprlofs_curr_entries32_t;
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_HYPRLOFS_H */
diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h
new file mode 100644
index 0000000000..38389f77d9
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef	_SYS_FS_HYPRLOFS_INFO_H
+#define	_SYS_FS_HYPRLOFS_INFO_H
+
+#include <sys/t_lock.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <sys/vfs_opreg.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * hlnode is the file system dependent node for hyprlofs.
+ * It is modeled on the tmpfs tmpnode.
+ *
+ *	hln_rwlock protects access of the directory list at hln_dir
+ *	as well as synchronizing read/writes to directory hlnodes.
+ *	hln_tlock protects updates to hln_mode and hln_nlink.
+ *	hln_tlock doesn't require any hlnode locks.
+ */
+typedef struct hlnode {
+	struct hlnode	*hln_back;		/* linked list of hlnodes */
+	struct hlnode	*hln_forw;		/* linked list of hlnodes */
+	union {
+		struct {
+			struct hldirent	*un_dirlist; /* dirent list */
+			uint_t	un_dirents;	/* number of dirents */
+		} un_dirstruct;
+		vnode_t	*un_realvp;		/* real vnode */
+	} un_hlnode;
+	vnode_t 	*hln_vnode;		/* vnode for this hlnode */
+	int 		hln_gen;		/* pseudo gen num for hlfid */
+	int 		hln_looped;		/* flag indicating loopback */
+	vattr_t		hln_attr;		/* attributes */
+	krwlock_t	hln_rwlock;		/* rw - serialize mods and */
+						/* directory updates */
+	kmutex_t	hln_tlock;		/* time, flag, and nlink lock */
+} hlnode_t;
+
+/*
+ * hyprlofs per-mount data structure.
+ * All fields are protected by hlm_contents.
+ */
+typedef struct {
+	vfs_t		*hlm_vfsp;	/* filesystem's vfs struct */
+	hlnode_t	*hlm_rootnode;	/* root hlnode */
+	char 		*hlm_mntpath;	/* name of hyprlofs mount point */
+	dev_t		hlm_dev;	/* unique dev # of mounted `device' */
+	uint_t		hlm_gen;	/* pseudo generation number for files */
+	kmutex_t	hlm_contents;	/* lock for hlfsmount structure */
+} hlfsmount_t;
+
+/*
+ * hyprlofs directories are made up of a linked list of hldirent structures
+ * hanging off directory hlnodes.  File names are not fixed length,
+ * but are null terminated.
+ */
+typedef struct hldirent {
+	hlnode_t	*hld_hlnode;		/* hlnode for this file */
+	struct hldirent	*hld_next;		/* next directory entry */
+	struct hldirent	*hld_prev;		/* prev directory entry */
+	uint_t		hld_offset;		/* "offset" of dir entry */
+	uint_t		hld_hash;		/* a hash of hld_name */
+	struct hldirent	*hld_link;		/* linked via the hash table */
+	hlnode_t	*hld_parent;		/* parent, dir we are in */
+	char		*hld_name;		/* must be null terminated */
+						/* max length is MAXNAMELEN */
+} hldirent_t;
+
+/*
+ * hlfid overlays the fid structure (for VFS_VGET)
+ */
+typedef struct {
+	uint16_t hlfid_len;
+	ino32_t	hlfid_ino;
+	int32_t	hlfid_gen;
+} hlfid_t;
+
+/*
+ * File system independent to hyprlofs conversion macros
+ */
+#define	VFSTOHLM(vfsp)		((hlfsmount_t *)(vfsp)->vfs_data)
+#define	VTOHLM(vp)		((hlfsmount_t *)(vp)->v_vfsp->vfs_data)
+#define	VTOHLN(vp)		((hlnode_t *)(vp)->v_data)
+#define	HLNTOV(tp)		((tp)->hln_vnode)
+#define	REALVP(vp)		((vnode_t *)VTOHLN(vp)->hln_realvp)
+#define	hlnode_hold(tp)		VN_HOLD(HLNTOV(tp))
+#define	hlnode_rele(tp)		VN_RELE(HLNTOV(tp))
+
+#define	hln_dir		un_hlnode.un_dirstruct.un_dirlist
+#define	hln_dirents	un_hlnode.un_dirstruct.un_dirents
+#define	hln_realvp	un_hlnode.un_realvp
+
+/*
+ * Attributes
+ */
+#define	hln_mask	hln_attr.va_mask
+#define	hln_type	hln_attr.va_type
+#define	hln_mode	hln_attr.va_mode
+#define	hln_uid		hln_attr.va_uid
+#define	hln_gid		hln_attr.va_gid
+#define	hln_fsid	hln_attr.va_fsid
+#define	hln_nodeid	hln_attr.va_nodeid
+#define	hln_nlink	hln_attr.va_nlink
+#define	hln_size	hln_attr.va_size
+#define	hln_atime	hln_attr.va_atime
+#define	hln_mtime	hln_attr.va_mtime
+#define	hln_ctime	hln_attr.va_ctime
+#define	hln_rdev	hln_attr.va_rdev
+#define	hln_blksize	hln_attr.va_blksize
+#define	hln_nblocks	hln_attr.va_nblocks
+#define	hln_seq		hln_attr.va_seq
+
+/*
+ * enums
+ */
+enum de_op	{ DE_CREATE, DE_MKDIR }; /* direnter ops */
+enum dr_op	{ DR_REMOVE, DR_RMDIR }; /* dirremove ops */
+
+/*
+ * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs
+ * leaves free for the rest of the system. The default value for
+ * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a
+ * different number of pages.  Since hyprlofs doesn't actually use much
+ * memory, it's unlikely this ever needs to be patched.
+ */
+#define		HYPRLOFSMINFREE (8 * 1024 * 1024) /* 8 Megabytes */
+
+extern size_t  hyprlofs_minfree;		/* Anonymous memory in pages */
+
+extern	void	hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *,
+		    cred_t *);
+extern	int	hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *);
+extern	int	hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op,
+		    cred_t *);
+extern	void	hyprlofs_dirinit(hlnode_t *, hlnode_t *);
+extern	void	hyprlofs_dirtrunc(hlnode_t *);
+extern	int	hyprlofs_taccess(void *, int, cred_t *);
+extern	int	hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op,
+		    vnode_t *, vattr_t *, hlnode_t **, cred_t *);
+
+extern struct vnodeops *hyprlofs_vnodeops;
+extern const struct fs_operation_def hyprlofs_vnodeops_template[];
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_HYPRLOFS_INFO_H */
diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h
index 561939fc20..e6fa895060 100644
--- a/usr/src/uts/common/sys/fs/sdev_impl.h
+++ b/usr/src/uts/common/sys/fs/sdev_impl.h
@@ -36,6 +36,7 @@ extern "C" {
 #include <sys/vfs_opreg.h>
 #include <sys/list.h>
 #include <sys/nvpair.h>
+#include <sys/fs/sdev_plugin.h>
 #include <sys/sunddi.h>
 
 /*
@@ -128,6 +129,21 @@ typedef struct sdev_local_data {
 	struct sdev_dprof sdev_lprof;	/* profile for multi-inst */
 } sdev_local_data_t;
 
+/* sdev_flags */
+typedef enum sdev_flags {
+	SDEV_BUILD =		0x0001,	/* directory cache out-of-date */
+	SDEV_GLOBAL =		0x0002,	/* global /dev nodes */
+	SDEV_PERSIST =		0x0004,	/* backing store persisted node */
+	SDEV_NO_NCACHE = 	0x0008,	/* do not include in neg. cache */
+	SDEV_DYNAMIC =		0x0010,	/* special-purpose vnode ops */
+					/* (ex: pts) */
+	SDEV_VTOR =		0x0020,	/* validate sdev_nodes during search */
+	SDEV_ATTR_INVALID =	0x0040,	/* invalid node attributes, */
+					/* need update */
+	SDEV_SUBDIR =		0x0080,	/* match all subdirs under here */
+	SDEV_ZONED =		0x0100	/* zoned subdir */
+} sdev_flags_t;
+
 /*
  * /dev filesystem sdev_node defines
  */
@@ -150,7 +166,7 @@ typedef struct sdev_node {
 	ino64_t		sdev_ino;	/* inode */
 	uint_t		sdev_nlink;	/* link count */
 	int		sdev_state;	/* state of this node */
-	int		sdev_flags;	/* flags bit */
+	sdev_flags_t	sdev_flags;	/* flags bit */
 
 	kmutex_t	sdev_lookup_lock; /* node creation synch lock */
 	kcondvar_t	sdev_lookup_cv;	/* node creation sync cv */
@@ -161,7 +177,7 @@ typedef struct sdev_node {
 		struct sdev_global_data	sdev_globaldata;
 		struct sdev_local_data	sdev_localdata;
 	} sdev_instance_data;
-
+	list_node_t	sdev_plist;	/* link on plugin list */
 	void		*sdev_private;
 } sdev_node_t;
 
@@ -192,29 +208,11 @@ typedef enum {
 	SDEV_READY
 } sdev_node_state_t;
 
-/* sdev_flags */
-#define	SDEV_BUILD		0x0001	/* directory cache out-of-date */
-#define	SDEV_GLOBAL		0x0002	/* global /dev nodes */
-#define	SDEV_PERSIST		0x0004	/* backing store persisted node */
-#define	SDEV_NO_NCACHE		0x0008	/* do not include in neg. cache */
-#define	SDEV_DYNAMIC		0x0010	/* special-purpose vnode ops */
-					/* (ex: pts) */
-#define	SDEV_VTOR		0x0020	/* validate sdev_nodes during search */
-#define	SDEV_ATTR_INVALID	0x0040	/* invalid node attributes, */
-					/* need update */
-#define	SDEV_SUBDIR		0x0080	/* match all subdirs under here */
-#define	SDEV_ZONED		0x0100  /* zoned subdir */
-
 /* sdev_lookup_flags */
 #define	SDEV_LOOKUP	0x0001	/* node creation in progress */
 #define	SDEV_READDIR	0x0002	/* VDIR readdir in progress */
 #define	SDEV_LGWAITING	0x0004	/* waiting for devfsadm completion */
 
-#define	SDEV_VTOR_INVALID	-1
-#define	SDEV_VTOR_SKIP		0
-#define	SDEV_VTOR_VALID		1
-#define	SDEV_VTOR_STALE		2
-
 /* convenient macros */
 #define	SDEV_IS_GLOBAL(dv)	\
 	(dv->sdev_flags & SDEV_GLOBAL)
@@ -366,8 +364,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *,
 extern int devname_profile_update(char *, size_t);
 extern struct sdev_data *sdev_find_mntinfo(char *);
 void sdev_mntinfo_rele(struct sdev_data *);
+typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *);
+void sdev_mnt_walk(sdev_mnt_walk_f, void *);
 extern struct vnodeops *devpts_getvnodeops(void);
 extern struct vnodeops *devvt_getvnodeops(void);
+extern void sdev_plugin_nodeready(struct sdev_node *);
+extern int sdev_plugin_init(void);
+extern int sdev_plugin_fini(void);
 
 /*
  * boot states - warning, the ordering here is significant
@@ -513,6 +516,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *);
 extern void sdev_modctl_dump_files(void);
 
 /*
+ * plugin and legacy vtab stuff
+ */
+/* directory dependent vop table */
+typedef struct sdev_vop_table {
+	char *vt_name;				/* subdirectory name */
+	const fs_operation_def_t *vt_service;	/* vnodeops table */
+	struct vnodeops **vt_global_vops;	/* global container for vop */
+	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
+	int vt_flags;
+} sdev_vop_table_t;
+
+extern struct sdev_vop_table vtab[];
+extern struct vnodeops *sdev_get_vop(struct sdev_node *);
+extern void sdev_set_no_negcache(struct sdev_node *);
+extern void *sdev_get_vtor(struct sdev_node *dv);
+
+/*
  * globals
  */
 extern kmutex_t sdev_lock;
@@ -525,6 +545,7 @@ extern struct vnodeops		*devipnet_vnodeops;
 extern struct vnodeops		*devvt_vnodeops;
 extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */
 extern struct vnodeops		*devzvol_vnodeops;
+extern int			sdev_vnodeops_tbl_size;
 
 extern const fs_operation_def_t	sdev_vnodeops_tbl[];
 extern const fs_operation_def_t	devpts_vnodeops_tbl[];
diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h
new file mode 100644
index 0000000000..8783df58e6
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/sdev_plugin.h
@@ -0,0 +1,106 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_SDEV_PLUGIN_H
+#define	_SYS_SDEV_PLUGIN_H
+
+/*
+ * Kernel sdev plugin interface
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef uintptr_t sdev_plugin_hdl_t;
+typedef uintptr_t sdev_ctx_t;
+
+/*
+ * Valid return values for sdev_plugin_validate_t.
+ */
+typedef enum sdev_plugin_validate {
+	SDEV_VTOR_INVALID = -1,
+	SDEV_VTOR_SKIP = 0,
+	SDEV_VTOR_VALID	= 1,
+	SDEV_VTOR_STALE	= 2
+} sdev_plugin_validate_t;
+
+/*
+ * Valid flags
+ */
+typedef enum sdev_plugin_flags {
+	SDEV_PLUGIN_NO_NCACHE = 0x1,
+	SDEV_PLUGIN_SUBDIR = 0x2
+} sdev_plugin_flags_t;
+
+#define	SDEV_PLUGIN_FLAGS_MASK	0x3
+
+/*
+ * Functions a module must implement
+ */
+typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t);
+typedef int (*sp_filldir_f)(sdev_ctx_t);
+typedef void (*sp_inactive_f)(sdev_ctx_t);
+
+#define	SDEV_PLUGIN_VERSION	1
+
+typedef struct sdev_plugin_ops {
+	int spo_version;
+	sdev_plugin_flags_t spo_flags;
+	sp_valid_f spo_validate;
+	sp_filldir_f spo_filldir;
+	sp_inactive_f spo_inactive;
+} sdev_plugin_ops_t;
+
+extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *,
+    int *);
+extern int sdev_plugin_unregister(sdev_plugin_hdl_t);
+
+typedef enum sdev_ctx_flags {
+	SDEV_CTX_GLOBAL = 0x2	/* node belongs to the GZ */
+} sdev_ctx_flags_t;
+
+/*
+ * Context helper functions
+ */
+extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t);
+extern const char *sdev_ctx_name(sdev_ctx_t);
+extern const char *sdev_ctx_path(sdev_ctx_t);
+extern enum vtype sdev_ctx_vtype(sdev_ctx_t);
+extern const void *sdev_ctx_vtype_data(sdev_ctx_t);
+
+/*
+ * Callbacks to manipulate nodes
+ */
+extern int sdev_plugin_mkdir(sdev_ctx_t, char *);
+extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t);
+
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SDEV_PLUGIN_H */
diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h
index 68dd67c61e..f8740e8873 100644
--- a/usr/src/uts/common/sys/fs/tmp.h
+++ b/usr/src/uts/common/sys/fs/tmp.h
@@ -22,12 +22,13 @@
  * Copyright 2007 Sun Microsystems, Inc.
  * All rights reserved.  Use is subject to license terms.
  */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
 
 #ifndef	_SYS_FS_TMP_H
 #define	_SYS_FS_TMP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -68,29 +69,28 @@ enum dr_op	{ DR_REMOVE, DR_RMDIR, DR_RENAME };	/* dirremove ops */
 
 /*
  * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs
- * leaves free for the rest of the system.  E.g. in a system with 32MB of
- * configured swap space, if 16MB were reserved (leaving 16MB free),
- * tmpfs could allocate up to 16MB - tmpfs_minfree.  The default value
- * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched
- * to a different number of pages.
- * NB: If tmpfs allocates too much swap space, other processes will be
- * unable to execute.
+ * leaves free for the rest of the system.  In antiquity, this number could be
+ * relevant on a system-wide basis, as physical DRAM was routinely exhausted;
+ * however, in more modern times, the relative growth of DRAM with respect to
+ * application footprint means that this number is only likely to become a
+ * factor in a virtualized OS environment (e.g., a zone) -- and even then only
+ * when DRAM and swap have both been capped low to allow for maximum tenancy.
+ * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should
+ * therefore be configured to a value that is roughly the smallest practical
+ * value for memory + swap minus the largest reasonable size for tmpfs in such
+ * a configuration.  As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow tmpfs to consume
+ * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB.  Care
+ * should be exercised in changing this:  tuning this value too high will
+ * result in spurious ENOSPC errors in tmpfs in small zones (a problem that
+ * can induce cascading failure surprisingly often); tuning this value too low
+ * will cause tmpfs consumption alone to induce application-level
+ * memory allocation failure.
  */
-#define	TMPMINFREE	2 * 1024 * 1024	/* 2 Megabytes */
+#define	TMPMINFREE	(16 * 1024 * 1024)	/* 16 Megabytes */
 
 extern size_t	tmpfs_minfree;		/* Anonymous memory in pages */
 
-/*
- * tmpfs can allocate only a certain percentage of kernel memory,
- * which is used for tmpnodes, directories, file names, etc.
- * This is statically set as TMPMAXFRACKMEM of physical memory.
- * The actual number of allocatable bytes can be patched in tmpfs_maxkmem.
- */
-#define	TMPMAXFRACKMEM	25	/* 1/25 of physical memory */
-
-extern size_t 	tmp_kmemspace;
-extern size_t	tmpfs_maxkmem;	/* Allocatable kernel memory in bytes */
-
 extern	void	tmpnode_init(struct tmount *, struct tmpnode *,
 	struct vattr *, struct cred *);
 extern	int	tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t);
@@ -101,13 +101,12 @@ extern	int	tdirdelete(struct tmpnode *, struct tmpnode *, char *,
 	enum dr_op, struct cred *);
 extern	void	tdirinit(struct tmpnode *, struct tmpnode *);
 extern	void	tdirtrunc(struct tmpnode *);
-extern	void	*tmp_memalloc(size_t, int);
-extern	void	tmp_memfree(void *, size_t);
 extern	int	tmp_resv(struct tmount *, struct tmpnode *, size_t, int);
 extern	int	tmp_taccess(void *, int, struct cred *);
 extern	int	tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *,
 	struct cred *);
 extern	int	tmp_convnum(char *, pgcnt_t *);
+extern	int	tmp_convmode(char *, mode_t *);
 extern	int	tdirenter(struct tmount *, struct tmpnode *, char *,
 	enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *,
 	struct tmpnode **, struct cred *, caller_context_t *);
diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h
index 2d4e1aa7fb..4a48af52a1 100644
--- a/usr/src/uts/common/sys/fx.h
+++ b/usr/src/uts/common/sys/fx.h
@@ -21,13 +21,12 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef _SYS_FX_H
 #define	_SYS_FX_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/thread.h>
 #include <sys/ddi.h>
@@ -145,7 +144,14 @@ typedef struct	fxkparms {
 	uint_t	fx_cflags;
 } fxkparms_t;
 
+/*
+ * control flags (kparms->fx_cflags).
+ */
+#define	FX_DOUPRILIM	0x01	/* change user priority limit */
+#define	FX_DOUPRI	0x02	/* change user priority */
+#define	FX_DOTQ		0x04	/* change FX time quantum */
 
+#define	FXMAXUPRI	60	/* maximum user priority setting */
 
 /*
  * Interface for partner private code. This is not a public interface.
diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h
new file mode 100644
index 0000000000..40ef4ce982
--- /dev/null
+++ b/usr/src/uts/common/sys/gsqueue.h
@@ -0,0 +1,65 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_GSQUEUE_H
+#define	_SYS_GSQUEUE_H
+
+/*
+ * Standard interfaces to serialization queues for everyone (except IP).
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct gsqueue gsqueue_t;
+typedef struct gsqueue_set gsqueue_set_t;
+
+typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t);
+typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *);
+
+extern gsqueue_set_t *gsqueue_set_create(uint_t, pri_t);
+extern void gsqueue_set_destroy(gsqueue_set_t *);
+extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t);
+
+extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *);
+extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t);
+
+#define	GSQUEUE_FILL	0x0001
+#define	GSQUEUE_NODRAIN	0x0002
+#define	GSQUEUE_PROCESS	0x0004
+
+extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *,
+    int, uint8_t);
+
+/*
+ * The default wait is inherited from IP. This determines the amount of time
+ * that must pass after queuing work, before we wake up the worker thread. This
+ * value is in milliseconds.
+ */
+#define	GSQUEUE_DEFAULT_WAIT	10
+#define	GSQUEUE_DEFAULT_PRIORITY	MAXCLSYSPRI
+
+#endif	/* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_GSQUEUE_H */
diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h
index d56fcceb5a..46d25f207f 100644
--- a/usr/src/uts/common/sys/id_space.h
+++ b/usr/src/uts/common/sys/id_space.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc.  All Rights reserved.
  */
 
 #ifndef	_ID_SPACE_H
@@ -34,8 +35,6 @@ extern "C" {
 #include <sys/mutex.h>
 #include <sys/vmem.h>
 
-#ifdef _KERNEL
-
 typedef vmem_t id_space_t;
 
 id_space_t *id_space_create(const char *, id_t, id_t);
@@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *);
 id_t id_alloc_specific_nosleep(id_space_t *, id_t);
 void id_free(id_space_t *, id_t);
 
-#endif /* _KERNEL */
-
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h
new file mode 100644
index 0000000000..8acc1a7280
--- /dev/null
+++ b/usr/src/uts/common/sys/inotify.h
@@ -0,0 +1,153 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * Header file to support the inotify facility.  Note that this facility
+ * is designed to be binary compatible with the Linux inotify facility; values
+ * for constants here should therefore exactly match those found in Linux, and
+ * this facility shouldn't be extended independently of Linux.
+ */
+
+#ifndef _SYS_INOTIFY_H
+#define	_SYS_INOTIFY_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Events that can be explicitly requested on any inotify watch.
+ */
+#define	IN_ACCESS		0x00000001
+#define	IN_MODIFY		0x00000002
+#define	IN_ATTRIB		0x00000004
+#define	IN_CLOSE_WRITE		0x00000008
+#define	IN_CLOSE_NOWRITE	0x00000010
+#define	IN_OPEN			0x00000020
+#define	IN_MOVED_FROM		0x00000040
+#define	IN_MOVED_TO		0x00000080
+#define	IN_CREATE		0x00000100
+#define	IN_DELETE		0x00000200
+#define	IN_DELETE_SELF		0x00000400
+#define	IN_MOVE_SELF		0x00000800
+
+/*
+ * Events that can be sent to an inotify watch -- requested or not.
+ */
+#define	IN_UNMOUNT		0x00002000
+#define	IN_Q_OVERFLOW		0x00004000
+#define	IN_IGNORED		0x00008000
+
+/*
+ * Flags that can modify an inotify event.
+ */
+#define	IN_ONLYDIR		0x01000000
+#define	IN_DONT_FOLLOW		0x02000000
+#define	IN_EXCL_UNLINK		0x04000000
+#define	IN_MASK_ADD		0x20000000
+#define	IN_ISDIR		0x40000000
+#define	IN_ONESHOT		0x80000000
+
+/*
+ * Helpful constants.
+ */
+#define	IN_CLOSE		(IN_CLOSE_WRITE | IN_CLOSE_NOWRITE)
+#define	IN_MOVE			(IN_MOVED_FROM | IN_MOVED_TO)
+#define	IN_ALL_EVENTS		\
+	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+	IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \
+	IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF)
+
+#define	IN_CHILD_EVENTS		\
+	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+	IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN)
+
+/*
+ * To assure binary compatibility with Linux, these values are fixed at their
+ * Linux equivalents, not their native ones.
+ */
+#define	IN_CLOEXEC		02000000		/* LX_O_CLOEXEC */
+#define	IN_NONBLOCK		04000			/* LX_O_NONBLOCK */
+
+struct inotify_event {
+	int32_t		wd;		/* watch descriptor */
+	uint32_t	mask;		/* mask of events */
+	uint32_t	cookie;		/* event association cookie, if any */
+	uint32_t	len;		/* size of name field */
+	char		name[];		/* optional NUL-terminated name */
+};
+
+/*
+ * These ioctl values are specific to the native implementation; applications
+ * shouldn't be using them directly, and they should therefore be safe to
+ * change without breaking apps.
+ */
+#define	INOTIFYIOC		(('i' << 24) | ('n' << 16) | ('y' << 8))
+#define	INOTIFYIOC_ADD_WATCH	(INOTIFYIOC | 1)	/* add watch */
+#define	INOTIFYIOC_RM_WATCH	(INOTIFYIOC | 2)	/* remove watch */
+#define	INOTIFYIOC_ADD_CHILD	(INOTIFYIOC | 3)	/* add child watch */
+#define	INOTIFYIOC_ACTIVATE	(INOTIFYIOC | 4)	/* activate watch */
+
+#ifndef _LP64
+#ifndef _LITTLE_ENDIAN
+#define	INOTIFY_PTR(type, name)	uint32_t name##pad; type *name
+#else
+#define	INOTIFY_PTR(type, name)	type *name; uint32_t name##pad
+#endif
+#else
+#define	INOTIFY_PTR(type, name)	type *name
+#endif
+
+typedef struct inotify_addwatch {
+	int inaw_fd;			/* open fd for object */
+	uint32_t inaw_mask;		/* desired mask */
+} inotify_addwatch_t;
+
+typedef struct inotify_addchild {
+	INOTIFY_PTR(char, inac_name);	/* pointer to name */
+	int inac_fd;			/* open fd for parent */
+} inotify_addchild_t;
+
+#ifndef _KERNEL
+
+extern int inotify_init(void);
+extern int inotify_init1(int);
+extern int inotify_add_watch(int, const char *, uint32_t);
+extern int inotify_rm_watch(int, int);
+
+#else
+
+#define	IN_UNMASKABLE \
+	(IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR)
+
+#define	IN_MODIFIERS \
+	(IN_EXCL_UNLINK | IN_ONESHOT)
+
+#define	IN_FLAGS \
+	(IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD)
+
+#define	IN_REMOVAL		(1ULL << 32)
+#define	INOTIFYMNRN_INOTIFY	0
+#define	INOTIFYMNRN_CLONE	1
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_INOTIFY_H */
diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h
index 0569c3e967..d7dc365c09 100644
--- a/usr/src/uts/common/sys/ipc_impl.h
+++ b/usr/src/uts/common/sys/ipc_impl.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #ifndef	_IPC_IMPL_H
@@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *);
 kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *);
 void ipc_cleanup(ipc_service_t *, kipc_perm_t *);
 
+void ipc_rmsvc(ipc_service_t *, kipc_perm_t *);
 int ipc_rmid(ipc_service_t *, int, cred_t *);
 int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *);
 
diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h
index b1990121b8..0ae64b45d7 100644
--- a/usr/src/uts/common/sys/iso/signal_iso.h
+++ b/usr/src/uts/common/sys/iso/signal_iso.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -95,7 +96,7 @@ extern "C" {
 
 /* insert new signals here, and move _SIGRTM* appropriately */
 #define	_SIGRTMIN 42	/* first (highest-priority) realtime signal */
-#define	_SIGRTMAX 73	/* last (lowest-priority) realtime signal */
+#define	_SIGRTMAX 74	/* last (lowest-priority) realtime signal */
 extern long _sysconf(int);	/* System Private interface to sysconf() */
 #define	SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN))	/* first realtime signal */
 #define	SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX))	/* last realtime signal */
diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h
index 41b70f6a6e..bdbff0be9b 100644
--- a/usr/src/uts/common/sys/klwp.h
+++ b/usr/src/uts/common/sys/klwp.h
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_KLWP_H
@@ -191,7 +191,19 @@ typedef struct _klwp {
 	struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */
 	struct contract	*lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */
 
-	void	*lwp_brand;		/* per-lwp brand data */
+	/*
+	 * Branding:
+	 * lwp_brand			- per-lwp brand data
+	 * lwp_brand_syscall		- brand syscall interposer
+	 * lwp_brand_syscall_fast	- brand "fast path" syscall interposer
+	 *
+	 * The lwp_brand_syscall_fast handler should only be used if an
+	 * lwp_brand_syscall handler is also in place.
+	 */
+	void	*lwp_brand;
+	int	(*lwp_brand_syscall)(void);
+	int	(*lwp_brand_syscall_fast)(void);
+
 	struct psinfo *lwp_spymaster;	/* if an agent LWP, our spymaster */
 } klwp_t;
 
diff --git a/usr/src/uts/common/sys/kmem_impl.h b/usr/src/uts/common/sys/kmem_impl.h
index 90e0477c45..26ab055dbc 100644
--- a/usr/src/uts/common/sys/kmem_impl.h
+++ b/usr/src/uts/common/sys/kmem_impl.h
@@ -302,7 +302,6 @@ typedef struct kmem_defrag {
 	uint64_t	kmd_later;		/* LATER responses */
 	uint64_t	kmd_dont_need;		/* DONT_NEED responses */
 	uint64_t	kmd_dont_know;		/* DONT_KNOW responses */
-	uint64_t	kmd_hunt_found;		/* DONT_KNOW: # found in mag */
 	uint64_t	kmd_slabs_freed;	/* slabs freed by moves */
 	uint64_t	kmd_defrags;		/* kmem_cache_defrag() */
 	uint64_t	kmd_scans;		/* kmem_cache_scan() */
diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h
index dfe25eec76..be669cb78d 100644
--- a/usr/src/uts/common/sys/ksocket.h
+++ b/usr/src/uts/common/sys/ksocket.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef _SYS_KSOCKET_H_
@@ -121,6 +122,10 @@ extern int 	ksocket_close(ksocket_t, struct cred *);
 extern void	ksocket_hold(ksocket_t);
 extern void	ksocket_rele(ksocket_t);
 
+typedef boolean_t (*ksocket_krecv_f)(ksocket_t, mblk_t *, size_t, int, void *);
+extern int	ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *);
+extern void	ksocket_krecv_unblock(ksocket_t);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h
new file mode 100644
index 0000000000..88625d1829
--- /dev/null
+++ b/usr/src/uts/common/sys/limits.h
@@ -0,0 +1,32 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_LIMITS_H
+#define	_SYS_LIMITS_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	IOV_MAX	1024
+
+#ifdef _KERNEL
+#define	IOV_MAX_STACK	16	/* max. IOV on-stack allocation */
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LIMITS_H */
diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h
index 247c3bd48d..cdbbe4ce62 100644
--- a/usr/src/uts/common/sys/mac.h
+++ b/usr/src/uts/common/sys/mac.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org>
  */
 
@@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s {
 } mac_propval_uint32_range_t;
 
 /*
+ * Defines ranges which are a series of C style strings.
+ */
+typedef struct mac_propval_str_range_s {
+	uint32_t mpur_nextbyte;
+	char mpur_data[1];
+} mac_propval_str_range_t;
+
+/*
  * Data type of property values.
  */
 typedef enum {
@@ -120,6 +128,7 @@ typedef struct mac_propval_range_s {
 	mac_propval_type_t mpr_type;		/* type of value */
 	union {
 		mac_propval_uint32_range_t mpr_uint32[1];
+		mac_propval_str_range_t mpr_str;
 	} u;
 } mac_propval_range_t;
 
@@ -214,6 +223,7 @@ typedef enum {
 	MAC_PROP_MAX_RXHWCLNT_AVAIL,
 	MAC_PROP_MAX_TXHWCLNT_AVAIL,
 	MAC_PROP_IB_LINKMODE,
+	MAC_PROP_VN_PROMISC_FILTERED,
 	MAC_PROP_SECONDARY_ADDRS,
 	MAC_PROP_ADV_40GFDX_CAP,
 	MAC_PROP_EN_40GFDX_CAP,
diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h
index 0fc4939503..1f2c732e6d 100644
--- a/usr/src/uts/common/sys/mac_client.h
+++ b/usr/src/uts/common/sys/mac_client.h
@@ -22,7 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2013 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -115,6 +115,7 @@ typedef enum {
 #define	MAC_PROMISC_FLAGS_NO_PHYS		0x0002
 #define	MAC_PROMISC_FLAGS_VLAN_TAG_STRIP	0x0004
 #define	MAC_PROMISC_FLAGS_NO_COPY		0x0008
+#define	MAC_PROMISC_FLAGS_DO_FIXUPS		0x0010
 
 /* flags passed to mac_tx() */
 #define	MAC_DROP_ON_NO_DESC	0x01 /* freemsg() if no tx descs */
diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h
index 0904b28645..d2fd145375 100644
--- a/usr/src/uts/common/sys/mac_client_impl.h
+++ b/usr/src/uts/common/sys/mac_client_impl.h
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_MAC_CLIENT_IMPL_H
@@ -82,6 +82,7 @@ typedef struct mac_promisc_impl_s {			/* Protected by */
 	boolean_t			mpi_no_phys;	/* WO */
 	boolean_t			mpi_strip_vlan_tag;	/* WO */
 	boolean_t			mpi_no_copy;	/* WO */
+	boolean_t			mpi_do_fixups;	/* WO */
 } mac_promisc_impl_t;
 
 typedef union mac_tx_percpu_s {
@@ -330,13 +331,14 @@ extern	int	mac_tx_percpu_cnt;
 
 /* Mac protection flags */
 #define	MPT_FLAG_V6_LOCAL_ADDR_SET	0x0001
+#define	MPT_FLAG_PROMISC_FILTERED	0x0002
 
 /* in mac_client.c */
 extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);
 extern void mac_client_init(void);
 extern void mac_client_fini(void);
 extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *,
-    mac_client_impl_t *);
+    mac_client_impl_t *, boolean_t);
 
 extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *);
 
diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h
index 6b409513a6..a5848625c2 100644
--- a/usr/src/uts/common/sys/mac_client_priv.h
+++ b/usr/src/uts/common/sys/mac_client_priv.h
@@ -22,7 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2013 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*
@@ -171,6 +171,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t);
 extern void *mac_get_devinfo(mac_handle_t);
 
 extern boolean_t mac_is_vnic(mac_handle_t);
+extern boolean_t mac_is_overlay(mac_handle_t);
 extern uint32_t mac_no_notification(mac_handle_t);
 
 extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t);
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index 2286b587e8..46293b1a74 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef	_SYS_MAC_IMPL_H
@@ -331,7 +331,7 @@ struct mac_group_s {
 	if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND)		\
 		rhandle = (mip)->mi_default_tx_ring;			\
 	if (mip->mi_promisc_list != NULL)				\
-		mac_promisc_dispatch(mip, mp, src_mcip);		\
+		mac_promisc_dispatch(mip, mp, src_mcip, B_TRUE);	\
 	/*								\
 	 * Grab the proper transmit pointer and handle. Special 	\
 	 * optimization: we can test mi_bridge_link itself atomically,	\
@@ -643,6 +643,7 @@ struct mac_impl_s {
 #define	MIS_LEGACY		0x0040
 #define	MIS_NO_ACTIVE		0x0080
 #define	MIS_POLL_DISABLE	0x0100
+#define	MIS_IS_OVERLAY		0x0200
 
 #define	mi_getstat	mi_callbacks->mc_getstat
 #define	mi_start	mi_callbacks->mc_start
@@ -894,6 +895,8 @@ extern void mac_protect_fini(mac_client_impl_t *);
 extern int mac_set_resources(mac_handle_t, mac_resource_props_t *);
 extern void mac_get_resources(mac_handle_t, mac_resource_props_t *);
 extern void mac_get_effective_resources(mac_handle_t, mac_resource_props_t *);
+extern void mac_set_promisc_filtered(mac_client_handle_t, boolean_t);
+extern boolean_t mac_get_promisc_filtered(mac_client_handle_t);
 
 extern cpupart_t *mac_pset_find(mac_resource_props_t *, boolean_t *);
 extern void mac_set_pool_effective(boolean_t, cpupart_t *,
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
index 9f7f2a1a73..5f02451542 100644
--- a/usr/src/uts/common/sys/mac_provider.h
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef	_SYS_MAC_PROVIDER_H
@@ -106,7 +107,8 @@ typedef enum {
 	MAC_CAPAB_NO_NATIVEVLAN	= 0x00080000, /* boolean only, no data */
 	MAC_CAPAB_NO_ZCOPY	= 0x00100000, /* boolean only, no data */
 	MAC_CAPAB_LEGACY	= 0x00200000, /* data is mac_capab_legacy_t */
-	MAC_CAPAB_VRRP		= 0x00400000  /* data is mac_capab_vrrp_t */
+	MAC_CAPAB_VRRP		= 0x00400000, /* data is mac_capab_vrrp_t */
+	MAC_CAPAB_OVERLAY	= 0x00800000  /* boolean only, no data */
 } mac_capab_t;
 
 /*
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index 6ec5f4ff41..34e491fd3b 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -338,6 +338,7 @@ struct memcntl_mha32 {
 #define	MS_SYNC		0x4		/* wait for msync */
 #define	MS_ASYNC	0x1		/* return immediately */
 #define	MS_INVALIDATE	0x2		/* invalidate caches */
+#define	MS_INVALCURPROC	0x8		/* invalidate cache for curproc only */
 
 #if	(_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)
 /* functions to mctl */
diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h
index 88c98dc5a4..7196f7b3ac 100644
--- a/usr/src/uts/common/sys/mntent.h
+++ b/usr/src/uts/common/sys/mntent.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012, Joyent, Inc.  All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  *
  *	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
@@ -47,6 +48,7 @@ extern "C" {
 #define	MNTTYPE_PCFS	"pcfs"		/* PC (MSDOS) file system */
 #define	MNTTYPE_PC	MNTTYPE_PCFS	/* Deprecated name; use MNTTYPE_PCFS */
 #define	MNTTYPE_LOFS	"lofs"		/* Loop back file system */
+#define	MNTTYPE_HYPRLOFS "hyprlofs"	/* Hyperlofs file system */
 #define	MNTTYPE_LO	MNTTYPE_LOFS	/* Deprecated name; use MNTTYPE_LOFS */
 #define	MNTTYPE_HSFS	"hsfs"		/* High Sierra (9660) file system */
 #define	MNTTYPE_SWAP	"swap"		/* Swap file system */
diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h
index 14b1aa55db..883c329aed 100644
--- a/usr/src/uts/common/sys/netconfig.h
+++ b/usr/src/uts/common/sys/netconfig.h
@@ -28,6 +28,7 @@
  *
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_NETCONFIG_H
@@ -147,6 +148,8 @@ extern int		endnetpath(void *);
 extern struct netconfig *getnetpath(void *);
 extern void		nc_perror(const char *);
 extern char		*nc_sperror(void);
+extern void		_nsl_brand_set_hooks(int (*)(void),
+			    struct netconfig *(*)(int));
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h
index 93b5fc3e01..ea85c78f6b 100644
--- a/usr/src/uts/common/sys/neti.h
+++ b/usr/src/uts/common/sys/neti.h
@@ -44,6 +44,8 @@ extern "C" {
 #define	NHF_INET	"NHF_INET"
 #define	NHF_INET6	"NHF_INET6"
 #define	NHF_ARP		"NHF_ARP"
+#define	NHF_VND_INET	"NHF_VND_INET"
+#define	NHF_VND_INET6	"NHF_VND_INET6"
 
 /*
  * Event identification
diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h
index 2c77e1be96..73f29d1e63 100644
--- a/usr/src/uts/common/sys/netstack.h
+++ b/usr/src/uts/common/sys/netstack.h
@@ -81,7 +81,8 @@ typedef id_t	netstackid_t;
 #define	NS_IPSECESP	16
 #define	NS_IPNET	17
 #define	NS_ILB		18
-#define	NS_MAX		(NS_ILB+1)
+#define	NS_VND		19
+#define	NS_MAX		(NS_VND+1)
 
 /*
  * State maintained for each module which tracks the state of
diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h
new file mode 100644
index 0000000000..12d0dbca51
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay.h
@@ -0,0 +1,96 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_H
+#define	_SYS_OVERLAY_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/param.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	OVERLAY_IOC_CREATE	OVERLAYIOC(1)
+#define	OVERLAY_IOC_DELETE	OVERLAYIOC(2)
+#define	OVERLAY_IOC_PROPINFO	OVERLAYIOC(3)
+#define	OVERLAY_IOC_GETPROP	OVERLAYIOC(4)
+#define	OVERLAY_IOC_SETPROP	OVERLAYIOC(5)
+#define	OVERLAY_IOC_NPROPS	OVERLAYIOC(6)
+#define	OVERLAY_IOC_ACTIVATE	OVERLAYIOC(7)
+#define	OVERLAY_IOC_STATUS	OVERLAYIOC(8)
+
+typedef struct overlay_ioc_create {
+	datalink_id_t	oic_linkid;
+	uint32_t	oic_filler;
+	uint64_t	oic_vnetid;
+	char		oic_encap[MAXLINKNAMELEN];
+} overlay_ioc_create_t;
+
+typedef struct overlay_ioc_activate {
+	datalink_id_t	oia_linkid;
+} overlay_ioc_activate_t;
+
+typedef struct overlay_ioc_delete {
+	datalink_id_t	oid_linkid;
+} overlay_ioc_delete_t;
+
+typedef struct overlay_ioc_nprops {
+	datalink_id_t	oipn_linkid;
+	int32_t		oipn_nprops;
+} overlay_ioc_nprops_t;
+
+typedef struct overlay_ioc_propinfo {
+	datalink_id_t	oipi_linkid;
+	int32_t		oipi_id;
+	char		oipi_name[OVERLAY_PROP_NAMELEN];
+	uint_t		oipi_type;
+	uint_t		oipi_prot;
+	uint8_t		oipi_default[OVERLAY_PROP_SIZEMAX];
+	uint32_t	oipi_defsize;
+	uint32_t	oipi_posssize;
+	uint8_t		oipi_poss[OVERLAY_PROP_SIZEMAX];
+} overlay_ioc_propinfo_t;
+
+typedef struct overlay_ioc_prop {
+	datalink_id_t	oip_linkid;
+	int32_t		oip_id;
+	char		oip_name[OVERLAY_PROP_NAMELEN];
+	uint8_t		oip_value[OVERLAY_PROP_SIZEMAX];
+	uint32_t	oip_size;
+} overlay_ioc_prop_t;
+
+typedef enum overlay_status {
+	OVERLAY_I_OK		= 0x00,
+	OVERLAY_I_DEGRADED	= 0x01
+} overlay_status_t;
+
+typedef struct overlay_ioc_status {
+	datalink_id_t	ois_linkid;
+	uint_t		ois_status;
+	char		ois_message[OVERLAY_STATUS_BUFLEN];
+} overlay_ioc_status_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_H */
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
new file mode 100644
index 0000000000..d638096006
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -0,0 +1,65 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_COMMON_H
+#define	_SYS_OVERLAY_COMMON_H
+
+/*
+ * Common overlay definitions
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum overlay_target_mode {
+	OVERLAY_TARGET_NONE = 0x0,
+	OVERLAY_TARGET_POINT,
+	OVERLAY_TARGET_DYNAMIC
+} overlay_target_mode_t;
+
+typedef enum overlay_plugin_dest {
+	OVERLAY_PLUGIN_D_INVALID	= 0x0,
+	OVERLAY_PLUGIN_D_ETHERNET	= 0x1,
+	OVERLAY_PLUGIN_D_IP		= 0x2,
+	OVERLAY_PLUGIN_D_PORT 		= 0x4,
+	OVERLAY_PLUGIN_D_MASK		= 0x7
+} overlay_plugin_dest_t;
+
+typedef enum overlay_prop_type {
+	OVERLAY_PROP_T_INT = 0x1,	/* signed int */
+	OVERLAY_PROP_T_UINT,		/* unsigned int */
+	OVERLAY_PROP_T_IP,		/* sinaddr6 */
+	OVERLAY_PROP_T_STRING		/* OVERLAY_PROP_SIZEMAX */
+} overlay_prop_type_t;
+
+typedef enum overlay_prop_prot {
+	OVERLAY_PROP_PERM_REQ	= 0x1,
+	OVERLAY_PROP_PERM_READ	= 0x2,
+	OVERLAY_PROP_PERM_WRITE	= 0x4,
+	OVERLAY_PROP_PERM_RW 	= 0x6,
+	OVERLAY_PROP_PERM_RRW	= 0x7,
+	OVERLAY_PROP_PERM_MASK	= 0x7
+} overlay_prop_prot_t;
+
+#define	OVERLAY_PROP_NAMELEN	64
+#define	OVERLAY_PROP_SIZEMAX	256
+#define	OVERLAY_STATUS_BUFLEN	256
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_COMMON_H */
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
new file mode 100644
index 0000000000..7fb8b8da1d
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -0,0 +1,205 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_IMPL_H
+#define	_SYS_OVERLAY_IMPL_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/overlay.h>
+#include <sys/overlay_common.h>
+#include <sys/overlay_plugin.h>
+#include <sys/overlay_target.h>
+#include <sys/ksynch.h>
+#include <sys/list.h>
+#include <sys/avl.h>
+#include <sys/ksocket.h>
+#include <sys/socket.h>
+#include <sys/refhash.h>
+#include <sys/ethernet.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	OVEP_VERSION_ONE	0x1
+
+typedef struct overlay_plugin {
+	kmutex_t ovp_mutex;
+	list_node_t ovp_link;			/* overlay_plugin_lock */
+	uint_t ovp_active;			/* ovp_mutex */
+	const char *ovp_name;			/* RO */
+	const overlay_plugin_ops_t *ovp_ops;	/* RO */
+	const char *const *ovp_props;		/* RO */
+	uint_t ovp_nprops;			/* RO */
+	uint_t ovp_id_size;			/* RO */
+	overlay_plugin_flags_t ovp_flags;	/* RO */
+	overlay_plugin_dest_t ovp_dest;		/* RO */
+} overlay_plugin_t;
+
+typedef struct overlay_mux {
+	list_node_t		omux_lnode;
+	ksocket_t		omux_ksock;	/* RO */
+	overlay_plugin_t	*omux_plugin;	/* RO: associated encap */
+	int			omux_domain;	/* RO: socket domain */
+	int			omux_family;	/* RO: socket family */
+	int			omux_protocol;	/* RO: socket protocol */
+	struct sockaddr 	*omux_addr;	/* RO: socket address */
+	socklen_t		omux_alen;	/* RO: sockaddr len */
+	kmutex_t		omux_lock;	/* Protects everything below */
+	uint_t			omux_count;	/* Active instances */
+	avl_tree_t		omux_devices;	/* Tree of devices */
+} overlay_mux_t;
+
+typedef enum overlay_target_flag {
+	OVERLAY_T_TEARDOWN	= 0x1
+} overlay_target_flag_t;
+
+typedef struct overlay_target {
+	kmutex_t		ott_lock;
+	kcondvar_t		ott_cond;
+	overlay_target_mode_t	ott_mode;	/* RO */
+	overlay_plugin_dest_t	ott_dest;	/* RO */
+	uint64_t		ott_id;		/* RO */
+	overlay_target_flag_t	ott_flags;	/* ott_lock */
+	uint_t			ott_ocount;	/* ott_lock */
+	union {					/* ott_lock */
+		overlay_target_point_t	ott_point;
+		struct overlay_target_dyn {
+			refhash_t	*ott_dhash;
+			avl_tree_t	ott_tree;
+		} ott_dyn;
+	} ott_u;
+} overlay_target_t;
+
+typedef enum overlay_dev_flag {
+	OVERLAY_F_ACTIVATED	= 0x01, /* Activate ioctl completed */
+	OVERLAY_F_IN_MUX	= 0x02,	/* Currently in a mux */
+	OVERLAY_F_IN_TX		= 0x04,	/* Currently doing tx */
+	OVERLAY_F_IN_RX		= 0x08, /* Currently doing rx */
+	OVERLAY_F_IOMASK	= 0x0c,	/* A mask for rx and tx */
+	OVERLAY_F_MDDROP	= 0x10,	/* Drop traffic for metadata update */
+	OVERLAY_F_STOPMASK	= 0x1e,	/* None set when stopping */
+	OVERLAY_F_VARPD		= 0x20,	/* varpd plugin exists */
+	OVERLAY_F_DEGRADED	= 0x40,	/* device is degraded */
+	OVERLAY_F_MASK		= 0x7f	/* mask of everything */
+} overlay_dev_flag_t;
+
+typedef struct overlay_dev {
+	kmutex_t	odd_lock;
+	kcondvar_t	odd_iowait;
+	list_node_t	odd_link;		/* overlay_dev_lock */
+	mac_handle_t	odd_mh;			/* RO */
+	overlay_plugin_t *odd_plugin;		/* RO */
+	datalink_id_t	odd_linkid;		/* RO */
+	void		*odd_pvoid;		/* RO -- only used by plugin */
+	uint_t		odd_ref;		/* protected by odd_lock */
+	uint_t		odd_mtu;		/* protected by odd_lock */
+	overlay_dev_flag_t odd_flags;		/* protected by odd_lock */
+	uint_t		odd_rxcount;		/* protected by odd_lock */
+	uint_t		odd_txcount;		/* protected by odd_lock */
+	overlay_mux_t	*odd_mux;		/* protected by odd_lock */
+	uint64_t	odd_vid;		/* RO if active else odd_lock */
+	avl_node_t	odd_muxnode;		/* managed by mux */
+	overlay_target_t *odd_target;		/* See big theory statement */
+	char		odd_fmamsg[OVERLAY_STATUS_BUFLEN];	/* odd_lock */
+} overlay_dev_t;
+
+typedef enum overlay_target_entry_flags {
+	OVERLAY_ENTRY_F_PENDING		= 0x01,	/* lookup in progress */
+	OVERLAY_ENTRY_F_VALID		= 0x02,	/* entry is currently valid */
+	OVERLAY_ENTRY_F_DROP		= 0x04,	/* always drop target */
+	OVERLAY_ENTRY_F_VALID_MASK	= 0x06
+} overlay_target_entry_flags_t;
+
+typedef struct overlay_target_entry {
+	kmutex_t		ote_lock;
+	refhash_link_t		ote_reflink;	/* hashtable link */
+	avl_node_t		ote_avllink;	/* iteration link */
+	list_node_t		ote_qlink;
+	overlay_target_entry_flags_t ote_flags;	/* RW: state flags */
+	uint8_t			ote_addr[ETHERADDRL];	/* RO: mac addr */
+	overlay_target_t	*ote_ott;	/* RO */
+	overlay_dev_t		*ote_odd;	/* RO */
+	overlay_target_point_t	ote_dest;	/* RW: destination */
+	mblk_t			*ote_chead;	/* RW: blocked mb chain head */
+	mblk_t			*ote_ctail;	/* RW: blocked mb chain tail */
+	size_t			ote_mbsize;	/* RW: outstanding mblk size */
+	hrtime_t		ote_vtime;	/* RW: valid timestamp */
+} overlay_target_entry_t;
+
+
+#define	OVERLAY_CTL	"overlay"
+
+extern dev_info_t *overlay_dip;
+
+extern mblk_t *overlay_m_tx(void *, mblk_t *);
+
+typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *);
+extern void overlay_dev_iter(overlay_dev_iter_f, void *);
+
+extern void overlay_plugin_init(void);
+extern overlay_plugin_t *overlay_plugin_lookup(const char *);
+extern void overlay_plugin_rele(overlay_plugin_t *);
+extern void overlay_plugin_fini(void);
+typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *);
+extern void overlay_plugin_walk(overlay_plugin_walk_f, void *);
+
+extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t);
+extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t);
+
+extern void overlay_mux_init(void);
+extern void overlay_mux_fini(void);
+
+extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int,
+    struct sockaddr *, socklen_t, int *);
+extern void overlay_mux_close(overlay_mux_t *);
+extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *);
+extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *);
+extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *);
+
+extern void overlay_prop_init(overlay_prop_handle_t);
+
+extern void overlay_target_init(void);
+extern int overlay_target_busy(void);
+extern int overlay_target_open(dev_t *, int, int, cred_t *);
+extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+extern int overlay_target_close(dev_t, int, int, cred_t *);
+extern void overlay_target_free(overlay_dev_t *);
+
+#define	OVERLAY_TARGET_OK	0
+#define	OVERLAY_TARGET_DROP	1
+#define	OVERLAY_TARGET_ASYNC	2
+extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *,
+    socklen_t *);
+extern void overlay_target_quiesce(overlay_target_t *);
+extern void overlay_target_fini(void);
+
+extern void overlay_fm_init(void);
+extern void overlay_fm_fini(void);
+extern void overlay_fm_degrade(overlay_dev_t *, const char *);
+extern void overlay_fm_restore(overlay_dev_t *);
+
+extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t);
+extern void overlay_hold_rele(overlay_dev_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_IMPL_H */
diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h
new file mode 100644
index 0000000000..07efaa05df
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_plugin.h
@@ -0,0 +1,324 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_PLUGIN_H
+#define	_SYS_OVERLAY_PLUGIN_H
+
+/*
+ * overlay plugin interface for encapsulation/decapsulation modules
+ *
+ * This header file defines how encapsulation and decapsulation plugins
+ * interact within the broader system. At this time, these interfaces are
+ * considered private to illumos and therefore are subject to change. As we gain
+ * more experience with a few of the different encapsulation formats, say nvgre
+ * or geneve, then we can move to make this a more-stable interface.
+ *
+ * A plugin is a general kernel module that uses the miscellaneous mod-linkage.
+ *
+ * In its _init(9E) routine, it must register itself with the overlay
+ * subsystem. To do this, it allocates an overlay_plugin_register_t via
+ * overlay_plugin_alloc(), that it then fills out with various required
+ * information and then attempts to register with the system via a call to
+ * overlay_plugin_register(). If that succeeds, it should then call
+ * mod_install(9F). If the mod_install(9F) fails, then it should call
+ * overlay_plugin_unregister(). Regardless of success or failure, it should call
+ * overlay_plugin_free() to ensure that any memory that may be associated with
+ * the registration is freed.
+ *
+ * When the module's _fini(9E) is called, overlay_plugin_unregister() should be
+ * called first. It may return an error, such as EBUSY. In such cases, it should
+ * be returned as the return status of _fini(9E). This is quite necessary; it
+ * ensures that if the module is in use it doesn't get unloaded out from under
+ * the broader subsystem while it's still in use. A driver can use that to
+ * know that there are no current instances of its private data.
+ *
+ * ------------------
+ * Plugin Definitions
+ * ------------------
+ *
+ * A plugin is required to fill in both an operations vector and a series of
+ * information to the callback routine. Here are the routines and their
+ * purposes. The full signatures are available below.
+ *
+ *   overlay_plugin_init_t
+ *
+ * 	This interface is used to create a new instance of a plugin. An instance
+ * 	of a plugin will be created for each overlay device that is created. For
+ * 	example, if a device is created with VXLAN ID 23 and ID 42, then there
+ * 	will be two different calls to this function.
+ *
+ * 	This function gives the plugin a chance to create a private data
+ * 	structure that will be returned on subsequent calls to the system.
+ *
+ *   overlay_plugin_fini_t
+ *
+ *   	This is the opposite of overlay_plugin_init_t. It will be called when it
+ *   	is safe to remove any private data that is associated with this instance
+ *   	of the plugin.
+ *
+ *   overlay_plugin_propinfo_t
+ *
+ *   	This is called with the name of a property that is registered when the
+ *   	plugin is created. This function will be called with the name of the
+ *   	property that information is being requested about. The plugin is
+ *   	responsible for filling out information such as setting the name, the
+ *   	type of property it is, the protection of the property (can a user
+ *   	update it?), whether the property is required, an optional default value
+ *   	for the property, and an optional set of values or ranges that are
+ *   	allowed.
+ *
+ *   overlay_plugin_getprop_t
+ *
+ *	Return the value of the named property from the current instance of the
+ *	plugin.
+ *
+ *   overlay_plugin_setprop_t
+ *
+ *	Set the value of the named property to the specified value for the
+ *	current instance of the plugin. Note, that it is the plugin's
+ *	responsibility to ensure that the value of the property is valid and to
+ *	update state as appropriate.
+ *
+ *   overlay_plugin_socket_t
+ *
+ *   	Every overlay device has a corresponding socket that it uses to send and
+ *   	receive traffic. This routine is used to get the parameters that should
+ *   	be used to define such a socket. The actual socket may be multiplexed
+ *   	with other uses of it.
+ *
+ *   overlay_plugin_sockopt_t
+ *
+ *   	Allow a plugin to set any necessary socket options that it needs on the
+ *   	kernel socket that is being used by a mux. This will only be called once
+ *   	for a given mux, if additional devices are added to a mux, it will not
+ *   	be called additional times.
+ *
+ *   overlay_plugin_encap_t
+ *
+ *   	In this routine you're given a message block and information about the
+ *   	packet, such as the identifier and are asked to fill out a message block
+ *   	that represents the encapsulation header and optionally manipulate the
+ *   	input message if required.
+ *
+ *   overlay_plugin_decap_t
+ *
+ *   	In this routine, you're given the encapsulated message block. The
+ *   	requirement is to decapsulate it and determine what is the correct
+ *   	overlay identifier for this network and to fill in the header size so
+ *   	the broader system knows how much of this data should be considered
+ *   	consumed.
+ *
+ *   ovpo_callbacks
+ *
+ *   	This should be set to zero, it's reserved for future use.
+ *
+ * Once these properties are defined, the module should define the following
+ * members in the overlay_plugin_register_t.
+ *
+ *   ovep_version
+ *
+ *   	Should be set to the value of the macro OVEP_VERSION.
+ *
+ *   ovep_name
+ *
+ *   	Should be set to a character string that has the name of the module.
+ *   	Generally this should match the name of the kernel module; however, this
+ *   	is the name that users will use to refer to this module when creating
+ *   	devices.
+ *
+ *   ovep_ops
+ *
+ *   	Should point to an overlay_plugin_ops_t filled in as described above.
+ *
+ *   ovep_props
+ *
+ *   	This is an array of character strings that holds the names of the
+ *   	properties of the encapsulation plugin.
+ *
+ *
+ *   ovep_id_size
+ *
+ *   	This is the size in bytes of the valid range for the identifier. The
+ *   	valid identifier range is considered a ovep_id_size byte unsigned
+ *   	integer, [ 0, 1 << (ovep_id_size * 8) ).
+ *
+ *   ovep_flags
+ *
+ *   	A series of flags that indicate optional features that are supported.
+ *   	Valid flags include:
+ *
+ *   		OVEP_F_VLAN_TAG
+ *
+ * 			The encapsulation format allows for the encapsulated
+ * 			packet to maintain a VLAN tag.
+ *
+ *   ovep_dest
+ *
+ *   	Describes the kind of destination that the overlay plugin supports for
+ *   	sending traffic. For example, vxlan uses UDP, therefore it requires both
+ *   	an IP address and a port; however, nvgre uses the gre header and
+ *   	therefore only requires an IP address. The following flags may be
+ *   	combined:
+ *
+ *   		OVERLAY_PLUGIN_D_ETHERNET
+ *
+ *   			Indicates that to send a packet to its destination, we
+ *   			require a link-layer ethernet address.
+ *
+ * 		OVERLAY_PLUGIN_D_IP
+ *
+ * 			Indicates that to send a packet to its destination, we
+ * 			require an IP address. Note, all IP addresses are
+ * 			transmitted as IPv6 addresses and for an IPv4
+ * 			destination, using an IPv4-mapped IPv6 address is the
+ * 			expected way to transmit that.
+ *
+ * 		OVERLAY_PLUGIN_D_PORT
+ *
+ * 			Indicates that to send a packet to its destination, a
+ * 			port is required, this usually indicates that the
+ * 			protocol uses something like TCP or UDP.
+ *
+ *
+ * -------------------------------------------------
+ * Downcalls, Upcalls, and Synchronization Guarantees
+ * -------------------------------------------------
+ *
+ * Every instance of a given module is independent. The kernel only guarantees
+ * that it will probably perform downcalls into different instances in parallel
+ * at some point. No locking is provided by the framework for synchronization
+ * across instances. If a module finds itself needing that, it will be up to it
+ * to provide it.
+ *
+ * In a given instance, the kernel may call into entry points in parallel. If
+ * the instance has private data, it should likely synchronize it. The one
+ * guarantee that we do make, is that calls to getprop and setprop will be done
+ * synchronized by a caller holding the MAC perimeter.
+ *
+ * While servicing a downcall from the general overlay device framework, a
+ * kernel module should not make any upcalls, excepting those functions that are
+ * defined in this header file, eg. the property related callbacks. Importantly,
+ * it cannot make any assumptions about what locks may or may not be held by the
+ * broader system. The only thing that it is safe for it to use are its own
+ * locks.
+ *
+ * ----------------
+ * Downcall Context
+ * ----------------
+ *
+ * For all of the downcalls, excepting the overlay_plugin_encap_t and
+ * overlay_plugin_decap_t, the calls will be made either in kernel or user
+ * context, the module should not assume either way.
+ *
+ * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user,
+ * kernel or interrupt context; however, it is guaranteed that the interrupt
+ * will be below LOCK_LEVEL, and therefore it is safe to grab locks.
+ */
+
+#include <sys/stream.h>
+#include <sys/mac_provider.h>
+#include <sys/ksocket.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	OVEP_VERSION	0x1
+
+typedef enum overlay_plugin_flags {
+	OVEP_F_VLAN_TAG	= 0x01	/* Supports VLAN Tags */
+} overlay_plugin_flags_t;
+
+/*
+ * The ID space could easily be more than a 64-bit number, even
+ * though today it's either a 24-64 bit value. How should we future
+ * proof ourselves here?
+ */
+typedef struct ovep_encap_info {
+	uint64_t	ovdi_id;
+	size_t		ovdi_hdr_size;
+} ovep_encap_info_t;
+
+typedef struct __overlay_prop_handle *overlay_prop_handle_t;
+typedef struct __overlay_handle *overlay_handle_t;
+
+/*
+ * Plugins are guaranteed that calls to setprop are serialized. However, any
+ * number of other calls can be going on in parallel otherwise.
+ */
+typedef int (*overlay_plugin_encap_t)(void *, mblk_t *,
+    ovep_encap_info_t *, mblk_t **);
+typedef int (*overlay_plugin_decap_t)(void *, mblk_t *,
+    ovep_encap_info_t *);
+typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **);
+typedef void (*overlay_plugin_fini_t)(void *);
+typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *,
+    struct sockaddr *, socklen_t *);
+typedef int (*overlay_plugin_sockopt_t)(ksocket_t);
+typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *,
+    uint32_t *);
+typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *,
+    uint32_t);
+typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t);
+
+typedef struct overlay_plugin_ops {
+	uint_t			ovpo_callbacks;
+	overlay_plugin_init_t	ovpo_init;
+	overlay_plugin_fini_t	ovpo_fini;
+	overlay_plugin_encap_t	ovpo_encap;
+	overlay_plugin_decap_t	ovpo_decap;
+	overlay_plugin_socket_t ovpo_socket;
+	overlay_plugin_sockopt_t ovpo_sockopt;
+	overlay_plugin_getprop_t ovpo_getprop;
+	overlay_plugin_setprop_t ovpo_setprop;
+	overlay_plugin_propinfo_t ovpo_propinfo;
+} overlay_plugin_ops_t;
+
+typedef struct overlay_plugin_register {
+	uint_t			ovep_version;
+	const char		*ovep_name;
+	const overlay_plugin_ops_t	*ovep_ops;
+	const char 		**ovep_props;
+	uint_t			ovep_id_size;
+	uint_t			ovep_flags;
+	uint_t			ovep_dest;
+} overlay_plugin_register_t;
+
+/*
+ * Functions that interact with registration
+ */
+extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t);
+extern void overlay_plugin_free(overlay_plugin_register_t *);
+extern int overlay_plugin_register(overlay_plugin_register_t *);
+extern int overlay_plugin_unregister(const char *);
+
+/*
+ * Property information callbacks
+ */
+extern void overlay_prop_set_name(overlay_prop_handle_t, const char *);
+extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t);
+extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t);
+extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t);
+extern void overlay_prop_set_nodefault(overlay_prop_handle_t);
+extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t,
+    uint32_t);
+extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_PLUGIN_H */
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h
new file mode 100644
index 0000000000..cae193c334
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_target.h
@@ -0,0 +1,292 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _OVERLAY_TARGET_H
+#define	_OVERLAY_TARGET_H
+
+/*
+ * Overlay device varpd ioctl interface (/dev/overlay)
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct overlay_target_point {
+	uint8_t		otp_mac[ETHERADDRL];
+	struct in6_addr	otp_ip;
+	uint16_t	otp_port;
+} overlay_target_point_t;
+
+#define	OVERLAY_TARG_IOCTL	(('o' << 24) | ('v' << 16) | ('t' << 8))
+
+#define	OVERLAY_TARG_INFO	(OVERLAY_TARG_IOCTL | 0x01)
+
+typedef enum overlay_targ_info_flags {
+	OVERLAY_TARG_INFO_F_ACTIVE = 0x01,
+	OVERLAY_TARG_INFO_F_DEGRADED = 0x02
+} overlay_targ_info_flags_t;
+
+/*
+ * Get target information about an overlay device
+ */
+typedef struct overlay_targ_info {
+	datalink_id_t		oti_linkid;
+	uint32_t		oti_needs;
+	uint64_t		oti_flags;
+	uint64_t		oti_vnetid;
+} overlay_targ_info_t;
+
+/*
+ * Declare an association between a given varpd instance and a datalink.
+ */
+#define	OVERLAY_TARG_ASSOCIATE	(OVERLAY_TARG_IOCTL | 0x02)
+
+typedef struct overlay_targ_associate {
+	datalink_id_t		ota_linkid;
+	uint32_t		ota_mode;
+	uint64_t		ota_id;
+	uint32_t		ota_provides;
+	overlay_target_point_t	ota_point;
+} overlay_targ_associate_t;
+
+/*
+ * Remove an association from a device. If the device has already been started,
+ * this implies OVERLAY_TARG_DEGRADE.
+ */
+#define	OVERLAY_TARG_DISASSOCIATE	(OVERLAY_TARG_IOCTL | 0x3)
+
+/*
+ * Tells the kernel that while a varpd instance still exists, it basically isn't
+ * making any forward progress, so the device should consider itself degraded.
+ */
+#define	OVERLAY_TARG_DEGRADE	(OVERLAY_TARG_IOCTL | 0x4)
+
+typedef struct overlay_targ_degrade {
+	datalink_id_t	otd_linkid;
+	uint32_t	otd_pad;
+	char		otd_buf[OVERLAY_STATUS_BUFLEN];
+} overlay_targ_degrade_t;
+
+/*
+ * Tells the kernel to remove the degraded status that it set on a device.
+ */
+#define	OVERLAY_TARG_RESTORE	(OVERLAY_TARG_IOCTL | 0x5)
+
+typedef struct overlay_targ_id {
+	datalink_id_t	otid_linkid;
+} overlay_targ_id_t;
+
+/*
+ * The following ioctls are all used to support dynamic lookups from userland,
+ * generally serviced by varpd.
+ *
+ * The way this is designed to work is that user land will have threads sitting
+ * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit
+ * waiting for work for up to approximately one second of time before they will
+ * be sent back out to user land to give user land a chance to clean itself up
+ * or more generally, come back into the kernel for work. Once these threads
+ * return, they will have a request with which more action can be done. The
+ * following ioctls can all be used to answer the request.
+ *
+ *	OVERLAY_TARG_RESPOND - overlay_targ_resp_t
+ *
+ *		The overlay_targ_resp_t has the appropriate information from
+ *		which a reply can be generated. The information is filled into
+ *		an overlay_target_point_t as appropriate based on the
+ *		overlay_plugin_dest_t type.
+ *
+ *
+ *	OVERLAY_TARG_DROP - overlay_targ_resp_t
+ *
+ *		The overlay_targ_resp_t should identify a request for which to
+ *		drop a packet.
+ *
+ *
+ * 	OVERLAY_TARG_INJECT - overlay_targ_pkt_t
+ *
+ * 		The overlay_targ_pkt_t injects a fully formed packet into the
+ * 		virtual network. It may either be identified by its data link id
+ * 		or by the request id. If both are specified, the
+ * 		datalink id will be used. Note, that an injection is not
+ * 		considered a reply and if this corresponds to a request, then
+ * 		that individual packet must still be dropped.
+ *
+ *
+ * 	OVERLAY_TARG_PKT - overlay_targ_pkt_t
+ *
+ * 		This ioctl can be used to copy data from a given request into a
+ * 		user buffer. This can be used in combination with
+ * 		OVERLAY_TARG_INJECT to implement services such as a proxy-arp.
+ *
+ *
+ * 	OVERLAY_TARG_RESEND - overlay_targ_pkt_t
+ *
+ * 		This ioctl is similar to the OVERLAY_TARG_INJECT, except instead
+ * 		of receiving it on the local mac handle, it queues it for
+ * 		retransmission again. This is useful if you have a packet that
+ * 		was originally destined for some broadcast or multicast address
+ * 		that you now want to send to a unicast address.
+ */
+#define	OVERLAY_TARG_LOOKUP	(OVERLAY_TARG_IOCTL | 0x10)
+#define	OVERLAY_TARG_RESPOND	(OVERLAY_TARG_IOCTL | 0x11)
+#define	OVERLAY_TARG_DROP	(OVERLAY_TARG_IOCTL | 0x12)
+#define	OVERLAY_TARG_INJECT	(OVERLAY_TARG_IOCTL | 0x13)
+#define	OVERLAY_TARG_PKT	(OVERLAY_TARG_IOCTL | 0x14)
+#define	OVERLAY_TARG_RESEND	(OVERLAY_TARG_IOCTL | 0x15)
+
+typedef struct overlay_targ_lookup {
+	uint64_t	otl_dlid;
+	uint64_t	otl_reqid;
+	uint64_t	otl_varpdid;
+	uint64_t	otl_vnetid;
+	uint64_t	otl_hdrsize;
+	uint64_t	otl_pktsize;
+	uint8_t		otl_srcaddr[ETHERADDRL];
+	uint8_t		otl_dstaddr[ETHERADDRL];
+	uint32_t	otl_dsttype;
+	uint32_t	otl_sap;
+	int32_t		otl_vlan;
+} overlay_targ_lookup_t;
+
+typedef struct overlay_targ_resp {
+	uint64_t	otr_reqid;
+	overlay_target_point_t otr_answer;
+} overlay_targ_resp_t;
+
+typedef struct overlay_targ_pkt {
+	uint64_t	otp_linkid;
+	uint64_t	otp_reqid;
+	uint64_t	otp_size;
+	void		*otp_buf;
+} overlay_targ_pkt_t;
+
+#ifdef _KERNEL
+
+typedef struct overlay_targ_pkt32 {
+	uint64_t	otp_linkid;
+	uint64_t	otp_reqid;
+	uint64_t	otp_size;
+	caddr32_t	otp_buf;
+} overlay_targ_pkt32_t;
+
+#endif /* _KERNEL */
+
+/*
+ * This provides a way to get a list of active overlay devices independently
+ * from dlmgmtd. At the end of the day the kernel always knows what will exist
+ * and this allows varpd which is an implementation of libdladm not to end up
+ * needing to call back into dlmgmtd via libdladm and create an unfortunate
+ * dependency cycle.
+ */
+
+#define	OVERLAY_TARG_LIST	(OVERLAY_TARG_IOCTL | 0x20)
+
+typedef struct overlay_targ_list {
+	uint32_t	otl_nents;
+	uint32_t	otl_ents[];
+} overlay_targ_list_t;
+
+/*
+ * The following family of ioctls all manipulate the target cache of a given
+ * device.
+ *
+ * 	OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t
+ *
+ * 		The overlay_targ_cache_t should have its link identifier and
+ * 		the desired mac address filled in. On return, it will fill in
+ * 		the otce_dest member, if the entry exists in the table.
+ *
+ *
+ * 	OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t
+ *
+ * 		The cache table entry of the mac address referred to by
+ * 		otce_mac and otc_linkid will be filled in with the details
+ * 		provided in the otce_dest member.
+ *
+ * 	OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t
+ *
+ * 		Removes the cache entry identified by otce_mac from the table.
+ * 		Note that this does not stop any in-flight lookups or deal with
+ * 		any data that is awaiting a lookup.
+ *
+ *
+ * 	OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t
+ *
+ * 		Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the
+ * 		entire table identified by otc_linkid. All other parameters are
+ * 		ignored.
+ *
+ *
+ * 	OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t
+ *
+ * 		Iterates over the contents of a target cache identified by
+ * 		otci_linkid. Iteration is guaranteed to be exactly once for
+ * 		items which are in the hashtable at the beginning and end of
+ * 		iteration. For items which are added or removed after iteration
+ * 		has begun, only at most once semantics are guaranteed. Consumers
+ * 		should ensure that otci_marker is zeroed before starting
+ * 		iteration and should preserve its contents across calls.
+ *
+ * 		Before calling in, otci_count should be set to the number of
+ * 		entries that space has been allocated for in otci_ents. The
+ * 		value will be updated to indicate the total number written out.
+ */
+
+#define	OVERLAY_TARG_CACHE_GET		(OVERLAY_TARG_IOCTL | 0x30)
+#define	OVERLAY_TARG_CACHE_SET		(OVERLAY_TARG_IOCTL | 0x31)
+#define	OVERLAY_TARG_CACHE_REMOVE	(OVERLAY_TARG_IOCTL | 0x32)
+#define	OVERLAY_TARG_CACHE_FLUSH	(OVERLAY_TARG_IOCTL | 0x33)
+#define	OVERLAY_TARG_CACHE_ITER		(OVERLAY_TARG_IOCTL | 0x34)
+
+/*
+ * This is a pretty arbitrary number that we're constraining ourselves to
+ * for iteration. Basically the goal is to make sure that we can't have a user
+ * ask us to allocate too much memory on their behalf at any time. A more
+ * dynamic form may be necessary some day.
+ */
+#define	OVERLAY_TARGET_ITER_MAX	500
+
+#define	OVERLAY_TARGET_CACHE_DROP	0x01
+
+typedef struct overlay_targ_cache_entry {
+	uint8_t			otce_mac[ETHERADDRL];
+	uint16_t		otce_flags;
+	overlay_target_point_t	otce_dest;
+} overlay_targ_cache_entry_t;
+
+typedef struct overlay_targ_cache {
+	datalink_id_t			otc_linkid;
+	overlay_targ_cache_entry_t	otc_entry;
+} overlay_targ_cache_t;
+
+typedef struct overlay_targ_cache_iter {
+	datalink_id_t			otci_linkid;
+	uint32_t			otci_pad;
+	uint64_t			otci_marker;
+	uint16_t			otci_count;
+	overlay_targ_cache_entry_t	otci_ents[];
+} overlay_targ_cache_iter_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OVERLAY_TARGET_H */
diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h
index c3a1b9a97b..ea2fdfd886 100644
--- a/usr/src/uts/common/sys/param.h
+++ b/usr/src/uts/common/sys/param.h
@@ -104,7 +104,7 @@ extern "C" {
 #define	DEFAULT_MAXPID	999999
 #define	DEFAULT_JUMPPID	100000
 #else
-#define	DEFAULT_MAXPID	30000
+#define	DEFAULT_MAXPID	99999
 #define	DEFAULT_JUMPPID	0
 #endif
 
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 5f5b66d437..5328d02c59 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *);
 int secpolicy_kmdb(const cred_t *);
 int secpolicy_lock_memory(const cred_t *);
 int secpolicy_meminfo(const cred_t *);
+int secpolicy_fs_import(const cred_t *);
 int secpolicy_modctl(const cred_t *, int);
 int secpolicy_net(const cred_t *, int, boolean_t);
 int secpolicy_net_bindmlp(const cred_t *);
@@ -174,6 +175,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *,
     const vattr_t *, cred_t *);
 int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t);
 int secpolicy_xvm_control(const cred_t *);
+int secpolicy_hyprlofs_control(const cred_t *);
 
 int secpolicy_basic_exec(const cred_t *, vnode_t *);
 int secpolicy_basic_fork(const cred_t *);
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index 5abf8fd3cd..ff4a1abce4 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -348,7 +349,9 @@ typedef struct	proc {
 	struct zone	*p_zone;	/* zone in which process lives */
 	struct vnode	*p_execdir;	/* directory that p_exec came from */
 	struct brand	*p_brand;	/* process's brand  */
-	void		*p_brand_data;	/* per-process brand state */
+
+	/* per-process brand state */
+	void		*p_brand_data;
 
 	/* additional lock to protect p_sessp (but not its contents) */
 	kmutex_t p_splock;
@@ -363,7 +366,6 @@ typedef struct	proc {
 	 */
 	struct user p_user;		/* (see sys/user.h) */
 } proc_t;
-
 #define	PROC_T				/* headers relying on proc_t are OK */
 
 #ifdef _KERNEL
@@ -629,6 +631,7 @@ extern int signal_is_blocked(kthread_t *, int);
 extern int sigcheck(proc_t *, kthread_t *);
 extern void sigdefault(proc_t *);
 
+extern struct pid *pid_find(pid_t pid);
 extern void pid_setmin(void);
 extern pid_t pid_allocate(proc_t *, pid_t, int);
 extern int pid_rele(struct pid *);
@@ -644,6 +647,7 @@ extern int sprtrylock_proc(proc_t *);
 extern void sprwaitlock_proc(proc_t *);
 extern void sprlock_proc(proc_t *);
 extern void sprunlock(proc_t *);
+extern void sprunprlock(proc_t *);
 extern void pid_init(void);
 extern proc_t *pid_entry(int);
 extern int pid_slot(proc_t *);
@@ -718,6 +722,10 @@ extern	kthread_t *thread_unpin(void);
 extern	void	thread_init(void);
 extern	void	thread_load(kthread_t *, void (*)(), caddr_t, size_t);
 
+extern	void	thread_splitstack(void (*)(void *), void *, size_t);
+extern	void	thread_splitstack_run(caddr_t, void (*)(void *), void *);
+extern	void	thread_splitstack_cleanup(void);
+
 extern	void	tsd_create(uint_t *, void (*)(void *));
 extern	void	tsd_destroy(uint_t *);
 extern	void	*tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void));
@@ -759,7 +767,7 @@ extern	void	pokelwps(proc_t *);
 extern	void	continuelwps(proc_t *);
 extern	int	exitlwps(int);
 extern	void	lwp_ctmpl_copy(klwp_t *, klwp_t *);
-extern	void	lwp_ctmpl_clear(klwp_t *);
+extern	void	lwp_ctmpl_clear(klwp_t *, boolean_t);
 extern	klwp_t	*forklwp(klwp_t *, proc_t *, id_t);
 extern	void	lwp_load(klwp_t *, gregset_t, uintptr_t);
 extern	void	lwp_setrval(klwp_t *, int, int);
diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h
index f592fd9dcf..501af712ef 100644
--- a/usr/src/uts/common/sys/procfs.h
+++ b/usr/src/uts/common/sys/procfs.h
@@ -25,6 +25,7 @@
  */
 /*
  * Copyright 2012 DEY Storage Systems, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #ifndef _SYS_PROCFS_H
@@ -233,6 +234,7 @@ typedef struct pstatus {
 #define	PR_FAULTED	6
 #define	PR_SUSPENDED	7
 #define	PR_CHECKPOINT	8
+#define	PR_BRAND	9
 
 /*
  * lwp ps(1) information file.  /proc/<pid>/lwp/<lwpid>/lwpsinfo
diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h
index 6c79ee266d..ba8b2b1210 100644
--- a/usr/src/uts/common/sys/ptms.h
+++ b/usr/src/uts/common/sys/ptms.h
@@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t);
 #define	DDBGP(a, b)
 #endif
 
+typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t;
+typedef struct ptmptsopencb {
+	boolean_t		(*ppocb_func)(ptmptsopencb_arg_t);
+	ptmptsopencb_arg_t	ppocb_arg;
+} ptmptsopencb_t;
+
 #endif /* _KERNEL */
 
 typedef struct pt_own {
@@ -157,6 +163,19 @@ typedef struct pt_own {
 #define	ZONEPT		(('P'<<8)|4)	/* set zone of master/slave pair */
 #define	OWNERPT		(('P'<<8)|5)	/* set owner/group for slave device */
 
+#ifdef _KERNEL
+/*
+ * kernel ioctl commands
+ *
+ * PTMPTSOPENCB: Returns a callback function pointer and opaque argument.
+ *	      The return value of the callback function when it's invoked
+ *	      with the opaque argument passed to it will indicate if the
+ *	      pts slave device is currently open.
+ */
+#define	PTMPTSOPENCB	(('P'<<8)|6)	/* check if the slave is open */
+
+#endif /* _KERNEL */
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h
index 2069e6d3f1..b7427a454d 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h
+++ b/usr/src/uts/common/sys/refhash.h
@@ -10,11 +10,11 @@
  */
 
 /*
- * Copyright 2014 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
-#ifndef	_SYS_SCSI_ADAPTERS_MPTHASH_H
-#define	_SYS_SCSI_ADAPTERS_MPTHASH_H
+#ifndef	_SYS_REFHASH_H
+#define	_SYS_REFHASH_H
 
 #include <sys/types.h>
 #include <sys/list.h>
@@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *);
 extern void *refhash_next(refhash_t *, void *);
 extern boolean_t refhash_obj_valid(refhash_t *hp, const void *);
 
-#endif	/* _SYS_SCSI_ADAPTERS_MPTHASH_H */
+#endif	/* _SYS_REFHASH_H */
diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h
index 2d3800b946..4b70a77db8 100644
--- a/usr/src/uts/common/sys/resource.h
+++ b/usr/src/uts/common/sys/resource.h
@@ -23,6 +23,7 @@
  *
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -191,6 +192,7 @@ struct	rusage {
 #define	_RUSAGESYS_GETRUSAGE_CHLD	1	/* rusage child process */
 #define	_RUSAGESYS_GETRUSAGE_LWP	2	/* rusage lwp */
 #define	_RUSAGESYS_GETVMUSAGE		3	/* getvmusage */
+#define	_RUSAGESYS_INVALMAP		4	/* vm_map_inval */
 
 #if defined(_SYSCALL32)
 
diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h
index ca52f8d995..82cc08d326 100644
--- a/usr/src/uts/common/sys/rt.h
+++ b/usr/src/uts/common/sys/rt.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -31,8 +32,6 @@
 #ifndef _SYS_RT_H
 #define	_SYS_RT_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.4 */
-
 #include <sys/types.h>
 #include <sys/thread.h>
 
@@ -77,6 +76,16 @@ typedef struct	rtkparms {
 	int	rt_tqsig;	/* real-time time quantum signal */
 	uint_t	rt_cflags;	/* real-time control flags */
 } rtkparms_t;
+
+#define	RTGPPRIO0	100	/* Global priority for RT priority 0 */
+
+/*
+ * control flags (kparms->rt_cflags).
+ */
+#define	RT_DOPRI	0x01	/* change priority */
+#define	RT_DOTQ		0x02	/* change RT time quantum */
+#define	RT_DOSIG	0x04	/* change RT time quantum signal */
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
index 3983188fce..02116b45c4 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
+++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  * Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
  */
 
@@ -58,10 +58,10 @@
 
 #include <sys/byteorder.h>
 #include <sys/queue.h>
+#include <sys/refhash.h>
 #include <sys/isa_defs.h>
 #include <sys/sunmdi.h>
 #include <sys/mdi_impldefs.h>
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
 #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>
 #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h>
 #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h>
diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h
index e3bd2a77d3..030379488f 100644
--- a/usr/src/uts/common/sys/shm.h
+++ b/usr/src/uts/common/sys/shm.h
@@ -21,6 +21,7 @@
  */
 /*
  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2016 Joyent, Inc.
  *
  * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -120,6 +121,10 @@ struct shmid_ds {
 #define	SHM_LOCK	3	/* Lock segment in core */
 #define	SHM_UNLOCK	4	/* Unlock segment */
 
+#if defined(_KERNEL)
+#define	SHM_RMID	5	/* Private RMID for lx support */
+#endif
+
 #if !defined(_KERNEL)
 int shmget(key_t, size_t, int);
 int shmids(int *, uint_t, uint_t *);
diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h
index 4d8cdcede5..1eae2ca0a4 100644
--- a/usr/src/uts/common/sys/shm_impl.h
+++ b/usr/src/uts/common/sys/shm_impl.h
@@ -21,13 +21,12 @@
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #ifndef	_SYS_SHM_IMPL_H
 #define	_SYS_SHM_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/ipc_impl.h>
 #if defined(_KERNEL) || defined(_KMEMUSER)
 #include <sys/shm.h>
@@ -70,7 +69,11 @@ typedef struct kshmid {
 	time_t		shm_ctime;	/* last change time */
 	struct sptinfo	*shm_sptinfo;	/* info about ISM segment */
 	struct seg	*shm_sptseg;	/* pointer to ISM segment */
-	long		shm_sptprot;	/* was reserved (still a "long") */
+	ulong_t		shm_opts;
+					/*
+					 * Composed of: sptprot (uchar_t) and
+					 * RM_PENDING flag (1 bit).
+					 */
 } kshmid_t;
 
 /*
@@ -78,6 +81,14 @@ typedef struct kshmid {
  */
 #define	SHMSA_ISM	1	/* uses shared page table */
 
+/*
+ * shm_opts definitions
+ * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are
+ * used for additional options.
+ */
+#define	SHM_PROT_MASK	0xff
+#define	SHM_RM_PENDING	0x100
+
 typedef struct sptinfo {
 	struct as	*sptas;		/* dummy as ptr. for spt segment */
 } sptinfo_t;
diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h
index 8f0e1794f4..139784d578 100644
--- a/usr/src/uts/common/sys/signal.h
+++ b/usr/src/uts/common/sys/signal.h
@@ -22,6 +22,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -158,8 +159,8 @@ struct sigaction32 {
  * use of these symbols by applications is injurious
  *	to binary compatibility
  */
-#define	NSIG	74	/* valid signals range from 1 to NSIG-1 */
-#define	MAXSIG	73	/* size of u_signal[], NSIG-1 <= MAXSIG */
+#define	NSIG	75	/* valid signals range from 1 to NSIG-1 */
+#define	MAXSIG	74	/* size of u_signal[], NSIG-1 <= MAXSIG */
 #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */
 
 #define	MINSIGSTKSZ	2048
diff --git a/usr/src/uts/common/sys/signalfd.h b/usr/src/uts/common/sys/signalfd.h
index 2661d5a05f..89d0647020 100644
--- a/usr/src/uts/common/sys/signalfd.h
+++ b/usr/src/uts/common/sys/signalfd.h
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -75,13 +75,9 @@ extern int signalfd(int, const sigset_t *, int);
 #define	SIGNALFDMNRN_SIGNALFD	0
 #define	SIGNALFDMNRN_CLONE	1
 
-typedef struct sigfd_wake_list {
-	list_node_t sigfd_wl_lst;
-	void *sigfd_wl_state;
-} sigfd_wake_list_t;
-
 /*
  * This holds the proc_t state for a process which is using signalfd.
+ * Its presence and contents are protected by p_lock.
  */
 typedef struct sigfd_proc_state {
 	void (*sigfd_pollwake_cb)(void *, int);
diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h
index da8e3ab351..e55cd165aa 100644
--- a/usr/src/uts/common/sys/socket.h
+++ b/usr/src/uts/common/sys/socket.h
@@ -22,6 +22,7 @@
  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
  *
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -39,6 +40,9 @@
 
 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
 
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
 #ifndef	_SYS_SOCKET_H
 #define	_SYS_SOCKET_H
 
@@ -194,6 +198,7 @@ struct so_snd_bufinfo {
 #define	SO_SRCADDR	0x2001		/* Internal: AF_UNIX source address */
 #define	SO_FILEP	0x2002		/* Internal: AF_UNIX file pointer */
 #define	SO_UNIX_CLOSE	0x2003		/* Internal: AF_UNIX peer closed */
+#define	SO_REUSEPORT	0x2004		/* allow simultaneous port reuse */
 #endif	/* _KERNEL */
 
 /*
@@ -293,8 +298,9 @@ struct	linger {
 #define	AF_INET_OFFLOAD	30		/* Sun private; do not use */
 #define	AF_TRILL	31		/* TRILL interface */
 #define	AF_PACKET	32		/* PF_PACKET Linux socket interface */
+#define	AF_LX_NETLINK	33		/* Linux-compatible netlink */
 
-#define	AF_MAX		32
+#define	AF_MAX		33
 
 /*
  * Protocol families, same as address families for now.
@@ -334,6 +340,7 @@ struct	linger {
 #define	PF_INET_OFFLOAD	AF_INET_OFFLOAD	/* Sun private; do not use */
 #define	PF_TRILL	AF_TRILL
 #define	PF_PACKET	AF_PACKET
+#define	PF_LX_NETLINK	AF_LX_NETLINK
 
 #define	PF_MAX		AF_MAX
 
@@ -420,6 +427,7 @@ struct msghdr32 {
 #define	MSG_NOTIFICATION 0x100		/* Notification, not data */
 #define	MSG_XPG4_2	0x8000		/* Private: XPG4.2 flag */
 
+/* Obsolete but kept for compilation compatibility. Use IOV_MAX. */
 #define	MSG_MAXIOVLEN	16
 
 #ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index 52fa3a5822..da61975904 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -102,6 +103,7 @@ struct sockaddr_ux {
 
 typedef struct sonodeops sonodeops_t;
 typedef struct sonode sonode_t;
+typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *);
 
 struct sodirect_s;
 
@@ -244,6 +246,10 @@ struct sonode {
 	struct sof_instance	*so_filter_top;		/* top of stack */
 	struct sof_instance	*so_filter_bottom;	/* bottom of stack */
 	clock_t			so_filter_defertime;	/* time when deferred */
+
+	/* Kernel direct receive callbacks */
+	so_krecv_f		so_krecv_cb;		/* recv callback */
+	void			*so_krecv_arg;		/* recv cb arg */
 };
 
 #define	SO_HAVE_DATA(so)						\
@@ -297,15 +303,16 @@ struct sonode {
 #define	SS_OOBPEND		0x00002000 /* OOB pending or present - poll */
 #define	SS_HAVEOOBDATA		0x00004000 /* OOB data present */
 #define	SS_HADOOBDATA		0x00008000 /* OOB data consumed */
-#define	SS_CLOSING		0x00010000 /* in process of closing */
 
+#define	SS_CLOSING		0x00010000 /* in process of closing */
 #define	SS_FIL_DEFER		0x00020000 /* filter deferred notification */
 #define	SS_FILOP_OK		0x00040000 /* socket can attach filters */
 #define	SS_FIL_RCV_FLOWCTRL	0x00080000 /* filter asserted rcv flow ctrl */
+
 #define	SS_FIL_SND_FLOWCTRL	0x00100000 /* filter asserted snd flow ctrl */
 #define	SS_FIL_STOP		0x00200000 /* no more filter actions */
-
 #define	SS_SODIRECT		0x00400000 /* transport supports sodirect */
+#define	SS_FILOP_UNSF		0x00800000 /* block attaching unsafe filters */
 
 #define	SS_SENTLASTREADSIG	0x01000000 /* last rx signal has been sent */
 #define	SS_SENTLASTWRITESIG	0x02000000 /* last tx signal has been sent */
@@ -321,7 +328,8 @@ struct sonode {
 
 /*
  * Sockets that can fall back to TPI must ensure that fall back is not
- * initiated while a thread is using a socket.
+ * initiated while a thread is using a socket. Otherwise this disables all
+ * future filter attachment.
  */
 #define	SO_BLOCK_FALLBACK(so, fn)				\
 	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\
@@ -337,6 +345,24 @@ struct sonode {
 		}						\
 	}
 
+/*
+ * Sockets that can fall back to TPI must ensure that fall back is not
+ * initiated while a thread is using a socket: take the fallback lock as
+ * reader, dropping it and returning fn if fallback has already completed.
+ * Otherwise set SS_FILOP_UNSF, which blocks unsafe (but not safe) filters.
+ */
+#define	SO_BLOCK_FALLBACK_SAFE(so, fn)				\
+	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\
+	rw_enter(&(so)->so_fallback_rwlock, RW_READER);		\
+	if ((so)->so_state & SS_FALLBACK_COMP) {		\
+		rw_exit(&(so)->so_fallback_rwlock);		\
+		return (fn);					\
+	} else if (((so)->so_state & SS_FILOP_UNSF) == 0) {	\
+		mutex_enter(&(so)->so_lock);			\
+		(so)->so_state |= SS_FILOP_UNSF;		\
+		mutex_exit(&(so)->so_lock);			\
+	}
+
 #define	SO_UNBLOCK_FALLBACK(so)	{			\
 	rw_exit(&(so)->so_fallback_rwlock);		\
 }
@@ -368,6 +394,7 @@ struct sonode {
 /* The modes below are only for non-streams sockets */
 #define	SM_ACCEPTSUPP		0x400	/* can handle accept() */
 #define	SM_SENDFILESUPP		0x800	/* Private: proto supp sendfile  */
+#define	SM_DEFERERR		0x1000	/* Private: defer so_error delivery */
 
 /*
  * Socket versions. Used by the socket library when calling _so_socket().
@@ -946,6 +973,15 @@ extern struct sonode	*socreate(struct sockparams *, int, int, int, int,
 extern int	so_copyin(const void *, void *, size_t, int);
 extern int	so_copyout(const void *, void *, size_t, int);
 
+/*
+ * Functions to manipulate the use of direct receive callbacks. This should not
+ * be used outside of sockfs and ksocket. These are generally considered a use
+ * once interface for a socket and will cause all outstanding data on the socket
+ * to be flushed.
+ */
+extern int	so_krecv_set(sonode_t *, so_krecv_f, void *);
+extern void	so_krecv_unblock(sonode_t *);
+
 #endif
 
 /*
diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h
index 9f6d8b499b..c4dd6539de 100644
--- a/usr/src/uts/common/sys/sockfilter.h
+++ b/usr/src/uts/common/sys/sockfilter.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_SOCKFILTER_H
@@ -129,6 +130,15 @@ typedef struct sof_ops {
 
 #define	SOF_VERSION	1
 
+/*
+ * Flag indicating that the filter module is safe to attach after bind,
+ * getsockname, getsockopt or setsockopt calls. By default filters are unsafe
+ * so may not be attached after any socket operation. However, a safe filter
+ * can still be attached after one of the above calls. This makes attaching
+ * the filter less dependent on the initial socket setup order.
+ */
+#define	SOF_ATT_SAFE	0x1
+
 extern int	sof_register(int, const char *, const sof_ops_t *, int);
 extern int	sof_unregister(const char *);
 
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index f1bd429815..35e1cf64c7 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -29,6 +29,17 @@
 extern "C" {
 #endif
 
+/*
+ * Originally in illumos, we had an IP-centric view of the serialization queue
+ * abstraction. While that has useful properties, the implementation of squeues
+ * hardcodes various parts of the implementation of IP into it which makes it
+ * unsuitable for other consumers. To enable them, we created another interface,
+ * but opted not to port all of the functionality that IP uses in the form of
+ * ip_squeue.c. As other consumers need the functionality that IP has in
+ * squeues, we'll come up with more generic methods and add that functionality
+ * to <sys/gsqueue.h>. Please do not continue to use this header.
+ */
+
 #include <sys/types.h>
 #include <sys/processor.h>
 #include <sys/stream.h>
@@ -76,12 +87,13 @@ typedef enum {
 
 struct ip_recv_attr_s;
 extern void squeue_init(void);
-extern squeue_t *squeue_create(clock_t, pri_t);
+extern squeue_t *squeue_create(clock_t, pri_t, boolean_t);
 extern void squeue_bind(squeue_t *, processorid_t);
 extern void squeue_unbind(squeue_t *);
 extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
     uint32_t, struct ip_recv_attr_s *, int, uint8_t);
 extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
+extern void squeue_destroy(squeue_t *);
 
 struct conn_s;
 extern int squeue_synch_enter(struct conn_s *, mblk_t *);
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 22550886eb..d2418bbc15 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -117,6 +117,7 @@ struct squeue_s {
 	squeue_set_t	*sq_set;	/* managed by squeue creator */
 
 	pri_t		sq_priority;	/* squeue thread priority */
+	boolean_t	sq_isip;	/* use IP-centric features */
 
 	/* Keep the debug-only fields at the end of the structure */
 #ifdef DEBUG
@@ -165,6 +166,7 @@ struct squeue_s {
 #define	SQS_POLL_RESTART_DONE	0x01000000
 #define	SQS_POLL_THR_QUIESCE	0x02000000
 #define	SQS_PAUSE		0x04000000 /* The squeue has been paused */
+#define	SQS_EXIT		0x08000000 /* squeue is being torn down */
 
 #define	SQS_WORKER_THR_CONTROL          \
 	(SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP)
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index a04019a9ce..28289649dd 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -628,16 +629,11 @@ struct stroptions {
 /*
  * Structure for rw (read/write) procedure calls. A pointer
  * to a struiod_t is passed as a parameter to the rwnext() call.
- *
- * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
- *	 as there isn't a formal definition of IOV_MAX ???
  */
-#define	DEF_IOV_MAX	16
-
 typedef struct struiod {
 	mblk_t		*d_mp;		/* pointer to mblk (chain) */
 	uio_t		d_uio;		/* uio info */
-	iovec_t d_iov[DEF_IOV_MAX];	/* iov referenced by uio */
+	iovec_t 	*d_iov;		/* iov referenced by uio */
 } struiod_t;
 
 /*
diff --git a/usr/src/uts/common/sys/sysevent.h b/usr/src/uts/common/sys/sysevent.h
index 46a800e62b..255e98b871 100644
--- a/usr/src/uts/common/sys/sysevent.h
+++ b/usr/src/uts/common/sys/sysevent.h
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_SYSEVENT_H
@@ -67,10 +68,12 @@ extern "C" {
 #define	SE_KERN_PID	0
 
 #define	SUNW_VENDOR	"SUNW"
+#define	ILLUMOS_VENDOR	"ILLUMOS"
 #define	SE_USR_PUB	"usr:"
 #define	SE_KERN_PUB	"kern:"
 #define	SUNW_KERN_PUB	SUNW_VENDOR":"SE_KERN_PUB
 #define	SUNW_USR_PUB	SUNW_VENDOR":"SE_USR_PUB
+#define	ILLUMOS_KERN_PUB	ILLUMOS_VENDOR":"SE_KERN_PUB
 
 /*
  * Event header and attribute value limits
diff --git a/usr/src/uts/common/sys/sysevent/datalink.h b/usr/src/uts/common/sys/sysevent/datalink.h
new file mode 100644
index 0000000000..592ef5bdde
--- /dev/null
+++ b/usr/src/uts/common/sys/sysevent/datalink.h
@@ -0,0 +1,54 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_SYSEVENT_DATALINK_H
+#define	_SYS_SYSEVENT_DATALINK_H
+
+/*
+ * Datalink System Event payloads
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Event schema for ESC_DATALINK_LINK_STATE
+ *
+ * 	Event Class	- EC_DATALINK
+ * 	Event Sub-Class	- ESC_DATALINK_LINK_STATE
+ *
+ * 	Attribute Name	- DATALINK_EV_LINK_NAME
+ * 	Attribute Type	- SE_DATA_TYPE_STRING
+ * 	Attribute Value	- [Name of the datalink]
+ *
+ * 	Attribute Name	- DATALINK_EV_LINK_ID
+ * 	Attribute Type	- SE_DATA_TYPE_INT32
+ * 	Attribute Value	- [datalink_id_t for the device]
+ *
+ * 	Attribute Name	- DATALINK_EV_ZONE_ID
+ * 	Attribute Type	- SE_DATA_TYPE_INT32
+ * 	Attribute Value	- [zoneid_t of the zone the datalink is in]
+ */
+
+#define	DATALINK_EV_LINK_NAME		"link"
+#define	DATALINK_EV_LINK_ID		"linkid"
+#define	DATALINK_EV_ZONE_ID		"zone"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SYSEVENT_DATALINK_H */
diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h
index 25401cec53..0a78d0310b 100644
--- a/usr/src/uts/common/sys/sysevent/eventdefs.h
+++ b/usr/src/uts/common/sys/sysevent/eventdefs.h
@@ -267,9 +267,11 @@ extern "C" {
 #define	ESC_ZFS_POOL_REGUID		"ESC_ZFS_pool_reguid"
 
 /*
- * datalink subclass definitions.
+ * datalink subclass definitions. Supporting attributes for datalink state found
+ * in sys/sysevent/datalink.h.
  */
 #define	ESC_DATALINK_PHYS_ADD	"ESC_datalink_phys_add"	/* new physical link */
+#define	ESC_DATALINK_LINK_STATE	"ESC_datalink_link_state"	/* link state */
 
 /*
  * VRRP subclass definitions. Supporting attributes (name/value paris) are
diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h
index d43974451e..17e509d4d8 100644
--- a/usr/src/uts/common/sys/systrace.h
+++ b/usr/src/uts/common/sys/systrace.h
@@ -22,13 +22,12 @@
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  */
 
 #ifndef _SYS_SYSTRACE_H
 #define	_SYS_SYSTRACE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dtrace.h>
 
 #ifdef	__cplusplus
@@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent;
 extern systrace_sysent_t *systrace_sysent32;
 
 extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
-    uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t,
-    uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
 
 extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1,
-    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+    uintptr_t arg6, uintptr_t arg7);
 
 #ifdef _SYSCALL32_IMPL
 extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1,
-    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
+    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+    uintptr_t arg6, uintptr_t arg7);
 #endif
 
 #endif
diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h
index 09be20858d..889a7096cd 100644
--- a/usr/src/uts/common/sys/termios.h
+++ b/usr/src/uts/common/sys/termios.h
@@ -361,6 +361,24 @@ extern pid_t tcgetsid(int);
 #define	TCSETSF		(_TIOC|16)
 
 /*
+ * linux terminal ioctls we need to be aware of
+ */
+#define	TIOCSETLD	(_TIOC|123)	/* set line discipline parms */
+#define	TIOCGETLD	(_TIOC|124)	/* get line discipline parms */
+
+/*
+ * On Solaris, VMIN and VTIME overlap with VEOF and VEOL. This is perfectly
+ * legal, except that Linux expects them to be separate, so this structure
+ * carries each of the four values separately.
+ */
+struct lx_cc {
+	unsigned char veof;	/* veof value */
+	unsigned char veol;	/* veol value */
+	unsigned char vmin;	/* vmin value */
+	unsigned char vtime;	/* vtime value */
+};
+
+/*
  * NTP PPS ioctls
  */
 #define	TIOCGPPS	(_TIOC|125)
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index d917944edf..6a1c36f2e7 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -24,6 +24,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
+ */
+
 #ifndef	_SYS_THREAD_H
 #define	_SYS_THREAD_H
 
@@ -68,6 +72,8 @@ typedef struct ctxop {
 	void	(*free_op)(void *, int); /* function which frees the context */
 	void	*arg;		/* argument to above functions, ctx pointer */
 	struct ctxop *next;	/* next context ops */
+	hrtime_t save_ts;		/* timestamp of last save */
+	hrtime_t restore_ts;		/* timestamp of last restore */
 } ctxop_t;
 
 /*
@@ -366,7 +372,7 @@ typedef struct _kthread {
 #define	T_WOULDBLOCK	0x0020	/* for lockfs */
 #define	T_DONTBLOCK	0x0040	/* for lockfs */
 #define	T_DONTPEND	0x0080	/* for lockfs */
-#define	T_SYS_PROF	0x0100	/* profiling on for duration of system call */
+#define	T_SPLITSTK	0x0100	/* kernel stack is currently split */
 #define	T_WAITCVSEM	0x0200	/* waiting for a lwp_cv or lwp_sema on sleepq */
 #define	T_WATCHPT	0x0400	/* thread undergoing a watchpoint emulation */
 #define	T_PANIC		0x0800	/* thread initiated a system panic */
@@ -414,8 +420,9 @@ typedef struct _kthread {
 #define	TS_RESUME	0x1000	/* setrun() by CPR resume process */
 #define	TS_CREATE	0x2000	/* setrun() by syslwp_create() */
 #define	TS_RUNQMATCH	0x4000	/* exact run queue balancing by setbackdq() */
+#define	TS_BSTART	0x8000	/* setrun() by brand */
 #define	TS_ALLSTART	\
-	(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE)
+	(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART)
 #define	TS_ANYWAITQ	(TS_PROJWAITQ|TS_ZONEWAITQ)
 
 /*
@@ -443,6 +450,10 @@ typedef struct _kthread {
 #define	ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
 			!((t)->t_schedflag & TS_PSTART))
 
+/* True if thread is stopped for a brand-specific reason */
+#define	BSTOPPED(t)	((t)->t_state == TS_STOPPED && \
+			    !((t)->t_schedflag & TS_BSTART))
+
 /* True if thread is asleep and wakeable */
 #define	ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \
 			((t)->t_flag & T_WAKEABLE)))
diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h
index d5168c9b2c..c14a3bf11e 100644
--- a/usr/src/uts/common/sys/uadmin.h
+++ b/usr/src/uts/common/sys/uadmin.h
@@ -23,6 +23,7 @@
  *
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -159,7 +160,7 @@ extern kmutex_t ualock;
 extern void mdboot(int, int, char *, boolean_t);
 extern void mdpreboot(int, int, char *);
 extern int kadmin(int, int, void *, cred_t *);
-extern void killall(zoneid_t);
+extern void killall(zoneid_t, boolean_t);
 #endif
 
 extern int uadmin(int, int, uintptr_t);
diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h
index e803efeb45..5663929bfb 100644
--- a/usr/src/uts/common/sys/uio.h
+++ b/usr/src/uts/common/sys/uio.h
@@ -145,7 +145,8 @@ typedef struct uioa_s {
  */
 typedef enum xuio_type {
 	UIOTYPE_ASYNCIO,
-	UIOTYPE_ZEROCOPY
+	UIOTYPE_ZEROCOPY,
+	UIOTYPE_PEEKSIZE
 } xuio_type_t;
 
 typedef struct xuio {
@@ -175,6 +176,15 @@ typedef struct xuio {
 			int xu_zc_rw;	/* read or write buffer */
 			void *xu_zc_priv;	/* fs specific */
 		} xu_zc;
+
+		/*
+		 * Peek Size Support -- facilitate peeking at the size of a
+		 * waiting message on a socket.
+		 */
+		struct {
+			ssize_t xu_ps_size;	/* size of waiting msg */
+			boolean_t xu_ps_set;	/* was size calculated? */
+		} xu_ps;
 	} xu_ext;
 } xuio_t;
 
diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h
index a7bff8dd52..66250a3f2b 100644
--- a/usr/src/uts/common/sys/user.h
+++ b/usr/src/uts/common/sys/user.h
@@ -26,7 +26,7 @@
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved	*/
 /*
- * Copyright (c) 2012 Joyent, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 
@@ -185,9 +185,9 @@ typedef struct {		/* kernel syscall set type */
  * This value should not be changed in a patch.
  */
 #if defined(__sparc)
-#define	__KERN_NAUXV_IMPL 20
+#define	__KERN_NAUXV_IMPL 24
 #elif defined(__i386) || defined(__amd64)
-#define	__KERN_NAUXV_IMPL 22
+#define	__KERN_NAUXV_IMPL 26
 #endif
 
 struct execsw;
@@ -211,6 +211,7 @@ typedef	struct	user {
 	int	u_argc;			/* value of argc passed to main() */
 	uintptr_t u_argv;		/* value of argv passed to main() */
 	uintptr_t u_envp;		/* value of envp passed to main() */
+	uintptr_t u_commpagep;		/* address of mapped comm page */
 
 	/*
 	 * These fields are protected by p_lock:
diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h
index 1aa4a8ee6d..c2954cbc29 100644
--- a/usr/src/uts/common/sys/vm_usage.h
+++ b/usr/src/uts/common/sys/vm_usage.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_VM_USAGE_H
@@ -79,8 +80,9 @@ extern "C" {
 					/* zoneid */
 #define	VMUSAGE_COL_EUSERS	0x2000	/* same as VMUSAGE_COL_RUSERS, but by */
 					/* euser */
+#define	VMUSAGE_A_ZONE		0x4000	/* rss/swap for a specified zone */
 
-#define	VMUSAGE_MASK		0x3fff  /* all valid flags for getvmusage() */
+#define	VMUSAGE_MASK		0x7fff  /* all valid flags for getvmusage() */
 
 typedef struct vmusage {
 	id_t	vmu_zoneid;		/* zoneid, or ALL_ZONES for */
@@ -108,6 +110,7 @@ extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);
 
 int vm_getusage(uint_t, time_t, vmusage_t *, size_t *, int);
 void vm_usage_init();
+int vm_map_inval(pid_t, caddr_t, size_t);
 
 #endif	/* _KERNEL */
 
diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h
index 6122b6cd2f..c7b41730b6 100644
--- a/usr/src/uts/common/sys/vmsystm.h
+++ b/usr/src/uts/common/sys/vmsystm.h
@@ -19,6 +19,9 @@
  * CDDL HEADER END
  */
 /*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+/*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -159,6 +162,8 @@ extern	void	*boot_virt_alloc(void *addr, size_t size);
 
 extern	size_t	exec_get_spslew(void);
 
+extern	caddr_t	map_userlimit(proc_t *pp, struct as *as, int flags);
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h
new file mode 100644
index 0000000000..bc7c9c3122
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd.h
@@ -0,0 +1,141 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VND_H
+#define	_SYS_VND_H
+
+#include <sys/types.h>
+#include <sys/vnd_errno.h>
+#include <sys/frameio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * We distinguish between normal ioctls and private ioctls we issue to our
+ * streams version. Streams ioctls have the upper bit set in the lowest byte.
+ * Note that there are no STREAMs ioctls for userland and all definitions
+ * related to them are not present in this file.
+ */
+#define	VND_IOC		(('v' << 24) | ('n' << 16) | ('d' << 8))
+
+/*
+ * Attach the current minor instance to a given dlpi datalink identified by a
+ * vnd_ioc_attach_t argument. This fails if it's already been attached. Note
+ * that unlike the other ioctls, this is passed directly as opposed to every
+ * other function which is passed as a pointer to the value.
+ */
+#define	VND_IOC_ATTACH		(VND_IOC | 0x1)
+
+#define	VND_NAMELEN	32
+
+typedef struct vnd_ioc_attach {
+	char		via_name[VND_NAMELEN];
+	zoneid_t	via_zoneid;
+	uint32_t	via_errno;
+} vnd_ioc_attach_t;
+
+/*
+ * Link the current minor instance into the /devices name space.
+ *
+ * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid,
+ * vil_name. The device will be namespaced to the zone. The global zone will be
+ * able to see all minor nodes. In the zone, only the /dev entries will exist.
+ * At this time, a given device can only have one link at a time. Note that a
+ * user cannot specify the zone to pass in, rather it is the zone that the
+ * device was attached in.
+ */
+#define	VND_IOC_LINK		(VND_IOC | 0x2)
+
+typedef struct vnd_ioc_link {
+	char		vil_name[VND_NAMELEN];
+	uint32_t	vil_errno;
+} vnd_ioc_link_t;
+
+/*
+ * Unlink the opened minor instance from the /devices name space. A zone may use
+ * this to unlink an extant entry in /dev; however, they will not be able to
+ * link it in again.
+ */
+#define	VND_IOC_UNLINK		(VND_IOC | 0x3)
+typedef struct vnd_ioc_unlink {
+	uint32_t viu_errno;
+} vnd_ioc_unlink_t;
+
+/*
+ * Controls to get and set the current receive buffer size.
+ */
+typedef struct vnd_ioc_buf {
+	uint64_t	vib_size;
+	uint32_t	vib_filler;
+	uint32_t	vib_errno;
+} vnd_ioc_buf_t;
+
+#define	VND_IOC_GETRXBUF	(VND_IOC | 0x04)
+#define	VND_IOC_SETRXBUF	(VND_IOC | 0x05)
+#define	VND_IOC_GETMAXBUF	(VND_IOC | 0x06)
+#define	VND_IOC_GETTXBUF	(VND_IOC | 0x07)
+#define	VND_IOC_SETTXBUF	(VND_IOC | 0x08)
+#define	VND_IOC_GETMINTU	(VND_IOC | 0x09)
+#define	VND_IOC_GETMAXTU	(VND_IOC | 0x0a)
+
+/*
+ * Information and listing ioctls
+ *
+ * This gets information about all of the active vnd instances. vl_actents is
+ * always updated to the number of active instances and vl_nents is the number
+ * of vnd_ioc_info_t elements allocated in vl_ents.
+ */
+typedef struct vnd_ioc_info {
+	uint32_t vii_version;
+	zoneid_t vii_zone;
+	char vii_name[VND_NAMELEN];
+	char vii_datalink[VND_NAMELEN];
+} vnd_ioc_info_t;
+
+typedef struct vnd_ioc_list {
+	uint_t vl_nents;		/* elements allocated in vl_ents */
+	uint_t vl_actents;		/* number of active instances */
+	vnd_ioc_info_t *vl_ents;	/* array of vl_nents elements */
+} vnd_ioc_list_t;
+
+#ifdef _KERNEL
+
+typedef struct vnd_ioc_list32 {
+	uint_t vl_nents;
+	uint_t vl_actents;
+	caddr32_t vl_ents;
+} vnd_ioc_list32_t;
+
+#endif	/* _KERNEL */
+
+#define	VND_IOC_LIST		(VND_IOC | 0x20)
+
+/*
+ * Framed I/O ioctls
+ *
+ * Users should use the standard frameio_t as opposed to a vnd specific type.
+ * This is a consolidation private ioctl pending further stability in the form of
+ * specific system work.
+ */
+#define	VND_IOC_FRAMEIO_READ	(VND_IOC | 0x30)
+#define	VND_IOC_FRAMEIO_WRITE	(VND_IOC | 0x31)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_H */
diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h
new file mode 100644
index 0000000000..89e5fc2543
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd_errno.h
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc.  All rights reserved.
+ */
+
+#ifndef _SYS_VND_ERRNO_H
+#define	_SYS_VND_ERRNO_H
+
+/*
+ * This header contains all of the available vnd errors.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum vnd_errno {
+	VND_E_SUCCESS = 0,		/* no error */
+	VND_E_NOMEM,			/* no memory */
+	VND_E_NODATALINK,		/* no such datalink */
+	VND_E_NOTETHER,			/* not DL_ETHER */
+	VND_E_DLPIINVAL,		/* Unknown DLPI failures */
+	VND_E_ATTACHFAIL,		/* DL_ATTACH_REQ failed */
+	VND_E_BINDFAIL,			/* DL_BIND_REQ failed */
+	VND_E_PROMISCFAIL,		/* DL_PROMISCON_REQ failed */
+	VND_E_DIRECTFAIL,		/* DLD_CAPAB_DIRECT enable failed */
+	VND_E_CAPACKINVAL,		/* bad dl_capability_ack_t */
+	VND_E_SUBCAPINVAL,		/* bad dl_capability_sub_t */
+	VND_E_DLDBADVERS,		/* bad dld version */
+	VND_E_KSTATCREATE,		/* failed to create kstats */
+	VND_E_NODEV,			/* no such vnd link */
+	VND_E_NONETSTACK,		/* netstack doesn't exist */
+	VND_E_ASSOCIATED,		/* device already associated */
+	VND_E_ATTACHED,			/* device already attached */
+	VND_E_LINKED,			/* device already linked */
+	VND_E_BADNAME,			/* invalid name */
+	VND_E_PERM,			/* can't touch this */
+	VND_E_NOZONE,			/* no such zone */
+	VND_E_STRINIT,		/* failed to initialize vnd stream module */
+	VND_E_NOTATTACHED,		/* device not attached */
+	VND_E_NOTLINKED,		/* device not linked */
+	VND_E_LINKEXISTS,	/* another device has the same link name */
+	VND_E_MINORNODE,		/* failed to create minor node */
+	VND_E_BUFTOOBIG,		/* requested buffer size is too large */
+	VND_E_BUFTOOSMALL,		/* requested buffer size is too small */
+	VND_E_DLEXCL,			/* unable to get dlpi excl access */
+	VND_E_DIRECTNOTSUP,
+			/* DLD direct capability not supported over data link */
+	VND_E_BADPROPSIZE,		/* invalid property size */
+	VND_E_BADPROP,			/* invalid property */
+	VND_E_PROPRDONLY,		/* property is read only */
+	VND_E_SYS,			/* unexpected system error */
+	VND_E_CAPABPASS,
+			/* capabilities invalid, pass-through module detected */
+	VND_E_UNKNOWN			/* unknown error */
+} vnd_errno_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_ERRNO_H */
diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h
index 7e50091347..1a91158da6 100644
--- a/usr/src/uts/common/sys/vnic_impl.h
+++ b/usr/src/uts/common/sys/vnic_impl.h
@@ -21,7 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2014 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #ifndef	_SYS_VNIC_IMPL_H
@@ -65,6 +65,7 @@ typedef struct vnic_s {
 
 	uint32_t		vn_hcksum_txflags;
 	uint32_t		vn_mtu;
+	link_state_t		vn_ls;
 } vnic_t;
 
 #define	vn_mch	vn_mc_handles[0]
diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h
index e4d43cea7f..d12f6c4046 100644
--- a/usr/src/uts/common/sys/vnode.h
+++ b/usr/src/uts/common/sys/vnode.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -738,12 +738,14 @@ typedef enum vnevent	{
 	VE_RMDIR	= 4,	/* Remove of directory vnode's name */
 	VE_CREATE	= 5,	/* Create with vnode's name which exists */
 	VE_LINK		= 6, 	/* Link with vnode's name as source */
-	VE_RENAME_DEST_DIR	= 7, 	/* Rename with vnode as target dir */
+	VE_RENAME_DEST_DIR = 7,	/* Rename with vnode as target dir */
 	VE_MOUNTEDOVER	= 8, 	/* File or Filesystem got mounted over vnode */
 	VE_TRUNCATE = 9,	/* Truncate */
 	VE_PRE_RENAME_SRC = 10,	/* Pre-rename, with vnode as source */
 	VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */
-	VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */
+	VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */
+	VE_RENAME_SRC_DIR = 13,	/* Rename with vnode as source dir */
+	VE_RESIZE	= 14	/* Resize/truncate to non-zero offset */
 } vnevent_t;
 
 /*
@@ -1298,7 +1300,8 @@ void	vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *);
 void	vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *);
 void	vnevent_create(vnode_t *, caller_context_t *);
 void	vnevent_link(vnode_t *, caller_context_t *);
-void	vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct);
+void	vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *,
+    caller_context_t *ct);
 void	vnevent_mountedover(vnode_t *, caller_context_t *);
 void	vnevent_truncate(vnode_t *, caller_context_t *);
 int	vnevent_support(vnode_t *, caller_context_t *);
@@ -1308,6 +1311,7 @@ void	vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *,
 	    caller_context_t *);
 void	vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *,
 	    caller_context_t *);
+void	vnevent_resize(vnode_t *, caller_context_t *);
 
 /* Vnode specific data */
 void vsd_create(uint_t *, void (*)(void *));
@@ -1337,6 +1341,9 @@ u_longlong_t	fs_new_caller_id();
 
 int	vn_vmpss_usepageio(vnode_t *);
 
+/* Empty v_path placeholder */
+extern char *vn_vpath_empty;
+
 /*
  * Needed for use of IS_VMODSORT() in kernel.
  */
diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h
new file mode 100644
index 0000000000..d87786b507
--- /dev/null
+++ b/usr/src/uts/common/sys/vxlan.h
@@ -0,0 +1,47 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_VXLAN_H
+#define	_SYS_VXLAN_H
+
+/*
+ * Common VXLAN information
+ */
+
+#include <sys/inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Sizes in bytes */
+#define	VXLAN_HDR_LEN	8	/* equals sizeof (vxlan_hdr_t) */
+#define	VXLAN_ID_LEN	3	/* likely the 24-bit VNI width; confirm */
+
+#define	VXLAN_F_VDI	0x08000000	/* likely RFC 7348 I flag; confirm */
+#define	VXLAN_ID_SHIFT	8	/* likely VNI position in vxlan_id; confirm */
+
+#pragma pack(1)
+typedef struct vxlan_hdr {
+	uint32_t vxlan_flags;	/* flags word; likely holds VXLAN_F_VDI */
+	uint32_t vxlan_id;	/* id word; likely uses VXLAN_ID_SHIFT */
+} vxlan_hdr_t;
+#pragma pack()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VXLAN_H */
diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h
new file mode 100644
index 0000000000..e08d75ecba
--- /dev/null
+++ b/usr/src/uts/common/sys/zfd.h
@@ -0,0 +1,78 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_ZFD_H
+#define	_SYS_ZFD_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Minor node name of the global zone side (often called the "master" side)
+ * of the zfd dev.
+ */
+#define	ZFD_MASTER_NAME	"master"
+
+/*
+ * Minor node name of the non-global zone side (often called the "slave"
+ * side) of the zfd dev.
+ */
+#define	ZFD_SLAVE_NAME	"slave"
+
+#define	ZFD_NAME_LEN	16
+
+/*
+ * ZFD_IOC forms the base for all zfd ioctls.
+ */
+#define	ZFD_IOC		(('Z' << 24) | ('f' << 16) | ('d' << 8))
+
+/*
+ * This ioctl tells the slave side it should push the TTY stream modules
+ * so that the fd looks like a tty.
+ */
+#define	ZFD_MAKETTY		(ZFD_IOC | 0)
+
+/*
+ * This ioctl puts a hangup into the stream so that the slave side sees EOF.
+ */
+#define	ZFD_EOF			(ZFD_IOC | 1)
+
+/*
+ * This ioctl succeeds if the slave side is open.
+ */
+#define	ZFD_HAS_SLAVE		(ZFD_IOC | 2)
+
+/*
+ * This ioctl links two streams into a multiplexer configuration for in-zone
+ * logging.
+ */
+#define	ZFD_MUX			(ZFD_IOC | 3)
+
+/*
+ * This ioctl controls the flow control setting for the log multiplexer stream
+ * (1 = true, 0 = false). The default is false which implies teeing into the
+ * log stream is "best-effort" but data will be discarded if the stream
+ * becomes full. If set and the log stream begins to fill up, the primary
+ * stream will stop flowing.
+ */
+#define	ZFD_MUX_FLOWCON		(ZFD_IOC | 4)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFD_H */
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 2e69b0d1c7..754f8e3978 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -20,9 +20,9 @@
  */
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2016, Joyent, Inc.
  */
 
 #ifndef _SYS_ZONE_H
@@ -97,13 +97,19 @@ extern "C" {
 #define	ZONE_ATTR_INITNAME	9
 #define	ZONE_ATTR_BOOTARGS	10
 #define	ZONE_ATTR_BRAND		11
-#define	ZONE_ATTR_PHYS_MCAP	12
+#define	ZONE_ATTR_PMCAP_NOVER	12
 #define	ZONE_ATTR_SCHED_CLASS	13
 #define	ZONE_ATTR_FLAGS		14
 #define	ZONE_ATTR_HOSTID	15
 #define	ZONE_ATTR_FS_ALLOWED	16
 #define	ZONE_ATTR_NETWORK	17
+#define	ZONE_ATTR_DID		18
+#define	ZONE_ATTR_PMCAP_PAGEOUT	19
 #define	ZONE_ATTR_INITNORESTART	20
+#define	ZONE_ATTR_PG_FLT_DELAY	21
+#define	ZONE_ATTR_RSS		22
+#define	ZONE_ATTR_APP_SVC_CT	23
+#define	ZONE_ATTR_SCHED_FIXEDHI	24
 
 /* Start of the brand-specific attribute namespace */
 #define	ZONE_ATTR_BRAND_ATTRS	32768
@@ -184,6 +190,7 @@ typedef struct {
 	uint32_t doi;			/* DOI for label */
 	caddr32_t label;		/* label associated with zone */
 	int flags;
+	zoneid_t zoneid;		/* requested zoneid */
 } zone_def32;
 #endif
 typedef struct {
@@ -200,6 +207,7 @@ typedef struct {
 	uint32_t doi;			/* DOI for label */
 	const bslabel_t *label;		/* label associated with zone */
 	int flags;
+	zoneid_t zoneid;		/* requested zoneid */
 } zone_def;
 
 /* extended error information */
@@ -244,9 +252,12 @@ typedef enum zone_cmd {
 typedef struct zone_cmd_arg {
 	uint64_t	uniqid;		/* unique "generation number" */
 	zone_cmd_t	cmd;		/* requested action */
-	uint32_t	_pad;		/* need consistent 32/64 bit alignmt */
+	int		status;		/* init status on shutdown */
+	uint32_t	debug;		/* enable brand hook debug */
 	char locale[MAXPATHLEN];	/* locale in which to render messages */
 	char bootbuf[BOOTARGS_MAX];	/* arguments passed to zone_boot() */
+	/* Needed for 32/64 zoneadm -> zoneadmd door arg size check. */
+	int		pad;
 } zone_cmd_arg_t;
 
 /*
@@ -372,7 +383,7 @@ typedef struct zone_dataset {
 } zone_dataset_t;
 
 /*
- * structure for zone kstats
+ * structure for rctl zone kstats
  */
 typedef struct zone_kstat {
 	kstat_named_t zk_zonename;
@@ -383,12 +394,57 @@ typedef struct zone_kstat {
 struct cpucap;
 
 typedef struct {
+	hrtime_t	cycle_start;
+	uint_t		cycle_cnt;
+	hrtime_t	zone_avg_cnt;
+} sys_zio_cntr_t;
+
+typedef struct {
+	kstat_named_t	zv_zonename;
+	kstat_named_t	zv_nread;
+	kstat_named_t	zv_reads;
+	kstat_named_t	zv_rtime;
+	kstat_named_t	zv_rlentime;
+	kstat_named_t	zv_rcnt;
+	kstat_named_t	zv_nwritten;
+	kstat_named_t	zv_writes;
+	kstat_named_t	zv_wtime;
+	kstat_named_t	zv_wlentime;
+	kstat_named_t	zv_wcnt;
+	kstat_named_t	zv_10ms_ops;
+	kstat_named_t	zv_100ms_ops;
+	kstat_named_t	zv_1s_ops;
+	kstat_named_t	zv_10s_ops;
+	kstat_named_t 	zv_delay_cnt;
+	kstat_named_t	zv_delay_time;
+} zone_vfs_kstat_t;
+
+typedef struct {
+	kstat_named_t	zz_zonename;
+	kstat_named_t	zz_nread;
+	kstat_named_t	zz_reads;
+	kstat_named_t	zz_rtime;
+	kstat_named_t	zz_rlentime;
+	kstat_named_t	zz_nwritten;
+	kstat_named_t	zz_writes;
+	kstat_named_t	zz_waittime;
+} zone_zfs_kstat_t;
+
+typedef struct {
 	kstat_named_t	zm_zonename;
+	kstat_named_t	zm_rss;
+	kstat_named_t	zm_phys_cap;
+	kstat_named_t	zm_swap;
+	kstat_named_t	zm_swap_cap;
+	kstat_named_t	zm_nover;
+	kstat_named_t	zm_pagedout;
 	kstat_named_t	zm_pgpgin;
 	kstat_named_t	zm_anonpgin;
 	kstat_named_t	zm_execpgin;
 	kstat_named_t	zm_fspgin;
 	kstat_named_t	zm_anon_alloc_fail;
+	kstat_named_t	zm_pf_throttle;
+	kstat_named_t	zm_pf_throttle_usec;
 } zone_mcap_kstat_t;
 
 typedef struct {
@@ -447,6 +503,7 @@ typedef struct zone {
 	 */
 	list_node_t	zone_linkage;
 	zoneid_t	zone_id;	/* ID of zone */
+	zoneid_t	zone_did;	/* persistent debug ID of zone */
 	uint_t		zone_ref;	/* count of zone_hold()s on zone */
 	uint_t		zone_cred_ref;	/* count of zone_hold_cred()s on zone */
 	/*
@@ -499,10 +556,11 @@ typedef struct zone {
 	kcondvar_t	zone_cv;	/* used to signal state changes */
 	struct proc	*zone_zsched;	/* Dummy kernel "zsched" process */
 	pid_t		zone_proc_initpid; /* pid of "init" for this zone */
-	char		*zone_initname;	/* fs path to 'init' */
+	char		*zone_initname;		/* fs path to 'init' */
+	int		zone_init_status;	/* init's exit status */
 	int		zone_boot_err;  /* for zone_boot() if boot fails */
 	char		*zone_bootargs;	/* arguments passed via zone_boot() */
-	uint64_t	zone_phys_mcap;	/* physical memory cap */
+	rctl_qty_t	zone_phys_mem_ctl;	/* current phys. memory limit */
 	/*
 	 * zone_kthreads is protected by zone_status_lock.
 	 */
@@ -540,9 +598,12 @@ typedef struct zone {
 	tsol_mlp_list_t zone_mlps;	/* MLPs on zone-private addresses */
 
 	boolean_t	zone_restart_init;	/* Restart init if it dies? */
+	boolean_t	zone_reboot_on_init_exit; /* Reboot if init dies? */
+	boolean_t	zone_setup_app_contract; /* setup contract? */
 	struct brand	*zone_brand;		/* zone's brand */
 	void 		*zone_brand_data;	/* store brand specific data */
 	id_t		zone_defaultcid;	/* dflt scheduling class id */
+	boolean_t	zone_fixed_hipri;	/* fixed sched. hi prio */
 	kstat_t		*zone_swapresv_kstat;
 	kstat_t		*zone_lockedmem_kstat;
 	/*
@@ -551,6 +612,37 @@ typedef struct zone {
 	list_t		zone_dl_list;
 	netstack_t	*zone_netstack;
 	struct cpucap	*zone_cpucap;	/* CPU caps data */
+
+	/*
+	 * Data and counters used for ZFS fair-share disk IO.
+	 */
+	rctl_qty_t	zone_zfs_io_pri;	/* ZFS IO priority */
+	uint_t		zone_zfs_queued[2];	/* sync I/O enqueued count */
+	uint64_t	zone_zfs_weight;	/* used to prevent starvation */
+	uint64_t	zone_io_util;		/* IO utilization metric */
+	boolean_t	zone_io_util_above_avg;	/* IO util percent > avg. */
+	uint16_t	zone_io_delay;		/* IO delay on logical r/w */
+	kmutex_t	zone_stg_io_lock;	/* protects IO window data */
+	sys_zio_cntr_t	zone_rd_ops;		/* Counters for ZFS reads, */
+	sys_zio_cntr_t	zone_wr_ops;		/* writes and */
+	sys_zio_cntr_t	zone_lwr_ops;		/* logical writes. */
+
+	/*
+	 * kstats and counters for VFS ops and bytes.
+	 */
+	kmutex_t	zone_vfs_lock;		/* protects VFS statistics */
+	kstat_t		*zone_vfs_ksp;
+	kstat_io_t	zone_vfs_rwstats;
+	zone_vfs_kstat_t *zone_vfs_stats;
+
+	/*
+	 * kstats for ZFS I/O ops and bytes.
+	 */
+	kmutex_t	zone_zfs_lock;		/* protects ZFS statistics */
+	kstat_t		*zone_zfs_ksp;
+	kstat_io_t	zone_zfs_rwstats;
+	zone_zfs_kstat_t *zone_zfs_stats;
+
 	/*
 	 * Solaris Auditing per-zone audit context
 	 */
@@ -569,6 +661,13 @@ typedef struct zone {
 						/* zone_rctls->rcs_lock */
 	kstat_t		*zone_nprocs_kstat;
 
+	/*
+	 * kstats and counters for physical memory capping.
+	 */
+	rctl_qty_t	zone_phys_mem;	/* current bytes of phys. mem. (RSS) */
+	kstat_t		*zone_physmem_kstat;
+	uint64_t	zone_mcap_nover;	/* # of times over phys. cap */
+	uint64_t	zone_mcap_pagedout;	/* bytes of mem. paged out */
 	kmutex_t	zone_mcap_lock;	/* protects mcap statistics */
 	kstat_t		*zone_mcap_ksp;
 	zone_mcap_kstat_t *zone_mcap_stats;
@@ -577,6 +676,11 @@ typedef struct zone {
 	uint64_t	zone_execpgin;		/* exec pages paged in */
 	uint64_t	zone_fspgin;		/* fs pages paged in */
 	uint64_t	zone_anon_alloc_fail;	/* cnt of anon alloc fails */
+	uint64_t	zone_pf_throttle;	/* cnt of page flt throttles */
+	uint64_t	zone_pf_throttle_usec;	/* time of page flt throttles */
+
+	/* Num usecs to throttle page fault when zone is over phys. mem cap */
+	uint32_t	zone_pg_flt_delay;
 
 	/*
 	 * Misc. kstats and counters for zone cpu-usage aggregation.
@@ -658,6 +762,7 @@ extern zone_t *zone_find_by_name(char *);
 extern zone_t *zone_find_by_any_path(const char *, boolean_t);
 extern zone_t *zone_find_by_path(const char *);
 extern zoneid_t getzoneid(void);
+extern zoneid_t getzonedid(void);
 extern zone_t *zone_find_by_id_nolock(zoneid_t);
 extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *);
 extern int zone_check_datalink(zoneid_t *, datalink_id_t);
@@ -838,6 +943,7 @@ extern int zone_ncpus_online_get(zone_t *);
  * Returns true if the named pool/dataset is visible in the current zone.
  */
 extern int zone_dataset_visible(const char *, int *);
+extern int zone_dataset_visible_inzone(zone_t *, const char *, int *);
 
 /*
  * zone version of kadmin()
@@ -852,6 +958,7 @@ extern int zone_walk(int (*)(zone_t *, void *), void *);
 
 extern rctl_hndl_t rc_zone_locked_mem;
 extern rctl_hndl_t rc_zone_max_swap;
+extern rctl_hndl_t rc_zone_phys_mem;
 extern rctl_hndl_t rc_zone_max_lofi;
 
 #endif	/* _KERNEL */
diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c
index 9b4bd38baa..8ee5511fd0 100644
--- a/usr/src/uts/common/syscall/brandsys.c
+++ b/usr/src/uts/common/syscall/brandsys.c
@@ -23,7 +23,9 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
 
 #include <sys/brand.h>
 #include <sys/systm.h>
@@ -35,7 +37,7 @@
  */
 int64_t
 brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
-    uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
+    uintptr_t arg4, uintptr_t arg5)
 {
 	struct proc *p = curthread->t_procp;
 	int64_t rval = 0;
@@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
 		return (set_errno(ENOSYS));
 
 	if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3,
-	    arg4, arg5, arg6)) != 0)
+	    arg4, arg5)) != 0)
 		return (set_errno(err));
 
 	return (rval);
diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c
index 371bc83c29..d631fe62f6 100644
--- a/usr/src/uts/common/syscall/fcntl.c
+++ b/usr/src/uts/common/syscall/fcntl.c
@@ -54,7 +54,8 @@
 
 #include <sys/cmn_err.h>
 
-static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+/* This is global so that it can be used by brand emulation. */
+int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
 static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *);
 static void fd_too_big(proc_t *);
 
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
index 1ee4b6a395..721f884a7e 100644
--- a/usr/src/uts/common/syscall/memcntl.c
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
 		 * MS_SYNC used to be defined to be zero but is now non-zero.
 		 * For binary compatibility we still accept zero
 		 * (the absence of MS_ASYNC) to mean the same thing.
+		 * Binary compatibility is not an issue for MS_INVALCURPROC.
 		 */
 		iarg = (uintptr_t)arg;
 		if ((iarg & ~MS_INVALIDATE) == 0)
 			iarg |= MS_SYNC;
 
-		if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
-		    ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+		if (((iarg &
+		    ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
+		    ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
+		    ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
+		    (MS_INVALIDATE|MS_INVALCURPROC))) {
 			error = set_errno(EINVAL);
 		} else {
 			error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c
index edb04c824b..874e31869c 100644
--- a/usr/src/uts/common/syscall/open.c
+++ b/usr/src/uts/common/syscall/open.c
@@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode)
 
 	if (filemode & (FSEARCH|FEXEC)) {
 		/*
-		 * Must be one or the other and neither FREAD nor FWRITE
+		 * Must be one or the other.
 		 * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN
-		 * XXX: Should these just be silently ignored?
+		 * XXX: Should these just be silently ignored like we
+		 * silently ignore FREAD|FWRITE?
 		 */
-		if ((filemode & (FREAD|FWRITE)) ||
-		    (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) ||
+		if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) ||
 		    (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN)))
 			return (set_errno(EINVAL));
 	}
diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c
index cc125f127a..3d0a5cc04b 100644
--- a/usr/src/uts/common/syscall/poll.c
+++ b/usr/src/uts/common/syscall/poll.c
@@ -29,7 +29,7 @@
 
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*
@@ -317,20 +317,58 @@ polllock(pollhead_t *php, kmutex_t *lp)
 	return (0);
 }
 
-static int
-poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+int
+poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
+{
+	pollfd_t *pollfdp;
+	nfds_t old_nfds;
+
+	/*
+	 * Copy the user's nfds-entry pollfd array at fds into ps->ps_pollfd;
+	 * returns 0 or EFAULT.  NOTE: for performance, the buffer is saved
+	 * across poll() calls, as a process that polls heavily tends to poll
+	 * on the same set of descriptors; we only reallocate when nfds
+	 * changes.  There is no hysteresis control: no data suggests it is
+	 * needed, and the penalty of reallocating is not *that* great.
+	 */
+	old_nfds = ps->ps_nfds;
+	if (nfds != old_nfds) {
+		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+		ps->ps_pollfd = pollfdp;
+		ps->ps_nfds = nfds;
+	}
+
+	pollfdp = ps->ps_pollfd;
+	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+		return (EFAULT);
+	}
+
+	if (fds == NULL) {
+		/*
+		 * If the process has page 0 mapped, then the copyin() above
+		 * will succeed even if fds is NULL.  However, our cached
+		 * poll lists are keyed by the address of the passed-in fds
+		 * structure, and we use the value NULL to indicate an unused
+		 * poll cache list entry.  As such, we elect not to support
+		 * NULL as a valid (user) memory address and fail the poll()
+		 * call.
+		 */
+		return (EFAULT);
+	}
+	return (0);
+}
+
+int
+poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
+    int *fdcnt)
 {
 	kthread_t *t = curthread;
-	klwp_t *lwp = ttolwp(t);
 	proc_t *p = ttoproc(t);
-	int fdcnt = 0;
-	int i;
 	hrtime_t deadline; /* hrtime value when we want to return */
 	pollfd_t *pollfdp;
-	pollstate_t *ps;
 	pollcache_t *pcp;
 	int error = 0;
-	nfds_t old_nfds;
 	int cacheindex = 0;	/* which cache set is used */
 
 	/*
@@ -348,32 +386,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 	}
 
 	/*
-	 * Reset our signal mask, if requested.
-	 */
-	if (ksetp != NULL) {
-		mutex_enter(&p->p_lock);
-		schedctl_finish_sigblock(t);
-		lwp->lwp_sigoldmask = t->t_hold;
-		t->t_hold = *ksetp;
-		t->t_flag |= T_TOMASK;
-		/*
-		 * Call cv_reltimedwait_sig() just to check for signals.
-		 * We will return immediately with either 0 or -1.
-		 */
-		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
-		    TR_CLOCK_TICK)) {
-			mutex_exit(&p->p_lock);
-			error = EINTR;
-			goto pollout;
-		}
-		mutex_exit(&p->p_lock);
-	}
-
-	/*
-	 * Check to see if this guy just wants to use poll() as a timeout.
+	 * Check to see if the caller just wants to use poll() as a timeout.
 	 * If yes then bypass all the other stuff and make him sleep.
 	 */
 	if (nfds == 0) {
+		*fdcnt = 0;
 		/*
 		 * Sleep until we have passed the requested future
 		 * time or until interrupted by a signal.
@@ -385,66 +402,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 			    &t->t_delay_lock, deadline)) > 0)
 				continue;
 			mutex_exit(&t->t_delay_lock);
-			error = (error == 0) ? EINTR : 0;
+			return ((error == 0) ? EINTR : 0);
 		}
-		goto pollout;
-	}
-
-	if (nfds > p->p_fno_ctl) {
-		mutex_enter(&p->p_lock);
-		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
-		    p->p_rctls, p, RCA_SAFE);
-		mutex_exit(&p->p_lock);
-		error = EINVAL;
-		goto pollout;
-	}
-
-	/*
-	 * Need to allocate memory for pollstate before anything because
-	 * the mutex and cv are created in this space
-	 */
-	ps = pollstate_create();
-
-	if (ps->ps_pcache == NULL)
-		ps->ps_pcache = pcache_alloc();
-	pcp = ps->ps_pcache;
-
-	/*
-	 * NOTE: for performance, buffers are saved across poll() calls.
-	 * The theory is that if a process polls heavily, it tends to poll
-	 * on the same set of descriptors.  Therefore, we only reallocate
-	 * buffers when nfds changes.  There is no hysteresis control,
-	 * because there is no data to suggest that this is necessary;
-	 * the penalty of reallocating is not *that* great in any event.
-	 */
-	old_nfds = ps->ps_nfds;
-	if (nfds != old_nfds) {
-
-		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
-		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
-		ps->ps_pollfd = pollfdp;
-		ps->ps_nfds = nfds;
+		return (0);
 	}
 
+	VERIFY(ps != NULL);
 	pollfdp = ps->ps_pollfd;
-	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
-		error = EFAULT;
-		goto pollout;
-	}
-
-	if (fds == NULL) {
-		/*
-		 * If the process has page 0 mapped, then the copyin() above
-		 * will succeed even if fds is NULL.  However, our cached
-		 * poll lists are keyed by the address of the passed-in fds
-		 * structure, and we use the value NULL to indicate an unused
-		 * poll cache list entry.  As such, we elect not to support
-		 * NULL as a valid (user) memory address and fail the poll()
-		 * call.
-		 */
-		error = EINVAL;
-		goto pollout;
-	}
+	VERIFY(pollfdp != NULL);
 
 	/*
 	 * If this thread polls for the first time, allocate ALL poll
@@ -460,10 +425,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 		/*
 		 * poll and cache this poll fd list in ps_pcacheset[0].
 		 */
-		error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
-		if (fdcnt || error) {
+		error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
+		if (error || *fdcnt) {
 			mutex_exit(&ps->ps_lock);
-			goto pollout;
+			return (error);
 		}
 	} else {
 		pollcacheset_t	*pcset = ps->ps_pcacheset;
@@ -488,11 +453,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 				 * the callee will guarantee the consistency
 				 * of cached poll list and cache content.
 				 */
-				error = pcacheset_resolve(ps, nfds, &fdcnt,
+				error = pcacheset_resolve(ps, nfds, fdcnt,
 				    cacheindex);
 				if (error) {
 					mutex_exit(&ps->ps_lock);
-					goto pollout;
+					return (error);
 				}
 				break;
 			}
@@ -509,11 +474,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 				 * found an unused entry. Use it to cache
 				 * this poll list.
 				 */
-				error = pcacheset_cache_list(ps, fds, &fdcnt,
+				error = pcacheset_cache_list(ps, fds, fdcnt,
 				    cacheindex);
-				if (fdcnt || error) {
+				if (error || *fdcnt) {
 					mutex_exit(&ps->ps_lock);
-					goto pollout;
+					return (error);
 				}
 				break;
 			}
@@ -527,10 +492,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 			cacheindex = pcacheset_replace(ps);
 			ASSERT(cacheindex < ps->ps_nsets);
 			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
-			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
+			error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
 			if (error) {
 				mutex_exit(&ps->ps_lock);
-				goto pollout;
+				return (error);
 			}
 		}
 	}
@@ -548,8 +513,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 	mutex_enter(&pcp->pc_lock);
 	for (;;) {
 		pcp->pc_flag = 0;
-		error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
-		if (fdcnt || error) {
+		error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
+		if (error || *fdcnt) {
 			mutex_exit(&pcp->pc_lock);
 			mutex_exit(&ps->ps_lock);
 			break;
@@ -595,13 +560,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 		mutex_enter(&pcp->pc_lock);
 	}
 
+	return (error);
+}
+
+/*
+ * This is the system call trap that poll(),
+ * select() and pselect() are built upon.
+ * It is a private interface between libc and the kernel.
+ */
+int
+pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = ttoproc(t);
+	timespec_t ts;
+	timespec_t *tsp;
+	k_sigset_t kset;
+	pollstate_t *ps = NULL;
+	pollfd_t *pollfdp = NULL;
+	int error = 0, fdcnt = 0;
+
+	/*
+	 * Copy in timeout
+	 */
+	if (timeoutp == NULL) {
+		tsp = NULL;
+	} else {
+		if (get_udatamodel() == DATAMODEL_NATIVE) {
+			if (copyin(timeoutp, &ts, sizeof (ts)))
+				return (set_errno(EFAULT));
+		} else {
+			timespec32_t ts32;
+
+			if (copyin(timeoutp, &ts32, sizeof (ts32)))
+				return (set_errno(EFAULT));
+			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+		}
+
+		if (itimerspecfix(&ts))
+			return (set_errno(EINVAL));
+		tsp = &ts;
+	}
+
+	/*
+	 * Copy in and reset signal mask, if requested.
+	 */
+	if (setp != NULL) {
+		sigset_t set;
+
+		if (copyin(setp, &set, sizeof (set)))
+			return (set_errno(EFAULT));
+		sigutok(&set, &kset);
+
+		mutex_enter(&p->p_lock);
+		schedctl_finish_sigblock(t);
+		lwp->lwp_sigoldmask = t->t_hold;
+		t->t_hold = kset;
+		t->t_flag |= T_TOMASK;
+		/*
+		 * Call cv_reltimedwait_sig() just to check for signals.
+		 * We will return immediately with either 0 or -1.
+		 */
+		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+		    TR_CLOCK_TICK)) {
+			mutex_exit(&p->p_lock);
+			error = EINTR;
+			goto pollout;
+		}
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Initialize pollstate and copy in pollfd data if present.
+	 * If nfds == 0, we will skip all of the copying and check steps and
+	 * proceed directly into poll_common to process the supplied timeout.
+	 */
+	if (nfds != 0) {
+		if (nfds > p->p_fno_ctl) {
+			mutex_enter(&p->p_lock);
+			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+			    p->p_rctls, p, RCA_SAFE);
+			mutex_exit(&p->p_lock);
+			error = EINVAL;
+			goto pollout;
+		}
+
+		/*
+		 * Need to allocate memory for pollstate before anything
+		 * because the mutex and cv are created in this space
+		 */
+		ps = pollstate_create();
+		if (ps->ps_pcache == NULL)
+			ps->ps_pcache = pcache_alloc();
+
+		if ((error = poll_copyin(ps, fds, nfds)) != 0)
+			goto pollout;
+		pollfdp = ps->ps_pollfd;
+	}
+
+	/*
+	 * Perform the actual poll.
+	 */
+	error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+
 pollout:
 	/*
-	 * If we changed the signal mask but we received
-	 * no signal then restore the signal mask.
-	 * Otherwise psig() will deal with the signal mask.
+	 * If we changed the signal mask but we received no signal then restore
+	 * the signal mask.  Otherwise psig() will deal with the signal mask.
 	 */
-	if (ksetp != NULL) {
+	if (setp != NULL) {
 		mutex_enter(&p->p_lock);
 		if (lwp->lwp_cursig == 0) {
 			t->t_hold = lwp->lwp_sigoldmask;
@@ -612,12 +680,10 @@ pollout:
 
 	if (error)
 		return (set_errno(error));
-
 	/*
 	 * Copy out the events and return the fdcnt to the user.
 	 */
-	if (nfds != 0 &&
-	    copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
+	if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
 		return (set_errno(EFAULT));
 
 #ifdef DEBUG
@@ -625,7 +691,7 @@ pollout:
 	 * Another sanity check:
 	 */
 	if (fdcnt) {
-		int	reventcnt = 0;
+		int i, reventcnt = 0;
 
 		for (i = 0; i < nfds; i++) {
 			if (pollfdp[i].fd < 0) {
@@ -638,6 +704,8 @@ pollout:
 		}
 		ASSERT(fdcnt == reventcnt);
 	} else {
+		int i;
+
 		for (i = 0; i < nfds; i++) {
 			ASSERT(pollfdp[i].revents == 0);
 		}
@@ -648,52 +716,6 @@ pollout:
 }
 
 /*
- * This is the system call trap that poll(),
- * select() and pselect() are built upon.
- * It is a private interface between libc and the kernel.
- */
-int
-pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
-{
-	timespec_t ts;
-	timespec_t *tsp;
-	sigset_t set;
-	k_sigset_t kset;
-	k_sigset_t *ksetp;
-	model_t datamodel = get_udatamodel();
-
-	if (timeoutp == NULL)
-		tsp = NULL;
-	else {
-		if (datamodel == DATAMODEL_NATIVE) {
-			if (copyin(timeoutp, &ts, sizeof (ts)))
-				return (set_errno(EFAULT));
-		} else {
-			timespec32_t ts32;
-
-			if (copyin(timeoutp, &ts32, sizeof (ts32)))
-				return (set_errno(EFAULT));
-			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
-		}
-
-		if (itimerspecfix(&ts))
-			return (set_errno(EINVAL));
-		tsp = &ts;
-	}
-
-	if (setp == NULL)
-		ksetp = NULL;
-	else {
-		if (copyin(setp, &set, sizeof (set)))
-			return (set_errno(EFAULT));
-		sigutok(&set, &kset);
-		ksetp = &kset;
-	}
-
-	return (poll_common(fds, nfds, tsp, ksetp));
-}
-
-/*
  * Clean up any state left around by poll(2). Called when a thread exits.
  */
 void
diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c
index 3e0e63f4c0..417c629168 100644
--- a/usr/src/uts/common/syscall/rusagesys.c
+++ b/usr/src/uts/common/syscall/rusagesys.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  */
 
 /*
@@ -257,6 +258,19 @@ rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4)
 	case _RUSAGESYS_GETVMUSAGE:
 		return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2,
 		    (vmusage_t *)arg3, (size_t *)arg4, 0));
+	case _RUSAGESYS_INVALMAP:
+		/*
+		 * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD
+		 * handling so callers on SPARC should get simple sync
+		 * handling with invalidation to all processes.
+		 */
+#if defined(__sparc)
+		return (memcntl((caddr_t)arg2, (size_t)arg3, MC_SYNC,
+		    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0));
+#else
+		return (vm_map_inval((pid_t)(uintptr_t)arg1, (caddr_t)arg2,
+		    (size_t)arg3));
+#endif
 	default:
 		return (set_errno(EINVAL));
 	}
diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c
index a28894b2c9..943b7d244e 100644
--- a/usr/src/uts/common/syscall/rw.c
+++ b/usr/src/uts/common/syscall/rw.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
+ * Copyright 2015, Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -50,6 +50,7 @@
 #include <sys/debug.h>
 #include <sys/rctl.h>
 #include <sys/nbmlock.h>
+#include <sys/limits.h>
 
 #define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */
 
@@ -607,19 +608,12 @@ out:
 	return (bcount);
 }
 
-/*
- * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
- * XXX -- However, SVVS expects readv() and writev() to fail if
- * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
- * XXX -- so I guess that's the "interface".
- */
-#define	DEF_IOV_MAX	16
-
 ssize_t
 readv(int fdes, struct iovec *iovp, int iovcnt)
 {
 	struct uio auio;
-	struct iovec aiov[DEF_IOV_MAX];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
 	file_t *fp;
 	register vnode_t *vp;
 	struct cpu *cp;
@@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
 	u_offset_t fileoff;
 	int in_crit = 0;
 
-	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+	if (iovcnt <= 0 || iovcnt > IOV_MAX)
 		return (set_errno(EINVAL));
 
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded,
@@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
 	 * of data in a single call.
 	 */
 	if (get_udatamodel() == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[DEF_IOV_MAX];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		int aiov32len;
 		ssize32_t count32;
 
-		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+		aiov32len = iovcnt * sizeof (iovec32_t);
+		if (aiovlen != 0)
+			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+		if (copyin(iovp, aiov32, aiov32len)) {
+			if (aiovlen != 0) {
+				kmem_free(aiov32, aiov32len);
+				kmem_free(aiov, aiovlen);
+			}
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
 			ssize32_t iovlen32 = aiov32[i].iov_len;
 			count32 += iovlen32;
-			if (iovlen32 < 0 || count32 < 0)
+			if (iovlen32 < 0 || count32 < 0) {
+				if (aiovlen != 0) {
+					kmem_free(aiov32, aiov32len);
+					kmem_free(aiov, aiovlen);
+				}
 				return (set_errno(EINVAL));
+			}
 			aiov[i].iov_len = iovlen32;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+
+		if (aiovlen != 0)
+			kmem_free(aiov32, aiov32len);
 	} else
 #endif
-	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EFAULT));
+	}
 
 	count = 0;
 	for (i = 0; i < iovcnt; i++) {
 		ssize_t iovlen = aiov[i].iov_len;
 		count += iovlen;
-		if (iovlen < 0 || count < 0)
+		if (iovlen < 0 || count < 0) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EINVAL));
+		}
 	}
-	if ((fp = getf(fdes)) == NULL)
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EBADF));
+	}
 	if (((fflag = fp->f_flag) & FREAD) == 0) {
 		error = EBADF;
 		goto out;
@@ -768,6 +794,8 @@ out:
 	if (in_crit)
 		nbl_end_crit(vp);
 	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
 	if (error)
 		return (set_errno(error));
 	return (count);
@@ -777,7 +805,8 @@ ssize_t
 writev(int fdes, struct iovec *iovp, int iovcnt)
 {
 	struct uio auio;
-	struct iovec aiov[DEF_IOV_MAX];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
 	file_t *fp;
 	register vnode_t *vp;
 	struct cpu *cp;
@@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt)
 	u_offset_t fileoff;
 	int in_crit = 0;
 
-	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+	if (iovcnt <= 0 || iovcnt > IOV_MAX)
 		return (set_errno(EINVAL));
 
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded,
@@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt)
 	 * of data in a single call.
 	 */
 	if (get_udatamodel() == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[DEF_IOV_MAX];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		int aiov32len;
 		ssize32_t count32;
 
-		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+		aiov32len = iovcnt * sizeof (iovec32_t);
+		if (aiovlen != 0)
+			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+		if (copyin(iovp, aiov32, aiov32len)) {
+			if (aiovlen != 0) {
+				kmem_free(aiov32, aiov32len);
+				kmem_free(aiov, aiovlen);
+			}
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
 			ssize32_t iovlen = aiov32[i].iov_len;
 			count32 += iovlen;
-			if (iovlen < 0 || count32 < 0)
+			if (iovlen < 0 || count32 < 0) {
+				if (aiovlen != 0) {
+					kmem_free(aiov32, aiov32len);
+					kmem_free(aiov, aiovlen);
+				}
 				return (set_errno(EINVAL));
+			}
 			aiov[i].iov_len = iovlen;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+		if (aiovlen != 0)
+			kmem_free(aiov32, aiov32len);
 	} else
 #endif
-	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EFAULT));
+	}
 
 	count = 0;
 	for (i = 0; i < iovcnt; i++) {
 		ssize_t iovlen = aiov[i].iov_len;
 		count += iovlen;
-		if (iovlen < 0 || count < 0)
+		if (iovlen < 0 || count < 0) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EINVAL));
+		}
 	}
-	if ((fp = getf(fdes)) == NULL)
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EBADF));
+	}
 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
@@ -917,6 +977,8 @@ out:
 	if (in_crit)
 		nbl_end_crit(vp);
 	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
 	if (error)
 		return (set_errno(error));
 	return (count);
@@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
     off_t extended_offset)
 {
 	struct uio auio;
-	struct iovec aiov[DEF_IOV_MAX];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
 	file_t *fp;
 	register vnode_t *vp;
 	struct cpu *cp;
@@ -952,9 +1015,14 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 
 	int in_crit = 0;
 
-	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+	if (iovcnt <= 0 || iovcnt > IOV_MAX)
 		return (set_errno(EINVAL));
 
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded,
@@ -962,39 +1030,68 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 	 * of data in a single call.
 	 */
 	if (get_udatamodel() == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[DEF_IOV_MAX];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		int aiov32len;
 		ssize32_t count32;
 
-		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+		aiov32len = iovcnt * sizeof (iovec32_t);
+		if (aiovlen != 0)
+			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+		if (copyin(iovp, aiov32, aiov32len)) {
+			if (aiovlen != 0) {
+				kmem_free(aiov32, aiov32len);
+				kmem_free(aiov, aiovlen);
+			}
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
 			ssize32_t iovlen32 = aiov32[i].iov_len;
 			count32 += iovlen32;
-			if (iovlen32 < 0 || count32 < 0)
+			if (iovlen32 < 0 || count32 < 0) {
+				if (aiovlen != 0) {
+					kmem_free(aiov32, aiov32len);
+					kmem_free(aiov, aiovlen);
+				}
 				return (set_errno(EINVAL));
+			}
 			aiov[i].iov_len = iovlen32;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+		if (aiovlen != 0)
+			kmem_free(aiov32, aiov32len);
 	} else
 #endif /* _SYSCALL32_IMPL */
-		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EFAULT));
+		}
 
 	count = 0;
 	for (i = 0; i < iovcnt; i++) {
 		ssize_t iovlen = aiov[i].iov_len;
 		count += iovlen;
-		if (iovlen < 0 || count < 0)
+		if (iovlen < 0 || count < 0) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EINVAL));
+		}
 	}
 
-	if ((bcount = (ssize_t)count) < 0)
+	if ((bcount = (ssize_t)count) < 0) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EINVAL));
-	if ((fp = getf(fdes)) == NULL)
+	}
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EBADF));
+	}
 	if (((fflag = fp->f_flag) & FREAD) == 0) {
 		error = EBADF;
 		goto out;
@@ -1099,6 +1196,8 @@ out:
 	if (in_crit)
 		nbl_end_crit(vp);
 	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
 	if (error)
 		return (set_errno(error));
 	return (count);
@@ -1109,7 +1208,8 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
     off_t extended_offset)
 {
 	struct uio auio;
-	struct iovec aiov[DEF_IOV_MAX];
+	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+	int aiovlen = 0;
 	file_t *fp;
 	register vnode_t *vp;
 	struct cpu *cp;
@@ -1134,9 +1234,14 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 
 	int in_crit = 0;
 
-	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+	if (iovcnt <= 0 || iovcnt > IOV_MAX)
 		return (set_errno(EINVAL));
 
+	if (iovcnt > IOV_MAX_STACK) {
+		aiovlen = iovcnt * sizeof (iovec_t);
+		aiov = kmem_alloc(aiovlen, KM_SLEEP);
+	}
+
 #ifdef _SYSCALL32_IMPL
 	/*
 	 * 32-bit callers need to have their iovec expanded,
@@ -1144,39 +1249,68 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
 	 * of data in a single call.
 	 */
 	if (get_udatamodel() == DATAMODEL_ILP32) {
-		struct iovec32 aiov32[DEF_IOV_MAX];
+		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+		int aiov32len;
 		ssize32_t count32;
 
-		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+		aiov32len = iovcnt * sizeof (iovec32_t);
+		if (aiovlen != 0)
+			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
+
+		if (copyin(iovp, aiov32, aiov32len)) {
+			if (aiovlen != 0) {
+				kmem_free(aiov32, aiov32len);
+				kmem_free(aiov, aiovlen);
+			}
 			return (set_errno(EFAULT));
+		}
 
 		count32 = 0;
 		for (i = 0; i < iovcnt; i++) {
 			ssize32_t iovlen32 = aiov32[i].iov_len;
 			count32 += iovlen32;
-			if (iovlen32 < 0 || count32 < 0)
+			if (iovlen32 < 0 || count32 < 0) {
+				if (aiovlen != 0) {
+					kmem_free(aiov32, aiov32len);
+					kmem_free(aiov, aiovlen);
+				}
 				return (set_errno(EINVAL));
+			}
 			aiov[i].iov_len = iovlen32;
 			aiov[i].iov_base =
 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
 		}
+		if (aiovlen != 0)
+			kmem_free(aiov32, aiov32len);
 	} else
 #endif /* _SYSCALL32_IMPL */
-		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EFAULT));
+		}
 
 	count = 0;
 	for (i = 0; i < iovcnt; i++) {
 		ssize_t iovlen = aiov[i].iov_len;
 		count += iovlen;
-		if (iovlen < 0 || count < 0)
+		if (iovlen < 0 || count < 0) {
+			if (aiovlen != 0)
+				kmem_free(aiov, aiovlen);
 			return (set_errno(EINVAL));
+		}
 	}
 
-	if ((bcount = (ssize_t)count) < 0)
+	if ((bcount = (ssize_t)count) < 0) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EINVAL));
-	if ((fp = getf(fdes)) == NULL)
+	}
+	if ((fp = getf(fdes)) == NULL) {
+		if (aiovlen != 0)
+			kmem_free(aiov, aiovlen);
 		return (set_errno(EBADF));
+	}
 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
@@ -1308,6 +1442,8 @@ out:
 	if (in_crit)
 		nbl_end_crit(vp);
 	releasef(fdes);
+	if (aiovlen != 0)
+		kmem_free(aiov, aiovlen);
 	if (error)
 		return (set_errno(error));
 	return (count);
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
index cb8246f584..ccceca7c6d 100644
--- a/usr/src/uts/common/syscall/sendfile.c
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *);
  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
  * more than 2GB of data.
  */
-int
+static int
 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
     int copy_cnt, ssize32_t *count)
 {
@@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
 	return (0);
 }
 
-ssize32_t
+static ssize32_t
 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
 	size32_t *xferred, int fildes)
 {
@@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
 }
 #endif
 
-int
+static int
 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
 {
@@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
 }
 
 
-int
+static int
 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
     int copy_cnt, ssize_t *count)
 {
@@ -1160,6 +1160,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
 		} else {
 			maxblk = (int)vp->v_stream->sd_maxblk;
 		}
+
+		/*
+		 * We need to make sure that the socket that we're sending on
+		 * supports sendfile behavior. sockfs doesn't know that the APIs
+		 * we want to use are coming from sendfile, so we can't rely on
+		 * it to check for us.
+		 */
+		if ((so->so_mode & SM_SENDFILESUPP) == 0) {
+			error = EOPNOTSUPP;
+			goto err;
+		}
 		break;
 	case VREG:
 		break;
diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c
index 4085104cc7..93f26121bc 100644
--- a/usr/src/uts/common/syscall/stat.c
+++ b/usr/src/uts/common/syscall/stat.c
@@ -61,7 +61,7 @@
  * to VOP_GETATTR
  */
 
-static int
+int
 cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred)
 {
 	vnode_t *startvp;
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 03f2fabe13..26ea859224 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -170,8 +171,8 @@ sysconfig(int which)
 		 * even though rcapd can be used on the global zone too.
 		 */
 		if (!INGLOBALZONE(curproc) &&
-		    curproc->p_zone->zone_phys_mcap != 0)
-			return (MIN(btop(curproc->p_zone->zone_phys_mcap),
+		    curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX)
+			return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl),
 			    physinstalled));
 
 		return (physinstalled);
@@ -179,26 +180,23 @@ sysconfig(int which)
 	case _CONFIG_AVPHYS_PAGES:
 		/*
 		 * If the non-global zone has a phys. memory cap, use
-		 * the phys. memory cap - zone's current rss.  We always
+		 * the phys. memory cap - zone's rss.  We always
 		 * report the system-wide value for the global zone, even
-		 * though rcapd can be used on the global zone too.
+		 * though memory capping can be used on the global zone too.
+		 * We use the cached value for the RSS since vm_getusage()
+		 * is so expensive and we don't need this value to be exact.
 		 */
 		if (!INGLOBALZONE(curproc) &&
-		    curproc->p_zone->zone_phys_mcap != 0) {
+		    curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) {
 			pgcnt_t cap, rss, free;
-			vmusage_t in_use;
-			size_t cnt = 1;
 
-			cap = btop(curproc->p_zone->zone_phys_mcap);
+			cap = btop(curproc->p_zone->zone_phys_mem_ctl);
 			if (cap > physinstalled)
 				return (freemem);
 
-			if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt,
-			    FKIOCTL) != 0)
-				in_use.vmu_rss_all = 0;
-			rss = btop(in_use.vmu_rss_all);
+			rss = btop(curproc->p_zone->zone_phys_mem);
 			/*
-			 * Because rcapd implements a soft cap, it is possible
+			 * Because this is a soft cap, it is possible
 			 * for rss to be temporarily over the cap.
 			 */
 			if (cap > rss)
diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c
index 2dda4001bf..68aa1a95f5 100644
--- a/usr/src/uts/common/syscall/uadmin.c
+++ b/usr/src/uts/common/syscall/uadmin.c
@@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0;
  * system with many zones.
  */
 void
-killall(zoneid_t zoneid)
+killall(zoneid_t zoneid, boolean_t force)
 {
 	proc_t *p;
 
@@ -108,7 +108,7 @@ killall(zoneid_t zoneid)
 		    p->p_stat != SIDL &&
 		    p->p_stat != SZOMB) {
 			mutex_enter(&p->p_lock);
-			if (sigismember(&p->p_sig, SIGKILL)) {
+			if (!force && sigismember(&p->p_sig, SIGKILL)) {
 				mutex_exit(&p->p_lock);
 				p = p->p_next;
 			} else {
@@ -245,7 +245,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
 		 */
 		zone_shutdown_global();
 
-		killall(ALL_ZONES);
+		killall(ALL_ZONES, B_FALSE);
 		/*
 		 * If we are calling kadmin() from a kernel context then we
 		 * do not release these resources.
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index 1d91475e38..c908a9e16c 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -269,7 +270,12 @@ void	hat_kpm_walk(void (*)(void *, void *, size_t), void *);
  *	call.
  *
  * int hat_pageunload(pp, forceflag)
- *	unload all translations attached to pp.
+ *	Unload all translations attached to pp. On x86 the bulk of the work is
+ *	done by hat_page_inval.
+ *
+ * void	hat_page_inval(pp, pgsz, curhat)
+ *	Unload translations attached to pp. If curhat is provided, only the
+ *	translation for that process is unloaded, otherwise all are unloaded.
  *
  * uint_t hat_pagesync(pp, flags)
  *	get hw stats from hardware into page struct and reset hw stats
@@ -291,6 +297,7 @@ void	hat_page_setattr(struct page *, uint_t);
 void	hat_page_clrattr(struct page *, uint_t);
 uint_t	hat_page_getattr(struct page *, uint_t);
 int	hat_pageunload(struct page *, uint_t);
+void	hat_page_inval(struct page *, uint_t, struct hat *);
 uint_t	hat_pagesync(struct page *, uint_t);
 ulong_t	hat_page_getshare(struct page *);
 int	hat_page_checkshare(struct page *, ulong_t);
@@ -460,6 +467,7 @@ void	hat_setstat(struct as *, caddr_t, size_t, uint_t);
  */
 #define	HAT_ADV_PGUNLOAD	0x00
 #define	HAT_FORCE_PGUNLOAD	0x01
+#define	HAT_CURPROC_PGUNLOAD	0x02
 
 /*
  * Attributes for hat_page_*attr, hat_setstats and
diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c
index 90e1b73b70..439c859d96 100644
--- a/usr/src/uts/common/vm/seg_kmem.c
+++ b/usr/src/uts/common/vm/seg_kmem.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -773,7 +774,7 @@ segkmem_capable(struct seg *seg, segcapability_t capability)
 	return (0);
 }
 
-static struct seg_ops segkmem_ops = {
+struct seg_ops segkmem_ops = {
 	SEGKMEM_BADOP(int),		/* dup */
 	SEGKMEM_BADOP(int),		/* unmap */
 	SEGKMEM_BADOP(void),		/* free */
diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h
index 2a4ed3b2aa..3ad4202e91 100644
--- a/usr/src/uts/common/vm/seg_kmem.h
+++ b/usr/src/uts/common/vm/seg_kmem.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #ifndef _VM_SEG_KMEM_H
@@ -136,6 +137,8 @@ extern size_t	segkmem_kmemlp_max;
 #define	IS_KMEM_VA_LARGEPAGE(vaddr)				        \
 	(((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end))
 
+extern struct seg_ops segkmem_ops;
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
diff --git a/usr/src/uts/common/vm/seg_umap.c b/usr/src/uts/common/vm/seg_umap.c
new file mode 100644
index 0000000000..ccad71c5d6
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_umap.c
@@ -0,0 +1,466 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * VM - Kernel-to-user mapping segment
+ *
+ * The umap segment driver was primarily designed to facilitate the comm page:
+ * a portion of kernel memory shared with userspace so that certain (namely
+ * clock-related) actions could operate without making an expensive trip into
+ * the kernel.
+ *
+ * Since the initial requirements for the comm page are slim, advanced features
+ * of the segment driver such as per-page protection have been left
+ * unimplemented at this time.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/lgrp.h>
+#include <sys/mman.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_umap.h>
+
+
+static boolean_t segumap_verify_safe(caddr_t, size_t);
+static int segumap_dup(struct seg *, struct seg *);
+static int segumap_unmap(struct seg *, caddr_t, size_t);
+static void segumap_free(struct seg *);
+static faultcode_t segumap_fault(struct hat *, struct seg *, caddr_t, size_t,
+    enum fault_type, enum seg_rw);
+static faultcode_t segumap_faulta(struct seg *, caddr_t);
+static int segumap_setprot(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_checkprot(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_sync(struct seg *, caddr_t, size_t, int, uint_t);
+static size_t segumap_incore(struct seg *, caddr_t, size_t, char *);
+static int segumap_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *,
+    size_t);
+static int segumap_getprot(struct seg *, caddr_t, size_t, uint_t *);
+static u_offset_t segumap_getoffset(struct seg *, caddr_t);
+static int segumap_gettype(struct seg *, caddr_t);
+static int segumap_getvp(struct seg *, caddr_t, struct vnode **);
+static int segumap_advise(struct seg *, caddr_t, size_t, uint_t);
+static void segumap_dump(struct seg *);
+static int segumap_pagelock(struct seg *, caddr_t, size_t, struct page ***,
+    enum lock_type, enum seg_rw);
+static int segumap_setpagesize(struct seg *, caddr_t, size_t, uint_t);
+static int segumap_getmemid(struct seg *, caddr_t, memid_t *);
+static int segumap_capable(struct seg *, segcapability_t);
+
+static struct seg_ops segumap_ops = {
+	segumap_dup,
+	segumap_unmap,
+	segumap_free,
+	segumap_fault,
+	segumap_faulta,
+	segumap_setprot,
+	segumap_checkprot,
+	NULL,			/* kluster: disabled */
+	NULL,			/* swapout: disabled */
+	segumap_sync,
+	segumap_incore,
+	segumap_lockop,
+	segumap_getprot,
+	segumap_getoffset,
+	segumap_gettype,
+	segumap_getvp,
+	segumap_advise,
+	segumap_dump,
+	segumap_pagelock,
+	segumap_setpagesize,
+	segumap_getmemid,
+	NULL,			/* getpolicy: disabled */
+	segumap_capable,
+	seg_inherit_notsup
+};
+
+
+/*
+ * Create a kernel/user-mapped segment.
+ */
+int
+segumap_create(struct seg *seg, void *argsp)
+{
+	segumap_crargs_t *a = (struct segumap_crargs *)argsp;
+	segumap_data_t *data;
+
+	ASSERT((uintptr_t)a->kaddr > _userlimit);
+
+	/*
+	 * Check several aspects of the mapping request to ensure validity:
+	 * - kernel pages must reside entirely in kernel space
+	 * - target protection must be user-accessible
+	 * - kernel address must be page-aligned
+	 * - kernel address must reside inside a "safe" segment
+	 */
+	if ((uintptr_t)a->kaddr <= _userlimit ||
+	    ((uintptr_t)a->kaddr + seg->s_size) < (uintptr_t)a->kaddr ||
+	    (a->prot & PROT_USER) == 0 ||
+	    ((uintptr_t)a->kaddr & PAGEOFFSET) != 0 ||
+	    !segumap_verify_safe(a->kaddr, seg->s_size)) {
+		return (EINVAL);
+	}
+
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	rw_init(&data->sud_lock, NULL, RW_DEFAULT, NULL);
+	data->sud_kaddr = a->kaddr;
+	data->sud_prot = a->prot;
+	data->sud_loaded = B_FALSE;
+
+	seg->s_ops = &segumap_ops;
+	seg->s_data = data;
+	return (0);
+}
+
+static boolean_t
+segumap_verify_safe(caddr_t kaddr, size_t len)
+{
+	struct seg *seg;
+
+	/*
+	 * Presently, only pages which are backed by segkmem are allowed to be
+	 * shared with userspace.  This prevents nasty paging behavior with
+	 * other drivers such as seg_kp.  Furthermore, the backing kernel
+	 * segment must completely contain the region to be mapped.
+	 *
+	 * Failing these checks is fatal for now since such mappings are done
+	 * in a very limited context from the kernel.
+	 */
+	AS_LOCK_ENTER(&kas, RW_READER);
+	seg = as_segat(&kas, kaddr);
+	VERIFY(seg != NULL);
+	VERIFY(seg->s_base + seg->s_size >= kaddr + len);
+	VERIFY(seg->s_ops == &segkmem_ops);
+	AS_LOCK_EXIT(&kas);
+
+	return (B_TRUE);
+}
+
+static int
+segumap_dup(struct seg *seg, struct seg *newseg)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+	segumap_data_t *newsud;
+
+	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
+
+	newsud = kmem_zalloc(sizeof (segumap_data_t), KM_SLEEP);
+	rw_init(&newsud->sud_lock, NULL, RW_DEFAULT, NULL);
+	newsud->sud_kaddr = sud->sud_kaddr;
+	newsud->sud_prot = sud->sud_prot;
+	newsud->sud_loaded = B_FALSE;
+
+	newseg->s_ops = seg->s_ops;
+	newseg->s_data = newsud;
+	return (0);
+}
+
+static int
+segumap_unmap(struct seg *seg, caddr_t addr, size_t len)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
+
+	/* Only allow unmap of entire segment */
+	if (addr != seg->s_base || len != seg->s_size) {
+		return (EINVAL);
+	}
+	if (sud->sud_softlockcnt != 0) {
+		return (EAGAIN);
+	}
+
+	hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
+	/*
+	 * While setting this field before immediately freeing the segment is
+	 * not necessary, it is done for the sake of completeness.  Doing so
+	 * outside sud_lock is safe with the AS write-locked.
+	 */
+	sud->sud_loaded = B_FALSE;
+
+	seg_free(seg);
+	return (0);
+}
+
+static void
+segumap_free(struct seg *seg)
+{
+	segumap_data_t *data = (segumap_data_t *)seg->s_data;
+
+	ASSERT(data != NULL);
+
+	rw_destroy(&data->sud_lock);
+	VERIFY(data->sud_loaded == B_FALSE);
+	VERIFY(data->sud_softlockcnt == 0);
+	kmem_free(data, sizeof (*data));
+	seg->s_data = NULL;
+}
+
+/* ARGSUSED */
+static faultcode_t
+segumap_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
+    enum fault_type type, enum seg_rw tw)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	if (type == F_PROT) {
+		/*
+		 * Since protection on the segment is fixed, there is nothing
+		 * to do but report an error for protection faults.
+		 */
+		return (FC_PROT);
+	} else if (type == F_SOFTUNLOCK) {
+		size_t plen = btop(len);
+
+		rw_enter(&sud->sud_lock, RW_WRITER);
+		VERIFY(sud->sud_softlockcnt >= plen);
+		sud->sud_softlockcnt -= plen;
+		rw_exit(&sud->sud_lock);
+		return (0);
+	}
+
+	ASSERT(type == F_INVAL || type == F_SOFTLOCK);
+	rw_enter(&sud->sud_lock, RW_WRITER);
+
+	if (type == F_INVAL && sud->sud_loaded) {
+		rw_exit(&sud->sud_lock);
+		return (FC_NOMAP);
+	}
+
+	/*
+	 * Load the (entire) segment into the HAT if not already done.
+	 */
+	if (!sud->sud_loaded) {
+		for (uintptr_t i = 0; i < seg->s_size; i += PAGESIZE) {
+			pfn_t pfn;
+
+			pfn = hat_getpfnum(kas.a_hat, sud->sud_kaddr + i);
+			VERIFY(pfn != PFN_INVALID);
+			hat_devload(seg->s_as->a_hat, seg->s_base + i,
+			    PAGESIZE, pfn, sud->sud_prot, HAT_LOAD);
+		}
+		sud->sud_loaded = B_TRUE;
+	} else {
+		/*
+		 * If the segment has already been loaded, there is no
+		 * reason to take an F_INVAL fault.
+		 */
+		VERIFY(type != F_INVAL);
+	}
+
+	if (type == F_SOFTLOCK) {
+		size_t nval = sud->sud_softlockcnt + btop(len);
+
+		if (sud->sud_softlockcnt >= nval) {
+			rw_exit(&sud->sud_lock);
+			return (FC_MAKE_ERR(EOVERFLOW));
+		}
+		sud->sud_softlockcnt = nval;
+	}
+	rw_exit(&sud->sud_lock);
+	return (0);
+}
+
+/* ARGSUSED */
+static faultcode_t
+segumap_faulta(struct seg *seg, caddr_t addr)
+{
+	/* Do nothing since asynch pagefault should not load translation. */
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	/*
+	 * The seg_umap driver does not yet allow protection to be changed.
+	 */
+	return (EACCES);
+}
+
+/* ARGSUSED */
+static int
+segumap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+	int error = 0;
+
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	rw_enter(&sud->sud_lock, RW_READER);
+	if ((sud->sud_prot & prot) != prot) {
+		error = EACCES;
+	}
+	rw_exit(&sud->sud_lock);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+segumap_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
+{
+	/* Always succeed since there is no backing store to sync */
+	return (0);
+}
+
+/* ARGSUSED */
+static size_t
+segumap_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
+{
+	size_t sz = 0;
+
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	len = (len + PAGEOFFSET) & PAGEMASK;
+	while (len > 0) {
+		*vec = 1;
+		sz += PAGESIZE;
+		vec++;
+		len -= PAGESIZE;
+	}
+	return (sz);
+}
+
+/* ARGSUSED */
+static int
+segumap_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op,
+    ulong_t *lockmap, size_t pos)
+{
+	/* Report success since kernel pages are always in memory. */
+	return (0);
+}
+
+static int
+segumap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+	size_t pgno;
+	uint_t prot;
+
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	rw_enter(&sud->sud_lock, RW_READER);
+	prot = sud->sud_prot;
+	rw_exit(&sud->sud_lock);
+
+	/*
+	 * Reporting protection is simple since it is not tracked per-page.
+	 */
+	pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
+	while (pgno > 0) {
+		protv[--pgno] = prot;
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static u_offset_t
+segumap_getoffset(struct seg *seg, caddr_t addr)
+{
+	/*
+	 * To avoid leaking information about the layout of the kernel address
+	 * space, always report '0' as the offset.
+	 */
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_gettype(struct seg *seg, caddr_t addr)
+{
+	/*
+	 * Since already-existing kernel pages are being mapped into userspace,
+	 * always report the segment type as shared.
+	 */
+	return (MAP_SHARED);
+}
+
+/* ARGSUSED */
+static int
+segumap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
+{
+	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
+
+	*vpp = NULL;
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
+{
+	if (behav == MADV_PURGE) {
+		/* Purge does not make sense for this mapping */
+		return (EINVAL);
+	}
+	/* Indicate success for everything else. */
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+segumap_dump(struct seg *seg)
+{
+	/*
+	 * Since this is a mapping to share kernel data with userspace, nothing
+	 * additional should be dumped.
+	 */
+}
+
+/* ARGSUSED */
+static int
+segumap_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
+    enum lock_type type, enum seg_rw rw)
+{
+	return (ENOTSUP);
+}
+
+/* ARGSUSED */
+static int
+segumap_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
+{
+	return (ENOTSUP);
+}
+
+static int
+segumap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
+{
+	segumap_data_t *sud = (segumap_data_t *)seg->s_data;
+
+	memidp->val[0] = (uintptr_t)sud->sud_kaddr;
+	memidp->val[1] = (uintptr_t)(addr - seg->s_base);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+segumap_capable(struct seg *seg, segcapability_t capability)
+{
+	/* no special capabilities */
+	return (0);
+}
diff --git a/usr/src/uts/common/vm/seg_umap.h b/usr/src/uts/common/vm/seg_umap.h
new file mode 100644
index 0000000000..bcf7447509
--- /dev/null
+++ b/usr/src/uts/common/vm/seg_umap.h
@@ -0,0 +1,43 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef	_VM_SEG_UMAP_H
+#define	_VM_SEG_UMAP_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct segumap_crargs {
+	caddr_t	kaddr;		/* kernel address; presumably what is mapped */
+	uchar_t	prot;		/* protection */
+	uchar_t	maxprot;	/* maximum protection */
+} segumap_crargs_t;
+
+typedef struct segumap_data {
+	krwlock_t	sud_lock;	/* presumably guards this struct */
+	caddr_t		sud_kaddr;	/* kernel address; memid val[0] */
+	uchar_t		sud_prot;	/* presumably current protection */
+	size_t		sud_softlockcnt; /* presumably softlock count */
+	boolean_t	sud_loaded;	/* meaning not visible here; confirm */
+} segumap_data_t;
+
+extern int segumap_create(struct seg *, void *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VM_SEG_UMAP_H */
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 875dec7fe9..f143c1e464 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -7308,7 +7308,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
 	vpp = svd->vpage;
 	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
 	bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
-	    ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+	    ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+	    ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
 
 	if (attr) {
 		pageprot = attr & ~(SHARED|PRIVATE);
@@ -7333,11 +7334,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
 			vpp = &svd->vpage[seg_page(seg, addr)];
 
 	} else if (svd->vp && svd->amp == NULL &&
-	    (flags & MS_INVALIDATE) == 0) {
+	    (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
 
 		/*
-		 * No attributes, no anonymous pages and MS_INVALIDATE flag
-		 * is not on, just use one big request.
+		 * No attributes, no anonymous pages and MS_INVAL* flags
+		 * are not on, just use one big request.
 		 */
 		err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
 		    bflags, svd->cred, NULL);
@@ -7389,7 +7390,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
 		 * might race in and lock the page after we unlock and before
 		 * we do the PUTPAGE, then PUTPAGE simply does nothing.
 		 */
-		if (flags & MS_INVALIDATE) {
+		if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
 			if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 					page_unlock(pp);
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 4fd32a3f4a..01db9b23d7 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -788,14 +788,21 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
 	pgcnt_t pswap_pages = 0;
 	proc_t *p = curproc;
 
-	if (zone != NULL && takemem) {
+	if (zone != NULL) {
 		/* test zone.max-swap resource control */
 		mutex_enter(&p->p_lock);
 		if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
 			mutex_exit(&p->p_lock);
-			atomic_add_64(&zone->zone_anon_alloc_fail, 1);
+
+			if (takemem)
+				atomic_add_64(&zone->zone_anon_alloc_fail, 1);
+
 			return (0);
 		}
+
+		if (!takemem)
+			rctl_decr_swap(zone, ptob(npages));
+
 		mutex_exit(&p->p_lock);
 	}
 	mutex_enter(&anoninfo_lock);
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index bb5a96eb0f..b0a5e7fb33 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -57,6 +57,7 @@
 #include <sys/debug.h>
 #include <sys/tnf_probe.h>
 #include <sys/vtrace.h>
+#include <sys/ddi.h>
 
 #include <vm/hat.h>
 #include <vm/as.h>
@@ -848,8 +849,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 	struct seg *segsav;
 	int as_lock_held;
 	klwp_t *lwp = ttolwp(curthread);
-
-
+	zone_t *zonep = curzone;
 
 retry:
 	/*
@@ -885,6 +885,22 @@ retry:
 		if (as == &kas)
 			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 		CPU_STATS_EXIT_K();
+		if (zonep->zone_pg_flt_delay != 0) {
+			/*
+			 * The zone in which this process is running is
+			 * currently over its physical memory cap. Throttle
+			 * page faults to help the user-land memory capper
+			 * catch up. Note that drv_usectohz() rounds up.
+			 */
+			atomic_add_64(&zonep->zone_pf_throttle, 1);
+			atomic_add_64(&zonep->zone_pf_throttle_usec,
+			    zonep->zone_pg_flt_delay);
+			if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) {
+				drv_usecwait(zonep->zone_pg_flt_delay);
+			} else {
+				delay(drv_usectohz(zonep->zone_pg_flt_delay));
+			}
+		}
 		break;
 	}
 
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 1b8d12eb8d..a206320a30 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
@@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags)
 				page_io_unlock(pp);
 				page_unlock(pp);
 			}
-		} else if (flags & B_INVAL) {
+		} else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+			/*
+			 * If B_INVALCURONLY is set, then we handle that case
+			 * in the next conditional if hat_page_is_mapped()
+			 * indicates that there are no additional mappings
+			 * to the page.
+			 */
+
 			/*
 			 * XXX - Failed writes with B_INVAL set are
 			 * not handled appropriately.
@@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags)
 }
 
 /*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
  * operation and is only to be considered if it doesn't involve any
  * waiting here.  B_TRUNC indicates that the file is being truncated
  * and so no i/o needs to be done. B_FORCE indicates that the page
@@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags)
 	 * If we want to free or invalidate the page then
 	 * we need to unload it so that anyone who wants
 	 * it will have to take a minor fault to get it.
+	 * If we are only invalidating the page for the
+	 * current process, then pass in a different flag.
 	 * Otherwise, we're just writing the page back so we
 	 * need to sync up the hardwre and software mod bit to
 	 * detect any future modifications.  We clear the
 	 * software mod bit when we put the page on the dirty
 	 * list.
 	 */
-	if (flags & (B_INVAL | B_FREE)) {
+	if (flags & B_INVALCURONLY) {
+		(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+	} else if (flags & (B_INVAL | B_FREE)) {
 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 	} else {
 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags)
 		 * list after all.
 		 */
 		page_io_unlock(pp);
-		if (flags & B_INVAL) {
+		if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
 			/*LINTED: constant in conditional context*/
 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
 		} else if (flags & B_FREE) {
@@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags)
 			 * of VOP_PUTPAGE() who prefer freeing the
 			 * page _only_ if no one else is accessing it.
 			 * E.g. segmap_release()
+			 * We also take this path for B_INVALCURONLY and
+			 * let page_release call VN_DISPOSE if no one else is
+			 * using the page.
 			 *
 			 * The above hat_ismod() check is useless because:
 			 * (1) we may not be holding SE_EXCL lock;
@@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags)
 	 * We'll detect the fact that they used it when the
 	 * i/o is done and avoid freeing the page.
 	 */
-	if (flags & B_FREE)
+	if (flags & (B_FREE | B_INVALCURONLY))
 		page_downgrade(pp);
 
 
diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c
index 1a28c04357..2a008e114b 100644
--- a/usr/src/uts/common/vm/vm_swap.c
+++ b/usr/src/uts/common/vm/vm_swap.c
@@ -18,6 +18,11 @@
  *
  * CDDL HEADER END
  */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
 /*
  * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved.
  */
@@ -625,7 +630,18 @@ swapctl(int sc_cmd, void *sc_arg, int *rv)
 			return (0);
 		}
 beginning:
+		mutex_enter(&swapinfo_lock);
 		tmp_nswapfiles = nswapfiles;
+		mutex_exit(&swapinfo_lock);
+
+		/*
+		 * Return early if there are no swap entries to report:
+		 */
+		if (tmp_nswapfiles < 1) {
+			*rv = 0;
+			return (0);
+		}
+
 		/* Return an error if not enough space for the whole table. */
 		if (length < tmp_nswapfiles)
 			return (ENOMEM);
@@ -920,7 +936,18 @@ swapctl32(int sc_cmd, void *sc_arg, int *rv)
 			return (0);
 		}
 beginning:
+		mutex_enter(&swapinfo_lock);
 		tmp_nswapfiles = nswapfiles;
+		mutex_exit(&swapinfo_lock);
+
+		/*
+		 * Return early if there are no swap entries to report:
+		 */
+		if (tmp_nswapfiles < 1) {
+			*rv = 0;
+			return (0);
+		}
+
 		/* Return an error if not enough space for the whole table. */
 		if (length < tmp_nswapfiles)
 			return (ENOMEM);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 57166b4e63..8b9fd0d7a3 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
  */
 
 /*
+ * Copyright 2016, Joyent, Inc.
+ */
+
+/*
  * vm_usage
  *
  * This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
  *	For accurate counting of map-shared and COW-shared pages.
  *
  *    - visited private anons (refcnt > 1) for each collective.
- *	(entity->vme_anon_hash)
+ *	(entity->vme_anon)
  *	For accurate counting of COW-shared pages.
  *
  * The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
 #include <sys/vm_usage.h>
 #include <sys/zone.h>
 #include <sys/sunddi.h>
+#include <sys/sysmacros.h>
 #include <sys/avl.h>
 #include <vm/anon.h>
 #include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
 } vmu_object_t;
 
 /*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+	avl_node_t vma_node;	/* linkage in an entity's vme_anon tree */
+	uintptr_t vma_addr;	/* anon struct address; the tree's sort key */
+} vmu_anon_t;
+
+/*
  * Entity by which to count results.
  *
  * The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
 	struct vmu_entity *vme_next_calc;
 	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
 	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
-	mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
+	avl_tree_t	vme_anon;	 /* COW anons visited for entity */
 	vmusage_t	vme_result;	 /* identifies entity and results */
 } vmu_entity_t;
 
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
 }
 
 /*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+	const uintptr_t a = ((const vmu_anon_t *)lhs)->vma_addr;
+	const uintptr_t b = ((const vmu_anon_t *)rhs)->vma_addr;
+
+	/* Order nodes by vma_addr, yielding -1, 1 or 0. */
+	if (a < b)
+		return (-1);
+	if (a > b)
+		return (1);
+	return (0);
+}
+
+/*
  * Save a bound on the free list.
  */
 static void
@@ -363,13 +393,18 @@ static void
 vmu_free_entity(mod_hash_val_t val)
 {
 	vmu_entity_t *entity = (vmu_entity_t *)val;
+	vmu_anon_t *anon;
+	void *cookie = NULL;
 
 	if (entity->vme_vnode_hash != NULL)
 		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
 	if (entity->vme_amp_hash != NULL)
 		i_mod_hash_clear_nosync(entity->vme_amp_hash);
-	if (entity->vme_anon_hash != NULL)
-		i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+	while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+		kmem_free(anon, sizeof (vmu_anon_t));
+
+	avl_destroy(&entity->vme_anon);
 
 	entity->vme_next = vmu_data.vmu_free_entities;
 	vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
 		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 		    sizeof (struct anon_map));
 
-	if (entity->vme_anon_hash == NULL)
-		entity->vme_anon_hash = mod_hash_create_ptrhash(
-		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
-		    mod_hash_null_valdtor, sizeof (struct anon));
+	VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+	avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+	    offsetof(struct vmu_anon, vma_node));
 
 	entity->vme_next = vmu_data.vmu_entities;
 	vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
 
 	zone->vmz_id = id;
 
-	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+	if ((vmu_data.vmu_calc_flags &
+	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
 		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 
 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
 }
 
 static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
 {
-	int ret;
-	caddr_t val;
+	vmu_anon_t anon, *ap;
 
-	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
-	    (mod_hash_val_t *)&val);
+	anon.vma_addr = (uintptr_t)key;
 
-	if (ret == 0)
+	if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
 		return (0);
 
-	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
-	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+	ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+	ap->vma_addr = (uintptr_t)key;
 
-	ASSERT(ret == 0);
+	avl_add(&entity->vme_anon, ap);
 
 	return (1);
 }
@@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
 			next = AVL_NEXT(tree, next);
 			continue;
 		}
+
+		ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 		bound_type = next->vmb_type;
 		index = next->vmb_start;
 		while (index <= next->vmb_end) {
@@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
 
 			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 			    (page = page_exists(vn, off)) != NULL) {
-				page_type = VMUSAGE_BOUND_INCORE;
+				if (PP_ISFREE(page))
+					page_type = VMUSAGE_BOUND_NOT_INCORE;
+				else
+					page_type = VMUSAGE_BOUND_INCORE;
 				if (page->p_szc > 0) {
 					pgcnt = page_get_pagecnt(page->p_szc);
 					pgshft = page_get_shift(page->p_szc);
@@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
 			} else {
 				page_type = VMUSAGE_BOUND_NOT_INCORE;
 			}
+
 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 				next->vmb_type = page_type;
+				bound_type = page_type;
 			} else if (next->vmb_type != page_type) {
 				/*
 				 * If current bound type does not match page
@@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
 			continue;
 		}
 
+		ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 		bound_type = next->vmb_type;
 		index = next->vmb_start;
 		while (index <= next->vmb_end) {
@@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
 
 			if (vnode->v_pages != NULL &&
 			    (page = page_exists(vnode, ptob(index))) != NULL) {
-				page_type = VMUSAGE_BOUND_INCORE;
+				if (PP_ISFREE(page))
+					page_type = VMUSAGE_BOUND_NOT_INCORE;
+				else
+					page_type = VMUSAGE_BOUND_INCORE;
 				if (page->p_szc > 0) {
 					pgcnt = page_get_pagecnt(page->p_szc);
 					pgshft = page_get_shift(page->p_szc);
@@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
 			} else {
 				page_type = VMUSAGE_BOUND_NOT_INCORE;
 			}
+
 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 				next->vmb_type = page_type;
+				bound_type = page_type;
 			} else if (next->vmb_type != page_type) {
 				/*
 				 * If current bound type does not match page
@@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
 			}
 
 			/*
+			 * Pages on the free list aren't counted for the rss.
+			 */
+			if (PP_ISFREE(page))
+				continue;
+
+			/*
 			 * Assume anon structs with a refcnt
 			 * of 1 are not COW shared, so there
 			 * is no reason to track them per entity.
@@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
 				 * Track COW anons per entity so
 				 * they are not double counted.
 				 */
-				if (vmu_find_insert_anon(entity->vme_anon_hash,
-				    (caddr_t)ap) == 0)
+				if (vmu_find_insert_anon(entity, ap) == 0)
 					continue;
 
 				result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p)
 		entities = tmp;
 	}
 	if (vmu_data.vmu_calc_flags &
-	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
-	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+	    VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+	    VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
 	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
 	    VMUSAGE_ALL_EUSERS)) {
 		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1594,8 +1647,7 @@ vmu_free_extra()
 			mod_hash_destroy_hash(te->vme_vnode_hash);
 		if (te->vme_amp_hash != NULL)
 			mod_hash_destroy_hash(te->vme_amp_hash);
-		if (te->vme_anon_hash != NULL)
-			mod_hash_destroy_hash(te->vme_anon_hash);
+		VERIFY(avl_first(&te->vme_anon) == NULL);
 		kmem_free(te, sizeof (vmu_entity_t));
 	}
 	while (vmu_data.vmu_free_zones != NULL) {
@@ -1739,12 +1791,34 @@ vmu_cache_rele(vmu_cache_t *cache)
 }
 
 /*
+ * When new data is calculated, update the phys_mem rctl usage value in the
+ * zones.
+ */
+static void
+vmu_update_zone_rctls(vmu_cache_t *cache)
+{
+	size_t i;
+
+	for (i = 0; i < cache->vmc_nresults; i++) {
+		vmusage_t *r = &cache->vmc_results[i];
+		zone_t *zone;
+		/* Skip anything that is not a result for one specific zone. */
+		if (r->vmu_type != VMUSAGE_ZONE || r->vmu_zoneid == ALL_ZONES)
+			continue;
+		if ((zone = zone_find_by_id(r->vmu_zoneid)) == NULL)
+			continue;
+		zone->zone_phys_mem = r->vmu_rss_all;
+		zone_rele(zone);
+	}
+}
+
+/*
  * Copy out the cached results to a caller.  Inspect the callers flags
  * and zone to determine which cached results should be copied.
  */
 static int
 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
-    uint_t flags, int cpflg)
+    uint_t flags, id_t req_zone_id, int cpflg)
 {
 	vmusage_t *result, *out_result;
 	vmusage_t dummy;
@@ -1763,7 +1837,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
 	/* figure out what results the caller is interested in. */
 	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
 		types |= VMUSAGE_SYSTEM;
-	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
 		types |= VMUSAGE_ZONE;
 	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
 	    VMUSAGE_COL_PROJECTS))
@@ -1826,26 +1900,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
 				continue;
 		}
 
-		/* Skip "other zone" results if not requested */
-		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
-			if (result->vmu_type == VMUSAGE_ZONE &&
-			    (flags & VMUSAGE_ALL_ZONES) == 0)
-				continue;
-			if (result->vmu_type == VMUSAGE_PROJECTS &&
-			    (flags & (VMUSAGE_ALL_PROJECTS |
-			    VMUSAGE_COL_PROJECTS)) == 0)
-				continue;
-			if (result->vmu_type == VMUSAGE_TASKS &&
-			    (flags & VMUSAGE_ALL_TASKS) == 0)
-				continue;
-			if (result->vmu_type == VMUSAGE_RUSERS &&
-			    (flags & (VMUSAGE_ALL_RUSERS |
-			    VMUSAGE_COL_RUSERS)) == 0)
-				continue;
-			if (result->vmu_type == VMUSAGE_EUSERS &&
-			    (flags & (VMUSAGE_ALL_EUSERS |
-			    VMUSAGE_COL_EUSERS)) == 0)
+		if (result->vmu_type == VMUSAGE_ZONE &&
+		    flags & VMUSAGE_A_ZONE) {
+			/* Skip non-requested zone results */
+			if (result->vmu_zoneid != req_zone_id)
 				continue;
+		} else {
+			/* Skip "other zone" results if not requested */
+			if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+				if (result->vmu_type == VMUSAGE_ZONE &&
+				    (flags & VMUSAGE_ALL_ZONES) == 0)
+					continue;
+				if (result->vmu_type == VMUSAGE_PROJECTS &&
+				    (flags & (VMUSAGE_ALL_PROJECTS |
+				    VMUSAGE_COL_PROJECTS)) == 0)
+					continue;
+				if (result->vmu_type == VMUSAGE_TASKS &&
+				    (flags & VMUSAGE_ALL_TASKS) == 0)
+					continue;
+				if (result->vmu_type == VMUSAGE_RUSERS &&
+				    (flags & (VMUSAGE_ALL_RUSERS |
+				    VMUSAGE_COL_RUSERS)) == 0)
+					continue;
+				if (result->vmu_type == VMUSAGE_EUSERS &&
+				    (flags & (VMUSAGE_ALL_EUSERS |
+				    VMUSAGE_COL_EUSERS)) == 0)
+					continue;
+			}
 		}
 		count++;
 		if (out_result != NULL) {
@@ -1901,10 +1982,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
 	int cacherecent = 0;
 	hrtime_t now;
 	uint_t flags_orig;
+	id_t req_zone_id;
 
 	/*
 	 * Non-global zones cannot request system wide and/or collated
-	 * results, or the system result, so munge the flags accordingly.
+	 * results, or the system result, or usage of another zone, so munge
+	 * the flags accordingly.
 	 */
 	flags_orig = flags;
 	if (curproc->p_zone != global_zone) {
@@ -1924,6 +2007,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
 			flags &= ~VMUSAGE_SYSTEM;
 			flags |= VMUSAGE_ZONE;
 		}
+		if (flags & VMUSAGE_A_ZONE) {
+			flags &= ~VMUSAGE_A_ZONE;
+			flags |= VMUSAGE_ZONE;
+		}
 	}
 
 	/* Check for unknown flags */
@@ -1934,6 +2021,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
 	if ((flags & VMUSAGE_MASK) == 0)
 		return (set_errno(EINVAL));
 
+	/* If requesting results for a specific zone, get the zone ID */
+	if (flags & VMUSAGE_A_ZONE) {
+		size_t bufsize;
+		vmusage_t zreq;
+
+		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+			return (set_errno(EFAULT));
+		/* Requested zone ID is passed in buf, so 0 len not allowed */
+		if (bufsize == 0)
+			return (set_errno(EINVAL));
+		if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+			return (set_errno(EFAULT));
+		req_zone_id = zreq.vmu_id;
+	}
+
 	mutex_enter(&vmu_data.vmu_lock);
 	now = gethrtime();
 
@@ -1953,7 +2055,7 @@ start:
 			mutex_exit(&vmu_data.vmu_lock);
 
 			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
-			    cpflg);
+			    req_zone_id, cpflg);
 			mutex_enter(&vmu_data.vmu_lock);
 			vmu_cache_rele(cache);
 			if (vmu_data.vmu_pending_waiters > 0)
@@ -2009,8 +2111,11 @@ start:
 
 		mutex_exit(&vmu_data.vmu_lock);
 
+		/* update zone's phys. mem. rctl usage */
+		vmu_update_zone_rctls(cache);
 		/* copy cache */
-		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+		ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+		    req_zone_id, cpflg);
 		mutex_enter(&vmu_data.vmu_lock);
 		vmu_cache_rele(cache);
 		mutex_exit(&vmu_data.vmu_lock);
@@ -2030,3 +2135,185 @@ start:
 	vmu_data.vmu_pending_waiters--;
 	goto start;
 }
+
+#if defined(__x86)
+/*
+ * Attempt to invalidate all of the pages in the mapping for the given process.
+ */
+static void
+map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
+{
+	page_t		*pp;
+	size_t		psize;
+	u_offset_t	off;
+	caddr_t		eaddr;
+	struct vnode	*vp;
+	struct segvn_data *svd;
+	struct hat	*victim_hat;
+
+	ASSERT((addr + size) <= (seg->s_base + seg->s_size));
+
+	victim_hat = p->p_as->a_hat;	/* hat of the target process */
+	svd = (struct segvn_data *)seg->s_data;
+	vp = svd->vp;
+	psize = page_get_pagesize(seg->s_szc);
+
+	off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+	for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
+		pp = page_lookup_nowait(vp, off, SE_SHARED);
+
+		if (pp != NULL) {
+			/* The following logic is based on pvn_getdirty(). */
+			/* Skip pages with a non-zero lock or COW count. */
+			if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+				page_unlock(pp);
+				continue;
+			}
+
+			page_io_lock(pp);
+			hat_page_inval(pp, 0, victim_hat);
+			page_io_unlock(pp);
+
+			/*
+			 * For B_INVALCURONLY-style handling we let
+			 * page_release call VN_DISPOSE if no one else is using
+			 * the page.
+			 *
+			 * A hat_ismod() check would be useless because:
+			 * (1) we are not holding the SE_EXCL lock
+			 * (2) we've not unloaded _all_ translations
+			 *
+			 * Let page_release() do the heavy-lifting.
+			 */
+			(void) page_release(pp, 1);
+		}
+	}
+}
+
+/*
+ * vm_map_inval()
+ *
+ * Invalidate as many pages as possible within the given mapping for the given
+ * process. addr is expected to be the base address of the mapping and size is
+ * the length of the mapping. In some cases a mapping will encompass an
+ * entire segment, but at least for anon or stack mappings, these will be
+ * regions within a single large segment. Thus, the invalidation is oriented
+ * around a single mapping and not an entire segment.
+ *
+ * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
+ * this code is only applicable to x86.
+ */
+int
+vm_map_inval(pid_t pid, caddr_t addr, size_t size)
+{
+	int ret;
+	int error = 0;
+	proc_t *p;		/* target proc */
+	struct as *as;		/* target proc's address space */
+	struct seg *seg;	/* working segment */
+
+	if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
+		return (set_errno(EPERM));
+
+	/* If addr is not page-aligned it cannot be a mapping base: EINVAL */
+	if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
+		return (set_errno(EINVAL));
+
+again:
+	mutex_enter(&pidlock);
+	p = prfind(pid);
+	if (p == NULL) {
+		mutex_exit(&pidlock);
+		return (set_errno(ESRCH));
+	}
+
+	mutex_enter(&p->p_lock);
+	mutex_exit(&pidlock);
+
+	if (panicstr != NULL) {
+		mutex_exit(&p->p_lock);
+		return (0);
+	}
+
+	as = p->p_as;
+
+	/*
+	 * Try to set P_PR_LOCK - prevents process "changing shape"
+	 * - blocks fork
+	 * - blocks sigkill
+	 * - cannot be a system proc
+	 * - must be fully created proc
+	 */
+	ret = sprtrylock_proc(p);
+	if (ret == -1) {
+		/* Process in invalid state */
+		mutex_exit(&p->p_lock);
+		return (set_errno(ESRCH));
+	}
+
+	if (ret == 1) {
+		/*
+		 * P_PR_LOCK is already set. Wait and try again. This also
+		 * drops p_lock so p may no longer be valid since the proc may
+		 * have exited.
+		 */
+		sprwaitlock_proc(p);
+		goto again;
+	}
+
+	/* P_PR_LOCK is now set */
+	mutex_exit(&p->p_lock);
+
+	AS_LOCK_ENTER(as, RW_READER);
+	if ((seg = as_segat(as, addr)) == NULL) {
+		AS_LOCK_EXIT(as);
+		mutex_enter(&p->p_lock);
+		sprunlock(p);
+		return (set_errno(ENOMEM));
+	}
+
+	/*
+	 * The invalidation behavior only makes sense for vnode-backed segments.
+	 */
+	if (seg->s_ops != &segvn_ops) {
+		AS_LOCK_EXIT(as);
+		mutex_enter(&p->p_lock);
+		sprunlock(p);
+		return (0);
+	}
+
+	/*
+	 * The range must end inside the segment (overflow-safe comparison).
+	 */
+	if (size > seg->s_size - (size_t)(addr - seg->s_base)) {
+		AS_LOCK_EXIT(as);
+		mutex_enter(&p->p_lock);
+		sprunlock(p);
+		return (set_errno(EINVAL));
+	}
+
+	/*
+	 * Don't use MS_INVALCURPROC flag here since that would eventually
+	 * initiate hat invalidation based on curthread. Since we're doing this
+	 * on behalf of a different process, that would erroneously invalidate
+	 * our own process mappings.
+	 */
+	error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
+	if (error == 0) {
+		/*
+		 * Since we didn't invalidate during the sync above, we now
+		 * try to invalidate all of the pages in the mapping.
+		 */
+		map_inval(p, seg, addr, size);
+	}
+	AS_LOCK_EXIT(as);
+
+	mutex_enter(&p->p_lock);
+	sprunlock(p);
+
+	if (error)
+		(void) set_errno(error);
+	return (error);
+}
+#endif