Diffstat (limited to 'usr/src/uts/common/io')
144 files changed, 39398 insertions, 3188 deletions
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 3554e111c1..47840951d3 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -32,39 +32,69 @@ * module. The hash key is the linkid associated with the link * aggregation group. * - * A set of MAC ports are associated with each association group. + * Each aggregation contains a set of ports. The port is represented + * by the aggr_port_t structure. A port consists of a single MAC + * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying + * MAC. This client is used by the aggr to send and receive LACP + * traffic. Each port client takes on the same MAC unicast address -- + * the address of the aggregation itself (taken from the first port by + * default). * - * Aggr pseudo TX rings - * -------------------- - * The underlying ports (NICs) in an aggregation can have TX rings. To - * enhance aggr's performance, these TX rings are made available to the - * aggr layer as pseudo TX rings. The concept of pseudo rings are not new. - * They are already present and implemented on the RX side. It is called - * as pseudo RX rings. The same concept is extended to the TX side where - * each TX ring of an underlying port is reflected in aggr as a pseudo - * TX ring. Thus each pseudo TX ring will map to a specific hardware TX - * ring. Even in the case of a NIC that does not have a TX ring, a pseudo - * TX ring is given to the aggregation layer. + * The MAC client that hangs off each aggr port is not your typical + * MAC client. Not only does it have exclusive control of the MAC, but + * it also has no Tx or Rx SRSes. An SRS is designed to queue and + * fanout traffic among L4 protocols; but the aggr is an intermediary, + * not a consumer. Instead of using SRSes, the aggr puts the + * underlying hardware rings into passthru mode and ships packets up + * via a direct call to aggr_recv_cb(). This allows aggr to enforce + * LACP while passing all other traffic up to clients of the aggr. + * + * Pseudo Rx Groups and Rings + * -------------------------- + * + * It is imperative for client performance that the aggr provide as + * many MAC groups as possible. In order to use the underlying HW + * resources, aggr creates pseudo groups to aggregate the underlying + * HW groups. Every HW group gets mapped to a pseudo group; and every + * HW ring in that group gets mapped to a pseudo ring. The pseudo + * group at index 0 combines all the HW groups at index 0 from each + * port, etc. The aggr's MAC then creates normal MAC groups and rings + * out of these pseudo groups and rings to present to the aggr's + * clients. To the clients, the aggr's groups and rings are absolutely + * no different than a NIC's groups or rings. + * + * Pseudo Tx Rings + * --------------- + * + * The underlying ports (NICs) in an aggregation can have Tx rings. To + * enhance aggr's performance, these Tx rings are made available to + * the aggr layer as pseudo Tx rings. The concept of pseudo rings are + * not new. They are already present and implemented on the Rx side. + * The same concept is extended to the Tx side where each Tx ring of + * an underlying port is reflected in aggr as a pseudo Tx ring. Thus + * each pseudo Tx ring will map to a specific hardware Tx ring. 
Even + * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring + * is given to the aggregation layer. * * With this change, the outgoing stack depth looks much better: * * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() * - * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings: + * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: * SRS_TX_AGGR and SRS_TX_BW_AGGR. * * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine - * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX + * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx * ring belonging to a port on which the packet has to be sent. * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 - * policy and then uses the fanout_hint passed to it to pick a TX ring from + * policy and then uses the fanout_hint passed to it to pick a Tx ring from * the selected port. * * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where * bandwidth limit is applied first on the outgoing packet and the packets * allowed to go out would call mac_tx_aggr_mode() to send the packet on a - * particular TX ring. + * particular Tx ring. */ #include <sys/types.h> @@ -121,10 +151,12 @@ static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static int aggr_pseudo_disable_intr(mac_intr_handle_t); static int aggr_pseudo_enable_intr(mac_intr_handle_t); -static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); -static void aggr_pseudo_stop_ring(mac_ring_driver_t); +static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); static int aggr_addmac(void *, const uint8_t *); static int aggr_remmac(void *, const uint8_t *); +static int aggr_addvlan(mac_group_driver_t, uint16_t); +static int aggr_remvlan(mac_group_driver_t, uint16_t); static mblk_t *aggr_rx_poll(void *, int); static void aggr_fill_ring(void *, mac_ring_type_t, const int, const int, mac_ring_info_t *, mac_ring_handle_t); @@ -325,6 +357,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) return (B_FALSE); } + mutex_enter(&grp->lg_stat_lock); if (grp->lg_ifspeed == 0) { /* * The group inherits the speed of the first link being @@ -338,8 +371,10 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) * the group link speed, as per 802.3ad. Since it is * not, the attach is cancelled. */ + mutex_exit(&grp->lg_stat_lock); return (B_FALSE); } + mutex_exit(&grp->lg_stat_lock); grp->lg_nattached_ports++; @@ -348,7 +383,9 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) */ if (grp->lg_link_state != LINK_STATE_UP) { grp->lg_link_state = LINK_STATE_UP; + mutex_enter(&grp->lg_stat_lock); grp->lg_link_duplex = LINK_DUPLEX_FULL; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -360,9 +397,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) aggr_grp_multicst_port(port, B_TRUE); /* - * Set port's receive callback + * The port client doesn't have an Rx SRS; instead of calling + * mac_rx_set() we set the client's flow callback directly. + * This datapath is used only when the port's driver doesn't + * support MAC_CAPAB_RINGS. Drivers with ring support will + * deliver traffic to the aggr via ring passthru. 
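A condensed sketch of the two receive paths described above may help. This is illustrative only and not part of the changeset; the helper name aggr_setup_port_rx() is hypothetical, but the two calls it makes are exactly the ones used by aggr_grp_attach_port() and aggr_add_pseudo_rx_ring() in this diff.

/*
 * Sketch only: choose the Rx delivery path for one port. Ports whose
 * drivers lack MAC_CAPAB_RINGS deliver through the client flow
 * callback; ports with HW rings are placed in passthru mode so each
 * ring hands packets directly to aggr_recv_cb().
 */
static void
aggr_setup_port_rx(aggr_port_t *port, aggr_pseudo_rx_ring_t *ring,
    mac_ring_handle_t hw_rh)
{
        if (hw_rh == NULL) {
                /* No HW ring: use the port client's flow callback. */
                mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
        } else {
                /* HW ring: bypass SRS processing via passthru. */
                mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
                    (void *)port, (mac_resource_handle_t)ring);
        }
}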
*/ - mac_rx_set(port->lp_mch, aggr_recv_cb, port); + mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -392,7 +433,7 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_clear(port->lp_mch); + mac_client_clear_flow_cb(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); @@ -406,9 +447,11 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) grp->lg_nattached_ports--; if (grp->lg_nattached_ports == 0) { /* the last attached MAC port of the group is being detached */ - grp->lg_ifspeed = 0; grp->lg_link_state = LINK_STATE_DOWN; + mutex_enter(&grp->lg_stat_lock); + grp->lg_ifspeed = 0; grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -529,26 +572,27 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, zoneid_t port_zoneid = ALL_ZONES; int err; - /* The port must be int the same zone as the aggregation. */ + /* The port must be in the same zone as the aggregation. */ if (zone_check_datalink(&port_zoneid, port_linkid) != 0) port_zoneid = GLOBAL_ZONEID; if (grp->lg_zoneid != port_zoneid) return (EBUSY); /* - * lg_mh could be NULL when the function is called during the creation - * of the aggregation. + * If we are creating the aggr, then there is no MAC handle + * and thus no perimeter to hold. If we are adding a port to + * an existing aggr, then the perimiter of the aggr's MAC must + * be held. */ ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); - /* create new port */ err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); mac_perim_enter_by_mh(port->lp_mh, &mph); - /* add port to list of group constituent ports */ + /* Add the new port to the end of the list. */ cport = &grp->lg_ports; while (*cport != NULL) cport = &((*cport)->lp_next); @@ -630,6 +674,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; ring->arr_port = port; + ring->arr_grp = rx_grp; rx_grp->arg_ring_cnt++; /* @@ -640,10 +685,15 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; } else { - mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, - mac_find_ring(rx_grp->arg_gh, j)); + /* + * This must run after the MAC is registered. 
+ */ + ASSERT3P(ring->arr_rh, !=, NULL); + mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, + (void *)port, (mac_resource_handle_t)ring); } return (err); } @@ -654,11 +704,9 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, static void aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) { - aggr_pseudo_rx_ring_t *ring; - int j; + for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { + aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; - for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { - ring = rx_grp->arg_rings + j; if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || ring->arr_hw_rh != hw_rh) { continue; @@ -669,134 +717,140 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; - mac_hwring_teardown(hw_rh); + mac_hwring_clear_passthru(hw_rh); break; } } /* - * This function is called to create pseudo rings over the hardware rings of - * the underlying device. Note that there is a 1:1 mapping between the pseudo - * RX rings of the aggr and the hardware rings of the underlying port. + * Create pseudo rings over the HW rings of the port. + * + * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. + * + * o Program existing unicast filters on the pseudo group into the HW group. + * + * o Program existing VLAN filters on the pseudo group into the HW group. */ static int aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr, *a; mac_perim_handle_t pmph; - int hw_rh_cnt, i = 0, j; + aggr_vlan_t *avp; + uint_t hw_rh_cnt, i; int err = 0; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * This function must be called after the aggr registers its mac - * and its RX group has been initialized. + * This function must be called after the aggr registers its + * MAC and its Rx groups have been initialized. */ ASSERT(rx_grp->arg_gh != NULL); /* - * Get the list the the underlying HW rings. + * Get the list of the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); - - if (port->lp_hwgh != NULL) { - /* - * Quiesce the HW ring and the mac srs on the ring. Note - * that the HW ring will be restarted when the pseudo ring - * is started. At that time all the packets will be - * directly passed up to the pseudo RX ring and handled - * by mac srs created over the pseudo RX ring. - */ - mac_rx_client_quiesce(port->lp_mch); - mac_srs_perm_quiesce(port->lp_mch, B_TRUE); - } + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, + &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); /* - * Add all the unicast addresses to the newly added port. + * Add existing VLAN and unicast address filters to the port. 
*/ + for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) + goto err; + } + for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { - if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) - break; + if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) + goto err; } - for (i = 0; err == 0 && i < hw_rh_cnt; i++) + for (i = 0; i < hw_rh_cnt; i++) { err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); + if (err != 0) + goto err; + } - if (err != 0) { - for (j = 0; j < i; j++) - aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + mac_perim_exit(pmph); + return (0); + +err: + ASSERT(err != 0); + + for (uint_t j = 0; j < i; j++) + aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + + for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) + aggr_port_remmac(port, g_idx, a->aua_addr); - for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) - aggr_port_remmac(port, a->aua_addr); + if (avp != NULL) + avp = list_prev(&rx_grp->arg_vlans, avp); - if (port->lp_hwgh != NULL) { - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); - port->lp_hwgh = NULL; + for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { + int err2; + + if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err2); } - } else { - port->lp_rx_grp_added = B_TRUE; } -done: + + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); return (err); } /* - * This function is called by aggr to remove pseudo RX rings over the - * HW rings of the underlying port. + * Destroy the pseudo rings mapping to this port and remove all VLAN + * and unicast filters from this port. Even if there are no underlying + * HW rings we must still remove the unicast filters to take the port + * out of promisc mode. */ static void aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr; - mac_group_handle_t hwgh; mac_perim_handle_t pmph; - int hw_rh_cnt, i; + uint_t hw_rh_cnt; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); + ASSERT3P(rx_grp->arg_gh, !=, NULL); mac_perim_enter_by_mh(port->lp_mh, &pmph); - if (!port->lp_rx_grp_added) - goto done; - - ASSERT(rx_grp->arg_gh != NULL); - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &hwgh, hw_rh, MAC_RING_TYPE_RX); + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, + MAC_RING_TYPE_RX); - /* - * If hw_rh_cnt is 0, it means that the underlying port does not - * support RX rings. Directly return in this case. - */ - for (i = 0; i < hw_rh_cnt; i++) + for (uint_t i = 0; i < hw_rh_cnt; i++) aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) - aggr_port_remmac(port, addr->aua_addr); + aggr_port_remmac(port, g_idx, addr->aua_addr); - if (port->lp_hwgh != NULL) { - port->lp_hwgh = NULL; + for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + int err; - /* - * First clear the permanent-quiesced flag of the RX srs then - * restart the HW ring and the mac srs on the ring. 
Note that - * the HW ring and associated SRS will soon been removed when - * the port is removed from the aggr. - */ - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); + if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err); + } } - port->lp_rx_grp_added = B_FALSE; -done: + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); } @@ -900,8 +954,8 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) /* * Get the list the the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - NULL, hw_rh, MAC_RING_TYPE_TX); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, + MAC_RING_TYPE_TX); /* * Even if the underlying NIC does not have TX rings, we @@ -1006,23 +1060,46 @@ aggr_pseudo_enable_intr(mac_intr_handle_t ih) return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); } +/* + * Start the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to start the underlying HW + * ring. + */ static int -aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) +aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) { - aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; int err; + aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; err = mac_hwring_start(rr_ring->arr_hw_rh); - if (err == 0) - rr_ring->arr_gen = mr_gen; + + if (err != 0) + return (err); + + rr_ring->arr_gen = mr_gen; return (err); } +/* + * Stop the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to stop the underlying HW + * ring. + */ static void -aggr_pseudo_stop_ring(mac_ring_driver_t arg) +aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) { aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; - mac_hwring_stop(rr_ring->arr_hw_rh); + + /* + * The rings underlying the default group must stay up to + * continue receiving LACP traffic. We would normally never + * stop the default Rx rings because of the primary MAC + * client; but aggr's primary MAC client doesn't call + * mac_unicast_add() and thus mi_active is 0 when the last + * non-primary client is deleted. + */ + if (rr_ring->arr_grp->arg_index != 0) + mac_hwring_stop(rr_ring->arr_hw_rh); } /* @@ -1032,13 +1109,15 @@ int aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, laioc_port_t *ports) { - int rc, i, nadded = 0; + int rc; + uint_t port_added = 0; + uint_t grp_added; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; mac_perim_handle_t mph, pmph; - /* get group corresponding to linkid */ + /* Get the aggr corresponding to linkid. */ rw_enter(&aggr_grp_lock, RW_READER); if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), (mod_hash_val_t *)&grp) != 0) { @@ -1048,20 +1127,22 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, AGGR_GRP_REFHOLD(grp); /* - * Hold the perimeter so that the aggregation won't be destroyed. + * Hold the perimeter so that the aggregation can't be destroyed. */ mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); - /* add the specified ports to group */ - for (i = 0; i < nports; i++) { - /* add port to group */ + /* Add the specified ports to the aggr. 
*/ + for (uint_t i = 0; i < nports; i++) { + grp_added = 0; + if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port)) != 0) { goto bail; } + ASSERT(port != NULL); - nadded++; + port_added++; /* check capabilities */ if (!aggr_grp_capab_check(grp, port) || @@ -1078,9 +1159,16 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); if (rc != 0) goto bail; - rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); - if (rc != 0) - goto bail; + + for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { + rc = aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + + if (rc != 0) + goto bail; + + grp_added++; + } mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -1098,7 +1186,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, /* * Turn on the promiscuous mode over the port when it * is requested to be turned on to receive the - * non-primary address over a port, or the promiscous + * non-primary address over a port, or the promiscuous * mode is enabled over the aggr. */ if (grp->lg_promisc || port->lp_prom_addr != NULL) { @@ -1133,17 +1221,33 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded; i++) { + for (uint_t i = 0; i < port_added; i++) { + uint_t grp_remove; + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); + if (grp->lg_started) { mac_perim_enter_by_mh(port->lp_mh, &pmph); (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); mac_perim_exit(pmph); } + aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + + /* + * Only the last port could have a partial set + * of groups added. + */ + grp_remove = (i + 1 == port_added) ? grp_added : + grp->lg_rx_group_count; + + for (uint_t j = 0; j < grp_remove; j++) { + aggr_rem_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + } + (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } @@ -1305,7 +1409,8 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP), KM_SLEEP); grp->lg_tx_blocked_cnt = 0; - bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); + bzero(&grp->lg_rx_groups, + sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); aggr_lacp_init_grp(grp); @@ -1325,11 +1430,48 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_key = key; for (i = 0; i < nports; i++) { - err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); + err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); if (err != 0) goto bail; } + grp->lg_rx_group_count = 1; + + for (i = 0, port = grp->lg_ports; port != NULL; + i++, port = port->lp_next) { + uint_t num_rgroups; + + mac_perim_enter_by_mh(port->lp_mh, &mph); + num_rgroups = mac_get_num_rx_groups(port->lp_mh); + mac_perim_exit(mph); + + /* + * Utilize all the groups in a port. If some ports + * have less groups than others, then traffic destined + * for the same unicast address may be HW classified + * on some ports but SW classified by aggr when + * arriving on other ports. + */ + grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, + num_rgroups); + } + + /* + * There could be cases where the hardware provides more + * groups than aggr can support. 
Make sure we never go above + * the max aggr can support. + */ + grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, + MAX_GROUPS_PER_PORT); + + ASSERT3U(grp->lg_rx_group_count, >, 0); + for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { + grp->lg_rx_groups[i].arg_index = i; + grp->lg_rx_groups[i].arg_untagged = 0; + list_create(&(grp->lg_rx_groups[i].arg_vlans), + sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); + } + /* * If no explicit MAC address was specified by the administrator, * set it to the MAC address of the first port. @@ -1347,7 +1489,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* set the initial group capabilities */ + /* Set the initial group capabilities. */ aggr_grp_capab_set(grp); if ((mac = mac_alloc(MAC_VERSION)) == NULL) { @@ -1382,14 +1524,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, * Update the MAC address of the constituent ports. * None of the port is attached at this time, the link state of the * aggregation will not change. + * + * All ports take on the primary MAC address of the aggr + * (lg_aggr). At this point, none of the ports are attached; + * thus the link state of the aggregation will not change. */ link_state_changed = aggr_grp_update_ports_mac(grp); ASSERT(!link_state_changed); - /* update outbound load balancing policy */ + /* Update outbound load balancing policy. */ aggr_send_update_policy(grp, policy); - /* set LACP mode */ + /* Set LACP mode. */ aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* @@ -1397,12 +1543,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { /* - * Create the pseudo ring for each HW ring of the underlying - * port. Note that this is done after the aggr registers the - * mac. + * Create the pseudo ring for each HW ring of the + * underlying port. Note that this is done after the + * aggr registers its MAC. */ - VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); - VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); + VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), + ==, 0); + + for (i = 0; i < grp->lg_rx_group_count; i++) { + VERIFY3S(aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[i]), ==, 0); + } + if (aggr_port_notify_link(grp, port)) link_state_changed = B_TRUE; @@ -1547,7 +1699,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } for (i = 0; i < ETHER_NSTAT; i++) { stat = i + MACTYPE_STAT_MIN; @@ -1555,7 +1709,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_ether_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_ether_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } grp->lg_nports--; @@ -1680,7 +1836,8 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) * aggr_find_tx_ring() will not return any rings * belonging to it. 
*/ - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, @@ -1785,7 +1942,8 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) (void) aggr_grp_detach_port(grp, port); mac_perim_exit(pmph); aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); aggr_port_delete(port); port = cport; } @@ -1804,6 +1962,10 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) VERIFY(mac_unregister(grp->lg_mh) == 0); grp->lg_mh = NULL; + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { + list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); + } + AGGR_GRP_REFRELE(grp); return (0); } @@ -1886,6 +2048,8 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) aggr_port_t *port; uint_t stat_index; + ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); + /* We only aggregate counter statistics. */ if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { @@ -1954,10 +2118,9 @@ static int aggr_m_stat(void *arg, uint_t stat, uint64_t *val) { aggr_grp_t *grp = arg; - mac_perim_handle_t mph; int rval = 0; - mac_perim_enter_by_mh(grp->lg_mh, &mph); + mutex_enter(&grp->lg_stat_lock); switch (stat) { case MAC_STAT_IFSPEED: @@ -1977,7 +2140,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val) rval = aggr_grp_stat(grp, stat, val); } - mac_perim_exit(mph); + mutex_exit(&grp->lg_stat_lock); return (rval); } @@ -2167,17 +2330,15 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) return (!grp->lg_zcopy); case MAC_CAPAB_RINGS: { mac_capab_rings_t *cap_rings = cap_data; + uint_t ring_cnt = 0; + + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; if (cap_rings->mr_type == MAC_RING_TYPE_RX) { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; - - /* - * An aggregation advertises only one (pseudo) RX - * group, which virtualizes the main/primary group of - * the underlying devices. - */ - cap_rings->mr_gnum = 1; + cap_rings->mr_rnum = ring_cnt; + cap_rings->mr_gnum = grp->lg_rx_group_count; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; } else { @@ -2209,19 +2370,17 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) } /* - * Callback funtion for MAC layer to register groups. + * Callback function for MAC layer to register groups. */ static void aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { aggr_grp_t *grp = arg; - aggr_pseudo_rx_group_t *rx_group; - aggr_pseudo_tx_group_t *tx_group; - ASSERT(index == 0); if (rtype == MAC_RING_TYPE_RX) { - rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; + rx_group->arg_gh = gh; rx_group->arg_grp = grp; @@ -2231,8 +2390,18 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_addmac = aggr_addmac; infop->mgi_remmac = aggr_remmac; infop->mgi_count = rx_group->arg_ring_cnt; + + /* + * Always set the HW VLAN callbacks. They are smart + * enough to know when a port has HW VLAN filters to + * program and when it doesn't. 
+ */ + infop->mgi_addvlan = aggr_addvlan; + infop->mgi_remvlan = aggr_remvlan; } else { - tx_group = &grp->lg_tx_group; + aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; + + ASSERT3S(index, ==, 0); tx_group->atg_gh = gh; } } @@ -2248,13 +2417,13 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, switch (rtype) { case MAC_RING_TYPE_RX: { - aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group; aggr_pseudo_rx_ring_t *rx_ring; mac_intr_t aggr_mac_intr; - ASSERT(rg_index == 0); - - ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_group = &grp->lg_rx_groups[rg_index]; + ASSERT3S(index, >=, 0); + ASSERT3S(index, <, rx_group->arg_ring_cnt); rx_ring = rx_group->arg_rings + index; rx_ring->arr_rh = rh; @@ -2268,8 +2437,8 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, aggr_mac_intr.mi_ddi_handle = NULL; infop->mri_driver = (mac_ring_driver_t)rx_ring; - infop->mri_start = aggr_pseudo_start_ring; - infop->mri_stop = aggr_pseudo_stop_ring; + infop->mri_start = aggr_pseudo_start_rx_ring; + infop->mri_stop = aggr_pseudo_stop_rx_ring; infop->mri_intr = aggr_mac_intr; infop->mri_poll = aggr_rx_poll; @@ -2356,6 +2525,7 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(grp->lg_mh, &mph); @@ -2382,12 +2552,12 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) *pprev = addr; for (port = grp->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_addmac(port, mac_addr)) != 0) + if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) break; if (err != 0) { for (p = grp->lg_ports; p != port; p = p->lp_next) - aggr_port_remmac(p, mac_addr); + aggr_port_remmac(p, idx, mac_addr); *pprev = NULL; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2432,7 +2602,7 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } for (port = grp->lg_ports; port != NULL; port = port->lp_next) - aggr_port_remmac(port, mac_addr); + aggr_port_remmac(port, rx_group->arg_index, mac_addr); *pprev = addr->aua_next; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2442,6 +2612,188 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } /* + * Search for VID in the Rx group's list and return a pointer if + * found. Otherwise return NULL. + */ +static aggr_vlan_t * +aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) +{ + ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); + for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; + avp = list_next(&rx_group->arg_vlans, avp)) { + if (avp->av_vid == vid) + return (avp); + } + + return (NULL); +} + +/* + * Accept traffic on the specified VID. + * + * Persist VLAN state in the aggr so that ports added later will + * receive the correct filters. In the future it would be nice to + * allow aggr to iterate its clients instead of duplicating state. + */ +static int +aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + if (vid == MAC_VLAN_UNTAGGED) { + /* + * Aggr is both a MAC provider and MAC client. As a + * MAC provider it is passed MAC_VLAN_UNTAGGED by its + * client. As a client itself, it should pass + * VLAN_ID_NONE to its ports. 
+ */ + vid = VLAN_ID_NONE; + rx_group->arg_untagged++; + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp != NULL) { + avp->av_refs++; + mac_perim_exit(mph); + return (0); + } + + avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); + avp->av_vid = vid; + avp->av_refs = 1; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_addvlan(port, idx, vid)) != 0) + break; + + if (err != 0) { + /* + * If any of these calls fail then we are in a + * situation where the ports have different HW state. + * There's no reasonable action the MAC client can + * take in this scenario to rectify the situation. + */ + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u" + " from port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + + } + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged--; + + if (avp != NULL) { + kmem_free(avp, sizeof (aggr_vlan_t)); + avp = NULL; + } + } + + if (avp != NULL) + list_insert_tail(&rx_group->arg_vlans, avp); + +done: + mac_perim_exit(mph); + return (err); +} + +/* + * Stop accepting traffic on this VLAN if it's the last use of this VLAN. + */ +static int +aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + /* + * See the comment in aggr_addvlan(). + */ + if (vid == MAC_VLAN_UNTAGGED) { + vid = VLAN_ID_NONE; + rx_group->arg_untagged--; + + if (rx_group->arg_untagged > 0) + goto done; + + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp == NULL) { + err = ENOENT; + goto done; + } + + avp->av_refs--; + + if (avp->av_refs > 0) + goto done; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_remvlan(port, idx, vid)) != 0) + break; + + /* + * See the comment in aggr_addvlan() for justification of the + * use of VERIFY here. + */ + if (err != 0) { + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to add VLAN %u" + " to port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + } + + if (avp != NULL) + avp->av_refs++; + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged++; + + goto done; + } + + if (err == 0 && avp != NULL) { + VERIFY3U(avp->av_refs, ==, 0); + list_remove(&rx_group->arg_vlans, avp); + kmem_free(avp, sizeof (aggr_vlan_t)); + } + +done: + mac_perim_exit(mph); + return (err); +} + +/* * Add or remove the multicast addresses that are defined for the group * to or from the specified port. * diff --git a/usr/src/uts/common/io/aggr/aggr_lacp.c b/usr/src/uts/common/io/aggr/aggr_lacp.c index a0d566d12b..7dcd3bfb95 100644 --- a/usr/src/uts/common/io/aggr/aggr_lacp.c +++ b/usr/src/uts/common/io/aggr/aggr_lacp.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -606,7 +607,7 @@ lacp_xmit_sm(aggr_port_t *portp) ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. 
*/ - if (!pl->sm.lacp_on || !pl->NTT || !portp->lp_started) + if (!pl->sm.lacp_on || !pl->NTT) return; /* diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..c8dbe00336 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -69,10 +71,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - ASSERT(port->lp_mnh == NULL); - ASSERT(port->lp_mphp == NULL); - ASSERT(!port->lp_rx_grp_added && !port->lp_tx_grp_added); - ASSERT(port->lp_hwgh == NULL); + ASSERT3P(port->lp_mnh, ==, NULL); + ASSERT(!port->lp_tx_grp_added); + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) + ASSERT3P(port->lp_hwghs[i], ==, NULL); } void @@ -126,7 +128,6 @@ aggr_port_init_callbacks(aggr_port_t *port) aggr_grp_port_hold(port); } -/* ARGSUSED */ int aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) @@ -195,9 +196,9 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, } /* - * As the underlying mac's current margin size is used to determine + * As the underlying MAC's current margin size is used to determine * the margin size of the aggregation itself, request the underlying - * mac not to change to a smaller size. + * MAC not to change to a smaller size. */ if ((err = mac_margin_add(mh, &margin, B_TRUE)) != 0) { id_free(aggr_portids, portid); @@ -206,7 +207,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, if ((err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, &mah, 0, &diag)) != 0) { - VERIFY(mac_margin_remove(mh, margin) == 0); + VERIFY3S(mac_margin_remove(mh, margin), ==, 0); id_free(aggr_portids, portid); goto fail; } @@ -261,6 +262,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, fail: if (mch != NULL) mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); + mac_close(mh); return (err); } @@ -270,13 +272,11 @@ aggr_port_delete(aggr_port_t *port) { aggr_lacp_port_t *pl = &port->lp_lacp; - ASSERT(port->lp_mphp == NULL); ASSERT(!port->lp_promisc_on); - port->lp_closing = B_TRUE; + VERIFY0(mac_margin_remove(port->lp_mh, port->lp_margin)); + mac_client_clear_flow_cb(port->lp_mch); - VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_clear(port->lp_mch); /* * If the notification callback is already in process and waiting for * the aggr grp's mac perimeter, don't wait (otherwise there would be @@ -307,8 +307,10 @@ aggr_port_delete(aggr_port_t *port) * port's MAC_NOTE_UNICST notify callback function being called. */ (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + if (port->lp_mah != NULL) (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); @@ -373,10 +375,14 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port) /* link speed changes? 
*/ ifspeed = aggr_port_stat(port, MAC_STAT_IFSPEED); if (port->lp_ifspeed != ifspeed) { + mutex_enter(&grp->lg_stat_lock); + if (port->lp_state == AGGR_PORT_STATE_ATTACHED) do_detach |= (ifspeed != grp->lg_ifspeed); else do_attach |= (ifspeed == grp->lg_ifspeed); + + mutex_exit(&grp->lg_stat_lock); } port->lp_ifspeed = ifspeed; @@ -515,6 +521,10 @@ aggr_port_stop(aggr_port_t *port) port->lp_started = B_FALSE; } +/* + * Set the promisc mode of the port. If the port is already in the + * requested mode then do nothing. + */ int aggr_port_promisc(aggr_port_t *port, boolean_t on) { @@ -523,27 +533,14 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) - /* already in desired promiscous mode */ return (0); - if (on) { - mac_rx_clear(port->lp_mch); - rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, - MAC_PROMISC_FLAGS_NO_TX_LOOP); - if (rc != 0) { - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - return (rc); - } - } else { - mac_promisc_remove(port->lp_mphp); - port->lp_mphp = NULL; - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - } + rc = mac_set_promisc(port->lp_mh, on); - port->lp_promisc_on = on; + if (rc == 0) + port->lp_promisc_on = on; - return (0); + return (rc); } /* @@ -583,35 +580,45 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) } /* - * Add a non-primary unicast address to the underlying port. If the port - * supports HW Rx group, try to add the address into the HW Rx group of - * the port first. If that fails, or if the port does not support HW Rx - * group, enable the port's promiscous mode. + * Add a non-primary unicast address to the underlying port. If the + * port supports HW Rx groups, then try to add the address filter to + * the HW group first. If that fails, or if the port does not support + * RINGS capab, then enable the port's promiscous mode. */ int -aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_addmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * If the underlying port support HW Rx group, add the mac to its - * RX group directly. + * If the port doesn't have a HW group to back the aggr's + * pseudo group, then try using the port's default group and + * let the aggr SW classify its traffic. This scenario happens + * when mixing ports with a different number of HW groups. */ - if ((port->lp_hwgh != NULL) && - ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * If there is an underlying HW Rx group, then try adding this + * unicast address to it. + */ + if ((port->lp_hwghs[idx] != NULL) && + ((mac_hwgroup_addmac(port->lp_hwghs[idx], mac_addr)) == 0)) { mac_perim_exit(pmph); return (0); } /* - * If that fails, or if the port does not support HW Rx group, enable - * the port's promiscous mode. (Note that we turn on the promiscous - * mode only if the port is already started. + * If the port doesn't have HW groups, or we failed to add the + * HW filter, then enable the port's promiscuous mode. We + * enable promiscuous mode only if the port is already started. */ if (port->lp_started && ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { @@ -643,13 +650,14 @@ aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) * promiscous mode. 
*/ void -aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_remmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_grp_t *grp = port->lp_grp; aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* @@ -662,6 +670,7 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) break; pprev = &addr->aua_next; } + if (addr != NULL) { /* * This unicast address put the port into the promiscous mode, @@ -674,8 +683,65 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) if (port->lp_prom_addr == NULL && !grp->lg_promisc) (void) aggr_port_promisc(port, B_FALSE); } else { - ASSERT(port->lp_hwgh != NULL); - (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + ASSERT3P(port->lp_hwghs[idx], !=, NULL); + (void) mac_hwgroup_remmac(port->lp_hwghs[idx], mac_addr); } + mac_perim_exit(pmph); } + +int +aggr_port_addvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * Add the VLAN filter to the HW group if the port has a HW + * group. If the port doesn't have a HW group, then it will + * implicitly allow tagged traffic to pass and there is + * nothing to do. + */ + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_addvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} + +int +aggr_port_remvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_remvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..b6b3e6de1f 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,7 +57,7 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; - /* in promiscuous mode, send copy of packet up */ + /* In promiscuous mode, pass copy of packet up. */ if (grp->lg_promisc) { mblk_t *nmp = copymsg(mp); @@ -68,11 +70,11 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. 
*/ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; @@ -161,3 +163,10 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback); +} diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/io/bpf/bpf_filter.c deleted file mode 100644 index db5b224a5e..0000000000 --- a/usr/src/uts/common/io/bpf/bpf_filter.c +++ /dev/null @@ -1,576 +0,0 @@ -/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */ - -/* - * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from the Stanford/CMU enet packet filter, - * (net/enet.c) distributed as part of 4.3BSD, and code contributed - * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence - * Berkeley Laboratory. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/param.h> -#include <sys/time.h> -#include <sys/stream.h> -#include <sys/byteorder.h> -#include <sys/sdt.h> - -#define EXTRACT_SHORT(p) BE_IN16(p) -#define EXTRACT_LONG(p) BE_IN32(p) - -#ifdef _KERNEL -#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) -#define mtod(_a, _t) ((_t)((_a)->b_rptr)) -#define MINDEX(len, m, k) \ -{ \ - len = M_LEN(m); \ - while (k >= len) { \ - k -= len; \ - m = m->b_cont; \ - if (m == 0) \ - return (0); \ - len = M_LEN(m); \ - } \ -} - -static int m_xword(mblk_t *, uint32_t, int *); -static int m_xhalf(mblk_t *, uint32_t, int *); - -static int -m_xword(mblk_t *m, uint32_t k, int *err) -{ - int len; - uchar_t *cp, *np; - mblk_t *m0; - - *err = 1; - MINDEX(len, m, k); - cp = mtod(m, uchar_t *) + k; - if (len >= k + 4) { - *err = 0; - return (EXTRACT_LONG(cp)); - } - m0 = m->b_cont; - if (m0 == 0 || M_LEN(m0) + len - k < 4) { - DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k); - return (0); - } - *err = 0; - np = mtod(m0, uchar_t *); - switch (len - k) { - - case 1: - return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]); - - case 2: - return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]); - - default: - return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]); - } -} - -static int -m_xhalf(mblk_t *m, uint32_t k, int *err) -{ - int len; - uchar_t *cp; - mblk_t *m0; - - *err = 1; - MINDEX(len, m, k); - cp = mtod(m, uchar_t *) + k; - if (len >= k + 2) { - *err = 0; - return (EXTRACT_SHORT(cp)); - } - m0 = m->b_cont; - if (m0 == 0) { - DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k); - return (0); - } - *err = 0; - return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); -} -#else /* _KERNEL */ -#include <stdlib.h> -#endif /* !_KERNEL */ - -#include <net/bpf.h> - -/* - * Execute the filter program starting at pc on the packet p - * wirelen is the length of the original packet - * buflen is the amount of data present - * When buflen is non-0, p is a pointer to a the start of the packet and the - * packet is only in one mblk_t. - * When buflen is 0, p is an mblk_t pointer. - */ -uint_t -bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) -{ - uint32_t A, X, k; - uint32_t mem[BPF_MEMWORDS]; - - if (pc == 0) - /* - * No filter means accept all. 
- */ - return ((uint_t)-1); - A = 0; - X = 0; - --pc; - /* CONSTCOND */ - while (1) { - ++pc; - switch (pc->code) { - - default: -#ifdef _KERNEL - DTRACE_PROBE1(bpf_insn_unknown, - struct bpf_insn *, pc); - return (0); -#else - abort(); -#endif - case BPF_RET|BPF_K: - return ((uint_t)pc->k); - - case BPF_RET|BPF_A: - return ((uint_t)A); - - case BPF_LD|BPF_W|BPF_ABS: - k = pc->k; - if (k + sizeof (int32_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xword((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_LONG(&p[k]); - continue; - - case BPF_LD|BPF_H|BPF_ABS: - k = pc->k; - if (k + sizeof (int16_t) > buflen) { -#ifdef _KERNEL - int merr; - - if (buflen != 0) - return (0); - A = m_xhalf((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_SHORT(&p[k]); - continue; - - case BPF_LD|BPF_B|BPF_ABS: - k = pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - A = mtod(m, uchar_t *)[k]; - continue; -#else - return (0); -#endif - } - A = p[k]; - continue; - - case BPF_LD|BPF_W|BPF_LEN: - A = wirelen; - continue; - - case BPF_LDX|BPF_W|BPF_LEN: - X = wirelen; - continue; - - case BPF_LD|BPF_W|BPF_IND: - k = X + pc->k; - if (k + sizeof (int32_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xword((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_LONG(&p[k]); - continue; - - case BPF_LD|BPF_H|BPF_IND: - k = X + pc->k; - if (k + sizeof (int16_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xhalf((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_SHORT(&p[k]); - continue; - - case BPF_LD|BPF_B|BPF_IND: - k = X + pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - A = mtod(m, uchar_t *)[k]; - continue; -#else - return (0); -#endif - } - A = p[k]; - continue; - - case BPF_LDX|BPF_MSH|BPF_B: - k = pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - X = (mtod(m, char *)[k] & 0xf) << 2; - continue; -#else - return (0); -#endif - } - X = (p[pc->k] & 0xf) << 2; - continue; - - case BPF_LD|BPF_IMM: - A = pc->k; - continue; - - case BPF_LDX|BPF_IMM: - X = pc->k; - continue; - - case BPF_LD|BPF_MEM: - A = mem[pc->k]; - continue; - - case BPF_LDX|BPF_MEM: - X = mem[pc->k]; - continue; - - case BPF_ST: - mem[pc->k] = A; - continue; - - case BPF_STX: - mem[pc->k] = X; - continue; - - case BPF_JMP|BPF_JA: - pc += pc->k; - continue; - - case BPF_JMP|BPF_JGT|BPF_K: - pc += (A > pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGE|BPF_K: - pc += (A >= pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JEQ|BPF_K: - pc += (A == pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JSET|BPF_K: - pc += (A & pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGT|BPF_X: - pc += (A > X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGE|BPF_X: - pc += (A >= X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JEQ|BPF_X: - pc += (A == X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JSET|BPF_X: - pc += (A & X) ? 
pc->jt : pc->jf; - continue; - - case BPF_ALU|BPF_ADD|BPF_X: - A += X; - continue; - - case BPF_ALU|BPF_SUB|BPF_X: - A -= X; - continue; - - case BPF_ALU|BPF_MUL|BPF_X: - A *= X; - continue; - - case BPF_ALU|BPF_DIV|BPF_X: - if (X == 0) - return (0); - A /= X; - continue; - - case BPF_ALU|BPF_AND|BPF_X: - A &= X; - continue; - - case BPF_ALU|BPF_OR|BPF_X: - A |= X; - continue; - - case BPF_ALU|BPF_LSH|BPF_X: - A <<= X; - continue; - - case BPF_ALU|BPF_RSH|BPF_X: - A >>= X; - continue; - - case BPF_ALU|BPF_ADD|BPF_K: - A += pc->k; - continue; - - case BPF_ALU|BPF_SUB|BPF_K: - A -= pc->k; - continue; - - case BPF_ALU|BPF_MUL|BPF_K: - A *= pc->k; - continue; - - case BPF_ALU|BPF_DIV|BPF_K: - A /= pc->k; - continue; - - case BPF_ALU|BPF_AND|BPF_K: - A &= pc->k; - continue; - - case BPF_ALU|BPF_OR|BPF_K: - A |= pc->k; - continue; - - case BPF_ALU|BPF_LSH|BPF_K: - A <<= pc->k; - continue; - - case BPF_ALU|BPF_RSH|BPF_K: - A >>= pc->k; - continue; - - case BPF_ALU|BPF_NEG: - A = -A; - continue; - - case BPF_MISC|BPF_TAX: - X = A; - continue; - - case BPF_MISC|BPF_TXA: - A = X; - continue; - } - } - /* NOTREACHED */ -} - -#ifdef _KERNEL -/* - * Return true if the 'fcode' is a valid filter program. - * The constraints are that each jump be forward and to a valid - * code, that memory accesses are within valid ranges (to the - * extent that this can be checked statically; loads of packet - * data have to be, and are, also checked at run time), and that - * the code terminates with either an accept or reject. - * - * The kernel needs to be able to verify an application's filter code. - * Otherwise, a bogus program could easily crash the system. - */ -int -bpf_validate(struct bpf_insn *f, int len) -{ - uint_t i, from; - struct bpf_insn *p; - - if (len < 1 || len > BPF_MAXINSNS) - return (0); - - for (i = 0; i < len; ++i) { - p = &f[i]; - DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p); - switch (BPF_CLASS(p->code)) { - /* - * Check that memory operations use valid addresses. - */ - case BPF_LD: - case BPF_LDX: - switch (BPF_MODE(p->code)) { - case BPF_MEM: - if (p->k >= BPF_MEMWORDS) - return (0); - break; - case BPF_ABS: - case BPF_IND: - case BPF_MSH: - case BPF_IMM: - case BPF_LEN: - break; - default: - return (0); - } - break; - case BPF_ST: - case BPF_STX: - if (p->k >= BPF_MEMWORDS) - return (0); - break; - case BPF_ALU: - switch (BPF_OP(p->code)) { - case BPF_ADD: - case BPF_SUB: - case BPF_MUL: - case BPF_OR: - case BPF_AND: - case BPF_LSH: - case BPF_RSH: - case BPF_NEG: - break; - case BPF_DIV: - /* - * Check for constant division by 0. - */ - if (BPF_RVAL(p->code) == BPF_K && p->k == 0) - return (0); - break; - default: - return (0); - } - break; - case BPF_JMP: - /* - * Check that jumps are within the code block, - * and that unconditional branches don't go - * backwards as a result of an overflow. - * Unconditional branches have a 32-bit offset, - * so they could overflow; we check to make - * sure they don't. Conditional branches have - * an 8-bit offset, and the from address is <= - * BPF_MAXINSNS, and we assume that BPF_MAXINSNS - * is sufficiently small that adding 255 to it - * won't overflow. - * - * We know that len is <= BPF_MAXINSNS, and we - * assume that BPF_MAXINSNS is < the maximum size - * of a uint_t, so that i + 1 doesn't overflow. 
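To make the bpf_validate()/bpf_filter() interface concrete -- the same interface the new bpf_wrap.c shim below continues to expose on top of the ip module -- here is a small usage sketch. It is illustrative only: the program and the wrapper function are examples, not code from this changeset.

#include <sys/types.h>
#include <net/bpf.h>

/*
 * Sketch only: a minimal classic-BPF program that accepts IPv4
 * Ethernet frames (EtherType 0x0800 at byte offset 12) and rejects
 * everything else. Instruction fields are { code, jt, jf, k }.
 */
static struct bpf_insn ipv4_only[] = {
        { BPF_LD|BPF_H|BPF_ABS,  0, 0, 12 },            /* A = EtherType */
        { BPF_JMP|BPF_JEQ|BPF_K, 0, 1, 0x0800 },        /* IPv4? else skip 1 */
        { BPF_RET|BPF_K,         0, 0, 0xffffffffU },   /* accept whole packet */
        { BPF_RET|BPF_K,         0, 0, 0 }              /* reject */
};

/*
 * pkt/wirelen/buflen describe a contiguous packet buffer, as in the
 * interpreter above. Zero means "drop"; any other value is the number
 * of bytes to accept.
 */
static uint_t
ipv4_only_match(uchar_t *pkt, uint_t wirelen, uint_t buflen)
{
        if (bpf_validate(ipv4_only, 4) == 0)
                return (0);
        return (bpf_filter(ipv4_only, pkt, wirelen, buflen));
}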
- */ - from = i + 1; - switch (BPF_OP(p->code)) { - case BPF_JA: - if (from + p->k < from || from + p->k >= len) - return (0); - break; - case BPF_JEQ: - case BPF_JGT: - case BPF_JGE: - case BPF_JSET: - if (from + p->jt >= len || from + p->jf >= len) - return (0); - break; - default: - return (0); - } - break; - case BPF_RET: - break; - case BPF_MISC: - break; - default: - return (0); - } - } - - return (BPF_CLASS(f[len - 1].code) == BPF_RET); -} -#endif diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c new file mode 100644 index 0000000000..6cbde58a20 --- /dev/null +++ b/usr/src/uts/common/io/bpf/bpf_wrap.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <net/bpf.h> +#include <inet/bpf.h> + +/* + * With BPF filter validation and evaluation moved into the 'ip' module, these + * wrapper functions are provided to expose the original interface. + */ + +uint_t +bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen)); +} + +int +bpf_validate(struct bpf_insn *f, int len) +{ + return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len)); +} diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c index 28f16175ee..dc79139ce6 100644 --- a/usr/src/uts/common/io/bridge.c +++ b/usr/src/uts/common/io/bridge.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -41,6 +42,7 @@ #include <sys/modctl.h> #include <sys/note.h> #include <sys/param.h> +#include <sys/pattr.h> #include <sys/policy.h> #include <sys/sdt.h> #include <sys/stat.h> @@ -1693,7 +1695,8 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick, * The passed-in tci is the "impossible" value 0xFFFF when no tag is present. */ static mblk_t * -reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) +reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid, + boolean_t keep_flags) { boolean_t source_has_tag = (tci != 0xFFFF); mblk_t *mpcopy; @@ -1705,8 +1708,13 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) if (mp == NULL) return (mp); - /* No forwarded packet can have hardware checksum enabled */ - DB_CKSUMFLAGS(mp) = 0; + /* + * A forwarded packet cannot have HW offloads enabled unless + * the destination is known to be local to the host and HW + * offloads haven't been emulated. 
+ */ + if (!keep_flags) + DB_CKSUMFLAGS(mp) = 0; /* Get the no-modification cases out of the way first */ if (!source_has_tag && vlanid == pvid) /* 1a */ @@ -1907,17 +1915,46 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, blp->bl_trillthreads++; mutex_exit(&blp->bl_trilllock); update_header(mp, hdr_info, B_FALSE); - if (is_xmit) - mp = mac_fix_cksum(mp); - /* all trill data frames have Inner.VLAN */ - mp = reform_vlan_header(mp, vlanid, tci, 0); - if (mp == NULL) { - KIINCR(bki_drops); - fwd_unref(bfp); - return (NULL); + + if (is_xmit) { + mac_hw_emul(&mp, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp == NULL) { + KIINCR(bki_drops); + goto done; + } } - trill_encap_fn(tdp, blp, hdr_info, mp, - bfp->bf_trill_nick); + + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + + /* + * All trill data frames have + * Inner.VLAN. + */ + mp = reform_vlan_header(mp, vlanid, tci, + 0, B_FALSE); + + if (mp == NULL) { + /* + * Make sure to free + * any remaining + * segments. + */ + freemsgchain(next); + KIINCR(bki_drops); + goto done; + } + + trill_encap_fn(tdp, blp, hdr_info, mp, + bfp->bf_trill_nick); + mp = next; + } + +done: mutex_enter(&blp->bl_trilllock); if (--blp->bl_trillthreads == 0 && blp->bl_trilldata == NULL) @@ -1959,31 +1996,68 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * If the destination is not local to the host + * then we need to emulate HW offloads because + * we can't guarantee the forwarding + * destination provides them. + */ + if (!from_trill && is_xmit && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while ((mpsend != NULL) && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + KIINCR(bki_forwards); + KLPINCR(blpsend, bkl_xmit); + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - KIINCR(bki_forwards); /* * No need to bump up the link reference count, as * the forwarding entry itself holds a reference to * the link. */ if (bfp->bf_flags & BFF_LOCALADDR) { + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_TRUE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + + KIINCR(bki_forwards); mac_rx_common(blpsend->bl_mh, NULL, mpsend); - } else { - KLPINCR(blpsend, bkl_xmit); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, - mpsend); - freemsg(mpsend); } } + /* * Handle a special case: if we're transmitting to the original * link, then check whether the localaddr flag is set. 
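The forwarding paths above repeat one pattern: after mac_hw_emul() may have split an LSO message into a chain linked through b_next, each segment is detached, re-tagged, and transmitted on its own, and a failure mid-chain must dispose of the remaining segments. A hedged, stripped-down sketch of that walk follows; per_segment() is an invented stand-in for reform_vlan_header() plus the transmit call and is assumed to consume its message in all cases.

#include <sys/stream.h>
#include <sys/strsun.h>

/* Stand-in for reform_vlan_header() plus the transmit; consumes mp. */
static int per_segment(mblk_t *mp);

/*
 * Sketch only: walk a b_next chain produced by mac_hw_emul(), handling
 * one detached segment at a time. On failure the rest of the chain is
 * freed, as the TRILL path above does.
 */
static void
forward_chain(mblk_t *mp)
{
        while (mp != NULL) {
                mblk_t *next = mp->b_next;

                mp->b_next = NULL;              /* detach this segment */
                if (per_segment(mp) != 0) {
                        freemsgchain(next);     /* drop what remains */
                        return;
                }
                mp = next;
        }
}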
If it @@ -2019,7 +2093,7 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, * Inner.VLAN */ mpsend = reform_vlan_header(mpsend, - vlanid, tci, 0); + vlanid, tci, 0, B_FALSE); if (mpsend == NULL) { KIINCR(bki_drops); } else { @@ -2070,25 +2144,57 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * In this case, send to all links connected + * to the bridge. Some of these destinations + * may not provide HW offload -- so just + * emulate it here. + */ + if (!from_trill && is_xmit) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while (mpsend != NULL) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + if (hdr_info->mhi_dsttype == + MAC_ADDRTYPE_UNICAST) + KIINCR(bki_unknown); + else + KIINCR(bki_mbcast); + + KLPINCR(blpsend, bkl_xmit); + if ((mpcopy = copymsg(mpsend)) != NULL) { + mac_rx_common(blpsend->bl_mh, NULL, + mpcopy); + } + + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST) - KIINCR(bki_unknown); - else - KIINCR(bki_mbcast); - KLPINCR(blpsend, bkl_xmit); - if ((mpcopy = copymsg(mpsend)) != NULL) - mac_rx_common(blpsend->bl_mh, NULL, mpcopy); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend); - freemsg(mpsend); link_unref(blpsend); } } diff --git a/usr/src/uts/common/io/bufmod.c b/usr/src/uts/common/io/bufmod.c index f553c58aff..c1dd697a78 100644 --- a/usr/src/uts/common/io/bufmod.c +++ b/usr/src/uts/common/io/bufmod.c @@ -22,10 +22,9 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * STREAMS Buffering module * @@ -1016,15 +1015,15 @@ sbaddmsg(queue_t *rq, mblk_t *mp) pad = Align(hp.sbh_totlen); hp.sbh_totlen += sizeof (hp); - hp.sbh_totlen += pad; + + /* We can't fit this message on the current chunk. */ + if ((sbp->sb_mlen + hp.sbh_totlen) > sbp->sb_chunk) + sbclosechunk(sbp); /* - * Would the inclusion of this message overflow the current - * chunk? If so close the chunk off and start a new one. + * If we closed it (just now or during a previous + * call) then allocate the head of a new chunk. */ - if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk) - sbclosechunk(sbp); - if (sbp->sb_head == NULL) { /* Allocate leading header of new chunk */ sbp->sb_head = allocb(sizeof (hp), BPRI_MED); @@ -1044,36 +1043,41 @@ sbaddmsg(queue_t *rq, mblk_t *mp) } /* - * Copy header into message + * Set the header values and join the message to the + * chunk. The header values are copied into the chunk + * after we adjust for padding below. 
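The reordered sbaddmsg() accounting (started above and continued just below) reduces to two comparisons against the chunk budget: one before the message is appended and one afterwards that decides whether alignment padding is still worthwhile. A hedged restatement with invented names; 'pad' is whatever alignment padding the module computes for the header and 'smallest' is the SMALLEST_MESSAGE lower bound.

#include <sys/types.h>

/* Close the open chunk first if this message cannot fit into it. */
static boolean_t
must_close_before(size_t mlen, size_t totlen, size_t chunk)
{
        return (mlen + totlen > chunk);
}

/*
 * After appending, close immediately (and skip the padding) when not
 * even the smallest possible follow-on message would still fit.
 */
static boolean_t
must_close_after(size_t mlen, size_t totlen, size_t pad, size_t chunk,
    size_t smallest)
{
        return (mlen + totlen + pad + smallest > chunk);
}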
*/ hp.sbh_drops = sbp->sb_drops; hp.sbh_origlen = origlen; - (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp)); - sbp->sb_head->b_wptr += sizeof (hp); - - ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim); - - /* - * Join message to the chunk - */ linkb(sbp->sb_head, mp); - sbp->sb_mcount++; sbp->sb_mlen += hp.sbh_totlen; /* - * If the first message alone is too big for the chunk close - * the chunk now. - * If the next message would immediately cause the chunk to - * overflow we may as well close the chunk now. The next - * message is certain to be at least SMALLEST_MESSAGE size. + * There's no chance to fit another message on the + * chunk -- forgo the padding and close the chunk. */ - if (hp.sbh_totlen + SMALLEST_MESSAGE > sbp->sb_chunk) { + if ((sbp->sb_mlen + pad + SMALLEST_MESSAGE) > sbp->sb_chunk) { + (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, + sizeof (hp)); + sbp->sb_head->b_wptr += sizeof (hp); + ASSERT(sbp->sb_head->b_wptr <= + sbp->sb_head->b_datap->db_lim); sbclosechunk(sbp); return; } /* + * We may add another message to this chunk -- adjust + * the headers for padding to be added below. + */ + hp.sbh_totlen += pad; + (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp)); + sbp->sb_head->b_wptr += sizeof (hp); + ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim); + sbp->sb_mlen += pad; + + /* * Find space for the wrapper. The wrapper consists of: * * 1) Padding for this message (this is to ensure each header @@ -1086,7 +1090,6 @@ sbaddmsg(queue_t *rq, mblk_t *mp) * of the message, but only if we 'own' the data. If the dblk * has been shared through dupmsg() we mustn't alter it. */ - wrapperlen = (sizeof (hp) + pad); /* Is there space for the wrapper beyond the message's data ? */ diff --git a/usr/src/uts/common/io/chxge/ch.c b/usr/src/uts/common/io/chxge/ch.c index 9f1f7f87de..80400061af 100644 --- a/usr/src/uts/common/io/chxge/ch.c +++ b/usr/src/uts/common/io/chxge/ch.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -59,6 +60,7 @@ #include <sys/sunddi.h> #include <sys/dlpi.h> #include <sys/ethernet.h> +#include <sys/mac_provider.h> #include <sys/strsun.h> #include <sys/strsubr.h> #include <inet/common.h> @@ -1377,8 +1379,7 @@ ch_send_up(ch_t *chp, mblk_t *mp, uint32_t cksum, int flg) * set in /etc/system (see sge.c). */ if (flg) - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, cksum, - HCK_FULLCKSUM, 0); + mac_hcksum_set(mp, 0, 0, 0, cksum, HCK_FULLCKSUM); gld_recv(chp->ch_macp, mp); } else { freemsg(mp); @@ -1693,8 +1694,7 @@ ch_send(gld_mac_info_t *macinfo, mblk_t *mp) msg_flg = 0; if (chp->ch_config.cksum_enabled) { if (is_T2(chp)) { - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, - NULL, &msg_flg); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &msg_flg); flg = (msg_flg & HCK_FULLCKSUM)? CH_NO_CPL: CH_NO_HWCKSUM|CH_NO_CPL; } else diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c index 6ef1b0b9f7..495ae93cf9 100644 --- a/usr/src/uts/common/io/cons.c +++ b/usr/src/uts/common/io/cons.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ /* @@ -53,6 +54,7 @@ #include <sys/vnode.h> #include <sys/uio.h> #include <sys/stat.h> +#include <sys/limits.h> #include <sys/console.h> #include <sys/consdev.h> @@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred) */ if (vsconsvp != NULL && vsconsvp->v_stream != NULL) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; + + if (uio->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uio->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } /* * strwrite modifies uio so need to make copy. */ - (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt); (void) strwrite(vsconsvp, &uiod.d_uio, cred); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); } if (rconsvp->v_stream != NULL) diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c index 622f0dcf68..f67d77b3d2 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3.c @@ -41,7 +41,7 @@ extern cpqary3_driver_info_t gdriver_info; * Global Variables Definitions */ -static char cpqary3_brief[] = "HP Smart Array Driver"; +static char cpqary3_brief[] = "HP Smart Array (Legacy)"; void *cpqary3_state; /* HPQaculi Changes */ diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 4ef14a99de..e11ab34142 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! */ static int -dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, - pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds, + int *fdcntp) { - int start, ostart, end; - int fdcnt, fd; - boolean_t done; - file_t *fp; - short revent; - boolean_t no_wrap; - pollhead_t *php; - polldat_t *pdp; + int start, ostart, end, fdcnt, error = 0; + boolean_t done, no_wrap; pollfd_t *pfdp; epoll_event_t *epoll; - int error = 0; - short mask = POLLRDHUP | POLLWRBAND; - boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; + const short mask = POLLRDHUP | POLLWRBAND; + const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { - /* - * No Need to search because no poll fd - * has been cached. - */ - return (error); + /* No Need to search because no poll fd has been cached. */ + return (0); } if (is_epoll) { @@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; - php = NULL; if (start == 0) { /* @@ -294,8 +283,11 @@ retry: done = B_FALSE; fdcnt = 0; while ((fdcnt < nfds) && !done) { - php = NULL; - revent = 0; + pollhead_t *php = NULL; + short revent = 0; + uf_entry_gen_t gen; + int fd; + /* * Examine the bit map in a circular fashion * to avoid starvation. 
Always resume from @@ -305,6 +297,9 @@ retry: fd = bt_getlowbit(pcp->pc_bitmap, start, end); ASSERT(fd <= end); if (fd >= 0) { + file_t *fp; + polldat_t *pdp; + if (fd == end) { if (no_wrap) { done = B_TRUE; @@ -328,28 +323,14 @@ repoll: */ continue; } - if ((fp = getf(fd)) == NULL) { - /* - * The fd has been closed, but user has not - * done a POLLREMOVE on this fd yet. Instead - * of cleaning it here implicitly, we return - * POLLNVAL. This is consistent with poll(2) - * polling a closed fd. Hope this will remind - * user to do a POLLREMOVE. - */ - if (!is_epoll && pfdp != NULL) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; - continue; - } - - /* - * In the epoll compatibility case, we actually - * perform the implicit removal to remain - * closer to the epoll semantics. - */ + if ((fp = getf_gen(fd, &gen)) == NULL) { if (is_epoll) { + /* + * In the epoll compatibility case, we + * actually perform the implicit + * removal to remain closer to the + * epoll semantics. + */ pdp->pd_fp = NULL; pdp->pd_events = 0; @@ -360,30 +341,36 @@ repoll: } BT_CLEAR(pcp->pc_bitmap, fd); - continue; + } else if (pfdp != NULL) { + /* + * The fd has been closed, but user has + * not done a POLLREMOVE on this fd + * yet. Instead of cleaning it here + * implicitly, we return POLLNVAL. This + * is consistent with poll(2) polling a + * closed fd. Hope this will remind + * user to do a POLLREMOVE. + */ + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; } + continue; } - if (fp != pdp->pd_fp) { + /* + * Detect a change to the resource underlying a cached + * file descriptor. While the fd generation comparison + * will catch nearly all cases, the file_t comparison + * is maintained as a failsafe as well. + */ + if (gen != pdp->pd_gen || fp != pdp->pd_fp) { /* * The user is polling on a cached fd which was * closed and then reused. Unfortunately there * is no good way to communicate this fact to * the consumer. * - * If the file struct is also reused, we may - * not be able to detect the fd reuse at all. - * As long as this does not cause system - * failure and/or memory leaks, we will play - * along. The man page states that if the user - * does not clean up closed fds, polling - * results will be indeterministic. - * - * XXX: perhaps log the detection of fd reuse? - */ - pdp->pd_fp = fp; - - /* * When this situation has been detected, it's * likely that any existing pollhead is * ill-suited to perform proper wake-ups. @@ -396,7 +383,42 @@ repoll: pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } + + /* + * Since epoll is expected to act on the + * underlying 'struct file' (in Linux terms, + * our vnode_t would be a closer analog) rather + * than the fd itself, an implicit remove + * is necessary under these circumstances to + * suppress any results (or errors) from the + * new resource occupying the fd. + */ + if (is_epoll) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + BT_CLEAR(pcp->pc_bitmap, fd); + releasef(fd); + continue; + } else { + /* + * Regular /dev/poll is unbothered + * about the fd reassignment. + */ + pdp->pd_fp = fp; + pdp->pd_gen = gen; + } } + + /* + * Skip entries marked with the sentinal value for + * having already fired under oneshot conditions. + */ + if (pdp->pd_events == POLLONESHOT) { + releasef(fd); + BT_CLEAR(pcp->pc_bitmap, fd); + continue; + } + /* * XXX - pollrelock() logic needs to know which * which pollcache lock to grab. It'd be a @@ -537,18 +559,19 @@ repoll: /* Handle special polling modes. 
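dp_pcache_poll() above is the engine behind the DP_POLL ioctl. For context, this is roughly how a plain (non-epoll) consumer drives it through the documented /dev/poll interface: descriptors are registered by write()ing pollfd entries and harvested with DP_POLL. A sketch with error handling trimmed.

#include <sys/types.h>
#include <sys/devpoll.h>
#include <fcntl.h>
#include <poll.h>
#include <stropts.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct pollfd pfd;
        struct pollfd ready[4];
        struct dvpoll dvp;
        int dpfd = open("/dev/poll", O_RDWR);

        /* Register stdin for readability. */
        pfd.fd = 0;
        pfd.events = POLLIN;
        pfd.revents = 0;
        (void) write(dpfd, &pfd, sizeof (pfd));

        /* Harvest up to 4 ready descriptors, waiting at most 5 seconds. */
        dvp.dp_fds = ready;
        dvp.dp_nfds = 4;
        dvp.dp_timeout = 5000;
        (void) printf("ready: %d\n", ioctl(dpfd, DP_POLL, &dvp));

        (void) close(dpfd);
        return (0);
}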
*/ if (pdp->pd_events & POLLONESHOT) { /* - * If POLLONESHOT is set, perform the - * implicit POLLREMOVE. + * Entries operating under POLLONESHOT + * will be marked with a sentinel value + * to indicate that they have "fired" + * when emitting an event. This will + * disable them from polling until a + * later add/modify event rearms them. */ - pdp->pd_fp = NULL; - pdp->pd_events = 0; - + pdp->pd_events = POLLONESHOT; if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } - BT_CLEAR(pcp->pc_bitmap, fd); } else if (pdp->pd_events & POLLET) { /* @@ -700,14 +723,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pollfd_t *pollfdp, *pfdp; dvpoll_epollfd_t *epfdp; uintptr_t limit; - int error, size; - ssize_t uiosize; - size_t copysize; + int error; + uint_t size; + size_t copysize, uiosize; nfds_t pollfdnum; - struct pollhead *php = NULL; - polldat_t *pdp; - int fd; - file_t *fp; boolean_t is_epoll, fds_added = B_FALSE; minor = getminor(dev); @@ -732,10 +751,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pcp->pc_pid = curproc->p_pid; } - uiosize = uiop->uio_resid; + if (uiop->uio_resid < 0) { + /* No one else is this careful, but maybe they should be. */ + return (EINVAL); + } + + uiosize = (size_t)uiop->uio_resid; pollfdnum = uiosize / size; /* + * For epoll-enabled handles, restrict the allowed write size to 2. + * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD + * operation which is expanded into two operations (DEL and ADD). + * + * All other operations performed through epoll_ctl(3C) will consist of + * a single entry. + */ + if (is_epoll && pollfdnum > 2) { + return (EINVAL); + } + + /* * We want to make sure that pollfdnum isn't large enough to DoS us, * but we also don't want to grab p_lock unnecessarily -- so we * perform the full check against our resource limits if and only if @@ -794,6 +830,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) { ASSERT(dpep->dpe_refcnt != 0); + /* + * The epoll API does not allow EINTR as a result when making + * modifications to the set of polled fds. Given that write + * activity is relatively quick and the size of accepted writes + * is limited above to two entries, a signal-ignorant wait is + * used here to avoid the EINTR. + */ + if (is_epoll) { + cv_wait(&dpep->dpe_cv, &dpep->dpe_lock); + continue; + } + + /* + * Non-epoll writers to /dev/poll handles can tolerate EINTR. + */ if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); @@ -828,7 +879,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } for (pfdp = pollfdp; (uintptr_t)pfdp < limit; pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { - fd = pfdp->fd; + int fd = pfdp->fd; + polldat_t *pdp; + if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { /* * epoll semantics demand that we return EBADF if our @@ -844,76 +897,60 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { + uf_entry_gen_t gen; + file_t *fp = NULL; + struct pollhead *php = NULL; - fp = NULL; - - if (pdp == NULL) { - /* - * If we're in epoll compatibility mode, check - * that the fd is valid before allocating - * anything for it; epoll semantics demand that - * we return EBADF if our specified fd is - * invalid. 
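The POLLONESHOT sentinel above is what gives epoll's EPOLLONESHOT its "fire once, then stay disarmed until rearmed" behaviour, and the two-entry write limit corresponds to EPOLL_CTL_MOD being expanded into a DEL plus an ADD. From a consumer's point of view the contract looks like the following hedged userland sketch using the epoll API.

#include <sys/epoll.h>

/*
 * Sketch: register fd as oneshot, consume one event, then rearm it.
 * In the kernel, the first wakeup marks the cached entry with the
 * POLLONESHOT sentinel; the EPOLL_CTL_MOD below clears it again.
 */
static int
oneshot_once(int epfd, int fd)
{
        struct epoll_event ev;
        struct epoll_event out;

        ev.events = EPOLLIN | EPOLLONESHOT;
        ev.data.fd = fd;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) != 0)
                return (-1);

        if (epoll_wait(epfd, &out, 1, -1) != 1)
                return (-1);

        /* fd is now disarmed; a MOD (DEL + ADD internally) rearms it. */
        return (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev));
}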
- */ - if (is_epoll) { - if ((fp = getf(fd)) == NULL) { - error = EBADF; - break; - } + /* + * If we're in epoll compatibility mode, check that the + * fd is valid before allocating anything for it; epoll + * semantics demand that we return EBADF if our + * specified fd is invalid. + */ + if (is_epoll) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + error = EBADF; + break; } - + } + if (pdp == NULL) { pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); - } else { + } + + if (is_epoll) { /* - * epoll semantics demand that we error out if - * a file descriptor is added twice, which we - * check (imperfectly) by checking if we both - * have the file descriptor cached and the - * file pointer that correponds to the file - * descriptor matches our cached value. If - * there is a pointer mismatch, the file - * descriptor was closed without being removed. - * The converse is clearly not true, however, - * so to narrow the window by which a spurious - * EEXIST may be returned, we also check if - * this fp has been added to an epoll control - * descriptor in the past; if it hasn't, we - * know that this is due to fp reuse -- it's - * not a true EEXIST case. (By performing this - * additional check, we limit the window of - * spurious EEXIST to situations where a single - * file descriptor is being used across two or - * more epoll control descriptors -- and even - * then, the file descriptor must be closed and - * reused in a relatively tight time span.) + * If the fd is already a member of the epoll + * set, error emission is needed only when the + * fd assignment generation matches the one + * recorded in the polldat_t. Absence of such + * a generation match indicates that a new + * resource has been assigned at that fd. + * + * Caveat: It is possible to force a generation + * update while keeping the same backing + * resource. This is possible via dup2, but + * does not represent real-world use cases, + * making the lack of error acceptable. */ - if (is_epoll) { - if (pdp->pd_fp != NULL && - (fp = getf(fd)) != NULL && - fp == pdp->pd_fp && - (fp->f_flag2 & FEPOLLED)) { - error = EEXIST; - releasef(fd); - break; - } - - /* - * We have decided that the cached - * information was stale: it either - * didn't match, or the fp had never - * actually been epoll()'d on before. - * We need to now clear our pd_events - * to assure that we don't mistakenly - * operate on cached event disposition. - */ - pdp->pd_events = 0; + if (pdp->pd_fp != NULL && pdp->pd_gen == gen) { + error = EEXIST; + releasef(fd); + break; } - } - if (is_epoll) { + /* + * We have decided that the cached information + * was stale. Reset pd_events to assure that + * we don't mistakenly operate on cached event + * disposition. This configures the implicit + * subscription to HUP and ERR events which + * epoll features. + */ + pdp->pd_events = POLLERR|POLLHUP; + epfdp = (dvpoll_epollfd_t *)pfdp; pdp->pd_epolldata = epfdp->dpep_data; } @@ -928,39 +965,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } - if (fp == NULL && (fp = getf(fd)) == NULL) { - /* - * The fd is not valid. Since we can't pass - * this error back in the write() call, set - * the bit in bitmap to force DP_POLL ioctl - * to examine it. - */ - BT_SET(pcp->pc_bitmap, fd); - pdp->pd_events |= pfdp->events; - continue; - } - /* - * To (greatly) reduce EEXIST false positives, we - * denote that this fp has been epoll()'d. 
We do this - * regardless of epoll compatibility mode, as the flag - * is harmless if not in epoll compatibility mode. - */ - fp->f_flag2 |= FEPOLLED; + if (!is_epoll) { + ASSERT(fp == NULL); - /* - * Don't do VOP_POLL for an already cached fd with - * same poll events. - */ - if ((pdp->pd_events == pfdp->events) && - (pdp->pd_fp == fp)) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + /* + * The fd is not valid. Since we can't + * pass this error back in the write() + * call, set the bit in bitmap to force + * DP_POLL ioctl to examine it. + */ + BT_SET(pcp->pc_bitmap, fd); + pdp->pd_events |= pfdp->events; + continue; + } /* - * the events are already cached + * Don't do VOP_POLL for an already cached fd + * with same poll events. */ - releasef(fd); - continue; + if ((pdp->pd_events == pfdp->events) && + (pdp->pd_fp == fp)) { + /* + * the events are already cached + */ + releasef(fd); + continue; + } } + /* * do VOP_POLL and cache this poll fd. */ @@ -992,11 +1026,11 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * wake-ups. * * Drivers which never emit a pollhead will simply - * disobey the exectation of edge-triggered behavior. + * disobey the expectation of edge-triggered behavior. * This includes recursive epoll which, even on Linux, * yields its events in a level-triggered fashion only. */ - if ((pdp->pd_events & POLLET) && error == 0 && + if ((pfdp->events & POLLET) != 0 && error == 0 && php == NULL) { short levent = 0; @@ -1018,6 +1052,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) break; } pdp->pd_fp = fp; + pdp->pd_gen = gen; pdp->pd_events |= pfdp->events; if (php != NULL) { if (pdp->pd_php == NULL) { @@ -1143,8 +1178,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * to turn it off for a particular open. */ dpep->dpe_flag |= DP_ISEPOLLCOMPAT; - mutex_exit(&dpep->dpe_lock); + /* Record the epoll-enabled nature in the pollcache too */ + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_EPOLL; + mutex_exit(&pcp->pc_lock); + + mutex_exit(&dpep->dpe_lock); return (0); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index cfe0f78415..00b5f0e3de 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -347,8 +347,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = mac_perim_enter_by_macname( - dls_devnet_mac(dlh), &mph)) != 0) { + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } @@ -360,7 +360,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_link_rele(dlp); mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); @@ -702,7 +701,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -717,8 +717,18 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, else err = drv_ioc_clrap(linkid); } else { - if (kprop->pr_valsize == 0) - return (ENOBUFS); + /* + * You might think that the earlier call to + * mac_prop_check_size() should catch this but + * it can't. 
The autopush prop uses 0 as a + * sentinel value to clear the prop. This + * check ensures we don't allow a get with a + * valsize of 0. + */ + if (kprop->pr_valsize == 0) { + err = ENOBUFS; + goto done; + } kprop->pr_perm_flags = MAC_PROP_PERM_RW; err = drv_ioc_getap(linkid, dlap); @@ -866,7 +876,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1321,10 +1331,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, dls_link_t *dlp = NULL; dld_ioc_gettran_t *dgt = karg; - if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1343,13 +1356,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1373,10 +1387,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, if (dti->dti_nbytes != 256 || dti->dti_off != 0) return (EINVAL); - if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1396,13 +1413,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1499,7 +1517,6 @@ done: return (ret); } - /* * Note that ioctls that modify links have a NULL di_priv_func(), as * privileges can only be checked after we know the class of the link being @@ -1575,7 +1592,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 05989dccb2..238dd9bb22 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void 
proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -606,6 +609,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + new_flags |= DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + new_flags |= DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -693,6 +704,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -1184,7 +1211,6 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) uint16_t sap; uint_t addr_length; mblk_t *bp, *payload; - uint32_t start, stuff, end, value, flags; t_uscalar_t dl_err; uint_t max_sdu; @@ -1253,9 +1279,7 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) /* * Transfer the checksum offload information if it is present. */ - hcksum_retrieve(payload, NULL, NULL, &start, &stuff, &end, &value, - &flags); - (void) hcksum_assoc(bp, NULL, NULL, start, stuff, end, value, flags, 0); + mac_hcksum_clone(payload, bp); /* * Link the payload onto the new header. @@ -1296,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1350,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1377,24 +1410,22 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) } /* - * dld_capab_poll_enable() - * - * This function is misnamed. All polling and fanouts are run out of the - * lower mac (in case of VNIC and the only mac in case of NICs). The - * availability of Rx ring and promiscous mode is all taken care between - * the soft ring set (mac_srs), the Rx ring, and S/W classifier. Any - * fanout necessary is done by the soft rings that are part of the - * mac_srs (by default mac_srs sends the packets up via a TCP and - * non TCP soft ring). + * This function is misnamed. All polling and fanouts are run out of + * the lower MAC for VNICs and out of the MAC for NICs. The + * availability of Rx rings and promiscous mode is taken care of + * between the soft ring set (mac_srs), the Rx ring, and the SW + * classifier. 
Fanout, if necessary, is done by the soft rings that + * are part of the SRS. By default the SRS divvies up the packets + * based on protocol: TCP, UDP, or Other (OTH). * - * The mac_srs (or its associated soft rings) always store the ill_rx_ring + * The SRS (or its associated soft rings) always store the ill_rx_ring * (the cookie returned when they registered with IP during plumb) as their * 2nd argument which is passed up as mac_resource_handle_t. The upcall * function and 1st argument is what the caller registered when they * called mac_rx_classify_flow_add() to register the flow. For VNIC, * the function is vnic_rx and argument is vnic_t. For regular NIC * case, it mac_rx_default and mac_handle_t. As explained above, the - * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) + * SRS (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) * from its stored 2nd argument. */ static int @@ -1407,11 +1438,11 @@ dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll) return (ENOTSUP); /* - * Enable client polling if and only if DLS bypass is possible. - * Special cases like VLANs need DLS processing in the Rx data path. - * In such a case we can neither allow the client (IP) to directly - * poll the softring (since DLS processing hasn't been done) nor can - * we allow DLS bypass. + * Enable client polling if and only if DLS bypass is + * possible. Some traffic requires DLS processing in the Rx + * data path. In such a case we can neither allow the client + * (IP) to directly poll the soft ring (since DLS processing + * hasn't been done) nor can we allow DLS bypass. */ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg)) return (ENOTSUP); @@ -1456,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: return (dld_capab_poll_enable(dsp, poll)); @@ -1466,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) } static int +dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_ipcheck_t *ipc = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: + ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr; + ipc->ipc_allowed_dh = dsp->ds_mch; + return (0); + case DLD_DISABLE: + return (0); + } + + return (ENOTSUP); +} + +static int dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) { dld_capab_lso_t *lso = data; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: { mac_capab_lso_t mac_lso; @@ -1517,8 +1573,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. 
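The new dld_capab_ipcheck() above follows the same ENABLE/DISABLE convention as the other DLD_CAPAB_* handlers: fill in a callback and handle on DLD_ENABLE, accept DLD_DISABLE quietly, reject everything else. Only that convention is taken from the code above; the capability structure, field names, and callback in the sketch below are invented for illustration.

#include <sys/dld_impl.h>

/* Invented for illustration only. */
typedef struct dld_capab_foo {
        uintptr_t       foo_func_df;    /* callback into the provider */
        uintptr_t       foo_func_dh;    /* handle passed back with it */
} dld_capab_foo_t;

extern void some_kernel_callback(void);        /* hypothetical */

static int
dld_capab_foo(dld_str_t *dsp, void *data, uint_t flags)
{
        dld_capab_foo_t *foo = data;

        ASSERT(MAC_PERIM_HELD(dsp->ds_mh));

        switch (flags) {
        case DLD_ENABLE:
                foo->foo_func_df = (uintptr_t)some_kernel_callback;
                foo->foo_func_dh = dsp->ds_mch;
                return (0);
        case DLD_DISABLE:
                return (0);
        }

        return (ENOTSUP);
}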
*/ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (dsp->ds_sap != ETHERTYPE_IP || - !check_mod_above(dsp->ds_rq, "ip"))) { + (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1539,6 +1596,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) err = dld_capab_lso(dsp, data, flags); break; + case DLD_CAPAB_IPCHECK: + err = dld_capab_ipcheck(dsp, data, flags); + break; + default: err = ENOTSUP; break; @@ -1600,9 +1661,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. */ - if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) { + if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1721,3 +1788,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 24b5ab2e72..4cb0f95cf5 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -855,6 +855,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. + */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. + */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? 
dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? */ + freemsg(mp); + return (NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -903,7 +974,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -949,38 +1019,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. - */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != NULL) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 92993ada58..303ea8eb24 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -171,16 +171,16 @@ dls_bind(dld_str_t *dsp, uint32_t sap) /* * The MAC layer does the VLAN demultiplexing and will only pass up * untagged packets to non-promiscuous primary MAC clients. In order to - * support the binding to the VLAN SAP which is required by DLPI, dls + * support binding to the VLAN SAP, which is required by DLPI, DLS * needs to get a copy of all tagged packets when the client binds to * the VLAN SAP. We do this by registering a separate promiscuous - * callback for each dls client binding to that SAP. + * callback for each DLS client binding to that SAP. * * Note: even though there are two promiscuous handles in dld_str_t, * ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle - * to receive VLAN pkt when promiscuous mode is not on. Only one of - * them can be non-NULL at the same time, to avoid receiving dup copies - * of pkts. + * to receive VLAN traffic when promiscuous mode is not on. Only one of + * them can be non-NULL at the same time, to avoid receiving duplicate + * copies of packets. 
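i_dld_raw_ether_check() above concentrates the rules for raw Ethernet frames sent on a VLAN stream, and they reduce to a few comparisons on the tag control information. A hedged standalone restatement; 'tagged' stands for "carried a tag that is not simply the port VLAN" (mhi_istagged && !mhi_ispvid in the code above).

#include <sys/types.h>
#include <sys/vlan.h>   /* VLAN_ID(), VLAN_PRI(), VLAN_ID_NONE */

static boolean_t
raw_vlan_ok(uint16_t dvid, uint16_t tci, boolean_t tagged)
{
        uint16_t vid = VLAN_ID(tci);
        uint16_t pri = VLAN_PRI(tci);

        /* A VLAN stream adds its own tag; reject frames already carrying a VID. */
        if (dvid != VLAN_ID_NONE && vid != VLAN_ID_NONE)
                return (B_FALSE);

        /* A tag that conveys neither priority nor a VID is malformed. */
        if (tagged && pri == 0 && vid == VLAN_ID_NONE)
                return (B_FALSE);

        return (B_TRUE);
}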
*/ if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) { int err; @@ -248,19 +248,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; + uint32_t new_type = new_flags & + ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS); mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; + uint16_t mac_flags = 0; + boolean_t doremove = B_FALSE; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | - DLS_PROMISC_PHYS))); + DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS))); + + /* + * If we only have the non-data receive flags set or are only changing + * them, then there's nothing to do other than update the flags here. + * Basically when we only have something in the set of + * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's + * nothing else for us to do other than toggle it, as there's no need to + * talk to MAC and we don't have to do anything else. + */ + if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) { + dsp->ds_promisc = new_flags; + return (0); + } /* * If the user has only requested DLS_PROMISC_MULTI then we need to make * sure that they don't see all packets. */ - if (new_flags == DLS_PROMISC_MULTI) + if (new_type == DLS_PROMISC_MULTI) mptype = MAC_CLIENT_PROMISC_MULTI; + /* + * Look at new flags and figure out the correct mac promisc flags. + * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS, + * don't turn on physical promisc mode. + */ + if (new_flags & DLS_PROMISC_RX_ONLY) + mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_flags & DLS_PROMISC_FIXUPS) + mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS; + if (new_type == DLS_PROMISC_SAP) + mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; + + /* + * If we're coming in and we're being asked to transition to a state + * where the only DLS flags would be enabled are flags that change what + * we do with promiscuous packets (DLS_PROMISC_RX_ONLY and + * DLS_PROMISC_FIXUPS) and not which packets we should receive, then we + * need to remove the MAC layer promiscuous handler. + */ + if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 && + new_flags != 0) { + doremove = B_TRUE; + } + + /* + * There are three cases we care about here with respect to MAC. Going + * from nothing to something, something to nothing, something to + * something where we need to change how we're getting stuff from mac. + * In the last case, as long as they're not equal, we need to assume + * something has changed and do something about it. + */ if (dsp->ds_promisc == 0 && new_flags != 0) { /* * If only DLS_PROMISC_SAP, we don't turn on the @@ -268,9 +318,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, - (new_flags != DLS_PROMISC_SAP) ? 
0 : - MAC_PROMISC_FLAGS_NO_PHYS); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) { dsp->ds_promisc = old_flags; return (err); @@ -281,7 +329,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) mac_promisc_remove(dsp->ds_vlan_mph); dsp->ds_vlan_mph = NULL; } - } else if (dsp->ds_promisc != 0 && new_flags == 0) { + } else if (dsp->ds_promisc != 0 && + (new_flags == 0 || doremove == B_TRUE)) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); @@ -296,19 +345,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); } - } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 && - new_flags != dsp->ds_promisc) { - /* - * If the old flag is PROMISC_SAP, but the current flag has - * changed to some new non-zero value, we need to turn the - * physical promiscuous mode. - */ + } else if (new_flags != 0 && new_flags != old_flags) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); /* Honors both after-remove and before-add semantics! */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, 0); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) dsp->ds_promisc = old_flags; } else { @@ -629,6 +672,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -650,8 +709,8 @@ dls_mac_active_set(dls_link_t *dlp) /* request the primary MAC address */ if ((err = mac_unicast_add(dlp->dl_mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_TAG_DISABLE | - MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, 0, - &diag)) != 0) { + MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, + VLAN_ID_NONE, &diag)) != 0) { return (err); } @@ -659,7 +718,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -685,7 +747,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. */ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -694,7 +760,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? 
+ DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -725,7 +792,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. + */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 6c8ffcb0a9..c792251052 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -30,11 +30,15 @@ #include <sys/sysmacros.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/strsun.h> #include <sys/vlan.h> #include <sys/dld_impl.h> #include <sys/sdt.h> #include <sys/atomic.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/datalink.h> static kmem_cache_t *i_dls_link_cachep; mod_hash_t *i_dls_link_hash; @@ -159,6 +163,18 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, uint16_t cvid, cpri; int err; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. + */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err); if (err != 0) break; @@ -353,6 +369,22 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, int err, rval; /* + * The mac_hw_emul() function, by design, doesn't predicate on + * HW_LOCAL_MAC. But since we are in Rx context we know that + * any LSO packet must also be from a same-machine sender. We + * take advantage of that and forgoe writing a manual loop to + * predicate on HW_LOCAL_MAC. + * + * But for checksum emulation we need to predicate on + * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that + * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM + * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we + * do the checksum emulation in the second loop and in + * subchain matching. + */ + mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL); + + /* * Walk the packet chain. */ for (; mp != NULL; mp = nextp) { @@ -361,6 +393,18 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, */ accepted = B_FALSE; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. 
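The dls_promisc() rework earlier in dls.c separates which packets a stream wants to see (SAP/MULTI/PHYS) from how it wants to see them (RX_ONLY, FIXUPS) and translates the latter into MAC-layer promisc flags. The mapping itself is small; a hedged restatement of it, lifted from the logic above into a helper for clarity.

#include <sys/dls.h>
#include <sys/mac_client.h>

static uint16_t
dls_to_mac_promisc_flags(uint32_t new_flags)
{
        uint32_t type = new_flags &
            ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS);
        uint16_t mac_flags = 0;

        if (new_flags & DLS_PROMISC_RX_ONLY)
                mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
        if (new_flags & DLS_PROMISC_FIXUPS)
                mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS;
        /* SAP-only promiscuity never enables physical promiscuous mode. */
        if (type == DLS_PROMISC_SAP)
                mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;

        return (mac_flags);
}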
+ */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); if (err != 0) { atomic_inc_32(&(dlp->dl_unknowns)); @@ -379,7 +423,16 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, vid = VLAN_ID(mhi.mhi_tci); + /* + * This condition is true only when a sun4v vsw client + * is on the scene; as it is the only type of client + * that multiplexes VLANs on a single client instance. + * All other types of clients have one VLAN per client + * instance. In that case, MAC strips the VLAN tag + * before delivering it to DLS (see mac_rx_deliver()). + */ if (mhi.mhi_istagged) { + /* * If it is tagged traffic, send it upstream to * all dld_str_t which are attached to the physical @@ -554,7 +607,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, dls_head_t *dhp; mod_hash_key_t key; + /* + * We expect to deal with only a single packet. + */ + ASSERT3P(mp->b_next, ==, NULL); + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); + if (err != 0) goto drop; @@ -580,6 +639,67 @@ drop: freemsg(mp); } +/* + * We'd like to notify via sysevents that a link state change has occurred. + * There are a couple of challenges associated with this. The first is that if + * the link is flapping a lot, we may not see an accurate state when we launch + * the notification, we're told it changed, not what it changed to. + * + * The next problem is that all of the information that a user has associated + * with this device is the exact opposite of what we have on the dls_link_t. We + * have the name of the mac device, which has no bearing on what users see. + * Likewise, we don't have the datalink id either. So we're going to have to get + * this from dls. + * + * This is all further complicated by the fact that this could be going on in + * another thread at the same time as someone is tearing down the dls_link_t + * that we're associated with. We need to be careful not to grab the mac + * perimeter, otherwise we stand a good chance of deadlock. + */ +static void +dls_link_notify(void *arg, mac_notify_type_t type) +{ + dls_link_t *dlp = arg; + dls_dl_handle_t dhp; + nvlist_t *nvp; + sysevent_t *event; + sysevent_id_t eid; + + if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK) + return; + + /* + * If we can't find a devnet handle for this link, then there is no user + * knowable device for this at the moment and there's nothing we can + * really share with them that will make sense. + */ + if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0) + return; + + /* + * Because we're attaching this nvlist_t to the sysevent, it'll get + * cleaned up when we call sysevent_free. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID, + dls_devnet_linkid(dhp)) == 0); + VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME, + dls_devnet_link(dhp)) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID, + dls_devnet_getzid(dhp)) == 0); + + dls_devnet_rele_tmp(dhp); + + event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE, + ILLUMOS_KERN_PUB"dls", SE_SLEEP); + VERIFY(event != NULL); + (void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp); + + (void) log_sysevent(event, SE_SLEEP, &eid); + sysevent_free(event); + +} + static void i_dls_link_destroy(dls_link_t *dlp) { @@ -590,6 +710,9 @@ i_dls_link_destroy(dls_link_t *dlp) /* * Free the structure back to the cache. 
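dls_link_notify() above publishes EC_DATALINK / ESC_DATALINK_LINK_STATE sysevents carrying the link id, link name, and zone id. A userland consumer could watch for them roughly as follows; this is a hedged sketch using the general-purpose libsysevent interfaces, and it assumes <sys/sysevent/datalink.h> (added alongside this change) supplies the subclass and attribute names used by the publisher.

#include <libsysevent.h>
#include <libnvpair.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/datalink.h>
#include <stdio.h>
#include <unistd.h>

static void
handler(sysevent_t *ev)
{
        nvlist_t *nvl = NULL;
        char *name;

        if (sysevent_get_attr_list(ev, &nvl) != 0)
                return;
        if (nvlist_lookup_string(nvl, DATALINK_EV_LINK_NAME, &name) == 0)
                (void) printf("link state changed: %s\n", name);
        nvlist_free(nvl);
}

int
main(void)
{
        const char *subclasses[] = { ESC_DATALINK_LINK_STATE };
        sysevent_handle_t *shp = sysevent_bind_handle(handler);

        if (shp == NULL)
                return (1);
        if (sysevent_subscribe_event(shp, EC_DATALINK, subclasses, 1) != 0)
                return (1);
        for (;;)
                (void) pause();
}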
*/ + if (dlp->dl_mnh != NULL) + mac_notify_remove(dlp->dl_mnh, B_TRUE); + if (dlp->dl_mch != NULL) mac_client_close(dlp->dl_mch, 0); @@ -601,8 +724,10 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mh = NULL; dlp->dl_mch = NULL; dlp->dl_mip = NULL; + dlp->dl_mnh = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } @@ -641,6 +766,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) if (err != 0) goto bail; + dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp); + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, dlp->dl_mch); diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 9755aed43f..a910cbe6b1 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2017 Joyent, Inc. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. @@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL; /* dls_devnet_t dd_flags */ #define DD_CONDEMNED 0x1 #define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */ +#define DD_INITIALIZING 0x4 + +/* + * If the link is marked as initializing or condemned then it should + * not be visible outside of the DLS framework. + */ +#define DD_NOT_VISIBLE(flags) ( \ + (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0) /* * This structure is used to keep the <linkid, macname> mapping. @@ -108,13 +117,14 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); -static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); +static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t); /*ARGSUSED*/ static int @@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg) { dls_devnet_t *ddp = buf; - ASSERT(ddp->dd_ksp == NULL); - ASSERT(ddp->dd_ref == 0); - ASSERT(ddp->dd_tref == 0); + VERIFY(ddp->dd_ksp == NULL); + VERIFY(ddp->dd_ref == 0); + VERIFY(ddp->dd_tref == 0); mutex_destroy(&ddp->dd_mutex); cv_destroy(&ddp->dd_cv); } @@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
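Stepping back to the dls_link.c hunks just above: the notify callback registered in i_dls_link_create() is paired with a blocking removal in i_dls_link_destroy(). Condensed below from those hunks (not new code); the B_TRUE argument asks mac_notify_remove() to wait for any in-flight callback before the MAC client is closed.

	/* i_dls_link_create() */
	dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp);

	/* i_dls_link_destroy() */
	if (dlp->dl_mnh != NULL)
		mac_notify_remove(dlp->dl_mnh, B_TRUE);
	if (dlp->dl_mch != NULL)
		mac_client_close(dlp->dl_mch, 0);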
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -529,6 +544,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -537,6 +573,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, + if (dls_stat_create("link", 0, nm, zoneid, dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid, - &ksp) == 0) { + &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* - * Associate a linkid with a given link (identified by macname) + * Associate the linkid with the link identified by macname. If this + * is called on behalf of a physical link then linkid may be + * DATALINK_INVALID_LINKID. 
Otherwise, if called on behalf of a + * virtual link, linkid must have a value. */ static int -dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, +dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid, dls_devnet_t **ddpp) { + const char *macname = mac_name(mh); dls_devnet_t *ddp = NULL; datalink_class_t class; int err; @@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, } /* - * This might be a physical link that has already - * been created, but which does not have a linkid - * because dlmgmtd was not running when it was created. + * If we arrive here we know we are attempting to set + * the linkid on a physical link. A virtual link + * should never arrive here because it should never + * call this function without a linkid. Virtual links + * are created through dlgmtmd and thus we know + * dlmgmtd is alive to assign it a linkid (search for + * uses of dladm_create_datalink_id() to prove this to + * yourself); we don't have the same guarantee for a + * physical link which may perform an upcall for a + * linkid while dlmgmtd is down but will continue + * creating a devnet without the linkid (see + * softmac_create_datalink() to see how physical link + * creation works). That is why there is no entry in + * the id hash but there is one in the macname hash -- + * softmac couldn't acquire a linkid the first time it + * called this function. + * + * Because of the check above, we also know that + * ddp->dd_linkid is not set. Following this, the link + * must still be in the DD_INITIALIZING state because + * that flag is removed IFF dd_linkid is set. This is + * why we can ASSERT the DD_INITIALIZING flag below if + * the call to i_dls_devnet_setzid() fails. */ if (linkid == DATALINK_INVALID_LINKID || class != DATALINK_CLASS_PHYS) { err = EINVAL; goto done; } + + ASSERT(ddp->dd_flags & DD_INITIALIZING); + } else { ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); + ddp->dd_flags = DD_INITIALIZING; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_owner_zid = zoneid; @@ -856,12 +942,6 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, (mod_hash_val_t)ddp) == 0); devnet_need_rebuild = B_TRUE; stat_create = B_TRUE; - mutex_enter(&ddp->dd_mutex); - if (!ddp->dd_prop_loaded && (ddp->dd_prop_taskid == NULL)) { - ddp->dd_prop_taskid = taskq_dispatch(system_taskq, - dls_devnet_prop_task, ddp, TQ_SLEEP); - } - mutex_exit(&ddp->dd_mutex); } err = 0; done: @@ -875,8 +955,19 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) - (void) dls_devnet_unset(macname, &linkid, B_TRUE); + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) { + /* + * At this point the link is marked as + * DD_INITIALIZING -- there can be no + * outstanding temp refs and therefore no need + * to wait for them. + */ + ASSERT(ddp->dd_flags & DD_INITIALIZING); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); + return (err); + } + /* * The kstat subsystem holds its own locks (rather perimeter) * before calling the ks_update (dls_devnet_stat_update) entry @@ -884,20 +975,35 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. 
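Condensing the initialization protocol described above (fragments gathered from the hunks in this file, not new code): a freshly allocated devnet starts out invisible, every hold path treats it as nonexistent, and it is published only once its linkid is known.

	/* dls_devnet_set(): new entries start out invisible. */
	ddp->dd_flags = DD_INITIALIZING;

	/* Every hold path: invisible entries look nonexistent. */
	if (DD_NOT_VISIBLE(ddp->dd_flags))
		return (ENOENT);

	/* dls_devnet_create()/_recreate(): publish once the linkid is set. */
	mutex_enter(&ddp->dd_mutex);
	if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
		ddp->dd_flags &= ~DD_INITIALIZING;
	mutex_exit(&ddp->dd_mutex);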
*/ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; + + mutex_enter(&ddp->dd_mutex); + if (linkid != DATALINK_INVALID_LINKID && + !ddp->dd_prop_loaded && ddp->dd_prop_taskid == NULL) { + ddp->dd_prop_taskid = taskq_dispatch(system_taskq, + dls_devnet_prop_task, ddp, TQ_SLEEP); + } + mutex_exit(&ddp->dd_mutex); + } return (err); } /* - * Disassociate a linkid with a given link (identified by macname) - * This waits until temporary references to the dls_devnet_t are gone. + * Disassociate the linkid from the link identified by macname. If + * wait is B_TRUE, wait until all temporary refs are released and the + * prop task is finished. + * + * If waiting then you SHOULD NOT call this from inside the MAC perim + * as deadlock will ensue. Otherwise, this function is safe to call + * from inside or outside the MAC perim. */ static int -dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) +dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait) { + const char *macname = mac_name(mh); dls_devnet_t *ddp; int err; mod_hash_val_t val; @@ -918,21 +1024,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) * deadlock. Return EBUSY if the asynchronous thread started for * property loading as part of the post attach hasn't yet completed. */ - ASSERT(ddp->dd_ref != 0); + VERIFY(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) its possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. + * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphained VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, reset ref to 1; + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); - /* * Remove this dls_devnet_t from the hash table. */ @@ -947,18 +1094,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) } rw_exit(&i_dls_devnet_lock); + /* + * It is important to call i_dls_devnet_setzid() WITHOUT the + * i_dls_devnet_lock held. The setzid call grabs the MAC + * perim; thus causing DLS -> MAC lock ordering if performed + * with the i_dls_devnet_lock held. 
This forces consumers to + * grab the MAC perim before calling dls_devnet_unset() (the + * locking rules state MAC -> DLS order). By performing the + * setzid outside of the i_dls_devnet_lock consumers can + * safely call dls_devnet_unset() outside the MAC perim. + */ + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } + if (wait) { /* * Wait until all temporary references are released. + * The holders of the tref need the MAC perim to + * perform their work and release the tref. To avoid + * deadlock, assert that the perim is never held here. */ + ASSERT0(MAC_PERIM_HELD(mh)); while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) cv_wait(&ddp->dd_cv, &ddp->dd_mutex); } else { - ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); + VERIFY(ddp->dd_tref == 0); + VERIFY(ddp->dd_prop_taskid == NULL); } - if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid); + } ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; @@ -969,6 +1138,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) return (0); } +/* + * This is a private hold routine used when we already have the dls_link_t, thus + * we know that it cannot go away. + */ +int +dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp) +{ + int err; + dls_devnet_t *ddp = NULL; + + rw_enter(&i_dls_devnet_lock, RW_WRITER); + if ((err = mod_hash_find(i_dls_devnet_hash, + (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) { + ASSERT(err == MH_ERR_NOTFOUND); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + + mutex_enter(&ddp->dd_mutex); + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + ddp->dd_tref++; + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + + *ddhp = ddp; + return (0); +} + static int dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, boolean_t tmp_hold) @@ -985,8 +1187,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1053,8 +1255,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) return (ENOENT); } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1071,7 +1273,7 @@ void dls_devnet_rele(dls_devnet_t *ddp) { mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 1); + VERIFY(ddp->dd_ref > 1); ddp->dd_ref--; if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) { mutex_exit(&ddp->dd_mutex); @@ -1083,7 +1285,7 @@ dls_devnet_rele(dls_devnet_t *ddp) } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) { char drv[MAXLINKNAMELEN]; uint_t ppa; @@ -1093,7 +1295,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) dls_dev_handle_t ddh; int err; - if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0) + if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0) return 
(dls_devnet_hold(linkid, ddpp)); /* @@ -1236,9 +1438,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zoneinit parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1283,10 +1491,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) } mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } mutex_exit(&ddp->dd_mutex); @@ -1297,7 +1507,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1364,7 +1582,7 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (mph != NULL) mac_perim_exit(mph); @@ -1373,7 +1591,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1402,10 +1621,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) sizeof (retval)); if (err != 0) goto done; + + /* + * We set upcall_done only if the upcall is + * successful. This way, if dls_link_setzid() fails, + * we know another upcall must be done to reset the + * dlmgmtd state. + */ upcall_done = B_TRUE; } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1420,7 +1647,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1442,7 +1669,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1459,7 +1686,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1497,15 +1724,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. 
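The zoneinit rename above, together with the matching "z%d_%s" kstat naming in dls_devnet_stat_create(), gives a zone-assigned link a zone-qualified name in the global view; per the comment above, the intent is to avoid collisions between same-named VNICs during creation. For illustration only (the zone id and link name below are made up), the same format string produces:

	char tname[MAXLINKNAMELEN];

	/* A VNIC "net0" delegated to zone id 42 ... */
	(void) snprintf(tname, sizeof (tname), "z%d_%s", 42, "net0");
	/*
	 * ... appears in the global view as "z42_net0", so it cannot
	 * collide with another zone's own "net0".
	 */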
*/ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1538,6 +1769,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ @@ -1594,13 +1831,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) * we need to use the linkid to get the user name for the link * when we create the MAC client. */ - if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) { + if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) { if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { mac_perim_exit(mph); - (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); return (err); } + + /* + * If dd_linkid is set then the link was successfully + * initialized. In this case we can remove the + * initializing flag and make the link visible to the + * rest of the system. + * + * If not set then we were called by softmac and it + * was unable to obtain a linkid for the physical link + * because dlmgmtd is down. In that case softmac will + * eventually obtain a linkid and call + * dls_devnet_recreate() to complete initialization. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + mac_perim_exit(mph); return (err); } @@ -1614,8 +1870,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { - ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL)); + dls_devnet_t *ddp; + int err; + + VERIFY(linkid != DATALINK_INVALID_LINKID); + if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) { + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + + return (err); + } int @@ -1625,15 +1892,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), idp, wait); - if (err != 0 && err != ENOENT) + err = dls_devnet_unset(mh, idp, wait); + + /* + * We continue on in the face of ENOENT because the devnet + * unset and DLS link release are not atomic and we may have a + * scenario where there is no entry in i_dls_devnet_hash for + * the MAC name but there is an entry in i_dls_link_hash. For + * example, if the following occurred: + * + * 1. dls_devnet_unset() returns success, and + * + * 2. dls_link_rele_by_name() fails with ENOTEMPTY because + * flows still exist, and + * + * 3. dls_devnet_set() fails to set the zone id and calls + * dls_devnet_unset() -- leaving an entry in + * i_dls_link_hash but no corresponding entry in + * i_dls_devnet_hash. 
+ * + * Even if #3 wasn't true the dls_devnet_set() may fail for + * different reasons in the future; the point is that it _can_ + * fail as part of its contract. We can't rely on it working + * so we must assume that these two pieces of state (devnet + * and link hashes), which should always be in sync, can get + * out of sync and thus even if we get ENOENT from the devnet + * hash we should still try to delete from the link hash just + * in case. + * + * We could prevent the ENOTEMPTY from dls_link_rele_by_name() + * by calling mac_disable() before calling + * dls_devnet_destroy() but that's not currently possible due + * to a long-standing bug. OpenSolaris 6791335: The semantics + * of mac_disable() were modified by Crossbow such that + * dls_devnet_destroy() needs to be called before + * mac_disable() can succeed. This is because of the implicit + * reference that dls has on the mac_impl_t. + */ + if (err != 0 && err != ENOENT) { return (err); + } mac_perim_enter_by_mh(mh, &mph); err = dls_link_rele_by_name(mac_name(mh)); - mac_perim_exit(mph); - if (err != 0) { + dls_devnet_t *ddp; + /* * XXX It is a general GLDv3 bug that dls_devnet_set() has to * be called to re-set the link when destroy fails. The @@ -1641,9 +1945,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) * called from kernel context or from a zone other than that * which initially created the link. */ - (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()), - NULL); + (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp); + + /* + * You might think dd_linkid should always be set + * here, but in the case where dls_devnet_unset() + * returns ENOENT it will be DATALINK_INVALID_LINKID. + * Stay consistent with the rest of DLS and only + * remove the initializing flag if linkid is set. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); } + + mac_perim_exit(mph); return (err); } @@ -1717,6 +2034,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid) } const char * +dls_devnet_link(dls_dl_handle_t ddh) +{ + return (ddh->dd_linkname); +} + +const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
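Returning to dls_devnet_destroy() above: the comment notes that mac_disable() cannot succeed until the devnet has been destroyed, so callers tear down the devnet first and undo on failure. A trimmed sketch of that caller-side ordering follows; the surrounding context (for example a VNIC delete path) and the undo step are illustrative and not part of this change, and err/linkid are assumed to be declared by the caller.

	if ((err = dls_devnet_destroy(mh, &linkid, B_TRUE)) != 0)
		return (err);	/* still held or still delegated */

	if ((err = mac_disable(mh)) != 0) {
		/* Undo: re-create the devnet so the link remains usable. */
		(void) dls_devnet_create(mh, linkid, crgetzoneid(CRED()));
		return (err);
	}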
*/ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
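The dls_kstat_t layout above is what surfaces under the "link" kstat module, including the link_duplex, unknowns and new zonename entries. One way to inspect it from userland is libkstat; the snippet below is illustrative only and assumes a link named "net0" exists. The same data is visible with kstat -m link -n net0.

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* Module "link", instance 0 -- as created by dls_stat_create(). */
	if ((ksp = kstat_lookup(kc, "link", 0, "net0")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "obytes64")) != NULL)
		(void) printf("obytes64 = %llu\n",
		    (u_longlong_t)kn->value.ui64);
	if ((kn = kstat_data_lookup(ksp, "zonename")) != NULL)
		(void) printf("zonename = %s\n", KSTAT_NAMED_STR_PTR(kn));

	(void) kstat_close(kc);
	return (0);
}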
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..02354c9b16 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5510 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+ * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
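The attach path above only selects an interrupt type (MSI-X, then MSI, then FIXED); the actual registration lives in drsas_add_intrs(), which is outside this hunk. As a reminder of the DDI sequence such a helper typically follows, here is a generic single-vector sketch; it is simplified and is not the driver's actual implementation.

/*
 * Generic sketch: allocate, prime and enable one interrupt of the
 * given type. Error handling is reduced to the minimum.
 */
static int
add_one_intr_sketch(dev_info_t *dip, int intr_type,
    ddi_intr_handle_t *hdlp, uint_t *prip,
    uint_t (*isr)(caddr_t, caddr_t), caddr_t arg)
{
	int count, actual;

	if (ddi_intr_get_nintrs(dip, intr_type, &count) != DDI_SUCCESS ||
	    count < 1)
		return (DDI_FAILURE);

	if (ddi_intr_alloc(dip, hdlp, intr_type, 0, 1, &actual,
	    DDI_INTR_ALLOC_NORMAL) != DDI_SUCCESS || actual < 1)
		return (DDI_FAILURE);

	if (ddi_intr_get_pri(hdlp[0], prip) != DDI_SUCCESS ||
	    ddi_intr_add_handler(hdlp[0], isr, arg, NULL) != DDI_SUCCESS) {
		(void) ddi_intr_free(hdlp[0]);
		return (DDI_FAILURE);
	}

	return (ddi_intr_enable(hdlp[0]) == DDI_SUCCESS ?
	    DDI_SUCCESS : DDI_FAILURE);
}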
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)instance; + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + 
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
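drsas_open() maps the opened minor number back to a soft-state instance with MINOR2INST(), while attach created several minor nodes (devctl, scsi, ioctl) per instance. The following is a hypothetical, standalone sketch of one way such a minor-number space can be multiplexed; the driver's real INST2DEVCTL/INST2SCSI/INST2LSIRDCTL/MINOR2INST encodings live in its header and may differ:

#include <stdio.h>

/*
 * Hypothetical encoding only: low 2 bits select the node type, the rest
 * hold the instance number. Not the dr_sas header's actual macros.
 */
#define	EX_NODE_DEVCTL	0
#define	EX_NODE_SCSI	1
#define	EX_NODE_IOCTL	2

#define	EX_INST2MINOR(inst, type)	(((inst) << 2) | (type))
#define	EX_MINOR2INST(minor)		((minor) >> 2)
#define	EX_MINOR2TYPE(minor)		((minor) & 0x3)

int
main(void)
{
	int minor = EX_INST2MINOR(5, EX_NODE_IOCTL);

	printf("minor=%d instance=%d type=%d\n",
	    minor, EX_MINOR2INST(minor), EX_MINOR2TYPE(minor));
	return (0);
}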
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
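drsas_name_node() above formats a child's unit address as hexadecimal "target,lun", and drsas_find_child() locates a child by comparing that string. A small standalone sketch of the round trip, with made-up values:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char addr[32], node[32];
	unsigned int tgt = 0x1a, lun = 0;

	/* what drsas_find_child() builds from the requested tgt/lun */
	(void) snprintf(addr, sizeof (addr), "%x,%x", tgt, lun);

	/* what drsas_name_node() derives from a child's target/lun props */
	(void) snprintf(node, sizeof (node), "%x,%x", 0x1a, 0);

	printf("addr=%s match=%s\n", addr,
	    strcmp(addr, node) == 0 ? "yes" : "no");
	return (0);
}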
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct 
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
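drsas_tran_getcap() follows the usual getcap convention: -1 for an unknown or unanswerable capability, 0 or 1 for boolean capabilities, and a concrete value for limits such as SCSI_CAP_DMA_MAX. A simplified, string-keyed sketch of that convention (the real code dispatches on scsi_hba_lookup_capstr() indices):

#include <stdio.h>
#include <string.h>

static int
example_getcap(const char *cap)
{
	if (cap == NULL)
		return (-1);
	if (strcmp(cap, "dma-max") == 0)
		return (16 * 1024 * 1024);	/* 16MB transfer limit */
	if (strcmp(cap, "tagged-qing") == 0)
		return (1);			/* supported */
	if (strcmp(cap, "disconnect") == 0)
		return (0);			/* not supported */
	return (-1);				/* unknown capability */
}

int
main(void)
{
	printf("dma-max=%d unknown=%d\n",
	    example_getcap("dma-max"), example_getcap("geometry"));
	return (0);
}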
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, 
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+ */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. 
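complete_cmd_in_sync_mode() below wakes the waiter by broadcasting on int_cmd_cv rather than a Linux-style wait queue; the waiting side in issue_cmd_in_sync_mode() is not shown in this hunk but presumably blocks on the same condition variable. A standalone, user-level sketch of that handshake, using pthreads in place of the kernel cv_* primitives:

#include <pthread.h>
#include <stdio.h>

/* user-level stand-ins for the driver's mutex, int_cmd_cv and cmd_status */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int cmd_done;

static void *
completion_path(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	cmd_done = 1;			/* like recording cmd_status */
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, completion_path, NULL);

	pthread_mutex_lock(&lock);
	while (!cmd_done)		/* the sync-mode issuer's wait */
		pthread_cond_wait(&done_cv, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("command completed\n");
	return (0);
}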
+ */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
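The get_dma_cookies loop above caps the scatter/gather list at instance->max_num_sge and leaves whatever it could not map in pkt->pkt_resid; drsas_dma_move() repeats the same accounting for each further DMA window. Below is a minimal user-space model (not part of this patch) of that accounting, in plain C with no DDI calls; the segment sizes and sgl_limit are made-up stand-ins for the DMA cookies and max_num_sge.

#include <stdio.h>
#include <stddef.h>

/*
 * Model of the cookie accounting in drsas_dma_alloc()/drsas_dma_move():
 * walk at most sgl_limit segments, sum their sizes into "dmacount",
 * and report the unmapped remainder the way pkt_resid is computed.
 */
static size_t
map_window(const size_t *seg_sizes, size_t nsegs, size_t sgl_limit,
    size_t *consumed)
{
	size_t dmacount = 0;
	size_t i;

	for (i = 0; i < nsegs && i < sgl_limit; i++)
		dmacount += seg_sizes[i];	/* dmac_size of cookie i */

	*consumed = i;		/* cookies placed in this S/G list */
	return (dmacount);	/* bytes covered by this window */
}

int
main(void)
{
	/* made-up cookie sizes for one buffer */
	const size_t segs[] = { 4096, 4096, 8192, 4096, 4096 };
	const size_t nsegs = sizeof (segs) / sizeof (segs[0]);
	const size_t sgl_limit = 3;	/* stand-in for max_num_sge */
	const size_t b_bcount = 24576;	/* total I/O size */
	size_t used, dmacount, resid;

	dmacount = map_window(segs, nsegs, sgl_limit, &used);
	resid = (b_bcount >= dmacount) ? b_bcount - dmacount : 0;

	(void) printf("cookies used %zu, mapped %zu, pkt_resid %zu\n",
	    used, dmacount, resid);
	return (0);
}

Running it prints "cookies used 3, mapped 16384, pkt_resid 8192", matching the path where b_bcount exceeds cmd_dmacount and the residual is handed back for a later window.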
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context __unused; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? 
pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
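The xferlen/ubuf extraction a few lines up has to honour the caller's data model because the ioctl frame's SGL is a union of 32-bit and 64-bit entries, and reading the wrong member misinterprets both the address and the length. The stand-alone C sketch below (not part of this patch) shows the hazard; the sge32/sge64 layouts are plausible assumptions for illustration, not the exact dr_sas definitions, and the output described assumes a little-endian machine.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* assumed layouts, for illustration only */
struct sge32 {
	uint32_t	phys_addr;
	uint32_t	length;
};

struct sge64 {
	uint64_t	phys_addr;
	uint32_t	length;
};

union sgl {
	struct sge32	sge32[1];
	struct sge64	sge64[1];
};

int
main(void)
{
	union sgl u;

	(void) memset(&u, 0, sizeof (u));

	/* a 32-bit caller fills the first 32-bit entry */
	u.sge32[0].phys_addr = 0x12345678;
	u.sge32[0].length = 4096;

	/* the 64-bit view folds address and length into one bogus address */
	(void) printf("sge32 view: addr 0x%x len %u\n",
	    (unsigned)u.sge32[0].phys_addr, (unsigned)u.sge32[0].length);
	(void) printf("sge64 view: addr 0x%llx len %u\n",
	    (unsigned long long)u.sge64[0].phys_addr,
	    (unsigned)u.sge64[0].length);
	return (0);
}

This mismatch is why every issue_mfi_* routine checks ddi_model_convert_from(mode & FMODELS) and carries the #ifdef _ILP32 branches before it trusts the user-supplied SGE.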
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
+ con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, 
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
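This path, like the other ioctl paths, fixes cmd->frame_count at 1 further down because only two 32-bit SGEs are needed; the regular I/O path in build_cmd() instead derives frame_count from the size of the scatter/gather list and caps it at 8. A small C model of that arithmetic follows (not part of this patch); the 64-byte MFI frame size and 12-byte SGE64 size are assumptions chosen for illustration.

#include <stdio.h>

/*
 * Model of the frame_count computation in build_cmd(): the first MFI
 * frame holds the command header, and the scatter/gather list spills
 * into additional frames in MRMFI_FRAME_SIZE chunks, capped at 8.
 */
#define	MRMFI_FRAME_SIZE	64	/* assumed frame size */
#define	SGE64_SIZE		12	/* assumed sizeof (sge64) */
#define	MAX_FRAME_COUNT		8

static unsigned int
frame_count(unsigned int cookiecnt)
{
	unsigned int sge_bytes = SGE64_SIZE * cookiecnt;
	unsigned int count;

	count = (sge_bytes / MRMFI_FRAME_SIZE) +
	    ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1;
	if (count >= MAX_FRAME_COUNT)
		count = MAX_FRAME_COUNT;

	return (count);
}

int
main(void)
{
	unsigned int n;

	/* 1 cookie -> 2 frames, 16 -> 4 frames, 64 -> hits the cap of 8 */
	for (n = 1; n <= 64; n *= 4)
		(void) printf("%2u cookies -> %u frames\n",
		    n, frame_count(n));
	return (0);
}

With these numbers even a single-cookie transfer is counted as two frames: the header frame plus one spill-over chunk for the SGL.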
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new + * AEN request we currently have. If it is, then we don't have + * to do anything. In other words, whichever events the current + * AEN request is subscribing to, have already been subscribed + * to. + * + * If the old_cmd is _not_ inclusive, then we have to abort + * that command, form a class_locale that is superset of both + * old and current and re-issue to the FW + */ + + curr_aen.word = class_locale_word; + aen_cmd = instance->aen_cmd; + if (aen_cmd) { + prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle, + &aen_cmd->frame->dcmd.mbox.w[1]); + + /* + * A class whose enum value is smaller is inclusive of all + * higher values. If a PROGRESS (= -1) was previously + * registered, then a new registration requests for higher + * classes need not be sent to FW. They are automatically + * included. + * + * Locale numbers don't have such hierarchy. They are bitmap + * values + */ + if ((prev_aen.members.class <= curr_aen.members.class) && + !((prev_aen.members.locale & curr_aen.members.locale) ^ + curr_aen.members.locale)) { + /* + * Previously issued event registration includes + * current request. Nothing to do. + */ + + return (0); + } else { + curr_aen.members.locale |= prev_aen.members.locale; + + if (prev_aen.members.class < curr_aen.members.class) + curr_aen.members.class = prev_aen.members.class; + + ret_val = abort_aen_cmd(instance, aen_cmd); + + if (ret_val) { + con_log(CL_ANN, (CE_WARN, "register_mfi_aen: " + "failed to abort prevous AEN command")); + + return (ret_val); + } + } + } else { + curr_aen.word = class_locale_word; + } + + cmd = get_mfi_pkt(instance); + + if (!cmd) + return (ENOMEM); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + /* Prepare DCMD for aen registration */ + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_detail)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_WAIT); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1], + curr_aen.word); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_detail)); + + instance->aen_seq_num = seq_num; + + + /* + * Store reference to the cmd used to register for AEN. 
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask __unused; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
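As a quick illustration of the parsing contract implemented by drsas_parse_devname() above: the unit address following '@' is read as hexadecimal "<target>,<lun>". The child name used here is hypothetical.

	char name[] = "sd@a,0";		/* hypothetical child name */
	int tgt, lun;

	if (drsas_parse_devname(name, &tgt, &lun) == DDI_SUCCESS) {
		/* tgt == 0x0a, lun == 0 */
	}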
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
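For context, a driver property declared in this configuration file is normally retrieved at attach time through the DDI property interfaces. A minimal sketch, assuming the drsas-enable-msi flag described below is read as a string (the actual dr_sas attach path may use a different property routine):

	char *prop = NULL;
	int enable_msi = 0;

	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "drsas-enable-msi", &prop) == DDI_PROP_SUCCESS) {
		enable_msi = (strcmp(prop, "yes") == 0);
		ddi_prop_free(prop);
	}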
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
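The two PCI device IDs above are the only controllers this driver claims. A minimal, hypothetical probe-time check (the function name is illustrative and not taken from the patch):

static int
example_device_supported(uint16_t device_id)
{
	switch (device_id) {
	case PCI_DEVICE_ID_LSI_2108VDE:		/* 0x0078 */
	case PCI_DEVICE_ID_LSI_2108V:		/* 0x0079 */
		return (1);	/* maps registers via REGISTER_SET_IO_2108 */
	default:
		return (0);
	}
}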
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
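A hypothetical caller sketch of that contract follows; the endian argument, the attribute values, and the return-value check are assumptions for illustration, not taken from this driver's actual callers.

static int
example_alloc_internal_buf(struct drsas_instance *instance)
{
	dma_obj_t obj;

	bzero(&obj, sizeof (obj));
	obj.size = 4096;				/* caller-chosen size */
	obj.dma_attr = drsas_generic_dma_attr;		/* then tweak as needed */
	obj.dma_attr.dma_attr_sgllen = 1;

	/* the endian flag argument is an assumption for illustration */
	if (drsas_alloc_dma_obj(instance, &obj, DDI_STRUCTURE_LE_ACC)
	    != DDI_SUCCESS)
		return (DDI_FAILURE);

	/* ... use obj.buffer and obj.dma_cookie[0].dmac_address ... */

	(void) drsas_free_dma_obj(instance, obj);
	return (DDI_SUCCESS);
}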
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
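Usage-wise, the second argument to con_log() is a complete, parenthesized cmn_err(9F) argument list, which is why calls take double parentheses; the message is emitted only when the global debug_level_g is at or above the requested level. For example:

	con_log(CL_ANN, (CE_WARN, "drsas: attach failed"));
	con_log(CL_DLEVEL1, (CE_NOTE, "drsas: %d cmds outstanding", 8));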
+ * + * + * console messages debug levels + */ +#define CL_NONE 0 /* No debug information */ +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ + +#ifdef __SUNPRO_C +#define __func__ "" +#endif + +#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; } + +/* + * ### SCSA definitions ### + */ +#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target) +#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun) +#define PKT2TRAN(pkt) ((pkt)->pkt_adress.a_hba_tran) +#define ADDR2TRAN(ap) ((ap)->a_hba_tran) + +#define TRAN2MR(tran) (struct drsas_instance *)(tran)->tran_hba_private) +#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap)) + +#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private) +#define CMD2PKT(sp) ((sp)->cmd_pkt) +#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request)) + +#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address) +#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran) +#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd))) + +#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */ +#define CFLAG_DMASEND 0x0002 /* Transfer from the device */ +#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */ + +/* + * ### Data structures for ioctl inteface and internal commands ### + */ + +/* + * Data direction flags + */ +#define UIOC_RD 0x00001 +#define UIOC_WR 0x00002 + +#define SCP2HOST(scp) (scp)->device->host /* to host */ +#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */ +#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */ +#define SCP2TARGET(scp) (scp)->device->id /* to target */ +#define SCP2LUN(scp) (scp)->device->lun /* to LUN */ + +#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0]) +#define SCP2ADAPTER(scp) \ + (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp)) + +#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \ + (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0 +#define MRDRV_IS_LOGICAL(ap) \ + ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All stings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * 
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
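To show how the register macros above and the interrupt-status mask defined just below fit together, here is a minimal interrupt-acknowledge sketch. It is an illustration only; the driver's actual intr_ack_ppc() may differ in ordering and in which clear register it writes.

static uint_t
example_intr_ack(struct drsas_instance *instance)
{
	uint32_t status = RD_OB_INTR_STATUS(instance);

	if (!(status & MFI_REPLY_2108_MESSAGE_INTR_MASK))
		return (DDI_INTR_UNCLAIMED);	/* not our interrupt */

	/* clear the cause, then read back to flush the write */
	WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance);
	(void) RD_OB_SCRATCH_PAD_0(instance);

	return (DDI_INTR_CLAIMED);
}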
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t 
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
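The helpers defined below are used by embedding an mlist_t in a structure and recovering the container with mlist_entry(). A small, self-contained usage sketch; the example_* names are hypothetical and mirror how the driver strings commands onto its pool lists.

struct example_cmd {
	int	index;
	mlist_t	list;			/* linkage into a pool */
};

static LIST_HEAD(example_pool);		/* empty head points at itself */

static void
example_put(struct example_cmd *cmd)
{
	mlist_add_tail(&cmd->list, &example_pool);
}

static struct example_cmd *
example_get(void)
{
	struct example_cmd *cmd;

	if (mlist_empty(&example_pool))
		return (NULL);

	cmd = mlist_entry(example_pool.next, struct example_cmd, list);
	mlist_del_init(&cmd->list);
	return (cmd);
}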
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. + */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/elxl/elxl.c b/usr/src/uts/common/io/elxl/elxl.c index 2ffe96aff3..42552225f8 100644 --- a/usr/src/uts/common/io/elxl/elxl.c +++ b/usr/src/uts/common/io/elxl/elxl.c @@ -1,6 +1,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
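As a brief illustration of how these macros compose, the following hedged sketch shows a structure embedding an mlist_t plus enqueue and drain helpers; the drsas_node names are invented for illustration only, and the sketch assumes the usual kernel definition of offsetof() that the header itself relies on.

struct drsas_node {
	int			dn_id;
	struct mlist_head	dn_link;	/* linkage into the list below */
};

static struct mlist_head drsas_node_list = LIST_HEAD_INIT(drsas_node_list);

/* Append a node at the tail; mlist_add_tail() yields FIFO ordering. */
static void
drsas_node_enqueue(struct drsas_node *np)
{
	mlist_add_tail(&np->dn_link, &drsas_node_list);
}

/*
 * Walk and empty the list. The _safe variant caches the next pointer,
 * so entries may be unlinked while iterating.
 */
static void
drsas_node_drain(void (*cb)(struct drsas_node *))
{
	struct mlist_head *pos, *n;

	mlist_for_each_safe(pos, n, &drsas_node_list) {
		struct drsas_node *np =
		    mlist_entry(pos, struct drsas_node, dn_link);

		mlist_del_init(&np->dn_link);
		cb(np);
	}
}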
*/ /* @@ -1163,8 +1164,7 @@ elxl_m_tx(void *arg, mblk_t *mp) cflags = 0; if ((sc->ex_conf & CONF_90XB) != 0) { uint32_t pflags; - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, - &pflags); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags); if (pflags & HCK_IPV4_HDRCKSUM) { cflags |= EX_DPD_IPCKSUM; } @@ -1327,7 +1327,7 @@ elxl_recv(elxl_t *sc, ex_desc_t *rxd, uint32_t stat) if (stat & (EX_UPD_TCPCHECKED | EX_UPD_UDPCHECKED)) { pflags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); } - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, pflags, 0); + mac_hcksum_set(mp, 0, 0, 0, 0, pflags); } return (mp); diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c index 9b0840aa8b..e26cdfc78f 100644 --- a/usr/src/uts/common/io/eventfd.c +++ b/usr/src/uts/common/io/eventfd.c @@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) * transitions from EVENTFD_VALMAX to a lower value. At all other * times, it is already considered writable by poll. */ - if (oval == EVENTFD_VALMAX) { + if (oval >= EVENTFD_VALMAX) { pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); } return (err); } -/*ARGSUSED*/ static int -eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async, + boolean_t file_nonblock) { - eventfd_state_t *state; - minor_t minor = getminor(dev); - uint64_t val, oval; - int err; - - if (uio->uio_resid < sizeof (val)) - return (EINVAL); - - if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) - return (err); - - if (val > EVENTFD_VALMAX) - return (EINVAL); - - state = ddi_get_soft_state(eventfd_softstate, minor); + uint64_t oval; + boolean_t overflow = B_FALSE; mutex_enter(&state->efd_lock); while (val > EVENTFD_VALMAX - state->efd_value) { - if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + + /* + * When called from (LX) AIO, expectations about overflow and + * blocking are different than normal operation. If the + * incoming value would cause overflow, it is clamped to reach + * the overflow value exactly. This is added to the existing + * value without blocking. Any pollers of the eventfd will see + * POLLERR asserted when this occurs. + */ + if (is_async) { + val = EVENTFD_VALOVERFLOW - state->efd_value; + overflow = B_TRUE; + break; + } + + if (file_nonblock) { mutex_exit(&state->efd_lock); return (EAGAIN); } @@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) } /* - * We now know that we can add the value without overflowing. + * We now know that we can safely add the value. */ state->efd_value = (oval = state->efd_value) + val; @@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) mutex_exit(&state->efd_lock); /* - * Notify pollers as well if the eventfd is now readable. + * Notify pollers as well if the eventfd has become readable or has + * transitioned into overflow. 
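The value semantics of eventfd_post() above can be summarized with a small user-level sketch. SKETCH_VALMAX and SKETCH_VALOVERFLOW are assumed stand-ins for the driver's EVENTFD_VALMAX and EVENTFD_VALOVERFLOW, and the helper only models the clamp-to-overflow path taken for async (AIO) posts versus the would-block answer given to ordinary writers; it is not the driver's implementation.

#include <stdint.h>
#include <stdbool.h>

#define	SKETCH_VALMAX		(UINT64_MAX - 1)	/* assumed stand-in */
#define	SKETCH_VALOVERFLOW	UINT64_MAX		/* assumed stand-in */

typedef struct sketch_state {
	uint64_t	value;		/* accumulated counter */
	bool		overflowed;	/* pollers should see POLLERR */
} sketch_state_t;

/*
 * Apply a post of 'val'. Async posts never block: if the addition would
 * overflow, the value is clamped so the counter lands exactly on the
 * overflow sentinel. Synchronous posts simply report "would block".
 */
static int
sketch_post(sketch_state_t *sp, uint64_t val, bool is_async)
{
	if (val > SKETCH_VALMAX - sp->value) {
		if (!is_async)
			return (-1);	/* caller blocks or returns EAGAIN */
		val = SKETCH_VALOVERFLOW - sp->value;
		sp->overflowed = true;
	}
	sp->value += val;
	return (0);
}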
*/ if (oval == 0) { pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); + } else if (overflow && val != 0) { + pollwakeup(&state->efd_pollhd, POLLERR); } return (0); @@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) /*ARGSUSED*/ static int +eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +{ + eventfd_state_t *state; + boolean_t file_nonblock; + uint64_t val; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) + return (err); + + if (val > EVENTFD_VALMAX) + return (EINVAL); + + file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0; + state = ddi_get_soft_state(eventfd_softstate, getminor(dev)); + return (eventfd_post(state, val, B_FALSE, file_nonblock)); +} + +/*ARGSUSED*/ +static int eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { @@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, if (state->efd_value < EVENTFD_VALMAX) revents |= POLLWRNORM | POLLOUT; + if (state->efd_value == EVENTFD_VALOVERFLOW) + revents |= POLLERR; + *reventsp = revents & events; if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { *phpp = &state->efd_pollhd; @@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { eventfd_state_t *state; minor_t minor = getminor(dev); + uint64_t *valp; state = ddi_get_soft_state(eventfd_softstate, minor); switch (cmd) { - case EVENTFDIOC_SEMAPHORE: { + case EVENTFDIOC_SEMAPHORE: mutex_enter(&state->efd_lock); state->efd_semaphore ^= 1; mutex_exit(&state->efd_lock); + return (0); + case EVENTFDIOC_POST: + /* + * This ioctl is expected to be kernel-internal, used only by + * the AIO emulation in LX. + */ + if ((md & FKIOCTL) == 0) { + break; + } + valp = (uint64_t *)arg; + VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0); return (0); - } default: break; diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c index 4c2a39013a..eb2a0c2ec5 100644 --- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c +++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c @@ -24,6 +24,7 @@ */ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ /* * Fibre channel Transport Library (fctl) @@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count) maxPorts ++; } + if (maxPorts == 0) { + mutex_exit(&fctl_port_lock); + return (0); + } + /* Now allocate a buffer to store all the pointers for comparisons */ portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP); diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c index c6c6b65900..5502ea54af 100644 --- a/usr/src/uts/common/io/gld.c +++ b/usr/src/uts/common/io/gld.c @@ -22,6 +22,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. 
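Continuing the eventfd example, the poll state above is derived purely from the counter value; a minimal sketch, again using assumed stand-ins for EVENTFD_VALMAX and EVENTFD_VALOVERFLOW rather than the driver's real constants:

#include <poll.h>
#include <stdint.h>

#define	SKETCH_VALMAX		(UINT64_MAX - 1)	/* assumed stand-in */
#define	SKETCH_VALOVERFLOW	UINT64_MAX		/* assumed stand-in */

/* Mirrors the revents computation in eventfd_poll() above. */
static short
sketch_revents(uint64_t value)
{
	short revents = 0;

	if (value > 0)
		revents |= POLLRDNORM | POLLIN;		/* read won't block */
	if (value < SKETCH_VALMAX)
		revents |= POLLWRNORM | POLLOUT;	/* write can be absorbed */
	if (value == SKETCH_VALOVERFLOW)
		revents |= POLLERR;			/* async post overflowed */
	return (revents);
}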
*/ /* @@ -4550,8 +4551,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) ifp = ((gld_mac_pvt_t *)macinfo->gldm_mac_pvt)->interfacep; /* grab any checksum information that may be present */ - hcksum_retrieve(mp->b_cont, NULL, NULL, &start, &stuff, &end, - &value, &flags); + mac_hcksum_get(mp->b_cont, &start, &stuff, &end, &value, &flags); /* * Prepend a valid header for transmission @@ -4567,8 +4567,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) } /* apply any checksum information to the first block in the chain */ - (void) hcksum_assoc(nmp, NULL, NULL, start, stuff, end, value, - flags, 0); + mac_hcksum_set(nmp, start, stuff, end, value, flags); GLD_CLEAR_MBLK_VTAG(nmp); if (gld_start(q, nmp, GLD_WSRV, upri) == GLD_NORESOURCES) { diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c new file mode 100644 index 0000000000..03bb799499 --- /dev/null +++ b/usr/src/uts/common/io/gsqueue/gsqueue.c @@ -0,0 +1,608 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Serialization queues are a technique used in illumos to provide what's + * commonly known as a 'vertical' perimeter. The idea (described a bit in + * uts/common/inet/squeue.c) is to provide a means to make sure that message + * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd + * consume these on different policies, ip on a conn_t basis, vnd on a per + * device basis, and use this to ensure that only one packet is being processed + * at a given time. + * + * Serialization queues were originally used by ip. As part of that + * implementation, many of the details of ip were baked into it. That includes + * things like conn_t, ip receive attributes, and the notion of sets. While an + * individual serialization queue, or gsqueue_t, is a useful level of + * abstraction, it isn't the basis on which most consumers want to manage them. + * Instead, we have the notion of a set of serialization queues. These sets are + * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a + * gsqueue_t per CPU to fanout on without managing them all themselves. In the + * original implementation, this existed, but they were heavily tied into the + * infrastructure of IP, and its notion of polling on the underlying MAC + * devices. + * + * The result of that past is a new interface to serialization queues and a + * similar, but slightly different, abstraction to sets of these + * (gsqueue_set_t). When designing this there are two different approaches that + * one could consider. The first is that the system has one gsqueue_set_t that + * the entire world shares, whether IP or some other consumer. The other is that + * every consumer has their own set. + * + * The trade-offs between these two approaches lie in their pathological failure + * modes. There is no guarantee that any two consumers here are equivalent. In + * fact, they very likely have very different latency profiles. If they are + * being processed in the same queue, that can lead to very odd behaviors.
More + * generally, if we have a series of processing functions from one consumer + * which are generally short, and another which are generally long, that'll + * cause undue latency that's harder to observe. If we instead take the approach + * that each consumer should have its own set that it fans out over then we + * won't end up with the problem that a given serialization queue will have + * multiple latency profiles, but instead we'll see cpu contention for the bound + * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker + * thread is bound and it is in fact possible for it to be processed by other + * threads on other CPUs. + * + * We've opted to go down the second path, so each consumer has its own + * independent set of serialization queues that it is bound over. + * + * Structure Hierarchies + * --------------------- + * + * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t + * encapsulates all the per-CPU gsqueue_t that exist in the form of + * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could + * accommodate more than one gsqueue_t, but today there is a one to one mapping. + * + * We maintain two different lists of gsqueue_cpu_t, the active and defunct + * sets. The active set is maintained in the array `gs_cpus`. There are NCPU + * entries available in `gs_cpus` with the total number of currently active cpus + * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When + * there is no longer a need for a given binding (see the following section for + * more explanation on when this is the case) then we move the entry to the + * `gs_defunct` list which is just a list_t of gsqueue_cpu_t. + * + * In addition, each gsqueue_set_t can have a series of callbacks registered + * with it. These are described in the following section. Graphically, a given + * gsqueue_set_t looks roughly like the following: + * + * +---------------+ + * | gsqueue_set_t | + * +---------------+ + * | | | + * | | * . . . gs_cpus + * | | | + * | | | +-------------------------------------------------+ + * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |... + * | | +-------------------------------------------------+ + * | | + * | * . . . gs_defunct + * | | + * | | +---------------+ +---------------+ +---------------+ + * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |... + * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... + * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. 
When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uinterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. + * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback. + * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + list_node_t gqc_lnode; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + list_t gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + list_link_init(&scp->gqc_lnode); + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(pri_t wpri) +{ + int i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits. 
+ */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. + */ + mutex_enter(&cpu_lock); + mutex_enter(&gsqueue_lock); + list_remove(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + + mutex_enter(&gssp->gs_lock); + + for (i = 0; i < gssp->gs_ncpus; i++) { + scp = gssp->gs_cpus[i]; + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = NULL; + } + gssp->gs_ncpus = 0; + + mutex_exit(&gssp->gs_lock); + mutex_exit(&cpu_lock); + + while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) { + gsqueue_cpu_destroy(scp); + } + + while (gssp->gs_cbs != NULL) { + gsqueue_cb_t *cbp; + + cbp = gssp->gs_cbs; + gssp->gs_cbs = cbp->gcb_next; + kmem_cache_free(gsqueue_cb_cache, cbp); + } + + ASSERT3U(gssp->gs_ncpus, ==, 0); + ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL); + ASSERT3P(gssp->gs_cbs, ==, NULL); + kmem_cache_free(gsqueue_set_cache, gssp); +} + +gsqueue_t * +gsqueue_set_get(gsqueue_set_t *gssp, uint_t index) +{ + squeue_t *sqp; + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_cpus[index % gssp->gs_ncpus]; + sqp = scp->gqc_head; + mutex_exit(&gssp->gs_lock); + return ((gsqueue_t *)sqp); +} + +uintptr_t +gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg) +{ + gsqueue_cb_t *cbp; + + cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP); + cbp->gcb_func = cb; + cbp->gcb_arg = arg; + + mutex_enter(&gssp->gs_lock); + cbp->gcb_next = gssp->gs_cbs; + gssp->gs_cbs = cbp; + mutex_exit(&gssp->gs_lock); + return ((uintptr_t)cbp); +} + +int +gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id) +{ + gsqueue_cb_t *cbp, *prev; + mutex_enter(&gssp->gs_lock); + cbp = gssp->gs_cbs; + prev = NULL; + while (cbp != NULL) { + if ((uintptr_t)cbp != id) { + prev = cbp; + cbp = cbp->gcb_next; + continue; + } + + if (prev == NULL) { + gssp->gs_cbs = cbp->gcb_next; + } else { + prev->gcb_next = cbp->gcb_next; + } + + mutex_exit(&gssp->gs_lock); + kmem_cache_free(gsqueue_cb_cache, cbp); + return (0); + } + mutex_exit(&gssp->gs_lock); + return (-1); +} + +void +gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg, + int flags, uint8_t tag) +{ + squeue_t *sqp = (squeue_t *)gsp; + + ASSERT(mp->b_next == NULL); + ASSERT(mp->b_prev == NULL); + mp->b_queue = (queue_t *)func; + mp->b_prev = arg; + sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag); +} + +static void +gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online) +{ + gsqueue_cb_t *cbp; + + ASSERT(MUTEX_HELD(&gssp->gs_lock)); + cbp = gssp->gs_cbs; + while (cbp != NULL) { + cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online); + cbp = cbp->gcb_next; + } + +} + +/* + * When we online a processor we need to go through and either bind a defunct + * squeue or create a new one. 
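Putting the pieces above together, here is a hedged consumer-side sketch: create a set, register a callback that merely records that the topology changed (since calling back into gsqueue from a callback is illegal), and fan incoming packets out by a flow hash. All my_* names are invented, and the gsqueue_proc_f signature and GSQUEUE_FILL flag are assumed to match what <sys/gsqueue.h> exposes to existing consumers such as vnd.

typedef struct my_consumer {
	gsqueue_set_t	*mc_set;
	volatile uint_t	mc_topo_changed;	/* examined asynchronously */
} my_consumer_t;

/*
 * Topology callback: only take note of the change; act on it later,
 * outside of the callback, per the rules above.
 */
static void
my_topo_cb(gsqueue_set_t *gssp, gsqueue_t *gsp, void *arg, boolean_t online)
{
	my_consumer_t *mc = arg;

	mc->mc_topo_changed = 1;
}

/* Packet handler; runs with the queue's serialization guarantee. */
static void
my_proc(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
{
	freemsg(mp);	/* only one mblk for this flow is in here at a time */
}

static void
my_consumer_init(my_consumer_t *mc, pri_t pri)
{
	mc->mc_set = gsqueue_set_create(pri);
	(void) gsqueue_set_cb_add(mc->mc_set, my_topo_cb, mc);
}

static void
my_consumer_rx(my_consumer_t *mc, mblk_t *mp, uint_t flowhash)
{
	gsqueue_t *gsp = gsqueue_set_get(mc->mc_set, flowhash);

	gsqueue_enter_one(gsp, mp, my_proc, mc, GSQUEUE_FILL, 0);
}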
We'll try to reuse a gsqueue_cpu_t from the + * defunct list that used to be on that processor. If no such gsqueue_cpu_t + * exists, then we'll create a new one. We'd rather avoid taking over an + * existing defunct one that used to be on another CPU, as its not unreasonable + * to believe that its CPU will come back. More CPUs are offlined and onlined by + * the administrator or by creating cpu sets than actually get offlined by FMA. + */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + for (scp = list_head(&gssp->gs_defunct); scp != NULL; + scp = list_next(&gssp->gs_defunct, scp)) { + if (scp->gqc_cpuid == id) { + list_remove(&gssp->gs_defunct, scp); + break; + } + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + gsqueue_handle_offline(cp->cpu_id); + break; + default: + break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t), + offsetof(gsqueue_cpu_t, gqc_lnode)); + gssp->gs_ncpus = 0; + gssp->gs_cbs = NULL; + + return (0); +} + +/* ARGSUSED */ +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + list_destroy(&gssp->gs_defunct); + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + 
gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); + mutex_exit(&gsqueue_lock); + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + kmem_cache_destroy(gsqueue_set_cache); + kmem_cache_destroy(gsqueue_cpu_cache); + kmem_cache_destroy(gsqueue_cb_cache); + + mutex_destroy(&gsqueue_lock); + + return (0); +} + +static struct modlmisc gsqueue_modmisc = { + &mod_miscops, + "gsqueue" +}; + +static struct modlinkage gsqueue_modlinkage = { + MODREV_1, + &gsqueue_modmisc, + NULL +}; + +int +_init(void) +{ + int ret; + + gsqueue_ddiinit(); + if ((ret = mod_install(&gsqueue_modlinkage)) != 0) { + VERIFY(gsqueue_ddifini() == 0); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&gsqueue_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = gsqueue_ddifini()) != 0) + return (ret); + + if ((ret = mod_remove(&gsqueue_modlinkage)) != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index c3ebfa0e47..6726f72147 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c index d34057d64f..ccf814be0b 100644 --- a/usr/src/uts/common/io/i40e/i40e_gld.c +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -39,7 +39,8 @@ char *i40e_priv_props[] = { static int i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; struct i40e_aqc_remove_macvlan_element_data filt; struct i40e_hw *hw = &i40e->i40e_hw_space; int ret, i, last; @@ -107,10 +108,11 @@ done: static int i40e_group_add_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; - struct i40e_hw *hw = &i40e->i40e_hw_space; - int i, ret; - i40e_uaddr_t *iua; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; struct i40e_aqc_add_macvlan_element_data filt; if (I40E_IS_MULTICAST(mac_addr)) @@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) } } - /* - * Note, the general use of the i40e_vsi_id will have to be refactored - * when we have proper group support. 
- */ bzero(&filt, sizeof (filt)); bcopy(mac_addr, filt.mac_addr, ETHERADDRL); filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", @@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; bcopy(mac_addr, iua->iua_mac, ETHERADDRL); - iua->iua_vsi = i40e->i40e_vsi_id; + iua->iua_vsi = rxg->irg_vsi_seid; i40e->i40e_resources.ifr_nmacfilt_used++; ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= i40e->i40e_resources.ifr_nmacfilt); @@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on) } - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " @@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on) goto done; } - ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s multicast promiscuity on " @@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on) * Try our best to put us back into a state that MAC expects us * to be in. */ - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, - !on, NULL, B_FALSE); + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, + I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " "the default VSI after toggling multicast failed: " @@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 0 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_TRUE, NULL); + I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to enable multicast " "promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", @@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; - if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, - &filt, 1, NULL) != I40E_SUCCESS) { + if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, + 1, NULL) != I40E_SUCCESS) { i40e_error(i40e, "failed to remove mac address " "%2x:%2x:%2x:%2x:%2x:%2x from multicast " "filter: %d", @@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 1 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_FALSE, NULL); + I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL); if (ret != 
I40E_SUCCESS) { i40e_error(i40e, "failed to disable " "multicast promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, * we're not actually grouping things tx-wise at this time. */ ASSERT(group_index == -1); - ASSERT(ring_index < i40e->i40e_num_trqpairs); + ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi); itrq->itrq_mactxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, { i40e_t *i40e = arg; mac_intr_t *mintr = &infop->mri_intr; - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + uint_t trqpair_index; + i40e_trqpair_t *itrq; - /* - * We assert the group number and ring index to help sanity check - * ourselves and mark that we'll need to rework this when we have - * multiple groups. - */ - ASSERT3S(group_index, ==, 0); - ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + /* This assumes static groups. */ + ASSERT3S(group_index, >=, 0); + ASSERT3S(ring_index, >=, 0); + trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) + + ring_index; + ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[trqpair_index]; itrq->itrq_macrxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { i40e_t *i40e = arg; + i40e_rx_group_t *rxg; if (rtype != MAC_RING_TYPE_RX) return; - /* - * Note, this is a simplified view of a group, given that we only have a - * single group and a single ring at the moment. We'll want to expand - * upon this as we leverage more hardware functionality. - */ - i40e->i40e_rx_group_handle = gh; - infop->mgi_driver = (mac_group_driver_t)i40e; + rxg = &i40e->i40e_rx_groups[index]; + rxg->irg_grp_hdl = gh; + + infop->mgi_driver = (mac_group_driver_t)rxg; infop->mgi_start = NULL; infop->mgi_stop = NULL; infop->mgi_addmac = i40e_group_add_mac; infop->mgi_remmac = i40e_group_remove_mac; - ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); - infop->mgi_count = i40e->i40e_num_trqpairs; + ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi; } static int @@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (i40e->i40e_tx_lso_enable == B_TRUE) { + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN; + } else { + return (B_FALSE); + } + break; + } + case MAC_CAPAB_RINGS: cap_rings = cap_data; cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; switch (cap_rings->mr_type) { case MAC_RING_TYPE_TX: /* - * Note, saying we have no rings, but some number of - * groups indicates to MAC that it should create - * psuedo-groups with one for each TX ring. This may not - * be the long term behavior we want, but it'll work for - * now. + * Note, saying we have no groups, but some + * number of rings indicates to MAC that it + * should create psuedo-groups with one for + * each TX ring. This may not be the long term + * behavior we want, but it'll work for now. 
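A small sketch of the arithmetic used by i40e_fill_rx_ring() above to translate a (group, ring) pair handed down by MAC into a transmit/receive queue pair index; the helper name is invented and the sample numbers are only illustrative.

#include <sys/types.h>

/*
 * Mirror of the mapping in i40e_fill_rx_ring(): groups are laid out
 * back to back, each owning i40e_num_trqpairs_per_vsi consecutive
 * queue pairs.
 */
static uint_t
example_rx_trqpair_index(uint_t group_index, uint_t ring_index,
    uint_t trqpairs_per_vsi)
{
	return ((group_index * trqpairs_per_vsi) + ring_index);
}

/*
 * With 8 queue pairs per VSI (8 rings advertised per Rx group):
 *   group 0, ring 0 -> trqpair 0
 *   group 0, ring 7 -> trqpair 7
 *   group 2, ring 3 -> trqpair 19
 */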
*/ cap_rings->mr_gnum = 0; - cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi; cap_rings->mr_rget = i40e_fill_tx_ring; cap_rings->mr_gget = NULL; cap_rings->mr_gaddring = NULL; @@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_RING_TYPE_RX: cap_rings->mr_rnum = i40e->i40e_num_trqpairs; cap_rings->mr_rget = i40e_fill_rx_ring; - cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gnum = i40e->i40e_num_rx_groups; cap_rings->mr_gget = i40e_fill_rx_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c index 51d1bbac92..170bef7ec6 100644 --- a/usr/src/uts/common/io/i40e/i40e_intr.c +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e) I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); } +/* + * The next two functions enable/disable the reception of interrupts + * on the given vector. Only vectors 1..N are programmed by these + * functions; vector 0 is special and handled by a different register. + * We must subtract one from the vector because i40e implicitly adds + * one to the vector value. See section 10.2.2.10.13 for more details. + */ static void i40e_intr_io_enable(i40e_t *i40e, int vector) { uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector) uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); } @@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e) } /* - * Enable all of the queues and set the corresponding LNKLSTN registers. Note - * that we always enable queues as interrupt sources, even though we don't - * enable the MSI-X interrupt vectors. + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N] + * register actually refers to the 'N + 1' interrupt vector. E.g., + * PFINT_LNKLSTN[0] refers to interrupt vector 1. + */ +static void +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg); + DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg); +} + +/* + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the + * vector should be the actual vector this queue is on -- i.e., it + * should be equal to itrq_rx_intrvec. 
+ */ +static void +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec); + + reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); + DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is + * either the Rx queue of another TRQP, or EOL. + */ +static void +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec); + + reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg); + DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Program the interrupt linked list. Each vector has a linked list of + * queues which act as event sources for that vector. When one of + * those sources has an event the associated interrupt vector is + * fired. This mapping must match the mapping found in + * i40e_map_intrs_to_vectors(). + * + * See section 7.5.3 for more information about the configuration of + * the interrupt linked list. */ static void i40e_intr_init_queue_msix(i40e_t *i40e) { - i40e_hw_t *hw = &i40e->i40e_hw_space; - uint32_t reg; - int i; + uint_t intr_count; /* - * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1. - * Note that we skip the ITR logic for the moment, just to make our - * lives as explicit and simple as possible. + * The 0th vector is for 'Other Interrupts' only (subject to + * change in the future). */ - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + intr_count = i40e->i40e_intr_count - 1; - reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << - I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); - I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg); + for (uint_t vec = 0; vec < intr_count; vec++) { + boolean_t head = B_TRUE; - reg = - (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_RQCTL_CAUSE_ENA_MASK; + for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs; + qidx += intr_count) { + uint_t next_qidx = qidx + intr_count; - I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg); + next_qidx = (next_qidx > i40e->i40e_num_trqpairs) ? 
+ I40E_QUEUE_TYPE_EOL : next_qidx; - reg = - (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_TQCTL_CAUSE_ENA_MASK; + if (head) { + i40e_set_lnklstn(i40e, vec, qidx); + head = B_FALSE; + } - I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg); + i40e_set_rqctl(i40e, vec + 1, qidx); + i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx); + } } - } /* @@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e) } static void -i40e_intr_rx_work(i40e_t *i40e, int queue) +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { mblk_t *mp = NULL; - i40e_trqpair_t *itrq; - - ASSERT(queue < i40e->i40e_num_trqpairs); - itrq = &i40e->i40e_trqpairs[queue]; mutex_enter(&itrq->itrq_rx_lock); if (!itrq->itrq_intr_poll) mp = i40e_ring_rx(itrq, I40E_POLL_NULL); mutex_exit(&itrq->itrq_rx_lock); - if (mp != NULL) { - mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, - itrq->itrq_rxgen); - } + if (mp == NULL) + return; + + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); } +/* ARGSUSED */ static void -i40e_intr_tx_work(i40e_t *i40e, int queue) +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { - i40e_trqpair_t *itrq; - - itrq = &i40e->i40e_trqpairs[queue]; i40e_tx_recycle_ring(itrq); } @@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e) i40e_intr_adminq_enable(i40e); } +/* + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of + * the MSI-X interrupt sequence. + */ uint_t i40e_intr_msix(void *arg1, void *arg2) { i40e_t *i40e = (i40e_t *)arg1; - int vector_idx = (int)(uintptr_t)arg2; + uint_t vector_idx = (uint_t)(uintptr_t)arg2; + + ASSERT3U(vector_idx, <, i40e->i40e_intr_count); /* * When using MSI-X interrupts, vector 0 is always reserved for the @@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2) return (DDI_INTR_CLAIMED); } - i40e_intr_rx_work(i40e, vector_idx - 1); - i40e_intr_tx_work(i40e, vector_idx - 1); - i40e_intr_io_enable(i40e, vector_idx); + ASSERT3U(vector_idx, >, 0); + /* + * We determine the queue indexes via simple arithmetic (as + * opposed to keeping explicit state like a bitmap). While + * conveinent, it does mean that i40e_map_intrs_to_vectors(), + * i40e_intr_init_queue_msix(), and this function must be + * modified as a unit. + * + * We subtract 1 from the vector to offset the addition we + * performed during i40e_map_intrs_to_vectors(). 
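The queue-to-vector striping described above can be seen in isolation with a short sketch: vector 0 is reserved for the admin queue, and each remaining vector services every (intr_count - 1)th queue pair, matching i40e_map_intrs_to_vectors() and the loop in i40e_intr_msix(). The function name and sample numbers are illustrative only.

#include <sys/types.h>

/*
 * Return the MSI-X vector servicing a given queue pair, mirroring
 * i40e_map_intrs_to_vectors(): queue i maps to vector
 * (i % (intr_count - 1)) + 1, since vector 0 handles only the admin
 * queue and other causes.
 */
static uint_t
example_queue_vector(uint_t qidx, uint_t intr_count)
{
	return ((qidx % (intr_count - 1)) + 1);
}

/*
 * Example with 16 queue pairs and 9 interrupts (8 I/O vectors):
 *   vector 1 walks queues 0 and 8
 *   vector 2 walks queues 1 and 9
 *   ...
 *   vector 8 walks queues 7 and 15
 * which is exactly the stride used by the loop in i40e_intr_msix().
 */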
+ */ + for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs; + i += (i40e->i40e_intr_count - 1)) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + ASSERT3U(i, <, i40e->i40e_num_trqpairs); + ASSERT3P(itrq, !=, NULL); + i40e_intr_rx_work(i40e, itrq); + i40e_intr_tx_work(i40e, itrq); + } + + i40e_intr_io_enable(i40e, vector_idx); return (DDI_INTR_CLAIMED); } @@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) { i40e_hw_t *hw = &i40e->i40e_hw_space; uint32_t reg; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0]; int ret = DDI_INTR_CLAIMED; if (shared == B_TRUE) { @@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) i40e_intr_adminq_work(i40e); if (reg & I40E_INTR_NOTX_RX_MASK) - i40e_intr_rx_work(i40e, 0); + i40e_intr_rx_work(i40e, itrq); if (reg & I40E_INTR_NOTX_TX_MASK) - i40e_intr_tx_work(i40e, 0); + i40e_intr_tx_work(i40e, itrq); done: i40e_intr_adminq_enable(i40e); diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c index 54aef43424..20f74d4e95 100644 --- a/usr/src/uts/common/io/i40e/i40e_main.c +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -188,14 +188,15 @@ * VSI Management * -------------- * - * At this time, we currently only support a single MAC group, and thus a single - * VSI. This VSI is considered the default VSI and should be the only one that - * exists after a reset. Currently it is stored as the member - * i40e_t`i40e_vsi_id. While this works for the moment and for an initial - * driver, it's not sufficient for the longer-term path of the driver. Instead, - * we'll want to actually have a unique i40e_vsi_t structure which is used - * everywhere. Note that this means that every place that uses the - * i40e_t`i40e_vsi_id will need to be refactored. + * The PFs share 384 VSIs. The firmware creates one VSI per PF by default. + * During chip start we retrieve the SEID of this VSI and assign it as the + * default VSI for our VEB (one VEB per PF). We then add additional VSIs to + * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups. + * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can + * allocate the same number of VSIs, and b) to keep the interrupt multiplexing + * under control. In the future, when we improve the interrupt allocation, we + * may want to revisit this cap to make better use of the available VSIs. The + * VSI allocation and configuration can be found in i40e_chip_start(). 
* * ---------------- * Structure Layout @@ -240,7 +241,7 @@ * | i40e_hw_t --+---> Intel common code structure * | mac_handle_t --+---> GLDv3 handle to MAC * | ddi_periodic_t --+---> Link activity timer - * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_vsi_t * --+---> Array of VSIs * | i40e_func_rsrc_t --+---> Available hardware resources * | i40e_switch_rsrc_t * --+---> Switch resource snapshot * | i40e_sdu --+---> Current MTU @@ -249,11 +250,10 @@ * | i40e_maddr_t * --+---> Array of assigned multicast MACs * | i40e_mcast_promisccount --+---> Active multicast state * | i40e_promisc_on --+---> Current promiscuous mode state - * | int --+---> Number of transmit/receive pairs + * | uint_t --+---> Number of transmit/receive pairs + * | i40e_rx_group_t * --+---> Array of Rx groups * | kstat_t * --+---> PF kstats - * | kstat_t * --+---> VSI kstats * | i40e_pf_stats_t --+---> PF kstat backing data - * | i40e_vsi_stats_t --+---> VSI kstat backing data * | i40e_trqpair_t * --+---------+ * +---------------------------+ | * | @@ -359,8 +359,6 @@ * While bugs have been filed to cover this future work, the following gives an * overview of expected work: * - * o TSO support - * o Multiple group support * o DMA binding and breaking up the locking in ring recycling. * o Enhanced detection of device errors * o Participation in IRM @@ -371,7 +369,7 @@ #include "i40e_sw.h" -static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1"; +static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.2"; /* * The i40e_glock primarily protects the lists below and the i40e_device_t @@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail) } /* - * Here we're trying to get the ID of the default VSI. In general, when we come - * through and look at this shortly after attach, we expect there to only be a - * single element present, which is the default VSI. Importantly, each PF seems - * to not see any other devices, in part because of the simple switch mode that - * we're using. If for some reason, we see more artifact, we'll need to revisit - * what we're doing here. + * Here we're trying to set the SEID of the default VSI. In general, + * when we come through and look at this shortly after attach, we + * expect there to only be a single element present, which is the + * default VSI. Importantly, each PF seems to not see any other + * devices, in part because of the simple switch mode that we're + * using. If for some reason, we see more artifacts, we'll need to + * revisit what we're doing here. */ -static int -i40e_get_vsi_id(i40e_t *i40e) +static boolean_t +i40e_set_def_vsi_seid(i40e_t *i40e) { i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_aqc_get_switch_config_resp *sw_config; @@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e) if (rc != I40E_SUCCESS) { i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", rc, hw->aq.asq_last_status); - return (-1); + return (B_FALSE); } if (LE_16(sw_config->header.num_reported) != 1) { i40e_error(i40e, "encountered multiple (%d) switching units " "during attach, not proceeding", LE_16(sw_config->header.num_reported)); + return (B_FALSE); + } + + I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid; + return (B_TRUE); +} + +/* + * Get the SEID of the uplink MAC. 
+ */ +static int +i40e_get_mac_seid(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", + rc, hw->aq.asq_last_status); return (-1); } - return (sw_config->element[0].seid); + return (LE_16(sw_config->element[0].uplink_seid)); } /* @@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e) static void i40e_free_trqpairs(i40e_t *i40e) { - int i; i40e_trqpair_t *itrq; + if (i40e->i40e_rx_groups != NULL) { + kmem_free(i40e->i40e_rx_groups, + sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups); + i40e->i40e_rx_groups = NULL; + } + if (i40e->i40e_trqpairs != NULL) { - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { itrq = &i40e->i40e_trqpairs[i]; mutex_destroy(&itrq->itrq_rx_lock); mutex_destroy(&itrq->itrq_tx_lock); @@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e) static boolean_t i40e_alloc_trqpairs(i40e_t *i40e) { - int i; void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); /* @@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e) i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs, KM_SLEEP); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; itrq->itrq_i40e = i40e; @@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e) itrq->itrq_index = i; } + i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) * + i40e->i40e_num_rx_groups, KM_SLEEP); + + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i]; + + rxg->irg_index = i; + rxg->irg_i40e = i40e; + } + return (B_TRUE); } @@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e) /* * Unless a .conf file already overrode i40e_t structure values, they will * be 0, and need to be set in conjunction with the now-available HW report. - * - * However, at the moment, we cap all of these resources as we only support a - * single receive ring and a single group. */ /* ARGSUSED */ static void i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) { - if (i40e->i40e_num_trqpairs == 0) { - i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + if (i40e->i40e_num_trqpairs_per_vsi == 0) { + if (i40e_is_x722(i40e)) { + i40e->i40e_num_trqpairs_per_vsi = + I40E_722_MAX_TC_QUEUES; + } else { + i40e->i40e_num_trqpairs_per_vsi = + I40E_710_MAX_TC_QUEUES; + } } if (i40e->i40e_num_rx_groups == 0) { @@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw) } /* - * We need to obtain the Virtual Station ID (VSI) before we can - * perform other operations on the device. + * We need to obtain the Default Virtual Station SEID (VSI) + * before we can perform other operations on the device. 
*/ - i40e->i40e_vsi_id = i40e_get_vsi_id(i40e); - if (i40e->i40e_vsi_id == -1) { - i40e_error(i40e, "failed to obtain VSI ID"); + if (!i40e_set_def_vsi_seid(i40e)) { + i40e_error(i40e, "failed to obtain Default VSI SEID"); return (B_FALSE); } @@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e) i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable", + B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); @@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) } i40e->i40e_intr_type = 0; + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + /* + * We need to determine the number of queue pairs per traffic + * class. We only have one traffic class (TC0), so we'll base + * this off the number of interrupts provided. Furthermore, + * since we only use one traffic class, the number of queues + * per traffic class and per VSI are the same. + */ if ((intr_types & DDI_INTR_TYPE_MSIX) && - i40e->i40e_intr_force <= I40E_INTR_MSIX) { - if (i40e_alloc_intr_handles(i40e, devinfo, - DDI_INTR_TYPE_MSIX)) { - i40e->i40e_num_trqpairs = - MIN(i40e->i40e_intr_count - 1, max_trqpairs); - return (B_TRUE); - } + (i40e->i40e_intr_force <= I40E_INTR_MSIX) && + (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) { + uint32_t n; + + /* + * While we want the number of queue pairs to match + * the number of interrupts, we must keep stay in + * bounds of the maximum number of queues per traffic + * class. We subtract one from i40e_intr_count to + * account for interrupt zero; which is currently + * restricted to admin queue commands and other + * interrupt causes. + */ + n = MIN(i40e->i40e_intr_count - 1, max_trqpairs); + ASSERT3U(n, >, 0); + + /* + * Round up to the nearest power of two to ensure that + * the QBASE aligns with the TC size which must be + * programmed as a power of two. See the queue mapping + * description in section 7.4.9.5.5.1. + * + * If i40e_intr_count - 1 is not a power of two then + * some queue pairs on the same VSI will have to share + * an interrupt. + * + * We may want to revisit this logic in a future where + * we have more interrupts and more VSIs. Otherwise, + * each VSI will use as many interrupts as possible. + * Using more QPs per VSI means better RSS for each + * group, but at the same time may require more + * sharing of interrupts across VSIs. This may be a + * good candidate for a .conf tunable. + */ + n = 0x1 << ddi_fls(n); + i40e->i40e_num_trqpairs_per_vsi = n; + ASSERT3U(i40e->i40e_num_rx_groups, >, 0); + i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi * + i40e->i40e_num_rx_groups; + return (B_TRUE); } /* @@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) * single MSI interrupt. */ i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs; i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; if ((intr_types & DDI_INTR_TYPE_MSI) && @@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) static boolean_t i40e_map_intrs_to_vectors(i40e_t *i40e) { - int i; - if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { return (B_TRUE); } /* - * Each queue pair is mapped to a single interrupt, so transmit - * and receive interrupts for a given queue share the same vector. 
- * The number of queue pairs is one less than the number of interrupt - * vectors and is assigned the vector one higher than its index. - * Vector zero is reserved for the admin queue. + * Each queue pair is mapped to a single interrupt, so + * transmit and receive interrupts for a given queue share the + * same vector. Vector zero is reserved for the admin queue. */ - ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1); + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint_t vector = i % (i40e->i40e_intr_count - 1); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1; - i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1; + i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1; + i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1; } return (B_TRUE); @@ -1923,89 +2005,282 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) } /* - * Configure the hardware for the Virtual Station Interface (VSI). Currently - * we only support one, but in the future we could instantiate more than one - * per attach-point. + * Set the properties which have common values across all the VSIs. + * Consult the "Add VSI" command section (7.4.9.5.5.1) for a + * complete description of these properties. */ -static boolean_t -i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +static void +i40e_set_shared_vsi_props(i40e_t *i40e, + struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx) { - struct i40e_vsi_context context; - int err, tc_queues; + uint_t tc_queues; + uint16_t vsi_qp_base; - bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; - context.pf_num = hw->pf_id; - err = i40e_aq_get_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "get VSI params failed with %d", err); - return (B_FALSE); - } - - i40e->i40e_vsi_num = context.vsi_number; + /* + * It's important that we use bitwise-OR here; callers to this + * function might enable other sections before calling this + * function. + */ + info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID | + I40E_AQ_VSI_PROP_VLAN_VALID); /* - * Set the queue and traffic class bits. Keep it simple for now. + * Calculate the starting QP index for this VSI. This base is + * relative to the PF queue space; so a value of 0 for PF#1 + * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1. */ - context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; - context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; - context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi; + info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG); + info->queue_mapping[0] = + LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) & + I40E_AQ_VSI_QUEUE_MASK); /* - * tc_queues determines the size of the traffic class, where the size is - * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722. + * tc_queues determines the size of the traffic class, where + * the size is 2^^tc_queues to a maximum of 64 for the X710 + * and 128 for the X722. * * Some examples: - * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1. - * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16. - * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32. - * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64. + * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1. 
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16. + * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32. + * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64. */ - tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1); + tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1); - context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & - I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | - ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & - I40E_AQ_VSI_TC_QUE_NUMBER_MASK); + /* + * The TC queue mapping is in relation to the VSI queue space. + * Since we are only using one traffic class (TC0) we always + * start at queue offset 0. + */ + info->tc_mapping[0] = + LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & + I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | + ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & + I40E_AQ_VSI_TC_QUE_NUMBER_MASK)); - context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; - context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + /* + * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode") + * + * Allow tagged and untagged packets to be sent to this + * VSI from the host. + * + * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode") + * + * Leave the tag on the frame and place no VLAN + * information in the descriptor. We want this mode + * because our MAC layer will take care of the VLAN tag, + * if there is one. + */ + info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | I40E_AQ_VSI_PVLAN_EMOD_NOTHING; +} + +/* + * Delete the VSI at this index, if one exists. We assume there is no + * action we can take if this command fails but to log the failure. + */ +static void +i40e_delete_vsi(i40e_t *i40e, uint_t idx) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint16_t seid = i40e->i40e_vsis[idx].iv_seid; - context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); + if (seid != 0) { + int rc; - i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); - if (i40e_stat_vsi_init(i40e) == B_FALSE) - return (B_FALSE); + rc = i40e_aq_delete_element(hw, seid, NULL); - err = i40e_aq_update_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "Update VSI params failed with %d", err); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VSI %d: %d", + rc, hw->aq.asq_last_status); + } + + i40e->i40e_vsis[idx].iv_seid = 0; + } +} + +/* + * Add a new VSI. + */ +static boolean_t +i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx) +{ + struct i40e_vsi_context ctx; + i40e_rx_group_t *rxg; + int rc; + + /* + * The default VSI is created by the controller. This function + * creates new, non-defualt VSIs only. 
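The tc_queues examples listed in i40e_set_shared_vsi_props() above can be reproduced with a few lines of user-space C; ex_fls() is a portable stand-in for ddi_fls() (the 1-based index of the highest bit set, or 0 for a zero argument).

#include <stdio.h>

static unsigned
ex_fls(unsigned v)
{
	unsigned pos = 0;

	while (v != 0) {
		pos++;
		v >>= 1;
	}
	return (pos);
}

int
main(void)
{
	unsigned qps[] = { 1, 7, 8, 9, 17, 64 };

	for (unsigned i = 0; i < sizeof (qps) / sizeof (qps[0]); i++) {
		unsigned tc_queues = ex_fls(qps[i] - 1);

		printf("%2u queue pairs per VSI -> tc_queues = %u "
		    "(TC size = %u)\n", qps[i], tc_queues, 1U << tc_queues);
	}
	return (0);
}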
+ */ + ASSERT3U(idx, !=, 0); + + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.uplink_seid = i40e->i40e_veb_seid; + ctx.pf_num = hw->pf_id; + ctx.flags = I40E_AQ_VSI_TYPE_PF; + ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + i40e_set_shared_vsi_props(i40e, &ctx.info, idx); + + rc = i40e_aq_add_vsi(hw, &ctx, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); } + rxg = &i40e->i40e_rx_groups[idx]; + rxg->irg_vsi_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_number = ctx.vsi_number; + i40e->i40e_vsis[idx].iv_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + + if (i40e_stat_vsi_init(i40e, idx) == B_FALSE) + return (B_FALSE); return (B_TRUE); } /* - * Configure the RSS key. For the X710 controller family, this is set on a - * per-PF basis via registers. For the X722, this is done on a per-VSI basis - * through the admin queue. + * Configure the hardware for the Default Virtual Station Interface (VSI). */ static boolean_t -i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw) { - uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + struct i40e_vsi_context ctx; + i40e_rx_group_t *def_rxg; + int err; + struct i40e_aqc_remove_macvlan_element_data filt; - (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.seid = I40E_DEF_VSI_SEID(i40e); + ctx.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d", err); + return (B_FALSE); + } - if (i40e_is_x722(i40e)) { + ctx.info.valid_sections = 0; + i40e->i40e_vsis[0].iv_number = ctx.vsi_number; + i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e, 0) == B_FALSE) + return (B_FALSE); + + i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX); + + err = i40e_aq_update_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + def_rxg = &i40e->i40e_rx_groups[0]; + def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e); + + /* + * We have seen three different behaviors in regards to the + * Default VSI and its implicit L2 MAC+VLAN filter. + * + * 1. It has an implicit filter for the factory MAC address + * and this filter counts against 'ifr_nmacfilt_used'. + * + * 2. It has an implicit filter for the factory MAC address + * and this filter DOES NOT count against 'ifr_nmacfilt_used'. + * + * 3. It DOES NOT have an implicit filter. + * + * All three of these cases are accounted for below. If we + * fail to remove the L2 filter (ENOENT) then we assume there + * wasn't one. Otherwise, if we successfully remove the + * filter, we make sure to update the 'ifr_nmacfilt_used' + * count accordingly. + * + * We remove this filter to prevent duplicate delivery of + * packets destined for the primary MAC address as DLS will + * create the same filter on a non-default VSI for the primary + * MAC client. + * + * If you change the following code please test it across as + * many X700 series controllers and firmware revisions as you + * can. 
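The rule described in that comment reduces to a small check on ifr_nmacfilt_used after the explicit remove, implemented just below. The following user-space distillation only makes the three observed controller behaviors easy to see; it is not driver code, and ex_fixup_filter_count() is a hypothetical helper.

#include <stdio.h>

/*
 * After the explicit remove: a count of 1 means the firmware counted
 * the implicit filter, 0 means it did not, and anything else is an
 * unknown controller behavior.
 */
static int
ex_fixup_filter_count(unsigned *used)
{
	if (*used == 1) {
		(*used)--;	/* the implicit filter was counted */
		return (0);
	}
	if (*used == 0)
		return (0);	/* the implicit filter was never counted */
	return (-1);		/* unknown behavior: fail attach */
}

int
main(void)
{
	unsigned used = 1;
	int rc = ex_fixup_filter_count(&used);

	printf("rc = %d, filters in use afterwards = %u\n", rc, used);
	return (0);
}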
+ */ + bzero(&filt, sizeof (filt)); + bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + filt.vlan_tag = 0; + + ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1); + i40e_log(i40e, "Num L2 filters: %u", + i40e->i40e_resources.ifr_nmacfilt_used); + + err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, + NULL); + if (err == I40E_SUCCESS) { + i40e_log(i40e, + "Removed L2 filter from Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else if (hw->aq.asq_last_status == ENOENT) { + i40e_log(i40e, + "No L2 filter for Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else { + i40e_error(i40e, "Failed to remove L2 filter from" + " Default VSI with SEID %u: %d (%d)", + I40E_DEF_VSI_SEID(i40e), err, hw->aq.asq_last_status); + + return (B_FALSE); + } + + /* + * As mentioned above, the controller created an implicit L2 + * filter for the primary MAC. We want to remove both the + * filter and decrement the filter count. However, not all + * controllers count this implicit filter against the total + * MAC filter count. So here we are making sure it is either + * one or zero. If it is one, then we know it is for the + * implicit filter and we should decrement since we just + * removed the filter above. If it is zero then we know the + * controller that does not count the implicit filter, and it + * was enough to just remove it; we leave the count alone. + * But if it is neither, then we have never seen a controller + * like this before and we should fail to attach. + * + * It is unfortunate that this code must exist but the + * behavior of this implicit L2 filter and its corresponding + * count were dicovered through empirical testing. The + * programming manuals hint at this filter but do not + * explicitly call out the exact behavior. + */ + if (i40e->i40e_resources.ifr_nmacfilt_used == 1) { + i40e->i40e_resources.ifr_nmacfilt_used--; + } else { + if (i40e->i40e_resources.ifr_nmacfilt_used != 0) { + i40e_error(i40e, "Unexpected L2 filter count: %u" + " (expected 0)", + i40e->i40e_resources.ifr_nmacfilt_used); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw) +{ + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; struct i40e_aqc_get_set_rss_key_data key; - const char *u8seed = (char *)seed; + const char *u8seed; enum i40e_status_code status; + uint16_t vsi_number = i40e->i40e_vsis[i].iv_number; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + u8seed = (char *)seed; CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) + sizeof (key.extended_hash_key))); @@ -2015,14 +2290,35 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) bcopy(&u8seed[sizeof (key.standard_rss_key)], key.extended_hash_key, sizeof (key.extended_hash_key)); - status = i40e_aq_set_rss_key(hw, i40e->i40e_vsi_num, &key); + ASSERT3U(vsi_number, !=, 0); + status = i40e_aq_set_rss_key(hw, vsi_number, &key); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set rss key: %d", status); + i40e_error(i40e, "failed to set RSS key for VSI %u: %d", + vsi_number, status); return (B_FALSE); } + } + + return (B_TRUE); +} + +/* + * Configure the RSS key. For the X710 controller family, this is set on a + * per-PF basis via registers. For the X722, this is done on a per-VSI basis + * through the admin queue. 
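For the X722 path above, the random seed is simply split across the standard and extended portions of the admin-queue key structure. Below is a user-space sketch of that split; the 13-word seed, 40-byte standard key, and 12-byte extended key are assumed sizes (the driver's CTASSERT guards the real relationship), and only the split itself is shown, not the admin-queue call.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define	EX_HKEY_WORDS	13
#define	EX_STD_KEYLEN	40
#define	EX_EXT_KEYLEN	12

int
main(void)
{
	uint32_t seed[EX_HKEY_WORDS];
	uint8_t standard[EX_STD_KEYLEN], extended[EX_EXT_KEYLEN];
	const uint8_t *u8seed = (const uint8_t *)seed;

	/* random_get_pseudo_bytes() fills the seed in the driver. */
	for (int i = 0; i < EX_HKEY_WORDS; i++)
		seed[i] = (uint32_t)rand();

	memcpy(standard, u8seed, sizeof (standard));
	memcpy(extended, u8seed + sizeof (standard), sizeof (extended));

	printf("seed %zu bytes = standard %zu + extended %zu\n",
	    sizeof (seed), sizeof (standard), sizeof (extended));
	return (0);
}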
+ */ +static boolean_t +i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e_is_x722(i40e)) { + if (!i40e_config_rss_key_x722(i40e, hw)) + return (B_FALSE); } else { - uint_t i; - for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]); } @@ -2034,11 +2330,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) * family, with the X722 using a known 7-bit width. On the X710 controller, this * is programmed through its control registers where as on the X722 this is * configured through the admin queue. Also of note, the X722 allows the LUT to - * be set on a per-PF or VSI basis. At this time, as we only have a single VSI, - * we use the PF setting as it is the primary VSI. + * be set on a per-PF or VSI basis. At this time we use the PF setting. If we + * decide to use the per-VSI LUT in the future, then we will need to modify the + * i40e_add_vsi() function to set the RSS LUT bits in the queueing section. * * We populate the LUT in a round robin fashion with the rx queue indices from 0 - * to i40e_num_trqpairs - 1. + * to i40e_num_trqpairs_per_vsi - 1. */ static boolean_t i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) @@ -2068,15 +2365,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1; } - for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) - ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask; + for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) { + ((uint8_t *)hlut)[i] = + (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask; + } if (i40e_is_x722(i40e)) { enum i40e_status_code status; - status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE, - (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE); + + status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut, + I40E_HLUT_TABLE_SIZE); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set RSS LUT: %d", status); + i40e_error(i40e, "failed to set RSS LUT %d: %d", + status, hw->aq.asq_last_status); goto out; } } else { @@ -2188,8 +2490,34 @@ i40e_chip_start(i40e_t *i40e) i40e_intr_chip_init(i40e); - if (!i40e_config_vsi(i40e, hw)) + rc = i40e_get_mac_seid(i40e); + if (rc == -1) { + i40e_error(i40e, "failed to obtain MAC Uplink SEID"); return (B_FALSE); + } + i40e->i40e_mac_seid = (uint16_t)rc; + + /* + * Create a VEB in order to support multiple VSIs. Each VSI + * functions as a MAC group. This call sets the PF's MAC as + * the uplink port and the PF's default VSI as the default + * downlink port. + */ + rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e), + 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc, + hw->aq.asq_last_status); + return (B_FALSE); + } + + if (!i40e_config_def_vsi(i40e, hw)) + return (B_FALSE); + + for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) { + if (!i40e_add_vsi(i40e, hw, i)) + return (B_FALSE); + } if (!i40e_config_rss(i40e, hw)) return (B_FALSE); @@ -2549,7 +2877,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq) * assigned to traffic class zero, because we don't actually use them. 
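The round-robin LUT fill in i40e_config_rss_hlut() is easy to see in isolation. In the sketch below, EX_HLUT_SIZE and the 7-bit mask are illustrative stand-ins for I40E_HLUT_TABLE_SIZE and the mask derived from rss_table_entry_width.

#include <stdio.h>
#include <stdint.h>

#define	EX_HLUT_SIZE	512
#define	EX_LUT_MASK	((1 << 7) - 1)

int
main(void)
{
	uint8_t hlut[EX_HLUT_SIZE];
	unsigned qpairs_per_vsi = 8;

	/* Spread the RX queue indices evenly across the table. */
	for (unsigned i = 0; i < EX_HLUT_SIZE; i++)
		hlut[i] = (i % qpairs_per_vsi) & EX_LUT_MASK;

	printf("hlut[0..9]:");
	for (unsigned i = 0; i < 10; i++)
		printf(" %u", hlut[i]);
	printf("\n");
	return (0);
}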
*/ bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; + context.seid = I40E_DEF_VSI_SEID(i40e); context.pf_num = hw->pf_id; err = i40e_aq_get_vsi_params(hw, &context, NULL); if (err != I40E_SUCCESS) { @@ -2653,7 +2981,8 @@ i40e_setup_tx_rings(i40e_t *i40e) void i40e_stop(i40e_t *i40e, boolean_t free_allocations) { - int i; + uint_t i; + i40e_hw_t *hw = &i40e->i40e_hw_space; ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); @@ -2689,6 +3018,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) delay(50 * drv_usectohz(1000)); + /* + * We don't delete the default VSI because it replaces the VEB + * after VEB deletion (see the "Delete Element" section). + * Furthermore, since the default VSI is provided by the + * firmware, we never attempt to delete it. + */ + for (i = 1; i < i40e->i40e_num_rx_groups; i++) { + i40e_delete_vsi(i40e, i); + } + + if (i40e->i40e_veb_seid != 0) { + int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VEB %d: %d", rc, + hw->aq.asq_last_status); + } + + i40e->i40e_veb_seid = 0; + } + i40e_intr_chip_fini(i40e); for (i = 0; i < i40e->i40e_num_trqpairs; i++) { @@ -2718,7 +3068,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); } - i40e_stat_vsi_fini(i40e); + for (i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_stat_vsi_fini(i40e, i); + } i40e->i40e_link_speed = 0; i40e->i40e_link_duplex = 0; @@ -2783,7 +3135,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc) * Enable broadcast traffic; however, do not enable multicast traffic. * That's handle exclusively through MAC's mc_multicst routines. */ - err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE, + NULL); if (err != I40E_SUCCESS) { i40e_error(i40e, "failed to set default VSI: %d", err); rc = B_FALSE; diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c index 7a4f0faedd..97e1a08628 100644 --- a/usr/src/uts/common/io/i40e/i40e_stats.c +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include "i40e_sw.h" @@ -69,12 +69,7 @@ * --------------------- * * The hardware keeps statistics at each physical function/MAC (PF) and it keeps - * statistics on each virtual station interface (VSI). Currently we only use one - * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited - * number of statistics units available. While every PF is guaranteed to have a - * statistics unit, it is possible that we will run out for a given VSI. We'll - * have to figure out an appropriate strategy here when we end up supporting - * multiple VSIs. + * statistics on each virtual station interface (VSI). * * The hardware keeps these statistics as 32-bit and 48-bit counters. We are * required to read them and then compute the differences between them. The @@ -100,10 +95,10 @@ * data. * * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the - * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in - * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All - * of this data is protected by the i40e_stat_lock, which should be taken last, - * when acquiring locks. + * i40e_t`i40e_pf_stat structure. 
Similarly the VSI related kstats are in + * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the + * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the + * i40e_stat_lock, which should be taken last, when acquiring locks. */ static void @@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, } static void -i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init) { i40e_vsi_stats_t *ivs; i40e_vsi_kstats_t *ivk; - int id = i40e->i40e_vsi_stat_id; + uint16_t id = i40e->i40e_vsis[idx].iv_stats_id; - ASSERT(i40e->i40e_vsi_kstat != NULL); - ivs = &i40e->i40e_vsi_stat; - ivk = i40e->i40e_vsi_kstat->ks_data; + ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL); + ivs = &i40e->i40e_vsis[idx].iv_stats; + ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data; mutex_enter(&i40e->i40e_stat_lock); @@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) return (EACCES); i40e = ksp->ks_private; - i40e_stat_vsi_update(i40e, B_FALSE); + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) + i40e_stat_vsi_update(i40e, i, B_FALSE); + return (0); } void -i40e_stat_vsi_fini(i40e_t *i40e) +i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx) { - if (i40e->i40e_vsi_kstat != NULL) { - kstat_delete(i40e->i40e_vsi_kstat); - i40e->i40e_vsi_kstat = NULL; + if (i40e->i40e_vsis[idx].iv_kstats != NULL) { + kstat_delete(i40e->i40e_vsis[idx].iv_kstats); + i40e->i40e_vsis[idx].iv_kstats = NULL; } } boolean_t -i40e_stat_vsi_init(i40e_t *i40e) +i40e_stat_vsi_init(i40e_t *i40e, uint_t idx) { kstat_t *ksp; i40e_vsi_kstats_t *ivk; char buf[64]; + uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid; - (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id); ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); if (ksp == NULL) { - i40e_error(i40e, "Failed to create kstats for VSI %d", - i40e->i40e_vsi_id); + i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id); return (B_FALSE); } - i40e->i40e_vsi_kstat = ksp; + i40e->i40e_vsis[idx].iv_kstats = ksp; ivk = ksp->ks_data; ksp->ks_update = i40e_stat_vsi_kstat_update; ksp->ks_private = i40e; @@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e) kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", KSTAT_DATA_UINT64); - bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); - i40e_stat_vsi_update(i40e, B_TRUE); - kstat_install(i40e->i40e_vsi_kstat); + bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, idx, B_TRUE); + kstat_install(i40e->i40e_vsis[idx].iv_kstats); return (B_TRUE); } @@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e) void i40e_stats_fini(i40e_t *i40e) { - ASSERT(i40e->i40e_vsi_kstat == NULL); +#ifdef DEBUG + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL); + } +#endif + if (i40e->i40e_pf_kstat != NULL) { kstat_delete(i40e->i40e_pf_kstat); i40e->i40e_pf_kstat = NULL; @@ -1249,6 +1251,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", KSTAT_DATA_UINT64); tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck", + KSTAT_DATA_UINT64); + tsp->itxs_lso_nohck.value.ui64 = 0; + kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails", + KSTAT_DATA_UINT64); + tsp->itxs_bind_fails.value.ui64 = 0; 
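The per-VSI kstats created in i40e_stat_vsi_init() above are named after the VSI's SEID, one per RX group. A trivial sketch of the naming scheme follows; the SEID values are made up for illustration.

#include <stdio.h>

int
main(void)
{
	unsigned seids[] = { 390, 391, 392 };	/* made-up SEIDs */
	char buf[64];

	for (unsigned i = 0; i < sizeof (seids) / sizeof (seids[0]); i++) {
		(void) snprintf(buf, sizeof (buf), "vsi_%u", seids[i]);
		printf("RX group %u -> kstat \"%s\"\n", i, buf);
	}
	return (0);
}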
kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", KSTAT_DATA_UINT64); tsp->itxs_err_notcb.value.ui64 = 0; diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h index 78aced0144..fbb6d11b41 100644 --- a/usr/src/uts/common/io/i40e/i40e_sw.h +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -152,9 +152,10 @@ typedef enum i40e_itr_index { } i40e_itr_index_t; /* - * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + * The hardware claims to support LSO up to 256 KB, but due to the limitations + * imposed by the IP header for non-jumbo frames, we cap it at 64 KB. */ -#define I40E_LSO_MAXLEN (256 * 1024) +#define I40E_LSO_MAXLEN (64 * 1024) #define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ #define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ @@ -173,13 +174,22 @@ typedef enum i40e_itr_index { #define I40E_BUF_IPHDR_ALIGNMENT 2 /* - * The XL710 controller has a limit of eight buffers being allowed to be used - * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * The XL710 controller has a total of eight buffers available for the + * transmission of any single frame. This is defined in 8.4.1 - Transmit * Packet in System Memory. */ #define I40E_TX_MAX_COOKIE 8 /* + * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more + * cookies than a non-LSO frame. The key here to is to select a value such + * that once the HW has chunked up the LSO frame into MSS-sized segments that no + * single segment spans more than 8 cookies (see comments for + * I40E_TX_MAX_COOKIE) + */ +#define I40E_TX_LSO_MAX_COOKIE 32 + +/* * Sizing to determine the amount of available descriptors at which we'll * consider ourselves blocked. Also, when we have these available, we'll then * consider ourselves available to transmit to MAC again. Strictly speaking, the @@ -203,6 +213,12 @@ typedef enum i40e_itr_index { #define I40E_MAX_TX_DMA_THRESH INT32_MAX /* + * The max size of each individual tx buffer is 16KB - 1. + * See table 8-17 + */ +#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull + +/* * Resource sizing counts. There are various aspects of hardware where we may * have some variable number of elements that we need to handle. Such as the * hardware capabilities and switch capacities. We cannot know a priori how many @@ -240,21 +256,6 @@ typedef enum i40e_itr_index { #define I40E_HMC_TX_TPH_DISABLE 0 /* - * Whenever we establish and create a VSI, we need to assign some number of - * queues that it's allowed to access from the PF. Because we only have a single - * VSI per PF at this time, we assign it all the queues. - * - * Many of the devices support what's called Data-center Bridging. Which is a - * feature that we don't have much use of at this time. However, we still need - * to fill in this information. We follow the guidance of the note in Table 7-80 - * which talks about bytes 62-77. It says that if we don't want to assign - * anything to traffic classes, we should set the field to zero. Effectively - * this means that everything in the system is assigned to traffic class zero. - */ -#define I40E_ASSIGN_ALL_QUEUES 0 -#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 - -/* * This defines the error mask that we care about from rx descriptors. Currently * we're only concerned with the general errors and oversize errors. 
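A quick sanity check of the TX limits defined above: even a maximally sized 64KB LSO payload bound with cookies capped at I40E_MAX_TX_BUFSZ needs several cookies before mblk fragmentation is even considered, which motivates the larger LSO SGL length of 32. The constants here are local copies used only for the arithmetic.

#include <stdio.h>

#define	EX_LSO_MAXLEN	(64 * 1024)
#define	EX_MAX_TX_BUFSZ	0x3FFFu		/* 16KB - 1, per descriptor buffer */

int
main(void)
{
	unsigned min_cookies =
	    (EX_LSO_MAXLEN + EX_MAX_TX_BUFSZ - 1) / EX_MAX_TX_BUFSZ;

	printf("a %u-byte LSO payload needs at least %u cookies of at "
	    "most %u bytes each\n", (unsigned)EX_LSO_MAXLEN, min_cookies,
	    (unsigned)EX_MAX_TX_BUFSZ);
	return (0);
}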
*/ @@ -268,12 +269,12 @@ typedef enum i40e_itr_index { #define I40E_DDI_PROP_LEN 64 /* - * We currently consolidate some overrides that we use in the code here. These - * will be gone in the fullness of time, but as we're bringing up the device, - * this is what we use. + * Place an artificial limit on the max number of groups. The X710 + * series supports up to 384 VSIs to be partitioned across PFs as the + * driver sees fit. But until we support more interrupts this seems + * like a good place to start. */ -#define I40E_GROUP_MAX 1 -#define I40E_TRQPAIR_MAX 1 +#define I40E_GROUP_MAX 32 #define I40E_GROUP_NOMSIX 1 #define I40E_TRQPAIR_NOMSIX 1 @@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block { typedef enum { I40E_TX_NONE, I40E_TX_COPY, - I40E_TX_DMA + I40E_TX_DMA, + I40E_TX_DESC, } i40e_tx_type_t; typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef struct i40e_tx_context_desc i40e_tx_context_desc_t; typedef union i40e_32byte_rx_desc i40e_rx_desc_t; +struct i40e_dma_bind_info { + caddr_t dbi_paddr; + size_t dbi_len; +}; + typedef struct i40e_tx_control_block { struct i40e_tx_control_block *tcb_next; mblk_t *tcb_mp; i40e_tx_type_t tcb_type; ddi_dma_handle_t tcb_dma_handle; + ddi_dma_handle_t tcb_lso_dma_handle; i40e_dma_buffer_t tcb_dma; + struct i40e_dma_bind_info *tcb_bind_info; + uint_t tcb_bind_ncookies; + boolean_t tcb_used_lso; } i40e_tx_control_block_t; /* @@ -526,6 +538,8 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ kstat_named_t itxs_hck_badl4; /* Bad L4 Paylaod */ + kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */ + kstat_named_t itxs_bind_fails; /* DMA bind failures */ kstat_named_t itxs_err_notcb; /* No tcb's available */ kstat_named_t itxs_err_nodescs; /* No tcb's available */ @@ -761,6 +775,25 @@ typedef struct i40e_func_rsrc { uint_t ifr_nmcastfilt_used; } i40e_func_rsrc_t; +typedef struct i40e_vsi { + uint16_t iv_seid; + uint16_t iv_number; + kstat_t *iv_kstats; + i40e_vsi_stats_t iv_stats; + uint16_t iv_stats_id; +} i40e_vsi_t; + +/* + * While irg_index and irg_grp_hdl aren't used anywhere, they are + * still useful for debugging. + */ +typedef struct i40e_rx_group { + uint32_t irg_index; /* index in i40e_rx_groups[] */ + uint16_t irg_vsi_seid; /* SEID of VSI for this group */ + mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */ + struct i40e *irg_i40e; /* ref to i40e_t */ +} i40e_rx_group_t; + /* * Main i40e per-instance state. */ @@ -789,11 +822,18 @@ typedef struct i40e { struct i40e_aq_get_phy_abilities_resp i40e_phy; void *i40e_aqbuf; +#define I40E_DEF_VSI_IDX 0 +#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX]) +#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid) + /* * Device state, switch information, and resources. 
*/ - int i40e_vsi_id; - uint16_t i40e_vsi_num; + i40e_vsi_t i40e_vsis[I40E_GROUP_MAX]; + uint16_t i40e_mac_seid; /* SEID of physical MAC */ + uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */ + uint16_t i40e_vsi_avail; /* VSIs avail to this PF */ + uint16_t i40e_vsi_used; /* VSIs used by this PF */ struct i40e_device *i40e_device; i40e_func_rsrc_t i40e_resources; uint16_t i40e_switch_rsrc_alloc; @@ -814,12 +854,13 @@ typedef struct i40e { */ i40e_trqpair_t *i40e_trqpairs; boolean_t i40e_mr_enable; - int i40e_num_trqpairs; + uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */ + uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */ uint_t i40e_other_itr; - int i40e_num_rx_groups; + i40e_rx_group_t *i40e_rx_groups; + uint_t i40e_num_rx_groups; int i40e_num_rx_descs; - mac_group_handle_t i40e_rx_group_handle; uint32_t i40e_rx_ring_size; uint32_t i40e_rx_buf_size; boolean_t i40e_rx_hcksum_enable; @@ -832,6 +873,7 @@ typedef struct i40e { uint32_t i40e_tx_buf_size; uint32_t i40e_tx_block_thresh; boolean_t i40e_tx_hcksum_enable; + boolean_t i40e_tx_lso_enable; uint32_t i40e_tx_dma_min; uint_t i40e_tx_itr; @@ -855,6 +897,7 @@ typedef struct i40e { */ ddi_dma_attr_t i40e_static_dma_attr; ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_dma_attr_t i40e_txbind_lso_dma_attr; ddi_device_acc_attr_t i40e_desc_acc_attr; ddi_device_acc_attr_t i40e_buf_acc_attr; @@ -872,10 +915,7 @@ typedef struct i40e { */ kmutex_t i40e_stat_lock; kstat_t *i40e_pf_kstat; - kstat_t *i40e_vsi_kstat; i40e_pf_stats_t i40e_pf_stat; - i40e_vsi_stats_t i40e_vsi_stat; - uint16_t i40e_vsi_stat_id; /* * Misc. stats and counters that should maybe one day be kstats. @@ -975,8 +1015,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); */ extern boolean_t i40e_stats_init(i40e_t *); extern void i40e_stats_fini(i40e_t *); -extern boolean_t i40e_stat_vsi_init(i40e_t *); -extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t); +extern void i40e_stat_vsi_fini(i40e_t *, uint_t); extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); extern int i40e_m_stat(void *, uint_t, uint64_t *); diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c index 75132e27f0..28f3b88dee 100644 --- a/usr/src/uts/common/io/i40e/i40e_transceiver.c +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include "i40e_sw.h" @@ -60,19 +60,19 @@ * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * - * Note, that for rx, we do something that might be unexpected. We always add + * Note, that for RX, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. 
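The 2-byte receive offset described above can be verified with simple arithmetic: for both untagged and 802.1Q-tagged frames the IP header lands on a 4-byte boundary.

#include <stdio.h>

int
main(void)
{
	unsigned mac_hdr_lens[] = { 14, 18 };	/* untagged and 802.1Q */

	for (unsigned i = 0; i < 2; i++) {
		unsigned ip_off = 2 + mac_hdr_lens[i];

		printf("MAC header %2u bytes -> IP header at buffer "
		    "offset %u (offset %% 4 = %u)\n",
		    mac_hdr_lens[i], ip_off, ip_off % 4);
	}
	return (0);
}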
* - * Both the rx and tx descriptor rings (which are what we use to communicate + * Both the RX and TX descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (4 bytes and 2 bytes respectively) times the total - * number of descriptors for an rx and tx ring. + * number of descriptors for an RX and TX ring. * - * While the rx and tx descriptors are allocated using DMA-based memory, the + * While the RX and TX descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different @@ -113,16 +113,16 @@ * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free - * list. Each list is sized equal to the number of descriptors in the rx ring. - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * list. Each list is sized equal to the number of descriptors in the RX ring. + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * - * During operation, we always maintain the invariant that each rx descriptor - * has an associated rx control block which lives in the working list. If we + * During operation, we always maintain the invariant that each RX descriptor + * has an associated RX control block which lives in the working list. If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort @@ -130,14 +130,14 @@ * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is - * called on the block, at which point we restore the rx control block to the + * called on the block, at which point we restore the RX control block to the * free list and are able to reuse the DMA memory again. While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * - * The following state machine describes the life time of a rx control block. In - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx + * The following state machine describes the life time of a RX control block. In + * the diagram we abbrviate the RX ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | @@ -160,11 +160,11 @@ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * - * Finally, note that every rx control block has a reference count on it. One + * Finally, note that every RX control block has a reference count on it. One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by - * one. 
Whenever we loan up the rx control block and associated buffer to MAC, + * one. Whenever we loan up the RX control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure @@ -192,10 +192,10 @@ * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * - * When we iterate over the rx descriptors and thus the received frames, we are + * When we iterate over the RX descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value @@ -249,31 +249,44 @@ * differently due to the fact that all data is originated by the operating * system and not by the device. * - * Like rx, there is both a descriptor ring that we use to communicate to the - * driver and which points to the memory used to transmit a frame. Similarly, - * there is a corresponding transmit control block. Each transmit control block - * has a region of DMA memory allocated to it; however, the way we use it - * varies. + * Like RX, there is both a descriptor ring that we use to communicate to the + * driver and which points to the memory used to transmit a frame. Similarly, + * there is a corresponding transmit control block, however, the correspondence + * between descriptors and control blocks is more complex and not necessarily + * 1-to-1. * * The driver is asked to process a single frame at a time. That message block * may be made up of multiple fragments linked together by the mblk_t`b_cont * member. The device has a hard limit of up to 8 buffers being allowed for use * for a single logical frame. For each fragment, we'll try and use an entry - * from the tx descriptor ring and then we'll allocate a corresponding tx - * control block. Depending on the size of the fragment, we may copy it around - * or we might instead try to do DMA binding of the fragment. - * - * If we exceed the number of blocks that fit, we'll try to pull up the block - * and then we'll do a DMA bind and send it out. - * - * If we don't have enough space in the ring or tx control blocks available, + * from the TX descriptor ring and then we'll allocate a corresponding TX + * control block. + * + * We alter our DMA strategy based on a threshold tied to the frame size. + * This threshold is configurable via the tx_dma_threshold property. If the + * frame size is above the threshold, we do DMA binding of the fragments, + * building a control block and data descriptor for each piece. If it's below + * or at the threshold then we just use a single control block and data + * descriptor and simply bcopy all of the fragments into the pre-allocated DMA + * buffer in the control block. For the LSO TX case we always do DMA binding of + * the fragments, with one control block and one TX data descriptor allocated + * per fragment. 
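The copy-versus-bind decision described above reduces to a small predicate. The sketch below is not the driver's TX path; ex_pick_strategy() is a hypothetical helper and the threshold value is a made-up example of what the tx_dma_threshold property supplies.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

typedef enum { EX_TX_COPY, EX_TX_BIND } ex_tx_strategy_t;

static ex_tx_strategy_t
ex_pick_strategy(size_t frame_size, size_t dma_threshold, bool is_lso)
{
	if (is_lso)
		return (EX_TX_BIND);	/* LSO frames are always DMA bound */
	if (frame_size > dma_threshold)
		return (EX_TX_BIND);
	return (EX_TX_COPY);		/* small frames are bcopy'd */
}

int
main(void)
{
	size_t threshold = 256;		/* made-up tx_dma_threshold value */

	printf("128 bytes:  %s\n", ex_pick_strategy(128, threshold,
	    false) == EX_TX_COPY ? "copy" : "bind");
	printf("1500 bytes: %s\n", ex_pick_strategy(1500, threshold,
	    false) == EX_TX_COPY ? "copy" : "bind");
	printf("LSO:        %s\n", ex_pick_strategy(60000, threshold,
	    true) == EX_TX_COPY ? "copy" : "bind");
	return (0);
}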
+ * + * Furthermore, if the frame requires HW offloads such as LSO, tunneling or + * filtering, then the TX data descriptors must be preceeded by a single TX + * context descriptor. Because there is no DMA transfer associated with the + * context descriptor, we allocate a control block with a special type which + * indicates to the TX ring recycle code that there are no associated DMA + * resources to unbind when the control block is free'd. + * + * If we don't have enough space in the ring or TX control blocks available, * then we'll return the unprocessed message block to MAC. This will induce flow * control and once we recycle enough entries, we'll once again enable sending * on the ring. * * We size the working list as equal to the number of descriptors in the ring. * We size the free list as equal to 1.5 times the number of descriptors in the - * ring. We'll allocate a number of tx control block entries equal to the number + * ring. We'll allocate a number of TX control block entries equal to the number * of entries in the free list. By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the @@ -325,7 +338,7 @@ * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ - * ^ . tcb allocated | + * ^ . N tcbs allocated[1] | * | to send frame v * | or fragment on | * | wire, mblk from | @@ -335,20 +348,27 @@ * . * . Hardware indicates * entry transmitted. - * tcb recycled, mblk + * tcbs recycled, mblk * from MAC freed. * + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context + * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA + * bind case, N can be 1 context descriptor plus 1 data descriptor per + * b_cont in the mblk. In this case, the mblk is associated with the first + * data descriptor and freed as part of freeing that data descriptor. + * * ------------ * Blocking MAC * ------------ * - * Wen performing transmit, we can run out of descriptors and ring entries. When - * such a case happens, we return the mblk_t to MAC to indicate that we've been - * blocked. At that point in time, MAC becomes blocked and will not transmit - * anything out that specific ring until we notify MAC. To indicate that we're - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. + * When performing transmit, we can run out of descriptors and ring entries. + * When such a case happens, we return the mblk_t to MAC to indicate that we've + * been blocked. At that point in time, MAC becomes blocked and will not + * transmit anything out that specific ring until we notify MAC. To indicate + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member + * to B_TRUE. * - * When we recycle tx descriptors then we'll end up signaling MAC by calling + * When we recycle TX descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ @@ -367,13 +387,15 @@ /* * This structure is used to maintain information and flags related to - * transmitting a frame. The first member is the set of flags we need to or into - * the command word (generally checksumming related). The second member controls - * the word offsets which is required for IP and L4 checksumming. + * transmitting a frame. 
These fields are ultimately used to construct the + * TX data descriptor(s) and, if necessary, the TX context descriptor. */ typedef struct i40e_tx_context { - enum i40e_tx_desc_cmd_bits itc_cmdflags; - uint32_t itc_offsets; + enum i40e_tx_desc_cmd_bits itc_data_cmdflags; + uint32_t itc_data_offsets; + enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; + uint32_t itc_ctx_tsolen; + uint32_t itc_ctx_mss; } i40e_tx_context_t; /* @@ -395,14 +417,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our * management here. In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're - * binding a bunch of mblk_t fragments to go out the door. Note that the main - * difference here is that we're allowed a larger SGL length -- eight. + * The latter two sets of attributes, are what we use when we're binding a + * bunch of mblk_t fragments to go out the door. Note that the main difference + * here is that we're allowed a larger SGL length. For non-LSO TX, we + * restrict the SGL length to match the number of TX buffers available to the + * PF (8). For the LSO case we can go much larger, with the caveat that each + * MSS-sized chunk (segment) must not span more than 8 data descriptors and + * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. However, * because we could have multiple instances which have different FMA error @@ -429,7 +455,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ - 0x00000000FFFFFFFFull, /* dma counter max */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ @@ -440,6 +466,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DDI_DMA_FLAGERR /* DMA flags */ }; +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { + DMA_ATTR_V0, /* version number */ + 0x0000000000000000ull, /* low address */ + 0xFFFFFFFFFFFFFFFFull, /* high address */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ + I40E_DMA_ALIGNMENT, /* alignment */ + 0x00000FFF, /* burst sizes */ + 0x00000001, /* minimum transfer size */ + 0x00000000FFFFFFFFull, /* maximum transfer size */ + 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ + I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ + 0x00000001, /* granularity */ + DDI_DMA_FLAGERR /* DMA flags */ +}; + /* * Next, we have the attributes for these structures. 
The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes @@ -668,7 +709,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { - i40e_error(i40e, "failed to allocate rx work list for a ring " + i40e_error(i40e, "failed to allocate RX work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; @@ -677,7 +718,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry rx free list " + i40e_error(i40e, "failed to allocate a %d entry RX free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } @@ -765,7 +806,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) i40e_t *i40e = rxd->rxd_i40e; /* - * First allocate the rx descriptor ring. + * First allocate the RX descriptor ring. */ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); @@ -773,7 +814,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " - "for rx descriptor ring"); + "for RX descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = @@ -799,7 +840,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate rx dma buffer"); + i40e_error(i40e, "failed to allocate RX dma buffer"); return (B_FALSE); } @@ -841,6 +882,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq) ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } + if (tcb->tcb_lso_dma_handle != NULL) { + ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); + tcb->tcb_lso_dma_handle = NULL; + } } fsz = sizeof (i40e_tx_control_block_t) * @@ -881,7 +926,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) (i40e->i40e_tx_ring_size >> 1); /* - * Allocate an additional tx descriptor for the writeback head. + * Allocate an additional TX descriptor for the writeback head. 
*/ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); @@ -890,7 +935,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate DMA resources for tx " + i40e_error(i40e, "failed to allocate DMA resources for TX " "descriptor ring"); return (B_FALSE); } @@ -905,7 +950,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx work list " + i40e_error(i40e, "failed to allocate a %d entry TX work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } @@ -913,14 +958,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_SLEEP); if (itrq->itrq_tcb_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx free list " + i40e_error(i40e, "failed to allocate a %d entry TX free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* - * We allocate enough tx control blocks to cover the free list. + * We allocate enough TX control blocks to cover the free list. */ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); @@ -948,18 +993,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { - i40e_error(i40e, "failed to allocate DMA handle for tx " + i40e_error(i40e, "failed to allocate DMA handle for TX " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } + ret = ddi_dma_alloc_handle(i40e->i40e_dip, + &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, + &tcb->tcb_lso_dma_handle); + if (ret != DDI_SUCCESS) { + i40e_error(i40e, "failed to allocate DMA handle for TX " + "LSO data binding on ring %d: %d", itrq->itrq_index, + ret); + tcb->tcb_lso_dma_handle = NULL; + goto cleanup; + } + if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " - "DMA for tx data binding on ring %d", dmasz, + "DMA for TX data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } @@ -989,10 +1045,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* - * Clean up our rx data. We have to free DMA resources first and + * In some cases i40e_alloc_rx_data() may have failed + * and in that case there is no rxd to free. + */ + if (rxd == NULL) + continue; + + /* + * Clean up our RX data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on - * the rx data until after we've done the first pass of the + * the RX data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. 
@@ -1055,6 +1118,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, + sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, @@ -1063,9 +1128,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= + DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= + ~DDI_DMA_FLAGERR; } } @@ -1102,7 +1171,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd) /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count - * here, then we have to tear down all of the rx data. + * here, then we have to tear down all of the RX data. */ void i40e_rx_recycle(caddr_t arg) @@ -1768,16 +1837,19 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) * to properly program the hardware for checksum offload as well as the * generally required flags. * - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or - * into the descriptor based on the checksum flags for this mblk_t and the + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to + * 'or' into the descriptor based on the checksum flags for this mblk_t and the * actual information we care about. + * + * If the mblk requires LSO then we'll also gather the information that will be + * used to construct the Transmit Context Descriptor. */ static int i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, i40e_tx_context_t *tctx) { int ret; - uint32_t flags, start; + uint32_t chkflags, start, mss, lsoflags; mac_ether_offload_info_t meo; i40e_txq_stat_t *txs = &itrq->itrq_txstat; @@ -1786,8 +1858,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, if (i40e->i40e_tx_hcksum_enable != B_TRUE) return (0); - mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags); - if (flags == 0) + mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); + mac_lso_get(mp, &mss, &lsoflags); + + if (chkflags == 0 && lsoflags == 0) return (0); if ((ret = mac_ether_offload_info(mp, &meo)) != 0) { @@ -1800,7 +1874,7 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * have sufficient information and then set the proper fields in the * command structure. 
*/ - if (flags & HCK_IPV4_HDRCKSUM) { + if (chkflags & HCK_IPV4_HDRCKSUM) { if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); @@ -1813,10 +1887,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; + tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } @@ -1826,13 +1900,13 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * onto seeing if we have enough information for the L4 checksum * offload. */ - if (flags & HCK_PARTIALCKSUM) { + if (chkflags & HCK_PARTIALCKSUM) { if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } - if (!(flags & HCK_IPV4_HDRCKSUM)) { + if (!(chkflags & HCK_IPV4_HDRCKSUM)) { if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); @@ -1843,40 +1917,60 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, } if (meo.meoi_l3proto == ETHERTYPE_IP) { - tctx->itc_cmdflags |= + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { - tctx->itc_cmdflags |= + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } switch (meo.meoi_l4proto) { case IPPROTO_TCP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } + if (lsoflags & HW_LSO) { + /* + * LSO requires that checksum offloads are enabled. If for + * some reason they're not we bail out with an error. 
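The LSO context values gathered in this block are pure arithmetic over the header lengths and the MSS. A user-space example with made-up sizes shows the TSO payload length the hardware will segment and how many wire frames result.

#include <stdio.h>

int
main(void)
{
	unsigned msgsize = 45014;	/* total length of the mblk chain */
	unsigned l2 = 14, l3 = 20, l4 = 20;
	unsigned mss = 1460;
	unsigned tsolen = msgsize - (l2 + l3 + l4);
	unsigned nsegs = (tsolen + mss - 1) / mss;

	printf("tsolen = %u, mss = %u -> %u segments on the wire\n",
	    tsolen, mss, nsegs);
	return (0);
}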
+ */ + if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || + (chkflags & HCK_PARTIALCKSUM) == 0) { + txs->itxs_lso_nohck.value.ui64++; + return (-1); + } + + tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; + tctx->itc_ctx_mss = mss; + tctx->itc_ctx_tsolen = msgsize(mp) - + (meo.meoi_l2hlen + meo.meoi_l3hlen + meo.meoi_l4hlen); + } + return (0); } @@ -1925,7 +2019,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: - (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); + else if (tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_bind_info != NULL) { + kmem_free(tcb->tcb_bind_info, + tcb->tcb_bind_ncookies * + sizeof (struct i40e_dma_bind_info)); + } + tcb->tcb_bind_info = NULL; + tcb->tcb_bind_ncookies = 0; + tcb->tcb_used_lso = B_FALSE; + break; + case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ @@ -1935,8 +2042,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) } tcb->tcb_type = I40E_TX_NONE; - freemsg(tcb->tcb_mp); - tcb->tcb_mp = NULL; + if (tcb->tcb_mp != NULL) { + freemsg(tcb->tcb_mp); + tcb->tcb_mp = NULL; + } tcb->tcb_next = NULL; } @@ -1969,10 +2078,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; - VERIFY(tcb != NULL); - itrq->itrq_tcb_work_list[index] = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb != NULL) { + itrq->itrq_tcb_work_list[index] = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); @@ -1995,6 +2105,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; + uint_t desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); @@ -2042,11 +2153,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) tcbhead = tcb; /* - * We zero this out for sanity purposes. + * In the DMA bind case, there may not necessarily be a 1:1 + * mapping between tcb's and descriptors. If the tcb type + * indicates a DMA binding then check the number of DMA + * cookies to determine how many entries to clean in the + * descriptor ring. */ - bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); - toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); - count++; + if (tcb->tcb_type == I40E_TX_DMA) + desc_per_tcb = tcb->tcb_bind_ncookies; + else + desc_per_tcb = 1; + + for (i = 0; i < desc_per_tcb; i++) { + /* + * We zero this out for sanity purposes. 
+ */ + bzero(&itrq->itrq_desc_ring[toclean], + sizeof (i40e_tx_desc_t)); + toclean = i40e_next_desc(toclean, 1, + itrq->itrq_tx_ring_size); + count++; + } } itrq->itrq_desc_head = wbhead; @@ -2078,6 +2205,102 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } +static i40e_tx_control_block_t * +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, + boolean_t use_lso) +{ + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie; + uint_t i = 0, ncookies = 0, dmaflags; + i40e_tx_control_block_t *tcb; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + return (NULL); + } + tcb->tcb_type = I40E_TX_DMA; + + if (use_lso == B_TRUE) + dma_handle = tcb->tcb_lso_dma_handle; + else + dma_handle = tcb->tcb_dma_handle; + + dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; + if (ddi_dma_addr_bind_handle(dma_handle, NULL, + (caddr_t)mp->b_rptr, MBLKL(mp), dmaflags, DDI_DMA_DONTWAIT, NULL, + &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { + txs->itxs_bind_fails.value.ui64++; + goto bffail; + } + tcb->tcb_bind_ncookies = ncookies; + tcb->tcb_used_lso = use_lso; + + tcb->tcb_bind_info = + kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), + KM_NOSLEEP); + if (tcb->tcb_bind_info == NULL) + goto bffail; + + while (i < ncookies) { + if (i > 0) + ddi_dma_nextcookie(dma_handle, &dma_cookie); + + tcb->tcb_bind_info[i].dbi_paddr = + (caddr_t)dma_cookie.dmac_laddress; + tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; + } + + return (tcb); + +bffail: + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + return (NULL); +} + +static void +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, + struct i40e_dma_bind_info *dbi, boolean_t last_desc) +{ + i40e_tx_desc_t *txdesc; + int cmd; + + ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; + + /* + * The last data descriptor needs the EOP bit set, so that the HW knows + * that we're ready to send. Additionally, we set the RS (Report + * Status) bit, so that we are notified when the transmit engine has + * completed DMA'ing all of the data descriptors and data buffers + * associated with this frame. + */ + if (last_desc == B_TRUE) { + cmd |= I40E_TX_DESC_CMD_EOP; + cmd |= I40E_TX_DESC_CMD_RS; + } + + /* + * Per the X710 manual, section 8.4.2.1.1, the buffer size + * must be a value from 1 to 16K minus 1, inclusive. + */ + ASSERT3U(dbi->dbi_len, >=, 1); + ASSERT3U(dbi->dbi_len, <=, I40E_MAX_TX_BUFSZ - 1); + + txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)dbi->dbi_paddr); + txdesc->cmd_type_offset_bsz = + LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | + ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)dbi->dbi_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); +} + /* * We've been asked to send a message block on the wire. We'll only have a * single chain. 
There will not be any b_next pointers; however, there may be @@ -2098,10 +2321,15 @@ i40e_ring_tx(void *arg, mblk_t *mp) { const mblk_t *nmp; size_t mpsize; - i40e_tx_control_block_t *tcb; + i40e_tx_control_block_t *tcb_ctx = NULL, *tcb_data = NULL, + **tcb_dma = NULL; i40e_tx_desc_t *txdesc; + i40e_tx_context_desc_t *ctxdesc; i40e_tx_context_t tctx; int cmd, type; + uint_t i, needed_desc = 0, nbufs = 0; + boolean_t do_ctx_desc = B_FALSE, do_dma_bind = B_FALSE, + use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; @@ -2121,7 +2349,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) /* * Figure out the relevant context about this frame that we might need - * for enabling checksum, lso, etc. This also fills in information that + * for enabling checksum, LSO, etc. This also fills in information that * we might set around the packet type, etc. */ if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { @@ -2129,97 +2357,212 @@ i40e_ring_tx(void *arg, mblk_t *mp) itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + use_lso = B_TRUE; + do_ctx_desc = B_TRUE; + } /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active - * recycling to cut back on stalls in the tx path. + * recycling to cut back on stalls in the TX path. */ /* - * Do a quick size check to make sure it fits into what we think it - * should for this device. Note that longer term this will be false, - * particularly when we have the world of TSO. + * Iterate through the mblks to calculate both the total size of the + * frame and the number of fragments. This is used to determine + * whether we're doing DMA binding and, if so, how many TX control + * blocks we'll need. */ mpsize = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mpsize += MBLKL(nmp); + size_t blksz = MBLKL(nmp); + if (blksz > 0) { + mpsize += blksz; + nbufs++; + } } - /* - * First we allocate our tx control block and prepare the packet for - * transmit before we do a final check for descriptors. We do it this - * way to minimize the time under the tx lock. - */ - tcb = i40e_tcb_alloc(itrq); - if (tcb == NULL) { - txs->itxs_err_notcb.value.ui64++; - goto txfail; + if (do_ctx_desc) { + /* + * If we're doing tunneling or LSO, then we'll need a TX + * context descriptor in addition to one or more TX data + * descriptors. Since there's no data DMA block or handle + * associated with the context descriptor, we create a special + * control block that behaves effectively like a NOP. + */ + if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_ctx->tcb_type = I40E_TX_DESC; + needed_desc++; } /* - * For transmitting a block, we're currently going to use just a - * single control block and bcopy all of the fragments into it. We - * should be more intelligent about doing DMA binding or otherwise, but - * for getting off the ground this will have to do. + * For the non-LSO TX case, we alter our DMA strategy based on a + * threshold tied to the frame size. This threshold is configurable + * via the tx_dma_threshold property. + * + * If the frame size is above the threshold, we do DMA binding of the + * fragments, building a control block and data descriptor for each + * piece. 
+ * + * If it's below or at the threshold then we just use a single control + * block and data descriptor and simply bcopy all of the fragments into + * the pre-allocated DMA buffer in the control block. + * + * For the LSO TX case we always do DMA binding. */ - ASSERT(tcb->tcb_dma.dmab_len == 0); - ASSERT(tcb->tcb_dma.dmab_size >= mpsize); - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - size_t clen = MBLKL(nmp); - void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + if (use_lso == B_TRUE || mpsize > i40e->i40e_tx_dma_min) { + do_dma_bind = B_TRUE; + tcb_dma = + kmem_zalloc(nbufs * sizeof (i40e_tx_control_block_t *), + KM_NOSLEEP); + if (tcb_dma == NULL) { + i40e_error(i40e, "failed to allocate tcb_dma list"); + goto txfail; + } + /* + * For each b_cont: bind the control block's DMA handle to the + * b_rptr, and record the cookies so that we can later iterate + * through them and build TX data descriptors. + */ + for (nmp = mp, i = 0; nmp != NULL; nmp = nmp->b_cont) { + if (MBLKL(nmp) == 0) + continue; + tcb_dma[i] = i40e_tx_bind_fragment(itrq, nmp, use_lso); + if (tcb_dma[i] == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto txfail; + } + if (i == 0) + tcb_dma[i]->tcb_mp = mp; + needed_desc += tcb_dma[i++]->tcb_bind_ncookies; + } + } else { + /* + * Just use a single control block and bcopy all of the + * fragments into its pre-allocated DMA buffer. + */ + if ((tcb_data = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_data->tcb_type = I40E_TX_COPY; + + ASSERT(tcb_data->tcb_dma.dmab_len == 0); + ASSERT(tcb_data->tcb_dma.dmab_size >= mpsize); + + for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { + size_t clen = MBLKL(nmp); + void *coff = tcb_data->tcb_dma.dmab_address + + tcb_data->tcb_dma.dmab_len; - bcopy(nmp->b_rptr, coff, clen); - tcb->tcb_dma.dmab_len += clen; + bcopy(nmp->b_rptr, coff, clen); + tcb_data->tcb_dma.dmab_len += clen; + } + ASSERT(tcb_data->tcb_dma.dmab_len == mpsize); + I40E_DMA_SYNC(&tcb_data->tcb_dma, DDI_DMA_SYNC_FORDEV); + + tcb_data->tcb_mp = mp; + needed_desc++; } - ASSERT(tcb->tcb_dma.dmab_len == mpsize); /* - * While there's really no need to keep the mp here, but let's just do - * it to help with our own debugging for now. + * The second condition ensures that 'itrq_desc_tail' never + * equals 'itrq_desc_head'. This enforces the rule found in + * the second bullet point of section 8.4.3.1.5 of the XL710 + * PG, which declares the TAIL pointer in I40E_QTX_TAIL should + * never overlap with the head. This means that we only ever + * have 'itrq_tx_ring_size - 1' total available descriptors. */ - tcb->tcb_mp = mp; - tcb->tcb_type = I40E_TX_COPY; - I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); - mutex_enter(&itrq->itrq_tx_lock); - if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || + (itrq->itrq_desc_free - 1) < needed_desc) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } - /* - * Build up the descriptor and send it out. Thankfully at the moment - * we only need a single desc, because we're not doing anything fancy - * yet. 
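To restate the descriptor accounting above with concrete (purely illustrative) numbers: LSO adds one context descriptor, the bind path consumes one data descriptor per DMA cookie, and the bcopy path always consumes exactly one.

/*
 * Hypothetical accounting:
 *
 *   LSO frame of three fragments binding to 1, 2 and 1 cookies:
 *     needed_desc = 1 (context) + (1 + 2 + 1) (data) = 5
 *
 *   small non-LSO frame at or under i40e_tx_dma_min (the
 *   tx_dma_threshold property):
 *     needed_desc = 1 (single bcopy data descriptor)
 *
 * The send only proceeds when
 *
 *   itrq_desc_free >= i40e_tx_block_thresh   and
 *   itrq_desc_free - 1 >= needed_desc
 *
 * the extra "- 1" being the slot that keeps TAIL from ever catching up
 * with HEAD (XL710 PG section 8.4.3.1.5), so at most
 * itrq_tx_ring_size - 1 descriptors are ever in use.
 */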
- */ - ASSERT(itrq->itrq_desc_free > 0); - itrq->itrq_desc_free--; - txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; - itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; - itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, - itrq->itrq_tx_ring_size); + if (do_ctx_desc) { + /* + * If we're enabling any offloads for this frame, then we'll + * need to build up a transmit context descriptor, first. The + * context descriptor needs to be placed in the TX ring before + * the data descriptor(s). See section 8.4.2, table 8-16 + */ + uint_t tail = itrq->itrq_desc_tail; + itrq->itrq_desc_free--; + ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; + itrq->itrq_tcb_work_list[tail] = tcb_ctx; + itrq->itrq_desc_tail = i40e_next_desc(tail, 1, + itrq->itrq_tx_ring_size); + + /* QW0 */ + type = I40E_TX_DESC_DTYPE_CONTEXT; + ctxdesc->tunneling_params = 0; + ctxdesc->l2tag2 = 0; + + /* QW1 */ + ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) + ((uint64_t)tctx.itc_ctx_cmdflags << + I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((uint64_t)tctx.itc_ctx_tsolen << + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((uint64_t)tctx.itc_ctx_mss << + I40E_TXD_CTX_QW1_MSS_SHIFT)); + } + } - /* - * Note, we always set EOP and RS which indicates that this is the last - * data frame and that we should ask for it to be transmitted. We also - * must always set ICRC, because that is an internal bit that must be - * set to one for data descriptors. The remaining bits in the command - * descriptor depend on checksumming and are determined based on the - * information set up in i40e_tx_context(). - */ - type = I40E_TX_DESC_DTYPE_DATA; - cmd = I40E_TX_DESC_CMD_EOP | - I40E_TX_DESC_CMD_RS | - I40E_TX_DESC_CMD_ICRC | - tctx.itc_cmdflags; - txdesc->buffer_addr = - CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address); - txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | - ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | - ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | - ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + if (do_dma_bind == B_TRUE) { + /* + * Next build up a transmit data descriptor for each buffer. + */ + boolean_t last_desc = B_FALSE; + for (i = 0; i < nbufs; i++) { + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = + tcb_dma[i]; + + for (uint_t c = 0; c < tcb_dma[i]->tcb_bind_ncookies; + c++) { + if (i == (nbufs - 1) && + c == (tcb_dma[i]->tcb_bind_ncookies - 1)) { + last_desc = B_TRUE; + } + i40e_tx_set_data_desc(itrq, &tctx, + &tcb_dma[i]->tcb_bind_info[c], last_desc); + } + } + kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); + tcb_dma = NULL; + } else { + /* + * Build up the single transmit data descriptor needed for the + * non-DMA-bind case. 
+ */ + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb_data; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + type = I40E_TX_DESC_DTYPE_DATA; + cmd = I40E_TX_DESC_CMD_EOP | + I40E_TX_DESC_CMD_RS | + I40E_TX_DESC_CMD_ICRC | + tctx.itc_data_cmdflags; + txdesc->buffer_addr = + CPU_TO_LE64((uintptr_t)tcb_data->tcb_dma.dmab_dma_address); + txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | + ((uint64_t)tctx.itc_data_offsets << + I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)tcb_data->tcb_dma.dmab_len << + I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + } /* * Now, finally, sync the DMA data and alert hardware. @@ -2228,6 +2571,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), itrq->itrq_desc_tail); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != DDI_FM_OK) { /* @@ -2241,7 +2585,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) txs->itxs_bytes.value.ui64 += mpsize; txs->itxs_packets.value.ui64++; - txs->itxs_descriptors.value.ui64++; + txs->itxs_descriptors.value.ui64 += needed_desc; mutex_exit(&itrq->itrq_tx_lock); @@ -2254,10 +2598,25 @@ txfail: * Make sure to reset their message block's, since we'll return them * back to MAC. */ - if (tcb != NULL) { - tcb->tcb_mp = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb_ctx != NULL) { + tcb_ctx->tcb_mp = NULL; + i40e_tcb_reset(tcb_ctx); + i40e_tcb_free(itrq, tcb_ctx); + } + if (tcb_data != NULL) { + tcb_data->tcb_mp = NULL; + i40e_tcb_reset(tcb_data); + i40e_tcb_free(itrq, tcb_data); + } + if (tcb_dma != NULL) { + for (i = 0; i < nbufs; i++) { + if (tcb_dma[i] == NULL) + break; + tcb_dma[i]->tcb_mp = NULL; + i40e_tcb_reset(tcb_dma[i]); + i40e_tcb_free(itrq, tcb_dma[i]); + } + kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); } mutex_enter(&itrq->itrq_tx_lock); diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c index 1c8318b191..55c4159bc4 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -272,8 +273,7 @@ ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req) icmph->icmph_checksum = IP_CSUM(pmtu_mp, (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0); - (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0, - HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, " "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d", @@ -1560,8 +1560,7 @@ ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc) /* * Can RC mode in IB guarantee its checksum correctness? * - * (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, - * HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); */ /* diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c new file mode 100644 index 0000000000..eaa0c33f0f --- /dev/null +++ b/usr/src/uts/common/io/inotify.c @@ -0,0 +1,1555 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. + */ + +/* + * Support for the inotify facility, a Linux-borne facility for asynchronous + * notification of certain events on specified files or directories. Our + * implementation broadly leverages the file event monitoring facility, and + * would actually be quite straightforward were it not for a very serious + * blunder in the inotify interface: in addition to allowing for one to be + * notified on events on a particular file or directory, inotify also allows + * for one to be notified on certain events on files _within_ a watched + * directory -- even though those events have absolutely nothing to do with + * the directory itself. This leads to all sorts of madness because file + * operations are (of course) not undertaken on paths but rather on open + * files -- and the relationships between open files and the paths that resolve + * to those files are neither static nor isomorphic. We implement this + * concept by having _child watches_ when directories are watched with events + * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is + * first added, and we modify those child watches dynamically as files are + * created, deleted, moved into or moved out of the specified directory. This + * mechanism works well, absent hard links. Hard links, unfortunately, break + * this rather badly, and the user is warned that watches on directories that + * have multiple directory entries referring to the same file may behave + * unexpectedly. 
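A concrete illustration of the hard-link caveat above; this scenario follows directly from the fact that child watches are keyed by vnode and record a single name.

/*
 * Example: a watched directory contains entries "a" and "b" that are
 * hard links to the same file.  Only one child watch -- and therefore
 * only one recorded name -- exists for that file, so an event generated
 * through either link is reported under whichever name the child watch
 * happens to hold.
 */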
+ */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + +#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + ddi_periodic_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. 
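Since the limits above are plain module globals (and are copied into each instance at open time), they can presumably be raised persistently with /etc/system set directives; a hypothetical example, values arbitrary:

/*
 *	set inotify:inotify_maxwatches = 32768
 *	set inotify:inotify_maxevents = 65536
 *	set inotify:inotify_maxinstances = 256
 */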
+ */ +static kmutex_t inotify_lock; /* lock protecting state */ +static dev_info_t *inotify_devi; /* device info */ +static fem_t *inotify_femp; /* FEM pointer */ +static vmem_t *inotify_minor; /* minor number arena */ +static void *inotify_softstate; /* softstate pointer */ +static inotify_state_t *inotify_state; /* global list if state */ + +static void inotify_watch_event(inotify_watch_t *, uint64_t, char *); +static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *); +static void inotify_watch_delete(inotify_watch_t *, uint32_t); +static void inotify_watch_remove(inotify_state_t *state, + inotify_watch_t *watch); + +static int +inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset, + cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) { + inotify_watch_event(watch, flag & FWRITE ? + IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL); + } + + return (rval); +} + +static int +inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_create(vf, name, vap, excl, mode, + vpp, cr, flag, ct, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE, name); + } + + return (rval); +} + +static int +inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) { + inotify_watch_insert(watch, svp, tnm); + inotify_watch_event(watch, IN_CREATE, tnm); + } + + return (rval); +} + +static int +inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_mkdir(vf, name, vap, vpp, cr, + ct, flags, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name); + } + + return (rval); +} + +static int +inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_open(vf, mode, cr, ct)) == 0) + inotify_watch_event(watch, IN_OPEN, NULL); + + return (rval); +} + +static int +inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_read(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_ACCESS, NULL); + + return (rval); +} + +static int +inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags); + inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL); + + return (rval); +} + +int +inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE, nm); + + return (rval); +} + +int +inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t 
*ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). + * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. 
+ */ + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_CREATE: + case VE_TRUNCATE: + case VE_RESIZE: + inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL); + break; + case VE_LINK: + inotify_watch_event(watch, IN_ATTRIB, NULL); + break; + case VE_RENAME_SRC_DIR: + inotify_watch_event(watch, IN_MOVED_FROM, name); + break; + case VE_RENAME_DEST_DIR: + if (name == NULL) + name = dvp->v_path; + + inotify_watch_insert(watch, dvp, name); + inotify_watch_event(watch, IN_MOVED_TO, name); + break; + case VE_SUPPORT: + case VE_MOUNTEDOVER: + case VE_PRE_RENAME_SRC: + case VE_PRE_RENAME_DEST: + case VE_PRE_RENAME_DEST_DIR: + break; + } + + return (vnext_vnevent(vf, vnevent, dvp, name, ct)); +} + +const fs_operation_def_t inotify_vnodesrc_template[] = { + VOPNAME_CLOSE, { .femop_close = inotify_fop_close }, + VOPNAME_CREATE, { .femop_create = inotify_fop_create }, + VOPNAME_LINK, { .femop_link = inotify_fop_link }, + VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir }, + VOPNAME_OPEN, { .femop_open = inotify_fop_open }, + VOPNAME_READ, { .femop_read = inotify_fop_read }, + VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir }, + VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove }, + VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir }, + VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr }, + VOPNAME_WRITE, { .femop_write = inotify_fop_write }, + VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent }, + NULL, NULL +}; + +static int +inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + if (lhs->inw_wd < rhs->inw_wd) + return (-1); + + if (lhs->inw_wd > rhs->inw_wd) + return (1); + + return (0); +} + +static int +inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp; + + if (lvp < rvp) + return (-1); + + if (lvp > rvp) + return (1); + + return (0); +} + +static void +inotify_watch_hold(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 0); + watch->inw_refcnt++; + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_release(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 1); + + if (--watch->inw_refcnt == 1 && watch->inw_zombie) { + /* + * We're down to our last reference; kick anyone that might be + * waiting. + */ + cv_signal(&watch->inw_cv); + } + + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name) +{ + inotify_kevent_t *event, *tail; + inotify_state_t *state = watch->inw_state; + uint32_t wd = watch->inw_wd, cookie = 0, len; + boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE; + inotify_watch_t *source = watch; + + if (!(mask &= watch->inw_mask) || mask == IN_ISDIR) + return; + + if (watch->inw_parent != NULL) { + /* + * This is an event on the child; if this isn't a valid child + * event, return. Otherwise, we move our watch to be our + * parent (which we know is around because we have a hold on + * it) and continue. 
+ */ + if (!(mask & IN_CHILD_EVENTS)) + return; + + name = watch->inw_name; + watch = watch->inw_parent; + wd = watch->inw_wd; + } + + if (!removal) { + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || + watch->inw_fired || !watch->inw_active) { + mutex_exit(&state->ins_lock); + return; + } + } else { + if (!watch->inw_active) + return; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + } + + /* + * If this is an operation on a directory and it's a child event + * (event if it's not on a child), we specify IN_ISDIR. + */ + if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS)) + mask |= IN_ISDIR; + + if (mask & (IN_MOVED_FROM | IN_MOVED_TO)) + cookie = (uint32_t)curthread->t_did; + + if (state->ins_nevents >= state->ins_maxevents) { + /* + * We're at our maximum number of events -- turn our event + * into an IN_Q_OVERFLOW event, which will be coalesced if + * it's already the tail event. + */ + mask = IN_Q_OVERFLOW; + wd = (uint32_t)-1; + cookie = 0; + len = 0; + } + + if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd && + tail->ine_event.mask == mask && tail->ine_event.cookie == cookie && + ((tail->ine_event.len == 0 && len == 0) || + (name != NULL && tail->ine_event.len != 0 && + strcmp(tail->ine_event.name, name) == 0))) { + /* + * This is an implicitly coalesced event; we're done. + */ + if (!removal) + mutex_exit(&state->ins_lock); + return; + } + + if (name != NULL) { + /* + * We are in the context of a file event monitoring operation, + * so the name length is bounded by the kernel. + */ + len = strlen(name) + 1; + len = roundup(len, sizeof (struct inotify_event)); + } else { + len = 0; + } + + event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP); + event->ine_event.wd = wd; + event->ine_event.mask = (uint32_t)mask; + event->ine_event.cookie = cookie; + event->ine_event.len = len; + + if (name != NULL) + (void) strcpy(event->ine_event.name, name); + + if (tail != NULL) { + tail->ine_next = event; + } else { + VERIFY(state->ins_head == NULL); + state->ins_head = event; + cv_broadcast(&state->ins_cv); + } + + state->ins_tail = event; + state->ins_nevents++; + state->ins_size += sizeof (event->ine_event) + len; + + if (removal) + return; + + if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) { + /* + * If this is a one-shot, we need to remove the watch. (Note + * that this will recurse back into inotify_watch_event() to + * fire the IN_IGNORED event -- but with "removal" set.) + */ + watch->inw_fired = 1; + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); +} + +/* + * Destroy a watch. By the time we're in here, the watch must have exactly + * one reference. + */ +static void +inotify_watch_destroy(inotify_watch_t *watch) +{ + VERIFY(MUTEX_HELD(&watch->inw_lock)); + + if (watch->inw_name != NULL) + kmem_free(watch->inw_name, strlen(watch->inw_name) + 1); + + kmem_free(watch, sizeof (inotify_watch_t)); +} + +static int +inotify_fem_install(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * For vnodes that are devices (of type VCHR or VBLK), we silently + * refuse to actually install any event monitor. This is to avoid + * single-thread deadlock when both a special device vnode and its + * underlying real vnode are being watched: releasing the device + * vnode upon watch removal can induce an attribute update on the + * underlying vnode, which will bring us into inotify_watch_event() + * with our lock already held. 
While we could fail earlier and more + * explicitly in this case, we choose to keep with the Linux behavior + * on unwatchable entities and allow the watch but not generate any + * events for it. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release)); +} + +static int +inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * See inotify_fem_install(), above, for our rationale here. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_uninstall(vp, inotify_femp, watch)); +} + +/* + * Zombify a watch. By the time we come in here, it must be true that the + * watch has already been fem_uninstall()'d -- the only reference should be + * in the state's data structure. If we can get away with freeing it, we'll + * do that -- but if the reference count is greater than one due to an active + * vnode operation, we'll put this watch on the zombie list on the state + * structure. + */ +static void +inotify_watch_zombify(inotify_watch_t *watch) +{ + inotify_state_t *state = watch->inw_state; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(!watch->inw_zombie); + + watch->inw_zombie = 1; + + if (watch->inw_parent != NULL) { + inotify_watch_release(watch->inw_parent); + } else { + avl_remove(&state->ins_byvp, watch); + avl_remove(&state->ins_bywd, watch); + vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1); + watch->inw_wd = -1; + } + + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + /* + * There are no operations in flight and there is no way + * for anyone to discover this watch -- we can destroy it. + */ + inotify_watch_destroy(watch); + } else { + /* + * There are operations in flight; we will need to enqueue + * this for later destruction. + */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + + /* + * Copy the name. Note that when the name is user-specified, + * its length is bounded by the copyinstr() to be MAXPATHLEN + * (and regardless, we know by this point that it exists in + * our parent). + */ + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = inotify_fem_install(vp, watch); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. 
This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. + */ +static void +inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) +{ + inotify_watch_t *child; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(watch->inw_parent == NULL); + + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + /* + * If we have children, we're going to remove them all and set them + * all to be zombies. + */ + while ((child = avl_first(&watch->inw_children)) != NULL) { + VERIFY(child->inw_parent == watch); + avl_remove(&watch->inw_children, child); + + err = inotify_fem_uninstall(child->inw_vp, child); + VERIFY(err == 0); + + /* + * If this child watch has been orphaned, remove it from the + * state's list of orphans. + */ + if (child->inw_orphaned) { + list_remove(&state->ins_orphans, child); + crfree(child->inw_cred); + } + + VN_RELE(child->inw_vp); + + /* + * We're down (or should be down) to a single reference to + * this child watch; it's safe to zombify it. + */ + inotify_watch_zombify(child); + } + + inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- we know that the only reference + * can come from operations in flight. + */ + inotify_watch_zombify(watch); +} + +/* + * Delete a watch. Should only be called from VOP context. + */ +static void +inotify_watch_delete(inotify_watch_t *watch, uint32_t event) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent; + int err; + + if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie) { + mutex_exit(&state->ins_lock); + return; + } + + if ((parent = watch->inw_parent) == NULL) { + if (event == IN_DELETE_SELF) { + /* + * If we're here because we're being deleted and we + * are not a child watch, we need to delete the entire + * watch, children and all. + */ + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + return; + } else { + if (event == IN_DELETE_SELF && + !(parent->inw_mask & IN_EXCL_UNLINK)) { + /* + * This is a child watch for a file that is being + * removed and IN_EXCL_UNLINK has not been specified; + * indicate that it is orphaned and add it to the list + * of orphans. (This list will be checked by the + * cleaning cyclic to determine when the watch has + * become the only hold on the vnode, at which point + * the watch can be zombified.) Note that we check + * if the watch is orphaned before we orphan it: hard + * links make it possible for VE_REMOVE to be called + * multiple times on the same vnode. (!) + */ + if (!watch->inw_orphaned) { + watch->inw_orphaned = 1; + watch->inw_cred = CRED(); + crhold(watch->inw_cred); + list_insert_head(&state->ins_orphans, watch); + } + + mutex_exit(&state->ins_lock); + return; + } + + if (watch->inw_orphaned) { + /* + * If we're here, a file was orphaned and then later + * moved -- which almost certainly means that hard + * links are on the scene. We choose the orphan over + * the move because we don't want to spuriously + * drop events if we can avoid it. + */ + crfree(watch->inw_cred); + list_remove(&state->ins_orphans, watch); + } + } + + if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) { + /* + * This watch has already been deleted from the parent. 
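A summary of the watch lifecycle implemented by the routines above, for orientation; no new behavior is described, every step refers to code in this file.

/*
 * Lifecycle of a watch:
 *
 *   create    inotify_watch_add() allocates the watch with inw_refcnt = 1
 *             and installs the FEM monitor, which immediately takes a
 *             hold for as long as it remains installed (device vnodes
 *             are the exception: no monitor is installed on VCHR/VBLK).
 *
 *   active    the VOP wrappers call inotify_watch_event(); every
 *             in-flight operation holds the watch for its duration.
 *
 *   remove    inotify_watch_remove()/inotify_watch_delete() uninstall
 *             the monitor, release the vnode and call
 *             inotify_watch_zombify().  A top-level watch is unhooked
 *             from ins_byvp/ins_bywd and its watch descriptor freed; a
 *             child watch instead drops its hold on its parent.
 *
 *   destroy   if the data structure's own reference is the last one,
 *             the watch is freed on the spot; otherwise it is parked on
 *             ins_zombies (chained through inw_parent) and reaped once
 *             in-flight operations drain, either by the periodic
 *             inotify_clean() or at close time.
 */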
+ */ + mutex_exit(&state->ins_lock); + return; + } + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- which won't actually delete + * it as we know that the reference count is greater than 1. + */ + inotify_watch_zombify(watch); + mutex_exit(&state->ins_lock); +} + +/* + * Insert a new child watch. Should only be called from VOP context when + * a child is created in a watched directory. + */ +static void +inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = vp }; + + if (!(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) { + mutex_exit(&state->ins_lock); + return; + } + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + return; + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); + VERIFY(watch != NULL); + + mutex_exit(&state->ins_lock); +} + + +static int +inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, + int32_t *wdp) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + uint32_t set; + + set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE; + + /* + * Lookup our vnode to determine if we already have a watch on it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * We don't have this watch; allocate a new one, provided that + * we have fewer than our limit. + */ + if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) { + mutex_exit(&state->ins_lock); + return (ENOSPC); + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, NULL, NULL, vp, set); + *wdp = watch->inw_wd; + mutex_exit(&state->ins_lock); + + return (0); + } + + VERIFY(!watch->inw_zombie); + + if (!(mask & IN_MASK_ADD)) { + /* + * Note that if we're resetting our event mask and we're + * transitioning from an event mask that includes child events + * to one that doesn't, there will be potentially some stale + * child watches. This is basically fine: they won't fire, + * and they will correctly be removed when the watch is + * removed. + */ + watch->inw_mask = 0; + } + + watch->inw_mask |= set; + + *wdp = watch->inw_wd; + + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + vnode_t *cvp; + int err; + + /* + * Verify that the specified child doesn't have a directory component + * within it. + */ + if (strchr(name, '/') != NULL) + return (EINVAL); + + /* + * Lookup the underlying file. Note that this will succeed even if + * we don't have permissions to actually read the file. + */ + if ((err = lookupnameat(name, + UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) { + return (err); + } + + /* + * Use our vnode to find our watch, and then add our child watch to it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * This is unexpected -- it means that we don't have the + * watch that we thought we had. + */ + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (ENXIO); + } + + /* + * Now lookup the child vnode in the watch; we'll only add it if it + * isn't already there. 
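The ioctl handled here suggests how pre-existing directory entries get their child watches: the driver only adds them dynamically as entries are created, linked or renamed in, so presumably the user-level consumer walks the directory once and issues INOTIFYIOC_ADD_CHILD per entry. The sketch below is that assumed pattern, not something this file dictates; the structure and ioctl names are the ones defined for this driver.

#include <dirent.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>

/* Hypothetical front-end helper -- illustrative only. */
static void
prime_children(int nfd, int dfd, const char *dir)
{
	DIR *dp = opendir(dir);
	struct dirent *de;
	inotify_addchild_t ac;

	while (dp != NULL && (de = readdir(dp)) != NULL) {
		if (strcmp(de->d_name, ".") == 0 ||
		    strcmp(de->d_name, "..") == 0)
			continue;

		ac.inac_fd = dfd;	/* fd of the watched directory */
		ac.inac_name = de->d_name;
		(void) ioctl(nfd, INOTIFYIOC_ADD_CHILD, &ac);
	}

	if (dp != NULL)
		(void) closedir(dp);
}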
+ */ + cmp.inw_vp = cvp; + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (0); + } + + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); + VERIFY(watch != NULL); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_rm_watch(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + inotify_watch_remove(state, watch); + mutex_exit(&state->ins_lock); + + /* + * Because removing a watch will generate an IN_IGNORED event (and + * because inotify_watch_remove() won't alone induce a pollwakeup()), + * we need to explicitly issue a pollwakeup(). + */ + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); + + return (0); +} + +static int +inotify_activate(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + watch->inw_active = 1; + + mutex_exit(&state->ins_lock); + + return (0); +} + +/* + * Called periodically as a cyclic to process the orphans and zombies. + */ +static void +inotify_clean(void *arg) +{ + inotify_state_t *state = arg; + inotify_watch_t *watch, *parent, *next, **prev; + cred_t *savecred; + int err; + + mutex_enter(&state->ins_lock); + + for (watch = list_head(&state->ins_orphans); + watch != NULL; watch = next) { + next = list_next(&state->ins_orphans, watch); + + VERIFY(!watch->inw_zombie); + VERIFY((parent = watch->inw_parent) != NULL); + + if (watch->inw_vp->v_count > 1) + continue; + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + list_remove(&state->ins_orphans, watch); + + /* + * For purposes of releasing the vnode, we need to switch our + * cred to be the cred of the orphaning thread (which we held + * at the time this watch was orphaned). 
+ */ + savecred = curthread->t_cred; + curthread->t_cred = watch->inw_cred; + VN_RELE(watch->inw_vp); + crfree(watch->inw_cred); + curthread->t_cred = savecred; + + inotify_watch_zombify(watch); + } + + prev = &state->ins_zombies; + + while ((watch = *prev) != NULL) { + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + *prev = watch->inw_parent; + inotify_watch_destroy(watch); + continue; + } + + prev = &watch->inw_parent; + mutex_exit(&watch->inw_lock); + } + + mutex_exit(&state->ins_lock); +} + +/*ARGSUSED*/ +static int +inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + int instances = 0; + char c[64]; + + if (minor != INOTIFYMNRN_INOTIFY) + return (ENXIO); + + mutex_enter(&inotify_lock); + + for (state = inotify_state; state != NULL; state = state->ins_next) { + if (state->ins_cred == cred_p) + instances++; + } + + if (instances >= inotify_maxinstances) { + mutex_exit(&inotify_lock); + return (EMFILE); + } + + minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) { + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&inotify_lock); + return (NULL); + } + + state = ddi_get_soft_state(inotify_softstate, minor); + *devp = makedevice(major, minor); + + crhold(cred_p); + state->ins_cred = cred_p; + state->ins_next = inotify_state; + inotify_state = state; + + (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor); + state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + avl_create(&state->ins_bywd, + (int(*)(const void *, const void *))inotify_watch_cmpwd, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_bywd)); + + avl_create(&state->ins_byvp, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + + list_create(&state->ins_orphans, sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_orphan)); + + state->ins_maxwatches = inotify_maxwatches; + state->ins_maxevents = inotify_maxevents; + + mutex_exit(&inotify_lock); + + state->ins_cleaner = ddi_periodic_add(inotify_clean, + state, NANOSEC, DDI_IPL_0); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + inotify_state_t *state; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + int err = 0, nevents = 0; + size_t len; + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + while (state->ins_head == NULL) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->ins_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) { + mutex_exit(&state->ins_lock); + return (EINTR); + } + } + + /* + * We have events and we have our lock; return as many as we can. 
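Putting the device interface together from the consumer side: the sketch below is hedged and illustrative only. It assumes the device node is /dev/inotify and that, as the ioctl handler later in this file suggests, INOTIFYIOC_ADD_WATCH hands the watch descriptor back as the ioctl return value. The structure, ioctl and flag names are the ones defined for this driver, and the read loop walks the variable-length records exactly as they are copied out here: a fixed struct inotify_event header followed by len bytes of name.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical consumer -- illustrative only, error handling omitted. */
static void
watch_and_drain(const char *dir)
{
	int nfd = open("/dev/inotify", O_RDONLY);
	int dfd = open(dir, O_RDONLY);
	inotify_addwatch_t aw = { 0 };
	char buf[8192];
	ssize_t n;
	int wd;

	aw.inaw_fd = dfd;
	aw.inaw_mask = IN_CREATE | IN_DELETE | IN_MODIFY;

	/* The watch descriptor comes back as the ioctl return value. */
	wd = ioctl(nfd, INOTIFYIOC_ADD_WATCH, &aw);

	/* No events are queued for the watch until it is activated. */
	(void) ioctl(nfd, INOTIFYIOC_ACTIVATE, wd);

	while ((n = read(nfd, buf, sizeof (buf))) > 0) {
		char *p = buf;

		while (p < buf + n) {
			struct inotify_event *ev =
			    (struct inotify_event *)(void *)p;

			(void) printf("wd %d mask 0x%x name %s\n",
			    ev->wd, ev->mask,
			    ev->len != 0 ? ev->name : "");

			/* Each record is a fixed header plus len bytes. */
			p += sizeof (struct inotify_event) + ev->len;
		}
	}

	(void) close(dfd);
	(void) close(nfd);
}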
+ */ + while ((event = state->ins_head) != NULL) { + len = sizeof (event->ine_event) + event->ine_event.len; + + if (uio->uio_resid < len) { + if (nevents == 0) + err = EINVAL; + break; + } + + nevents++; + + if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0) + break; + + VERIFY(state->ins_nevents > 0); + state->ins_nevents--; + + VERIFY(state->ins_size > 0); + state->ins_size -= len; + + if ((state->ins_head = event->ine_next) == NULL) { + VERIFY(event == state->ins_tail); + VERIFY(state->ins_nevents == 0); + state->ins_tail = NULL; + } + + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + mutex_exit(&state->ins_lock); + + return (err); +} + +static int +inotify_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + if (state->ins_head != NULL) { + *reventsp = events & (POLLRDNORM | POLLIN); + } else { + *reventsp = 0; + } + + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &state->ins_pollhd; + } + + mutex_exit(&state->ins_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + file_t *fp; + int rval; + + state = ddi_get_soft_state(inotify_softstate, minor); + + switch (cmd) { + case INOTIFYIOC_ADD_WATCH: { + inotify_addwatch_t addwatch; + file_t *fp; + + if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0) + return (EFAULT); + + if ((fp = getf(addwatch.inaw_fd)) == NULL) + return (EBADF); + + rval = inotify_add_watch(state, fp->f_vnode, + addwatch.inaw_mask, rv); + + releasef(addwatch.inaw_fd); + return (rval); + } + + case INOTIFYIOC_ADD_CHILD: { + inotify_addchild_t addchild; + char name[MAXPATHLEN]; + + if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0) + return (EFAULT); + + if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0) + return (EFAULT); + + if ((fp = getf(addchild.inac_fd)) == NULL) + return (EBADF); + + rval = inotify_add_child(state, fp->f_vnode, name); + + releasef(addchild.inac_fd); + return (rval); + } + + case INOTIFYIOC_RM_WATCH: + return (inotify_rm_watch(state, arg)); + + case INOTIFYIOC_ACTIVATE: + return (inotify_activate(state, arg)); + + case FIONREAD: { + int32_t size; + + mutex_enter(&state->ins_lock); + size = state->ins_size; + mutex_exit(&state->ins_lock); + + if (copyout(&size, (void *)arg, sizeof (size)) != 0) + return (EFAULT); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state, **sp; + inotify_watch_t *watch, *zombies; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + if (state->ins_pollhd.ph_list != NULL) { + pollwakeup(&state->ins_pollhd, POLLERR); + pollhead_clean(&state->ins_pollhd); + } + + mutex_enter(&state->ins_lock); + + /* + * First, destroy all of our watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. 
+ */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. + */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + if (state->ins_cleaner != NULL) { + ddi_periodic_delete(state->ins_cleaner); + state->ins_cleaner = NULL; + } + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + vmem_destroy(state->ins_wds); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + 
D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c index c10e23a8a6..cde57df235 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c @@ -996,17 +996,20 @@ static s32 ixgbe_clear_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq) * @vlan: VLAN id to write to VLAN filter * @vind: VMDq output index that maps queue to VLAN id in VFTA * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vlvf_bypass: boolean flag - unused * * Turn on/off specified VLAN in the VLAN filter table. 
**/ s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { u32 regindex; u32 bitindex; u32 bits; u32 vftabyte; + UNREFERENCED_1PARAMETER(vlvf_bypass); + DEBUGFUNC("ixgbe_set_vfta_82598"); if (vlan > 4095) diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h index d2241c70cd..c32672187a 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h @@ -40,7 +40,8 @@ s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw); s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw); void ixgbe_enable_relaxed_ordering_82598(struct ixgbe_hw *hw); s32 ixgbe_set_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq); -s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass); s32 ixgbe_read_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 *val); s32 ixgbe_write_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 val); s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c index 894d0b2ac9..c550982710 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c @@ -1057,33 +1057,38 @@ s32 ixgbe_clear_vfta(struct ixgbe_hw *hw) * ixgbe_set_vfta - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFTA - * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vfta, (hw, vlan, vind, - vlan_on), IXGBE_NOT_IMPLEMENTED); + vlan_on, vlvf_bypass), IXGBE_NOT_IMPLEMENTED); } /** * ixgbe_set_vlvf - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified bit in VLVF table. 
**/ s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, - bool *vfta_changed) + u32 *vfta_delta, u32 vfta, bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vlvf, (hw, vlan, vind, - vlan_on, vfta_changed), IXGBE_NOT_IMPLEMENTED); + vlan_on, vfta_delta, vfta, vlvf_bypass), + IXGBE_NOT_IMPLEMENTED); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h index 24d507039d..3bee89e45e 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h @@ -125,9 +125,10 @@ s32 ixgbe_enable_mc(struct ixgbe_hw *hw); s32 ixgbe_disable_mc(struct ixgbe_hw *hw); s32 ixgbe_clear_vfta(struct ixgbe_hw *hw); s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_fc_enable(struct ixgbe_hw *hw); s32 ixgbe_setup_fc(struct ixgbe_hw *hw); s32 ixgbe_set_fw_drv_ver(struct ixgbe_hw *hw, u8 maj, u8 min, u8 build, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c index f342eee637..656534862c 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c @@ -3810,68 +3810,65 @@ s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw) * return the VLVF index where this VLAN id should be placed * **/ -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan) +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass) { - u32 bits = 0; - u32 first_empty_slot = 0; - s32 regindex; + s32 regindex, first_empty_slot; + u32 bits; /* short cut the special case */ if (vlan == 0) return 0; - /* - * Search for the vlan id in the VLVF entries. Save off the first empty - * slot found along the way - */ - for (regindex = 1; regindex < IXGBE_VLVF_ENTRIES; regindex++) { + /* if vlvf_bypass is set we don't want to use an empty slot, we + * will simply bypass the VLVF if there are no entries present in the + * VLVF that contain our VLAN + */ + first_empty_slot = vlvf_bypass ? IXGBE_ERR_NO_SPACE : 0; + + /* add VLAN enable bit for comparison */ + vlan |= IXGBE_VLVF_VIEN; + + /* Search for the vlan id in the VLVF entries. Save off the first empty + * slot found along the way. + * + * pre-decrement loop covering (IXGBE_VLVF_ENTRIES - 1) .. 1 + */ + for (regindex = IXGBE_VLVF_ENTRIES; --regindex;) { bits = IXGBE_READ_REG(hw, IXGBE_VLVF(regindex)); - if (!bits && !(first_empty_slot)) + if (bits == vlan) + return regindex; + if (!first_empty_slot && !bits) first_empty_slot = regindex; - else if ((bits & 0x0FFF) == vlan) - break; } - /* - * If regindex is less than IXGBE_VLVF_ENTRIES, then we found the vlan - * in the VLVF. Else use the first empty VLVF register for this - * vlan id. - */ - if (regindex >= IXGBE_VLVF_ENTRIES) { - if (first_empty_slot) - regindex = first_empty_slot; - else { - ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, - "No space in VLVF.\n"); - regindex = IXGBE_ERR_NO_SPACE; - } - } + /* If we are here then we didn't find the VLAN. Return first empty + * slot we found during our search, else error. + */ + if (!first_empty_slot) + ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, "No space in VLVF.\n"); - return regindex; + return first_empty_slot ? 
first_empty_slot : IXGBE_ERR_NO_SPACE; } /** * ixgbe_set_vfta_generic - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { - s32 regindex; - u32 bitindex; - u32 vfta; - u32 targetbit; - s32 ret_val = IXGBE_SUCCESS; - bool vfta_changed = FALSE; + u32 regidx, vfta_delta, vfta; + s32 ret_val; DEBUGFUNC("ixgbe_set_vfta_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* @@ -3886,33 +3883,33 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * bits[11-5]: which register * bits[4-0]: which bit in the register */ - regindex = (vlan >> 5) & 0x7F; - bitindex = vlan & 0x1F; - targetbit = (1 << bitindex); - vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regindex)); - - if (vlan_on) { - if (!(vfta & targetbit)) { - vfta |= targetbit; - vfta_changed = TRUE; - } - } else { - if ((vfta & targetbit)) { - vfta &= ~targetbit; - vfta_changed = TRUE; - } - } + regidx = vlan / 32; + vfta_delta = 1 << (vlan % 32); + vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regidx)); + + /* + * vfta_delta represents the difference between the current value + * of vfta and the value we want in the register. Since the diff + * is an XOR mask we can just update the vfta using an XOR + */ + vfta_delta &= vlan_on ? ~vfta : vfta; + vfta ^= vfta_delta; /* Part 2 * Call ixgbe_set_vlvf_generic to set VLVFB and VLVF */ - ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, - &vfta_changed); - if (ret_val != IXGBE_SUCCESS) + ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, &vfta_delta, + vfta, vlvf_bypass); + if (ret_val != IXGBE_SUCCESS) { + if (vlvf_bypass) + goto vfta_update; return ret_val; + } - if (vfta_changed) - IXGBE_WRITE_REG(hw, IXGBE_VFTA(regindex), vfta); +vfta_update: + /* Update VFTA now that we are ready for traffic */ + if (vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(regidx), vfta); return IXGBE_SUCCESS; } @@ -3921,21 +3918,25 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * ixgbe_set_vlvf_generic - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified bit in VLVF table. 
**/ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed) + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass) { - u32 vt; + u32 bits; + s32 vlvf_index; DEBUGFUNC("ixgbe_set_vlvf_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* If VT Mode is set @@ -3945,83 +3946,60 @@ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * Or !vlan_on * clear the pool bit and possibly the vind */ - vt = IXGBE_READ_REG(hw, IXGBE_VT_CTL); - if (vt & IXGBE_VT_CTL_VT_ENABLE) { - s32 vlvf_index; - u32 bits; - - vlvf_index = ixgbe_find_vlvf_slot(hw, vlan); - if (vlvf_index < 0) - return vlvf_index; - - if (vlan_on) { - /* set the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits |= (1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits |= (1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - } - } else { - /* clear the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits &= ~(1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits &= ~(1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - } - } + if (!(IXGBE_READ_REG(hw, IXGBE_VT_CTL) & IXGBE_VT_CTL_VT_ENABLE)) + return IXGBE_SUCCESS; - /* - * If there are still bits set in the VLVFB registers - * for the VLAN ID indicated we need to see if the - * caller is requesting that we clear the VFTA entry bit. - * If the caller has requested that we clear the VFTA - * entry bit but there are still pools/VFs using this VLAN - * ID entry then ignore the request. We're not worried - * about the case where we're turning the VFTA VLAN ID - * entry bit on, only when requested to turn it off as - * there may be multiple pools and/or VFs using the - * VLAN ID entry. In that case we cannot clear the - * VFTA bit until all pools/VFs using that VLAN ID have also - * been cleared. This will be indicated by "bits" being - * zero. + vlvf_index = ixgbe_find_vlvf_slot(hw, vlan, vlvf_bypass); + if (vlvf_index < 0) + return vlvf_index; + + bits = IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32)); + + /* set the pool bit */ + bits |= 1 << (vind % 32); + if (vlan_on) + goto vlvf_update; + + /* clear the pool bit */ + bits ^= 1 << (vind % 32); + + if (!bits && + !IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + 1 - vind / 32))) { + /* Clear VFTA first, then disable VLVF. Otherwise + * we run the risk of stray packets leaking into + * the PF via the default pool */ - if (bits) { - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), - (IXGBE_VLVF_VIEN | vlan)); - if ((!vlan_on) && (vfta_changed != NULL)) { - /* someone wants to clear the vfta entry - * but some pools/VFs are still using it. - * Ignore it. 
*/ - *vfta_changed = FALSE; - } - } else - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + if (*vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(vlan / 32), vfta); + + /* disable VLVF and clear remaining bit from pool */ + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), 0); + + return IXGBE_SUCCESS; } + /* If there are still bits set in the VLVFB registers + * for the VLAN ID indicated we need to see if the + * caller is requesting that we clear the VFTA entry bit. + * If the caller has requested that we clear the VFTA + * entry bit but there are still pools/VFs using this VLAN + * ID entry then ignore the request. We're not worried + * about the case where we're turning the VFTA VLAN ID + * entry bit on, only when requested to turn it off as + * there may be multiple pools and/or VFs using the + * VLAN ID entry. In that case we cannot clear the + * VFTA bit until all pools/VFs using that VLAN ID have also + * been cleared. This will be indicated by "bits" being + * zero. + */ + *vfta_delta = 0; + +vlvf_update: + /* record pool change and enable VLAN ID if not already enabled */ + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), bits); + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), IXGBE_VLVF_VIEN | vlan); + return IXGBE_SUCCESS; } diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h index 069fc88c96..bd18e96f82 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h @@ -135,11 +135,12 @@ s32 ixgbe_clear_vmdq_generic(struct ixgbe_hw *hw, u32 rar, u32 vmdq); s32 ixgbe_insert_mac_addr_generic(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw); s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_clear_vfta_generic(struct ixgbe_hw *hw); -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan); +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass); s32 ixgbe_check_mac_link_generic(struct ixgbe_hw *hw, ixgbe_link_speed *speed, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h index 45e8a7d029..9231979ff7 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h @@ -3715,8 +3715,9 @@ struct ixgbe_mac_operations { s32 (*enable_mc)(struct ixgbe_hw *); s32 (*disable_mc)(struct ixgbe_hw *); s32 (*clear_vfta)(struct ixgbe_hw *); - s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool); - s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, bool *); + s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool, bool); + s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, u32 *, u32, + bool); s32 (*init_uta_tables)(struct ixgbe_hw *); void (*set_mac_anti_spoofing)(struct ixgbe_hw *, bool, int); void (*set_vlan_anti_spoofing)(struct ixgbe_hw *, bool, int); diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c index 2ce4d32a30..66d836eb8f 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c @@ -321,15 +321,16 @@ static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr) return vector; } -static void 
ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, - u32 *msg, u16 size) +static s32 ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, u32 *msg, + u32 *retmsg, u16 size) { struct ixgbe_mbx_info *mbx = &hw->mbx; - u32 retmsg[IXGBE_VFMAILBOX_SIZE]; s32 retval = mbx->ops.write_posted(hw, msg, size, 0); - if (!retval) - mbx->ops.read_posted(hw, retmsg, size, 0); + if (retval) + return retval; + + return mbx->ops.read_posted(hw, retmsg, size, 0); } /** @@ -415,29 +416,29 @@ s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, return mbx->ops.write_posted(hw, msgbuf, IXGBE_VFMAILBOX_SIZE, 0); } -/** +/* * ixgbe_set_vfta_vf - Set/Unset vlan filter table address * @hw: pointer to the HW structure * @vlan: 12 bit VLAN ID * @vind: unused by VF drivers * @vlan_on: if TRUE then set bit, else clear bit + * @vlvf_bypass: boolean flag indicating updating default pool is okay + * + * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass) { - struct ixgbe_mbx_info *mbx = &hw->mbx; u32 msgbuf[2]; s32 ret_val; - UNREFERENCED_1PARAMETER(vind); + UNREFERENCED_2PARAMETER(vind, vlvf_bypass); msgbuf[0] = IXGBE_VF_SET_VLAN; msgbuf[1] = vlan; /* Setting the 8 bit field MSG INFO to TRUE indicates "add" */ msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; - ret_val = mbx->ops.write_posted(hw, msgbuf, 2, 0); - if (!ret_val) - ret_val = mbx->ops.read_posted(hw, msgbuf, 1, 0); - + ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); if (!ret_val && (msgbuf[0] & IXGBE_VT_MSGTYPE_ACK)) return IXGBE_SUCCESS; @@ -628,7 +629,7 @@ void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[0] = IXGBE_VF_SET_LPE; msgbuf[1] = max_size; - ixgbevf_write_msg_read_ack(hw, msgbuf, 2); + ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h index edc801367d..e9b8dc34ae 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h @@ -132,7 +132,8 @@ s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr); s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr, bool clear); -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass); void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size); int ixgbevf_negotiate_api_version(struct ixgbe_hw *hw, int api); int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 41464050bc..82c9bcdc36 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2013 OSN Online Service Nuernberg GmbH. All rights reserved. 
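As an aside on the VFTA arithmetic introduced in the ixgbe_set_vfta_generic() hunk above: a 12-bit VLAN ID selects one bit in one of 128 32-bit VFTA registers (regidx = vlan / 32, bit = vlan % 32), and the update is carried as an XOR delta so the register bit is flipped only when it actually needs to change. Below is a minimal standalone sketch of that bookkeeping; it assumes nothing about the driver, and vfta_shadow/vfta_set are illustrative names, not ixgbe code.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Software stand-in for the 128 VFTA registers (4096 VLAN IDs / 32 bits). */
static uint32_t vfta_shadow[128];

static void
vfta_set(uint32_t vlan, bool vlan_on)
{
        uint32_t regidx = vlan / 32;            /* which VFTA register */
        uint32_t delta = 1u << (vlan % 32);     /* which bit within it */

        /*
         * Keep the delta only when the bit must change: enabling
         * requires it to be clear, disabling requires it to be set.
         */
        delta &= vlan_on ? ~vfta_shadow[regidx] : vfta_shadow[regidx];
        vfta_shadow[regidx] ^= delta;           /* XOR applies the change, if any */
}

int
main(void)
{
        vfta_set(100, true);    /* VLAN 100 -> register 3, bit 4 */
        vfta_set(100, true);    /* a repeated enable is a no-op */
        printf("VFTA[3] = 0x%08x\n", vfta_shadow[3]);   /* prints 0x00000010 */
        vfta_set(100, false);
        printf("VFTA[3] = 0x%08x\n", vfta_shadow[3]);   /* prints 0x00000000 */
        return (0);
}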
@@ -57,8 +57,8 @@ static int ixgbe_alloc_rings(ixgbe_t *); static void ixgbe_free_rings(ixgbe_t *); static int ixgbe_alloc_rx_data(ixgbe_t *); static void ixgbe_free_rx_data(ixgbe_t *); -static void ixgbe_setup_rings(ixgbe_t *); -static void ixgbe_setup_rx(ixgbe_t *); +static int ixgbe_setup_rings(ixgbe_t *); +static int ixgbe_setup_rx(ixgbe_t *); static void ixgbe_setup_tx(ixgbe_t *); static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *); static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *); @@ -67,6 +67,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *); static void ixgbe_setup_vmdq_rss(ixgbe_t *); static void ixgbe_setup_rss_table(ixgbe_t *); static void ixgbe_init_unicst(ixgbe_t *); +static int ixgbe_init_vlan(ixgbe_t *); static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *); static void ixgbe_setup_multicst(ixgbe_t *); static void ixgbe_get_hw_state(ixgbe_t *); @@ -113,6 +114,8 @@ static void ixgbe_intr_other_work(ixgbe_t *, uint32_t); static void ixgbe_get_driver_control(struct ixgbe_hw *); static int ixgbe_addmac(void *, const uint8_t *); static int ixgbe_remmac(void *, const uint8_t *); +static int ixgbe_addvlan(mac_group_driver_t, uint16_t); +static int ixgbe_remvlan(mac_group_driver_t, uint16_t); static void ixgbe_release_driver_control(struct ixgbe_hw *); static int ixgbe_attach(dev_info_t *, ddi_attach_cmd_t); @@ -273,7 +276,7 @@ static adapter_info_t ixgbe_82599eb_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -304,7 +307,7 @@ static adapter_info_t ixgbe_X540_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -1149,6 +1152,8 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe) rx_group = &ixgbe->rx_groups[i]; rx_group->index = i; rx_group->ixgbe = ixgbe; + list_create(&rx_group->vlans, sizeof (ixgbe_vlan_t), + offsetof(ixgbe_vlan_t, ixvl_link)); } for (i = 0; i < ixgbe->num_tx_rings; i++) { @@ -1898,7 +1903,8 @@ ixgbe_start(ixgbe_t *ixgbe, boolean_t alloc_buffer) /* * Setup the rx/tx rings */ - ixgbe_setup_rings(ixgbe); + if (ixgbe_setup_rings(ixgbe) != IXGBE_SUCCESS) + goto start_failure; /* * ixgbe_start() will be called when resetting, however if reset @@ -1999,6 +2005,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -2012,7 +2019,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; @@ -2271,6 +2279,16 @@ ixgbe_free_rings(ixgbe_t *ixgbe) ixgbe->tx_rings = NULL; } + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + ixgbe_vlan_t *vlp; + ixgbe_rx_group_t *rx_group = &ixgbe->rx_groups[i]; + + while ((vlp = list_remove_head(&rx_group->vlans)) != NULL) + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + list_destroy(&rx_group->vlans); + } + if (ixgbe->rx_groups != NULL) { 
kmem_free(ixgbe->rx_groups, sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups); @@ -2325,7 +2343,7 @@ ixgbe_free_rx_data(ixgbe_t *ixgbe) /* * ixgbe_setup_rings - Setup rx/tx rings. */ -static void +static int ixgbe_setup_rings(ixgbe_t *ixgbe) { /* @@ -2335,9 +2353,12 @@ ixgbe_setup_rings(ixgbe_t *ixgbe) * 2. Initialize necessary registers for receive/transmit; * 3. Initialize software pointers/parameters for receive/transmit; */ - ixgbe_setup_rx(ixgbe); + if (ixgbe_setup_rx(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); ixgbe_setup_tx(ixgbe); + + return (IXGBE_SUCCESS); } static void @@ -2423,7 +2444,7 @@ ixgbe_setup_rx_ring(ixgbe_rx_ring_t *rx_ring) IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rx_ring->hw_index), reg_val); } -static void +static int ixgbe_setup_rx(ixgbe_t *ixgbe) { ixgbe_rx_ring_t *rx_ring; @@ -2517,6 +2538,15 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) } /* + * Initialize VLAN SW and HW state if VLAN filtering is + * enabled. + */ + if (ixgbe->vlft_enabled) { + if (ixgbe_init_vlan(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); + } + + /* * Enable the receive unit. This must be done after filter * control is set in FCTRL. On 82598, we disable the descriptor monitor. * 82598 is the only adapter which defines this RXCTRL option. @@ -2598,6 +2628,8 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg_val); } + + return (IXGBE_SUCCESS); } static void @@ -2829,7 +2861,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; - uint32_t vmdctl, i, vtctl; + uint32_t vmdctl, i, vtctl, vlnctl; /* * Setup the VMDq Control register, enable VMDq based on @@ -2864,10 +2896,20 @@ ixgbe_setup_vmdq(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ - vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -2887,7 +2929,7 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; uint32_t i, mrqc; - uint32_t vtctl, vmdctl; + uint32_t vtctl, vmdctl, vlnctl; /* * Initialize RETA/ERETA table @@ -2969,10 +3011,21 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -3143,6 +3196,53 @@ ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr) } /* + * Restore the HW state to match the SW state during restart. + */ +static int +ixgbe_init_vlan(ixgbe_t *ixgbe) +{ + /* + * The device is starting for the first time; there is nothing + * to do. 
+ */ + if (!ixgbe->vlft_init) { + ixgbe->vlft_init = B_TRUE; + return (IXGBE_SUCCESS); + } + + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + int ret; + boolean_t vlvf_bypass; + ixgbe_rx_group_t *rxg = &ixgbe->rx_groups[i]; + struct ixgbe_hw *hw = &ixgbe->hw; + + if (rxg->aupe) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, IXGBE_VMOLR(rxg->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rxg->index), vml2flt); + } + + vlvf_bypass = (rxg->index == ixgbe->rx_def_group); + for (ixgbe_vlan_t *vlp = list_head(&rxg->vlans); vlp != NULL; + vlp = list_next(&rxg->vlans, vlp)) { + ret = ixgbe_set_vfta(hw, vlp->ixvl_vid, rxg->index, + B_TRUE, vlvf_bypass); + + if (ret != IXGBE_SUCCESS) { + ixgbe_error(ixgbe, "Failed to program VFTA" + " for group %u, VID: %u, ret: %d.", + rxg->index, vlp->ixvl_vid, ret); + return (IXGBE_FAILURE); + } + } + } + + return (IXGBE_SUCCESS); +} + +/* * ixgbe_multicst_add - Add a multicst address. */ int @@ -6152,6 +6252,7 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { ixgbe_t *ixgbe = (ixgbe_t *)arg; + struct ixgbe_hw *hw = &ixgbe->hw; switch (rtype) { case MAC_RING_TYPE_RX: { @@ -6165,6 +6266,20 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_stop = NULL; infop->mgi_addmac = ixgbe_addmac; infop->mgi_remmac = ixgbe_remmac; + + if ((ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ || + ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ_RSS) && + (hw->mac.type == ixgbe_mac_82599EB || + hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X550EM_x)) { + infop->mgi_addvlan = ixgbe_addvlan; + infop->mgi_remvlan = ixgbe_remvlan; + } else { + infop->mgi_addvlan = NULL; + infop->mgi_remvlan = NULL; + } + infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups); break; @@ -6264,6 +6379,228 @@ ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh) return (0); } +static ixgbe_vlan_t * +ixgbe_find_vlan(ixgbe_rx_group_t *rx_group, uint16_t vid) +{ + for (ixgbe_vlan_t *vlp = list_head(&rx_group->vlans); vlp != NULL; + vlp = list_next(&rx_group->vlans, vlp)) { + if (vlp->ixvl_vid == vid) + return (vlp); + } + + return (NULL); +} + +/* + * Attempt to use a VLAN HW filter for this group. If the group is + * interested in untagged packets then set AUPE only. If the group is + * the default then only set the VFTA. Leave the VLVF slots open for + * reserved groups to guarantee their use of HW filtering. + */ +static int +ixgbe_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + ixgbe_vlan_t *vlp; + int ret; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + /* + * Let's be sure VLAN filtering is enabled. + */ + VERIFY3B(ixgbe->vlft_enabled, ==, B_TRUE); + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* + * VLAN filtering is enabled but we want to receive untagged + * traffic on this group -- set the AUPE bit on the group and + * leave the VLAN tables alone. + */ + if (vid == MAC_VLAN_UNTAGGED) { + /* + * We never enable AUPE on the default group; it is + * redundant. Untagged traffic which passes L2 + * filtering is delivered to the default group if no + * other group is interested. 
+ */ + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rx_group->index), + vml2flt); + rx_group->aupe = B_TRUE; + } + + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp != NULL) { + /* Only the default group supports multiple clients. */ + VERIFY3B(is_def_grp, ==, B_TRUE); + vlp->ixvl_refs++; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * The default group doesn't require a VLVF entry, only a VFTA + * entry. All traffic passing L2 filtering (MPSAR + VFTA) is + * delivered to the default group if no other group is + * interested. The fourth argument, vlvf_bypass, tells the + * ixgbe common code to avoid using a VLVF slot if one isn't + * already allocated to this VLAN. + * + * This logic is meant to reserve VLVF slots for use by + * reserved groups: guaranteeing their use of HW filtering. + */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, is_def_grp); + + if (ret == IXGBE_SUCCESS) { + vlp = kmem_zalloc(sizeof (ixgbe_vlan_t), KM_SLEEP); + vlp->ixvl_vid = vid; + vlp->ixvl_refs = 1; + list_insert_tail(&rx_group->vlans, vlp); + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * We should actually never return ENOSPC because we've set + * things up so that every reserved group is guaranteed to + * have a VLVF slot. + */ + if (ret == IXGBE_ERR_PARAM) + ret = EINVAL; + else if (ret == IXGBE_ERR_NO_SPACE) + ret = ENOSPC; + else + ret = EIO; + + mutex_exit(&ixgbe->gen_lock); + return (ret); +} + +/* + * Attempt to remove the VLAN HW filter associated with this group. If + * we are removing a HW filter for the default group then we know only + * the VFTA was set (VLVF is reserved for non-default/reserved + * groups). If the group wishes to stop receiving untagged traffic + * then clear the AUPE but leave the VLAN filters alone. + */ +static int +ixgbe_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + int ret; + ixgbe_vlan_t *vlp; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* See the AUPE comment in ixgbe_addvlan(). */ + if (vid == MAC_VLAN_UNTAGGED) { + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt &= ~IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, + IXGBE_VMOLR(rx_group->index), vml2flt); + rx_group->aupe = B_FALSE; + } + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp == NULL) { + mutex_exit(&ixgbe->gen_lock); + return (ENOENT); + } + + /* + * See the comment in ixgbe_addvlan() about is_def_grp and + * vlvf_bypass. + */ + if (vlp->ixvl_refs == 1) { + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_FALSE, + is_def_grp); + } else { + /* + * Only the default group can have multiple clients. + * If there is more than one client, leave the + * VFTA[vid] bit alone. + */ + VERIFY3B(is_def_grp, ==, B_TRUE); + VERIFY3U(vlp->ixvl_refs, >, 1); + vlp->ixvl_refs--; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + if (ret != IXGBE_SUCCESS) { + mutex_exit(&ixgbe->gen_lock); + /* IXGBE_ERR_PARAM should be the only possible error here. 
*/ + if (ret == IXGBE_ERR_PARAM) + return (EINVAL); + else + return (EIO); + + VERIFY3U(vlp->ixvl_refs, ==, 1); + vlp->ixvl_refs = 0; + list_remove(&rx_group->vlans, vlp); + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + /* + * Calling ixgbe_set_vfta() on a non-default group may have + * cleared the VFTA[vid] bit even though the default group + * still has clients using the vid. This happens because the + * ixgbe common code doesn't ref count the use of VLANs. Check + * for any use of vid on the default group and make sure the + * VFTA[vid] bit is set. This operation is idempotent: setting + * VFTA[vid] to true if already true won't hurt anything. + */ + if (!is_def_grp) { + ixgbe_rx_group_t *defgrp; + + defgrp = &ixgbe->rx_groups[ixgbe->rx_def_group]; + vlp = ixgbe_find_vlan(defgrp, vid); + if (vlp != NULL) { + /* This shouldn't fail, but if it does return EIO. */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, + B_TRUE); + if (ret != IXGBE_SUCCESS) { + mutex_exit(&ixgbe->gen_lock); + return (EIO); + } + } + } + + mutex_exit(&ixgbe->gen_lock); + return (0); +} + /* * Add a mac address. */ diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h index ca52b10c89..baa4766c0e 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h +++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h @@ -27,7 +27,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _IXGBE_SW_H @@ -91,6 +91,8 @@ extern "C" { #define MAX_NUM_UNICAST_ADDRESSES 0x80 #define MAX_NUM_MULTICAST_ADDRESSES 0x1000 +#define MAX_NUM_VLAN_FILTERS 0x40 + #define IXGBE_INTR_NONE 0 #define IXGBE_INTR_MSIX 1 #define IXGBE_INTR_MSI 2 @@ -387,6 +389,15 @@ typedef union ixgbe_ether_addr { } mac; } ixgbe_ether_addr_t; +/* + * The list of VLANs an Rx group will accept. + */ +typedef struct ixgbe_vlan { + list_node_t ixvl_link; + uint16_t ixvl_vid; /* The VLAN ID */ + uint_t ixvl_refs; /* Number of users of this VLAN */ +} ixgbe_vlan_t; + typedef enum { USE_NONE, USE_COPY, @@ -589,6 +600,7 @@ typedef struct ixgbe_rx_ring { struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ } ixgbe_rx_ring_t; + /* * Software Receive Ring Group */ @@ -596,6 +608,8 @@ typedef struct ixgbe_rx_group { uint32_t index; /* Group index */ mac_group_handle_t group_handle; /* call back group handle */ struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ + boolean_t aupe; /* AUPE bit */ + list_t vlans; /* list of VLANs to allow */ } ixgbe_rx_group_t; /* @@ -662,6 +676,7 @@ typedef struct ixgbe { */ ixgbe_rx_group_t *rx_groups; /* Array of rx groups */ uint32_t num_rx_groups; /* Number of rx groups in use */ + uint32_t rx_def_group; /* Default Rx group index */ /* * Transmit Rings @@ -715,6 +730,9 @@ typedef struct ixgbe { uint32_t mcast_count; struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES]; + boolean_t vlft_enabled; /* VLAN filtering enabled? */ + boolean_t vlft_init; /* VLAN filtering initialized? */ + ulong_t sys_page_size; boolean_t link_check_complete; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 17ac19b612..d96bfd3d75 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,7 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/file.h> @@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks) cv_signal(&so->so_closing_cv); } } + +int +ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg) +{ + return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg)); +} + +void +ksocket_krecv_unblock(ksocket_t ks) +{ + so_krecv_unblock(KSTOSO(ks)); +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h index ac5251540f..516a68d358 100644 --- a/usr/src/uts/common/io/ksocket/ksocket_impl.h +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -22,11 +22,17 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #ifndef _INET_KSOCKET_KSOCKET_IMPL_H #define _INET_KSOCKET_KSOCKET_IMPL_H +/* + * Note that if this relationship ever changes, the logic in ksocket_krecv_set + * must be updated and we must maintain local state about this on whatever the + * new ksocket object is. + */ #define KSTOSO(ks) ((struct sonode *)(ks)) #define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c index c9f0c63b69..5233fcd0b4 100644 --- a/usr/src/uts/common/io/ksyms.c +++ b/usr/src/uts/common/io/ksyms.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred) char *addr; void *hptr = NULL; ksyms_buflist_hdr_t hdr; + + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); + bzero(&hdr, sizeof (struct ksyms_buflist_hdr)); list_create(&hdr.blist, PAGESIZE, offsetof(ksyms_buflist_t, buflist_node)); diff --git a/usr/src/uts/common/io/lofi.c b/usr/src/uts/common/io/lofi.c index 95f4cd7254..1169f3fdfc 100644 --- a/usr/src/uts/common/io/lofi.c +++ b/usr/src/uts/common/io/lofi.c @@ -24,6 +24,7 @@ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Andrey Sokolov * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright 2019 Joyent, Inc. */ /* @@ -173,7 +174,7 @@ #define SIZE_PROP_NAME "Size" #define ZONE_PROP_NAME "zone" -#define SETUP_C_DATA(cd, buf, len) \ +#define SETUP_C_DATA(cd, buf, len) \ (cd).cd_format = CRYPTO_DATA_RAW; \ (cd).cd_offset = 0; \ (cd).cd_miscdata = NULL; \ @@ -553,7 +554,7 @@ lofi_destroy(struct lofi_state *lsp, cred_t *credp) } if (lsp->ls_vp != NULL) { - (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL); + (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL); (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp, NULL); VN_RELE(lsp->ls_vp); @@ -2934,7 +2935,7 @@ err: lofi_destroy(lsp, credp); } else { if (vp != NULL) { - (void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL); + (void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL); (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); VN_RELE(vp); } diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index c386eb2941..a63a6a5c61 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. 
* Copyright 2015 Garrett D'Amore <garrett@damore.org> */ @@ -158,7 +158,7 @@ * perimeter) across a call to any other layer from the mac layer. The call to * any other layer could be via mi_* entry points, classifier entry points into * the driver or via upcall pointers into layers above. The mac perimeter may - * be acquired or held only in the down direction, for e.g. when calling into + * be acquired or held only in the down direction, e.g. when calling into * a mi_* driver enty point to provide atomicity of the operation. * * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across @@ -207,7 +207,7 @@ * number whenever the ring's stop routine is invoked. * See comments in mac_rx_ring(); * - * R17 Similarly mi_stop is another synchronization point and the driver must + * R17. Similarly mi_stop is another synchronization point and the driver must * ensure that all upcalls are done and there won't be any future upcall * before returning from mi_stop. * @@ -227,7 +227,7 @@ * * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind] * - * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename] + * mac perim -> i_dls_devnet_lock [dls_devnet_rename] * * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac * client to driver. In the case of clients that explictly use the mac provided @@ -460,7 +460,7 @@ mac_init(void) mac_logging_interval = 20; mac_flow_log_enable = B_FALSE; mac_link_log_enable = B_FALSE; - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Register to be notified of noteworthy pools events */ mac_pool_event_reg.pec_func = mac_pool_event_cb; @@ -1115,9 +1115,10 @@ mac_start(mac_handle_t mh) if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) { /* - * Start the default ring, since it will be needed - * to receive broadcast and multicast traffic for - * both primary and non-primary MAC clients. + * Start the default group which is responsible + * for receiving broadcast and multicast + * traffic for both primary and non-primary + * MAC clients. */ ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED); err = mac_start_group_and_rings(defgrp); @@ -1456,7 +1457,7 @@ mac_rx_group_unmark(mac_group_t *grp, uint_t flag) * used by the aggr driver to access and control the underlying HW Rx group * and rings. In this case, the aggr driver has exclusive control of the * underlying HW Rx group/rings, it calls the following functions to - * start/stop the HW Rx rings, disable/enable polling, add/remove mac' + * start/stop the HW Rx rings, disable/enable polling, add/remove MAC * addresses, or set up the Rx callback. */ /* ARGSUSED */ @@ -1501,8 +1502,9 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, ASSERT(B_FALSE); return (-1); } + /* - * The mac client did not reserve any RX group, return directly. + * The MAC client did not reserve an Rx group, return directly. * This is probably because the underlying MAC does not support * any groups. */ @@ -1511,7 +1513,7 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, if (grp == NULL) return (0); /* - * This group must be reserved by this mac client. + * This group must be reserved by this MAC client. */ ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && (mcip == MAC_GROUP_ONLY_CLIENT(grp))); @@ -1527,6 +1529,77 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, } /* + * Get the HW ring handles of the given group index. 
If the MAC + * doesn't have a group at this index, or any groups at all, then 0 is + * returned and hwgh is set to NULL. This is a private client API. The + * MAC perimeter must be held when calling this function. + * + * mh: A handle to the MAC that owns the group. + * + * idx: The index of the HW group to be read. + * + * hwgh: If non-NULL, contains a handle to the HW group on return. + * + * hwrh: An array of ring handles pointing to the HW rings in the + * group. The array must be large enough to hold a handle to each ring + * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP. + * + * rtype: Used to determine if we are fetching Rx or Tx rings. + * + * Returns the number of rings in the group. + */ +uint_t +mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh, mac_ring_type_t rtype) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_group_t *grp; + mac_ring_t *ring; + uint_t cnt = 0; + + /* + * The MAC perimeter must be held when accessing the + * mi_{rx,tx}_groups fields. + */ + ASSERT(MAC_PERIM_HELD(mh)); + ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX); + + if (rtype == MAC_RING_TYPE_RX) { + grp = mip->mi_rx_groups; + } else if (rtype == MAC_RING_TYPE_TX) { + grp = mip->mi_tx_groups; + } + + while (grp != NULL && grp->mrg_index != idx) + grp = grp->mrg_next; + + /* + * If the MAC doesn't have a group at this index or doesn't + * implement RINGS capab, then set hwgh to NULL and return 0. + */ + if (hwgh != NULL) + *hwgh = NULL; + + if (grp == NULL) + return (0); + + ASSERT3U(idx, ==, grp->mrg_index); + + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) { + ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP); + hwrh[cnt] = (mac_ring_handle_t)ring; + } + + /* A group should always have at least one ring. */ + ASSERT3U(cnt, >, 0); + + if (hwgh != NULL) + *hwgh = (mac_group_handle_t)grp; + + return (cnt); +} + +/* * This function is called to get info about Tx/Rx rings. * * Return value: returns uint_t which will have various bits set @@ -1542,6 +1615,69 @@ mac_hwring_getinfo(mac_ring_handle_t rh) } /* + * Set the passthru callback on the hardware ring. + */ +void +mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1, + mac_resource_handle_t arg2) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER; + + hwring->mr_pt_fn = fn; + hwring->mr_pt_arg1 = arg1; + hwring->mr_pt_arg2 = arg2; +} + +/* + * Clear the passthru callback on the hardware ring. 
+ */ +void +mac_hwring_clear_passthru(mac_ring_handle_t hwrh) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_NO_CLASSIFIER; + + hwring->mr_pt_fn = NULL; + hwring->mr_pt_arg1 = NULL; + hwring->mr_pt_arg2 = NULL; +} + +void +mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)func; + flent->fe_cb_arg1 = arg1; + flent->fe_cb_arg2 = NULL; + flent->fe_flags &= ~FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +void +mac_client_clear_flow_cb(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + flent->fe_flags |= FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +/* * Export ddi interrupt handles from the HW ring to the pseudo ring and * setup the RX callback of the mac client which exclusively controls * HW ring. @@ -1613,17 +1749,56 @@ mac_hwring_enable_intr(mac_ring_handle_t rh) return (intr->mi_enable(intr->mi_handle)); } +/* + * Start the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ int mac_hwring_start(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; + int rv = 0; + + if (rr_ring->mr_state != MR_INUSE) + rv = mac_start_ring(rr_ring); + + return (rv); +} + +/* + * Stop the HW ring pointed to by rh. Also see mac_hwring_start(). + */ +void +mac_hwring_stop(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + + if (rr_ring->mr_state != MR_FREE) + mac_stop_ring(rr_ring); +} + +/* + * Remove the quiesced flag from the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ +int +mac_hwring_activate(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; MAC_RING_UNMARK(rr_ring, MR_QUIESCE); return (0); } +/* + * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate(). + */ void -mac_hwring_stop(mac_ring_handle_t rh) +mac_hwring_quiesce(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; @@ -1730,6 +1905,68 @@ mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr) } /* + * Program the group's HW VLAN filter if it has such support. + * Otherwise, the group will implicitly accept tagged traffic and + * there is nothing to do. + */ +int +mac_hwgroup_addvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_addvlan(group, vid)); +} + +int +mac_hwgroup_remvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_remvlan(group, vid)); +} + +/* + * Determine if a MAC has HW VLAN support. This is a private API + * consumed by aggr. In the future it might be nice to have a bitfield + * in mac_capab_rings_t to track which forms of HW filtering are + * supported by the MAC. 
+ */ +boolean_t +mac_has_hw_vlan(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (MAC_GROUP_HW_VLAN(mip->mi_rx_groups)); +} + +/* + * Get the number of Rx HW groups on this MAC. + */ +uint_t +mac_get_num_rx_groups(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (mip->mi_rx_group_count); +} + +int +mac_set_promisc(mac_handle_t mh, boolean_t value) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (i_mac_promisc_set(mip, value)); +} + +/* * Set the RX group to be shared/reserved. Note that the group must be * started/stopped outside of this function. */ @@ -2416,7 +2653,6 @@ mac_disable(mac_handle_t mh) /* * Called when the MAC instance has a non empty flow table, to de-multiplex * incoming packets to the right flow. - * The MAC's rw lock is assumed held as a READER. */ /* ARGSUSED */ static mblk_t * @@ -2426,19 +2662,6 @@ mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) uint_t flags = FLOW_INBOUND; int err; - /* - * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN - * to mac_flow_lookup() so that the VLAN packets can be successfully - * passed to the non-VLAN aggregation flows. - * - * Note that there is possibly a race between this and - * mac_unicast_remove/add() and VLAN packets could be incorrectly - * classified to non-VLAN flows of non-aggregation mac clients. These - * VLAN packets will be then filtered out by the mac module. - */ - if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) - flags |= FLOW_IGNORE_VLAN; - err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); if (err != 0) { /* no registered receive function */ @@ -3772,9 +3995,27 @@ mac_start_group_and_rings(mac_group_t *group) for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) goto error; - ring->mr_classify_type = MAC_SW_CLASSIFIER; + + /* + * When aggr_set_port_sdu() is called, it will remove + * the port client's unicast address. This will cause + * MAC to stop the default group's rings on the port + * MAC. After it modifies the SDU, it will then re-add + * the unicast address. At which time, this function is + * called to start the default group's rings. Normally + * this function would set the classify type to + * MAC_SW_CLASSIFIER; but that will break aggr which + * relies on the passthru classify mode being set for + * correct delivery (see mac_rx_common()). To avoid + * that, we check for a passthru callback and set the + * classify type to MAC_PASSTHRU_CLASSIFIER; as it was + * before the rings were stopped. + */ + ring->mr_classify_type = (ring->mr_pt_fn != NULL) ? + MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER; } return (0); @@ -4077,12 +4318,15 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* - * Driver must register group->mgi_addmac/remmac() for rx groups - * to support multiple MAC addresses. + * The driver must register some form of hardware MAC + * filter in order for Rx groups to support multiple + * MAC addresses. 
*/ if (rtype == MAC_RING_TYPE_RX && - ((group_info.mgi_addmac == NULL) || - (group_info.mgi_remmac == NULL))) { + (group_info.mgi_addmac == NULL || + group_info.mgi_remmac == NULL)) { + DTRACE_PROBE1(mac__init__rings__no__mac__filter, + char *, mip->mi_name); err = EINVAL; goto bail; } @@ -4129,8 +4373,9 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* Update this group's status */ mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED); - } else + } else { group->mrg_rings = NULL; + } ASSERT(ring_left == 0); @@ -4320,6 +4565,38 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) } /* + * Associate the VLAN filter to the receive group. + */ +int +mac_group_addvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_addvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* + * Dissociate the VLAN from the receive group. + */ +int +mac_group_remvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_remvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* * Associate a MAC address with a receive group. * * The return value of this function should always be checked properly, because @@ -4335,8 +4612,8 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) int mac_group_addmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_addmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addmac, !=, NULL); return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr)); } @@ -4347,8 +4624,8 @@ mac_group_addmac(mac_group_t *group, const uint8_t *addr) int mac_group_remmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_remmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remmac, !=, NULL); return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr)); } @@ -4523,28 +4800,20 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) switch (ring->mr_type) { case MAC_RING_TYPE_RX: /* - * Setup SRS on top of the new ring if the group is - * reserved for someones exclusive use. + * Setup an SRS on top of the new ring if the group is + * reserved for someone's exclusive use. */ if (group->mrg_state == MAC_GROUP_STATE_RESERVED) { - mac_client_impl_t *mcip; + mac_client_impl_t *mcip = MAC_GROUP_ONLY_CLIENT(group); - mcip = MAC_GROUP_ONLY_CLIENT(group); - /* - * Even though this group is reserved we migth still - * have multiple clients, i.e a VLAN shares the - * group with the primary mac client. 
- */ - if (mcip != NULL) { - flent = mcip->mci_flent; - ASSERT(flent->fe_rx_srs_cnt > 0); - mac_rx_srs_group_setup(mcip, flent, SRST_LINK); - mac_fanout_setup(mcip, flent, - MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, - mcip, NULL, NULL); - } else { - ring->mr_classify_type = MAC_SW_CLASSIFIER; - } + VERIFY3P(mcip, !=, NULL); + flent = mcip->mci_flent; + VERIFY3S(flent->fe_rx_srs_cnt, >, 0); + mac_rx_srs_group_setup(mcip, flent, SRST_LINK); + mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), + mac_rx_deliver, mcip, NULL, NULL); + } else { + ring->mr_classify_type = MAC_SW_CLASSIFIER; } break; case MAC_RING_TYPE_TX: @@ -4570,7 +4839,7 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) mcip = mgcp->mgc_client; flent = mcip->mci_flent; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR); + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); tx = &mac_srs->srs_tx; mac_tx_client_quiesce((mac_client_handle_t)mcip); @@ -4714,7 +4983,7 @@ i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring, mcip = MAC_GROUP_ONLY_CLIENT(group); ASSERT(mcip != NULL); - ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR); + ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR || mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR); @@ -4922,12 +5191,12 @@ mac_free_macaddr(mac_address_t *map) mac_impl_t *mip = map->ma_mip; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - ASSERT(mip->mi_addresses != NULL); - - map = mac_find_macaddr(mip, map->ma_addr); + VERIFY3P(mip->mi_addresses, !=, NULL); - ASSERT(map != NULL); - ASSERT(map->ma_nusers == 0); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); + VERIFY3P(map, !=, NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_vlans, ==, NULL); if (map == mip->mi_addresses) { mip->mi_addresses = map->ma_next; @@ -4943,85 +5212,201 @@ mac_free_macaddr(mac_address_t *map) kmem_free(map, sizeof (mac_address_t)); } +static mac_vlan_t * +mac_find_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) { + if (mvp->mv_vid == vid) + return (mvp); + } + + return (NULL); +} + +static mac_vlan_t * +mac_add_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + /* + * We should never add the same {addr, VID} tuple more + * than once, but let's be sure. + */ + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) + VERIFY3U(mvp->mv_vid, !=, vid); + + /* Add the VLAN to the head of the VLAN list. */ + mvp = kmem_zalloc(sizeof (mac_vlan_t), KM_SLEEP); + mvp->mv_vid = vid; + mvp->mv_next = map->ma_vlans; + map->ma_vlans = mvp; + + return (mvp); +} + +static void +mac_rem_vlan(mac_address_t *map, mac_vlan_t *mvp) +{ + mac_vlan_t *pre; + + if (map->ma_vlans == mvp) { + map->ma_vlans = mvp->mv_next; + } else { + pre = map->ma_vlans; + while (pre->mv_next != mvp) { + pre = pre->mv_next; + + /* + * We've reached the end of the list without + * finding mvp. + */ + VERIFY3P(pre, !=, NULL); + } + pre->mv_next = mvp->mv_next; + } + + kmem_free(mvp, sizeof (mac_vlan_t)); +} + /* - * Add a MAC address reference for a client. If the desired MAC address - * exists, add a reference to it. Otherwise, add the new address by adding - * it to a reserved group or setting promiscuous mode. Won't try different - * group is the group is non-NULL, so the caller must explictly share - * default group when needed. 
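/*
 * Illustrative sketch (not part of this change): how the {addr, VID}
 * tuple list kept on each address record can be consulted.  An
 * address accepts a frame when the frame is untagged and the untagged
 * flag is set, or when its VID appears on the VLAN list.  The types
 * are simplified stand-ins for mac_address_t and mac_vlan_t.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define	SKETCH_VID_NONE	0

typedef struct sketch_vlan {
	uint16_t		sv_vid;
	struct sketch_vlan	*sv_next;
} sketch_vlan_t;

typedef struct {
	bool		sa_untagged;	/* accept untagged frames */
	sketch_vlan_t	*sa_vlans;	/* accepted VIDs */
} sketch_addr_t;

static bool
sketch_addr_accepts(const sketch_addr_t *addr, uint16_t vid)
{
	if (vid == SKETCH_VID_NONE)
		return (addr->sa_untagged);

	for (const sketch_vlan_t *v = addr->sa_vlans; v != NULL;
	    v = v->sv_next) {
		if (v->sv_vid == vid)
			return (true);
	}
	return (false);
}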
- * - * Note, the primary MAC address is initialized at registration time, so - * to add it to default group only need to activate it if its reference - * count is still zero. Also, some drivers may not have advertised RINGS - * capability. + * Create a new mac_address_t if this is the first use of the address + * or add a VID to an existing address. In either case, the + * mac_address_t acts as a list of {addr, VID} tuples where each tuple + * shares the same addr. If group is non-NULL then attempt to program + * the MAC's HW filters for this group. Otherwise, if group is NULL, + * then the MAC has no rings and there is nothing to program. */ int -mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, - boolean_t use_hw) +mac_add_macaddr_vlan(mac_impl_t *mip, mac_group_t *group, uint8_t *addr, + uint16_t vid, boolean_t use_hw) { - mac_address_t *map; - int err = 0; - boolean_t allocated_map = B_FALSE; + mac_address_t *map; + mac_vlan_t *mvp; + int err = 0; + boolean_t allocated_map = B_FALSE; + boolean_t hw_mac = B_FALSE; + boolean_t hw_vlan = B_FALSE; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - map = mac_find_macaddr(mip, mac_addr); + map = mac_find_macaddr(mip, addr); /* - * If the new MAC address has not been added. Allocate a new one - * and set it up. + * If this is the first use of this MAC address then allocate + * and initialize a new structure. */ if (map == NULL) { map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP); map->ma_len = mip->mi_type->mt_addr_length; - bcopy(mac_addr, map->ma_addr, map->ma_len); + bcopy(addr, map->ma_addr, map->ma_len); map->ma_nusers = 0; map->ma_group = group; map->ma_mip = mip; + map->ma_untagged = B_FALSE; - /* add the new MAC address to the head of the address list */ + /* Add the new MAC address to the head of the address list. */ map->ma_next = mip->mi_addresses; mip->mi_addresses = map; allocated_map = B_TRUE; } - ASSERT(map->ma_group == NULL || map->ma_group == group); + VERIFY(map->ma_group == NULL || map->ma_group == group); if (map->ma_group == NULL) map->ma_group = group; + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_TRUE; + mvp = NULL; + } else { + mvp = mac_add_vlan(map, vid); + } + + /* + * Set the VLAN HW filter if: + * + * o the MAC's VLAN HW filtering is enabled, and + * o the address does not currently rely on promisc mode. + * + * This is called even when the client specifies an untagged + * address (VLAN_ID_NONE) because some MAC providers require + * setting additional bits to accept untagged traffic when + * VLAN HW filtering is enabled. + */ + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type != MAC_ADDRESS_TYPE_UNICAST_PROMISC) { + if ((err = mac_group_addvlan(group, vid)) != 0) + goto bail; + + hw_vlan = B_TRUE; + } + + VERIFY3S(map->ma_nusers, >=, 0); + map->ma_nusers++; + /* - * If the MAC address is already in use, simply account for the - * new client. + * If this MAC address already has a HW filter then simply + * increment the counter. */ - if (map->ma_nusers++ > 0) + if (map->ma_nusers > 1) return (0); /* + * All logic from here on out is executed during initial + * creation only. + */ + VERIFY3S(map->ma_nusers, ==, 1); + + /* * Activate this MAC address by adding it to the reserved group. 
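/*
 * Illustrative sketch (not part of this change): the reference-count
 * rule used above.  Only the first user of an address record goes on
 * to program hardware (or fall back to promiscuous mode); later users
 * simply take a hold.  Names are invented for illustration.
 */
#include <stdbool.h>

typedef struct {
	int	sa_nusers;	/* clients using this address */
} sketch_addr_ref_t;

/*
 * Returns true when the caller is the first user and must therefore
 * program the HW classifier.
 */
static bool
sketch_addr_hold(sketch_addr_ref_t *addr)
{
	addr->sa_nusers++;
	return (addr->sa_nusers == 1);
}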
*/ if (group != NULL) { - err = mac_group_addmac(group, (const uint8_t *)mac_addr); - if (err == 0) { - map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; - return (0); + err = mac_group_addmac(group, (const uint8_t *)addr); + + /* + * If the driver is out of filters then we can + * continue and use promisc mode. For any other error, + * assume the driver is in a state where we can't + * program the filters or use promisc mode; so we must + * bail. + */ + if (err != 0 && err != ENOSPC) { + map->ma_nusers--; + goto bail; } + + hw_mac = (err == 0); + } + + if (hw_mac) { + map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; + return (0); } /* * The MAC address addition failed. If the client requires a - * hardware classified MAC address, fail the operation. + * hardware classified MAC address, fail the operation. This + * feature is only used by sun4v vsw. */ - if (use_hw) { + if (use_hw && !hw_mac) { err = ENOSPC; + map->ma_nusers--; goto bail; } /* - * Try promiscuous mode. - * - * For drivers that don't advertise RINGS capability, do - * nothing for the primary address. + * If we reach this point then either the MAC doesn't have + * RINGS capability or we are out of MAC address HW filters. + * In any case we must put the MAC into promiscuous mode. + */ + VERIFY(group == NULL || !hw_mac); + + /* + * The one exception is the primary address. A non-RINGS + * driver filters the primary address by default; promisc mode + * is not needed. */ if ((group == NULL) && (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) { @@ -5030,53 +5415,76 @@ mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, } /* - * Enable promiscuous mode in order to receive traffic - * to the new MAC address. + * Enable promiscuous mode in order to receive traffic to the + * new MAC address. All existing HW filters still send their + * traffic to their respective group/SRSes. But with promisc + * enabled all unknown traffic is delivered to the default + * group where it is SW classified via mac_rx_classify(). */ if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) { map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC; return (0); } - /* - * Free the MAC address that could not be added. Don't free - * a pre-existing address, it could have been the entry - * for the primary MAC address which was pre-allocated by - * mac_init_macaddr(), and which must remain on the list. - */ bail: - map->ma_nusers--; + if (hw_vlan) { + int err2 = mac_group_remvlan(group, vid); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from group" + " %d on MAC %s: %d.", vid, group->mrg_index, + mip->mi_name, err2); + } + } + + if (mvp != NULL) + mac_rem_vlan(map, mvp); + if (allocated_map) mac_free_macaddr(map); + return (err); } -/* - * Remove a reference to a MAC address. This may cause to remove the MAC - * address from an associated group or to turn off promiscuous mode. - * The caller needs to handle the failure properly. 
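/*
 * Illustrative sketch (not part of this change): the fallback ladder
 * used when programming a unicast filter above.  ENOSPC means "out of
 * HW filters, try promiscuous mode"; any other failure aborts.  The
 * callback parameters and names are invented, and the sketch assumes
 * a group (i.e. rings) exists.
 */
#include <errno.h>
#include <stdbool.h>

typedef enum {
	SKETCH_ADDR_CLASSIFIED,		/* HW filter programmed */
	SKETCH_ADDR_PROMISC		/* relying on promisc mode */
} sketch_addr_type_t;

static int
sketch_program_unicast(int (*hw_addmac)(void), int (*set_promisc)(void),
    bool require_hw, sketch_addr_type_t *typep)
{
	int err = hw_addmac();

	if (err == 0) {
		*typep = SKETCH_ADDR_CLASSIFIED;
		return (0);
	}
	if (err != ENOSPC)
		return (err);		/* driver is wedged; bail */
	if (require_hw)
		return (ENOSPC);	/* client insists on HW classification */

	/* Out of filters: fall back to promiscuous mode. */
	if ((err = set_promisc()) == 0)
		*typep = SKETCH_ADDR_PROMISC;
	return (err);
}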
- */ int -mac_remove_macaddr(mac_address_t *map) +mac_remove_macaddr_vlan(mac_address_t *map, uint16_t vid) { - mac_impl_t *mip = map->ma_mip; - int err = 0; + mac_vlan_t *mvp; + mac_impl_t *mip = map->ma_mip; + mac_group_t *group = map->ma_group; + int err = 0; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); + + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_FALSE; + mvp = NULL; + } else { + mvp = mac_find_vlan(map, vid); + VERIFY3P(mvp, !=, NULL); + } + + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED && + ((err = mac_group_remvlan(group, vid)) != 0)) + return (err); - ASSERT(map == mac_find_macaddr(mip, map->ma_addr)); + if (mvp != NULL) + mac_rem_vlan(map, mvp); /* * If it's not the last client using this MAC address, only update * the MAC clients count. */ - if (--map->ma_nusers > 0) + map->ma_nusers--; + if (map->ma_nusers > 0) return (0); /* - * The MAC address is no longer used by any MAC client, so remove - * it from its associated group, or turn off promiscuous mode - * if it was enabled for the MAC address. + * The MAC address is no longer used by any MAC client, so + * remove it from its associated group. Turn off promiscuous + * mode if this is the last address relying on it. */ switch (map->ma_type) { case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED: @@ -5084,18 +5492,44 @@ mac_remove_macaddr(mac_address_t *map) * Don't free the preset primary address for drivers that * don't advertise RINGS capability. */ - if (map->ma_group == NULL) + if (group == NULL) return (0); - err = mac_group_remmac(map->ma_group, map->ma_addr); - if (err == 0) - map->ma_group = NULL; + if ((err = mac_group_remmac(group, map->ma_addr)) != 0) { + if (vid == VLAN_ID_NONE) + map->ma_untagged = B_TRUE; + else + (void) mac_add_vlan(map, vid); + + /* + * If we fail to remove the MAC address HW + * filter but then also fail to re-add the + * VLAN HW filter then we are in a busted + * state and should just crash. + */ + if (MAC_GROUP_HW_VLAN(group)) { + int err2; + + err2 = mac_group_addvlan(group, vid); + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to readd VLAN" + " %u to group %d on MAC %s: %d.", + vid, group->mrg_index, mip->mi_name, + err2); + } + } + + return (err); + } + + map->ma_group = NULL; break; case MAC_ADDRESS_TYPE_UNICAST_PROMISC: err = i_mac_promisc_set(mip, B_FALSE); break; default: - ASSERT(B_FALSE); + panic("Unexpected ma_type 0x%x, file: %s, line %d", + map->ma_type, __FILE__, __LINE__); } if (err != 0) @@ -5252,8 +5686,9 @@ mac_fini_macaddr(mac_impl_t *mip) * If mi_addresses is initialized, there should be exactly one * entry left on the list with no users. */ - ASSERT(map->ma_nusers == 0); - ASSERT(map->ma_next == NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_next, ==, NULL); + VERIFY3P(map->ma_vlans, ==, NULL); kmem_free(map, sizeof (mac_address_t)); mip->mi_addresses = NULL; @@ -5815,7 +6250,7 @@ mac_stop_logusage(mac_logtype_t type) mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate); (void) untimeout(mac_logging_timer); - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Write log entries for each mac_impl in the list */ i_mac_log_info(&net_log_list, &lstate); @@ -5933,7 +6368,7 @@ mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring) } /* - * For a reserved group with multiple clients, return the primary client. + * For a non-default group with multiple clients, return the primary client. 
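/*
 * Illustrative sketch (not part of this change): the unwind pattern
 * used in mac_remove_macaddr_vlan() above.  The VLAN filter comes out
 * first; if removing the unicast filter then fails, the VLAN state is
 * put back so the data path stays consistent.  All names are invented
 * stand-ins.
 */
#include <stdio.h>

static int
sketch_remove_filters(int (*remvlan)(void), int (*addvlan)(void),
    int (*remmac)(void))
{
	int err;

	if ((err = remvlan()) != 0)
		return (err);

	if ((err = remmac()) != 0) {
		/*
		 * Roll back: re-add the VLAN filter.  If that also
		 * fails the state is inconsistent and can only be
		 * reported.
		 */
		if (addvlan() != 0)
			(void) fprintf(stderr, "failed to restore VLAN\n");
		return (err);
	}
	return (0);
}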
*/ static mac_client_impl_t * mac_get_grp_primary(mac_group_t *grp) @@ -6292,13 +6727,12 @@ mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip) break; } - VERIFY(mgcp == NULL); + ASSERT(mgcp == NULL); mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP); mgcp->mgc_client = mcip; mgcp->mgc_next = grp->mrg_clients; grp->mrg_clients = mgcp; - } void @@ -6319,8 +6753,27 @@ mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip) } /* - * mac_reserve_rx_group() - * + * Return true if any client on this group explicitly asked for HW + * rings (of type mask) or have a bound share. + */ +static boolean_t +i_mac_clients_hw(mac_group_t *grp, uint32_t mask) +{ + mac_grp_client_t *mgcip; + mac_client_impl_t *mcip; + mac_resource_props_t *mrp; + + for (mgcip = grp->mrg_clients; mgcip != NULL; mgcip = mgcip->mgc_next) { + mcip = mgcip->mgc_client; + mrp = MCIP_RESOURCE_PROPS(mcip); + if (mcip->mci_share != NULL || (mrp->mrp_mask & mask) != 0) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Finds an available group and exclusively reserves it for a client. * The group is chosen to suit the flow's resource controls (bandwidth and * fanout requirements) and the address type. @@ -6343,7 +6796,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) int need_rings = 0; mac_group_t *candidate_grp = NULL; mac_client_impl_t *gclient; - mac_resource_props_t *gmrp; mac_group_t *donorgrp = NULL; boolean_t rxhw = mrp->mrp_mask & MRP_RX_RINGS; boolean_t unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC; @@ -6354,18 +6806,20 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; /* - * Check if a group already has this mac address (case of VLANs) + * Check if a group already has this MAC address (case of VLANs) * unless we are moving this MAC client from one group to another. */ if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) { if (map->ma_group != NULL) return (map->ma_group); } + if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0) return (NULL); + /* - * If exclusive open, return NULL which will enable the - * caller to use the default group. + * If this client is requesting exclusive MAC access then + * return NULL to ensure the client uses the default group. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) return (NULL); @@ -6375,6 +6829,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) { mrp->mrp_nrxrings = 1; } + /* * For static grouping we allow only specifying rings=0 and * unspecified @@ -6383,6 +6838,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) { return (NULL); } + if (rxhw) { /* * We have explicitly asked for a group (with nrxrings, @@ -6444,25 +6900,19 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) * that didn't ask for an exclusive group, but got * one and it has enough rings (combined with what * the donor group can donate) for the new MAC - * client + * client. */ if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) { /* - * If the primary/donor group is not the default - * group, don't bother looking for a candidate group. - * If we don't have enough rings we will check - * if the primary group can be vacated. + * If the donor group is not the default + * group, don't bother looking for a candidate + * group. 
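/*
 * Illustrative sketch (not part of this change): the predicate behind
 * i_mac_clients_hw() above, in stand-alone form.  A group may only be
 * borrowed from its current clients when none of them explicitly
 * asked for HW rings or holds a bound share.  The types are simplified
 * stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

typedef struct sketch_client {
	void			*sc_share;	/* non-NULL if bound share */
	uint32_t		sc_prop_mask;	/* requested resources */
	struct sketch_client	*sc_next;
} sketch_client_t;

static bool
sketch_clients_want_hw(const sketch_client_t *clients, uint32_t ring_mask)
{
	for (const sketch_client_t *c = clients; c != NULL; c = c->sc_next) {
		if (c->sc_share != NULL || (c->sc_prop_mask & ring_mask) != 0)
			return (true);
	}
	return (false);
}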
If we don't have enough rings we + * will check if the primary group can be + * vacated. */ if (candidate_grp == NULL && donorgrp == MAC_DEFAULT_RX_GROUP(mip)) { - ASSERT(!MAC_GROUP_NO_CLIENT(grp)); - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); - ASSERT(gclient != NULL); - gmrp = MCIP_RESOURCE_PROPS(gclient); - if (gclient->mci_share == NULL && - (gmrp->mrp_mask & MRP_RX_RINGS) == 0 && + if (!i_mac_clients_hw(grp, MRP_RX_RINGS) && (unspec || (grp->mrg_cur_count + donor_grp_rcnt >= need_rings))) { @@ -6528,6 +6978,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) */ mac_stop_group(grp); } + /* We didn't find an exclusive group for this MAC client */ if (i >= mip->mi_rx_group_count) { @@ -6535,12 +6986,12 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) return (NULL); /* - * If we found a candidate group then we switch the - * MAC client from the candidate_group to the default - * group and give the group to this MAC client. If - * we didn't find a candidate_group, check if the - * primary is in its own group and if it can make way - * for this MAC client. + * If we found a candidate group then move the + * existing MAC client from the candidate_group to the + * default group and give the candidate_group to the + * new MAC client. If we didn't find a candidate + * group, then check if the primary is in its own + * group and if it can make way for this MAC client. */ if (candidate_grp == NULL && donorgrp != MAC_DEFAULT_RX_GROUP(mip) && @@ -6551,15 +7002,15 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) boolean_t prim_grp = B_FALSE; /* - * Switch the MAC client from the candidate group - * to the default group.. If this group was the - * donor group, then after the switch we need - * to update the donor group too. + * Switch the existing MAC client from the + * candidate group to the default group. If + * the candidate group is the donor group, + * then after the switch we need to update the + * donor group too. */ grp = candidate_grp; - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + gclient = grp->mrg_clients->mgc_client; + VERIFY3P(gclient, !=, NULL); if (grp == mip->mi_rx_donor_grp) prim_grp = B_TRUE; if (mac_rx_switch_group(gclient, grp, @@ -6572,7 +7023,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) donorgrp = MAC_DEFAULT_RX_GROUP(mip); } - /* * Now give this group with the required rings * to this MAC client. @@ -6620,10 +7070,10 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) /* * mac_rx_release_group() * - * This is called when there are no clients left for the group. - * The group is stopped and marked MAC_GROUP_STATE_REGISTERED, - * and if it is a non default group, the shares are removed and - * all rings are assigned back to default group. + * Release the group when it has no remaining clients. The group is + * stopped and its shares are removed and all rings are assigned back + * to default group. This should never be called against the default + * group. 
*/ void mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) @@ -6632,6 +7082,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) mac_ring_t *ring; ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + ASSERT(MAC_GROUP_NO_CLIENT(group) == B_TRUE); if (mip->mi_rx_donor_grp == group) mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip); @@ -6683,56 +7134,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) } /* - * When we move the primary's mac address between groups, we need to also - * take all the clients sharing the same mac address along with it (VLANs) - * We remove the mac address for such clients from the group after quiescing - * them. When we add the mac address we restart the client. Note that - * the primary's mac address is removed from the group after all the - * other clients sharing the address are removed. Similarly, the primary's - * mac address is added before all the other client's mac address are - * added. While grp is the group where the clients reside, tgrp is - * the group where the addresses have to be added. - */ -static void -mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp, - mac_group_t *tgrp, uint8_t *maddr, boolean_t add) -{ - mac_impl_t *mip = mcip->mci_mip; - mac_grp_client_t *mgcp = grp->mrg_clients; - mac_client_impl_t *gmcip; - boolean_t prim; - - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - - /* - * If the clients are in a non-default group, we just have to - * walk the group's client list. If it is in the default group - * (which will be shared by other clients as well, we need to - * check if the unicast address matches mcip's unicast. - */ - while (mgcp != NULL) { - gmcip = mgcp->mgc_client; - if (gmcip != mcip && - (grp != MAC_DEFAULT_RX_GROUP(mip) || - mcip->mci_unicast == gmcip->mci_unicast)) { - if (!add) { - mac_rx_client_quiesce( - (mac_client_handle_t)gmcip); - (void) mac_remove_macaddr(mcip->mci_unicast); - } else { - (void) mac_add_macaddr(mip, tgrp, maddr, prim); - mac_rx_client_restart( - (mac_client_handle_t)gmcip); - } - } - mgcp = mgcp->mgc_next; - } -} - - -/* - * Move the MAC address from fgrp to tgrp. If this is the primary client, - * we need to take any VLANs etc. together too. + * Move the MAC address from fgrp to tgrp. */ static int mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, @@ -6741,56 +7143,86 @@ mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; uint8_t maddr[MAXMACADDRLEN]; int err = 0; - boolean_t prim; - boolean_t multiclnt = B_FALSE; + uint16_t vid; + mac_unicast_impl_t *muip; + boolean_t use_hw; mac_rx_client_quiesce((mac_client_handle_t)mcip); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len); - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - if (mcip->mci_unicast->ma_nusers > 1) { - mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE); - multiclnt = B_TRUE; - } - ASSERT(mcip->mci_unicast->ma_nusers == 1); - err = mac_remove_macaddr(mcip->mci_unicast); + /* + * Does the client require MAC address hardware classifiction? + */ + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + vid = i_mac_flow_vid(mcip->mci_flent); + + /* + * You can never move an address that is shared by multiple + * clients. mac_datapath_setup() ensures that clients sharing + * an address are placed on the default group. 
This guarantees + * that a non-default group will only ever have one client and + * thus make full use of HW filters. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * If this isn't the primary MAC address then the + * mac_address_t has been freed by the last call to + * mac_remove_macaddr_vlan(). In any case, NULL the reference + * to avoid a dangling pointer. + */ + mcip->mci_unicast = NULL; + /* - * Program the H/W Classifier first, if this fails we need - * not proceed with the other stuff. + * We also have to NULL all the mui_map references -- sun4v + * strikes again! */ - if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) { + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = NULL; + rw_exit(&mcip->mci_rw_lock); + + /* + * Program the H/W Classifier first, if this fails we need not + * proceed with the other stuff. + */ + if ((err = mac_add_macaddr_vlan(mip, tgrp, maddr, vid, use_hw)) != 0) { + int err2; + /* Revert back the H/W Classifier */ - if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) { - /* - * This should not fail now since it worked earlier, - * should we panic? - */ - cmn_err(CE_WARN, - "mac_rx_switch_group: switching %p back" - " to group %p failed!!", (void *)mcip, - (void *)fgrp); + err2 = mac_add_macaddr_vlan(mip, fgrp, maddr, vid, use_hw); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to revert HW classification" + " on MAC %s, for client %s: %d.", mip->mi_name, + mcip->mci_name, err2); } + mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * Get a reference to the new mac_address_t and update the + * client's reference. Then restart the client and add the + * other clients of this MAC addr (if they exsit). + */ mcip->mci_unicast = mac_find_macaddr(mip, maddr); + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = mcip->mci_unicast; + rw_exit(&mcip->mci_rw_lock); mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) - mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE); - return (err); + return (0); } /* @@ -6811,19 +7243,34 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; mac_grp_client_t *mgcp; - ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group); + VERIFY3P(fgrp, ==, mcip->mci_flent->fe_rx_ring_group); if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0) return (err); /* - * The group might be reserved, but SRSs may not be set up, e.g. - * primary and its vlans using a reserved group. + * If the group is marked as reserved and in use by a single + * client, then there is an SRS to teardown. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED && MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) { mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE); } + + /* + * If we are moving the client from a non-default group, then + * we know that any additional clients on this group share the + * same MAC address. Since we moved the MAC address filter, we + * need to move these clients too. 
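/*
 * Illustrative sketch (not part of this change): once the old address
 * record is freed by the remove above, every per-VID handle that
 * pointed at it must be NULLed and later repointed at the record
 * created by the re-add, so no handle is left dangling across the
 * move.  The types are stand-ins for mac_unicast_impl_t and its map.
 */
#include <stddef.h>

struct sketch_addr_rec;				/* opaque address record */

typedef struct sketch_handle {
	struct sketch_addr_rec	*sh_map;	/* current address record */
	struct sketch_handle	*sh_next;
} sketch_handle_t;

static void
sketch_repoint_handles(sketch_handle_t *list, struct sketch_addr_rec *newmap)
{
	/* Pass NULL while the record is gone; the new record afterwards. */
	for (sketch_handle_t *h = list; h != NULL; h = h->sh_next)
		h->sh_map = newmap;
}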
+ * + * If we are moving the client from the default group and its + * MAC address has VLAN clients, then we must move those + * clients as well. + * + * In both cases the idea is the same: we moved the MAC + * address filter to the tgrp, so we must move all clients + * using that MAC address to tgrp as well. + */ if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6834,20 +7281,21 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, gmcip->mci_flent->fe_rx_ring_group = tgrp; } mac_release_rx_group(mcip, fgrp); - ASSERT(MAC_GROUP_NO_CLIENT(fgrp)); + VERIFY3B(MAC_GROUP_NO_CLIENT(fgrp), ==, B_TRUE); mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED); } else { mac_group_remove_client(fgrp, mcip); mac_group_add_client(tgrp, mcip); mcip->mci_flent->fe_rx_ring_group = tgrp; + /* * If there are other clients (VLANs) sharing this address - * we should be here only for the primary. + * then move them too. */ - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { /* * We need to move all the clients that are using - * this h/w address. + * this MAC address. */ mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6861,20 +7309,24 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } } } + /* - * The default group will still take the multicast, - * broadcast traffic etc., so it won't go to + * The default group still handles multicast and + * broadcast traffic; it won't transition to * MAC_GROUP_STATE_REGISTERED. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED) mac_rx_group_unmark(fgrp, MR_CONDEMNED); mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED); } + next_state = mac_group_next_state(tgrp, &group_only_mcip, MAC_DEFAULT_RX_GROUP(mip), B_TRUE); mac_set_group_state(tgrp, next_state); + /* - * If the destination group is reserved, setup the SRSs etc. + * If the destination group is reserved, then setup the SRSes. + * Otherwise make sure to use SW classification. */ if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) { mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK); @@ -6885,6 +7337,7 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } else { mac_rx_switch_grp_to_sw(tgrp); } + return (0); } @@ -6915,6 +7368,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) boolean_t isprimary; isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; + /* * When we come here for a VLAN on the primary (dladm create-vlan), * we need to pair it along with the primary (to keep it consistent @@ -6996,8 +7450,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) if (grp->mrg_state == MAC_GROUP_STATE_RESERVED && candidate_grp == NULL) { gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); gmrp = MCIP_RESOURCE_PROPS(gclient); if (gclient->mci_share == NULL && (gmrp->mrp_mask & MRP_TX_RINGS) == 0 && @@ -7034,13 +7487,14 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) */ if (need_exclgrp && candidate_grp != NULL) { /* - * Switch the MAC client from the candidate group - * to the default group. + * Switch the MAC client from the candidate + * group to the default group. We know the + * candidate_grp came from a reserved group + * and thus only has one client. 
*/ grp = candidate_grp; gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); mac_tx_client_quiesce((mac_client_handle_t)gclient); mac_tx_switch_group(gclient, grp, defgrp); mac_tx_client_restart((mac_client_handle_t)gclient); @@ -7208,7 +7662,7 @@ mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, */ mac_group_remove_client(fgrp, mcip); mac_tx_dismantle_soft_rings(fgrp, flent); - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { gmcip = mgcp->mgc_client; @@ -7454,7 +7908,7 @@ mac_no_active(mac_handle_t mh) * changes and update the mac_resource_props_t for the VLAN's client. * We need to do this since we don't support setting these properties * on the primary's VLAN clients, but the VLAN clients have to - * follow the primary w.r.t the rings property; + * follow the primary w.r.t the rings property. */ void mac_set_prim_vlan_rings(mac_impl_t *mip, mac_resource_props_t *mrp) @@ -7603,13 +8057,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - ASSERT(mcip != NULL); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_RX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_RX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7624,12 +8075,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_TX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_TX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7899,10 +8348,10 @@ mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg) * Set effective rings property. This could be called from datapath_setup/ * datapath_teardown or set-linkprop. * If the group is reserved we just go ahead and set the effective rings. - * Additionally, for TX this could mean the default group has lost/gained + * Additionally, for TX this could mean the default group has lost/gained * some rings, so if the default group is reserved, we need to adjust the * effective rings for the default group clients. For RX, if we are working - * with the non-default group, we just need * to reset the effective props + * with the non-default group, we just need to reset the effective props * for the default group clients. */ void @@ -8032,6 +8481,7 @@ mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw) * the first non-primary. */ ASSERT(mip->mi_nactiveclients == 2); + /* * OK, now we have the primary that needs to be relocated. */ diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 1ff33c3578..3b674be1d0 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/types.h> @@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((src_mcip != NULL) && - (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) - break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index efc3cb7f07..de5ef6121f 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -114,6 +114,7 @@ #include <sys/stream.h> #include <sys/strsun.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/modhash.h> #include <sys/mac_impl.h> @@ -865,9 +866,12 @@ mac_unicast_update_client_flow(mac_client_impl_t *mcip) mac_protect_update_mac_token(mcip); /* - * A MAC client could have one MAC address but multiple - * VLANs. In that case update the flow entries corresponding - * to all VLANs of the MAC client. + * When there are multiple VLANs sharing the same MAC address, + * each gets its own MAC client, except when running on sun4v + * vsw. In that case the mci_flent_list is used to place + * multiple VLAN flows on one MAC client. If we ever get rid + * of vsw then this code can go, but until then we need to + * update all flow entries. */ for (flent = mcip->mci_flent_list; flent != NULL; flent = flent->fe_client_next) { @@ -1025,7 +1029,7 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) return (0); } - if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) { + if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) { i_mac_perim_exit(mip); return (EBUSY); } @@ -1040,9 +1044,9 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) mac_capab_aggr_t aggr_cap; /* - * If the mac is an aggregation, other than the unicast + * If the MAC is an aggregation, other than the unicast * addresses programming, aggr must be informed about this - * primary unicst address change to change its mac address + * primary unicst address change to change its MAC address * policy to be user-specified. 
*/ ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED); @@ -1353,7 +1357,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_fn = mac_rx_def; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1374,7 +1378,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_state_flags |= MCIS_IS_AGGR_PORT; if (mip->mi_state_flags & MIS_IS_AGGR) - mcip->mci_state_flags |= MCIS_IS_AGGR; + mcip->mci_state_flags |= MCIS_IS_AGGR_CLIENT; if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) { datalink_id_t linkid; @@ -1433,6 +1437,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_flent = flent; FLOW_MARK(flent, FE_MC_NO_DATAPATH); flent->fe_mcip = mcip; + /* * Place initial creation reference on the flow. This reference * is released in the corresponding delete action viz. @@ -1539,7 +1544,8 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags) } /* - * Set the rx bypass receive callback. + * Set the Rx bypass receive callback and return B_TRUE. Return + * B_FALSE if it's not possible to enable bypass. */ boolean_t mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) @@ -1550,11 +1556,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * If the mac_client is a VLAN, we should not do DLS bypass and - * instead let the packets come up via mac_rx_deliver so the vlan - * header can be stripped. + * If the client has more than one VLAN then process packets + * through DLS. This should happen only when sun4v vsw is on + * the scene. */ - if (mcip->mci_nvids > 0) + if (mcip->mci_nvids > 1) return (B_FALSE); /* @@ -1608,8 +1614,8 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) i_mac_perim_exit(mip); /* - * If we're changing the rx function on the primary mac of a vnic, - * make sure any secondary macs on the vnic are updated as well. + * If we're changing the Rx function on the primary MAC of a VNIC, + * make sure any secondary addresses on the VNIC are updated as well. */ if (umip != NULL) { ASSERT((umip->mi_state_flags & MIS_IS_VNIC) != 0); @@ -1623,7 +1629,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_pkt_drop, NULL); + mac_rx_set(mch, mac_rx_def, NULL); } void @@ -1787,6 +1793,14 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } /* Let check if we can give this an excl group */ } else if (group == defgrp) { + /* + * If multiple clients share an + * address then they must stay on the + * default group. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (0); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); /* Couldn't give it a group, that's fine */ @@ -1809,6 +1823,16 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } if (group == defgrp && ((mrp->mrp_nrxrings > 0) || unspec)) { + /* + * We are requesting Rx rings. Try to reserve + * a non-default group. + * + * If multiple clients share an address then + * they must stay on the default group. 
+ */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); if (ngrp == NULL) return (ENOSPC); @@ -2166,10 +2190,10 @@ mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr, flent_flags = FLOW_VNIC_MAC; /* - * For the first flow we use the mac client's name - mci_name, for - * subsequent ones we just create a name with the vid. This is + * For the first flow we use the MAC client's name - mci_name, for + * subsequent ones we just create a name with the VID. This is * so that we can add these flows to the same flow table. This is - * fine as the flow name (except for the one with the mac client's + * fine as the flow name (except for the one with the MAC client's * name) is not visible. When the first flow is removed, we just replace * its fdesc with another from the list, so we will still retain the * flent with the MAC client's flow name. @@ -2327,6 +2351,7 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, * The unicast MAC address must have been added successfully. */ ASSERT(mcip->mci_unicast != NULL); + /* * Push down the sub-flows that were defined on this link * hitherto. The flows are added to the active flow table @@ -2338,15 +2363,23 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, ASSERT(!no_unicast); /* - * A unicast flow already exists for that MAC client, - * this flow must be the same mac address but with - * different VID. It has been checked by mac_addr_in_use(). + * A unicast flow already exists for that MAC client + * so this flow must be the same MAC address but with + * a different VID. It has been checked by + * mac_addr_in_use(). + * + * We will use the SRS etc. from the initial + * mci_flent. We don't need to create a kstat for + * this, as except for the fdesc, everything will be + * used from the first flent. * - * We will use the SRS etc. from the mci_flent. Note that - * We don't need to create kstat for this as except for - * the fdesc, everything will be used from in the 1st flent. + * The only time we should see multiple flents on the + * same MAC client is on the sun4v vsw. If we removed + * that code we should be able to remove the entire + * notion of multiple flents on a MAC client (this + * doesn't affect sub/user flows because they have + * their own list unrelated to mci_flent_list). */ - if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) { err = EINVAL; goto bail; @@ -2406,7 +2439,17 @@ done_setup: if (flent->fe_rx_ring_group != NULL) mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); FLOW_UNMARK(flent, FE_INCIPIENT); - FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + + /* + * If this is an aggr port client, don't enable the flow's + * datapath at this stage. Otherwise, bcast traffic could + * arrive while the aggr port is in the process of + * initializing. Instead, the flow's datapath is started later + * when mac_client_set_flow_cb() is called. + */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); return (0); bail: @@ -2475,8 +2518,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, boolean_t is_vnic_primary = (flags & MAC_UNICAST_VNIC_PRIMARY); - /* when VID is non-zero, the underlying MAC can not be VNIC */ - ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0))); + /* + * When the VID is non-zero the underlying MAC cannot be a + * VNIC. 
I.e., dladm create-vlan cannot take a VNIC as + * argument, only the primary MAC client. + */ + ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != VLAN_ID_NONE))); /* * Can't unicast add if the client asked only for minimal datapath @@ -2489,18 +2536,19 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, * Check for an attempted use of the current Port VLAN ID, if enabled. * No client may use it. */ - if (mip->mi_pvid != 0 && vid == mip->mi_pvid) + if (mip->mi_pvid != VLAN_ID_NONE && vid == mip->mi_pvid) return (EBUSY); /* * Check whether it's the primary client and flag it. */ - if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0) + if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && + vid == VLAN_ID_NONE) mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY; /* * is_vnic_primary is true when we come here as a VLAN VNIC - * which uses the primary mac client's address but with a non-zero + * which uses the primary MAC client's address but with a non-zero * VID. In this case the MAC address is not specified by an upper * MAC client. */ @@ -2552,7 +2600,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Create a handle for vid 0. */ - ASSERT(vid == 0); + ASSERT(vid == VLAN_ID_NONE); muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); muip->mui_vid = vid; *mah = (mac_unicast_handle_t)muip; @@ -2572,7 +2620,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, } /* - * If this is a VNIC/VLAN, disable softmac fast-path. + * If this is a VNIC/VLAN, disable softmac fast-path. This is + * only relevant to legacy devices which use softmac to + * interface with GLDv3. */ if (mcip->mci_state_flags & MCIS_IS_VNIC) { err = mac_fastpath_disable((mac_handle_t)mip); @@ -2620,9 +2670,11 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, (void) mac_client_set_resources(mch, mrp); } else if (mcip->mci_state_flags & MCIS_IS_VNIC) { /* - * This is a primary VLAN client, we don't support - * specifying rings property for this as it inherits the - * rings property from its MAC. + * This is a VLAN client sharing the address of the + * primary MAC client; i.e., one created via dladm + * create-vlan. We don't support specifying ring + * properties for this type of client as it inherits + * these from the primary MAC client. */ if (is_vnic_primary) { mac_resource_props_t *vmrp; @@ -2681,7 +2733,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Set the flags here so that if this is a passive client, we - * can return and set it when we call mac_client_datapath_setup + * can return and set it when we call mac_client_datapath_setup * when this becomes the active client. If we defer to using these * flags to mac_client_datapath_setup, then for a passive client, * we'd have to store the flags somewhere (probably fe_flags) @@ -2918,7 +2970,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -2984,14 +3036,14 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_enter(mip); if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) { /* - * Called made by the upper MAC client of a VNIC. + * Call made by the upper MAC client of a VNIC. 
* There's nothing much to do, the unicast address will * be removed by the VNIC driver when the VNIC is deleted, * but let's ensure that all our transmit is done before * the client does a mac_client_stop lest it trigger an * assert in the driver. */ - ASSERT(muip->mui_vid == 0); + ASSERT(muip->mui_vid == VLAN_ID_NONE); mac_tx_client_flush(mcip); @@ -3055,6 +3107,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_exit(mip); return (0); } + /* * Remove the VID from the list of client's VIDs. */ @@ -3081,7 +3134,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) * flows. */ flent = mac_client_get_flow(mcip, muip); - ASSERT(flent != NULL); + VERIFY3P(flent, !=, NULL); /* * The first one is disappearing, need to make sure @@ -3109,6 +3162,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) FLOW_FINAL_REFRELE(flent); ASSERT(!(mcip->mci_state_flags & MCIS_EXCLUSIVE)); + /* * Enable fastpath if this is a VNIC or a VLAN. */ @@ -3122,7 +3176,8 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) mui_vid = muip->mui_vid; mac_client_datapath_teardown(mch, muip, flent); - if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && mui_vid == 0) { + if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && + mui_vid == VLAN_ID_NONE) { mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; } else { i_mac_perim_exit(mip); @@ -3264,6 +3319,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mac_cb_info_t *mcbi; int rc; + if ((flags & MAC_PROMISC_FLAGS_NO_COPY) && + (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) { + return (EINVAL); + } + i_mac_perim_enter(mip); if ((rc = mac_start((mac_handle_t)mip)) != 0) { @@ -3310,6 +3370,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_strip_vlan_tag = ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0); + mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -3530,6 +3591,13 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); + /* + * There's a chance this primary client might be part + * of a bridge and the packet forwarded to a local + * receiver -- mark the packet accordingly. + */ + DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC; + MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); if (mp_chain == NULL) { cookie = NULL; @@ -3943,33 +4011,63 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). */ - static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, boolean_t loopback) { - mblk_t *mp_copy, *mp_next; + mblk_t *mp_next; + boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0; + + if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || + (mpip->mpi_do_fixups && local)) { + mblk_t *mp_copy; - if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { mp_copy = copymsg(mp); if (mp_copy == NULL) return; + /* + * The consumer has requested we emulate HW offloads + * for host-local packets. + */ + if (mpip->mpi_do_fixups && local) { + /* + * Remember that copymsg() doesn't copy + * b_next, so we are only passing a single + * packet to mac_hw_emul(). Also keep in mind + * that mp_copy will become an mblk chain if + * the argument is an LSO message. 
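/*
 * Illustrative sketch (not part of this change): the per-callback
 * decision made in mac_promisc_dispatch_one() above.  A private copy
 * is needed whenever the consumer did not opt out of copying, wants
 * the VLAN tag stripped, or asked for HW-offload emulation on packets
 * that originated on this host.  Names are invented stand-ins.
 */
#include <stdbool.h>

typedef struct {
	bool	sp_no_copy;	/* consumer allows sharing the message */
	bool	sp_strip_vlan;	/* consumer wants the VLAN tag removed */
	bool	sp_do_fixups;	/* consumer wants offloads emulated */
} sketch_promisc_t;

static bool
sketch_needs_copy(const sketch_promisc_t *p, bool local_packet)
{
	return (!p->sp_no_copy || p->sp_strip_vlan ||
	    (p->sp_do_fixups && local_packet));
}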
+ */ + mac_hw_emul(&mp_copy, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp_copy == NULL) + return; + } + if (mpip->mpi_strip_vlan_tag) { mp_copy = mac_strip_vlan_tag_chain(mp_copy); if (mp_copy == NULL) return; } - mp_next = NULL; - } else { - mp_copy = mp; - mp_next = mp->b_next; + + /* + * There is code upstack that can't deal with message + * chains. + */ + for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { + mp_next = tmp->b_next; + tmp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); + } + + return; } - mp_copy->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); - if (mp_copy == mp) - mp->b_next = mp_next; + mp_next = mp->b_next; + mp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); + mp->b_next = mp_next; } /* @@ -4051,8 +4149,9 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) + is_mcast) { mac_promisc_dispatch_one(mpip, mp, is_sender); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4152,16 +4251,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. */ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -4180,8 +4278,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t *)mh; /* - * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, - * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. + * Some capabilities are restricted when there are more than one active + * clients on the MAC resource. The ones noted below are safe, + * independent of that count. */ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4189,6 +4288,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: @@ -4340,7 +4440,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 8a8f9a9269..70585df698 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -604,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. 
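/*
 * Illustrative sketch (not part of this change): the chain-splitting
 * loop used above.  Offload emulation can turn one LSO message into a
 * chain, but promiscuous consumers expect single packets, so the
 * chain is walked and each node unlinked before delivery.  The list
 * type here is a stand-in for an mblk chain linked by b_next.
 */
#include <stddef.h>

typedef struct sketch_pkt {
	struct sketch_pkt	*sp_next;
	/* packet payload omitted */
} sketch_pkt_t;

static void
sketch_deliver_each(sketch_pkt_t *chain, void (*deliver)(sketch_pkt_t *))
{
	sketch_pkt_t *next;

	for (sketch_pkt_t *p = chain; p != NULL; p = next) {
		next = p->sp_next;
		p->sp_next = NULL;	/* hand off exactly one packet */
		deliver(p);
	}
}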
*/ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -611,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -638,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? */ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified @@ -1186,7 +1197,7 @@ mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs) mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **) kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP, KM_SLEEP); - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; tx->st_soft_rings = (mac_soft_ring_t **) @@ -1595,13 +1606,13 @@ mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp) /* * When the first sub-flow is added to a link, we disable polling on the - * link and also modify the entry point to mac_rx_srs_subflow_process. + * link and also modify the entry point to mac_rx_srs_subflow_process(). * (polling is disabled because with the subflow added, accounting * for polling needs additional logic, it is assumed that when a subflow is * added, we can take some hit as a result of disabling polling rather than * adding more complexity - if this becomes a perf. issue we need to * re-rvaluate this logic). When the last subflow is removed, we turn back - * polling and also reset the entry point to mac_rx_srs_process. + * polling and also reset the entry point to mac_rx_srs_process(). * * In the future if there are multiple SRS, we can simply * take one and give it to the flow rather than disabling polling and @@ -1646,7 +1657,7 @@ mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable) * Change the S/W classifier so that we can land in the * correct processing function with correct argument. * If all subflows have been removed we can revert to - * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process. + * mac_rx_srs_process(), else we need mac_rx_srs_subflow_process(). */ mutex_enter(&flent->fe_lock); flent->fe_cb_fn = (flow_fn_t)rx_func; @@ -1977,8 +1988,6 @@ no_softrings: } /* - * mac_fanout_setup: - * * Calls mac_srs_fanout_init() or modify() depending upon whether * the SRS is getting initialized or re-initialized. */ @@ -1991,14 +2000,14 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, int i, rx_srs_cnt; ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + /* - * This is an aggregation port. Fanout will be setup - * over the aggregation itself. + * Aggr ports do not have SRSes. This function should never be + * called on an aggr port. */ - if (mcip->mci_state_flags & MCIS_EXCLUSIVE) - return; - + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); mac_rx_srs = flent->fe_rx_srs[0]; + /* * Set up the fanout on the tx side only once, with the * first rx SRS. 
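/*
 * Illustrative sketch (not part of this change): the shape of the
 * soft ring count decision above.  Without a bandwidth cap, the count
 * is driven by link speed, with an extra case so a VNIC sitting on an
 * overlay is sized like a 10 Gb link instead of falling through to
 * the CPU-speed heuristic.  The constants are placeholders, not the
 * kernel's tunables, and the bandwidth- and CPU-based paths are
 * elided.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_RINGS_DEFAULT	2	/* placeholder */
#define	SKETCH_RINGS_10G	8	/* placeholder */

static int
sketch_soft_ring_count(bool bw_capped, uint64_t ifspeed_mbps,
    bool vnic_on_overlay, bool fanout_doubling)
{
	if (bw_capped) {
		/* Bandwidth-based sizing elided in this sketch. */
		return (SKETCH_RINGS_DEFAULT);
	}

	if (ifspeed_mbps > 1000) {
		int srings = SKETCH_RINGS_10G;

		/* Soft ring fanout may double the count on fast links. */
		return (fanout_doubling ? srings * 2 : srings);
	}

	if (vnic_on_overlay)
		return (SKETCH_RINGS_10G);

	/* Otherwise fall back to the CPU-speed based heuristic. */
	return (SKETCH_RINGS_DEFAULT);
}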
The CPU binding, fanout, and bandwidth @@ -2054,8 +2063,6 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * mac_srs_create: - * * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side * processing is created. @@ -2187,7 +2194,7 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, * find nothing plus we have an existing backlog * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll * the H/W for packets anymore (let the polling thread go to sleep). - * 5) Once the backlog is relived (packets are processed) we reenable + * 5) Once the backlog is relieved (packets are processed) we reenable * polling (by signalling the poll thread) only when the backlog * dips below sr_poll_thres. * 6) sr_hiwat is used exclusively when we are not polling capable @@ -2210,7 +2217,14 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, mac_srs->srs_state |= SRS_SOFTRING_QUEUE; } - mac_srs->srs_worker = thread_create(NULL, 0, + /* + * Create the srs_worker with twice the stack of a normal kernel thread + * to reduce the likelihood of stack overflows in receive-side + * processing. (The larger stacks are not the only precaution taken + * against stack overflows; see the use of the MAC_RX_SRS_TOODEEP + * macro for details.) + */ + mac_srs->srs_worker = thread_create(NULL, default_stksize << 1, mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri); if (is_tx_srs) { @@ -2258,8 +2272,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, /* * Some drivers require serialization and don't send * packet chains in interrupt context. For such - * drivers, we should always queue in soft ring - * so that we get a chance to switch into a polling + * drivers, we should always queue in the soft ring + * so that we get a chance to switch into polling * mode under backlog. */ ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring); @@ -2357,6 +2371,10 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_rx_srs_group_setup(mcip, flent, link_type); mac_tx_srs_group_setup(mcip, flent, link_type); + /* Aggr ports don't have SRSes; thus there is no soft ring fanout. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + return; + pool_lock(); cpupart = mac_pset_find(mrp, &use_default); mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), @@ -2366,9 +2384,11 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there - * is a group associated with this MAC client, set up SRSs for individual - * h/w rings. + * Set up the Rx SRSes. If there is no group associated with the + * client, then only setup SW classification. If the client has + * exlusive (MAC_GROUP_STATE_RESERVED) use of the group, then create an + * SRS for each HW ring. If the client is sharing a group, then make + * sure to teardown the HW SRSes. */ void mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, @@ -2379,13 +2399,37 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_ring_t *ring; uint32_t fanout_type; mac_group_t *rx_group = flent->fe_rx_ring_group; + boolean_t no_unicast; + + /* + * If this is an an aggr port, then don't setup Rx SRS and Rx + * soft rings as they won't be used. However, we still need to + * start the rings to receive data on them. 
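The doubled worker stack goes hand in hand with the stack-depth guard mentioned in the new comment. A hedged sketch of how such a guard is typically consulted on the receive path; only the MAC_RX_SRS_TOODEEP() macro (redefined later in mac_sched.c) comes from the patch, the surrounding control flow is illustrative:

	/*
	 * Sketch only: when little stack remains on the current (often
	 * interrupt) thread, queue the chain and let srs_worker -- which
	 * runs on the larger stack created above -- drain it later.
	 */
	if (MAC_RX_SRS_TOODEEP()) {
		/* append the chain to the SRS queue and wake srs_worker */
	} else {
		/* enough stack remains; process the chain in situ */
	}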
+ */ + if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) { + if (rx_group == NULL) + return; + + for (ring = rx_group->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) + (void) mac_start_ring(ring); + } + + return; + } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); fanout_type = mac_find_fanout(flent, link_type); + no_unicast = (mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR) != 0; - /* Create the SRS for S/W classification if none exists */ + /* Create the SRS for SW classification if none exists */ if (flent->fe_rx_srs[0] == NULL) { ASSERT(flent->fe_rx_srs_cnt == 0); - /* Setup the Rx SRS */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, NULL); mutex_enter(&flent->fe_lock); @@ -2397,15 +2441,17 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, if (rx_group == NULL) return; + /* - * fanout for default SRS is done when default SRS are created - * above. As each ring is added to the group, we setup the - * SRS and fanout to it. + * If the group is marked RESERVED then setup an SRS and + * fanout for each HW ring. */ switch (rx_group->mrg_state) { case MAC_GROUP_STATE_RESERVED: for (ring = rx_group->mrg_rings; ring != NULL; ring = ring->mr_next) { + uint16_t vid = i_mac_flow_vid(mcip->mci_flent); + switch (ring->mr_state) { case MR_INUSE: case MR_FREE: @@ -2415,20 +2461,23 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (void) mac_start_ring(ring); /* - * Since the group is exclusively ours create - * an SRS for this ring to allow the - * individual SRS to dynamically poll the - * ring. Do this only if the client is not - * a VLAN MAC client, since for VLAN we do - * s/w classification for the VID check, and - * if it has a unicast address. + * If a client requires SW VLAN + * filtering or has no unicast address + * then we don't create any HW ring + * SRSes. */ - if ((mcip->mci_state_flags & - MCIS_NO_UNICAST_ADDR) || - i_mac_flow_vid(mcip->mci_flent) != - VLAN_ID_NONE) { + if ((!MAC_GROUP_HW_VLAN(rx_group) && + vid != VLAN_ID_NONE) || no_unicast) break; - } + + /* + * When a client has exclusive use of + * a group, and that group's traffic + * is fully HW classified, we create + * an SRS for each HW ring in order to + * make use of dynamic polling of said + * HW rings. + */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, ring); @@ -2444,14 +2493,9 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, break; case MAC_GROUP_STATE_SHARED: /* - * Set all rings of this group to software classified. - * - * If the group is current RESERVED, the existing mac - * client (the only client on this group) is using - * this group exclusively. In that case we need to - * disable polling on the rings of the group (if it - * was enabled), and free the SRS associated with the - * rings. + * When a group is shared by multiple clients, we must + * use SW classifiction to ensure packets are + * delivered to the correct client. */ mac_rx_switch_grp_to_sw(rx_group); break; @@ -2468,46 +2512,49 @@ void mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) { - int cnt; - int ringcnt; - mac_ring_t *ring; - mac_group_t *grp; - /* - * If we are opened exclusively (like aggr does for aggr_ports), - * don't set up Tx SRS and Tx soft rings as they won't be used. - * The same thing has to be done for Rx side also. 
See bug: - * 6880080 + * If this is an exclusive client (e.g. an aggr port), then + * don't setup Tx SRS and Tx soft rings as they won't be used. + * However, we still need to start the rings to send data + * across them. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) { - /* - * If we have rings, start them here. - */ - if (flent->fe_tx_ring_group == NULL) - return; + mac_ring_t *ring; + mac_group_t *grp; + grp = (mac_group_t *)flent->fe_tx_ring_group; - ringcnt = grp->mrg_cur_count; - ring = grp->mrg_rings; - for (cnt = 0; cnt < ringcnt; cnt++) { - if (ring->mr_state != MR_INUSE) { + + if (grp == NULL) + return; + + for (ring = grp->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) (void) mac_start_ring(ring); - } - ring = ring->mr_next; } + return; } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); + if (flent->fe_tx_srs == NULL) { (void) mac_srs_create(mcip, flent, SRST_TX | link_type, NULL, mcip, NULL, NULL); } + mac_tx_srs_setup(mcip, flent); } /* - * Remove all the RX SRSs. If we want to remove only the SRSs associated - * with h/w rings, leave the S/W SRS alone. This is used when we want to - * move the MAC client from one group to another, so we need to teardown - * on the h/w SRSs. + * Teardown all the Rx SRSes. Unless hwonly is set, then only teardown + * the Rx HW SRSes and leave the SW SRS alone. The hwonly flag is set + * when we wish to move a MAC client from one group to another. In + * that case, we need to release the current HW SRSes but keep the SW + * SRS for continued traffic classifiction. */ void mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) @@ -2525,8 +2572,16 @@ mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) flent->fe_rx_srs[i] = NULL; flent->fe_rx_srs_cnt--; } - ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1); - ASSERT(hwonly || flent->fe_rx_srs_cnt == 0); + + /* + * If we are only tearing down the HW SRSes then there must be + * one SRS left for SW classification. Otherwise we are tearing + * down both HW and SW and there should be no SRSes left. + */ + if (hwonly) + VERIFY3S(flent->fe_rx_srs_cnt, ==, 1); + else + VERIFY3S(flent->fe_rx_srs_cnt, ==, 0); } /* @@ -2828,6 +2883,7 @@ mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip, * even if this is the only client in the default group, we will * leave group as shared). */ + int mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) @@ -2839,6 +2895,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_group_t *default_tgroup; int err; uint8_t *mac_addr; + uint16_t vid; mac_group_state_t next_state; mac_client_impl_t *group_only_mcip; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); @@ -2850,6 +2907,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t no_unicast; boolean_t isprimary = flent->fe_type & FLOW_PRIMARY_MAC; mac_client_impl_t *reloc_pmcip = NULL; + boolean_t use_hw; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -2881,15 +2939,19 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)); /* - * By default we have given the primary all the rings - * i.e. the default group. Let's see if the primary - * needs to be relocated so that the addition of this - * client doesn't impact the primary's performance, - * i.e. if the primary is in the default group and - * we add this client, the primary will lose polling. 
- * We do this only for NICs supporting dynamic ring - * grouping and only when this is the first client - * after the primary (i.e. nactiveclients is 2) + * All the rings initially belong to the default group + * under dynamic grouping. The primary client uses the + * default group when it is the only client. The + * default group is also used as the destination for + * all multicast and broadcast traffic of all clients. + * Therefore, the primary client loses its ability to + * poll the softrings on addition of a second client. + * To avoid a performance penalty, MAC will move the + * primary client to a dedicated group when it can. + * + * When using static grouping, the primary client + * begins life on a non-default group. There is + * no moving needed upon addition of a second client. */ if (!isprimary && mip->mi_nactiveclients == 2 && (group_only_mcip = mac_primary_client_handle(mip)) != @@ -2897,6 +2959,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, reloc_pmcip = mac_check_primary_relocation( group_only_mcip, rxhw); } + /* * Check to see if we can get an exclusive group for * this mac address or if there already exists a @@ -2910,6 +2973,26 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } else if (rgroup == NULL) { rgroup = default_rgroup; } + + /* + * If we are adding a second client to a + * non-default group then we need to move the + * existing client to the default group and + * add the new client to the default group as + * well. + */ + if (rgroup != default_rgroup && + rgroup->mrg_state == MAC_GROUP_STATE_RESERVED) { + group_only_mcip = MAC_GROUP_ONLY_CLIENT(rgroup); + err = mac_rx_switch_group(group_only_mcip, rgroup, + default_rgroup); + + if (err != 0) + goto setup_failed; + + rgroup = default_rgroup; + } + /* * Check to see if we can get an exclusive group for * this mac client. If no groups are available, use @@ -2941,14 +3024,17 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, rgroup->mrg_cur_count); } } + flent->fe_rx_ring_group = rgroup; /* - * Add the client to the group. This could cause - * either this group to move to the shared state or - * cause the default group to move to the shared state. - * The actions on this group are done here, while the - * actions on the default group are postponed to - * the end of this function. + * Add the client to the group and update the + * group's state. If rgroup != default_group + * then the rgroup should only ever have one + * client and be in the RESERVED state. But no + * matter what, the default_rgroup will enter + * the SHARED state since it has to receive + * all broadcast and multicast traffic. This + * case is handled later in the function. */ mac_group_add_client(rgroup, mcip); next_state = mac_group_next_state(rgroup, @@ -2973,28 +3059,37 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, &group_only_mcip, default_tgroup, B_FALSE); tgroup->mrg_state = next_state; } - /* - * Setup the Rx and Tx SRSes. If we got a pristine group - * exclusively above, mac_srs_group_setup would simply create - * the required SRSes. If we ended up sharing a previously - * reserved group, mac_srs_group_setup would also dismantle the - * SRSes of the previously exclusive group - */ - mac_srs_group_setup(mcip, flent, link_type); /* We are setting up minimal datapath only */ - if (no_unicast) + if (no_unicast) { + mac_srs_group_setup(mcip, flent, link_type); break; - /* Program the S/W Classifer */ + } + + /* Program software classification. 
*/ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) goto setup_failed; - /* Program the H/W Classifier */ - if ((err = mac_add_macaddr(mip, rgroup, mac_addr, - (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0) + /* Program hardware classification. */ + vid = i_mac_flow_vid(flent); + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid, use_hw); + + if (err != 0) goto setup_failed; + mcip->mci_unicast = mac_find_macaddr(mip, mac_addr); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); + + /* + * Setup the Rx and Tx SRSes. If the client has a + * reserved group, then mac_srs_group_setup() creates + * the required SRSes for the HW rings. If we have a + * shared group, mac_srs_group_setup() dismantles the + * HW SRSes of the previously exclusive group. + */ + mac_srs_group_setup(mcip, flent, link_type); + /* (Re)init the v6 token & local addr used by link protection */ mac_protect_update_mac_token(mcip); break; @@ -3038,17 +3133,23 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, ASSERT(default_rgroup->mrg_state == MAC_GROUP_STATE_SHARED); } + /* - * If we get an exclusive group for a VLAN MAC client we - * need to take the s/w path to make the additional check for - * the vid. Disable polling and set it to s/w classification. - * Similarly for clients that don't have a unicast address. + * A VLAN MAC client on a reserved group still + * requires SW classification if the MAC doesn't + * provide VLAN HW filtering. + * + * Clients with no unicast address also require SW + * classification. */ if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED && - (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) { + ((!MAC_GROUP_HW_VLAN(rgroup) && vid != VLAN_ID_NONE) || + no_unicast)) { mac_rx_switch_grp_to_sw(rgroup); } + } + mac_set_rings_effective(mcip); return (0); @@ -3074,6 +3175,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t check_default_group = B_FALSE; mac_group_state_t next_state; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + uint16_t vid; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -3086,16 +3188,24 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, case SRST_LINK: /* Stop sending packets */ mac_tx_client_block(mcip); + group = flent->fe_rx_ring_group; + vid = i_mac_flow_vid(flent); - /* Stop the packets coming from the H/W */ + /* + * Stop the packet flow from the hardware by disabling + * any hardware filters assigned to this client. + */ if (mcip->mci_unicast != NULL) { int err; - err = mac_remove_macaddr(mcip->mci_unicast); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { - cmn_err(CE_WARN, "%s: failed to remove a MAC" - " address because of error 0x%x", + cmn_err(CE_WARN, "%s: failed to remove a MAC HW" + " filters because of error 0x%x", mip->mi_name, err); } + mcip->mci_unicast = NULL; } @@ -3103,12 +3213,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); mac_flow_wait(flent, FLOW_DRIVER_UPCALL); - /* Now quiesce and destroy all SRS and soft rings */ + /* Quiesce and destroy all the SRSes. */ mac_rx_srs_group_teardown(flent, B_FALSE); mac_tx_srs_group_teardown(mcip, flent, SRST_LINK); - ASSERT((mcip->mci_flent == flent) && - (flent->fe_next == NULL)); + ASSERT3P(mcip->mci_flent, ==, flent); + ASSERT3P(flent->fe_next, ==, NULL); /* * Release our hold on the group as well. 
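Because the unicast programming interface changes shape here (the VID now travels with the address), a condensed sketch of the add/remove pairing may help. It only rearranges calls that appear in the surrounding hunks, with declarations and error handling elided:

	/*
	 * Sketch only: setup programs the address and, when the group can
	 * filter VLANs in hardware, the VID as well; teardown must pass
	 * the same VID back so the matching filter is released.
	 */
	vid = i_mac_flow_vid(flent);
	use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;

	if ((err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid,
	    use_hw)) != 0)
		goto setup_failed;

	mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);

	/* ... and on datapath teardown ... */
	err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid);
	mcip->mci_unicast = NULL;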
We need @@ -3116,17 +3226,17 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * left who can use it exclusively. Also, if we * were the last client, release the group. */ - group = flent->fe_rx_ring_group; default_group = MAC_DEFAULT_RX_GROUP(mip); if (group != NULL) { mac_group_remove_client(group, mcip); next_state = mac_group_next_state(group, &grp_only_mcip, default_group, B_TRUE); + if (next_state == MAC_GROUP_STATE_RESERVED) { /* * Only one client left on this RX group. */ - ASSERT(grp_only_mcip != NULL); + VERIFY3P(grp_only_mcip, !=, NULL); mac_set_group_state(group, MAC_GROUP_STATE_RESERVED); group_only_flent = grp_only_mcip->mci_flent; @@ -3151,7 +3261,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * to see if the primary client can get * exclusive access to the default group. */ - ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + VERIFY3P(group, !=, MAC_DEFAULT_RX_GROUP(mip)); if (mrp->mrp_mask & MRP_RX_RINGS) { MAC_RX_GRP_RELEASED(mip); if (mip->mi_rx_group_type == @@ -3165,7 +3275,8 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, MAC_GROUP_STATE_REGISTERED); check_default_group = B_TRUE; } else { - ASSERT(next_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(next_state, ==, + MAC_GROUP_STATE_SHARED); mac_set_group_state(group, MAC_GROUP_STATE_SHARED); mac_rx_group_unmark(group, MR_CONDEMNED); @@ -3254,12 +3365,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, */ if (check_default_group) { default_group = MAC_DEFAULT_RX_GROUP(mip); - ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(default_group->mrg_state, ==, MAC_GROUP_STATE_SHARED); next_state = mac_group_next_state(default_group, &grp_only_mcip, default_group, B_TRUE); if (next_state == MAC_GROUP_STATE_RESERVED) { - ASSERT(grp_only_mcip != NULL && - mip->mi_nactiveclients == 1); + VERIFY3P(grp_only_mcip, !=, NULL); + VERIFY3U(mip->mi_nactiveclients, ==, 1); mac_set_group_state(default_group, MAC_GROUP_STATE_RESERVED); mac_rx_srs_group_setup(grp_only_mcip, @@ -3385,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_drop_chain(mac_srs->srs_first, "SRS free"); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); @@ -3783,7 +3894,7 @@ mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) * is also stored in st_soft_rings[] array. That entry should * be removed. */ - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring); @@ -3812,7 +3923,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent) boolean_t is_aggr; uint_t ring_info = 0; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0; + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) != 0; grp = flent->fe_tx_ring_group; if (grp == NULL) { ring = (mac_ring_t *)mip->mi_default_tx_ring; @@ -3956,8 +4067,8 @@ mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart) } /* - * Walk through the list of mac clients for the MAC. - * For each active mac client, recompute the number of soft rings + * Walk through the list of MAC clients for the MAC. 
+ * For each active MAC client, recompute the number of soft rings * associated with every client, only if current speed is different * from the speed that was previously used for soft ring computation. * If the cable is disconnected whlie the NIC is started, we would get @@ -3980,6 +4091,10 @@ mac_fanout_recompute(mac_impl_t *mip) for (mcip = mip->mi_clients_list; mcip != NULL; mcip = mcip->mci_client_next) { + /* Aggr port clients don't have SRSes. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + continue; + if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 || !MCIP_DATAPATH_SETUP(mcip)) continue; @@ -3992,6 +4107,7 @@ mac_fanout_recompute(mac_impl_t *mip) mac_set_pool_effective(use_default, cpupart, mrp, emrp); pool_unlock(); } + i_mac_perim_exit(mip); } diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index aa4985fe4c..62612122d6 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/strsun.h> @@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index da83dc643e..ee493bbca1 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. */ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
@@ -209,7 +209,7 @@ typedef struct slaac_addr { } slaac_addr_t; static void start_txn_cleanup_timer(mac_client_impl_t *); -static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t); +static boolean_t dynamic_method_set(mac_protect_t *, uint32_t); #define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++ @@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end) if (get_dhcpv4_info(ipha, end, &dh4) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV4_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4)) return (B_FALSE); if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 || @@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end) if (get_dhcpv6_info(ip6h, end, &dh6) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV6_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6)) return (B_FALSE); /* @@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end, { struct nd_opt_hdr *opt; int len, optlen; + mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect; + + if (!dynamic_method_set(protect, MPT_DYN_SLAAC)) + return; if (ip6h->ip6_hlim != 255) { DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim); @@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, if (*addr == INADDR_ANY) return (B_TRUE); + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i]; @@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } } - return (protect->mp_ipaddrcnt == 0 ? 
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE); + + if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) { + return (check_dhcpv4_dyn_ip(mcip, *addr)); + } + + return (B_FALSE); } static boolean_t ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, in6_addr_t *addr) { + boolean_t slaac_enabled, dhcpv6_enabled; uint_t i; /* @@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr))) return (B_TRUE); - + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i]; @@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } - if (protect->mp_ipaddrcnt == 0) { - return (check_slaac_ip(mcip, addr) || - check_dhcpv6_dyn_ip(mcip, addr)); - } else { - return (B_FALSE); - } + slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC); + if (slaac_enabled && check_slaac_ip(mcip, addr)) + return (B_TRUE); + + dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6); + if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr)) + return (B_TRUE); + + return (B_FALSE); } /* @@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen) bcmp(dcid->dc_id, cid, cidlen) == 0) return (B_TRUE); } + + DTRACE_PROBE3(missing__cid, mac_protect_t *, p, + uchar_t *, cid, uint_t, cidlen); return (B_FALSE); } @@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p, bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) { return (B_FALSE); } + + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0) cidlen = optlen; @@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p, mtype == DHCPV6_MSG_RECONFIGURE) return (B_TRUE); + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL, DHCPV6_OPT_CLIENTID, &cidlen); if (d6o == NULL || (uchar_t *)d6o + cidlen > end) @@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect, return (0); fail: - /* increment dhcpnospoof stat here */ freemsg(nmp); return (err); } @@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp) if ((err = validate_cids(p)) != 0) return (err); + if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE && + p->mp_allcids != MPT_RESET) { + return (EINVAL); + } + return (0); } @@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr) cp->mp_cidcnt = 0; } } + if (np->mp_allcids == MPT_RESET) { + cp->mp_allcids = MPT_FALSE; + } else if (np->mp_allcids != 0) { + cp->mp_allcids = MPT_TRUE; + } + if (np->mp_dynamic == MPT_RESET) { + cp->mp_dynamic = 0; + } else if (np->mp_dynamic != 0) { + cp->mp_dynamic = np->mp_dynamic; + } } void @@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip) } static boolean_t -allowed_ips_set(mac_resource_props_t *mrp, uint32_t af) +dynamic_method_set(mac_protect_t *mpt, uint32_t method) +{ + if (mpt->mp_dynamic != 0) { + return ((mpt->mp_dynamic & method) != 0); + } else { + return (mpt->mp_ipaddrcnt == 0); + } +} + +boolean_t +mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6, + in6_addr_t *v6addr) { - int i; + mac_perim_handle_t 
perim; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; - for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) { - if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af) - return (B_TRUE); + mac_perim_enter_by_mh(mh, &perim); + + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + mac_protect_t *p; + boolean_t allowed; + + ASSERT(mrp != NULL); + + p = &mrp->mrp_protect; + + /* If mac protection/ipnospoof isn't enabled, return true */ + if ((mrp->mrp_mask & MRP_PROTECT) == 0 || + (p->mp_types & MPT_IPNOSPOOF) == 0) { + allowed = B_TRUE; + goto done; } - return (B_FALSE); + + if (isv6) { + allowed = ipnospoof_check_v6(mcip, p, v6addr); + } else { + in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr)); + allowed = ipnospoof_check_v4(mcip, p, v4addr); + } + +done: + mac_perim_exit(perim); + return (allowed); } mac_protect_t * diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 07201afdec..cb1a76aef6 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -56,6 +57,7 @@ #include <sys/sdt.h> #include <sys/pattr.h> #include <sys/strsun.h> +#include <sys/vlan.h> /* * MAC Provider Interface. @@ -351,6 +353,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); mac_transceiver_init(mip); @@ -697,7 +702,6 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) mac_ring_t *mr = (mac_ring_t *)mrh; mac_soft_ring_set_t *mac_srs; mblk_t *bp = mp_chain; - boolean_t hw_classified = B_FALSE; /* * If there are any promiscuous mode callbacks defined for @@ -709,7 +713,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) if (mr != NULL) { /* * If the SRS teardown has started, just return. The 'mr' - * continues to be valid until the driver unregisters the mac. + * continues to be valid until the driver unregisters the MAC. * Hardware classified packets will not make their way up * beyond this point once the teardown has started. The driver * is never passed a pointer to a flow entry or SRS or any @@ -722,11 +726,25 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) freemsgchain(mp_chain); return; } - if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { - hw_classified = B_TRUE; + + /* + * The ring is in passthru mode; pass the chain up to + * the pseudo ring. + */ + if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, + B_FALSE); + MR_REFRELE(mr); + return; } - mutex_exit(&mr->mr_lock); + + /* + * The passthru callback should only be set when in + * MAC_PASSTHRU_CLASSIFIER mode. + */ + ASSERT3P(mr->mr_pt_fn, ==, NULL); /* * We check if an SRS is controlling this ring. @@ -734,19 +752,24 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * routine otherwise we need to go through mac_rx_classify * to reach the right place. 
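mac_protect_check_addr() is a new entry point, so a usage sketch may be helpful. The wrapper name source_addr_allowed() is hypothetical; the convention of carrying an IPv4 address in the v4 part of an in6_addr_t follows from the implementation above, and a client with no protection configured simply gets B_TRUE back:

	/*
	 * Sketch only: ask whether ipnospoof would permit a given IPv4
	 * source address for this client.
	 */
	static boolean_t
	source_addr_allowed(mac_client_handle_t mch, ipaddr_t v4src)
	{
		in6_addr_t v6;

		IN6_IPADDR_TO_V4MAPPED(v4src, &v6);
		return (mac_protect_check_addr(mch, B_FALSE, &v6));
	}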
*/ - if (hw_classified) { + if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { + MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + ASSERT3P(mr->mr_srs, !=, NULL); mac_srs = mr->mr_srs; + /* - * This is supposed to be the fast path. - * All packets received though here were steered by - * the hardware classifier, and share the same - * MAC header info. + * This is the fast path. All packets received + * on this ring are hardware classified and + * share the same MAC header info. */ mac_srs->srs_rx.sr_lower_proc(mh, (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); MR_REFRELE(mr); return; } + + mutex_exit(&mr->mr_lock); /* We'll fall through to software classification */ } else { flow_entry_t *flent; @@ -1472,7 +1495,8 @@ mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm) pr->pr_flags |= MAC_PROP_INFO_PERM; } -void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, +void +mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff, uint32_t *end, uint32_t *value, uint32_t *flags_ptr) { uint32_t flags; @@ -1497,8 +1521,9 @@ void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, *flags_ptr = flags; } -void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, - uint32_t end, uint32_t value, uint32_t flags) +void +mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end, + uint32_t value, uint32_t flags) { ASSERT(DB_TYPE(mp) == M_DATA); @@ -1510,6 +1535,31 @@ void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, } void +mac_hcksum_clone(const mblk_t *src, mblk_t *dst) +{ + ASSERT3U(DB_TYPE(src), ==, M_DATA); + ASSERT3U(DB_TYPE(dst), ==, M_DATA); + + /* + * Do these assignments unconditionally, rather than only when + * flags is non-zero. This protects a situation where zeroed + * hcksum data does not make the jump onto an mblk_t with + * stale data in those fields. It's important to copy all + * possible flags (HCK_* as well as HW_*) and not just the + * checksum specific flags. Dropping flags during a clone + * could result in dropped packets. If the caller has good + * reason to drop those flags then it should do it manually, + * after the clone. + */ + DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); + DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); + DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); + DB_CKSUMEND(dst) = DB_CKSUMEND(src); + DB_CKSUM16(dst) = DB_CKSUM16(src); + DB_LSOMSS(dst) = DB_LSOMSS(src); +} + +void mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) { ASSERT(DB_TYPE(mp) == M_DATA); diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index d91d07db09..e42cbd1320 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -300,9 +300,8 @@ * * Otherwise, all fanout is performed by software. MAC divides incoming frames * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and - * everything else. Note, VLAN tagged traffic is considered other, regardless of - * the interior EtherType. Regardless of the type of fanout, these three - * categories or buckets are always used. + * everything else. Regardless of the type of fanout, these three categories + * or buckets are always used. 
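mac_hcksum_clone() is new with this change, so a short usage sketch follows. The copymsg() pairing mirrors what mac_copymsg_cksum() does later in mac_util.c; the wrapper name dup_with_cksum() is illustrative:

	/*
	 * Sketch only: duplicate a packet and carry its offload metadata
	 * across, so a downstream consumer does not see stale or missing
	 * checksum and LSO flags on the copy.
	 */
	static mblk_t *
	dup_with_cksum(mblk_t *mp)
	{
		mblk_t *dup;

		if ((dup = copymsg(mp)) == NULL)
			return (NULL);

		mac_hcksum_clone(mp, dup);
		return (dup);
	}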
* * The difference between protocol level fanout and full software ring protocol * fanout is the number of software rings that end up getting created. The @@ -969,6 +968,7 @@ #include <sys/types.h> #include <sys/callb.h> +#include <sys/pattr.h> #include <sys/sdt.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -1328,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + mac_drop_chain(mp_chain, "SRS Tx max queue"); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1347,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ - mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ + mac_drop_pkt((chain), (s)); \ /* increment freed stats */ \ - mac_srs->srs_tx.st_stat.mts_sdrops++; \ - cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_stat.mts_sdrops++; \ + (cookie) = (mac_tx_cookie_t)(srs); \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -1367,11 +1367,11 @@ int mac_srs_worker_wakeup_ticks = 0; * can occur in situ (in the interrupt thread) or if it should be left to a * worker thread. Note that the constant used to make this determination is * not entirely made-up, and is a result of some emprical validation. That - * said, the constant is left as a static variable to allow it to be + * said, the constant is left as a global variable to allow it to be * dynamically tuned in the field if and as needed. */ -static uintptr_t mac_rx_srs_stack_needed = 10240; -static uint_t mac_rx_srs_stack_toodeep; +uintptr_t mac_rx_srs_stack_needed = 14336; +uint_t mac_rx_srs_stack_toodeep; #ifndef STACK_GROWTH_DOWN #error Downward stack growth assumed. @@ -1379,7 +1379,7 @@ static uint_t mac_rx_srs_stack_toodeep; #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \ - ++mac_rx_srs_stack_toodeep) + (++mac_rx_srs_stack_toodeep || (mac_rx_srs_stack_toodeep = 1))) /* @@ -1475,16 +1475,15 @@ enum pkt_type { #define PORTS_SIZE 4 /* - * mac_rx_srs_proto_fanout - * - * This routine delivers packets destined to an SRS into one of the + * This routine delivers packets destined for an SRS into one of the * protocol soft rings. * - * Given a chain of packets we need to split it up into multiple sub chains - * destined into TCP, UDP or OTH soft ring. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. */ static void mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) @@ -1523,9 +1522,9 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. 
SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the + * Some clients, such as non-ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the * MCIS_RX_BYPASS_DISABLE flag. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && @@ -1537,10 +1536,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) bzero(sz, MAX_SR_TYPES * sizeof (size_t)); /* - * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for - * performance reasons), we need to separate out v4_tcp, v4_udp - * and the rest goes in other. + * We have a chain from SRS that we need to split across the + * soft rings. The squeues for the TCP and IPv4 SAPs use their + * own soft rings to allow polling from the squeue. The rest of + * the packets are delivered on the OTH soft ring which cannot + * be polled. */ while (head != NULL) { mp = head; @@ -1568,9 +1568,14 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -1635,7 +1640,6 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * performance and may bypass DLS. All other cases go through * the 'OTH' type path without DLS bypass. */ - ipha = (ipha_t *)(mp->b_rptr + hdrsize); if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) type = OTH; @@ -1647,11 +1651,13 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) } ASSERT(type == UNDEF); + /* - * We look for at least 4 bytes past the IP header to get - * the port information. If we get an IP fragment, we don't - * have the port information, and we use just the protocol - * information. + * Determine the type from the IP protocol value. If + * classified as TCP or UDP, then update the read + * pointer to the beginning of the IP header. + * Otherwise leave the message as is for further + * processing by DLS. */ switch (ipha->ipha_protocol) { case IPPROTO_TCP: @@ -1695,11 +1701,10 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) int fanout_unaligned = 0; /* - * mac_rx_srs_long_fanout - * - * The fanout routine for VLANs, and for anything else that isn't performing - * explicit dls bypass. Returns -1 on an error (drop the packet due to a - * malformed packet), 0 on success, with values written in *indx and *type. + * The fanout routine for any clients with DLS bypass disabled or for + * traffic classified as "other". Returns -1 on an error (drop the + * packet due to a malformed packet), 0 on success, with values + * written in *indx and *type. */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, @@ -1865,16 +1870,15 @@ src_dst_based_fanout: } /* - * mac_rx_srs_fanout - * - * This routine delivers packets destined to an SRS into a soft ring member + * This routine delivers packets destined for an SRS into a soft ring member * of the set. 
* - * Given a chain of packets we need to split it up into multiple sub chains - * destined for one of the TCP, UDP or OTH soft rings. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. * * Note: * Since we know what is the maximum fanout possible, we create a 2D array @@ -1935,10 +1939,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the - * MCIS_RX_BYPASS_DISABLE flag. + * Some clients, such as non Ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the + * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by + * sun4v vsw currently. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); @@ -1960,7 +1965,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) /* * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for + * Since squeues for TCP & IPv4 SAP poll their soft rings (for * performance reasons), we need to separate out v4_tcp, v4_udp * and the rest goes in other. */ @@ -1990,9 +1995,14 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -2032,7 +2042,6 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) continue; } - /* * If we are using the default Rx ring where H/W or S/W * classification has not happened, we need to verify if @@ -2621,7 +2630,6 @@ again: mac_srs->srs_state |= (SRS_PROC|proc_type); - /* * mcip is NULL for broadcast and multicast flows. The promisc * callbacks for broadcast and multicast packets are delivered from @@ -2641,10 +2649,8 @@ again: } /* - * Check if SRS itself is doing the processing - * This direct path does not apply when subflows are present. In this - * case, packets need to be dispatched to a soft ring according to the - * flow's bandwidth and other resources contraints. + * Check if SRS itself is doing the processing. This direct + * path applies only when subflows are present. 
*/ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { mac_direct_rx_t proc; @@ -2888,7 +2894,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_pkt_drop(NULL, NULL, head, B_FALSE); + mac_drop_chain(head, "Rx no bandwidth"); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3270,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * mac_rx_srs_process - * - * Receive side routine called from the interrupt path. + * MAC SRS receive side routine. If the data is coming from the + * network (i.e. from a NIC) then this is called in interrupt context. + * If the data is coming from a local sender (e.g. mac_tx_send() or + * bridge_forward()) then this is not called in interrupt context. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3332,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Rx no bandwidth"); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3454,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no desc"); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3517,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx SRS hiwat"); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3890,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no bandwidth"); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4336,6 +4346,14 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, obytes += (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); + /* + * Mark all packets as local so that a + * receiver can determine if a packet arrived + * from a local source or from the network. + * This allows some consumers to avoid + * unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; CHECK_VID_AND_ADD_TAG(mp); MAC_TX(mip, ring, mp, src_mcip); @@ -4368,7 +4386,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; - mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4378,49 +4395,25 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, CHECK_VID_AND_ADD_TAG(mp); /* + * Mark all packets as local so that a receiver can + * determine if a packet arrived from a local source + * or from the network. This allows some consumers to + * avoid unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; + + /* * Find the destination. 
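The HW_LOCAL_MAC marking above has a consumer side that is easy to get wrong: mac_hcksum_get() masks the flag out (as the mac_util.c changes below note), so an interested receiver must read the dblk flags directly. A hedged sketch; the surrounding condition and its placement are illustrative:

	/*
	 * Sketch only: a receiver that wants to skip checksum
	 * verification for traffic that never left the machine tests the
	 * dblk flags directly rather than via mac_hcksum_get().
	 */
	if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0) {
		/* locally sourced frame; verification can be elided */
	}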
*/ dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { - size_t hdrsize; - int err = 0; - - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - struct ether_vlan_header *evhp = - (struct ether_vlan_header *)mp->b_rptr; - - if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) - hdrsize = sizeof (*evhp); - else - hdrsize = sizeof (struct ether_header); - } else { - mac_header_info_t mhi; - - err = mac_header_info((mac_handle_t)mip, - mp, &mhi); - if (err == 0) - hdrsize = mhi.mhi_hdrsize; - } - /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. - * Make sure the packet size is within the - * allowed size. If not drop the packet and - * move to next packet. */ - if (err != 0 || - (pkt_size - hdrsize) > mip->mi_sdu_max) { - oerrors++; - DTRACE_PROBE2(loopback__drop, size_t, pkt_size, - mblk_t *, mp); - freemsg(mp); - mp = next; - FLOW_REFRELE(dst_flow_ent); - continue; - } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4438,6 +4431,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. */ boolean_t do_switch; + mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4453,19 +4447,19 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. */ - if (mip->mi_promisc_list != NULL) + if (mip->mi_promisc_list != NULL) { mac_promisc_dispatch(mip, mp, src_mcip); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - if ((mp1 = mac_fix_cksum(mp)) != NULL) { - (dst_flow_ent->fe_cb_fn)( - dst_flow_ent->fe_cb_arg1, - dst_flow_ent->fe_cb_arg2, - mp1, do_switch); - } + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp, do_switch); + } FLOW_REFRELE(dst_flow_ent); } else { @@ -4656,6 +4650,9 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, * the packet to the promiscuous listeners of the * client, since they expect to see the whole * frame including the VLAN headers. + * + * The MCIS_STRIP_DISABLE is only issued when sun4v + * vsw is in play. */ mp_chain = mac_strip_vlan_tag_chain(mp_chain); } @@ -4664,13 +4661,11 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, } /* - * mac_rx_soft_ring_process - * - * process a chain for a given soft ring. The number of packets queued - * in the SRS and its associated soft rings (including this one) is - * very small (tracked by srs_poll_pkt_cnt), then allow the entering - * thread (interrupt or poll thread) to do inline processing. This - * helps keep the latency down under low load. + * Process a chain for a given soft ring. If the number of packets + * queued in the SRS and its associated soft rings (including this + * one) is very small (tracked by srs_poll_pkt_cnt) then allow the + * entering thread (interrupt or poll thread) to process the chain + * inline. This is meant to reduce latency under low load. * * The proc and arg for each mblk is already stored in the mblk in * appropriate places. @@ -4729,13 +4724,13 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement - * srs_size and count so it the SRS can have a - * accurate idea of what is the real data - * queued between SRS and its soft rings. 
We - * decrement the counters only when the packet - * gets processed by both SRS and the soft ring. + * If we have an SRS performing bandwidth + * control then we need to decrement the size + * and count so the SRS has an accurate count + * of the data queued between the SRS and its + * soft rings. We decrement the counters only + * when the packet is processed by both the + * SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -4751,8 +4746,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, if ((ringp->s_ring_first == NULL) || (ringp->s_ring_state & S_RING_BLANK)) { /* - * We processed inline our packet and - * nothing new has arrived or our + * We processed a single packet inline + * and nothing new has arrived or our * receiver doesn't want to receive * any packets. We are done. */ @@ -4821,7 +4816,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Tx softring no desc"); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4865,8 +4860,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_pkt_drop(NULL, NULL, - mp_chain, B_FALSE); + mac_drop_chain(mp_chain, + "Tx softring max queue"); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index dc8cfdd145..4655631dc1 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -207,7 +207,7 @@ mac_soft_ring_create(int id, clock_t wait, uint16_t type, ringp->s_ring_tx_hiwat = (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ? mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat; - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[ @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + mac_drop_chain(softring->s_ring_first, "softring free"); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); @@ -339,15 +339,14 @@ mac_soft_ring_fire(void *arg) } /* - * mac_rx_soft_ring_drain + * Drain the soft ring pointed to by ringp. * - * Called when worker thread model (ST_RING_WORKER_ONLY) of processing - * incoming packets is used. s_ring_first contain the queued packets. - * s_ring_rx_func contains the upper level (client) routine where the - * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the - * cookie meant for the client. + * o s_ring_first: pointer to the queued packet chain. + * + * o s_ring_rx_func: pointer to to the client's Rx routine. + * + * o s_ring_rx_{arg1,arg2}: opaque values specific to the client. 
*/ -/* ARGSUSED */ static void mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) { @@ -392,13 +391,12 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) (*proc)(arg1, arg2, mp, NULL); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement its - * srs_size so it can have a accurate idea of - * what is the real data queued between SRS and - * its soft rings. We decrement the size for a - * packet only when it gets processed by both - * SRS and the soft ring. + * If we have an SRS performing bandwidth control, then + * we need to decrement the size and count so the SRS + * has an accurate measure of the data queued between + * the SRS and its soft rings. We decrement the + * counters only when the packet is processed by both + * the SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -414,12 +412,10 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) } /* - * mac_soft_ring_worker - * * The soft ring worker routine to process any queued packets. In - * normal case, the worker thread is bound to a CPU. It the soft - * ring is dealing with TCP packets, then the worker thread will - * be bound to the same CPU as the TCP squeue. + * normal case, the worker thread is bound to a CPU. If the soft ring + * handles TCP packets then the worker thread is bound to the same CPU + * as the TCP squeue. */ static void mac_soft_ring_worker(mac_soft_ring_t *ringp) @@ -604,7 +600,7 @@ mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1) mac_soft_ring_t *softring = arg; mac_soft_ring_set_t *srs; - ASSERT(rx_func != NULL); + VERIFY3P(rx_func, !=, NULL); mutex_enter(&softring->s_ring_lock); softring->s_ring_rx_func = rx_func; diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..2244218f20 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
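The VERIFY3P() above is one instance of a pattern applied throughout this patch (the SRS count checks in mac_datapath_setup.c are another). A small illustration of the difference, reusing a check from earlier in the diff; no new interfaces are assumed:

	/*
	 * ASSERT() compiles away on non-DEBUG kernels, while the
	 * VERIFY3*() forms are always evaluated and print both operands
	 * on failure, which is why several datapath invariants are
	 * upgraded here.
	 */
	ASSERT(flent->fe_rx_srs_cnt == 1);	/* DEBUG builds only */
	VERIFY3S(flent->fe_rx_srs_cnt, ==, 1);	/* always enforced */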
*/ /* @@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; @@ -1003,6 +1004,7 @@ void mac_ring_stat_create(mac_ring_t *ring) { mac_impl_t *mip = ring->mr_mip; + mac_group_t *grp = (mac_group_t *)ring->mr_gh; char statname[MAXNAMELEN]; char modname[MAXNAMELEN]; @@ -1014,8 +1016,8 @@ mac_ring_stat_create(mac_ring_t *ring) switch (ring->mr_type) { case MAC_RING_TYPE_RX: - (void) snprintf(statname, sizeof (statname), "mac_rx_ring%d", - ring->mr_index); + (void) snprintf(statname, sizeof (statname), + "mac_rx_ring_%d_%d", grp->mrg_index, ring->mr_index); i_mac_rx_ring_stat_create(ring, modname, statname); break; diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index e83af37f16..111b323f81 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -47,6 +48,74 @@ #include <inet/sadb.h> #include <inet/ipsecesp.h> #include <inet/ipsecah.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> + +/* + * The next two functions are used for dropping packets or chains of + * packets, respectively. We could use one function for both but + * separating the use cases allows us to specify intent and prevent + * dropping more data than intended. + * + * The purpose of these functions is to aid the debugging effort, + * especially in production. Rather than use freemsg()/freemsgchain(), + * it's preferable to use these functions when dropping a packet in + * the MAC layer. These functions should only be used during + * unexpected conditions. That is, any time a packet is dropped + * outside of the regular, successful datapath. Consolidating all + * drops on these functions allows the user to trace one location and + * determine why the packet was dropped based on the msg. It also + * allows the user to inspect the packet before it is freed. Finally, + * it allows the user to avoid tracing freemsg()/freemsgchain() thus + * keeping the hot path running as efficiently as possible. + * + * NOTE: At this time not all MAC drops are aggregated on these + * functions; but that is the plan. This comment should be erased once + * completed. + */ + +/*PRINTFLIKE2*/ +void +mac_drop_pkt(mblk_t *mp, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + ASSERT3P(mp->b_next, ==, NULL); + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); +} + +/*PRINTFLIKE2*/ +void +mac_drop_chain(mblk_t *chain, const char *fmt, ...) 
+{ + va_list adx; + char msg[128]; + char *msgp = msg; + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + /* + * We could use freemsgchain() for the actual freeing but + * since we are already walking the chain to fire the dtrace + * probe we might as well free the msg here too. + */ + for (mblk_t *mp = chain, *next; mp != NULL; ) { + next = mp->b_next; + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); + mp = next; + } +} /* * Copy an mblk, preserving its hardware checksum flags. @@ -55,15 +124,12 @@ static mblk_t * mac_copymsg_cksum(mblk_t *mp) { mblk_t *mp1; - uint32_t start, stuff, end, value, flags; mp1 = copymsg(mp); if (mp1 == NULL) return (NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, - flags, KM_NOSLEEP); + mac_hcksum_clone(mp, mp1); return (mp1); } @@ -91,224 +157,1125 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback traffic - * between MAC clients. - * The function handles a NULL mblk chain passed as argument. + * Perform software checksum on a single message, if needed. The + * emulation performed is determined by an intersection of the mblk's + * flags and the emul flags requested. The emul flags are documented + * in mac.h. */ -mblk_t * -mac_fix_cksum(mblk_t *mp_chain) +static mblk_t * +mac_sw_cksum(mblk_t *mp, mac_emul_t emul) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; + mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; + uint16_t len; + uint32_t offset; + uint16_t etype; + struct ether_header *ehp; + ipha_t *ipha; + uint8_t proto; + const char *err = ""; - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + /* + * This function should only be called from mac_hw_emul() + * which handles mblk chains and the shared ref case. + */ + ASSERT3P(mp->b_next, ==, NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, - &flags); - if (flags == 0) - continue; + mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because + * we don't want to mask-out the HW_LOCAL_MAC flag. + */ + flags = DB_CKSUMFLAGS(mp); + + /* Why call this if checksum emulation isn't needed? */ + ASSERT3U(flags & (HCK_FLAGS), !=, 0); + + /* + * Ethernet, and optionally VLAN header. mac_hw_emul() has + * already verified we have enough data to read the L2 header. + */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); + } else { + etype = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); + } + + /* + * If this packet isn't IPv4, then leave it alone. We still + * need to add IPv6 support and we don't want to affect non-IP + * traffic like ARP. + */ + if (etype != ETHERTYPE_IP) + return (mp); + + ASSERT3U(MBLKL(mp), >=, offset); + + /* + * If the first mblk of this packet contains only the ethernet + * header, skip past it for now. Packets with their data + * contained in only a single mblk can then use the fastpaths + * tuned to that possibility. 
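+ *
+ * For example (an illustrative mblk layout, not a new
+ * requirement): a packet arranged as
+ *
+ *	[ ether header ] -> [ IP + TCP + payload ]
+ *
+ * has MBLKL(mp) == offset, so we remember the header mblk in
+ * skipped_hdr and continue on b_cont; a packet whose first
+ * mblk already holds ether + IP + TCP needs no adjustment.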
+ */ + if (MBLKL(mp) == offset) { + offset -= MBLKL(mp); + /* This is guaranteed by mac_hw_emul(). */ + ASSERT3P(mp->b_cont, !=, NULL); + skipped_hdr = mp; + mp = mp->b_cont; + } + + /* + * Both full and partial checksum rely on finding the IP + * header in the current mblk. Our native TCP stack honors + * this assumption but it's prudent to guard our future + * clients that might not honor this contract. + */ + ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t)); + if (MBLKL(mp) < (offset + sizeof (ipha_t))) { + err = "mblk doesn't contain IP header"; + goto bail; + } + + /* + * We are about to modify the header mblk; make sure we are + * modifying our own copy. The code that follows assumes that + * the IP/ULP headers exist in this mblk (and drops the + * message if they don't). + */ + if (DB_REF(mp) > 1) { + mblk_t *tmp = copyb(mp); + + if (tmp == NULL) { + err = "copyb failed"; + goto bail; + } + + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + skipped_hdr->b_cont = tmp; + } + + tmp->b_cont = mp->b_cont; + freeb(mp); + mp = tmp; + } + + ipha = (ipha_t *)(mp->b_rptr + offset); + + /* + * This code assumes a "simple" IP header (20 bytes, no + * options). IPv4 options are mostly a historic artifact. The + * one slight exception is Router Alert, but we don't expect + * such a packet to land here. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); + if (ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION) { + err = "not simple IP header"; + goto bail; + } + + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) { + err = "mblk doesn't contain TCP header"; + goto bail; + } + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (udpha_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (udpha_t))) { + err = "mblk doesn't contain UDP header"; + goto bail; + } + break; + + default: + err = "unexpected protocol"; + goto bail; + } + + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + + /* Get a pointer to the ULP checksum. */ + switch (proto) { + case IPPROTO_TCP: + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + + case IPPROTO_UDP: + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + } + + /* Pseudo-header checksum. */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet + * needs to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. 
+ * Out of paranoia, and for the sake of correctness, + * we won't calulate the IP header checksum if it's + * already populated. While unlikely, it's possible to + * write code that might end up calling mac_sw_cksum() + * twice on the same mblk (performing both LSO and + * checksum emualtion in a single mblk chain loop -- + * the LSO emulation inserts a new chain into the + * existing chain and then the loop iterates back over + * the new segments and emulates the checksum a second + * time). Normally this wouldn't be a problem, because + * the HCK_*_OK flags are supposed to indicate that we + * don't need to do peform the work. But + * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the + * same value; so we cannot use these flags to + * determine if the IP header checksum has already + * been calculated or not. Luckily, if IP requests + * HCK_IPV4_HDRCKSUM, then the IP header checksum will + * be zero. So this test works just as well as + * checking the flag. However, in the future, we + * should fix the HCK_* flags. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; + if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) && + ipha->ipha_hdr_checksum == 0) { + ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; } + } + + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + ASSERT3S(end, >, start); + cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial); + *up = cksum != 0 ? cksum : ~cksum; + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + + mac_hcksum_set(mp, start, stuff, end, value, flags); + + /* Don't forget to reattach the header. */ + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * Ethernet, and optionally VLAN header. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that + * both the first mblk _and_ the first data bearing + * mblk possess the HCKSUM information. Consumers like + * IP will end up discarding the ether_header mblk, so + * for now, it is important that the data be available + * in both places. */ - /* LINTED: improper alignment cast */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; + } - ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); - /* LINTED: improper alignment cast */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - sap = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); + return (mp); + +bail: + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + mp = skipped_hdr; + } + + mac_drop_pkt(mp, err); + return (NULL); +} + +/* + * Build a single data segment from an LSO packet. The mblk chain + * returned, seg_head, represents the data segment and is always + * exactly seg_len bytes long. 
The lso_mp and offset input/output + * parameters track our position in the LSO packet. This function + * exists solely as a helper to mac_sw_lso(). + * + * Case A + * + * The current lso_mp is larger than the requested seg_len. The + * beginning of seg_head may start at the beginning of lso_mp or + * offset into it. In either case, a single mblk is returned, and + * *offset is updated to reflect our new position in the current + * lso_mp. + * + * +----------------------------+ + * | in *lso_mp / out *lso_mp | + * +----------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = 0 out *offset = seg_len + * + * |------ seg_len ----| + * + * + * +------------------------------+ + * | in *lso_mp / out *lso_mp | + * +------------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = N out *offset = N + seg_len + * + * |------ seg_len ----| + * + * + * + * Case B + * + * The requested seg_len consumes exactly the rest of the lso_mp. + * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. + * The seg_head may start at the beginning of the lso_mp or at some + * offset into it. In either case we return a single mblk, reset + * *offset to zero, and walk to the next lso_mp. + * + * +------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = 0 + * + * |------ seg_len ----| + * + * + * + * +----------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +----------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = N + * + * |------ seg_len ----| + * + * + * Case C + * + * The requested seg_len is greater than the current lso_mp. In + * this case we must consume LSO mblks until we have enough data to + * satisfy either case (A) or (B) above. We will return multiple + * mblks linked via b_cont, offset will be set based on the cases + * above, and lso_mp will walk forward at least one mblk, but maybe + * more. + * + * N.B. This digram is not exhaustive. The seg_head may start on + * the beginning of an lso_mp. The seg_tail may end exactly on the + * boundary of an lso_mp. And there may be two (in this case the + * middle block wouldn't exist), three, or more mblks in the + * seg_head chain. This is meant as one example of what might + * happen. The main thing to remember is that the seg_tail mblk + * must be one of case (A) or (B) above. 
+ * + * +------------------+ +----------------+ +------------------+ + * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | + * +------------------+ +----------------+ +------------------+ + * ^ ^ ^ ^ ^ ^ + * | | | | | | + * | | | | | | + * | | | | | | + * | | | | | | + * +------------+ +----------------+ +------------+ + * | seg_head |--->| |--->| seg_tail | + * +------------+ +----------------+ +------------+ + * ^ ^ + * | | + * in *offset = N out *offset = MBLKL(seg_tail) + * + * |------------------- seg_len -------------------| + * + */ +static mblk_t * +build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) +{ + mblk_t *seg_head, *seg_tail, *seg_mp; + + ASSERT3P(*lso_mp, !=, NULL); + ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); + + seg_mp = dupb(*lso_mp); + if (seg_mp == NULL) + return (NULL); + + seg_head = seg_mp; + seg_tail = seg_mp; + + /* Continue where we left off from in the lso_mp. */ + seg_mp->b_rptr += *offset; + +last_mblk: + /* Case (A) */ + if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { + *offset += seg_len; + seg_mp->b_wptr = seg_mp->b_rptr + seg_len; + return (seg_head); + } + + /* Case (B) */ + if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + return (seg_head); + } + + /* Case (C) */ + ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); + + /* + * The current LSO mblk doesn't have enough data to satisfy + * seg_len -- continue peeling off LSO mblks to build the new + * segment message. If allocation fails we free the previously + * allocated segment mblks and return NULL. + */ + while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { + ASSERT3U(MBLKL(seg_mp), <=, seg_len); + seg_len -= MBLKL(seg_mp); + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + seg_mp = dupb(*lso_mp); + + if (seg_mp == NULL) { + freemsgchain(seg_head); + return (NULL); + } + + seg_tail->b_cont = seg_mp; + seg_tail = seg_mp; + } + + /* + * We've walked enough LSO mblks that we can now satisfy the + * remaining seg_len. At this point we need to jump back to + * determine if we have arrived at case (A) or (B). + */ + + /* Just to be paranoid that we didn't underflow. */ + ASSERT3U(seg_len, <, IP_MAXPACKET); + ASSERT3U(seg_len, >, 0); + goto last_mblk; +} + +/* + * Perform software segmentation of a single LSO message. Take an LSO + * message as input and return head/tail pointers as output. This + * function should not be invoked directly but instead through + * mac_hw_emul(). + * + * The resulting chain is comprised of multiple (nsegs) MSS sized + * segments. Each segment will consist of two or more mblks joined by + * b_cont: a header and one or more data mblks. The header mblk is + * allocated anew for each message. The first segment's header is used + * as a template for the rest with adjustments made for things such as + * ID, sequence, length, TCP flags, etc. The data mblks reference into + * the existing LSO mblk (passed in as omp) by way of dupb(). Their + * b_rptr/b_wptr values are adjusted to reference only the fraction of + * the LSO message they are responsible for. At the successful + * completion of this function the original mblk (omp) is freed, + * leaving the newely created segment chain as the only remaining + * reference to the data. 
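+ *
+ * As a purely illustrative example (the numbers are assumed,
+ * not taken from this code): a 9000 byte TCP payload with an
+ * MSS of 1460 yields nsegs = ((9000 - 1) / 1460) + 1 = 7
+ * segments: six full 1460 byte segments followed by one final
+ * 240 byte segment.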
+ */ +static void +mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, + uint_t *count) +{ + uint32_t ocsum_flags, ocsum_start, ocsum_stuff; + uint32_t mss; + uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; + uint32_t oleft; + uint_t nsegs, seg; + int len; + + struct ether_vlan_header *oevh; + const ipha_t *oiph; + const tcph_t *otcph; + ipha_t *niph; + tcph_t *ntcph; + uint16_t ip_id; + uint32_t tcp_seq, tcp_sum, otcp_sum; + + uint32_t offset; + mblk_t *odatamp; + mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; + mblk_t *tmptail; + + ASSERT3P(head, !=, NULL); + ASSERT3P(tail, !=, NULL); + ASSERT3P(count, !=, NULL); + ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); + + /* Assume we are dealing with a single LSO message. */ + ASSERT3P(omp->b_next, ==, NULL); + + /* + * XXX: This is a hack to deal with mac_add_vlan_tag(). + * + * When VLANs are in play, mac_add_vlan_tag() creates a new + * mblk with just the ether_vlan_header and tacks it onto the + * front of 'omp'. This breaks the assumptions made below; + * namely that the TCP/IP headers are in the first mblk. In + * this case, since we already have to pay the cost of LSO + * emulation, we simply pull up everything. While this might + * seem irksome, keep in mind this will only apply in a couple + * of scenarios: a) an LSO-capable VLAN client sending to a + * non-LSO-capable client over the "MAC/bridge loopback" + * datapath or b) an LSO-capable VLAN client is sending to a + * client that, for whatever reason, doesn't have DLS-bypass + * enabled. Finally, we have to check for both a tagged and + * untagged sized mblk depending on if the mblk came via + * mac_promisc_dispatch() or mac_rx_deliver(). + * + * In the future, two things should be done: + * + * 1. This function should make use of some yet to be + * implemented "mblk helpers". These helper functions would + * perform all the b_cont walking for us and guarantee safe + * access to the mblk data. + * + * 2. We should add some slop to the mblks so that + * mac_add_vlan_tag() can just edit the first mblk instead + * of allocating on the hot path. + */ + if (MBLKL(omp) == sizeof (struct ether_vlan_header) || + MBLKL(omp) == sizeof (struct ether_header)) { + mblk_t *tmp = msgpullup(omp, -1); + + if (tmp == NULL) { + mac_drop_pkt(omp, "failed to pull up"); + goto fail; + } + + mac_hcksum_clone(omp, tmp); + freemsg(omp); + omp = tmp; + } + + mss = DB_LSOMSS(omp); + ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + + sizeof (struct ether_vlan_header)); + opktlen = msgsize(omp); + + /* + * First, get references to the IP and TCP headers and + * determine the total TCP length (header + data). + * + * Thanks to mac_hw_emul() we know that the first mblk must + * contain (at minimum) the full L2 header. However, this + * function assumes more than that. It assumes the L2/L3/L4 + * headers are all contained in the first mblk of a message + * (i.e., no b_cont walking for headers). While this is a + * current reality (our native TCP stack and viona both + * enforce this) things may become more nuanced in the future + * (e.g. when introducing encap support or adding new + * clients). For now we guard against this case by dropping + * the packet. 
+ */ + oevh = (struct ether_vlan_header *)omp->b_rptr; + if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) + oehlen = sizeof (struct ether_vlan_header); + else + oehlen = sizeof (struct ether_header); + + ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { + mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); + goto fail; + } + + oiph = (ipha_t *)(omp->b_rptr + oehlen); + oiphlen = IPH_HDR_LENGTH(oiph); + otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); + otcphlen = TCP_HDR_LENGTH(otcph); + + /* + * Currently we only support LSO for TCP/IPv4. + */ + if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { + mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", + IPH_HDR_VERSION(oiph)); + goto fail; + } + + if (oiph->ipha_protocol != IPPROTO_TCP) { + mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", + oiph->ipha_protocol); + goto fail; + } + + if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { + mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); + goto fail; + } + + ohdrslen = oehlen + oiphlen + otcphlen; + if ((len = MBLKL(omp)) < ohdrslen) { + mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, + ohdrslen); + goto fail; + } + + /* + * Either we have data in the first mblk or it's just the + * header. In either case, we need to set rptr to the start of + * the TCP data. + */ + if (len > ohdrslen) { + odatamp = omp; + offset = ohdrslen; + } else { + ASSERT3U(len, ==, ohdrslen); + odatamp = omp->b_cont; + offset = 0; + } + + /* Make sure we still have enough data. */ + ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); + + /* + * If a MAC negotiated LSO then it must negotioate both + * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or + * HCKSUM_INET_PARTIAL; because both the IP and TCP headers + * change during LSO segmentation (only the 3 fields of the + * pseudo header checksum don't change: src, dst, proto). Thus + * we would expect these flags (HCK_IPV4_HDRCKSUM | + * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this + * function to emulate those checksums in software. However, + * that assumes a world where we only expose LSO if the + * underlying hardware exposes LSO. Moving forward the plan is + * to assume LSO in the upper layers and have MAC perform + * software LSO when the underlying provider doesn't support + * it. In such a world, if the provider doesn't support LSO + * but does support hardware checksum offload, then we could + * simply perform the segmentation and allow the hardware to + * calculate the checksums. To the hardware it's just another + * chain of non-LSO packets. + */ + ASSERT3S(DB_TYPE(omp), ==, M_DATA); + ocsum_flags = DB_CKSUMFLAGS(omp); + ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); + ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); + + /* + * If hardware only provides partial checksum then software + * must supply the pseudo-header checksum. In the case of LSO + * we leave the TCP length at zero to be filled in by + * hardware. This function must handle two scenarios. + * + * 1. Being called by a MAC client on the Rx path to segment + * an LSO packet and calculate the checksum. + * + * 2. Being called by a MAC provider to segment an LSO packet. + * In this case the LSO segmentation is performed in + * software (by this routine) but the MAC provider should + * still calculate the TCP/IP checksums in hardware. 
+ * + * To elaborate on the second case: we cannot have the + * scenario where IP sends LSO packets but the underlying HW + * doesn't support checksum offload -- because in that case + * TCP/IP would calculate the checksum in software (for the + * LSO packet) but then MAC would segment the packet and have + * to redo all the checksum work. So IP should never do LSO + * if HW doesn't support both IP and TCP checksum. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + ocsum_start = (uint32_t)DB_CKSUMSTART(omp); + ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); + } + + odatalen = opktlen - ohdrslen; + + /* + * Subtract one to account for the case where the data length + * is evenly divisble by the MSS. Add one to account for the + * fact that the division will always result in one less + * segment than needed. + */ + nsegs = ((odatalen - 1) / mss) + 1; + if (nsegs < 2) { + mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); + goto fail; + } + + DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, + __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, + nsegs); + + seg_chain = NULL; + tmptail = seg_chain; + oleft = odatalen; + + for (uint_t i = 0; i < nsegs; i++) { + boolean_t last_seg = ((i + 1) == nsegs); + uint32_t seg_len; + + /* + * If we fail to allocate, then drop the partially + * allocated chain as well as the LSO packet. Let the + * sender deal with the fallout. + */ + if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "failed to alloc segment header"); + goto fail; + } + ASSERT3P(nhdrmp->b_cont, ==, NULL); + + if (seg_chain == NULL) { + seg_chain = nhdrmp; } else { - sap = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); + ASSERT3P(tmptail, !=, NULL); + tmptail->b_next = nhdrmp; } - if (MBLKL(mp) <= offset) { - offset -= MBLKL(mp); - if (mp->b_cont == NULL) { - /* corrupted packet, skip it */ - if (prev != NULL) - prev->b_next = mp->b_next; - else - new_chain = mp->b_next; - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - continue; - } - mp = mp->b_cont; + tmptail = nhdrmp; + + /* + * Calculate this segment's lengh. It's either the MSS + * or whatever remains for the last segment. + */ + seg_len = last_seg ? oleft : mss; + ASSERT3U(seg_len, <=, mss); + ndatamp = build_data_seg(&odatamp, &offset, seg_len); + + if (ndatamp == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "LSO failed to segment data"); + goto fail; } - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = NULL; + /* Attach data mblk to header mblk. */ + nhdrmp->b_cont = ndatamp; + DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; + ASSERT3U(seg_len, <=, oleft); + oleft -= seg_len; + } - /* - * In order to compute the full and header - * checksums, we need to find and parse - * the IP and/or ULP headers. - */ + /* We should have consumed entire LSO msg. */ + ASSERT3S(oleft, ==, 0); + ASSERT3P(odatamp, ==, NULL); + + /* + * All seg data mblks are referenced by the header mblks, null + * out this pointer to catch any bad derefs. + */ + ndatamp = NULL; + + /* + * Set headers and checksum for first segment. 
+ */ + nhdrmp = seg_chain; + bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ip_id = ntohs(niph->ipha_ident); + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + tcp_seq = BE32_TO_U32(ntcph->th_seq); + tcp_seq += mss; - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + /* + * The first segment shouldn't: + * + * o indicate end of data transmission (FIN), + * o indicate immediate handling of the data (PUSH). + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + /* + * If the underlying HW provides partial checksum, then make + * sure to correct the pseudo header checksum before calling + * mac_sw_cksum(). The native TCP stack doesn't include the + * length field in the pseudo header when LSO is in play -- so + * we need to calculate it here. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = BE16_TO_U16(ntcph->th_sum); + otcp_sum = tcp_sum; + tcp_sum += mss + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + + /* + * We may have freed the nhdrmp argument during + * checksum emulation, make sure that seg_chain + * references a valid mblk. + */ + seg_chain = nhdrmp; + } + + ASSERT3P(nhdrmp, !=, NULL); + + seg = 1; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, + uint_t, seg); + seg++; + + /* There better be at least 2 segs. */ + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + + /* + * Now adjust the headers of the middle segments. For each + * header we need to adjust the following. + * + * o IP ID + * o IP length + * o TCP sequence + * o TCP flags + * o cksum flags + * o cksum values (if MAC_HWCKSUM_EMUL is set) + */ + for (; seg < nsegs; seg++) { + /* + * We use seg_chain as a reference to the first seg + * header mblk -- this first header is a template for + * the rest of the segments. This copy will include + * the now updated checksum values from the first + * header. We must reset these checksum values to + * their original to make sure we produce the correct + * value. + */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + tcp_seq += mss; + /* + * Just like the first segment, the middle segments + * shouldn't have these flags set. + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * IP header. 
+ * First and middle segs have same + * pseudo-header checksum. */ - if (sap != ETHERTYPE_IP) - continue; - - ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); - /* LINTED: improper alignment cast */ - ipha = (ipha_t *)(mp->b_rptr + offset); - - if (flags & HCK_FULLCKSUM) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * Pointer to checksum field in ULP header. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - switch (proto) { - case IPPROTO_TCP: - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - default: - cmn_err(CE_WARN, "mac_fix_cksum: " - "unexpected protocol: %d", proto); - continue; - } - - /* - * Pseudo-header checksum. - */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet needs - * to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - /* - * Flag the packet so that it appears - * that the checksum has already been - * verified by the hardware. - */ - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } + U16_TO_BE16(tcp_sum, ntcph->th_sum); + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + } - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + /* We may have freed the original nhdrmp. */ + prev_nhdrmp->b_next = nhdrmp; + } - } + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), + uint_t, mss, uint_t, seg); + + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + } + + /* Make sure we are on the last segment. */ + ASSERT3U(seg, ==, nsegs); + ASSERT3P(nhdrmp->b_next, ==, NULL); + + /* + * Now we set the last segment header. The difference being + * that FIN/PSH/RST flags are allowed. 
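+ * The data length here is whatever remains of the payload,
+ * odatalen - ((nsegs - 1) * mss) bytes, rather than a full
+ * MSS.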
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + len = msgsize(nhdrmp->b_cont); + ASSERT3S(len, >, 0); + niph->ipha_length = htons(oiphlen + otcphlen + len); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = otcp_sum; + tcp_sum += len + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + /* This should be the last mblk. */ + ASSERT3P(nhdrmp->b_next, ==, NULL); + nhdrmp = mac_sw_cksum(nhdrmp, emul); + prev_nhdrmp->b_next = nhdrmp; + } + + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, + uint_t, seg); + + /* + * Free the reference to the original LSO message as it is + * being replaced by seg_cahin. + */ + freemsg(omp); + *head = seg_chain; + *tail = nhdrmp; + *count = nsegs; + return; + +fail: + *head = NULL; + *tail = NULL; + *count = 0; +} + +#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) + +/* + * Emulate various hardware offload features in software. Take a chain + * of packets as input and emulate the hardware features specified in + * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' + * pointer given as input, and its tail pointer is written to + * '*otail'. The number of packets in the new chain is written to + * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus + * may be NULL. The 'mp_chain' argument may point to a NULL chain; in + * which case 'mp_chain' will simply stay a NULL chain. + * + * While unlikely, it is technically possible that this function could + * receive a non-NULL chain as input and return a NULL chain as output + * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be + * zero). This could happen if all the packets in the chain are + * dropped or if we fail to allocate new mblks. In this case, there is + * nothing for the caller to free. In any event, the caller shouldn't + * assume that '*mp_chain' is non-NULL on return. + * + * This function was written with two main use cases in mind. + * + * 1. A way for MAC clients to emulate hardware offloads when they + * can't directly handle LSO packets or packets without fully + * calculated checksums. + * + * 2. A way for MAC providers (drivers) to offer LSO even when the + * underlying HW can't or won't supply LSO offload. + * + * At the time of this writing no provider is making use of this + * function. However, the plan for the future is to always assume LSO + * is available and then add SW LSO emulation to all providers that + * don't support it in HW. 
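+ *
+ * A minimal usage sketch (a hypothetical caller, not code from
+ * this change): a client that cannot accept LSO packets, or
+ * packets without fully computed checksums, might do
+ *
+ *	uint_t cnt = 0;
+ *
+ *	mac_hw_emul(&mp_chain, NULL, &cnt,
+ *	    MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+ *
+ *	if (mp_chain == NULL)
+ *		return;
+ *
+ * after which mp_chain contains only non-LSO messages with
+ * fully computed checksums (or is NULL if everything was
+ * dropped).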
+ */ +void +mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) +{ + mblk_t *head = NULL, *tail = NULL; + uint_t count = 0; + + ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); + ASSERT3P(mp_chain, !=, NULL); + + for (mblk_t *mp = *mp_chain; mp != NULL; ) { + mblk_t *tmp, *next, *tmphead, *tmptail; + struct ether_header *ehp; + uint32_t flags; + uint_t len = MBLKL(mp), l2len; + + /* Perform LSO/cksum one message at a time. */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * For our sanity the first mblk should contain at + * least the full L2 header. + */ + if (len < sizeof (struct ether_header)) { + mac_drop_pkt(mp, "packet too short (A): %u", len); + mp = next; + continue; } - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) + l2len = sizeof (struct ether_vlan_header); + else + l2len = sizeof (struct ether_header); - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; + /* + * If the first mblk is solely the L2 header, then + * there better be more data. + */ + if (len < l2len || (len == l2len && mp->b_cont == NULL)) { + mac_drop_pkt(mp, "packet too short (C): %u", len); + mp = next; + continue; + } + + DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) + * because we don't want to mask-out the LSO flag. + */ + flags = DB_CKSUMFLAGS(mp); - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { + uint_t tmpcount = 0; /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. + * LSO fix-up handles checksum emulation + * inline (if requested). It also frees mp. */ - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; + mac_sw_lso(mp, emul, &tmphead, &tmptail, + &tmpcount); + count += tmpcount; + } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { + tmp = mac_sw_cksum(mp, emul); + tmphead = tmp; + tmptail = tmp; + count++; + } else { + /* There is nothing to emulate. */ + tmp = mp; + tmphead = tmp; + tmptail = tmp; + count++; } - (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, - value, flags, KM_NOSLEEP); + /* + * The tmp mblk chain is either the start of the new + * chain or added to the tail of the new chain. + */ + if (head == NULL) { + head = tmphead; + tail = tmptail; + } else { + /* Attach the new mblk to the end of the new chain. 
*/ + tail->b_next = tmphead; + tail = tmptail; + } + + mp = next; } - return (new_chain); + *mp_chain = head; + + if (otail != NULL) + *otail = tail; + + if (ocount != NULL) + *ocount = count; } /* @@ -320,7 +1287,6 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) mblk_t *hmp; struct ether_vlan_header *evhp; struct ether_header *ehp; - uint32_t start, stuff, end, value, flags; ASSERT(pri != 0 || vid != 0); @@ -350,9 +1316,7 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) * Free the original message if it's now empty. Link the * rest of messages to the header message. */ - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, - KM_NOSLEEP); + mac_hcksum_clone(mp, hmp); if (MBLKL(mp) == 0) { hmp->b_cont = mp->b_cont; freeb(mp); @@ -456,16 +1420,9 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp, boolean_t loopback) { - mblk_t *mp1 = mp; - - while (mp1 != NULL) { - mp1->b_prev = NULL; - mp1->b_queue = NULL; - mp1 = mp1->b_next; - } freemsgchain(mp); } diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index 950fab1272..fcea4a8f03 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -225,10 +225,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred) case M_NULL: case M_ZERO: case M_FULL: + /* standard devices */ + break; + case M_MEM: case M_KMEM: case M_ALLKMEM: - /* standard devices */ + /* + * These devices should never be visible in a zone, but if they + * somehow do get created we refuse to allow the zone to use + * them. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); break; default: diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". # mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, 
/* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) + * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. 
+ * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); + if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs to be mutex'ed - an SMP can call us while we're still + * running! 
+ */ + mutex_enter(&pdev->low_mutex); + ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled ); + mutex_exit(&pdev->low_mutex); + + if( !ne && handled ) + return DDI_INTR_CLAIMED; + if (ne) + nfp_log( NFP_DBG1, "nfp_isr: failed"); + else + nfp_log( NFP_DBG3, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + +static u_int nfp_soft_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + int rd, wr; + + nfp_log( NFP_DBG3, "nfp_soft_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + rd= wr= 0; + + mutex_enter(&pdev->high_mutex); + if(pdev->high_read) { + pdev->high_read= 0; + mutex_exit(&pdev->high_mutex); + rd= 1; + } + if(pdev->high_write) { + pdev->high_write= 0; + wr= 1; + } + mutex_exit(&pdev->high_mutex); + + if(rd) { + nfp_log( NFP_DBG3, "nfp_soft_isr: read done"); + nfp_read_complete_final(pdev, pdev->rd_ok); + } + if(wr) { + nfp_log( NFP_DBG3, "nfp_soft_isr: write done"); + nfp_write_complete_final(pdev, pdev->wr_ok); + } + if( rd || wr ) + return DDI_INTR_CLAIMED; + + nfp_log( NFP_DBG2, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + + +/*-------------------------*/ +/* nfp_read */ +/*-------------------------*/ + +void nfp_read_complete(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete: entering"); + + if(pdev->high_intr) { + nfp_log(NFP_DBG2, "nfp_read_complete: high_intr"); + mutex_enter(&pdev->high_mutex); + nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered"); + if(pdev->high_read) + nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!"); + pdev->high_read= 1; + pdev->rd_ok= ok; + nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex"); + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_read_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_read_complete: exiting"); +} + +static void nfp_read_complete_final(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete_final: entering"); + if(pdev->rdtimeout) + (void) untimeout(pdev->rdtimeout); + if(!pdev->rd_outstanding) { + nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding"); + } + nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok); + mutex_enter(&pdev->isr_mutex); + pdev->rd_outstanding= 0; + pdev->rd_ready= 1; + pdev->rd_ok= ok; + cv_broadcast(&pdev->rd_cv); + mutex_exit(&pdev->isr_mutex); + pollwakeup (&pdev->pollhead, POLLRDNORM); + nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting"); +} + +static void nfp_rdtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." ); + return; + } + pdev->rdtimeout= 0; + nfp_read_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_read: entered" ); + if (ddi_get_soft_state(state_head, getminor(dev)) != NULL) { + nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev"); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_read: about to physio." 
);
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_write */
+/*-------------------------*/
+
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+ if(pdev->high_intr) {
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_write)
+ nfp_log(NFP_DBG1, "nfp_write_complete: high_write already set!");
+ pdev->high_write= 1;
+ pdev->wr_ok= ok;
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_write_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+ struct buf *local_wr_bp;
+ nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+ if(pdev->wrtimeout)
+ (void) untimeout(pdev->wrtimeout);
+
+ if (!pdev->wr_bp) {
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+ return;
+ }
+
+ bp_mapout(pdev->wr_bp);
+ pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+ /* Make sure we set wr_ready before calling biodone to avoid a race */
+ pdev->wr_ready = 1;
+ bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+ local_wr_bp = pdev->wr_bp;
+ pdev->wr_bp = 0;
+ biodone(local_wr_bp);
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write completed");
+ pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+static void nfp_wrtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+ return;
+ }
+ pdev->wrtimeout= 0;
+ nfp_write_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_write: entered." );
+ if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+ nfp_log( NFP_DBG1, "nfp_write: unable to get nfp_dev.");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_strategy */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) \
+ nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+ (thebp)->b_resid = (thebp)->b_bcount; \
+ bioerror ((thebp), err); \
+ biodone ((thebp));
+
+static int nfp_strategy(struct buf *bp) {
+ register struct nfp_dev *pdev;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "nfp_strategy: entered."
); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin (bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) { + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_ioctl: entered." 
); + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev."); + return (ENXIO); + } + + switch (cmd) { + case NFDEV_IOCTL_ENQUIRY: + { + long *outp; + int outlen; + nfdev_enquiry_str enq_data; + + enq_data.busno = (unsigned int)-1; + enq_data.slotno = (unsigned char)-1; + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." ); + if( outlen > 0 ) { + enq_data.busno = ((*outp)>>16) & 0xff; + enq_data.slotno = ((*outp)>>11) & 0x1f; + nfp_log( NFP_DBG2, "busno %d, slotno %d.", + enq_data.busno, enq_data.slotno ); + } + } else + nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." ); + + if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + case NFDEV_IOCTL_ENSUREREADING: + { + unsigned int addr, len; + nfp_err ret; + if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." 
); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
); + return (error); +} + +/*-------------------------*/ +/* nfp_release */ +/*-------------------------*/ + +static int nfp_release_dev( dev_info_t *dip ) { + nfp_dev *pdev; + int instance, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_release_dev: entering" ); + + instance = ddi_get_instance(dip); + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if (pdev) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing device" ); + + nfp_free_pci_push(pdev); + + if( pdev->cmddev ) { + nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" ); + ret = pdev->cmddev->destroy(pdev->common.cmdctx); + if (ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed "); + return nfp_oserr( ret ); + } + } + + if(pdev->high_iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" ); + ddi_remove_softintr(pdev->soft_int_id); + ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + mutex_destroy( &pdev->high_mutex ); + } else if(pdev->iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" ); + ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + } + if(pdev->low_iblock_cookie) { + ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie); + mutex_destroy( &pdev->low_mutex); + } + + for(i=0;i<6;i++) { + if( pdev->common.extra[i] ) { + nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i ); + ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]); + } + } + + ddi_remove_minor_node(dip, NULL); + + if (pdev->conf_handle) + pci_config_teardown( &pdev->conf_handle ); + + ddi_soft_state_free(state_head, instance); + } + nfp_log( NFP_DBG2, "nfp_release: finished" ); + + return DDI_SUCCESS; +} + + +/*-------------------------*/ +/* nfp_attach */ +/*-------------------------*/ + +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + int instance; + nfp_dev *pdev = NULL; + int intres; + uint16_t device, vendor, sub_device, sub_vendor; + long *outp; + nfpcmd_dev const *cmddev; + int index, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_attach: entered." ); + + if (cmd != DDI_ATTACH) { + nfp_log( NFP_DBG1, "nfp_attach: bad command." ); + goto bailout; + } + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(state_head, instance) != 0) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." ); + goto bailout; + } + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_attach: cannot find dev."); + return ENODEV; + } + pdev->dip = dip; + + /* map in pci config registers */ + if (pci_config_setup(dip, &pdev->conf_handle)) { + nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." ); + goto bailout; + } + + /* find out what we have got */ + vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID ); + device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID ); + sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID ); + sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID ); + + index= 0; + while( (cmddev = nfp_drvlist[index++]) != NULL ) { + if( cmddev->vendorid == vendor && + cmddev->deviceid == device && + cmddev->sub_vendorid == sub_vendor && + cmddev->sub_deviceid == sub_device ) + break; + } + if( !cmddev ) { + nfp_log( NFP_DBG1, "nfp_attach: unknonw device." 
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." ); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." 
); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." ); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." ); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ + +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, 
DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + 
nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; + + nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); + + if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return -NFP_EINVAL; + } + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR ); + return NFP_ENXIO; + } + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }; + TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST ); + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + + +const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI", + PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, + PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21285_create, + i21285_destroy, + i21285_open, + i21285_close, + i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate, + i21285_ensure_reading, + 0, /* no debug */ +}; + diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_I21285_H +#define NFP_I21285_H + +#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif +#ifndef PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif + +#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 + +#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 + +#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x80 +#define MEMSIZE 0x100000 + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@ +/* + +i21555.c: nCipher PCI HSM intel 21555 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include 
"nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, ®32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + /* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ + +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: 
null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE16_IO(&tmp16,doorbell); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell ); + } + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + } + nfp_log( NFP_DBG3, "i21555_isr: exiting"); + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21555_write( const char *block, int len, void *ctx) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned short tmp16; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_write: NULL cdev"); + return NFP_ENODEV; + } + + cdev->stats.write_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_write: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return 
NFP_EIO; + } + TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; + + nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.read_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count; + cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += len; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[3]; + unsigned short tmp16; + unsigned int tmp32; + nfp_err ne; + int hdr_len; + + nfp_log( NFP_DBG2, "i21555_ensure_reading: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.ensure_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + if(addr) { + nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr); + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH); + TO_LE32_MEM(&hdr[1], len); + TO_LE32_MEM(&hdr[2], addr); + hdr_len= 12; + } else { + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + hdr_len= 8; + } + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written"); + return NFP_EIO; + } + TO_LE16_IO( &tmp16, 
NFAST_INT_HOST_READ_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.ensure_fail--; + cdev->stats.ensure++; + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif + +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 + +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF + +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" + +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. 
+ */
+typedef struct nfdev_enquiry_str {
+ unsigned int busno; /**< Which bus is the PCI device on. */
+ unsigned char slotno; /**< Which slot is the PCI device in. */
+ unsigned char reserved[3]; /**< for consistent struct alignment */
+} nfdev_enquiry_str;
+
+/**
+ * Result of the STATS ioctl.
+ */
+typedef struct nfdev_stats_str {
+ unsigned long isr; /**< Count interrupts. */
+ unsigned long isr_read; /**< Count read interrupts. */
+ unsigned long isr_write; /**< Count write interrupts. */
+ unsigned long write_fail; /**< Count write failures. */
+ unsigned long write_block; /**< Count blocks written. */
+ unsigned long write_byte; /**< Count bytes written. */
+ unsigned long read_fail; /**< Count read failures. */
+ unsigned long read_block; /**< Count blocks read. */
+ unsigned long read_byte; /**< Count bytes read. */
+ unsigned long ensure_fail; /**< Count read request failures. */
+ unsigned long ensure; /**< Count read requests. */
+} nfdev_stats_str;
+
+/**
+ * Input to the CONTROL ioctl.
+ */
+typedef struct nfdev_control_str {
+ unsigned control; /**< Control flags. */
+} nfdev_control_str;
+
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001
+
+/** Index of control bits indicating desired mode
+ *
+ * Desired mode follows the M_ModuleMode enumeration.
+ */
+#define NFDEV_CONTROL_MODE_SHIFT 1
+
+/** Detect a backwards-compatible control value
+ *
+ * Returns true if the requested control value "makes no difference", i.e.
+ * the failure of an attempt to set it is therefore uninteresting.
+ */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1)
+
+/**
+ * Result of the STATUS ioctl.
+ */
+typedef struct nfdev_status_str {
+ unsigned status; /**< Status flags. */
+ char error[8]; /**< Error string. */
+} nfdev_status_str;
+
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001
+
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002
+
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004
+
+/** HSM failed
+ *
+ * Consult error[] for additional information.
+ */
+#define NFDEV_STATUS_FAILED 0x0008
+
+/** Standard PCI interface. */
+#define NFDEV_IF_STANDARD 0x01
+
+/** PCI interface with results pushed from device
+ * via DMA.
+ */
+#define NFDEV_IF_PCI_PUSH 0x02
+
+/* platform independent base ioctl numbers */
+
+/** Enquiry ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY 0x01
+/** Channel Update ioctl.
+ * \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE 0x02
+/** Ensure Reading ioctl.
+ * Signal a read request to the device.
+ * \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03
+/** Device Count ioctl.
+ * Not implemented on all platforms.
+ * \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04
+/** Internal Debug ioctl.
+ * Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG 0x05
+/** PCI Interface Version ioctl.
+ * \param (int) Maximum PCI interface version
+ * supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06
+/** Statistics ioctl.
+ * \return nfdev_stats_str containing the attached device's transfer statistics.
*/ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. + +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" + +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif + +#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t; +#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw +#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl +#define PCI_CONFIG_GET16 pci_config_getw +#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t; +#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16 +#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32 +#define PCI_CONFIG_GET16 pci_config_get16 +#endif + +#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t; +#define EXTRA_CB_FLAGS 0 +#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap) +#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t; +#define EXTRA_CB_FLAGS D_64BIT +#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap) +#endif + +typedef struct nfp_dev { + int rd_ok; + int wr_ok; + + int ifvers; + + /* for PCI push read interface */ + unsigned char *read_buf; + ddi_dma_handle_t read_dma_handle; + ddi_dma_cookie_t read_dma_cookie; + + ddi_acc_handle_t acchandle; + + int rd_dma_ok; + + nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; + + struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; + + struct pollhead pollhead; + dev_info_t *dip; + + ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */ + ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */ + kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr; + ddi_softintr_t soft_int_id; + int high_read; + int high_write; + + ddi_iblock_cookie_t iblock_cookie; /* for mutex */ + kmutex_t isr_mutex; + + kmutex_t busy_mutex; + int busy; + + ddi_acc_handle_t conf_handle; + + nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; + +extern struct nfp_dev *nfp_dev_list[]; + +#endif /* NFP_H */ diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@ +/* + +nfp_cmd.h: nCipher PCI HSM command driver decalrations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFPCMD_H +#define NFPCMD_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* read and write called with userspace buffer */ + +typedef struct nfpcmd_dev { + const char *name; + unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid; + unsigned int bar_sizes[6]; /* includes IO bit */ + unsigned int flags; + nfp_err (*create)(struct nfp_cdev *pdev); + nfp_err (*destroy)(void * ctx); + nfp_err (*open)(void * ctx); + nfp_err (*close)(void * ctx); + nfp_err (*isr)(void *ctx, int *handled); + nfp_err (*write_block)( const char *ublock, int len, void *ctx ); + nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount); + nfp_err (*channel_update)( char *data, int len, void *ctx); + nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx ); + nfp_err (*debug)( int cmd, void *ctx); +} nfpcmd_dev; + +#define NFP_CMD_FLG_NEED_IOBUF 0x1 + +/* list of all supported 
drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 
0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@ +/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H + +#include "nfdev-common.h" + +struct nfp_dev; + +/* common device structure */ + +typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; + + int busno; + int slotno; + + void *cmdctx; + + char *iobuf; + + struct nfp_dev* dev; + + struct nfdev_stats_str stats; + +} nfp_cdev; + +/* callbacks from command drivers -------------------------------------- */ + +void nfp_read_complete( struct nfp_dev *pdev, int ok); +void nfp_write_complete( struct nfp_dev *pdev, int ok); + +#define NFP_READ_MAX (8 * 1024) +#define NFP_READBUF_SIZE (NFP_READ_MAX + 8) +#define NFP_TIMEOUT_SEC 10 + +#define NFP_DRVNAME "nCipher nFast PCI driver" + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* + * nfp_ifervs.c - common pci interface versioning + * + * uses: + * + * int pdev->ifvers + * device interface version + * + * int nfp_ifvers + * interface version limit + * + * int nfp_alloc_pci_push( nfp_dev *pdev ) + * allocates resources needed for PCI Push, + * if not already allocated, and return True if successful + * + * void nfp_free_pci_push( nfp_dev *pdev ) { + * frees any resources allocated to PCI Push + */ + +void nfp_set_ifvers( nfp_dev *pdev, int vers ) { + if( nfp_ifvers != 0 && vers > nfp_ifvers ) { + nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers); + return; + } + if( vers >= NFDEV_IF_PCI_PUSH ) { + if(!nfp_alloc_pci_push(pdev)) { + nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers); + return; + } + } else { + nfp_free_pci_push(pdev); + } + pdev->ifvers= vers; + nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers); +} diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@ +/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_OSIF_H +#define NFP_OSIF_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* general typedefs ----------------------------------------------- */ + +typedef volatile unsigned int reg32; +typedef volatile unsigned short reg16; +typedef volatile unsigned char reg8; + +/* sempaphores, mutexs and events --------------------------------- */ + +#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial); +extern void nfp_sema_destroy( nfp_sema *sema ); +extern void nfp_sema_post( nfp_sema *sema ); +extern void nfp_sema_wait( 
nfp_sema *sema ); +extern int nfp_sema_wait_sig( nfp_sema *sema ); + +extern nfp_err nfp_mutex_init( nfp_mutex *mutex ); +extern void nfp_mutex_destroy( nfp_mutex *mutex ); +extern void nfp_mutex_enter( nfp_mutex *mutex ); +extern void nfp_mutex_exit( nfp_mutex *mutex ); + +extern nfp_err nfp_event_init( nfp_event *event ); +extern void nfp_event_destroy( nfp_event *event ); +extern void nfp_event_set( nfp_event *event ); +extern void nfp_event_clear( nfp_event *event ); +extern void nfp_event_wait( nfp_event *event ); +extern void nfp_event_wait_sig( nfp_event *event ); + +#endif + +/* timeouts ------------------------------------------------------ */ + +extern void nfp_sleep( int ms ); + +/* memory handling ----------------------------------------------- */ + +#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 + +extern void *nfp_kmalloc( int size, int flags ); +extern void *nfp_krealloc( void *ptr, int size, int flags ); +extern void nfp_kfree( void * ); + +/* config space access ------------------------------------------------ */ + +/* return Little Endian 32 bit config register */ +extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); + +/* io space access ------------------------------------------------ */ + +extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ); +extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ); +extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ); +extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); + +/* user and device memory space access ---------------------------- */ + +/* NB these 2 functions are not guarenteed to be re-entrant for a given device */ +extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len); +extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); + +extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len ); +extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); + +extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len ); +extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); + +/* debug ------------------------------------------------------------ */ + +#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 + +#ifdef STRANGE_VARARGS +extern void nfp_log(); +#else +extern void nfp_log( int severity, const char *format, ...); +#endif + +extern int nfp_debug; + +#endif diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +* +* 1998.06.09 IH Started +* +* The interface presented by nFast PCI devices consists of: +* +* A region of shared RAM used for data transfer & control information +* A doorbell interrupt register, so both sides can give each other interrupts +* A number of DMA channels for transferring data +*/ + +#ifndef NFPCI_H +#define NFPCI_H + +/* Sizes of some regions */ +#define NFPCI_RAM_MINSIZE 0x00100000 +/* This is the minimum size of shared RAM. 
In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */ +#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */ +#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ + +/* Offsets within shared memory space. + The following main regions are: + jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ + +#define NFPCI_OFFSET_JOBS 0x00000000 +#define NFPCI_OFFSET_JOBS_WR 0x00000000 +#define NFPCI_OFFSET_JOBS_RD 0x00010000 +#define NFPCI_OFFSET_KERN 0x00020000 +#define NFPCI_OFFSET_KERN_WR 0x00020000 +#define NFPCI_OFFSET_KERN_RD 0x00030000 + +/* Interrupts, defined by bit position in doorbell register */ + +/* Interrupts from device to host */ +#define NFAST_INT_DEVICE_WRITE_OK 0x00000001 +#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002 +#define NFAST_INT_DEVICE_READ_OK 0x00000004 +#define NFAST_INT_DEVICE_READ_FAILED 0x00000008 +#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010 +#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020 +#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040 +#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 + +/* Interrupts from host to device */ +#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000 +#define NFAST_INT_HOST_READ_REQUEST 0x00020000 +#define NFAST_INT_HOST_DEBUG 0x00040000 +#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000 +#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 + +/* Ordinary job submission ------------------------ */ + +/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined + by the following (byte) address offsets... */ + +#define NFPCI_OFFSET_CONTROL 0x0 +#define NFPCI_OFFSET_LENGTH 0x4 +#define NFPCI_OFFSET_DATA 0x8 +#define NFPCI_OFFSET_PUSH_ADDR 0x8 + +#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) + +#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) + +/* Kernel inferface job submission ---------------- */ + +#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) + +#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) + +#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{ + UINT32 controlword; + UINT32 length; /* length of data to follow */ + union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + + +#define NFPCI_JOB_CONTROL 0x00000001 +#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002 +/* + The 'Control' word is analogous to the SCSI read/write 
address; + 1 = standard push/pull IO + 2 = push/push IO + + To submit a block of job data, the host: + - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data + - copies the data to NFPCI_JOBS_WR_DATA + - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register + - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back + + To read a block of jobs back, the host: + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at + NFPCI_JOBS_RD_LENGTH to its actual length. + + Optionally the host can request the PCI read data to be pushed to host PCI mapped ram: + - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max + size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8 + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of + the buffer + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The + module will set NFPCI_OFFSET_LENGTH to the actual length. +*/ + +#define NFPCI_SCRATCH_CONTROL 0 + +#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0) +#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1 +#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) + +#define NFPCI_SCRATCH_STATUS 1 + +#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0) +#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1) +#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2) +#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) + +#define NFPCI_SCRATCH_ERROR_LO 2 +#define NFPCI_SCRATCH_ERROR_HI 3 + +#endif diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" + +/* config space access ---------------------------------- */ + +nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) { + unsigned int tmp32; + if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; + +/* pci_config_get32() does byte swapping, so put back to LE */ + tmp32 = pci_config_get32( pdev->dev->conf_handle, offset ); + TO_LE32_IO(res, tmp32); + + return NFP_SUCCESS; +} + +/* user space memory access ---------------------------------- */ + +nfp_err 
nfp_copy_from_user( char *kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} + +nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} + +nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */ + return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */ + return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR); + else + /* LINTED: alignment */ + DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR); + return NFP_SUCCESS; +} + +nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR ); + else + /* LINTED: alignment */ + DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR ); + return NFP_SUCCESS; +} + +/* pci io space access --------------------------------------- */ + +unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} + +unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} + +void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} + +void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} + +/* logging ---------------------------------------------------- */ + +void nfp_log( int level, const char *fmt, ...) +{ + auto char buf[256]; + va_list ap; + + switch (level) { + case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/ + case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/ + case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/ + case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/ + default: + va_start(ap, fmt); + (void) vsnprintf(buf, 256, fmt, ap); + va_end(ap); + cmn_err(CE_CONT, "!" 
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} + +struct errstr { + int oserr; + nfp_err nferr; +}; + + +static struct errstr errtab[] = { + { EFAULT, NFP_EFAULT }, + { ENOMEM, NFP_ENOMEM }, + { EINVAL, NFP_EINVAL }, + { EIO, NFP_EIO }, + { ENXIO, NFP_ENXIO }, + { ENODEV, NFP_ENODEV }, + { EINVAL, NFP_EUNKNOWN }, + { 0, 0 } +}; + +nfp_err nfp_error( int oserr ) +{ + struct errstr *perr; + if(!oserr) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + } + return NFP_EUNKNOWN; +} + +int nfp_oserr( nfp_err nferr ) +{ + struct errstr *perr; + if(nferr == NFP_SUCCESS) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + } + return EIO; +} diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..2ad3f4f591 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay Devices + * + * Overlay devices provide a means for creating overlay networks, a means of + * multiplexing multiple logical, isolated, and discrete layer two and layer + * three networks on top of one physical network. + * + * In general, these overlay devices encapsulate the logic to answer two + * different questions: + * + * 1) How should I transform a packet to put it on the wire? + * 2) Where should I send a transformed packet? + * + * Each overlay device is presented to the user as a GLDv3 device. While the + * link itself cannot have an IP interface created on top of it, it allows for + * additional GLDv3 devices, such as a VNIC, to be created on top of it which + * can be plumbed up with IP interfaces. + * + * + * -------------------- + * General Architecture + * -------------------- + * + * The logical overlay device that a user sees in dladm(1M) is a combination of + * two different components that work together. The first component is this + * kernel module, which is responsible for answering question one -- how should + * I transform a packet to put it on the wire. + * + * The second component is what we call the virtual ARP daemon, or varpd. It is + * a userland component that is responsible for answering the second question -- + * Where should I send a transformed packet. Instances of the kernel overlay + * GLDv3 device ask varpd the question of where should a packet go. + * + * The split was done for a few reasons. Importantly, we wanted to keep the act + * of generating encapsulated packets in the kernel so as to ensure that the + * general data path was fast and also kept simple. On the flip side, while the + * question of where should something go may be simple, it may often be + * complicated and need to interface with several different external or + * distributed systems. In those cases, it's simpler to allow for the full + * flexibility of userland to be brought to bear to solve that problem and in + * general, the path isn't very common. 
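To make the division concrete, the sketch below condenses the transmit path that overlay_m_tx() implements later in this file: question two is answered by the target cache (and, on a miss, varpd), and question one by the encapsulation plugin's encap entry point. This is an illustrative condensation only, not the driver's actual function; the real code loops over a message chain, holds odd_lock, and accounts for in-progress I/O.

/*
 * Condensed sketch of the two questions on the transmit side; see
 * overlay_m_tx() below for the real logic (locking, chains, accounting).
 */
static int
overlay_tx_sketch(overlay_dev_t *odd, mblk_t *mp)
{
	struct sockaddr_storage storage;
	socklen_t slen;
	ovep_encap_info_t einfo;
	struct msghdr hdr;
	mblk_t *ep = NULL;

	bzero(&hdr, sizeof (hdr));
	bzero(&einfo, sizeof (einfo));
	einfo.ovdi_id = odd->odd_vid;

	/* Question two: where should this packet go? */
	if (overlay_target_lookup(odd, mp, (struct sockaddr *)&storage,
	    &slen) != OVERLAY_TARGET_OK)
		return (-1);	/* queued for varpd or dropped; see overlay_m_tx() */

	/* Question one: how should it be transformed for the wire? */
	if (odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, &einfo,
	    &ep) != 0 || ep == NULL) {
		freemsg(mp);
		return (-1);
	}

	/* Ship the encapsulated chain out the shared mux's ksocket. */
	hdr.msg_name = &storage;
	hdr.msg_namelen = slen;
	return (overlay_mux_tx(odd->odd_mux, &hdr, ep));
}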
+ * + * The following is what makes up the logical overlay device that a user would + * create with dladm(1M). + * + * Kernel Userland + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . +--------+ +--------+ +--------+ . . . + * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . + * . +--------+ +--------+ +--------+ . . . + * . | | | . . . + * . | | | . . . + * . +------------+-----------+ . . . + * . | . . /dev/overlay . + * . +--------------+ . . . +------------+ . + * . | | . . . | | . + * . | Overlay |======*=================| Virtual | . + * . | GLDv3 Device |========================| ARP Daemon | . + * . | | . . | | . + * . +--------------+ . . +------------+ . + * . | . . | . + * . | . . | . + * . +----------------+ . . +--------+ . + * . | Overlay | . . | varpd | . + * . | Encapsulation | . . | Lookup | . + * . | Plugin | . . | Plugin | . + * . +----------------+ . . +--------+ . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * + * + * This image shows the two different components and where they live. + * Importantly, it also shows that both the kernel overlay device and the + * userland varpd both support plugins. The plugins actually implement the + * things that users care about and the APIs have been designed to try to + * minimize the amount of things that a module writer needs to worry about it. + * + * IDENTIFIERS + * + * Every overlay device is defined by a unique identifier which is the overlay + * identifier. Its purpose is similar to that of a VLAN identifier, it's a + * unique number that is used to differentiate between different entries on the + * wire. + * + * ENCAPSULATION + * + * An overlay encapsulation plugin is a kernel miscellaneous module whose + * purpose is to contain knowledge about how to transform packets to put them + * onto the wire and to take them off. An example of an encapsulation plugin is + * vxlan. It's also how support for things like nvgre or geneve would be brought + * into the system. + * + * Each encapsulation plugins defines a series of operation vectors and + * properties. For the full details on everything they should provide, please + * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible + * for telling the system what information is required to send a packet. For + * example, vxlan is defined to send everything over a UDP packet and therefore + * requires a port and an IP address, while nvgre on the other hand is its own + * IP type and therefore just requires an IP address. In addition, it also + * provides information about the kind of socket that should be created. This is + * used by the kernel multiplexor, more of that in the Kernel Components + * section. + * + * LOOKUPS + * + * The kernel communicates requests for lookups over the character device + * /dev/overlay. varpd is responsible for listening for requests on that device + * and answering them. The character device is specific to the target path and + * varpd. + * + * Much as the kernel overlay module handles the bulk of the scaffolding but + * leaves the important work to the encapsulation plugin, varpd provides a + * similar role and leaves the full brunt of lookups to a userland dynamic + * shared object which implements the logic of lookups. + * + * Each lookup plugin defines a series of operation vectors and properties. For + * the full details on everything that they should provide, please read + * lib/varpd/libvarpd/libvarpd_provider.h. 
Essentially, they are given a MAC + * address and asked to give an address on the physical network that it should + * be sent to. In addition, they handle questions related to how to handle + * things like broadcast and multicast traffic, etc. + * + * ---------- + * Properties + * ---------- + * + * A device from a dladm perspective has a unique set of properties that are + * combined from three different sources: + * + * 1) Generic properties that every overlay device has + * 2) Properties that are specific to the encapsulation plugin + * 3) Properties that are specific to the lookup plugin + * + * All of these are exposed in a single set of properties in dladm. Note that + * these are not necessarily traditional link properties. However, if something + * is both a traditional GLDv3 link property, say the MTU of a device, and a + * specific property here, than the driver ensures that all existing GLDv3 + * specific means of manipulating it are used and wraps up its private property + * interfaces to ensure that works. + * + * Properties in the second and third category are prefixed with the name of + * their module. For example, the vxlan encapsulation module has a property + * called the 'listen_ip'. This property would show up in dladm as + * 'vxlan/listen_ip'. This allows different plugins to both use similar names + * for similar properties and to also have independent name spaces so that + * overlapping names do not conflict with anything else. + * + * While the kernel combines both sets one and two into a single coherent view, + * it does not do anything with respect to the properties that are owned by the + * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in + * charge of bridging these two worlds into one magical experience for the user. + * It carries the burden of knowing about both overlay specific and varpd + * specific properties. Importantly, we want to maintain this distinction. We + * don't want to treat the kernel as an arbitrary key/value store for varpd and + * we want the kernel to own its own data and not have to ask userland for + * information that it owns. + * + * Every property in the system has the following attributes: + * + * o A name + * o A type + * o A size + * o Permissions + * o Default value + * o Valid value ranges + * o A value + * + * Everything except for the value is obtained by callers through the propinfo + * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, + * currently 256 bytes. + * + * The following are the supported types of properties: + * + * OVERLAY_PROP_T_INT + * + * A signed integer, its length is 8 bytes, corresponding to a + * int64_t. + * + * OVERLAY_PROP_T_UINT + * + * An unsigned integer, its length is 8 bytes, corresponding to a + * uint64_t. + * + * OVERLAY_PROP_T_IP + * + * A struct in6_addr, it has a fixed size. + * + * OVERLAY_PROP_T_STRING + * + * A null-terminated character string encoded in either ASCII or + * UTF-8. Note that the size of the string includes the null + * terminator. + * + * The next thing that we apply to a property is its permission. The permissions + * are put together by the bitwise or of the following flags and values. + * + * OVERLAY_PROP_PERM_REQ + * + * This indicates a required property. A property that is required + * must be set by a consumer before the device can be created. If a + * required property has a default property, this constraint is + * loosened because the default property defines the value. 
+ * + * OVERLAY_PROP_PERM_READ + * + * This indicates that a property can be read. All properties will + * have this value set. + * + * OVERLAY_PROP_PERM_WRITE + * + * This indicates that a property can be written to and thus + * updated by userland. Properties that are only intended to + * display information will not have OVERLAY_PROP_PERM_WRITE set. + * + * In addition, a few additional values are defined as a convenience to + * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of + * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, + * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, + * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a + * property should generally be a constant across its lifetime. + * + * A property may optionally have a default value. If it does have a default + * value, and that property is not set to be a different value, then the default + * value is inherited automatically. It also means that if the default value is + * acceptable, there is no need to set the value for a required property. For + * example, the vxlan module has the vxlan/listen_port property which is + * required, but has a default value of 4789 (the IANA assigned port). Because + * of that default value, there is no need for it to be set. + * + * Finally, a property may declare a list of valid values. These valid values + * are used for display purposes; they are not enforced by the broader system, + * but merely allow a means for the information to be communicated to the user + * through dladm(1M). Like a default value, this is optional. + * + * The general scaffolding does not do very much with respect to the getting and + * setting of properties. That is really owned by the individual plugins + * themselves. + * + * ----------------------------- + * Destinations and Plugin Types + * ----------------------------- + * + * Both encapsulation and lookup plugins define the kinds of destinations that + * they know how to support. There are three different pieces of information + * that can currently be used to address a destination, all of which are + * summarized in the type overlay_point_t. Any combination of these is + * supported. + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * An Ethernet MAC address is required. + * + * OVERLAY_PLUGIN_D_IP + * + * An IP address is required. All IP addresses used by the overlay + * system are transmitted as IPv6 addresses. IPv4 addresses can be + * represented by using IPv4-mapped IPv6 addresses. + * + * OVERLAY_PLUGIN_D_PORT + * + * A TCP/UDP port is required. + * + * A kernel encapsulation plugin declares which of these it requires; it's + * a static set. On the other hand, a userland lookup plugin can be built to + * support all of these or any combination thereof. It gets passed the required + * destination type, based on the kernel encapsulation method, and then it makes + * the determination as to whether or not it supports it. For example, the + * direct plugin can support either an IP or both an IP and a port; it simply + * doesn't display the direct/dest_port property in the cases where a port is + * not required to support this. + * + * The user lookup plugins have two different modes of operation which + * determine how they interact with the broader system and how lookups are + * performed. These types are: + * + * OVERLAY_TARGET_POINT + * + * A point to point plugin has a single static definition for where + * to send all traffic.
Every packet in the system always gets sent + * to the exact same destination which is programmed into the + * kernel when the general device is activated. + * + * OVERLAY_TARGET_DYNAMIC + * + * A dynamic plugin does not have a single static definition. + * Instead, for each destination, the kernel makes an asynchronous + * request to varpd to determine where the packet should be routed, + * and if a specific destination is found, then that destination is + * cached in the overlay device's target cache. + * + * This distinction, while important for the general overlay device's operation, + * is not important to the encapsulation plugins. They don't need to know about + * any of these pieces. It's just a concern for varpd, the userland plugin, and + * the general overlay scaffolding. + * + * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not + * maintain a target cache, and instead just keeps track of the destination and + * always sends encapsulated packets to that address. When the target type is of + * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such + * destinations. These destinations are kept around in an instance of a + * reference hash that is specific to the given overlay device. Entries in the + * cache can be invalidated and replaced by varpd and its lookup plugins. + * + * ---------------------------------- + * Kernel Components and Architecture + * ---------------------------------- + * + * There are multiple pieces inside the kernel that work together, there is the + * general overlay_dev_t structure, which is the logical GLDv3 device, but it + * itself has references to things like an instance of an encapsulation plugin, + * a pointer to a mux and a target cache. It can roughly be summarized in the + * following image: + * + * +------------------+ + * | global | + * | overlay list | + * | overlay_dev_list | + * +------------------+ + * | + * | +-----------------------+ +---------------+ + * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... 
+ * | overlay_dev_t | | overlay_dev_t | + * | | +---------------+ + * | | + * | mac_handle_t -----+---> GLDv3 handle to MAC + * | datalink_id_t -----+---> Datalink ID used by DLS + * | overlay_dev_flag_t ---+---> Device state + * | uint_t -----+---> Curent device MTU + * | uint_t -----+---> In-progress RX operations + * | uint_t -----+---> In-progress TX operations + * | char[] -----+---> FMA degraded message + * | void * -----+---> plugin private data + * | overlay_target_t * ---+---------------------+ + * | overlay_plugin_t * ---+---------+ | + * +-----------------------+ | | + * ^ | | + * +--------------------+ | | | + * | Kernel Socket | | | | + * | Multiplexor | | | | + * | overlay_mux_t | | | | + * | | | | | + * | avl_tree_t -+--+ | | + * | uint_t -+--> socket family | | + * | uint_t -+--> socket type | | + * | uint_t -+--> socket protocol | | + * | ksocket_t -+--> I/O socket | | + * | struct sockaddr * -+--> ksocket address | | + * | overlay_plugin_t --+--------+ | | + * +--------------------+ | | | + * | | | + * +-------------------------+ | | | + * | Encap Plugin |<--+-----------+ | + * | overlay_plugin_t | | + * | | | + * | char * ---+--> plugin name | + * | overlay_plugin_ops_t * -+--> plugin downcalls | + * | char ** (props) ---+--> property list | + * | uint_t ---+--> id length | + * | overlay_plugin_flags_t -+--> plugin flags | + * | overlay_plugin_dest_t --+--> destination type v + * +-------------------------+ +-------------------------+ + * | Target Cache | + * | overlay_target_t | + * | | + * cache mode <--+- overlay_target_mode_t | + * dest type <--+- overlay_plugin_dest_t | + * cache flags <--+- overlay_target_flag_t | + * varpd id <--+- uint64_t | + * outstanding varpd reqs. <--+- uint_t | + * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | + * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | + * | +-------------------------+ + * +-----------------------+ + * | + * v + * +-------------------------------+ +------------------------+ + * | Target Entry |-->| Target Entry |--> ... + * | overlay_target_entry_t | | overlay_target_entry_t | + * | | +------------------------+ + * | | + * | overlay_target_entry_flags_t -+--> Entry flags + * | uint8_t[ETHERADDRL] ---+--> Target MAC address + * | overlay_target_point_t ---+--> Target underlay address + * | mblk_t * ---+--> outstanding mblk head + * | mblk_t * ---+--> outstanding mblk tail + * | size_t ---+--> outstanding mblk size + * +-------------------------------+ + * + * The primary entries that we care about are the overlay_dev_t, which + * correspond to each overlay device that is created with dladm(1M). Globally, + * these devices are maintained in a simple list_t which is protected with a + * lock. Hence, these include important information such as the mac_handle_t + * and a datalink_id_t which is used to interact with the broader MAC and DLS + * ecosystem. We also maintain additional information such as the current state, + * outstanding operations, the mtu, and importantly, the plugin's private data. + * This is the instance of an encapsulation plugin that gets created as part of + * creating an overlay device. Another aspect of this is that the overlay_dev_t + * also includes information with respect to FMA. For more information, see the + * FMA section. + * + * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin + * is the encapsulation plugin. This allows the device to make downcalls into it + * based on doing things like getting and setting properties. 
Otherwise, the + * plugin itself is a fairly straightforward entity. They are maintained in an + * (not pictured above) list. The plugins themselves mostly maintain things like + * the static list of properties, what kind of destination they require, and the + * operations vector. A given module may contain more if necessary. + * + * The next piece of the puzzle is the mux, or a multiplexor. The mux itself + * maintains a ksocket and it is through the mux that we send and receive + * message blocks. The mux represents a socket type and address, as well as a + * plugin. Multiple overlay_dev_t devices may then share the same mux. For + * example, consider the case where you have different instances of vxlan all on + * the same underlay network. These would all logically share the same IP + * address and port that packets are sent and received on; however, what differs + * is the decapsulation ID. + * + * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike + * a socket, we enable a direct callback on the ksocket. This means that + * whenever a message block chain is received, rather than it being queued and + * later handed back to us from a taskq context, the data comes directly into + * the callback function overlay_mux_recv(). + * + * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx + * function) to transmit. It receives encapsulated packets, decapsulates them to + * determine the overlay identifier, looks up the given device that matches that + * identifier, and then causes the broader MAC world to receive the packet with + * a call to mac_rx(). + * + * Today, we don't do too much that's special with the ksocket; however, as + * hardware is gaining understanding of these encapsulation protocols, we'll + * probably want to think of better ways to get those capabilities passed down + * and potentially better ways to program receive filters so they get directly + * to us. Though, that's all fantasy future land. + * + * The next part of the puzzle is the target cache. The purpose of the target + * cache is to cache where we should send a packet on the underlay network, + * given its mac address. The target cache operates in two modes depending on + * whether the lookup module was declared to be OVERLAY_TARGET_POINT or + * OVERLAY_TARGET_DYNAMIC. + * + * In the case where the target cache has been programmed to be + * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t + * which has the destination that we send everything to, no matter the + * destination mac address. + * + * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things + * are much more interesting and as a result, more complicated. We primarily + * store lists of overlay_target_entry_t's which are stored in both an avl tree + * and a refhash_t. The primary lookup path uses the refhash_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we + * can get a consistent iteration order for things like dladm show-overlay -t. + * The key that we use for the reference hashtable is based on the mac address + * in the cache and currently we just do a simple CRC32 to transform it into a + * hash. + * + * Each entry maintains a set of flags to indicate the current status of the + * request.
The flags may indicate one of three states: that current cache entry + * is valid, that the current cache entry has been directed to drop all output, + * and that the current cache entry is invalid and may be being looked up. In + * the case where it's valid, we just take the destination address and run with + * it. + * + * If it's invalid and a lookup has not been made, then we start the process + * that prepares a query that will make its way up to varpd. The cache entry + * entry maintains a message block chain of outstanding message blocks and a + * size. These lists are populated only when we don't know the answer as to + * where should these be sent. The size entry is used to cap the amount of + * outstanding data that we don't know the answer to. If we exceed a cap on the + * amount of outstanding data (currently 1 Mb), then we'll drop any additional + * packets. Once we get an answer indicating a valid destination, we transmit + * any outstanding data to that place. For the full story on how we look that up + * will be discussed in the section on the Target Cache Lifecycle. + * + * ------------------------ + * FMA and Degraded Devices + * ------------------------ + * + * Every kernel overlay device keeps track of its FMA state. Today in FMA we + * cannot represent partitions between resources nor can we represent that a + * given minor node of a psuedo device has failed -- if we degrade the overlay + * device, then the entire dev_info_t is degraded. However, we still want to be + * able to indicate to administrators that things may go wrong. + * + * To this end, we've added a notion of a degraded state to every overlay + * device. This state is primarily dictated by userland and it can happen for + * various reasons. Generally, because a userland lookup plugin has been + * partitioned, or something has gone wrong such that there is no longer any + * userland lookup module for a device, then we'll mark it degraded. + * + * As long as any of our minor instances is degraded, then we'll fire off the + * FMA event to note that. Once the last degraded instance is no longer + * degraded, then we'll end up telling FMA that we're all clean. + * + * To help administrators get a better sense of which of the various minor + * devices is wrong, we store the odd_fmamsg[] character array. This character + * array can be fetched with doing a dladm show-overlay -f. + * + * Note, that it's important that we do not update the link status of the + * devices. We want to remain up as much as possible. By changing the link in a + * degraded state, this may end up making things worse. We may still actually + * have information in the target cache and if we mark the link down, that'll + * result in not being able to use it. The reason being that this'll mark all + * the downstream VNICs down which will go to IP and from there we end up + * dealing with sadness. + * + * ----------------------- + * Target Cache Life Cycle + * ----------------------- + * + * This section only applies when we have a lookup plugin of + * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type + * OVERLAY_TARGET_POINT. + * + * While we got into the target cache in the general architecture section, it's + * worth going into more details as to how this actually works and showing some + * examples and state machines. Recall that a target cache entry basically has + * the following state transition diagram: + * + * Initial state + * . . . . . . first access . . . varpd lookup enqueued + * . . . + * . . . + * +-------+ . 
+----------+ . + * | No |------*---->| Invalid |-------*----+ + * | Entry | | Entry | | + * +-------+ +----------+ | + * varpd ^ ^ varpd | + * invalidate | | drop | + * . . . * * . . v + * +-------+ | | +---------+ + * | Entry |--->-----+ +----<----| Entry | + * | Valid |<----------*---------<----| Pending |->-+ varpd + * +-------+ . +---------+ * . . drop, but + * . varpd ^ | other queued + * . success | | entries + * +-----+ + * + * When the table is first created, it is empty. As we attempt to lookup entries + * and we find there is no entry at all, we'll create a new table entry for it. + * At that point the entry is technically in an invalid state, that means that + * we have no valid data from varpd. In that case, we'll go ahead and queue the + * packet into the entry's pending chain, and queue a varpd lookup, setting the + * OVERLAY_ENTRY_F_PENDING flag in the progress. + * + * If additional mblk_t's come in for this entry, we end up appending them to + * the tail of the chain, if and only if, we don't exceed the threshold for the + * amount of space they can take up. An entry remains pending until we get a + * varpd reply. If varpd replies with a valid results, we move to the valid + * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one + * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. + * + * Once an entry is valid, it stays valid until user land tells us to invalidate + * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOE and + * OVERLAY_TARG_CACHE_SET respectively. + * + * If the lookup fails with a call to drop the packet, then the next state is + * determined by the state of the queue. If the set of outstanding entries is + * empty, then we just transition back to the invalid state. If instead, the + * set of outstanding entries is not empty, then we'll queue another entry and + * stay in the same state, repeating this until the number of requests is + * drained. + * + * The following images describes the flow of a given lookup and where the + * overlay_target_entry_t is at any given time. + * + * +-------------------+ + * | Invalid Entry | An entry starts off as an invalid entry + * | de:ad:be:ef:00:00 | and only exists in the target cache. + * +-------------------+ + * + * ~~~~ + * + * +---------------------+ + * | Global list_t | A mblk_t comes in for an entry. We + * | overlay_target_list | append it to the overlay_target_list. + * +---------------------+ + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry |--->... + * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +--------------------------+ + * | /dev/overlay minor state | User land said that it would look up an + * | overlay_target_hdl_t | entry for us. We remove it from the + * +--------------------------+ global list and add it to the handle's + * | outstanding list. + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry | + * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +-------------------+ + * | Valid Entry | varpd returned an answer with + * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache + * | 10.169.23.42:4789 | entry is now populated with a + * +-------------------+ destination and marked as valid + * + * + * The lookup mechanism is performed via a series of operations on the character + * psuedo-device /dev/overlay. 
The only thing that uses this device is the + * userland daemon varpd. /dev/overlay is a cloneable device, each open of it + * granting a new minor number which maintains its own state. We maintain this + * state so that way if an outstanding lookup was queued to something that + * crashed or closed its handle without responding, we can know about this and + * thus handle it appropriately. + * + * When a lookup is first created it's added to our global list of outstanding + * lookups. To service requests, userland is required to perform an ioctl to ask + * for a request. We will block it in the kernel a set amount of time waiting + * for a request. When we give a request to a given minor instance of the + * device, we remove it from the global list and append the request to the + * device's list of outstanding entries, for the reasons we discussed above. + * When a lookup comes in, we give user land a smaller amount of information + * specific to that packet, the overlay_targ_lookup_t. It includes a request id + * to identify this, and then the overlay id, the varpd id, the header and + * packet size, the source and destination mac address, the SAP, and any + * potential VLAN header. + * + * At that point, it stays in that outstanding list until one of two ioctls are + * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, + * userland may also perform other operations. For example, it may use + * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth + * analysis of what to do beyond what we gave it initially. This is useful for + * providing proxy arp and the like. Finally, there are two other ioctls that + * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the + * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which + * causes us to encapsulate and send out the packet they've given us. + * + * + * Finally, through the target cache, several ioctls are provided to allow for + * interrogation and management of the cache. They allow for individual entries + * to be retrieved, set, or have the entire table flushed. For the full set of + * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. + * + * ------------------ + * Sample Packet Flow + * ------------------ + * + * There's a lot of pieces here, hopefully an example of how this all fits + * together will help clarify and elucidate what's going on. We're going to + * first track an outgoing packet, eg. one that is sent from an IP interface on + * a VNIC on top of an overlay device, and then we'll look at what it means to + * respond to that. + * + * + * +----------------+ +--------------+ +------------------+ + * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | + * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | + * +----------------+ | VNIC device | | overlay_m_tx() | + * +--------------+ +------------------+ + * | + * . lookup . cache | + * . drop . miss v + * +---------+ . +--------+ . +------------------+ + * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | + * | mblk_t | | lookup | | in the target | + * +---------+ | queued | | cache | + * ^ +--------+ +------------------+ + * on send | | | cache + * error . . * *. . lookup * . . 
hit + * | | success v + * | | +------------------+ + * +-----------------+ +--------------->| call plugin | + * | Send out | | ovpo_encap() to | + * | overlay_mux_t's |<----------------------------------| get encap mblk_t | + * | ksocket | +------------------+ + * +-----------------+ + * + * The receive end point looks a little different and looks more like: + * + * +------------------+ +----------------+ +-----------+ + * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ + * | the physical | | IP stack | | to | * . . direct + * | device | +----------------+ | ksocket | | callback + * +------------------+ +-----------+ | + * . overlay id | + * . not found v + * +-----------+ . +-----------------+ +--------------------+ + * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | + * | mblk_t | | ovpo_decap() to | +--------------------+ + * +-----------+ | decap mblk_t | + * +-----------------+ + * | + * * . . overlay id + * v found + * +--------+ +----------------+ + * | adjust |----->| call mac_rx | + * | mblk_t | | on original | + * +--------+ | decaped packet | + * +----------------+ + * + * ------------------ + * Netstack Awareness + * ------------------ + * + * In the above image we note that this enters a netstack. Today the only + * netstack that can be is the global zone as the overlay driver itself is not + * exactly netstack aware. What this really means is that varpd cannot run in a + * non-global zone and an overlay device cannot belong to a non-global zone. + * Non-global zones can still have a VNIC assigned to them that's been created + * over the overlay device the same way they would if it had been created over + * an etherstub or a physical device. + * + * The majority of the work to make it netstack aware is straightforward and the + * biggest thing is to create a netstack module that allows us to hook into + * netstack (and thus zone) creation and destruction. From there, we need to + * amend the target cache lookup routines that we discussed earlier to not have + * a global outstanding list and a global list of handles, but rather, one per + * netstack. + * + * For the mux, we'll need to open the ksocket in the context of the zone, we + * can likely do this with a properly composed credential, but we'll need to do + * some more work on that path. Finally, we'll want to make sure the dld ioctls + * are aware of the zoneid of the caller and we use that appropriately and store + * it in the overlay_dev_t. + * + * ----------- + * GLDv3 Notes + * ----------- + * + * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more + * relevant and other parts are much less relevant for us. For example, the + * GLDv3 is used to toggle the device being put into and out of promiscuous + * mode, to program MAC addresses for unicast and multicast hardware filters. + * Today, an overlay device doesn't have a notion of promiscuous mode nor does + * it have a notion of unicast and multicast addresses programmed into the + * device. Instead, for the purposes of the hardware filter, we don't do + * anything and just always accept new addresses being added and removed. + * + * If the GLDv3 start function has not been called, then we will not use this + * device for I/O purposes. Any calls to transmit or receive should be dropped, + * though the GLDv3 guarantees us that transmit will not be called without + * calling start. Similarly, once stop is called, then no packets can be dealt + * with. 
+ * + * Today we don't support the stat interfaces, though there's no good reason + * that we shouldn't assemble some of the stats based on what we have in the + * future. + * + * When it comes to link properties, many of the traditional link properties do + * not apply and many others MAC handles for us. For example, we don't need to + * implement anything for overlay_m_getprop() to deal with returning the MTU, as + * MAC never calls into us for that. As such, there isn't much of anything to + * support in terms of properties. + * + * Today, we don't support any notion of hardware capabilities. However, if + * future NIC hardware or other changes to the system cause it to make sense for + * us to emulate logical groups, then we should do that. However, we still do + * implement a capab function so that we can identify ourselves as an overlay + * device to the broader MAC framework. This is done mostly so that a device + * created on top of us can have fanout rings as we don't try to lie about a + * speed for our device. + * + * The other question is what should be done for a device's MTU and margin. We + * set our minimum supported MTU to be the minimum value that an IP network may + * be set to 576 -- which mimics what an etherstub does. On the flip side, we + * have our upper bound set to 8900. This value comes from the fact that a lot + * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 + * bytes, which isn't exactly the most accurate number, but it'll be good enough + * for now. Because of that, our default MTU off of these devices is 1400, as + * the default MTU for everything is usually 1500 or whatever the underlying + * device is at; however, this is a bit simpler than asking the netstack what + * are all the IP interfaces at. It also calls into question how PMTU and PMTU + * discovery should work here. The challenge, especially for + * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's + * not clear that if you have a single bad entry that the overall MTU should be + * lowered. Instead, we should figure out a better way of determining these + * kinds of PMTU errors and appropriately alerting the administrator via FMA. + * + * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether + * or not the underlying encapsulation device supports VLAN tags. If it does, + * then we'll set the margin to allow for it, otherwise, we will not. 
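Those MTU bounds and the margin ultimately have to be conveyed to MAC when the device registers. The registration path is not part of this excerpt, so the following is only a rough sketch of how the documented values would map onto the standard mac_register_t fields; the helper name and the encap_does_vlan flag are hypothetical, and the real driver may wire this up differently.

/*
 * Rough sketch only (hypothetical helper, not the driver's actual code):
 * expressing the documented MTU range and VLAN margin at mac_register() time.
 */
static int
overlay_mac_register_sketch(overlay_dev_t *odd, boolean_t encap_does_vlan)
{
	mac_register_t *mac;
	int err;

	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
		return (EINVAL);

	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	mac->m_driver = odd;
	mac->m_dip = overlay_dip;
	mac->m_src_addr = overlay_macaddr;
	mac->m_callbacks = &overlay_m_callbacks;
	mac->m_min_sdu = OVERLAY_MTU_MIN;	/* 576, mirroring an etherstub */
	mac->m_max_sdu = OVERLAY_MTU_MAX;	/* 8900, leaves encap headroom */
	mac->m_margin = encap_does_vlan ? VLAN_TAGSZ : 0;

	err = mac_register(mac, &odd->odd_mh);
	mac_free(mac);
	return (err);
}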
+ */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; + mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t *odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + 
mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { + mutex_exit(&odd->odd_lock); + return (EAGAIN); + } + mutex_exit(&odd->odd_lock); + + ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, + &family, &prot, (struct sockaddr *)&storage, &slen); + if (ret != 0) + return (ret); + + mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, + (struct sockaddr *)&storage, slen, &ret); + if (mux == NULL) + return (ret); + + overlay_mux_add_dev(mux, odd); + odd->odd_mux = mux; + mutex_enter(&odd->odd_lock); + ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); + odd->odd_flags |= OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); + + return (0); +} + +static void +overlay_m_stop(void *arg) +{ + overlay_dev_t *odd = arg; + + /* + * The MAC Perimeter is held here, so we don't have to worry about + * synchornizing this with respect to metadata operations. + */ + mutex_enter(&odd->odd_lock); + VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); + VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + overlay_mux_close(odd->odd_mux); + odd->odd_mux = NULL; + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + odd->odd_flags &= ~OVERLAY_F_MDDROP; + VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); + mutex_exit(&odd->odd_lock); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_promisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. 
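+ *
+ * As with the promiscuous and multicast entry points above, unicast address
+ * updates are simply accepted and ignored: there is no hardware here whose
+ * filters would need to be reprogrammed, and MAC takes care of classifying
+ * traffic for any clients built on top of us.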
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ASSERT(ep->b_cont == mp || ep == mp); + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. + */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < 
buflen; i++) { + if (name[i] == '\0') + break; + } + + if (i == 0 || i == buflen) + return (B_FALSE); + actlen = i; + if (strchr(name, '/') != NULL) + return (B_FALSE); + if (u8_validate((char *)name, actlen, NULL, + U8_VALIDATE_ENTIRE, &err) < 0) + return (B_FALSE); + return (B_TRUE); +} + +/* ARGSUSED */ +static int +overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int err; + uint64_t maxid; + overlay_dev_t *odd, *o; + mac_register_t *mac; + overlay_ioc_create_t *oicp = karg; + + if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) + return (EINVAL); + + odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); + odd->odd_linkid = oicp->oic_linkid; + odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); + if (odd->odd_plugin == NULL) { + kmem_free(odd, sizeof (overlay_dev_t)); + return (ENOENT); + } + err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, + &odd->odd_pvoid); + if (err != 0) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + /* + * Make sure that our virtual network id is valid for the given plugin + * that we're working with. + */ + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; + if (oicp->oic_vnetid > maxid) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_vid = oicp->oic_vnetid; + + mac = mac_alloc(MAC_VERSION); + if (mac == NULL) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + mac->m_driver = odd; + mac->m_dip = overlay_dip; + mac->m_dst_addr = NULL; + mac->m_callbacks = &overlay_m_callbacks; + mac->m_pdata = NULL; + mac->m_pdata_size = 0; + + mac->m_priv_props = NULL; + + /* Let mac handle this itself. */ + mac->m_instance = (uint_t)-1; + + /* + * There is no real source address that should be used here, but saying + * that we're not ethernet is going to cause its own problems. At the + * end of the say, this is fine. + */ + mac->m_src_addr = overlay_macaddr; + + /* + * Start with the default MTU as the max SDU. If the MTU is changed, the + * SDU will be changed to reflect that. + */ + mac->m_min_sdu = 1; + mac->m_max_sdu = OVERLAY_MTU_DEF; + mac->m_multicast_sdu = 0; + + /* + * The underlying device doesn't matter, instead this comes from the + * encapsulation protocol and whether or not they allow VLAN tags. + */ + if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { + mac->m_margin = VLAN_TAGSZ; + } else { + mac->m_margin = 0; + } + + /* + * Today, we have no MAC virtualization, it may make sense in the future + * to go ahead and emulate some subset of this, but it doesn't today. 
+ */ + mac->m_v12n = MAC_VIRT_NONE; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (o->odd_linkid == oicp->oic_linkid) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + + if (o->odd_vid == oicp->oic_vnetid && + o->odd_plugin == odd->odd_plugin) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + } + + err = mac_register(mac, &odd->odd_mh); + mac_free(mac); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + (void) mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + 
if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ENXIO); + } + + ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); + odd->odd_flags |= OVERLAY_F_ACTIVATED; + + /* + * Now that we've activated ourselves, we should indicate to the world + * that we're up. Note that we may not be able to perform lookups at + * this time, but our notion of being 'up' isn't dependent on that + * ability. + */ + mac_link_update(odd->odd_mh, LINK_STATE_UP); + mutex_exit(&odd->odd_lock); + + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + overlay_ioc_delete_t *oidp = karg; + overlay_dev_t *odd; + datalink_id_t tid; + int ret; + + odd = overlay_hold_by_dlid(oidp->oid_linkid); + if (odd == NULL) { + return (ENOENT); + } + + mutex_enter(&odd->odd_lock); + /* If we're not the only hold, we're busy */ + if (odd->odd_ref != 1) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + if (odd->odd_flags & OVERLAY_F_IN_MUX) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + /* + * To remove this, we need to first remove it from dls and then remove + * it from mac. The act of removing it from mac will check if there are + * devices on top of this, eg. vnics. If there are, then that will fail + * and we'll have to go through and recreate the dls entry. Only after + * mac_unregister has succeeded, then we'll go through and actually free + * everything and drop the dev lock. 
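+	 * On success we never explicitly release the single hold taken above;
+	 * the device is simply torn down and freed outright.
+	 */
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * odd_lock must not be held across the calls below:
+	 * dls_devnet_destroy() and mac_disable() may block, and
+	 * overlay_hold_rele() on the error paths re-acquires it.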
+ */ + ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); + if (ret != 0) { + overlay_hold_rele(odd); + return (ret); + } + + ASSERT(oidp->oid_linkid == tid); + ret = mac_disable(odd->odd_mh); + if (ret != 0) { + (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + overlay_hold_rele(odd); + return (ret); + } + + overlay_target_quiesce(odd->odd_target); + + mutex_enter(&overlay_dev_lock); + list_remove(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + cv_destroy(&odd->odd_iowait); + mutex_destroy(&odd->odd_lock); + overlay_target_free(odd); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_nprops_t *on = karg; + + odd = overlay_hold_by_dlid(on->oipn_linkid); + if (odd == NULL) + return (ENOENT); + on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) +{ + overlay_prop_handle_t phdl = arg; + overlay_prop_set_range_str(phdl, opp->ovp_name); + return (0); +} + +static int +overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) +{ + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], name) == 0) { + *id = i; + return (0); + } + } + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { + *id = i + OVERLAY_DEV_NPROPS; + return (0); + } + } + + return (ENOENT); +} + +static void +overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) +{ + uint32_t def; + mac_propval_range_t range; + uint_t perm; + + ASSERT(MAC_PERIM_HELD(odd->odd_mh)); + + bzero(&range, sizeof (mac_propval_range_t)); + range.mpr_count = 1; + if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. 
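+		 * Property ids [0, OVERLAY_DEV_NPROPS) name the generic
+		 * device properties in overlay_dev_props[]; anything at or
+		 * above that is an index into the plugin's own property
+		 * array, offset by OVERLAY_DEV_NPROPS.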
+ */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_prop_t *oip = karg; + uint_t propid, mtu; + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + oip->oip_size = OVERLAY_PROP_SIZEMAX; + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_getprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oip_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + /* + * The MTU is always set and retrieved through MAC, to allow for + * MAC to do whatever it wants, 
as really that property belongs + * to MAC. This is important for things where vnics have hold on + * the MTU. + */ + mac_sdu_get(odd->odd_mh, NULL, &mtu); + bcopy(&mtu, oip->oip_value, sizeof (uint_t)); + oip->oip_size = sizeof (uint_t); + break; + case OVERLAY_DEV_P_VNETID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint64_t); + break; + case OVERLAY_DEV_P_ENCAP: + oip->oip_size = strlcpy((char *)oip->oip_value, + odd->odd_plugin->ovp_name, oip->oip_size); + break; + case OVERLAY_DEV_P_VARPDID: + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + const uint64_t val = odd->odd_target->ott_id; + bcopy(&val, oip->oip_value, sizeof (uint64_t)); + oip->oip_size = sizeof (uint64_t); + } else { + oip->oip_size = 0; + } + mutex_exit(&odd->odd_lock); + break; + default: + ret = ENOENT; + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); +} + +static void +overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. + */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); +} + +/* ARGSUSED */ +static int +overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + overlay_ioc_prop_t *oip = karg; + uint_t propid = UINT_MAX; + mac_perim_handle_t mph; + uint64_t maxid, *vidp; + + if (oip->oip_size > OVERLAY_PROP_SIZEMAX) + return (EINVAL); + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mac_perim_exit(mph); + mutex_exit(&odd->odd_lock); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); + } else if (oip->oip_id < -1) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return 
(EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", + oip->oip_value, oip->oip_size); + break; + case OVERLAY_DEV_P_VNETID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vidp = (uint64_t *)oip->oip_value; + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - + 1ULL; + if (*vidp >= maxid) { + ret = EINVAL; + break; + } + overlay_setprop_vnetid(odd, *vidp); + break; + case OVERLAY_DEV_P_ENCAP: + case OVERLAY_DEV_P_VARPDID: + ret = EPERM; + break; + default: + ret = ENOENT; + } + + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); +} + +/* ARGSUSED */ +static int +overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_status_t *os = karg; + + odd = overlay_hold_by_dlid(os->ois_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { + os->ois_status = OVERLAY_I_DEGRADED; + if (odd->odd_fmamsg != NULL) { + (void) strlcpy(os->ois_message, odd->odd_fmamsg, + OVERLAY_STATUS_BUFLEN); + } else { + os->ois_message[0] = '\0'; + } + + } else { + os->ois_status = OVERLAY_I_OK; + os->ois_message[0] = '\0'; + } + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + + return (0); +} + +static dld_ioc_info_t overlay_ioc_list[] = { + { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), + overlay_i_create, secpolicy_dl_config }, + { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), + overlay_i_activate, secpolicy_dl_config }, + { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), + overlay_i_delete, secpolicy_dl_config }, + { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return 
(error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. 
+# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsualation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..0701d08e8b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device FMA operations. 
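+ *
+ * Each overlay device can be individually marked degraded, with a message
+ * that overlay_i_status() reports back to userland, and later restored. A
+ * global count of degraded devices drives a single service impact report
+ * against the driver's dev_info: DDI_SERVICE_DEGRADED when the first device
+ * degrades and DDI_SERVICE_RESTORED when the last one recovers.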
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..58e9f2665d --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ksynch.h> +#include <sys/ksocket.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/pattr.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> + +#include <sys/overlay_impl.h> + +#include <sys/sdt.h> + +#define OVERLAY_FREEMSG(mp, reason) \ + DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason) + +static list_t overlay_mux_list; +static kmutex_t overlay_mux_lock; + +void +overlay_mux_init(void) +{ + list_create(&overlay_mux_list, sizeof (overlay_mux_t), + offsetof(overlay_mux_t, omux_lnode)); + mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_mux_fini(void) +{ + mutex_destroy(&overlay_mux_lock); + list_destroy(&overlay_mux_list); +} + +static int +overlay_mux_comparator(const void *a, const void *b) +{ + const overlay_dev_t *odl, *odr; + odl = a; + odr = b; + if (odl->odd_vid > odr->odd_vid) + return (1); + else if (odl->odd_vid < odr->odd_vid) + return (-1); + else + return (0); +} + +/* + * This is the central receive data path. We need to decode the packet, if we + * can, and then deliver it to the appropriate overlay. 
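+ *
+ * Concretely, for each message in the chain we strip the T_unitdata_ind
+ * (M_PROTO) block that the socket attached, clear any hardware checksum
+ * flags (we have no context for the inner packet), ask the plugin to decap
+ * it, use the resulting virtual network id to find the overlay device in
+ * the mux's AVL tree, and then hand the packet to MAC via mac_rx() while
+ * holding the device's Rx I/O count so that teardown can wait for us.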
+ */ +/* ARGSUSED */ +static boolean_t +overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, + void *arg) +{ + mblk_t *mp, *nmp, *fmp; + overlay_mux_t *mux = arg; + + /* + * We may have a received a chain of messages. Each messsage in the + * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. + * If we aren't getting that, we should probably drop that for the + * moment. + */ + for (mp = mpchain; mp != NULL; mp = nmp) { + struct T_unitdata_ind *tudi; + ovep_encap_info_t infop; + overlay_dev_t od, *odd; + int ret; + + nmp = mp->b_next; + mp->b_next = NULL; + + if (DB_TYPE(mp) != M_PROTO) { + OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); + freemsg(mp); + continue; + } + + if (mp->b_cont == NULL) { + OVERLAY_FREEMSG(mp, "missing a b_cont"); + freemsg(mp); + continue; + } + + tudi = (struct T_unitdata_ind *)mp->b_rptr; + if (tudi->PRIM_type != T_UNITDATA_IND) { + OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); + freemsg(mp); + continue; + } + + /* + * In the future, we'll care about the source information + * for purposes of telling varpd for oob invalidation. But for + * now, just drop that block. + */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + + /* + * Until we have VXLAN-or-other-decap HW acceleration support + * (e.g. we support NICs that reach into VXLAN-encapsulated + * packets and check the inside-VXLAN IP packets' checksums, + * or do LSO with VXLAN), we should clear any HW-accelerated- + * performed bits. + * + * We do this, even in cases of HW_LOCAL_MAC, because we + * absolutely have NO context about the inner packet. + * It could've arrived off an external NIC and been forwarded + * to the overlay network, which means no context. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Decap and deliver. + */ + bzero(&infop, sizeof (ovep_encap_info_t)); + ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); + if (ret != 0) { + OVERLAY_FREEMSG(mp, "decap failed"); + freemsg(mp); + continue; + } + if (MBLKL(mp) > infop.ovdi_hdr_size) { + mp->b_rptr += infop.ovdi_hdr_size; + } else { + while (infop.ovdi_hdr_size != 0) { + size_t rem, blkl; + + if (mp == NULL) + break; + + blkl = MBLKL(mp); + rem = MIN(infop.ovdi_hdr_size, blkl); + infop.ovdi_hdr_size -= rem; + mp->b_rptr += rem; + if (rem == blkl) { + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + OVERLAY_FREEMSG(mp, + "freed a fmp block"); + freemsg(fmp); + } + } + if (mp == NULL) { + OVERLAY_FREEMSG(mp, "freed it all..."); + continue; + } + } + + + od.odd_vid = infop.ovdi_id; + mutex_enter(&mux->omux_lock); + odd = avl_find(&mux->omux_devices, &od, NULL); + if (odd == NULL) { + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "no matching vid"); + freemsg(mp); + continue; + } + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "dev dropped"); + freemsg(mp); + continue; + } + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + } + + return (B_TRUE); +} + +/* + * Register a given device with a socket backend. If no such device socket + * exists, create a new one. 
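+ *
+ * Muxes are shared: they are keyed by (domain, family, protocol, bound
+ * address) and reference counted, so every overlay device using the same
+ * plugin and address shares a single ksocket. Asking for an existing mux
+ * with a different plugin fails with EEXIST.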
+ */ +overlay_mux_t * +overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, + struct sockaddr *addr, socklen_t len, int *errp) +{ + int err; + overlay_mux_t *mux; + ksocket_t ksock; + + if (errp == NULL) + errp = &err; + + mutex_enter(&overlay_mux_lock); + for (mux = list_head(&overlay_mux_list); mux != NULL; + mux = list_next(&overlay_mux_list, mux)) { + if (domain == mux->omux_domain && + family == mux->omux_family && + protocol == mux->omux_protocol && + len == mux->omux_alen && + bcmp(addr, mux->omux_addr, len) == 0) { + + if (opp != mux->omux_plugin) { + *errp = EEXIST; + return (NULL); + } + + mutex_enter(&mux->omux_lock); + mux->omux_count++; + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + *errp = 0; + return (mux); + } + } + + /* + * Today we aren't zone-aware and only exist in the global zone. When we + * allow for things to exist in the non-global zone, we'll want to use a + * credential that's actually specific to the zone. + */ + *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, + kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + return (NULL); + } + + *errp = ksocket_bind(ksock, addr, len, kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + /* + * Ask our lower layer to optionally toggle anything they need on this + * socket. Because a socket is owned by a single type of plugin, we can + * then ask it to perform any additional socket set up it'd like to do. + */ + if (opp->ovp_ops->ovpo_sockopt != NULL && + (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); + list_link_init(&mux->omux_lnode); + mux->omux_ksock = ksock; + mux->omux_plugin = opp; + mux->omux_domain = domain; + mux->omux_family = family; + mux->omux_protocol = protocol; + mux->omux_addr = kmem_alloc(len, KM_SLEEP); + bcopy(addr, mux->omux_addr, len); + mux->omux_alen = len; + mux->omux_count = 1; + avl_create(&mux->omux_devices, overlay_mux_comparator, + sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); + mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); + + + /* Once this is called, we need to expect to rx data */ + *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); + if (*errp != 0) { + ksocket_close(ksock, kcred); + mutex_destroy(&mux->omux_lock); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, len); + kmem_free(mux, sizeof (overlay_mux_t)); + return (NULL); + } + + list_insert_tail(&overlay_mux_list, mux); + mutex_exit(&overlay_mux_lock); + + *errp = 0; + return (mux); +} + +void +overlay_mux_close(overlay_mux_t *mux) +{ + mutex_enter(&overlay_mux_lock); + mutex_enter(&mux->omux_lock); + mux->omux_count--; + if (mux->omux_count != 0) { + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + return; + } + list_remove(&overlay_mux_list, mux); + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + + ksocket_close(mux->omux_ksock, kcred); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, mux->omux_alen); + kmem_free(mux, sizeof (overlay_mux_t)); +} + +void +overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_add(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +void +overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_remove(&mux->omux_devices, odd); 
+ mutex_exit(&mux->omux_lock); +} + +int +overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) +{ + int ret; + + /* + * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, + * that isn't actually supported by UDP at this time. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); + if (ret != 0) + freemsg(mp); + + return (ret); +} diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c new file mode 100644 index 0000000000..348ddb92a2 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_plugin.c @@ -0,0 +1,281 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. 
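+	 *
+	 * (Loading is already on-demand today: overlay_plugin_lookup() will
+	 * attempt a single modload() of a missing plugin from the "overlay"
+	 * module directory before giving up.)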
+ */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + opp->ovp_active = 0; + opp->ovp_name = ovrp->ovep_name; + opp->ovp_ops = ovrp->ovep_ops; + opp->ovp_props = ovrp->ovep_props; + opp->ovp_id_size = ovrp->ovep_id_size; + opp->ovp_flags = ovrp->ovep_flags; + opp->ovp_dest = ovrp->ovep_dest; + + opp->ovp_nprops = 0; + if (ovrp->ovep_props != NULL) { + while (ovrp->ovep_props[opp->ovp_nprops] != NULL) { + if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >= + OVERLAY_PROP_NAMELEN) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EINVAL); + } + opp->ovp_nprops++; + } + } + + mutex_enter(&overlay_plugin_lock); + for (ipp = list_head(&overlay_plugin_list); ipp != NULL; + ipp = list_next(&overlay_plugin_list, ipp)) { + if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EEXIST); + } + } + list_insert_tail(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + return (0); +} + +int +overlay_plugin_unregister(const char *name) +{ + overlay_plugin_t *opp; + + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if 
(strcmp(opp->ovp_name, name) == 0) + break; + } + + if (opp == NULL) { + mutex_exit(&overlay_plugin_lock); + return (ENOENT); + } + + mutex_enter(&opp->ovp_mutex); + if (opp->ovp_active > 0) { + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (EBUSY); + } + mutex_exit(&opp->ovp_mutex); + + list_remove(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + kmem_cache_free(overlay_plugin_cache, opp); + return (0); +} + +overlay_plugin_t * +overlay_plugin_lookup(const char *name) +{ + overlay_plugin_t *opp; + boolean_t trymodload = B_FALSE; + + for (;;) { + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(name, opp->ovp_name) == 0) { + mutex_enter(&opp->ovp_mutex); + opp->ovp_active++; + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (opp); + } + } + mutex_exit(&overlay_plugin_lock); + + if (trymodload == B_TRUE) + return (NULL); + + /* + * If we didn't find it, it may still exist, but just not have + * been a loaded module. In that case, we'll do one attempt to + * load it. + */ + if (modload(OVERLAY_MODDIR, (char *)name) == -1) + return (NULL); + trymodload = B_TRUE; + } + +} + +void +overlay_plugin_rele(overlay_plugin_t *opp) +{ + mutex_enter(&opp->ovp_mutex); + ASSERT(opp->ovp_active > 0); + opp->ovp_active--; + mutex_exit(&opp->ovp_mutex); +} + +void +overlay_plugin_walk(overlay_plugin_walk_f func, void *arg) +{ + overlay_plugin_t *opp; + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (func(opp, arg) != 0) { + mutex_exit(&overlay_plugin_lock); + return; + } + } + mutex_exit(&overlay_plugin_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c new file mode 100644 index 0000000000..ba1ea2a629 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_prop.c @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. 
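+ *
+ * The overlay_prop_handle_t used here is an overlay_ioc_propinfo_t in
+ * disguise; the same helpers are used by the driver for its generic device
+ * properties and handed to plugins so that their ovpo_propinfo() entry
+ * points can describe their own properties.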
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..f4147b56d1 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1651 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device target cache management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> +#include <sys/vlan.h> +#include <sys/crc32.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/overlay_impl.h> +#include <sys/sdt.h> + +/* + * This is total straw man, but at least it's a prime number. Here we're + * going to have to go through and do a lot of evaluation and understanding as + * to how these target caches should grow and shrink, as well as, memory + * pressure and evictions. This just gives us a starting point that'll be 'good + * enough', until it's not. + */ +#define OVERLAY_HSIZE 823 + +/* + * We use this data structure to keep track of what requests have been actively + * allocated to a given instance so we know what to put back on the pending + * list. + */ +typedef struct overlay_target_hdl { + minor_t oth_minor; /* RO */ + zoneid_t oth_zoneid; /* RO */ + int oth_oflags; /* RO */ + list_node_t oth_link; /* overlay_target_lock */ + kmutex_t oth_lock; + list_t oth_outstanding; /* oth_lock */ +} overlay_target_hdl_t; + +typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); +typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); +typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); + +typedef struct overaly_target_ioctl { + int oti_cmd; /* ioctl id */ + boolean_t oti_write; /* ioctl requires FWRITE */ + boolean_t oti_ncopyout; /* copyout data? */ + overlay_target_copyin_f oti_copyin; /* copyin func */ + overlay_target_ioctl_f oti_func; /* function to call */ + overlay_target_copyout_f oti_copyout; /* copyin func */ + size_t oti_size; /* size of user level structure */ +} overlay_target_ioctl_t; + +static kmem_cache_t *overlay_target_cache; +static kmem_cache_t *overlay_entry_cache; +static id_space_t *overlay_thdl_idspace; +static void *overlay_thdl_state; + +/* + * When we support overlay devices in the NGZ, then all of these need to become + * zone aware, by plugging into the netstack engine and becoming per-netstack + * data. + */ +static list_t overlay_thdl_list; +static kmutex_t overlay_target_lock; +static kcondvar_t overlay_target_condvar; +static list_t overlay_target_list; +static boolean_t overlay_target_excl; + +/* + * Outstanding data per hash table entry. 
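+ *
+ * This is intended to bound how many bytes of packets may be queued on a
+ * single target entry (ote_chead/ote_mbsize) while its mapping is being
+ * resolved, so that an unresolvable destination can't pin an arbitrary
+ * amount of memory.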
+ */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + +/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we 
should just remove it from the tree, but then + * delete the entries when we remove them from the hash table + * (which happens through the refhash dtor). + */ + while ((ote = avl_first(ap)) != NULL) + avl_remove(ap, ote); + + avl_destroy(ap); + for (ote = refhash_first(rp); ote != NULL; + ote = refhash_next(rp, ote)) { + refhash_remove(rp, ote); + } + refhash_destroy(rp); + } + + ASSERT(odd->odd_target->ott_ocount == 0); + kmem_cache_free(overlay_target_cache, odd->odd_target); +} + +int +overlay_target_busy() +{ + int ret; + + mutex_enter(&overlay_target_lock); + ret = !list_is_empty(&overlay_thdl_list); + mutex_exit(&overlay_target_lock); + + return (ret); +} + +static void +overlay_target_queue(overlay_target_entry_t *entry) +{ + mutex_enter(&overlay_target_lock); + mutex_enter(&entry->ote_ott->ott_lock); + if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&entry->ote_ott->ott_lock); + mutex_exit(&overlay_target_lock); + return; + } + entry->ote_ott->ott_ocount++; + mutex_exit(&entry->ote_ott->ott_lock); + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&overlay_target_lock); +} + +void +overlay_target_quiesce(overlay_target_t *ott) +{ + if (ott == NULL) + return; + mutex_enter(&ott->ott_lock); + ott->ott_flags |= OVERLAY_T_TEARDOWN; + while (ott->ott_ocount != 0) + cv_wait(&ott->ott_cond, &ott->ott_lock); + mutex_exit(&ott->ott_lock); +} + +/* + * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at + * this time, say for NVGRE, we drop all packets that mcuh this. + */ +int +overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, + socklen_t *slenp) +{ + int ret; + struct sockaddr_in6 *v6; + overlay_target_t *ott; + mac_header_info_t mhi; + overlay_target_entry_t *entry; + + ASSERT(odd->odd_target != NULL); + + /* + * At this point, the overlay device is in a mux which means that it's + * been activated. At this point, parts of the target, such as the mode + * and the destination are now read-only and we don't have to worry + * about synchronization for them. + */ + ott = odd->odd_target; + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (OVERLAY_TARGET_DROP); + + v6 = (struct sockaddr_in6 *)sock; + bzero(v6, sizeof (struct sockaddr_in6)); + v6->sin6_family = AF_INET6; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + mutex_enter(&ott->ott_lock); + bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(ott->ott_u.ott_point.otp_port); + mutex_exit(&ott->ott_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (OVERLAY_TARGET_OK); + } + + ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); + + /* + * Note we only want the MAC address here, therefore we won't bother + * using mac_vlan_header_info(). If any caller needs the vlan info at + * this point, this should change to a call to mac_vlan_header_info(). 
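+	 *
+	 * From here the dynamic path resolves the destination MAC through
+	 * the target cache: a valid entry returns OVERLAY_TARGET_OK with
+	 * the socket address filled in, an entry marked for drop returns
+	 * OVERLAY_TARGET_DROP, and a miss (or a still-pending entry) queues
+	 * the packet and returns OVERLAY_TARGET_ASYNC for varpd to answer.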
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & 
OVERLAY_PLUGIN_D_PORT) { + if (ota->ota_point.otp_port == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + } + + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); + ott->ott_flags = 0; + ott->ott_ocount = 0; + ott->ott_mode = ota->ota_mode; + ott->ott_dest = ota->ota_provides; + ott->ott_id = ota->ota_id; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + bcopy(&ota->ota_point, &ott->ott_u.ott_point, + sizeof (overlay_target_point_t)); + } else { + ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + overlay_mac_hash, overlay_mac_cmp, + overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); + } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (EEXIST); + } + + odd->odd_flags |= OVERLAY_F_VARPD; + odd->odd_target = ott; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + + + return (0); +} + + +/* ARGSUSED */ +static int +overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_degrade_t *otd = arg; + + odd = overlay_hold_by_dlid(otd->otd_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_degrade(odd, otd->otd_buf); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_restore(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_restore(odd); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_VARPD; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + return (0); + +} + +static int +overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_lookup_t *otl = arg; + overlay_target_entry_t *entry; + clock_t ret, timeout; + mac_header_info_t mhi; + + timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC); +again: + mutex_enter(&overlay_target_lock); + while (list_is_empty(&overlay_target_list)) { + ret = cv_timedwait(&overlay_target_condvar, + &overlay_target_lock, timeout); + if (ret == -1) { + mutex_exit(&overlay_target_lock); + return (ETIME); + } + } + entry = list_remove_head(&overlay_target_list); + mutex_exit(&overlay_target_lock); + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + ASSERT(entry->ote_chead == NULL); + mutex_exit(&entry->ote_lock); + goto again; + } + ASSERT(entry->ote_chead != NULL); + + /* + * If we have a bogon that doesn't have a valid mac header, drop it and + * try again. 
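+	 *
+	 * Only the offending head message is freed; if more packets remain
+	 * chained on the entry, the entry goes back on the request queue so
+	 * a later lookup request will pick it up.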
+ */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. 
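+	 * The queued chain is pushed through overlay_m_tx() on this thread;
+	 * anything the mux fails to send is handed back and freed.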
+ */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + 
mutex_enter(&entry->ote_lock); + mutex_exit(&thdl->oth_lock); + mp = entry->ote_chead; + /* Protect against a rogue varpd */ + if (mp == NULL) { + mutex_exit(&entry->ote_lock); + return (EINVAL); + } + mlen = MIN(msgsize(mp), pkt->otp_size); + pkt->otp_size = mlen; + boff = 0; + while (mlen > 0) { + size_t wlen = MIN(MBLKL(mp), mlen); + if (ddi_copyout(mp->b_rptr, + (void *)((uintptr_t)pkt->otp_buf + boff), + wlen, 0) != 0) { + mutex_exit(&entry->ote_lock); + return (EFAULT); + } + mlen -= wlen; + boff += wlen; + mp = mp->b_cont; + } + mutex_exit(&entry->ote_lock); + return (0); +} + +static int +overlay_target_inject(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mutex_enter(&odd->odd_lock); + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + return (0); +} + +static int +overlay_target_resend(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mp = overlay_m_tx(odd, mp); + freemsgchain(mp); + + return (0); +} + +typedef struct overlay_targ_list_int { + boolean_t otli_count; + uint32_t otli_cur; + uint32_t otli_nents; + uint32_t otli_ents[]; +} overlay_targ_list_int_t; + +static int +overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_list_t n; + overlay_targ_list_int_t *otl; + + if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + /* + */ + if (n.otl_nents >= INT32_MAX / sizeof (uint32_t)) + return (EINVAL); + *bsize = sizeof (overlay_targ_list_int_t) + + sizeof (uint32_t) * n.otl_nents; + otl = kmem_zalloc(*bsize, KM_SLEEP); + otl->otli_cur = 0; + otl->otli_nents = n.otl_nents; + if 
(otl->otli_nents != 0) { + otl->otli_count = B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + 
mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if 
(base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we now it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order and the first node will be the mac 00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff. 
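+	 *
+	 * The MAC saved in the marker is where the next call will resume;
+	 * once iteration runs off the end of the tree, otcm_done is set and
+	 * any further calls simply return zero entries.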
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + 
overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return (EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + 
list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..92144b3985 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,394 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine on + * what address we should be listening on. While we do not have a default + * address to listen upon, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Should we enable UDP source port hashing for fanout. + */ +boolean_t vxlan_fanout = B_TRUE; + +/* + * This represents the size in bytes that we want to allocate when allocating a + * vxlan header block. This is intended such that lower levels can try and use + * the message block that we allocate for the IP and UPD header. The hope is + * that even if this is tunneled, that this is enough space. + * + * The vxlan_noalloc_min value represents the minimum amount of space we need to + * consider not allocating a message block and just passing it down the stack in + * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet + * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. 
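+ * That works out to 18 + 20 + 8 + 8 = 54 bytes of headroom in front of the
+ * payload, which matches the default value below.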
+ */ +uint_t vxlan_alloc_size = 128; +uint_t vxlan_noalloc_min = 54; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * provider's can use. + */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { + /* + * This allocation could get hot. We may want to have a good + * way to cache and handle this allocation the same way that IP + * does with keeping around a message block per entry, or + * basically treating this as an immutable message block in the + * system. Basically freemsg() will be a nop, but we'll do the + * right thing with respect to the rest of the chain. 
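+		 *
+		 * Whichever message block ends up in front, the header is
+		 * written by backing b_rptr up VXLAN_HDR_LEN bytes and
+		 * storing the I flag plus the 24-bit VNI shifted into the
+		 * upper bits of the second word.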
+ */ + ob = allocb(vxlan_alloc_size, 0); + if (ob == NULL) + return (ENOMEM); + + ob->b_wptr = DB_LIM(ob); + ob->b_rptr = ob->b_wptr; + ob->b_cont = mp; + } else { + ob = mp; + } + ob->b_rptr -= VXLAN_HDR_LEN; + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = ntohl(VXLAN_F_VDI); + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + 
vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, + vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + "VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp == NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index 95eaa5837f..2f424321f1 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -684,6 +684,7 @@ pcie_init_pfd(dev_info_t *dip) pfd_p->pe_bus_p = bus_p; pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -840,6 +841,7 @@ pcie_rc_init_pfd(dev_info_t *dip, pf_data_t *pfd_p) { pfd_p->pe_bus_p = PCIE_DIP2DOWNBUS(dip); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -921,7 +923,7 @@ pcie_rc_init_bus(dev_info_t *dip) bus_p->bus_aer_off = (uint16_t)-1; /* Needed only for handle lookup */ - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); ndi_set_bus_private(dip, B_FALSE, DEVI_PORT_TYPE_PCI, bus_p); @@ -938,6 +940,172 @@ pcie_rc_fini_bus(dev_info_t *dip) } /* + * We need to capture the supported, maximum, and current device speed and + * width. The way that this has been done has changed over time. + * + * Prior to PCIe Gen 3, there were only current and supported speed fields. + * These were found in the link status and link capabilities registers of the + * PCI express capability. With the change to PCIe Gen 3, the information in the + * link capabilities changed to the maximum value. The supported speeds vector + * was moved to the link capabilities 2 register. + * + * Now, a device may not implement some of these registers. To determine whether + * or not it's here, we have to do the following. First, we need to check the + * revision of the PCI express capability. The link capabilities 2 register did + * not exist prior to version 2 of this register. + */ +static void +pcie_capture_speeds(pcie_bus_t *bus_p, pcie_req_id_t bdf, dev_info_t *rcdip) +{ + uint16_t vers, status; + uint32_t val, cap, cap2; + + if (!PCIE_IS_PCIE(bus_p)) + return; + + vers = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + PCIE_PCIECAP); + if (vers == PCI_EINVAL16) + return; + vers &= PCIE_PCIECAP_VER_MASK; + + /* + * Verify the capability's version. 
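+	 * The Link Capabilities 2 register is only defined from version 2.0
+	 * of the capability onward; for version 1.0 devices we treat it as
+	 * absent (cap2 == 0) and rely on Link Capabilities alone below.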
+ */ + switch (vers) { + case PCIE_PCIECAP_VER_1_0: + cap2 = 0; + break; + case PCIE_PCIECAP_VER_2_0: + cap2 = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKCAP2); + if (cap2 == PCI_EINVAL32) + cap2 = 0; + break; + default: + /* Don't try and handle an unknown version */ + return; + } + + status = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKSTS); + cap = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + PCIE_LINKCAP); + if (status == PCI_EINVAL16 || cap == PCI_EINVAL32) + return; + + switch (status & PCIE_LINKSTS_SPEED_MASK) { + case PCIE_LINKSTS_SPEED_2_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKSTS_SPEED_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKSTS_SPEED_8: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_8; + break; + default: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + + switch (status & PCIE_LINKSTS_NEG_WIDTH_MASK) { + case PCIE_LINKSTS_NEG_WIDTH_X1: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKSTS_NEG_WIDTH_X2: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKSTS_NEG_WIDTH_X4: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKSTS_NEG_WIDTH_X8: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKSTS_NEG_WIDTH_X12: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKSTS_NEG_WIDTH_X16: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKSTS_NEG_WIDTH_X32: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + switch (cap & PCIE_LINKCAP_MAX_WIDTH_MASK) { + case PCIE_LINKCAP_MAX_WIDTH_X1: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKCAP_MAX_WIDTH_X2: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKCAP_MAX_WIDTH_X4: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKCAP_MAX_WIDTH_X8: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKCAP_MAX_WIDTH_X12: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKCAP_MAX_WIDTH_X16: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKCAP_MAX_WIDTH_X32: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_max_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + /* + * If we have the Link Capabilities 2, then we can get the supported + * speeds from it and treat the bits in Link Capabilities 1 as the + * maximum. If we don't, then we need to follow the Implementation Note + * in the standard under Link Capabilities 2. Effectively, this means + * that if the value of 10b is set in Link Capabilities register, that + * it supports both 2.5 and 5 GT/s speeds. 
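+	 *
+	 * Concretely, when Link Capabilities 2 is absent:
+	 *
+	 *	LINKCAP max speed 5.0 GT/s -> supported = { 2.5, 5.0 } GT/s
+	 *	LINKCAP max speed 2.5 GT/s -> supported = { 2.5 } GT/s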
+ */ + if (cap2 != 0) { + if (cap2 & PCIE_LINKCAP2_SPEED_2_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_2_5; + if (cap2 & PCIE_LINKCAP2_SPEED_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_5; + if (cap2 & PCIE_LINKCAP2_SPEED_8) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_8; + + switch (cap & PCIE_LINKCAP_MAX_SPEED_MASK) { + case PCIE_LINKCAP_MAX_SPEED_2_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKCAP_MAX_SPEED_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKCAP_MAX_SPEED_8: + bus_p->bus_max_speed = PCIE_LINK_SPEED_8; + break; + default: + bus_p->bus_max_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + } else { + if (cap & PCIE_LINKCAP_MAX_SPEED_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5 | + PCIE_LINK_SPEED_5; + } + + if (cap & PCIE_LINKCAP_MAX_SPEED_2_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5; + } + } +} + +/* * partially init pcie_bus_t for device (dip,bdf) for accessing pci * config space * @@ -1134,6 +1302,10 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags) } } + /* + * Save and record speed information about the device. + */ + caps_done: /* save RP dip and RP bdf */ if (PCIE_IS_RP(bus_p)) { @@ -1170,7 +1342,7 @@ caps_done: } bus_p->bus_soft_state = PCI_SOFT_STATE_CLOSED; - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); bus_p->bus_mps = 0; ndi_set_bus_private(dip, B_TRUE, DEVI_PORT_TYPE_PCI, (void *)bus_p); @@ -1226,6 +1398,8 @@ initial_done: pcie_init_plat(dip); + pcie_capture_speeds(bus_p, bdf, rcdip); + final_done: PCIE_DBG("Add %s(dip 0x%p, bdf 0x%x, secbus 0x%x)\n", @@ -1318,14 +1492,15 @@ pcie_get_rc_dip(dev_info_t *dip) return (rcdip); } -static boolean_t +boolean_t pcie_is_pci_device(dev_info_t *dip) { dev_info_t *pdip; char *device_type; pdip = ddi_get_parent(dip); - ASSERT(pdip); + if (pdip == NULL) + return (B_FALSE); if (ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, DDI_PROP_DONTPASS, "device_type", &device_type) != DDI_PROP_SUCCESS) diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c index f4a2e9190e..61271d674b 100644 --- a/usr/src/uts/common/io/pciex/pcie_fault.c +++ b/usr/src/uts/common/io/pciex/pcie_fault.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ #include <sys/sysmacros.h> @@ -164,8 +165,8 @@ int pcie_disable_scan = 0; /* Disable fabric scan */ /* Inform interested parties that error handling is about to begin. */ /* ARGSUSED */ void -pf_eh_enter(pcie_bus_t *bus_p) { -} +pf_eh_enter(pcie_bus_t *bus_p) +{} /* Inform interested parties that error handling has ended. */ void @@ -304,7 +305,8 @@ done: } void -pcie_force_fullscan() { +pcie_force_fullscan() +{ pcie_full_scan = B_TRUE; } @@ -917,10 +919,18 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) } /* - * Read vendor/device ID and check with cached data, if it doesn't match - * could very well be a device that isn't responding anymore. Just - * stop. Save the basic info in the error q for post mortem debugging - * purposes. + * If this is a device used for PCI passthrough into a virtual machine, + * don't let any error it caused panic the system. 
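+	 * This is done by adding PF_ERR_PANIC to the severity mask for this
+	 * fault data; pf_analyse_error() later clears any masked bits from
+	 * the computed severity.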
+ */ + if (bus_p->bus_fm_flags & PF_FM_IS_PASSTHRU) + pfd_p->pe_severity_mask |= PF_ERR_PANIC; + + /* + * Read vendor/device ID and check with cached data; if it doesn't + * match, it could very well mean that the device is no longer + * responding. In this case, we return PF_SCAN_BAD_RESPONSE; should + * the caller choose to panic in this case, we will have the basic + * info in the error queue for the purposes of postmortem debugging. */ if (PCIE_GET(32, bus_p, PCI_CONF_VENID) != bus_p->bus_dev_ven_id) { char buf[FM_MAX_CLASS]; @@ -931,12 +941,12 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0, NULL); /* - * For IOV/Hotplug purposes skip gathering info fo this device, + * For IOV/Hotplug purposes skip gathering info for this device, * but populate affected info and severity. Clear out any data * that maybe been saved in the last fabric scan. */ pf_reset_pfd(pfd_p); - pfd_p->pe_severity_flags = PF_ERR_PANIC_BAD_RESPONSE; + pfd_p->pe_severity_flags = PF_ERR_BAD_RESPONSE; PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_SELF; /* Add the snapshot to the error q */ @@ -948,6 +958,7 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) pf_pci_regs_gather(pfd_p, bus_p); pf_pci_regs_clear(pfd_p, bus_p); + if (PCIE_IS_RP(bus_p)) pf_pci_find_rp_fault(pfd_p, bus_p); @@ -982,6 +993,22 @@ done: } /* + * Set the passthru flag on a device bus_p. Called by passthru drivers to + * indicate when a device is or is no longer under passthru control. + */ +void +pf_set_passthru(dev_info_t *dip, boolean_t is_passthru) +{ + pcie_bus_t *bus_p = PCIE_DIP2BUS(dip); + + if (is_passthru) { + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_PASSTHRU); + } else { + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_PASSTHRU); + } +} + +/* * Called during postattach to initialize a device's error handling * capabilities. If the devices has already been hardened, then there isn't * much needed. Otherwise initialize the device's default FMA capabilities. 
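/*
 * Illustrative sketch only: the hypothetical "vmmpt" driver name and its
 * attach/detach entry points below are assumptions, not part of this code.
 * They show the intended pairing of pf_set_passthru() calls around the
 * lifetime of a passthrough assignment.
 */
static int
vmmpt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/* Faults raised while the guest owns the device must not panic. */
	pf_set_passthru(dip, B_TRUE);
	return (DDI_SUCCESS);
}

static int
vmmpt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/* Restore the normal fault-handling policy. */
	pf_set_passthru(dip, B_FALSE);
	return (DDI_SUCCESS);
}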
@@ -1024,7 +1051,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); cap &= (DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); - bus_p->bus_fm_flags |= PF_FM_IS_NH; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_NH); if (cmd == DDI_ATTACH) { ddi_fm_init(dip, &cap, &ibc); @@ -1039,7 +1066,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) /* If ddi_fm_init fails for any reason RETURN */ if (!fmhdl) { - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); return; } @@ -1049,7 +1076,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) ddi_fm_handler_register(dip, pf_dummy_cb, NULL); } - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); } /* undo FMA lock, called at predetach */ @@ -1066,7 +1093,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) return; /* no other code should set the flag to false */ - bus_p->bus_fm_flags &= ~PF_FM_READY; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_READY); /* * Grab the mutex to make sure device isn't in the middle of @@ -1080,7 +1107,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) /* undo non-hardened drivers */ if (bus_p->bus_fm_flags & PF_FM_IS_NH) { if (cmd == DDI_DETACH) { - bus_p->bus_fm_flags &= ~PF_FM_IS_NH; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_NH); pci_ereport_teardown(dip); /* * ddi_fini itself calls ddi_handler_unregister, @@ -1377,7 +1404,7 @@ pf_analyse_error(ddi_fm_error_t *derr, pf_impl_t *impl) sts_flags = 0; /* skip analysing error when no error info is gathered */ - if (pfd_p->pe_severity_flags == PF_ERR_PANIC_BAD_RESPONSE) + if (pfd_p->pe_severity_flags == PF_ERR_BAD_RESPONSE) goto done; switch (PCIE_PFD2BUS(pfd_p)->bus_dev_type) { @@ -1455,6 +1482,8 @@ done: /* Have pciev_eh adjust the severity */ pfd_p->pe_severity_flags = pciev_eh(pfd_p, impl); + pfd_p->pe_severity_flags &= ~pfd_p->pe_severity_mask; + error_flags |= pfd_p->pe_severity_flags; } @@ -2165,7 +2194,8 @@ pf_matched_in_rc(pf_data_t *dq_head_p, pf_data_t *pfd_p, */ static void pf_pci_find_trans_type(pf_data_t *pfd_p, uint64_t *addr, uint32_t *trans_type, - pcie_req_id_t *bdf) { + pcie_req_id_t *bdf) +{ pf_data_t *rc_pfd_p; /* Could be DMA or PIO. Find out by look at error type. */ @@ -2216,7 +2246,8 @@ pf_pci_find_trans_type(pf_data_t *pfd_p, uint64_t *addr, uint32_t *trans_type, */ /* ARGSUSED */ int -pf_pci_decode(pf_data_t *pfd_p, uint16_t *cmd) { +pf_pci_decode(pf_data_t *pfd_p, uint16_t *cmd) +{ pcix_attr_t *attr; uint64_t addr; uint32_t trans_type; @@ -2415,7 +2446,8 @@ done: static int pf_hdl_compare(dev_info_t *dip, ddi_fm_error_t *derr, uint32_t flag, - uint64_t addr, pcie_req_id_t bdf, ndi_fmc_t *fcp) { + uint64_t addr, pcie_req_id_t bdf, ndi_fmc_t *fcp) +{ ndi_fmcentry_t *fep; int found = 0; int status; @@ -2490,7 +2522,7 @@ pf_hdl_compare(dev_info_t *dip, ddi_fm_error_t *derr, uint32_t flag, /* ARGSUSED */ static int pf_log_hdl_lookup(dev_info_t *rpdip, ddi_fm_error_t *derr, pf_data_t *pfd_p, - boolean_t is_primary) + boolean_t is_primary) { /* * Disabling this function temporarily until errors can be handled @@ -2537,7 +2569,8 @@ pf_log_hdl_lookup(dev_info_t *rpdip, ddi_fm_error_t *derr, pf_data_t *pfd_p, * accessed via the bus_p. 
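 *
 * Illustrative sketch (the local names and the pcie_ue_tgt_* result
 * fields are assumptions for this example, not defined here): a caller
 * that has captured the AER uncorrectable-error header log can ask for
 * the faulting transaction to be decoded roughly as follows.
 *
 *	pf_pcie_adv_err_regs_t adv;
 *
 *	bcopy(ue_hdr_log, adv.pcie_ue_hdr, sizeof (adv.pcie_ue_hdr));
 *	if (pf_tlp_decode(bus_p, &adv) == DDI_SUCCESS) {
 *		... adv.pcie_ue_tgt_bdf and adv.pcie_ue_tgt_addr now
 *		    identify the target of the bad TLP ...
 *	}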
*/ int -pf_tlp_decode(pcie_bus_t *bus_p, pf_pcie_adv_err_regs_t *adv_reg_p) { +pf_tlp_decode(pcie_bus_t *bus_p, pf_pcie_adv_err_regs_t *adv_reg_p) +{ pcie_tlp_hdr_t *tlp_hdr = (pcie_tlp_hdr_t *)adv_reg_p->pcie_ue_hdr; pcie_req_id_t my_bdf, tlp_bdf, flt_bdf = PCIE_INVALID_BDF; uint64_t flt_addr = 0; @@ -3050,6 +3083,7 @@ pf_reset_pfd(pf_data_t *pfd_p) pcie_bus_t *bus_p = PCIE_PFD2BUS(pfd_p); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; /* pe_lock and pe_valid were reset in pf_send_ereport */ diff --git a/usr/src/uts/common/io/pciex/pciev.c b/usr/src/uts/common/io/pciex/pciev.c index b442aae5b5..c8b2cfdda7 100644 --- a/usr/src/uts/common/io/pciex/pciev.c +++ b/usr/src/uts/common/io/pciex/pciev.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2017, Joyent, Inc. + */ + #include <sys/types.h> #include <sys/ddi.h> #include <sys/dditypes.h> @@ -302,7 +306,7 @@ pciev_eh(pf_data_t *pfd_p, pf_impl_t *impl) pcie_faulty_all = B_TRUE; } else if (severity & (PF_ERR_NO_PANIC | PF_ERR_MATCHED_DEVICE | - PF_ERR_PANIC | PF_ERR_PANIC_BAD_RESPONSE)) { + PF_ERR_PANIC | PF_ERR_BAD_RESPONSE)) { uint16_t affected_flag, dev_affected_flags; uint_t is_panic = 0, is_aff_dev_found = 0; diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c index 39d5003b02..c48fecd133 100644 --- a/usr/src/uts/common/io/physmem.c +++ b/usr/src/uts/common/io/physmem.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp) int ret; static int msg_printed = 0; + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EACCES); + if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) { return (EINVAL); } diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. 
+# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c index f83b0abf39..0ae06f88cc 100644 --- a/usr/src/uts/common/io/pseudonex.c +++ b/usr/src/uts/common/io/pseudonex.c @@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t); static int pseudonex_open(dev_t *, int, int, cred_t *); static int pseudonex_close(dev_t, int, int, cred_t *); static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int, + ddi_iblock_cookie_t *); static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); @@ -90,6 +92,8 @@ static void *pseudonex_state; typedef struct pseudonex_state { dev_info_t *pnx_devi; + int pnx_fmcap; + ddi_iblock_cookie_t pnx_fm_ibc; } pseudonex_state_t; static struct bus_ops pseudonex_bus_ops = { @@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = { NULL, /* bus_intr_ctl */ NULL, /* bus_config */ NULL, /* bus_unconfig */ - NULL, /* bus_fm_init */ + pseudonex_fm_init, /* bus_fm_init */ NULL, /* bus_fm_fini */ NULL, /* bus_fm_access_enter */ NULL, /* bus_fm_access_exit */ @@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pnx_state = ddi_get_soft_state(pseudonex_state, instance); pnx_state->pnx_devi = devi; + pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE; + ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc); + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, DDI_NT_NEXUS, 0) != DDI_SUCCESS) { ddi_remove_minor_node(devi, NULL); @@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) if (cmd == DDI_SUSPEND) return (DDI_SUCCESS); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ddi_fm_fini(devi); ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(pseudonex_state, instance); return (DDI_SUCCESS); @@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child) } static int +pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, + ddi_iblock_cookie_t *ibc) +{ + pseudonex_state_t *pnx_state; + + pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip)); + ASSERT(pnx_state != NULL); + ASSERT(ibc != NULL); + *ibc = pnx_state->pnx_fm_ibc; + return (pnx_state->pnx_fmcap & cap); +} + +static int pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, void *arg, void *result) { diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index d9ec6cc585..dc6c4e64ad 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
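 *
 * As an illustrative sketch (the surrounding STREAMS plumbing is elided
 * and assumed): a kernel caller that has issued the PTMPTSOPENCB ioctl
 * handled below receives a ptmptsopencb_t in the reply and can later ask
 * whether the subsidiary (pts) side has been opened, as tracked here via
 * pt_nullmsg, without holding any ptm state of its own:
 *
 *	ptmptsopencb_t ppocb;
 *
 *	... copy the M_IOCACK payload into ppocb ...
 *	if (ppocb.ppocb_func(ppocb.ppocb_arg)) {
 *		... the pts side has been opened ...
 *	}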
*/ @@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg Binary files differnew file mode 100644 index 0000000000..b932ffaa7c --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg Binary files differnew file mode 100644 index 0000000000..9421ecc0db --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png Binary files differnew file mode 100644 index 0000000000..4b8a66761a --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png Binary files differnew file mode 100644 index 0000000000..3254fbdc3b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg Binary files differnew file mode 100644 index 0000000000..7bb0dbf21b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin Binary files differnew file mode 100644 index 0000000000..43014fd8ea --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin Binary files differnew file mode 100644 index 0000000000..9524eb4a63 --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin diff --git a/usr/src/uts/common/io/qede/qede_fp.c b/usr/src/uts/common/io/qede/qede_fp.c index 5b6177963e..fe7cb7a6cd 100644 --- a/usr/src/uts/common/io/qede/qede_fp.c +++ 
b/usr/src/uts/common/io/qede/qede_fp.c @@ -1848,8 +1848,6 @@ qede_ring_tx(void *arg, mblk_t *mp) } if (!qede->params.link_state) { - qede_print_err("!%s(%d): Link !up for xmit", - __func__, qede->instance); goto exit; } diff --git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h index 2350cb4117..656d2a915f 100644 --- a/usr/src/uts/common/io/qede/qede_list.h +++ b/usr/src/uts/common/io/qede/qede_list.h @@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list, #define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE #endif /* !_QEDE_LIST_H */ - diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h index ebf2624268..54678600c9 100644 --- a/usr/src/uts/common/io/qede/qede_version.h +++ b/usr/src/uts/common/io/qede/qede_version.h @@ -42,4 +42,3 @@ #define REVVERSION 23 #endif /* !_QEDE_VERSION_H */ - diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c index d79b86362c..a50bbcceec 100644 --- a/usr/src/uts/common/io/random.c +++ b/usr/src/uts/common/io/random.c @@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp) if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0) return (error); + if (crgetzone(credp) != global_zone) + continue; + switch (devno) { case DEVRANDOM: if ((error = random_add_entropy(buf, bytes, 0)) != 0) diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c index 4ec7d34b92..4c5489b039 100644 --- a/usr/src/uts/common/io/rsm/rsm.c +++ b/usr/src/uts/common/io/rsm/rsm.c @@ -22,8 +22,8 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c index 227fc76b6f..258e1d742c 100644 --- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c +++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c @@ -10559,8 +10559,7 @@ ahci_em_led_task(void *arg) mutex_enter(&led->aelta_ctl->ahcictl_mutex); if (ret) { - led->aelta_ctl->ahcictl_em_state[led->aelta_port] = - led->aelta_state; + led->aelta_ctl->ahcictl_em_state[led->aelta_port] = state; led->aelta_ret = 0; } else { led->aelta_ret = EIO; @@ -10770,6 +10769,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg) } task->aelta_ctl = ahci_ctlp; + task->aelta_port = set.aiems_port; task->aelta_port = (uint8_t)set.aiems_port; task->aelta_op = set.aiems_op; task->aelta_state = set.aiems_leds; diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c index 25c91f3560..dc9547a275 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c @@ -72,6 +72,7 @@ #include <sys/file.h> #include <sys/policy.h> #include <sys/model.h> +#include <sys/refhash.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dr.h> @@ -99,7 +100,6 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c deleted file mode 100644 index 8f96c2d9f1..0000000000 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Joyent, Inc. All rights reserved. 
- */ - -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> -#include <sys/sysmacros.h> -#include <sys/types.h> -#include <sys/kmem.h> -#include <sys/list.h> -#include <sys/ddi.h> - -#ifdef lint -extern refhash_link_t *obj_to_link(refhash_t *, void *); -extern void *link_to_obj(refhash_t *, refhash_link_t *); -extern void *obj_to_tag(refhash_t *, void *); -#else -#define obj_to_link(_h, _o) \ - ((refhash_link_t *)(((char *)(_o)) + (_h)->rh_link_off)) -#define link_to_obj(_h, _l) \ - ((void *)(((char *)(_l)) - (_h)->rh_link_off)) -#define obj_to_tag(_h, _o) \ - ((void *)(((char *)(_o)) + (_h)->rh_tag_off)) -#endif - -refhash_t * -refhash_create(uint_t bucket_count, refhash_hash_f hash, - refhash_cmp_f cmp, refhash_dtor_f dtor, size_t obj_size, size_t link_off, - size_t tag_off, int km_flags) -{ - refhash_t *hp; - uint_t i; - - hp = kmem_alloc(sizeof (refhash_t), km_flags); - if (hp == NULL) - return (NULL); - hp->rh_buckets = kmem_zalloc(bucket_count * sizeof (list_t), km_flags); - if (hp->rh_buckets == NULL) { - kmem_free(hp, sizeof (refhash_t)); - return (NULL); - } - hp->rh_bucket_count = bucket_count; - - for (i = 0; i < bucket_count; i++) { - list_create(&hp->rh_buckets[i], sizeof (refhash_link_t), - offsetof(refhash_link_t, rhl_chain_link)); - } - list_create(&hp->rh_objs, sizeof (refhash_link_t), - offsetof(refhash_link_t, rhl_global_link)); - - hp->rh_obj_size = obj_size; - hp->rh_link_off = link_off; - hp->rh_tag_off = tag_off; - hp->rh_hash = hash; - hp->rh_cmp = cmp; - hp->rh_dtor = dtor; - - return (hp); -} - -void -refhash_destroy(refhash_t *hp) -{ - ASSERT(list_is_empty(&hp->rh_objs)); - - kmem_free(hp->rh_buckets, hp->rh_bucket_count * sizeof (list_t)); - kmem_free(hp, sizeof (refhash_t)); -} - -void -refhash_insert(refhash_t *hp, void *op) -{ - uint_t bucket; - refhash_link_t *lp = obj_to_link(hp, op); - - bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count; - list_link_init(&lp->rhl_chain_link); - list_link_init(&lp->rhl_global_link); - lp->rhl_flags = 0; - lp->rhl_refcnt = 0; - list_insert_tail(&hp->rh_buckets[bucket], lp); - list_insert_tail(&hp->rh_objs, lp); -} - -static void -refhash_delete(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - uint_t bucket; - - bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count; - list_remove(&hp->rh_buckets[bucket], lp); - list_remove(&hp->rh_objs, lp); - hp->rh_dtor(op); -} - -void -refhash_remove(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - - if (lp->rhl_refcnt > 0) { - lp->rhl_flags |= RHL_F_DEAD; - } else { - refhash_delete(hp, op); - } -} - -void * -refhash_lookup(refhash_t *hp, const void *tp) -{ - uint_t bucket; - refhash_link_t *lp; - void *op; - - bucket = hp->rh_hash(tp) % hp->rh_bucket_count; - for (lp = list_head(&hp->rh_buckets[bucket]); lp != NULL; - lp = list_next(&hp->rh_buckets[bucket], lp)) { - op = link_to_obj(hp, lp); - if (hp->rh_cmp(obj_to_tag(hp, op), tp) == 0 && - !(lp->rhl_flags & RHL_F_DEAD)) { - return (op); - } - } - - return (NULL); -} - -void * -refhash_linear_search(refhash_t *hp, refhash_eval_f eval, void *arg) -{ - void *op; - refhash_link_t *lp; - - for (lp = list_head(&hp->rh_objs); lp != NULL; - lp = list_next(&hp->rh_objs, lp)) { - op = link_to_obj(hp, lp); - if (eval(op, arg) == 0) - return (op); - } - - return (NULL); -} - -void -refhash_hold(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - - ++lp->rhl_refcnt; -} - -void -refhash_rele(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, 
op); - - ASSERT(lp->rhl_refcnt > 0); - - if (--lp->rhl_refcnt == 0 && (lp->rhl_flags & RHL_F_DEAD)) - refhash_remove(hp, op); -} - -void * -refhash_first(refhash_t *hp) -{ - refhash_link_t *lp; - - lp = list_head(&hp->rh_objs); - if (lp == NULL) - return (NULL); - - ++lp->rhl_refcnt; - - return (link_to_obj(hp, lp)); -} - -void * -refhash_next(refhash_t *hp, void *op) -{ - refhash_link_t *lp; - - lp = obj_to_link(hp, op); - while ((lp = list_next(&hp->rh_objs, lp)) != NULL) { - if (!(lp->rhl_flags & RHL_F_DEAD)) - break; - } - - refhash_rele(hp, op); - if (lp == NULL) - return (NULL); - - ++lp->rhl_refcnt; - - return (link_to_obj(hp, lp)); -} - -boolean_t -refhash_obj_valid(refhash_t *hp, const void *op) -{ - /* LINTED - E_ARG_INCOMPATIBLE_WITH_ARG_L */ - const refhash_link_t *lp = obj_to_link(hp, op); - - return ((lp->rhl_flags & RHL_F_DEAD) != 0); -} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c new file mode 100644 index 0000000000..a7ef2b69d7 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c @@ -0,0 +1,565 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static int smrt_attach(dev_info_t *, ddi_attach_cmd_t); +static int smrt_detach(dev_info_t *, ddi_detach_cmd_t); +static int smrt_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static void smrt_cleanup(smrt_t *); +static int smrt_command_comparator(const void *, const void *); + +/* + * Controller soft state. Each entry is an object of type "smrt_t". + */ +void *smrt_state; + +/* + * DMA attributes template. Each controller will make a copy of this template + * with appropriate customisations; e.g., the Scatter/Gather List Length. + */ +static ddi_dma_attr_t smrt_dma_attr_template = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + /* + * There is some suggestion that at least some, possibly older, Smart + * Array controllers cannot tolerate a DMA segment that straddles a 4GB + * boundary. + */ + .dma_attr_seg = 0xFFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * Device memory access attributes for device control registers. 
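 *
 * Illustrative sketch (the register number and offsets are assumptions;
 * the real mapping work happens in smrt_device_setup()): access
 * attributes like these are what get handed to ddi_regs_map_setup(9F)
 * when mapping the controller's register space, e.g.
 *
 *	caddr_t regs;
 *	ddi_acc_handle_t hdl;
 *
 *	if (ddi_regs_map_setup(dip, 1, &regs, 0, 0,
 *	    &smrt_dev_attributes, &hdl) != DDI_SUCCESS) {
 *		return (DDI_FAILURE);
 *	}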
+ */ +ddi_device_acc_attr_t smrt_dev_attributes = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + +/* + * Character/Block Operations Structure + */ +static struct cb_ops smrt_cb_ops = { + .cb_rev = CB_REV, + .cb_flag = D_NEW | D_MP, + + .cb_open = scsi_hba_open, + .cb_close = scsi_hba_close, + + .cb_ioctl = smrt_ioctl, + + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = NULL, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +/* + * Device Operations Structure + */ +static struct dev_ops smrt_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + + .devo_attach = smrt_attach, + .devo_detach = smrt_detach, + + .devo_cb_ops = &smrt_cb_ops, + + .devo_getinfo = nodev, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_reset = nodev, + .devo_bus_ops = NULL, + .devo_power = nodev, + .devo_quiesce = nodev +}; + +/* + * Linkage structures + */ +static struct modldrv smrt_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "HP Smart Array", + .drv_dev_ops = &smrt_dev_ops +}; + +static struct modlinkage smrt_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &smrt_modldrv, NULL } +}; + + +int +_init() +{ + int r; + + VERIFY0(ddi_soft_state_init(&smrt_state, sizeof (smrt_t), 0)); + + if ((r = scsi_hba_init(&smrt_modlinkage)) != 0) { + goto fail; + } + + if ((r = mod_install(&smrt_modlinkage)) != 0) { + scsi_hba_fini(&smrt_modlinkage); + goto fail; + } + + return (r); + +fail: + ddi_soft_state_fini(&smrt_state); + return (r); +} + +int +_fini() +{ + int r; + + if ((r = mod_remove(&smrt_modlinkage)) == 0) { + scsi_hba_fini(&smrt_modlinkage); + ddi_soft_state_fini(&smrt_state); + } + + return (r); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&smrt_modlinkage, modinfop)); +} + +static int +smrt_iport_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + const char *addr; + dev_info_t *pdip; + int instance; + smrt_t *smrt; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Note, we cannot get to our parent via the tran's tran_hba_private + * member. This pointer is reset to NULL when the scsi_hba_tran_t + * structure is duplicated. 
+ */ + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + pdip = ddi_get_parent(dip); + instance = ddi_get_instance(pdip); + smrt = ddi_get_soft_state(smrt_state, instance); + VERIFY(smrt != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + if (smrt_logvol_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_virt_iport = dip; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + if (smrt_phys_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_phys_iport = dip; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_iport_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + const char *addr; + scsi_hba_tran_t *tran; + smrt_t *smrt; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + tran = ddi_get_driver_private(dip); + VERIFY(tran != NULL); + smrt = tran->tran_hba_private; + VERIFY(smrt != NULL); + + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + smrt_logvol_hba_teardown(smrt, dip); + smrt->smrt_virt_iport = NULL; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + smrt_phys_hba_teardown(smrt, dip); + smrt->smrt_phys_iport = NULL; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + uint32_t instance; + smrt_t *smrt; + boolean_t check_for_interrupts = B_FALSE; + int r; + char taskq_name[64]; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_attach(dip, cmd)); + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + /* + * Allocate the per-controller soft state object and get + * a pointer to it. + */ + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(smrt_state, instance) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not allocate soft state"); + return (DDI_FAILURE); + } + if ((smrt = ddi_get_soft_state(smrt_state, instance)) == NULL) { + dev_err(dip, CE_WARN, "could not get soft state"); + ddi_soft_state_free(smrt_state, instance); + return (DDI_FAILURE); + } + + /* + * Initialise per-controller state object. + */ + smrt->smrt_dip = dip; + smrt->smrt_instance = instance; + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + list_create(&smrt->smrt_commands, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link)); + list_create(&smrt->smrt_finishq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_finish)); + list_create(&smrt->smrt_abortq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_abort)); + list_create(&smrt->smrt_volumes, sizeof (smrt_volume_t), + offsetof(smrt_volume_t, smlv_link)); + list_create(&smrt->smrt_physicals, sizeof (smrt_physical_t), + offsetof(smrt_physical_t, smpt_link)); + list_create(&smrt->smrt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_ctlr)); + avl_create(&smrt->smrt_inflight, smrt_command_comparator, + sizeof (smrt_command_t), offsetof(smrt_command_t, + smcm_node)); + cv_init(&smrt->smrt_cv_finishq, NULL, CV_DRIVER, NULL); + + smrt->smrt_init_level |= SMRT_INITLEVEL_BASIC; + + /* + * Perform basic device setup, including identifying the board, mapping + * the I2O registers and the Configuration Table. + */ + if (smrt_device_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "device setup failed"); + goto fail; + } + + /* + * Select a Transport Method (e.g. Simple or Performant) and update + * the Configuration Table. This function also waits for the + * controller to become ready. 
+ */ + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "controller initialisation failed"); + goto fail; + } + + /* + * Each controller may have a different Scatter/Gather Element count. + * Configure a per-controller set of DMA attributes with the + * appropriate S/G size. + */ + VERIFY(smrt->smrt_sg_cnt > 0); + smrt->smrt_dma_attr = smrt_dma_attr_template; + smrt->smrt_dma_attr.dma_attr_sgllen = smrt->smrt_sg_cnt; + + /* + * Now that we have selected a Transport Method, we can configure + * the appropriate interrupt handlers. + */ + if (smrt_interrupts_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler setup failed"); + goto fail; + } + + /* + * Now that we have the correct interrupt priority, we can initialise + * the mutex. This must be done before the interrupt handler is + * enabled. + */ + mutex_init(&smrt->smrt_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(smrt->smrt_interrupt_pri)); + smrt->smrt_init_level |= SMRT_INITLEVEL_MUTEX; + + /* + * From this point forward, the controller is able to accept commands + * and (at least by polling) return command submissions. Setting this + * flag allows the rest of the driver to interact with the device. + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING; + + if (smrt_interrupts_enable(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler could not be enabled"); + goto fail; + } + + if (smrt_ctrl_hba_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "SCSI framework setup failed"); + goto fail; + } + + /* + * Set the appropriate Interrupt Mask Register bits to start + * command completion interrupts from the controller. + */ + smrt_intr_set(smrt, B_TRUE); + check_for_interrupts = B_TRUE; + + /* + * Register the maintenance routine for periodic execution: + */ + smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt, + SMRT_PERIODIC_RATE * NANOSEC, DDI_IPL_0); + smrt->smrt_init_level |= SMRT_INITLEVEL_PERIODIC; + + (void) snprintf(taskq_name, sizeof (taskq_name), "smrt_discover_%u", + instance); + smrt->smrt_discover_taskq = ddi_taskq_create(smrt->smrt_dip, taskq_name, + 1, TASKQ_DEFAULTPRI, 0); + if (smrt->smrt_discover_taskq == NULL) { + dev_err(dip, CE_WARN, "failed to create discovery task queue"); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_TASKQ; + + if ((r = smrt_event_init(smrt)) != 0) { + dev_err(dip, CE_WARN, "could not initialize event subsystem " + "(%d)", r); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_ASYNC_EVENT; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_VIRT) != DDI_SUCCESS) + goto fail; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_PHYS) != DDI_SUCCESS) + goto fail; + + /* + * Announce the attachment of this controller. + */ + ddi_report_dev(dip); + + return (DDI_SUCCESS); + +fail: + if (check_for_interrupts) { + if (smrt->smrt_stats.smrts_claimed_interrupts == 0) { + dev_err(dip, CE_WARN, "controller did not interrupt " + "during attach"); + } + } + smrt_cleanup(smrt); + return (DDI_FAILURE); +} + +static int +smrt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + scsi_hba_tran_t *tran = (scsi_hba_tran_t *)ddi_get_driver_private(dip); + smrt_t *smrt = (smrt_t *)tran->tran_hba_private; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_detach(dip, cmd)); + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* + * First, check to make sure that all SCSI framework targets have + * detached. 
+ */ + mutex_enter(&smrt->smrt_mutex); + if (!list_is_empty(&smrt->smrt_targets)) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach; targets still " + "using HBA"); + return (DDI_FAILURE); + } + + if (smrt->smrt_virt_iport != NULL || smrt->smrt_phys_iport != NULL) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach: iports still " + "attached"); + return (DDI_FAILURE); + } + + /* + * Prevent new targets from attaching now: + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_DETACHING; + mutex_exit(&smrt->smrt_mutex); + + /* + * Clean up all remaining resources. + */ + smrt_cleanup(smrt); + + return (DDI_SUCCESS); +} + +static int +smrt_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rval) +{ + int inst = MINOR2INST(getminor(dev)); + int status; + + if (secpolicy_sys_config(credp, B_FALSE) != 0) { + return (EPERM); + } + + /* + * Ensure that we have a soft state object for this instance. + */ + if (ddi_get_soft_state(smrt_state, inst) == NULL) { + return (ENXIO); + } + + switch (cmd) { + default: + status = scsi_hba_ioctl(dev, cmd, arg, mode, credp, rval); + break; + } + + return (status); +} + +static void +smrt_cleanup(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_ASYNC_EVENT) { + smrt_event_fini(smrt); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_ASYNC_EVENT; + } + + smrt_interrupts_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_TASKQ) { + ddi_taskq_destroy(smrt->smrt_discover_taskq); + smrt->smrt_discover_taskq = NULL; + smrt->smrt_init_level &= ~SMRT_INITLEVEL_TASKQ; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_PERIODIC) { + ddi_periodic_delete(smrt->smrt_periodic); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_PERIODIC; + } + + smrt_ctrl_hba_teardown(smrt); + + smrt_ctlr_teardown(smrt); + + smrt_device_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_BASIC) { + smrt_logvol_teardown(smrt); + smrt_phys_teardown(smrt); + + cv_destroy(&smrt->smrt_cv_finishq); + + VERIFY(list_is_empty(&smrt->smrt_commands)); + list_destroy(&smrt->smrt_commands); + list_destroy(&smrt->smrt_finishq); + list_destroy(&smrt->smrt_abortq); + + VERIFY(list_is_empty(&smrt->smrt_volumes)); + list_destroy(&smrt->smrt_volumes); + + VERIFY(list_is_empty(&smrt->smrt_physicals)); + list_destroy(&smrt->smrt_physicals); + + VERIFY(list_is_empty(&smrt->smrt_targets)); + list_destroy(&smrt->smrt_targets); + + VERIFY(avl_is_empty(&smrt->smrt_inflight)); + avl_destroy(&smrt->smrt_inflight); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_BASIC; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_MUTEX) { + mutex_destroy(&smrt->smrt_mutex); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_MUTEX; + } + + VERIFY0(smrt->smrt_init_level); + + ddi_soft_state_free(smrt_state, ddi_get_instance(smrt->smrt_dip)); +} + +/* + * Comparator for the "smrt_inflight" AVL tree in a "smrt_t". This AVL tree + * allows a tag ID to be mapped back to the relevant "smrt_command_t". 
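 *
 * Illustrative sketch only (smrt_lookup_inflight() is referenced but not
 * shown in this excerpt, so its shape here is assumed from the
 * comparator): a tag returned by the controller can be resolved with a
 * stack key that carries only the tag value, e.g.
 *
 *	smrt_command_t *
 *	smrt_lookup_inflight(smrt_t *smrt, uint32_t tag)
 *	{
 *		smrt_command_t srch;
 *
 *		VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
 *		srch.smcm_tag = tag;
 *		return (avl_find(&smrt->smrt_inflight, &srch, NULL));
 *	}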
+ */ +static int +smrt_command_comparator(const void *lp, const void *rp) +{ + const smrt_command_t *l = lp; + const smrt_command_t *r = rp; + + if (l->smcm_tag > r->smcm_tag) { + return (1); + } else if (l->smcm_tag < r->smcm_tag) { + return (-1); + } else { + return (0); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf new file mode 100644 index 0000000000..758ecd0779 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +scsi-no-quiesce=1; diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c new file mode 100644 index 0000000000..b4cdd5607e --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c @@ -0,0 +1,2023 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * Discovery, Resets, Periodics, and Events + * ---------------------------------------- + * + * Discovery is the act of figuring out what logical and physical volumes exist + * under the controller. Discovery happens in response to the following events: + * + * o iports for virtual and physical devices being attached + * o Controller event notifications indicating potential topology changes + * o After a reset of the controller, before we can perform I/O again + * + * Because we have to perform discovery after a reset, which can happen during + * panic(), that also means that discovery may be run in panic context. We + * also need to emphasize the need for discovery to happen after a controller + * reset. Once a reset is initiated, we cannot be certain about the addresses + * of any of the existing targets until the reset has completed. The driver + * performs I/Os to addresses that the controller provides. The controller + * specification says that these addresses may change after a controller reset. + * + * Unfortunately, all of this combined means that making sure we can correctly + * run discovery is somewhat complicated. In non-panic contexts, discovery is + * always run from a taskq. We'll kick off the discovery in the taskq if + * nothing is pending at that time. The state is managed by bits in the + * smrt_status member of the smrt_t. There are four bits at this time: + * + * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has + * requested that a discovery be performed. + * If no flags are set when this is set, + * then we will kick off discovery. All + * discovery requests are initiated via the + * smrt_discover_request() function. 
+ * + * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us + * running a discovery. It is removed when + * discovery finishes. + * + * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of + * circumstances, which will be described + * in a subsequent section. This indicates + * that the periodic must kick off the + * discovery process. + * + * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a + * controller reset occurred and we need to + * have a successful discovery to finish + * the act of resetting and allowing I/O to + * continue. + * + * In general, a request to discover kicks off the taskq to discover entries, if + * it hasn't already been requested or started. This also allows us to coalesce + * multiple requests, if needed. Note that if a request comes in when a + * discovery is ongoing, we do not kick off discovery again. Instead, we set + * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the + * initial pass has completed. + * + * When a discovery starts, the first thing it does is clear the + * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any + * additional requests for discovery that come in after this has started likely + * indicate that we've missed something. As such, when the discovery process + * finishes, if it sees the REQUESTED flag, then it will need to set the + * PERIODIC flag. The PERIODIC flag is used to indicate that we should run + * discovery again, but not kick if off immediately. Instead, it should be + * driven by the normal periodic behavior. + * + * If for some reason the act of discovery fails, or we fail to dispatch + * discovery due to a transient error, then we will flag PERIODIC so that the + * periodic tick will try and run things again. + * + * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set + * after a reset occurs. The reset thread will be blocked on this. + * Importantly, none of the code in the discovery path can ask for a controller + * reset at this time. If at the end of a discovery, this flag is set, then we + * will signal the reset thread that it should check on its status by + * broadcasting on the smrt_cv_finishq. At that point, the reset thread will + * continue. + * + * Panic Context + * ------------- + * + * All of this talk of threads and taskqs is well and good, but as an HBA + * driver, we have a serious responsibility to try and deal with panic sanely. + * In panic context, we will directly call the discovery functions and not poll + * for them to occur. + * + * However, because our discovery relies on the target maps, which aren't safe + * for panic context at this time, we have to take a different approach. We + * leverage the fact that we have a generation number stored with every + * discovery. If we try to do an I/O to a device where the generation doesn't + * match, then we know that it disappeared and should not be used. We also + * sanity check the model, serial numbers, and WWNs to make sure that these are + * the same devices. If they are, then we'll end up updating the address + * structures. + * + * Now, it is possible that when we were panicking, we had a thread that was in + * the process of running a discovery or even resetting the system. Once we're + * in panic, those threads aren't running, so if they didn't end up producing a + * new view of the world that the SCSI framework is using, then it shouldn't + * really matter, as we won't have updated the list of devices. 
Importantly, + * once we're in that context, we're not going to be attaching or detaching + * targets. If we get a request for one of these targets which has disappeared, + * we're going to have to end up giving up. + * + * Request Attributes + * ------------------ + * + * The CISS specification allows for three different kinds of attributes that + * describe how requests are queued to the controller. These are: + * + * HEAD OF QUEUE The request should go to the head of the + * controller queue. This is used for resets and + * aborts to ensure that they're not blocked behind + * additional I/O. + * + * SIMPLE This queues the request for normal processing. + * Commands queued this way are not special with + * respect to one another. We use this for all I/O + * and discovery commands. + * + * ORDERED This attribute is used to indicate that commands + * should be submitted and processed in some order. + * This is used primarily for the event + * notification bits so we can ensure that at the + * return of a cancellation of the event + * notification, that any outstanding request has + * been honored. + */ + +static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *); +static void smrt_discover(void *); + +/* + * The maximum number of seconds to wait for the controller to come online. + */ +unsigned smrt_ciss_init_time = 90; + +/* + * A tunable that determines the number of events per tick that we'll process + * via asynchronous event notification. If this rate is very high, then we will + * not submit the event and it will be picked up at the next tick of the + * periodic. + */ +uint_t smrt_event_intervention_threshold = 1000; + +/* + * Converts a LUN Address to a BMIC Identifier. The BMIC Identifier is used + * when performing various physical commands and generally should stay the same + * for a given device across inserts and removals; however, not across + * controller resets. These are calculated based on what the CISS specification + * calls the 'Level 2' target and bus, which don't have a real meaning in the + * SAS world otherwise. + */ +uint16_t +smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr) +{ + uint16_t id; + + id = (paddr->Target[1].PeripDev.Bus - 1) << 8; + id += paddr->Target[1].PeripDev.Dev; + + return (id); +} + +void +smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus, + unsigned target) +{ + lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR : + PERIPHERIAL_DEV_ADDR; + + lun->PhysDev.TargetId = target; + lun->PhysDev.Bus = bus; + + bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target)); +} + +/* + * According to the CISS Specification, the controller is always addressed in + * Mask Perhiperhal mode with a bus and target ID of zero. This is used by + * commands that need to write to the controller itself, which is generally + * discovery and other commands. 
+ */ +void +smrt_write_controller_lun_addr(LUNAddr_t *lun) +{ + smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0); +} + +void +smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs) +{ + switch (type) { + case CISS_MSG_ABORT: + case CISS_MSG_RESET: + case CISS_MSG_NOP: + break; + + default: + panic("unknown message type"); + } + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs); + smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN; + smcm->smcm_va_cmd->Request.CDB[0] = type; +} + +void +smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag) +{ + smrt_tag_t cisstag; + + /* + * When aborting a particular command, the request is addressed + * to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort a single command. + */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK; + + /* + * The CISS Specification says that the tag value for a task-level + * abort should be in the CDB in bytes 4-11. + */ + bzero(&cisstag, sizeof (cisstag)); + cisstag.tag_value = tag; + bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4], + sizeof (cisstag)); +} + +void +smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr) +{ + /* + * When aborting all tasks for a particular Logical Volume, + * the command is addressed not to the controller but to + * the Volume itself. + */ + smcm->smcm_va_cmd->Header.LUN = *addr; + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort all commands for a particular Logical Volume. 
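 *
 * Illustrative sketch (the smlv_addr member name is an assumption): a
 * caller draining a single logical volume addresses the abort to that
 * volume and submits it like any other command, e.g.
 *
 *	smrt_write_message_abort_all(smcm, &smlv->smlv_addr);
 *	if (smrt_submit(smrt, smcm) != 0) {
 *		... back off and retry later ...
 *	}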
+ */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET; +} + +void +smrt_write_message_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = 0; + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_READ; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT; + senr.senr_flags = BE_32(0); + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_cancel_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT); + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_WRITE; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL; + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_reset_ctlr(smrt_command_t *smcm) +{ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_RESET, 0); + + smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR; +} + +void +smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs) +{ + /* + * No-op messages are always sent to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs); +} + +/* + * This routine is executed regularly by ddi_periodic_add(9F). It checks the + * health of the controller and looks for submitted commands that have timed + * out. + */ +void +smrt_periodic(void *arg) +{ + smrt_t *smrt = arg; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Before we even check if the controller is running to process + * everything else, we must first check if we had a request to kick off + * discovery. We do this before the check if the controller is running, + * as this may be required to finish a discovery. + */ + if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 && + (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 && + (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) { + if (ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_stats.smrts_discovery_tq_errors++; + } else { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC; + } + } + + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + /* + * The device is currently not active, e.g. due to an + * in-progress controller reset. + */ + mutex_exit(&smrt->smrt_mutex); + return; + } + + /* + * Check on the health of the controller firmware. Note that if the + * controller has locked up, this routine will panic the system. + */ + smrt_lockup_check(smrt); + + /* + * Reset the event notification threshold counter. 
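 *
 * Illustrative sketch only (the event-completion side is not shown in
 * this excerpt, so the exact check is an assumption based on the
 * description of smrt_event_intervention_threshold above): the reset
 * below pairs with a per-completion rate check of roughly this shape:
 *
 *	if (++smrt->smrt_event_count > smrt_event_intervention_threshold) {
 *		smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
 *	} else {
 *		(void) smrt_submit(smrt, smrt->smrt_event_cmd);
 *	}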
+ */ + smrt->smrt_event_count = 0; + + /* + * Check inflight commands to see if they have timed out. + */ + for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight); + smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) { + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * Polled commands are timed out by the polling + * routine. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * This command has been aborted; either it will + * complete or the controller will be reset. + */ + continue; + } + + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * Already on the abort queue. + */ + continue; + } + + if (smcm->smcm_expiry == 0) { + /* + * This command has no expiry time. + */ + continue; + } + + if (gethrtime() > smcm->smcm_expiry) { + list_insert_tail(&smrt->smrt_abortq, smcm); + smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT; + } + } + + /* + * Process the abort queue. + */ + (void) smrt_process_abortq(smrt); + + /* + * Check if we have an outstanding event intervention request. Note, + * the command in question should always be in a state such that it is + * usable by the system here. The command is always prepared again by + * the normal event notification path, even if a reset has occurred. + * The reset will be processed before we'd ever consider running an + * event again. Note, if we fail to submit this, then we leave this for + * the next occurrence of the periodic. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_stats.smrts_events_intervened++; + + if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + } + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_retrieve(smrt_t *smrt) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_retrieve_simple(smrt); + return (DDI_SUCCESS); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +/* + * Grab a new tag number for this command. We aim to avoid reusing tag numbers + * as much as possible, so as to avoid spurious double completion from the + * controller. + */ +static void +smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Loop until we find a tag that is not in use. The tag space is + * very large (~30 bits) and the maximum number of inflight commands + * is comparatively small (~1024 in current controllers). + */ + for (;;) { + uint32_t new_tag = smrt->smrt_next_tag; + + if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) { + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + } + + if (smrt_lookup_inflight(smrt, new_tag) != NULL) { + /* + * This tag is already used on an inflight command. + * Choose another. + */ + continue; + } + + /* + * Set the tag for the command and also write it into the + * appropriate part of the request block. + */ + smcm->smcm_tag = new_tag; + smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag; + return; + } +} + +/* + * Submit a command to the controller. + */ +int +smrt_submit(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT); + + /* + * Anything that asks us to ignore the running state of the controller + * must be wired up to poll for completion. 
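 *
 * Illustrative sketch (command setup and error handling are elided and
 * assumed): an internal, polled submission follows the pattern below,
 * pairing smrt_submit() with smrt_poll_for():
 *
 *	smrt_command_t *smcm;
 *
 *	if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
 *	    KM_NOSLEEP)) == NULL)
 *		return (ENOMEM);
 *	smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
 *	... fill in the request block ...
 *
 *	mutex_enter(&smrt->smrt_mutex);
 *	if (smrt_submit(smrt, smcm) == 0)
 *		(void) smrt_poll_for(smrt, smcm);
 *	mutex_exit(&smrt->smrt_mutex);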
+ */ + if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + } + + /* + * If the controller is currently being reset, do not allow command + * submission. However, if this is one of the commands needed to finish + * reset, as indicated on the command structure, allow it. + */ + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) && + !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) { + return (EIO); + } + + /* + * Do not allow submission of more concurrent commands than the + * controller supports. + */ + if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) { + return (EAGAIN); + } + + /* + * Synchronise the Command Block DMA resources to ensure that the + * device has a consistent view before we pass it the command. + */ + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure"); + return (EIO); + } + + /* + * Ensure that this command is not re-used without issuing a new + * tag number and performing any appropriate cleanup. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED)); + smcm->smcm_status |= SMRT_CMD_STATUS_USED; + + /* + * Assign a tag that is not currently in use + */ + smrt_set_new_tag(smrt, smcm); + + /* + * Insert this command into the inflight AVL. + */ + avl_index_t where; + if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) { + dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x", + smcm->smcm_tag); + } + avl_insert(&smrt->smrt_inflight, smcm, where); + if (smrt->smrt_stats.smrts_max_inflight < + avl_numnodes(&smrt->smrt_inflight)) { + smrt->smrt_stats.smrts_max_inflight = + avl_numnodes(&smrt->smrt_inflight); + } + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + + smcm->smcm_time_submit = gethrtime(); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_submit_simple(smrt, smcm); + return (0); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static void +smrt_process_finishq_sync(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure"); + } +} + +static void +smrt_process_finishq_one(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE)); + smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE; + + switch (smcm->smcm_type) { + case SMRT_CMDTYPE_INTERNAL: + cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq); + return; + + case SMRT_CMDTYPE_SCSA: + smrt_hba_complete(smcm); + return; + + case SMRT_CMDTYPE_EVENT: + smrt_event_complete(smcm); + return; + + case SMRT_CMDTYPE_ABORTQ: + /* + * Abort messages sent as part of abort queue processing + * do not require any completion activity. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return; + + case SMRT_CMDTYPE_PREINIT: + dev_err(smrt->smrt_dip, CE_PANIC, "preinit command " + "completed after initialisation"); + return; + } + + panic("unknown command type"); +} + +/* + * Process commands in the completion queue. 
+ */ +void +smrt_process_finishq(smrt_t *smrt) +{ + smrt_command_t *smcm; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) { + /* + * Synchronise the Command Block before we read from it or + * free it, to ensure that any writes from the controller are + * visible. + */ + smrt_process_finishq_sync(smcm); + + /* + * Check if this command was in line to be aborted. + */ + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * This command was in line, but the controller + * subsequently completed the command before we + * were able to do so. + */ + list_remove(&smrt->smrt_abortq, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT; + } + + /* + * Check if this command has been abandoned by the original + * submitter. If it has, free it now to avoid a leak. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * This command will be picked up and processed + * by "smrt_poll_for()" once the CV is triggered + * at the end of processing. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + continue; + } + + smrt_process_finishq_one(smcm); + } + + cv_broadcast(&smrt->smrt_cv_finishq); +} + +/* + * Process commands in the abort queue. + */ +void +smrt_process_abortq(smrt_t *smrt) +{ + smrt_command_t *smcm; + smrt_command_t *abort_smcm = NULL; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (list_is_empty(&smrt->smrt_abortq)) { + goto out; + } + +another: + mutex_exit(&smrt->smrt_mutex); + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send abort messages. We will + * try again the next time around. + */ + mutex_enter(&smrt->smrt_mutex); + goto out; + } + mutex_enter(&smrt->smrt_mutex); + + while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) { + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently inflight, so + * no abort is needed. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message has already been sent for + * this command. + */ + continue; + } + + /* + * Send an abort message for the command. + */ + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + if (smrt_submit(smrt, abort_smcm) != 0) { + /* + * The command could not be submitted to the + * controller. Put it back in the abort queue + * and give up for now. + */ + list_insert_head(&smrt->smrt_abortq, smcm); + goto out; + } + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * The abort message was sent. Release it and + * allocate another command. + */ + abort_smcm = NULL; + goto another; + } + +out: + cv_broadcast(&smrt->smrt_cv_finishq); + if (abort_smcm != NULL) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + mutex_enter(&smrt->smrt_mutex); + } +} + +int +smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + + while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) { + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. 
Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (ddi_in_panic()) { + /* + * When the system is panicking, there are no + * interrupts or other threads. Drive the polling loop + * on our own, but with a small delay to avoid + * aggrevating the controller while we're trying to + * dump. + */ + (void) smrt_retrieve(smrt); + smrt_process_finishq(smrt); + drv_usecwait(100); + continue; + } + + /* + * Wait for command completion to return through the regular + * interrupt handling path. + */ + if (smcm->smcm_expiry == 0) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } else { + /* + * Wait only until the expiry time for this command. + */ + (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq, + &smrt->smrt_mutex, smcm->smcm_expiry); + } + } + + /* + * Fire the completion callback for this command. The callback + * is responsible for freeing the command, so it may not be + * referenced again once this call returns. + */ + smrt_process_finishq_one(smcm); + + return (0); +} + +void +smrt_intr_set(smrt_t *smrt, boolean_t enabled) +{ + /* + * Read the Interrupt Mask Register. + */ + uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + if (enabled) { + imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } else { + imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } + smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); +} + +/* + * Signal to the controller that we have updated the Configuration Table by + * writing to the Inbound Doorbell Register. The controller will, after some + * number of seconds, acknowledge this by clearing the bit. + * + * If successful, return DDI_SUCCESS. If the controller takes too long to + * acknowledge, return DDI_FAILURE. + */ +int +smrt_cfgtbl_flush(smrt_t *smrt) +{ + /* + * Read the current value of the Inbound Doorbell Register. + */ + uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + /* + * Signal the Configuration Table change to the controller. + */ + idr |= CISS_IDR_BIT_CFGTBL_CHANGE; + smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr); + + /* + * Wait for the controller to acknowledge the change. + */ + for (unsigned i = 0; i < smrt_ciss_init_time; i++) { + idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) { + return (DDI_SUCCESS); + } + + /* + * Wait for one second before trying again. + */ + delay(drv_usectohz(1000000)); + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller " + "configuration completed"); + return (DDI_FAILURE); +} + +int +smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the "Supported Transport Methods" field in + * the Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportSupport); + + /* + * Check that the desired transport method is supported by the + * controller: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller does not support " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ? 
+ "simple" : "performant"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_cfgtbl_transport_set(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest, + xport); +} + +int +smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the TransportActive field in the + * Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportActive); + + /* + * Check that the desired transport method is now active: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ? + "simple" : "performant"); + return (DDI_FAILURE); + } + + /* + * Ensure that the controller is now ready to accept commands. + */ + if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to " + "accept commands"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +uint32_t +smrt_ctlr_get_maxsgelements(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements)); +} + +uint32_t +smrt_ctlr_get_cmdsoutmax(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax)); +} + +static uint32_t +smrt_ctlr_get_hostdrvsup(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HostDrvrSupport)); +} + +int +smrt_ctlr_init(smrt_t *smrt) +{ + uint8_t signature[4] = { 'C', 'I', 'S', 'S' }; + int e; + + if ((e = smrt_ctlr_wait_for_state(smrt, + SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) { + return (e); + } + + /* + * The configuration table contains an ASCII signature ("CISS") which + * should be checked as we initialise the controller. + * See: "9.1 Configuration Table" in CISS Specification. + */ + for (unsigned i = 0; i < 4; i++) { + if (ddi_get8(smrt->smrt_ct_handle, + &smrt->smrt_ct->Signature[i]) != signature[i]) { + dev_err(smrt->smrt_dip, CE_WARN, "invalid signature " + "detected"); + return (DDI_FAILURE); + } + } + + /* + * Initialise an appropriate Transport Method. For now, this driver + * only supports the "Simple" method. + */ + if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) { + return (e); + } + + /* + * Save some common feature support bitfields. + */ + smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt); + smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->BusTypes); + + /* + * Read initial controller heartbeat value and mark the current + * reading time. + */ + smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + smrt->smrt_last_heartbeat_time = gethrtime(); + + /* + * Determine the firmware version of the controller so that we can + * select which type of interrupts to use. 
+ */ + if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT, + &smrt->smrt_versions)) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "could not identify " + "controller (%d)", e); + return (DDI_FAILURE); + } + + dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s", + smrt->smrt_versions.smrtv_firmware_rev); + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown(smrt_t *smrt) +{ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_ctlr_teardown_simple(smrt); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + return; + } + + panic("unknown controller mode"); +} + +int +smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state) +{ + unsigned wait_usec = 100 * 1000; + unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec; + + VERIFY(state == SMRT_WAIT_STATE_READY || + state == SMRT_WAIT_STATE_UNREADY); + + /* + * Read from the Scratchpad Register until the expected ready signature + * is detected. This behaviour is not described in the CISS + * specification. + * + * If the device is not in the desired state immediately, sleep for a + * second and try again. If the device has not become ready in 300 + * seconds, give up. + */ + for (unsigned i = 0; i < wait_count; i++) { + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + + switch (state) { + case SMRT_WAIT_STATE_READY: + if (spr == CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + + case SMRT_WAIT_STATE_UNREADY: + if (spr != CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + } + + if (ddi_in_panic()) { + /* + * There is no sleep for the panicking, so we + * must spin wait: + */ + drv_usecwait(wait_usec); + } else { + /* + * Wait for a quarter second and try again. + */ + delay(drv_usectohz(wait_usec)); + } + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller " + "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ? + "ready": "unready"); + return (DDI_FAILURE); +} + +void +smrt_lockup_check(smrt_t *smrt) +{ + /* + * Read the current controller heartbeat value. + */ + uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Check to see if the value is the same as last time we looked: + */ + if (heartbeat != smrt->smrt_last_heartbeat) { + /* + * The heartbeat value has changed, which suggests that the + * firmware in the controller has not yet come to a complete + * stop. Record the new value, as well as the current time. + */ + smrt->smrt_last_heartbeat = heartbeat; + smrt->smrt_last_heartbeat_time = gethrtime(); + return; + } + + /* + * The controller _might_ have been able to signal to us that is + * has locked up. This is a truly unfathomable state of affairs: + * If the firmware can tell it has flown off the rails, why not + * simply reset the controller? + */ + uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS); + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + if ((odr & CISS_ODR_BIT_LOCKUP) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "reported a critical fault (odr %08x spr %08x)", + odr, spr); + } + + if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "stopped responding (odr %08x spr %08x)", + odr, spr); + } +} + +/* + * Probe the controller with the IDENTIFY CONTROLLER request. 
This is a BMIC + * command, so it must be submitted to the controller and we must poll for its + * completion. This functionality is only presently used during controller + * initialisation, so it uses the special pre-initialisation path for command + * allocation and submission. + */ +static int +smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout, + smrt_identify_controller_t *resp) +{ + smrt_command_t *smcm; + smrt_identify_controller_req_t smicr; + int r; + size_t sz; + + /* + * Allocate a command with a data buffer; the controller will fill it + * with identification information. There is some suggestion in the + * firmware-level specification that the buffer length should be a + * multiple of 512 bytes for some controllers, so we round up. + */ + sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t); + if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) { + return (ENOMEM); + } + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr); + smcm->smcm_va_cmd->Request.Timeout = timeout; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY CONTROLLER request CDB. Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&smicr, sizeof (smicr)); + smicr.smicr_opcode = CISS_SCMD_BMIC_READ; + smicr.smicr_lun = 0; + smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER; + bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smicr))); + + /* + * Send the command to the device and poll for its completion. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * This command timed out, but the driver is not presently + * initialised to the point where we can try to abort it. + * The command was created with the PREINIT type, so it + * does not appear in the global command tracking list. + * In order to avoid problems with DMA from the controller, + * we have to leak the command allocation. + */ + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to identify + * it. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify " + "controller error: status 0x%x", + ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (resp != NULL) { + /* + * Copy the identify response out for the caller. + */ + bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp)); + } + + r = 0; + +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +/* + * The firmware versions in an IDENTIFY CONTROLLER response generally take + * the form of a four byte ASCII string containing a dotted decimal version + * number; e.g., "8.00". + * + * This function sanitises the firmware version, replacing unexpected + * values with a question mark. + */ +static void +smrt_copy_firmware_version(uint8_t *src, char *dst) +{ + for (unsigned i = 0; i < 4; i++) { + /* + * Make sure that this is a 7-bit clean ASCII value. 
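+ * For example, an input of { '8', '.', '0', '0' } is copied through as
+ * "8.00", while any byte that is not alphanumeric, '.', or ' ' (or that
+ * has the high bit set) is replaced with '?'.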
+ */ + char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?'; + + if (isalnum(c) || c == '.' || c == ' ') { + dst[i] = c; + } else { + dst[i] = '?'; + } + } + dst[4] = '\0'; +} + +/* + * Using an IDENTIFY CONTROLLER request, determine firmware and controller + * version details. See the comments for "smrt_ctlr_identify()" for more + * details about calling context. + */ +static int +smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv) +{ + smrt_identify_controller_t smic; + int r; + + if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) { + return (r); + } + + smrtv->smrtv_hardware_version = smic.smic_hardware_version; + smrt_copy_firmware_version(smic.smic_firmware_rev, + smrtv->smrtv_firmware_rev); + smrt_copy_firmware_version(smic.smic_recovery_rev, + smrtv->smrtv_recovery_rev); + smrt_copy_firmware_version(smic.smic_bootblock_rev, + smrtv->smrtv_bootblock_rev); + + return (0); +} + +int +smrt_ctlr_reset(smrt_t *smrt) +{ + smrt_command_t *smcm, *smcm_nop; + int r; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * Don't pile on. One reset is enough. Wait until + * it's complete, and then return success. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + return (0); + } + smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING; + smrt->smrt_last_reset_start = gethrtime(); + smrt->smrt_stats.smrts_ctlr_resets++; + +skip_check: + /* + * Allocate two commands: one for the soft reset message, which we + * cannot free until the controller has reset; and one for the ping we + * will use to determine when it is once again functional. + */ + mutex_exit(&smrt->smrt_mutex); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + mutex_enter(&smrt->smrt_mutex); + + /* + * Send a soft reset command to the controller. If this command + * succeeds, there will likely be no completion notification. Instead, + * the device should become unavailable for some period of time and + * then become available again. Once available again, we know the soft + * reset has completed and should abort all in-flight commands. + */ + smrt_write_message_reset_ctlr(smcm); + + /* + * Disable interrupts now. + */ + smrt_intr_set(smrt, B_FALSE); + + dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset"); + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "submit failed (%d)", r); + } + + /* + * Mark every currently inflight command as being reset, including the + * soft reset command we just sent. Once we confirm the reset works, + * we can safely report that these commands have failed. + */ + for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); + t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) { + t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT; + } + + /* + * Now that we have submitted our soft reset command, prevent + * the rest of the driver from interacting with the controller. 
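+ * Once SMRT_CTLR_STATUS_RUNNING is cleared, smrt_submit() will fail with
+ * EIO for any command that is not explicitly marked
+ * SMRT_CMD_IGNORE_RUNNING, such as the ping we send later in this
+ * routine.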
+ */ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + /* + * We do not expect a completion from the controller for our soft + * reset command, but we also cannot remove it from the inflight + * list until we know the controller has actually reset. To do + * otherwise would potentially allow the controller to scribble + * on the memory we were using. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not become unready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready"); + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not come become ready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready"); + + /* + * In at least the Smart Array P420i, the controller can take 30-45 + * seconds after the scratchpad register shows it as being available + * before it is ready to receive commands. In order to avoid hitting + * it too early with our post-reset ping, we will sleep for 10 seconds + * here. + */ + if (ddi_in_panic()) { + drv_usecwait(10 * MICROSEC); + } else { + delay(drv_usectohz(10 * MICROSEC)); + } + + smrt_ctlr_teardown(smrt); + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller transport could not be configured"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured"); + + smrt_write_message_nop(smcm_nop, 0); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED | + SMRT_CMD_IGNORE_RUNNING; + if ((r = smrt_submit(smrt, smcm_nop)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping could not be submitted (%d)", r); + } + + /* + * Interrupts are still masked at this stage. Poll manually in + * a way that will not trigger regular finish queue processing: + */ + VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT); + for (unsigned i = 0; i < 600; i++) { + smrt_retrieve_simple(smrt); + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * Remove the ping command from the finish queue and + * process it manually. This processing must mirror + * what would have been done in smrt_process_finishq(). + */ + VERIFY(list_link_active(&smcm_nop->smcm_link_finish)); + list_remove(&smrt->smrt_finishq, smcm_nop); + smrt_process_finishq_sync(smcm_nop); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + smrt_process_finishq_one(smcm_nop); + break; + } + + if (ddi_in_panic()) { + drv_usecwait(100 * 1000); + } else { + delay(drv_usectohz(100 * 1000)); + } + } + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping did not complete"); + } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) { + dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed " + "in error (status %u)", + (unsigned)smcm_nop->smcm_va_err->CommandStatus); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed"); + } + + /* + * Now that the controller is working again, we can abort any + * commands that were inflight during the reset. 
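+ * Rather than sending individual abort messages, each command that was
+ * marked SMRT_CMD_STATUS_RESET_SENT is removed from the inflight AVL and
+ * pushed onto the finish queue, so that its owner observes the failure
+ * through the normal completion path.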
+ */
+ smrt_command_t *nt;
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = nt) {
+ nt = AVL_NEXT(&smrt->smrt_inflight, t);
+
+ if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ avl_remove(&smrt->smrt_inflight, t);
+ t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+
+ list_insert_tail(&smrt->smrt_finishq, t);
+ }
+ }
+
+ /*
+ * Quiesce our discovery thread. Note that because
+ * SMRT_CTLR_STATUS_RESETTING is set, nothing can cause it to be
+ * enabled again.
+ */
+ if (!ddi_in_panic()) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ /*
+ * Re-enable interrupts. Now, we must kick off a discovery to make sure
+ * that the system is in a sane state and that we can perform I/O.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED;
+
+ /*
+ * Attempt a discovery to make sure that the driver sees a realistic
+ * view of the world. If we're not in panic context, wait for the
+ * asynchronous process to complete; otherwise we're in panic context
+ * and this is going to happen regardless of whether we want it to or
+ * not. Before we kick off the request to run discovery, we reset the
+ * discovery request flags, as we know that nothing else can consider
+ * running discovery and we don't want to delay until the next smrt
+ * periodic tick if we can avoid it. In panic context, if this fails,
+ * then we won't make it back.
+ */
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING);
+ smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK);
+ smrt_discover(smrt);
+ if (!ddi_in_panic()) {
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+ smrt->smrt_last_reset_finish = gethrtime();
+
+ /*
+ * Wake anybody that was waiting for the reset to complete.
+ */
+ cv_broadcast(&smrt->smrt_cv_finishq);
+
+ /*
+ * Process the completion queue one last time before we let go
+ * of the mutex.
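+ * With interrupts re-enabled above, this simply drains any completions
+ * that arrived while we were finishing up, rather than leaving them for
+ * the next interrupt or periodic tick.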
+ */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm_nop); + mutex_enter(&smrt->smrt_mutex); + return (0); +} + +int +smrt_event_init(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + + event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP); + if (event == NULL) + return (ENOMEM); + if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + return (ENOMEM); + } + smrt_write_message_event_notify(event); + + cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP); + if (cancel == NULL) { + smrt_command_free(event); + return (ENOMEM); + } + if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + smrt_command_free(cancel); + return (ENOMEM); + } + smrt_write_message_cancel_event_notify(cancel); + + cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL); + + mutex_enter(&smrt->smrt_mutex); + if ((ret = smrt_submit(smrt, event)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(event); + smrt_command_free(cancel); + return (ret); + } + + smrt->smrt_event_cmd = event; + smrt->smrt_event_cancel_cmd = cancel; + mutex_exit(&smrt->smrt_mutex); + + return (0); +} + +void +smrt_event_complete(smrt_command_t *smcm) +{ + smrt_event_notify_t *sen; + boolean_t log, rescan; + + boolean_t intervene = B_FALSE; + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY3P(smcm, ==, smrt->smrt_event_cmd); + VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION); + + smrt->smrt_stats.smrts_events_received++; + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + intervene = B_TRUE; + goto clean; + } + + /* + * The event notification command failed for some reason. Attempt to + * drive on and try again at the next intervention period. Because this + * may represent a programmer error (though it's hard to know), we wait + * until the next intervention period and don't panic. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + intervene = B_TRUE; + + smrt->smrt_stats.smrts_events_errors++; + dev_err(smrt->smrt_dip, CE_WARN, "!event notification request " + "error: status 0x%x", ei->CommandStatus); + goto clean; + } + + sen = smcm->smcm_internal->smcmi_va; + log = rescan = B_FALSE; + switch (sen->sen_class) { + case SMRT_EVENT_CLASS_PROTOCOL: + /* + * Most of the event protocol class events aren't really + * actionable. However, subclass 1 indicates errors. Today, + * the only error is an event overflow. If there's an event + * overflow, then we must assume that we need to rescan. + */ + if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HOTPLUG: + /* + * We want to log all hotplug events. However we only need to + * scan these if the subclass indicates the event is for a disk. + */ + log = B_TRUE; + if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HWERROR: + case SMRT_EVENT_CLASS_ENVIRONMENT: + log = B_TRUE; + break; + case SMRT_EVENT_CLASS_PHYS: + log = B_TRUE; + /* + * This subclass indicates some change for physical drives. As + * such, this should trigger a rescan. 
+ */ + if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_LOGVOL: + rescan = B_TRUE; + log = B_TRUE; + break; + default: + /* + * While there are other classes of events, it's hard to say how + * actionable they are for the moment. If we revamp this such + * that it becomes an ireport based system, then we should just + * always log these. We opt not to at the moment to try and be + * kind to the system log. + */ + break; + } + + /* + * Ideally, this would be an ireport that we could pass onto + * administrators; however, since we don't have any way to generate + * that, we provide a subset of the event information. + */ + if (log) { + const char *rmsg; + if (rescan == B_TRUE) { + rmsg = "rescanning"; + } else { + rmsg = "not rescanning"; + } + if (sen->sen_message[0] != '\0') { + sen->sen_message[sizeof (sen->sen_message) - 1] = '\0'; + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x: %s; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + sen->sen_message, rmsg); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + rmsg); + } + } + + if (rescan) + smrt_discover_request(smrt); + +clean: + mutex_exit(&smrt->smrt_mutex); + smrt_command_reuse(smcm); + bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN); + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure we're not _now_ detaching or resetting. + */ + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 || + intervene == B_TRUE) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + /* + * Check out command count per tick. If it's too high, leave it for + * intervention to solve. Likely there is some serious driver or + * firmware error going on. + */ + smrt->smrt_event_count++; + if (smrt->smrt_event_count > smrt_event_intervention_threshold) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + if (smrt_submit(smrt, smcm) != 0) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + } +} + +void +smrt_event_fini(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + mutex_enter(&smrt->smrt_mutex); + + /* + * If intervention has been requested, there is nothing for us to do. We + * clear the flag so nothing else accidentally sees this and takes + * action. We also don't need to bother sending a cancellation request, + * as there is no outstanding event. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + goto free; + } + + /* + * Submit a cancel request for the event notification queue. Because we + * submit both the cancel event and the regular notification event as an + * ordered command, we know that by the time this completes, that the + * existing one will have completed. + */ + smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + /* + * This is unfortunate. We've failed to submit the command. At + * this point all we can do is reset the device. If the reset + * succeeds, we're done and we can clear all the memory. If it + * fails, then all we can do is just leak the command and scream + * to the system, sorry. 
+ */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to submit cancellation " + "(%d), abandoning smrt_command_t at address %p", + ret, smrt->smrt_event_cmd); + smrt->smrt_event_cmd = NULL; + goto free; + } + } + + smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() + + SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC; + if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out. All we can do is hope a reset will + * work. + */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to poll for async " + "cancellation command abandoning smrt_command_t " + "event command at address %p and cancellation " + "command at %p", smrt->smrt_event_cmd, + smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + + } + + /* + * Well, in the end, it's results that count. + */ + if (smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_RESET_SENT) { + goto free; + } + + if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err; + + /* + * This can return a CISS_CMD_TARGET_STATUS entry when the + * controller doesn't think a command is outstanding. It is + * possible we raced, so don't think too much about that case. + * Anything else leaves us between a rock and a hard place, the + * only way out is a reset. + */ + if (ei->CommandStatus != CISS_CMD_TARGET_STATUS && + smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after receiving an error on the async " + "cancellation command (%d); abandoning " + "smrt_command_t event command at address %p and " + "cancellation command at %p", ei->CommandStatus, + smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + } + +free: + event = smrt->smrt_event_cmd; + smrt->smrt_event_cmd = NULL; + cancel = smrt->smrt_event_cancel_cmd; + smrt->smrt_event_cancel_cmd = NULL; + mutex_exit(&smrt->smrt_mutex); + if (event != NULL) + smrt_command_free(event); + if (cancel != NULL) + smrt_command_free(cancel); + cv_destroy(&smrt->smrt_event_queue); +} + +/* + * We've been asked to do a discovery in panic context. This would have + * occurred because there was a device reset. Because we can't rely on the + * target maps, all we can do at the moment is go over all the active targets + * and note which ones no longer exist. If this target was required to dump, + * then the dump code will encounter a fatal error. If not, then we should + * count ourselves surprisingly lucky. + */ +static void +smrt_discover_panic_check(smrt_t *smrt) +{ + smrt_target_t *smtg; + + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + for (smtg = list_head(&smrt->smrt_targets); smtg != NULL; + smtg = list_next(&smrt->smrt_targets, smtg)) { + uint64_t gen; + + if (smtg->smtg_physical) { + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + /* + * Don't worry about drives that aren't visible. 
+ */ + if (!smpt->smpt_visible) + continue; + gen = smpt->smpt_gen; + } else { + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + gen = smlv->smlv_gen; + } + + if (gen != smrt->smrt_discover_gen) { + dev_err(smrt->smrt_dip, CE_WARN, "target %s " + "disappeared during post-panic discovery", + scsi_device_unit_address(smtg->smtg_scsi_dev)); + smtg->smtg_gone = B_TRUE; + } + } +} + +static void +smrt_discover(void *arg) +{ + int log = 0, phys = 0; + smrt_t *smrt = arg; + uint64_t gen; + boolean_t runphys, runvirt; + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING; + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED; + + smrt->smrt_discover_gen++; + gen = smrt->smrt_discover_gen; + runphys = smrt->smrt_phys_tgtmap != NULL; + runvirt = smrt->smrt_virt_tgtmap != NULL; + mutex_exit(&smrt->smrt_mutex); + if (runphys) + phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + if (runvirt) + log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + mutex_enter(&smrt->smrt_mutex); + + if (phys != 0 || log != 0) { + if (!ddi_in_panic()) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + } else { + panic("smrt_t %p failed to perform discovery after " + "a reset in panic context, unable to continue. " + "logvol: %d, phys: %d", smrt, log, phys); + } + } else { + if (!ddi_in_panic() && + smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED; + cv_broadcast(&smrt->smrt_cv_finishq); + } + + if (ddi_in_panic()) { + smrt_discover_panic_check(smrt); + } + } + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING; + if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED) + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + mutex_exit(&smrt->smrt_mutex); +} + +/* + * Request discovery, which is always run via a taskq. + */ +void +smrt_discover_request(smrt_t *smrt) +{ + boolean_t run; + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + smrt_discover(smrt); + return; + } + + run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0; + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED; + if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + smrt->smrt_stats.smrts_discovery_tq_errors++; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c new file mode 100644 index 0000000000..1b3d7b2602 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c @@ -0,0 +1,282 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +uint_t +smrt_isr_hw_simple(caddr_t arg1, caddr_t arg2) +{ + _NOTE(ARGUNUSED(arg2)) + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + smrt_t *smrt = (smrt_t *)arg1; + uint32_t isr = smrt_get32(smrt, CISS_I2O_INTERRUPT_STATUS); + hrtime_t now = gethrtime(); + + mutex_enter(&smrt->smrt_mutex); + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * We should not be receiving interrupts from the controller + * while the driver is not running. + */ + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + /* + * Check to see if this interrupt came from the device: + */ + if ((isr & CISS_ISR_BIT_SIMPLE_INTR) == 0) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * Check to see if the firmware has come to rest. If it has, + * this routine will panic the system. + */ + smrt_lockup_check(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + smrt->smrt_stats.smrts_claimed_interrupts++; + smrt->smrt_last_interrupt_claimed = now; + + /* + * The interrupt was from our controller, so collect any pending + * command completions. + */ + smrt_retrieve_simple(smrt); + + /* + * Process any commands in the completion queue. + */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_CLAIMED); +} + +/* + * Read tags and process completion of the associated command until the supply + * of tags is exhausted. + */ +void +smrt_retrieve_simple(smrt_t *smrt) +{ + uint32_t opq; + uint32_t none = 0xffffffff; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) != none) { + uint32_t tag = CISS_OPQ_READ_TAG(opq); + smrt_command_t *smcm; + + if ((smcm = smrt_lookup_inflight(smrt, tag)) == NULL) { + dev_err(smrt->smrt_dip, CE_WARN, "spurious tag %x", + tag); + continue; + } + + avl_remove(&smrt->smrt_inflight, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + + /* + * Push this command onto the completion queue. + */ + list_insert_tail(&smrt->smrt_finishq, smcm); + } +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. + */ +void +smrt_submit_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. Immediately begin polling on the completion of that command. + * + * NOTE: This function is for controller initialisation only. It discards + * completions of commands other than the expected command as spurious, and + * will not interact correctly with the rest of the driver once it is running. + */ +int +smrt_preinit_command_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + /* + * The controller must be initialised to use the Simple Transport + * Method, but not be marked RUNNING. The command to process must be a + * PREINIT command with the expected tag number, marked for polling. 
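+ *
+ * The expected calling pattern, as used by smrt_ctlr_identify(), is
+ * roughly the following sketch, where "len" and "timeout" stand in for
+ * caller-provided values:
+ *
+ *     smcm = smrt_command_alloc_preinit(smrt, len, KM_SLEEP);
+ *     ... construct the request in smcm->smcm_va_cmd ...
+ *     smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *     smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ *     if (smrt_preinit_command_simple(smrt, smcm) != 0) {
+ *         ... timed out; the command must be leaked, not freed ...
+ *     }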
+ */ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + VERIFY(!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)); + VERIFY(smcm->smcm_type == SMRT_CMDTYPE_PREINIT); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + VERIFY3U(smcm->smcm_tag, ==, SMRT_PRE_TAG_NUMBER); + + /* + * Submit this command to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); + + /* + * Poll the controller for completions until we see the command we just + * sent, or the timeout expires. + */ + for (;;) { + uint32_t none = 0xffffffff; + uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q); + uint32_t tag; + + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (opq == none) { + delay(drv_usectohz(10 * 1000)); + continue; + } + + if ((tag = CISS_OPQ_READ_TAG(opq)) != SMRT_PRE_TAG_NUMBER) { + dev_err(smrt->smrt_dip, CE_WARN, "unexpected tag 0x%x" + " completed during driver init", tag); + delay(drv_usectohz(10 * 1000)); + continue; + } + + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + + return (0); + } +} + +int +smrt_ctlr_init_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_UNKNOWN); + + if (smrt_cfgtbl_transport_has_support(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_SIMPLE; + + /* + * Disable device interrupts while we are setting up. + */ + smrt_intr_set(smrt, B_FALSE); + + if ((smrt->smrt_maxcmds = smrt_ctlr_get_cmdsoutmax(smrt)) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding " + "commands set to zero"); + return (DDI_FAILURE); + } + + /* + * Determine the number of Scatter/Gather List entries this controller + * supports. The maximum number we allow is CISS_MAXSGENTRIES: the + * number of elements in the static struct we use for command + * submission. + */ + if ((smrt->smrt_sg_cnt = smrt_ctlr_get_maxsgelements(smrt)) == 0) { + /* + * The CISS specification states that if this value is + * zero, we should assume a value of 31 for compatibility + * with older firmware. + */ + smrt->smrt_sg_cnt = CISS_SGCNT_FALLBACK; + + } else if (smrt->smrt_sg_cnt > CISS_MAXSGENTRIES) { + /* + * If the controller supports more than we have allocated, + * just cap the count at the allocation size. + */ + smrt->smrt_sg_cnt = CISS_MAXSGENTRIES; + } + + /* + * Zero the upper 32 bits of the address in the Controller. + */ + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->Upper32Addr, 0); + + /* + * Set the Transport Method and flush the changes to the + * Configuration Table. + */ + smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE); + if (smrt_cfgtbl_flush(smrt) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + if (smrt_cfgtbl_transport_confirm(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + /* + * Check the outstanding command cap a second time now that we have + * flushed out the new Transport Method. This is entirely defensive; + * we do not expect this value to change. 
+ */ + uint32_t check_again = smrt_ctlr_get_cmdsoutmax(smrt); + if (check_again != smrt->smrt_maxcmds) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding commands " + "changed during initialisation (was %u, now %u)", + smrt->smrt_maxcmds, check_again); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + + /* + * Due to the nominal simplicity of the simple mode, we have no + * particular teardown to perform as we do not allocate anything + * on the way up. + */ + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_UNKNOWN; +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c new file mode 100644 index 0000000000..edcbfa65e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c @@ -0,0 +1,362 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + + +static ddi_dma_attr_t smrt_command_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x00000000, + .dma_attr_addr_hi = 0xFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + .dma_attr_seg = 0x0000FFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * These device access attributes are for command block allocation, where we do + * not use any of the structured byte swapping facilities. + */ +static ddi_device_acc_attr_t smrt_command_dev_attr = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + + +static void smrt_contig_free(smrt_dma_t *); + + +static int +smrt_check_command_type(smrt_command_type_t type) +{ + /* + * Note that we leave out the default case in order to utilise + * compiler warnings about missed enum values. + */ + switch (type) { + case SMRT_CMDTYPE_ABORTQ: + case SMRT_CMDTYPE_SCSA: + case SMRT_CMDTYPE_INTERNAL: + case SMRT_CMDTYPE_PREINIT: + case SMRT_CMDTYPE_EVENT: + return (type); + } + + panic("unexpected command type"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static int +smrt_contig_alloc(smrt_t *smrt, smrt_dma_t *smdma, size_t sz, int kmflags, + void **vap, uint32_t *pap) +{ + caddr_t va; + int rv; + dev_info_t *dip = smrt->smrt_dip; + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + /* + * Ensure we don't try to allocate a second time using the same + * tracking object. 
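+ * The smdma_level bit field records how far the allocation has
+ * progressed (handle allocated, memory allocated, handle bound);
+ * smrt_contig_free() unwinds whichever of these levels were reached, so
+ * a partially constructed object can always be torn down safely.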
+ */ + VERIFY0(smdma->smdma_level); + + if ((rv = ddi_dma_alloc_handle(dip, &smrt_command_dma_attr, + dma_wait, NULL, &smdma->smdma_dma_handle)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)", + rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_ALLOC; + + if ((rv = ddi_dma_mem_alloc(smdma->smdma_dma_handle, sz, + &smrt_command_dev_attr, DDI_DMA_CONSISTENT, dma_wait, NULL, + &va, &smdma->smdma_real_size, &smdma->smdma_acc_handle)) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_MEMORY_ALLOC; + + if ((rv = ddi_dma_addr_bind_handle(smdma->smdma_dma_handle, + NULL, va, smdma->smdma_real_size, + DDI_DMA_CONSISTENT | DDI_DMA_RDWR, dma_wait, NULL, + smdma->smdma_dma_cookies, &smdma->smdma_dma_ncookies)) != + DDI_DMA_MAPPED) { + dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_BOUND; + + VERIFY3U(smdma->smdma_dma_ncookies, ==, 1); + *pap = smdma->smdma_dma_cookies[0].dmac_address; + *vap = (void *)va; + return (DDI_SUCCESS); + +fail: + *vap = NULL; + *pap = 0; + smrt_contig_free(smdma); + return (DDI_FAILURE); +} + +static void +smrt_contig_free(smrt_dma_t *smdma) +{ + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND) { + VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle), ==, + DDI_SUCCESS); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_BOUND; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC) { + ddi_dma_mem_free(&smdma->smdma_acc_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_MEMORY_ALLOC; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC) { + ddi_dma_free_handle(&smdma->smdma_dma_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_ALLOC; + } + + VERIFY(smdma->smdma_level == 0); + bzero(smdma, sizeof (*smdma)); +} + +static smrt_command_t * +smrt_command_alloc_impl(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + if ((smcm = kmem_zalloc(sizeof (*smcm), kmflags)) == NULL) { + return (NULL); + } + + smcm->smcm_ctlr = smrt; + smcm->smcm_type = smrt_check_command_type(type); + + /* + * Allocate a single contiguous chunk of memory for the command block + * (smcm_va_cmd) and the error information block (smcm_va_err). The + * physical address of each block should be 32-byte aligned. + */ + size_t contig_size = 0; + contig_size += P2ROUNDUP_TYPED(sizeof (CommandList_t), 32, size_t); + + size_t errorinfo_offset = contig_size; + contig_size += P2ROUNDUP_TYPED(sizeof (ErrorInfo_t), 32, size_t); + + if (smrt_contig_alloc(smrt, &smcm->smcm_contig, contig_size, + kmflags, (void **)&smcm->smcm_va_cmd, &smcm->smcm_pa_cmd) != + DDI_SUCCESS) { + kmem_free(smcm, sizeof (*smcm)); + return (NULL); + } + + smcm->smcm_va_err = (void *)((caddr_t)smcm->smcm_va_cmd + + errorinfo_offset); + smcm->smcm_pa_err = smcm->smcm_pa_cmd + errorinfo_offset; + + /* + * Ensure we asked for, and received, the correct physical alignment: + */ + VERIFY0(smcm->smcm_pa_cmd & 0x1f); + VERIFY0(smcm->smcm_pa_err & 0x1f); + + /* + * Populate Fields. 
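+ *
+ * The contiguous allocation made above is laid out, conceptually, as:
+ *
+ *     +0x00                                     CommandList_t
+ *     +P2ROUNDUP(sizeof (CommandList_t), 32)    ErrorInfo_t
+ *
+ * so the error descriptor in the command block simply points at the
+ * second region.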
+ */ + bzero(smcm->smcm_va_cmd, contig_size); + smcm->smcm_va_cmd->ErrDesc.Addr = smcm->smcm_pa_err; + smcm->smcm_va_cmd->ErrDesc.Len = sizeof (ErrorInfo_t); + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc_preinit(smrt_t *smrt, size_t datasize, int kmflags) +{ + smrt_command_t *smcm; + + if ((smcm = smrt_command_alloc_impl(smrt, SMRT_CMDTYPE_PREINIT, + kmflags)) == NULL) { + return (NULL); + } + + /* + * Note that most driver infrastructure has not been initialised at + * this time. All commands are submitted to the controller serially, + * using a pre-specified tag, and are not attached to the command + * tracking list. + */ + smcm->smcm_tag = SMRT_PRE_TAG_NUMBER; + smcm->smcm_va_cmd->Header.Tag.tag_value = SMRT_PRE_TAG_NUMBER; + + if (smrt_command_attach_internal(smrt, smcm, datasize, kmflags) != 0) { + smrt_command_free(smcm); + return (NULL); + } + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(type != SMRT_CMDTYPE_PREINIT); + + if ((smcm = smrt_command_alloc_impl(smrt, type, kmflags)) == NULL) { + return (NULL); + } + + /* + * Insert into the per-controller command list. + */ + mutex_enter(&smrt->smrt_mutex); + list_insert_tail(&smrt->smrt_commands, smcm); + mutex_exit(&smrt->smrt_mutex); + + return (smcm); +} + +int +smrt_command_attach_internal(smrt_t *smrt, smrt_command_t *smcm, size_t len, + int kmflags) +{ + smrt_command_internal_t *smcmi; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + VERIFY3U(len, <=, UINT32_MAX); + + if ((smcmi = kmem_zalloc(sizeof (*smcmi), kmflags)) == NULL) { + return (ENOMEM); + } + + if (smrt_contig_alloc(smrt, &smcmi->smcmi_contig, len, kmflags, + &smcmi->smcmi_va, &smcmi->smcmi_pa) != DDI_SUCCESS) { + kmem_free(smcmi, sizeof (*smcmi)); + return (ENOMEM); + } + + bzero(smcmi->smcmi_va, smcmi->smcmi_len); + + smcm->smcm_internal = smcmi; + + smcm->smcm_va_cmd->SG[0].Addr = smcmi->smcmi_pa; + smcm->smcm_va_cmd->SG[0].Len = (uint32_t)len; + smcm->smcm_va_cmd->Header.SGList = 1; + smcm->smcm_va_cmd->Header.SGTotal = 1; + + return (0); +} + +void +smrt_command_reuse(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure the command is not currently inflight, then + * reset the command status. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status = SMRT_CMD_STATUS_REUSED; + + /* + * Ensure we are not trying to reuse a command that is in the finish or + * abort queue. + */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + /* + * Clear the previous tag value. + */ + smcm->smcm_tag = 0; + smcm->smcm_va_cmd->Header.Tag.tag_value = 0; + + mutex_exit(&smrt->smrt_mutex); +} + +void +smrt_command_free(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + /* + * Ensure the object we are about to free is not currently in the + * inflight AVL. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + + if (smcm->smcm_internal != NULL) { + smrt_command_internal_t *smcmi = smcm->smcm_internal; + + smrt_contig_free(&smcmi->smcmi_contig); + kmem_free(smcmi, sizeof (*smcmi)); + } + + smrt_contig_free(&smcm->smcm_contig); + + if (smcm->smcm_type != SMRT_CMDTYPE_PREINIT) { + mutex_enter(&smrt->smrt_mutex); + + /* + * Ensure we are not trying to free a command that is in the + * finish or abort queue. 
+ */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + list_remove(&smrt->smrt_commands, smcm); + + mutex_exit(&smrt->smrt_mutex); + } + + kmem_free(smcm, sizeof (*smcm)); +} + +smrt_command_t * +smrt_lookup_inflight(smrt_t *smrt, uint32_t tag) +{ + smrt_command_t srch; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + bzero(&srch, sizeof (srch)); + srch.smcm_tag = tag; + + return (avl_find(&smrt->smrt_inflight, &srch, NULL)); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c new file mode 100644 index 0000000000..9e27448b68 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c @@ -0,0 +1,238 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * We must locate what the CISS specification describes as the "I2O + * registers". The Intelligent I/O (I2O) Architecture Specification describes + * this somewhat more coherently as "the memory region specified by the first + * base address configuration register indicating memory space (offset 10h, + * 14h, and so forth)". + */ +static int +smrt_locate_bar(pci_regspec_t *regs, unsigned nregs, + unsigned *i2o_bar) +{ + /* + * Locate the first memory-mapped BAR: + */ + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + + if (type == PCI_ADDR_MEM32 || type == PCI_ADDR_MEM64) { + *i2o_bar = i; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +static int +smrt_locate_cfgtbl(smrt_t *smrt, pci_regspec_t *regs, unsigned nregs, + unsigned *ct_bar, uint32_t *baseaddr) +{ + uint32_t cfg_offset, mem_offset; + unsigned want_type; + uint32_t want_bar; + + cfg_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_CFG_OFFSET); + mem_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_MEM_OFFSET); + + VERIFY3U(cfg_offset, !=, 0xffffffff); + VERIFY3U(mem_offset, !=, 0xffffffff); + + /* + * Locate the Configuration Table. Three different values read + * from two I2O registers allow us to determine the location: + * - the correct PCI BAR offset is in the low 16 bits of + * CISS_I2O_CFGTBL_CFG_OFFSET + * - bit 16 is 0 for a 32-bit space, and 1 for 64-bit + * - the memory offset from the base of this BAR is + * in CISS_I2O_CFGTBL_MEM_OFFSET + */ + want_bar = (cfg_offset & 0xffff); + want_type = (cfg_offset & (1UL << 16)) ? PCI_ADDR_MEM64 : + PCI_ADDR_MEM32; + + DTRACE_PROBE4(locate_cfgtbl, uint32_t, want_bar, unsigned, + want_type, uint32_t, cfg_offset, uint32_t, mem_offset); + + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + unsigned bar = PCI_REG_REG_G(regs[i].pci_phys_hi); + + if (type != PCI_ADDR_MEM32 && type != PCI_ADDR_MEM64) { + continue; + } + + if (bar == want_bar) { + *ct_bar = i; + *baseaddr = mem_offset; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +/* + * Determine the PCI vendor and device ID which is a proxy for which generation + * of controller we're working with. 
+ */ +static int +smrt_identify_device(smrt_t *smrt) +{ + ddi_acc_handle_t pci_hdl; + + if (pci_config_setup(smrt->smrt_dip, &pci_hdl) != DDI_SUCCESS) + return (DDI_FAILURE); + + smrt->smrt_pci_vendor = pci_config_get16(pci_hdl, PCI_CONF_VENID); + smrt->smrt_pci_device = pci_config_get16(pci_hdl, PCI_CONF_DEVID); + + pci_config_teardown(&pci_hdl); + + return (DDI_SUCCESS); +} + +static int +smrt_map_device(smrt_t *smrt) +{ + pci_regspec_t *regs; + uint_t regslen, nregs; + dev_info_t *dip = smrt->smrt_dip; + int r = DDI_FAILURE; + + /* + * Get the list of PCI registers from the DDI property "regs": + */ + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", (int **)®s, ®slen) != DDI_PROP_SUCCESS) { + dev_err(dip, CE_WARN, "could not load \"reg\" DDI prop"); + return (DDI_FAILURE); + } + nregs = regslen * sizeof (int) / sizeof (pci_regspec_t); + + if (smrt_locate_bar(regs, nregs, &smrt->smrt_i2o_bar) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "did not find any memory BARs"); + goto out; + } + + /* + * Map enough of the I2O memory space to enable us to talk to the + * device. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_i2o_bar, &smrt->smrt_i2o_space, + CISS_I2O_MAP_BASE, CISS_I2O_MAP_LIMIT - CISS_I2O_MAP_BASE, + &smrt_dev_attributes, &smrt->smrt_i2o_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to map I2O registers"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_I2O_MAPPED; + + if (smrt_locate_cfgtbl(smrt, regs, nregs, &smrt->smrt_ct_bar, + &smrt->smrt_ct_baseaddr) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not find config table"); + goto out; + } + + /* + * Map the Configuration Table. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_ct_bar, + (caddr_t *)&smrt->smrt_ct, smrt->smrt_ct_baseaddr, + sizeof (CfgTable_t), &smrt_dev_attributes, + &smrt->smrt_ct_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not map config table"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_CFGTBL_MAPPED; + + r = DDI_SUCCESS; + +out: + ddi_prop_free(regs); + return (r); +} + +int +smrt_device_setup(smrt_t *smrt) +{ + /* + * Ensure that the controller is installed in such a fashion that it + * may become a DMA master. 
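+	 * Note that ddi_slaveonly(9F) returns DDI_SUCCESS when the device is
+	 * installed in a slave-only slot, which is why a "successful" return
+	 * is the failure case below.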
+ */ + if (ddi_slaveonly(smrt->smrt_dip) == DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "device cannot become DMA " + "master"); + return (DDI_FAILURE); + } + + if (smrt_identify_device(smrt) != DDI_SUCCESS) + goto fail; + + if (smrt_map_device(smrt) != DDI_SUCCESS) { + goto fail; + } + + return (DDI_SUCCESS); + +fail: + smrt_device_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_device_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_CFGTBL_MAPPED) { + ddi_regs_map_free(&smrt->smrt_ct_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_CFGTBL_MAPPED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_I2O_MAPPED) { + ddi_regs_map_free(&smrt->smrt_i2o_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_I2O_MAPPED; + } +} + +uint32_t +smrt_get32(smrt_t *smrt, offset_t off) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + return (ddi_get32(smrt->smrt_i2o_handle, addr)); +} + +void +smrt_put32(smrt_t *smrt, offset_t off, uint32_t val) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + ddi_put32(smrt->smrt_i2o_handle, addr, val); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c new file mode 100644 index 0000000000..8f082ffc9c --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c @@ -0,0 +1,1457 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * The controller is not allowed to attach. + */ +static int +smrt_ctrl_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + return (DDI_FAILURE); +} + +/* + * The controller is not allowed to send packets. + */ +static int +smrt_ctrl_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + return (TRAN_BADPKT); +} + +static boolean_t +smrt_logvol_parse(const char *ua, uint_t *targp) +{ + long targ, lun; + const char *comma; + char *eptr; + + comma = strchr(ua, ','); + if (comma == NULL) { + return (B_FALSE); + } + + /* + * We expect the target number for a logical unit number to be zero for + * a logical volume. 
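+	 * Both fields are parsed as hexadecimal, so a unit address such as
+	 * "c,0" names logical volume 12 with the required LUN of zero; any
+	 * other LUN value is rejected.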
+ */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (B_FALSE); + } + + if (ddi_strtol(ua, &eptr, 16, &targ) != 0 || eptr != comma || + targ < 0 || targ >= SMRT_MAX_LOGDRV) { + return (B_FALSE); + } + + *targp = (uint_t)targ; + + return (B_TRUE); +} + +static int +smrt_logvol_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_volume_t *smlv; + smrt_target_t *smtg; + const char *ua; + uint_t targ; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) { + return (DDI_FAILURE); + } + + if (!smrt_logvol_parse(ua, &targ)) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + /* + * Look for a logical volume for the SCSI unit address of this target. + */ + if ((smlv = smrt_logvol_lookup_by_id(smrt, targ)) == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_lun.smtg_vol = smlv; + smtg->smtg_addr = &smlv->smlv_addr; + smtg->smtg_physical = B_FALSE; + list_insert_tail(&smlv->smlv_targets, smtg); + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + smtg->smtg_scsi_dev = sd; + VERIFY(sd->sd_dev == tgt_dip); + + scsi_device_hba_private_set(sd, smtg); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_SUCCESS); +} + +static void +smrt_logvol_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_FALSE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smlv->smlv_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + + mutex_exit(&smrt->smrt_mutex); + + kmem_free(smtg, sizeof (*smtg)); +} + +static int +smrt_phys_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_target_t *smtg; + smrt_physical_t *smpt; + const char *ua, *comma; + char *eptr; + long lun; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) + return (DDI_FAILURE); + + comma = strchr(ua, ','); + if (comma == NULL) { + return (DDI_FAILURE); + } + + /* + * Confirm the LUN is zero. 
We may want to instead check the scsi + * 'lun'/'lun64' property or do so in addition to this logic. + */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + + /* + * Look for a physical target based on the unit address of the target + * (which will encode its WWN and LUN). + */ + smpt = smrt_phys_lookup_by_ua(smrt, ua); + if (smpt == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_scsi_dev = sd; + smtg->smtg_physical = B_TRUE; + smtg->smtg_lun.smtg_phys = smpt; + list_insert_tail(&smpt->smpt_targets, smtg); + smtg->smtg_addr = &smpt->smpt_addr; + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + VERIFY(sd->sd_dev == tgt_dip); + smtg->smtg_scsi_dev = sd; + + scsi_device_hba_private_set(sd, smtg); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +static void +smrt_phys_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_TRUE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smpt->smpt_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); +} + +/* + * This function is called when the SCSI framework has allocated a packet and + * our private per-packet object. + * + * We choose not to have the framework pre-allocate memory for the CDB. + * Instead, we will make available the CDB area in the controller command block + * itself. + * + * Status block memory is allocated by the framework because we passed + * SCSI_HBA_TRAN_SCB to scsi_hba_attach_setup(9F). + */ +static int +smrt_tran_setup_pkt(struct scsi_pkt *pkt, int (*callback)(caddr_t), + caddr_t arg) +{ + _NOTE(ARGUNUSED(arg)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + smrt_command_scsa_t *smcms; + int kmflags = callback == SLEEP_FUNC ? KM_SLEEP : KM_NOSLEEP; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + + /* + * Check that we have enough space in the command object for the + * request from the target driver: + */ + if (pkt->pkt_cdblen > CISS_CDBLEN) { + /* + * The CDB member of the Request Block of a controller + * command is fixed at 16 bytes. 
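+		 * (CISS_CDBLEN.)  Rather than copying CDBs around, the setup
+		 * below points pkt_cdbp directly at Request.CDB, so the
+		 * target driver builds its CDB in the command block itself;
+		 * anything longer than 16 bytes simply cannot be expressed.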
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "oversize CDB: had %u, " + "needed %u", CISS_CDBLEN, pkt->pkt_cdblen); + return (-1); + } + + /* + * Allocate our command block: + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_SCSA, + kmflags)) == NULL) { + return (-1); + } + smcm->smcm_scsa = smcms; + smcms->smcms_command = smcm; + smcms->smcms_pkt = pkt; + + pkt->pkt_cdbp = &smcm->smcm_va_cmd->Request.CDB[0]; + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + + smcm->smcm_target = smtg; + + return (0); +} + +static void +smrt_tran_teardown_pkt(struct scsi_pkt *pkt) +{ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smrt_command_t *smcm = smcms->smcms_command; + + smrt_command_free(smcm); + + pkt->pkt_cdbp = NULL; +} + +static void +smrt_set_arq_data(struct scsi_pkt *pkt, uchar_t key) +{ + struct scsi_arq_status *sts; + + VERIFY3U(pkt->pkt_scblen, >=, sizeof (struct scsi_arq_status)); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sts = (struct scsi_arq_status *)(pkt->pkt_scbp); + bzero(sts, sizeof (*sts)); + + /* + * Mock up a CHECK CONDITION SCSI status for the original command: + */ + sts->sts_status.sts_chk = 1; + + /* + * Pretend that we successfully performed REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA; + sts->sts_rqpkt_statistics = 0; + + /* + * Return the key value we were provided in the fake sense data: + */ + sts->sts_sensedata.es_valid = 1; + sts->sts_sensedata.es_class = CLASS_EXTENDED_SENSE; + sts->sts_sensedata.es_key = key; + + pkt->pkt_state |= STATE_ARQ_DONE; +} + +/* + * When faking up a REPORT LUNS data structure, we simply report one LUN, LUN 0. + * We need 16 bytes for this, 4 for the size, 4 reserved bytes, and the 8 for + * the actual LUN. + */ +static void +smrt_fake_report_lun(smrt_command_t *smcm, struct scsi_pkt *pkt) +{ + size_t sz; + char resp[16]; + struct buf *bp; + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD | + STATE_GOT_STATUS; + + /* + * Check to make sure this is valid. If reserved bits are set or if the + * mode is one other than 0x00, 0x01, 0x02, then it's an illegal + * request. + */ + if (pkt->pkt_cdbp[1] != 0 || pkt->pkt_cdbp[3] != 0 || + pkt->pkt_cdbp[4] != 0 || pkt->pkt_cdbp[5] != 0 || + pkt->pkt_cdbp[10] != 0 || pkt->pkt_cdbp[11] != 0 || + pkt->pkt_cdbp[2] > 0x2) { + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + return; + } + + /* + * Construct the actual REPORT LUNS reply. We need to indicate a single + * LUN of all zeros. This means that the length needs to be 8 bytes, + * the size of the lun. Otherwise, the rest of this structure can be + * zeros. 
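+	 * For reference, the 16-byte response we build is laid out as:
+	 *
+	 *	bytes 0-3	LUN list length in bytes, big-endian (8 here)
+	 *	bytes 4-7	reserved
+	 *	bytes 8-15	the single LUN entry (all zeros for LUN 0)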
+ */ + bzero(resp, sizeof (resp)); + resp[3] = sizeof (scsi_lun_t); + + bp = scsi_pkt2bp(pkt); + sz = MIN(sizeof (resp), bp->b_bcount); + + bp_mapin(bp); + bcopy(resp, bp->b_un.b_addr, sz); + bp_mapout(bp); + pkt->pkt_state |= STATE_XFERRED_DATA; + pkt->pkt_resid = bp->b_bcount - sz; + if (pkt->pkt_scblen >= 1) { + pkt->pkt_scbp[0] = STATUS_GOOD; + } +} + +static int +smrt_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + _NOTE(ARGUNUSED(sa)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_scsa_t *smcms; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + VERIFY(smcms != NULL); + smcm = smcms->smcms_command; + VERIFY(smcm != NULL); + + if (smcm->smcm_status & SMRT_CMD_STATUS_TRAN_START) { + /* + * This is a retry of a command that has already been + * used once. Assign it a new tag number. + */ + smrt_command_reuse(smcm); + } + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_START; + + /* + * The sophisticated firmware in this controller cannot possibly bear + * the following SCSI commands. It appears to return a response with + * the status STATUS_ACA_ACTIVE (0x30), which is not something we + * expect. Instead, fake up a failure response. + */ + switch (pkt->pkt_cdbp[0]) { + case SCMD_FORMAT: + case SCMD_LOG_SENSE_G1: + case SCMD_MODE_SELECT: + case SCMD_PERSISTENT_RESERVE_IN: + if (smtg->smtg_physical) { + break; + } + + smrt->smrt_stats.smrts_ignored_scsi_cmds++; + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_IGNORED; + + /* + * Mark the command as completed to the point where we + * received a SCSI status code: + */ + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_GOT_STATUS; + + /* + * Mock up sense data for an illegal request: + */ + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + case SCMD_REPORT_LUNS: + /* + * The SMRT controller does not accept a REPORT LUNS command for + * logical volumes. As such, we need to fake up a REPORT LUNS + * response that has a single LUN, LUN 0. + */ + if (smtg->smtg_physical) { + break; + } + + smrt_fake_report_lun(smcm, pkt); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + default: + break; + } + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * We must sleep and wait for the completion of this command. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + } + + /* + * Because we provide a tran_setup_pkt(9E) entrypoint, we must now + * set up the Scatter/Gather List in the Command to reflect any + * DMA resources passed to us by the framework. + */ + if (pkt->pkt_numcookies > smrt->smrt_sg_cnt) { + /* + * More DMA cookies than we are prepared to handle. 
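+		 * (smrt_sg_cnt is the number of scatter/gather entries we can
+		 * describe in a single command; each DMA cookie below is
+		 * turned into one SG entry, so anything beyond that limit
+		 * cannot be expressed to the controller.)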
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "too many DMA cookies (got %u;" + " expected %u)", pkt->pkt_numcookies, smrt->smrt_sg_cnt); + return (TRAN_BADPKT); + } + smcm->smcm_va_cmd->Header.SGList = pkt->pkt_numcookies; + smcm->smcm_va_cmd->Header.SGTotal = pkt->pkt_numcookies; + for (unsigned i = 0; i < pkt->pkt_numcookies; i++) { + smcm->smcm_va_cmd->SG[i].Addr = + LE_64(pkt->pkt_cookies[i].dmac_laddress); + smcm->smcm_va_cmd->SG[i].Len = + LE_32(pkt->pkt_cookies[i].dmac_size); + } + + /* + * Copy logical volume address from the target object: + */ + smcm->smcm_va_cmd->Header.LUN = *smcm->smcm_target->smtg_addr; + + /* + * Initialise the command block. + */ + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(pkt->pkt_time); + if (pkt->pkt_numcookies > 0) { + /* + * There are DMA resources; set the transfer direction + * appropriately: + */ + if (pkt->pkt_dma_flags & DDI_DMA_READ) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_READ; + } else if (pkt->pkt_dma_flags & DDI_DMA_WRITE) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_WRITE; + } else { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_NONE; + } + } else { + /* + * No DMA resources means no transfer. + */ + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + } + + /* + * Initialise the SCSI packet as described in tran_start(9E). We will + * progressively update these fields as the command moves through the + * submission and completion states. + */ + pkt->pkt_resid = 0; + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = 0; + + /* + * If this SCSI packet has a timeout, configure an appropriate + * expiry time: + */ + if (pkt->pkt_time != 0) { + smcm->smcm_expiry = gethrtime() + pkt->pkt_time * NANOSEC; + } + + /* + * Submit the command to the controller. + */ + mutex_enter(&smrt->smrt_mutex); + + /* + * If we're dumping, there's a chance that the target we're talking to + * could have ended up disappearing during the process of discovery. If + * this target is part of the dump device, we check here and return that + * we hit a fatal error. + */ + if (ddi_in_panic() && smtg->smtg_gone) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed: target " + "%s is gone, it did not come back after post-panic reset " + "device discovery", scsi_device_unit_address(sd)); + + return (TRAN_FATAL_ERROR); + } + + smrt->smrt_stats.smrts_tran_starts++; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed %d", r); + + /* + * Inform the SCSI framework that we could not submit + * the command. + */ + return (r == EAGAIN ? TRAN_BUSY : TRAN_FATAL_ERROR); + } + + /* + * Update the SCSI packet to reflect submission of the command. + */ + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD; + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * Poll the controller for completion of the command we + * submitted. Once this routine has returned, the completion + * callback will have been fired with either an active response + * (success or error) or a timeout. The command is freed by + * the completion callback, so it may not be referenced again + * after this call returns. 
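+		 *
+		 * This is what satisfies the FLAG_NOINTR contract from
+		 * tran_start(9E): callers of polled I/O (e.g. crash dump)
+		 * cannot rely on interrupts, so the command must be complete
+		 * before we return.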
+ */ + smrt_poll_for(smrt, smcm); + } + + mutex_exit(&smrt->smrt_mutex); + return (TRAN_ACCEPT); +} + +static int +smrt_tran_reset(struct scsi_address *sa, int level) +{ + _NOTE(ARGUNUSED(level)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + + /* + * The framework has requested some kind of SCSI reset. A + * controller-level soft reset can take a very long time -- often on + * the order of 30-60 seconds -- but might well be our only option if + * the controller is non-responsive. + * + * First, check if the controller is responding to pings. + */ +again: + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + return (0); + } + + smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT); + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_resets++; + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * The controller is already resetting. Wait for that + * to finish. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + } + +skip_check: + /* + * Submit our ping to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + SMRT_PING_CHECK_TIMEOUT * NANOSEC; + if (smrt_submit(smrt, smcm) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (0); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The ping command timed out. Abandon it now. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping timed out"); + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + + } else if ((smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command completed in error, or a controller reset + * was sent while we were trying to ping. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping error"); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + + } else { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE); + + /* + * The controller is responsive, and a full soft reset would be + * extremely disruptive to the system. Given our spotty + * support for some SCSI commands (which can upset the target + * drivers) and the historically lax behaviour of the "smrt" + * driver, we grit our teeth and pretend we were able to + * perform a reset. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (1); + } + + /* + * If a reset has been initiated in the last 90 seconds, try + * another ping. + */ + if (gethrtime() < smrt->smrt_last_reset_start + 90 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed, but " + "was recently reset; retrying ping"); + mutex_exit(&smrt->smrt_mutex); + + /* + * Sleep for a second first. 
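+		 * In panic context we cannot block, so drv_usecwait(9F) spins
+		 * for the interval; otherwise delay(9F) lets the thread
+		 * sleep.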
+ */ + if (ddi_in_panic()) { + drv_usecwait(1 * MICROSEC); + } else { + delay(drv_usectohz(1 * MICROSEC)); + } + goto again; + } + + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed; resetting " + "controller"); + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller reset failure"); + mutex_exit(&smrt->smrt_mutex); + return (0); + } + + mutex_exit(&smrt->smrt_mutex); + return (1); +} + +static int +smrt_tran_abort(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm = NULL; + smrt_command_t *abort_smcm; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send an abort message. + */ + return (0); + } + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_aborts++; + if (pkt != NULL) { + /* + * The framework wants us to abort a specific SCSI packet. + */ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smcm = smcms->smcms_command; + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently in flight, so we + * cannot abort it. + */ + goto fail; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message for this command has already been + * sent to the controller. Return failure. + */ + goto fail; + } + + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + } else { + /* + * The framework wants us to abort every in flight command + * for the target with this address. + */ + smrt_write_message_abort_all(abort_smcm, smtg->smtg_addr); + } + + /* + * Submit the abort message to the controller. + */ + abort_smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if (smrt_submit(smrt, abort_smcm) != 0) { + goto fail; + } + + if (pkt != NULL) { + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * Mark the command as aborted so that we do not send + * a second abort message: + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + } + + /* + * Poll for completion of the abort message. Note that this function + * only fails if we set a timeout on the command, which we have not + * done. + */ + VERIFY0(smrt_poll_for(smrt, abort_smcm)); + + if ((abort_smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (abort_smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * Either the controller was reset or the abort command + * failed. + */ + goto fail; + } + + /* + * The command was successfully aborted. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (1); + +fail: + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (0); +} + +static void +smrt_hba_complete_status(smrt_command_t *smcm) +{ + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + bzero(pkt->pkt_scbp, pkt->pkt_scblen); + + if (ei->ScsiStatus != STATUS_CHECK) { + /* + * If the SCSI status is not CHECK CONDITION, we don't want + * to try and read the sense data buffer. + */ + goto simple_status; + } + + if (pkt->pkt_scblen < sizeof (struct scsi_arq_status)) { + /* + * There is not enough room for a request sense structure. 
+ * Fall back to reporting just the SCSI status code. + */ + goto simple_status; + } + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + struct scsi_arq_status *sts = (struct scsi_arq_status *)pkt->pkt_scbp; + + /* + * Copy in the SCSI status from the original command. + */ + bcopy(&ei->ScsiStatus, &sts->sts_status, sizeof (sts->sts_status)); + + /* + * Mock up a successful REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; + sts->sts_rqpkt_statistics = 0; + + /* + * The sense data from the controller should be copied into place + * starting at the "sts_sensedata" member of the auto request + * sense object. + */ + size_t sense_len = pkt->pkt_scblen - offsetof(struct scsi_arq_status, + sts_sensedata); + if (ei->SenseLen < sense_len) { + /* + * Only copy sense data bytes that are within the region + * the controller marked as valid. + */ + sense_len = ei->SenseLen; + } + bcopy(ei->SenseInfo, &sts->sts_sensedata, sense_len); + + pkt->pkt_state |= STATE_ARQ_DONE; + return; + +simple_status: + if (pkt->pkt_scblen < sizeof (struct scsi_status)) { + /* + * There is not even enough room for the SCSI status byte. + */ + return; + } + + bcopy(&ei->ScsiStatus, pkt->pkt_scbp, sizeof (struct scsi_status)); +} + +static void +smrt_hba_complete_log_error(smrt_command_t *smcm, const char *name) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + + dev_err(smrt->smrt_dip, CE_WARN, "!SCSI command failed: %s: " + "SCSI op %x, CISS status %x, SCSI status %x", name, + (unsigned)smcm->smcm_va_cmd->Request.CDB[0], + (unsigned)ei->CommandStatus, (unsigned)ei->ScsiStatus); +} + +/* + * Completion routine for commands submitted to the controller via the SCSI + * framework. + */ +void +smrt_hba_complete(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + pkt->pkt_resid = ei->ResidualCnt; + + /* + * Check if the controller was reset while this packet was in flight. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + if (pkt->pkt_reason != CMD_CMPLT) { + /* + * If another error status has already been written, + * do not overwrite it. + */ + pkt->pkt_reason = CMD_RESET; + } + pkt->pkt_statistics |= STAT_BUS_RESET | STAT_DEV_RESET; + goto finish; + } + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command was completed without error by the controller. + * + * As per the specification, if an error was not signalled + * by the controller through the CISS transport method, + * the error information (including CommandStatus) has not + * been written and should not be checked. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + goto finish; + } + + /* + * Check the completion status to determine what befell this request. + */ + switch (ei->CommandStatus) { + case CISS_CMD_SUCCESS: + /* + * In a certain sense, the specification contradicts itself. + * On the one hand, it suggests that a successful command + * will not result in a controller write to the error + * information block; on the other hand, it makes room + * for a status code (0) which denotes a successful + * execution. + * + * To be on the safe side, we check for that condition here. 
+ */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_UNDERRUN: + /* + * A data underrun occurred. Ideally this will result in + * an appropriate SCSI status and sense data. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_TARGET_STATUS: + /* + * The command completed, but an error occurred. We need + * to provide the sense data to the SCSI framework. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_OVERRUN: + /* + * Data overrun has occurred. + */ + smrt_hba_complete_log_error(smcm, "data overrun"); + pkt->pkt_reason = CMD_DATA_OVR; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_INVALID: + /* + * One or more fields in the command has invalid data. + */ + smrt_hba_complete_log_error(smcm, "invalid command"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_PROTOCOL_ERR: + /* + * An error occurred in communication with the end device. + */ + smrt_hba_complete_log_error(smcm, "protocol error"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_HARDWARE_ERR: + /* + * A hardware error occurred. + */ + smrt_hba_complete_log_error(smcm, "hardware error"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_CONNECTION_LOST: + /* + * The connection with the end device cannot be + * re-established. + */ + smrt_hba_complete_log_error(smcm, "connection lost"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_ABORTED: + case CISS_CMD_UNSOLICITED_ABORT: + if (smcm->smcm_status & SMRT_CMD_STATUS_TIMEOUT) { + /* + * This abort was arranged by the periodic routine + * in response to an elapsed timeout. + */ + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + } else { + pkt->pkt_reason = CMD_ABORTED; + } + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + pkt->pkt_statistics |= STAT_ABORTED; + break; + + case CISS_CMD_TIMEOUT: + smrt_hba_complete_log_error(smcm, "timeout"); + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + break; + + default: + /* + * This is an error that we were not prepared to handle. + * Signal a generic transport-level error to the framework. + */ + smrt_hba_complete_log_error(smcm, "unexpected error"); + pkt->pkt_reason = CMD_TRAN_ERR; + } + + /* + * Attempt to read a SCSI status code and any automatic + * request sense data that may exist: + */ + smrt_hba_complete_status(smcm); + +finish: + mutex_exit(&smrt->smrt_mutex); + scsi_hba_pkt_comp(pkt); + mutex_enter(&smrt->smrt_mutex); +} + +static int +smrt_getcap(struct scsi_address *sa, char *cap, int whom) +{ + _NOTE(ARGUNUSED(whom)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + int index; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + /* + * The CDB field in the CISS request block is fixed at 16 + * bytes. 
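+		 * A target driver querying this capability, e.g. via
+		 * scsi_ifgetcap(&pkt->pkt_address, "cdb-length", 1), should
+		 * therefore see 16.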
+ */ + return (CISS_CDBLEN); + + case SCSI_CAP_DMA_MAX: + if (smrt->smrt_dma_attr.dma_attr_maxxfer > INT_MAX) { + return (INT_MAX); + } + return ((int)smrt->smrt_dma_attr.dma_attr_maxxfer); + + case SCSI_CAP_SECTOR_SIZE: + if (smrt->smrt_dma_attr.dma_attr_granular > INT_MAX) { + return (-1); + } + return ((int)smrt->smrt_dma_attr.dma_attr_granular); + + /* + * If this target corresponds to a physical device, then we always + * indicate that we're on a SAS interconnect. Otherwise, we default to + * saying that we're on a parallel bus. We can't use SAS for + * everything, unfortunately. When you declare yourself to be a SAS + * interconnect, it's expected that you have a full 16-byte WWN as the + * target. If not, devfsadm will not be able to enumerate the device + * and create /dev/[r]dsk entries. + */ + case SCSI_CAP_INTERCONNECT_TYPE: + if (smtg->smtg_physical) { + return (INTERCONNECT_SAS); + } else { + return (INTERCONNECT_PARALLEL); + } + + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + /* + * These capabilities are supported by the driver and the + * controller. See scsi_ifgetcap(9F) for more information. + */ + return (1); + + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_RESET_NOTIFICATION: + /* + * These capabilities are not supported. + */ + return (0); + + default: + /* + * The property in question is not known to this driver. + */ + return (-1); + } +} + +/* ARGSUSED */ +static int +smrt_setcap(struct scsi_address *sa, char *cap, int value, int whom) +{ + int index; + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + if (whom == 0) { + /* + * When whom is 0, this is a request to set a capability for + * all targets. As per the recommendation in tran_setcap(9E), + * we do not support this mode of operation. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_SECTOR_SIZE: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_INTERCONNECT_TYPE: + /* + * We do not support changing any capabilities at this time. + */ + return (0); + + default: + /* + * The capability in question is not known to this driver. 
+ */ + return (-1); + } +} + +int +smrt_ctrl_hba_setup(smrt_t *smrt) +{ + int flags; + dev_info_t *dip = smrt->smrt_dip; + scsi_hba_tran_t *tran; + + if ((tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate SCSA resources"); + return (DDI_FAILURE); + } + + smrt->smrt_hba_tran = tran; + tran->tran_hba_private = smrt; + + tran->tran_tgt_init = smrt_ctrl_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + + tran->tran_start = smrt_ctrl_tran_start; + + tran->tran_getcap = smrt_getcap; + tran->tran_setcap = smrt_setcap; + + tran->tran_setup_pkt = smrt_tran_setup_pkt; + tran->tran_teardown_pkt = smrt_tran_teardown_pkt; + tran->tran_hba_len = sizeof (smrt_command_scsa_t); + tran->tran_interconnect_type = INTERCONNECT_SAS; + + flags = SCSI_HBA_HBA | SCSI_HBA_TRAN_SCB | SCSI_HBA_ADDR_COMPLEX; + if (scsi_hba_attach_setup(dip, &smrt->smrt_dma_attr, tran, flags) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not attach to SCSA framework"); + scsi_hba_tran_free(tran); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_SCSA; + return (DDI_SUCCESS); +} + +void +smrt_ctrl_hba_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_SCSA) { + VERIFY(scsi_hba_detach(smrt->smrt_dip) != DDI_FAILURE); + scsi_hba_tran_free(smrt->smrt_hba_tran); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_SCSA; + } +} + +int +smrt_logvol_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_logvol_tran_tgt_init; + tran->tran_tgt_free = smrt_logvol_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_logvol_tgtmap_activate, + smrt_logvol_tgtmap_deactivate, &smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_logvol_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_virt_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_virt_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. 
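+		 * Discovery runs asynchronously on smrt_discover_taskq, so we
+		 * drop smrt_mutex and wait for the taskq to drain; the loop
+		 * re-checks the flag in case another discovery request was
+		 * queued while we were waiting.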
+ */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_virt_tgtmap; + smrt->smrt_virt_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_phys_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_phys_tran_tgt_init; + tran->tran_tgt_free = smrt_phys_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_phys_tgtmap_activate, + smrt_phys_tgtmap_deactivate, &smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_phys_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_phys_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_phys_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. + */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_phys_tgtmap; + smrt->smrt_phys_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c new file mode 100644 index 0000000000..18d5b8e936 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c @@ -0,0 +1,286 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static char * +smrt_interrupt_type_name(int type) +{ + switch (type) { + case DDI_INTR_TYPE_MSIX: + return ("MSI-X"); + case DDI_INTR_TYPE_MSI: + return ("MSI"); + case DDI_INTR_TYPE_FIXED: + return ("fixed"); + default: + return ("?"); + } +} + +static boolean_t +smrt_try_msix(smrt_t *smrt) +{ + char *fwver = smrt->smrt_versions.smrtv_firmware_rev; + + /* + * Generation 9 controllers end up having a different firmware + * versioning scheme than others. If this is a generation 9 controller, + * which all share the same PCI device ID, then we default to MSI. + */ + if (smrt->smrt_pci_vendor == SMRT_VENDOR_HP && + smrt->smrt_pci_device == SMRT_DEVICE_GEN9) { + return (B_FALSE); + } + + if (fwver[0] == '8' && fwver[1] == '.' && isdigit(fwver[2]) && + isdigit(fwver[3])) { + /* + * Version 8.00 of the Smart Array firmware appears to have + * broken MSI support on at least one controller. 
We could + * blindly try MSI-X everywhere, except that on at least some + * 6.XX firmware versions, MSI-X interrupts do not appear + * to be triggered for Simple Transport Method command + * completions. + * + * For now, assume we should try for MSI-X with all 8.XX + * versions of the firmware. + */ + dev_err(smrt->smrt_dip, CE_NOTE, "!trying MSI-X interrupts " + "to work around 8.XX firmware defect"); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +smrt_interrupts_disable(smrt_t *smrt) +{ + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + return (ddi_intr_block_disable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts)); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + return (ddi_intr_disable(smrt->smrt_interrupts[0])); + } +} + +int +smrt_interrupts_enable(smrt_t *smrt) +{ + int ret; + + VERIFY(!(smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED)); + + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + ret = ddi_intr_block_enable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + ret = ddi_intr_enable(smrt->smrt_interrupts[0]); + } + + if (ret == DDI_SUCCESS) { + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ENABLED; + } + + return (ret); +} + +static void +smrt_interrupts_free(smrt_t *smrt) +{ + for (int i = 0; i < smrt->smrt_ninterrupts; i++) { + (void) ddi_intr_free(smrt->smrt_interrupts[i]); + } + smrt->smrt_ninterrupts = 0; + smrt->smrt_interrupt_type = 0; + smrt->smrt_interrupt_cap = 0; + smrt->smrt_interrupt_pri = 0; +} + +static int +smrt_interrupts_alloc(smrt_t *smrt, int type) +{ + dev_info_t *dip = smrt->smrt_dip; + int nintrs = 0; + int navail = 0; + + if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count %s interrupts", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (nintrs < 1) { + dev_err(dip, CE_WARN, "no %s interrupts supported", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count available %s " + "interrupts", smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (navail < 1) { + dev_err(dip, CE_WARN, "no %s interrupts available", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_alloc(dip, smrt->smrt_interrupts, type, 0, 1, + &smrt->smrt_ninterrupts, DDI_INTR_ALLOC_STRICT) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "%s interrupt allocation failed", + smrt_interrupt_type_name(type)); + smrt_interrupts_free(smrt); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ALLOC; + smrt->smrt_interrupt_type = type; + return (DDI_SUCCESS); +} + +int +smrt_interrupts_setup(smrt_t *smrt) +{ + int types; + unsigned ipri; + uint_t (*hw_isr)(caddr_t, caddr_t); + dev_info_t *dip = smrt->smrt_dip; + + /* + * Select the correct hardware interrupt service routine for the + * Transport Method we have configured: + */ + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + hw_isr = smrt_isr_hw_simple; + break; + default: + panic("unknown controller mode"); + } + + if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get support interrupts"); + goto fail; + } + + /* + * At least one firmware version has been released for the Smart Array + * line with entirely defective MSI support. 
The specification is + * somewhat unclear on the precise nature of MSI-X support with Smart + * Array controllers, particularly with respect to the Simple Transport + * Method, but for those broken firmware versions we need to try + * anyway. + */ + if (smrt_try_msix(smrt) && (types & DDI_INTR_TYPE_MSIX)) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSIX) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If MSI-X is not available, or not expected to work, fall back to + * MSI. + */ + if (types & DDI_INTR_TYPE_MSI) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSI) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If neither MSI-X nor MSI is available, fall back to fixed + * interrupts. Note that the use of fixed interrupts has been + * observed, with some combination of controllers and systems, to + * result in interrupts stopping completely at random times. + */ + if (types & DDI_INTR_TYPE_FIXED) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_FIXED) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * We were unable to allocate any interrupts. + */ + dev_err(dip, CE_WARN, "interrupt allocation failed"); + goto fail; + +add_handler: + /* + * Ensure that we have not been given a high-level interrupt, as our + * interrupt handlers do not support them. + */ + if (ddi_intr_get_pri(smrt->smrt_interrupts[0], &ipri) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not determine interrupt priority"); + goto fail; + } + if (ipri >= ddi_intr_get_hilevel_pri()) { + dev_err(dip, CE_WARN, "high level interrupts not supported"); + goto fail; + } + smrt->smrt_interrupt_pri = ipri; + + if (ddi_intr_get_cap(smrt->smrt_interrupts[0], + &smrt->smrt_interrupt_cap) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get %s interrupt cap", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + + if (ddi_intr_add_handler(smrt->smrt_interrupts[0], hw_isr, + (caddr_t)smrt, NULL) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "adding %s interrupt failed", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ADDED; + + return (DDI_SUCCESS); + +fail: + smrt_interrupts_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_interrupts_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED) { + (void) smrt_interrupts_disable(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ENABLED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ADDED) { + (void) ddi_intr_remove_handler(smrt->smrt_interrupts[0]); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ADDED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ALLOC) { + smrt_interrupts_free(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ALLOC; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c new file mode 100644 index 0000000000..05963ac2e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c @@ -0,0 +1,367 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_logvol_free(smrt_volume_t *smlv) +{ + /* + * By this stage of teardown, all of the SCSI target drivers + * must have been detached from this logical volume. + */ + VERIFY(list_is_empty(&smlv->smlv_targets)); + list_destroy(&smlv->smlv_targets); + + kmem_free(smlv, sizeof (*smlv)); +} + +smrt_volume_t * +smrt_logvol_lookup_by_id(smrt_t *smrt, unsigned long id) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_volume_t *smlv = list_head(&smrt->smrt_volumes); + smlv != NULL; smlv = list_next(&smrt->smrt_volumes, smlv)) { + if (smlv->smlv_addr.LogDev.VolId == id) { + return (smlv); + } + } + + return (NULL); +} + +static int +smrt_read_logvols(smrt_t *smrt, smrt_report_logical_lun_t *smrll, uint64_t gen) +{ + smrt_report_logical_lun_ent_t *ents = smrll->smrll_data.ents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_ent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol, unsigned, i, + smrt_report_logical_lun_ent_t *, &ents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + ents[i].smrle_addr.VolId)) == NULL) { + + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date, regardless of where this came from. + */ + smlv->smlv_addr.LogDev = ents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +static int +smrt_read_logvols_ext(smrt_t *smrt, smrt_report_logical_lun_t *smrll, + uint64_t gen) +{ + smrt_report_logical_lun_extent_t *extents = + smrll->smrll_data.extents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_extent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol_ext, unsigned, i, + smrt_report_logical_lun_extent_t *, &extents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + extents[i].smrle_addr.VolId)) != NULL) { + if ((smlv->smlv_flags & SMRT_VOL_FLAG_WWN) && + bcmp(extents[i].smrle_wwn, smlv->smlv_wwn, + 16) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "logical " + "volume %u WWN changed unexpectedly", i); + } + } else { + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + bcopy(extents[i].smrle_wwn, smlv->smlv_wwn, 16); + smlv->smlv_flags |= SMRT_VOL_FLAG_WWN; + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date. 
The address may have changed on a reset. + */ + smlv->smlv_addr.LogDev = extents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +/* + * Discover the currently visible set of Logical Volumes exposed by the + * controller. + */ +int +smrt_logvol_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_logical_lun_t *smrll; + smrt_report_logical_lun_req_t smrllr = { 0 }; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Logical Volumes. + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (smrt_report_logical_lun_t), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrll = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrllr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Logical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrllr, sizeof (smrllr)); + smrllr.smrllr_opcode = CISS_SCMD_REPORT_LOGICAL_LUNS; + smrllr.smrllr_extflag = 1; + smrllr.smrllr_datasize = htonl(sizeof (smrt_report_logical_lun_t)); + bcopy(&smrllr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrllr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. 
+ */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "logical volume " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_virt_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_VIRT); + r = EIO; + goto out; + } + + if ((smrll->smrll_extflag & 0x1) != 0) { + r = smrt_read_logvols_ext(smrt, smrll, gen); + } else { + r = smrt_read_logvols(smrt, smrll, gen); + } + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_virt_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } + + if (r == 0) { + /* + * Update the time of the last successful Logical Volume + * discovery: + */ + smrt->smrt_last_log_discovery = gethrtime(); + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_logvol_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + mutex_enter(&smrt->smrt_mutex); + VERIFY(smrt_logvol_lookup_by_id(smrt, volume) != NULL); + mutex_exit(&smrt->smrt_mutex); + *privpp = NULL; +} + +boolean_t +smrt_logvol_tgtmap_deactivate(void *arg, char *addr, + scsi_tgtmap_tgt_type_t type, void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_volume_t *smlv; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY(priv == NULL); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + + mutex_enter(&smrt->smrt_mutex); + smlv = smrt_logvol_lookup_by_id(smrt, volume); + VERIFY(smlv != NULL); + + list_remove(&smrt->smrt_volumes, smlv); + smrt_logvol_free(smlv); + mutex_exit(&smrt->smrt_mutex); + + return (B_FALSE); +} + +void +smrt_logvol_teardown(smrt_t *smrt) +{ + smrt_volume_t *smlv; + + while ((smlv = list_remove_head(&smrt->smrt_volumes)) != NULL) { + smrt_logvol_free(smlv); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c new file mode 100644 index 0000000000..8ab3927673 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c @@ -0,0 +1,613 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_physical_free(smrt_physical_t *smpt) +{ + VERIFY(list_is_empty(&smpt->smpt_targets)); + VERIFY(smpt->smpt_info != NULL); + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + list_destroy(&smpt->smpt_targets); + kmem_free(smpt, sizeof (*smpt)); +} + +/* + * Determine if a physical device enumerated should be shown to the world. There + * are three conditions to satisfy for this to be true. + * + * 1. The device (SAS, SATA, SES, etc.) must not have a masked CISS address. A + * masked CISS address indicates a device that we should not be performing I/O + * to. + * 2. The drive (SAS or SATA device) must not be marked as a member of a logical + * volume. + * 3. The drive (SAS or SATA device) must not be marked as a spare. + */ +static boolean_t +smrt_physical_visible(PhysDevAddr_t *addr, smrt_identify_physical_drive_t *info) +{ + if (addr->Mode == SMRT_CISS_MODE_MASKED) { + return (B_FALSE); + } + + if ((info->sipd_more_flags & (SMRT_MORE_FLAGS_LOGVOL | + SMRT_MORE_FLAGS_SPARE)) != 0) { + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Note, the caller is responsible for making sure that the unit-address form of + * the WWN is pased in. Any additional information to target a specific LUN + * will be ignored. + */ +smrt_physical_t * +smrt_phys_lookup_by_ua(smrt_t *smrt, const char *ua) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Sanity check that the caller has provided us enough bytes for a + * properly formed unit-address form of a WWN. + */ + if (strlen(ua) < SCSI_WWN_UA_STRLEN) + return (NULL); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + char wwnstr[SCSI_WWN_BUFLEN]; + + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, wwnstr); + if (strncmp(wwnstr, ua, SCSI_WWN_UA_STRLEN) != 0) + continue; + + /* + * Verify that the UA string is either a comma or null there. + * We accept the comma in case it's being used as part of a + * normal UA with a LUN. + */ + if (ua[SCSI_WWN_UA_STRLEN] != '\0' && + ua[SCSI_WWN_UA_STRLEN] != ',') { + continue; + } + + return (smpt); + } + + return (NULL); +} + +static smrt_physical_t * +smrt_phys_lookup_by_wwn(smrt_t *smrt, uint64_t wwn) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + if (wwn == smpt->smpt_wwn) + return (smpt); + } + + return (NULL); +} + +static int +smrt_phys_identify(smrt_t *smrt, smrt_identify_physical_drive_t *info, + uint16_t bmic, uint16_t timeout) +{ + smrt_command_t *smcm = NULL; + smrt_identify_physical_drive_t *sipd; + smrt_identify_physical_drive_req_t sipdr; + int ret; + size_t sz, copysz; + + sz = sizeof (smrt_identify_physical_drive_t); + sz = P2ROUNDUP_TYPED(sz, 512, size_t); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*sipd), KM_NOSLEEP) != 0) { + ret = ENOMEM; + goto out; + } + + sipd = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (sipdr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY PHYSICAL DEVICE request CDB. 
Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&sipdr, sizeof (sipdr)); + sipdr.sipdr_opcode = CISS_SCMD_BMIC_READ; + sipdr.sipdr_lun = 0; + sipdr.sipdr_bmic_index1 = bmic & 0x00ff; + sipdr.sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE; + sipdr.sipdr_bmic_index2 = (bmic & 0xff00) >> 8; + bcopy(&sipdr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (sipdr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((ret = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + mutex_exit(&smrt->smrt_mutex); + goto out; + } + mutex_exit(&smrt->smrt_mutex); + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * physical volumes. Report failure. + */ + ret = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify physical " + "device error: status 0x%x", ei->CommandStatus); + ret = EIO; + goto out; + } + + copysz = MIN(sizeof (*sipd), sz - ei->ResidualCnt); + } else { + copysz = sizeof (*sipd); + } + + + sz = MIN(sizeof (*sipd), copysz); + bcopy(sipd, info, sizeof (*sipd)); + + ret = 0; +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + + return (ret); +} + +static int +smrt_read_phys_ext(smrt_t *smrt, smrt_report_physical_lun_t *smrpl, + uint16_t timeout, uint64_t gen) +{ + smrt_report_physical_lun_extent_t *extents = smrpl->smrpl_data.extents; + uint32_t count = BE_32(smrpl->smrpl_datasize) / + sizeof (smrt_report_physical_lun_extent_t); + uint32_t i; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_PHYSDEV) { + count = SMRT_MAX_PHYSDEV; + } + + for (i = 0; i < count; i++) { + int ret; + smrt_physical_t *smpt; + smrt_identify_physical_drive_t *info; + smrt_report_physical_opdi_t *opdi; + uint16_t bmic; + uint64_t wwn, satawwn; + char name[SCSI_MAXNAMELEN]; + + opdi = &extents[i].srple_extdata.srple_opdi; + + mutex_exit(&smrt->smrt_mutex); + + /* + * Get the extended information about this device. + */ + info = kmem_zalloc(sizeof (*info), KM_NOSLEEP); + if (info == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + + bmic = smrt_lun_addr_to_bmic(&extents[i].srple_addr); + ret = smrt_phys_identify(smrt, info, bmic, timeout); + if (ret != 0) { + mutex_enter(&smrt->smrt_mutex); + kmem_free(info, sizeof (*info)); + return (ret); + } + + wwn = *(uint64_t *)opdi->srpo_wwid; + wwn = BE_64(wwn); + + /* + * SATA devices may not have a proper WWN returned from firmware + * based on the SATL specification. Try to fetch the proper id + * for SATA devices, if the drive has one. If the drive doesn't + * have one or the SATL refuses to give us one, we use whatever + * the controller told us. 
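+		 * The fallback value is the big-endian srpo_wwid decoded
+		 * from the OPDI extent above; smrt_sata_determine_wwn()
+		 * only overrides it when the device returns a usable NAA
+		 * identifier on INQUIRY page 0x83.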
+ */ + if (opdi->srpo_dtype == SMRT_DTYPE_SATA && + smrt_sata_determine_wwn(smrt, &extents[i].srple_addr, + &satawwn, timeout) == 0) { + wwn = satawwn; + } + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_wwn(smrt, wwn); + if (smpt != NULL) { + /* + * Sanity check that the model and serial number of this + * device is the same for this WWN. If it's not, the + * controller is probably lying about something. + */ + if (bcmp(smpt->smpt_info->sipd_model, info->sipd_model, + sizeof (info->sipd_model)) != 0 || + bcmp(smpt->smpt_info->sipd_serial, + info->sipd_serial, sizeof (info->sipd_serial)) != + 0 || smpt->smpt_dtype != opdi->srpo_dtype) { + dev_err(smrt->smrt_dip, CE_PANIC, "physical " + "target with wwn 0x%" PRIx64 " changed " + "model, serial, or type unexpectedly: " + "smrt_physical_t %p, phys info: %p", wwn, + smpt, info); + } + + /* + * When panicking, we don't allow a device's visibility + * to change to being invisible and be able to actually + * panic. We only worry about devices which are used + * for I/O. We purposefully ignore SES devices. + */ + if (ddi_in_panic() && + (opdi->srpo_dtype == SMRT_DTYPE_SATA || + opdi->srpo_dtype == SMRT_DTYPE_SAS)) { + boolean_t visible; + + visible = smrt_physical_visible( + &smpt->smpt_addr.PhysDev, smpt->smpt_info); + + if (visible != smpt->smpt_visible) { + dev_err(smrt->smrt_dip, CE_PANIC, + "physical target with wwn 0x%" + PRIx64 " changed visibility status " + "unexpectedly", wwn); + } + } + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + smpt->smpt_info = NULL; + } else { + smpt = kmem_zalloc(sizeof (smrt_physical_t), + KM_NOSLEEP); + if (smpt == NULL) { + kmem_free(info, sizeof (*info)); + return (ENOMEM); + } + + smpt->smpt_wwn = wwn; + smpt->smpt_dtype = opdi->srpo_dtype; + list_create(&smpt->smpt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + smpt->smpt_ctlr = smrt; + list_insert_tail(&smrt->smrt_physicals, smpt); + } + + VERIFY3P(smpt->smpt_info, ==, NULL); + + /* + * Determine if this device is supported and if it's visible to + * the system. Some devices may not be visible to the system + * because they're used in logical volumes or spares. + * Unsupported devices are also not visible. + */ + switch (smpt->smpt_dtype) { + case SMRT_DTYPE_SATA: + case SMRT_DTYPE_SAS: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + case SMRT_DTYPE_SES: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + default: + smpt->smpt_visible = B_FALSE; + smpt->smpt_supported = B_FALSE; + } + + smpt->smpt_info = info; + smpt->smpt_addr.PhysDev = extents[i].srple_addr; + smpt->smpt_bmic = bmic; + smpt->smpt_gen = gen; + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, name); + if (!ddi_in_panic() && smpt->smpt_visible && + scsi_hba_tgtmap_set_add(smrt->smrt_phys_tgtmap, + SCSI_TGT_SCSI_DEVICE, name, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +int +smrt_phys_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_physical_lun_t *smrpl; + smrt_report_physical_lun_req_t smrplr; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Physical Volumes. 
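+	 *
+	 * As with logical volume discovery, this is an internal, polled
+	 * command. The overall sequence, in rough sketch form (error
+	 * handling and locking detail omitted; sizes are illustrative), is:
+	 *
+	 *	smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP);
+	 *	smrt_command_attach_internal(smrt, smcm, size, KM_NOSLEEP);
+	 *	(build the CDB, set SMRT_CMD_STATUS_POLLED)
+	 *	smrt_submit(smrt, smcm);
+	 *	smrt_poll_for(smrt, smcm);
+	 *	smrt_command_free(smcm);	(unless abandoned on timeout)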
+ */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*smrpl), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrpl = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrplr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Physical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrplr, sizeof (smrplr)); + smrplr.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS; + smrplr.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI; + smrplr.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t)); + bcopy(&smrplr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrplr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + /* + * If the controller doesn't support extended physical reporting, it + * likely doesn't even support physical devices that we'd care about + * exposing. As such, we treat this as an OK case. 
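+	 * Returning success here also skips the target map update and the
+	 * pruning pass below, so any previously discovered physical
+	 * targets are left untouched.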
+ */ + if ((smrpl->smrpl_extflag & SMRT_REPORT_PHYSICAL_LUN_EXT_MASK) != + SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI) { + r = 0; + goto out; + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_phys_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_PHYS); + r = EIO; + goto out; + } + + r = smrt_read_phys_ext(smrt, smrpl, timeout, gen); + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_phys_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } + + if (r == 0) { + smrt_physical_t *smpt, *next; + + /* + * Prune physical devices that do not match the current + * generation and are not marked as visible devices. Visible + * devices will be dealt with as part of the target map work. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + next = list_next(&smrt->smrt_physicals, smpt); + if (smpt->smpt_visible || smpt->smpt_gen == gen) + continue; + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + } + + /* + * Update the time of the last successful Physical Volume + * discovery: + */ + smrt->smrt_last_phys_discovery = gethrtime(); + + /* + * Now, for each unsupported device that we haven't warned about + * encountering, try and give the administrator some hope of + * knowing about this. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + if (smpt->smpt_supported || smpt->smpt_unsup_warn) + continue; + smpt->smpt_unsup_warn = B_TRUE; + dev_err(smrt->smrt_dip, CE_WARN, "encountered " + "unsupported device with device type %d", + smpt->smpt_dtype); + } + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_phys_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + VERIFY(smpt != NULL); + VERIFY(smpt->smpt_supported); + VERIFY(smpt->smpt_visible); + *privpp = NULL; + mutex_exit(&smrt->smrt_mutex); +} + +boolean_t +smrt_phys_tgtmap_deactivate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + VERIFY3P(priv, ==, NULL); + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + + /* + * If the device disappeared or became invisible, then it may have + * already been removed. 
+ */ + if (smpt == NULL || !smpt->smpt_visible) { + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); + } + + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); +} + +void +smrt_phys_teardown(smrt_t *smrt) +{ + smrt_physical_t *smpt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + while ((smpt = list_remove_head(&smrt->smrt_physicals)) != NULL) { + smrt_physical_free(smpt); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c new file mode 100644 index 0000000000..6224b97732 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c @@ -0,0 +1,160 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Collection of routines specific to SATA devices and attempting to make them + * work. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * This is a buffer size that should easily cover all of the data that we need + * to properly determine the buffer allocation. + */ +#define SMRT_SATA_INQ83_LEN 256 + +/* + * We need to try and determine if a SATA WWN exists on the device. SAT-2 + * defines that the response to the inquiry page 0x83. + */ +int +smrt_sata_determine_wwn(smrt_t *smrt, PhysDevAddr_t *addr, uint64_t *wwnp, + uint16_t timeout) +{ + smrt_command_t *smcm; + int r; + uint8_t *inq; + uint64_t wwn; + size_t resid; + + VERIFY3P(wwnp, !=, NULL); + + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + SMRT_SATA_INQ83_LEN, KM_NOSLEEP) != 0) { + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (ENOMEM); + } + + smcm->smcm_va_cmd->Header.LUN.PhysDev = *addr; + smcm->smcm_va_cmd->Request.CDBLen = CDB_GROUP0; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + + smcm->smcm_va_cmd->Request.CDB[0] = SCMD_INQUIRY; + smcm->smcm_va_cmd->Request.CDB[1] = 1; + smcm->smcm_va_cmd->Request.CDB[2] = 0x83; + smcm->smcm_va_cmd->Request.CDB[3] = (SMRT_SATA_INQ83_LEN & 0xff00) >> 8; + smcm->smcm_va_cmd->Request.CDB[4] = SMRT_SATA_INQ83_LEN & 0x00ff; + smcm->smcm_va_cmd->Request.CDB[5] = 0; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (r); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. 
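+		 * As in the discovery routines, the command is deliberately
+		 * not freed on this path; it remains outstanding until that
+		 * cleanup occurs.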
+ */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + mutex_exit(&smrt->smrt_mutex); + return (r); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "SATA WWN error: status 0x%x", ei->CommandStatus); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + resid = ei->ResidualCnt; + } else { + resid = 0; + } + + mutex_exit(&smrt->smrt_mutex); + + /* + * We must have at least 12 bytes. The first four bytes are the header, + * the next four are for the LUN header, and the last 8 are for the + * actual WWN, which according to SAT-2 will always be first. + */ + if (SMRT_SATA_INQ83_LEN - resid < 16) { + smrt_command_free(smcm); + return (EINVAL); + } + inq = smcm->smcm_internal->smcmi_va; + + /* + * Sanity check we have the right page. + */ + if (inq[1] != 0x83) { + smrt_command_free(smcm); + return (EINVAL); + } + + /* + * Check to see if we have a proper Network Address Authority (NAA) + * based world wide number for this LUN. It is possible that firmware + * interposes on this and constructs a fake world wide number (WWN). If + * this is the case, we don't want to actually use it. We need to + * verify that the WWN declares the correct naming authority and is of + * the proper length. + */ + if ((inq[5] & 0x30) != 0 || (inq[5] & 0x0f) != 3 || inq[7] != 8) { + smrt_command_free(smcm); + return (ENOTSUP); + } + + bcopy(&inq[8], &wwn, sizeof (uint64_t)); + *wwnp = BE_64(wwn); + + smrt_command_free(smcm); + + return (0); +} diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index 769592a3e2..95f0b3b82a 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -26,6 +26,7 @@ * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2017 Nexenta Systems, Inc. */ /* @@ -3497,9 +3498,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4444,18 +4449,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. 
This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. + */ + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. + */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -6722,7 +6786,7 @@ sdpower(dev_info_t *devi, int component, int level) time_t intvlp; struct pm_trans_data sd_pm_tran_data; uchar_t save_state = SD_STATE_NORMAL; - int sval; + int sval, tursval = 0; uchar_t state_before_pm; int got_semaphore_here; sd_ssc_t *ssc; @@ -7039,13 +7103,26 @@ sdpower(dev_info_t *devi, int component, int level) * a deadlock on un_pm_busy_cv will occur. */ if (SD_PM_IS_IO_CAPABLE(un, level)) { - sval = sd_send_scsi_TEST_UNIT_READY(ssc, + tursval = sd_send_scsi_TEST_UNIT_READY(ssc, SD_DONT_RETRY_TUR | SD_BYPASS_PM); - if (sval != 0) + if (tursval != 0) sd_ssc_assessment(ssc, SD_FMT_IGNORE); } - if (un->un_f_power_condition_supported) { + /* + * We've encountered certain classes of drives that pass a TUR, but fail + * the START STOP UNIT when using power conditions, or worse leave the + * drive in an unusable state despite passing SSU. Strictly speaking, + * for SPC-4 or greater, no additional actions are required to make the + * drive operational when a TUR passes. If we have something that + * matches this condition, we continue on and presume the drive is + * successfully powered on. 
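+	 * Note that tursval is only meaningful when SD_PM_IS_IO_CAPABLE()
+	 * allowed the TEST UNIT READY above to be issued at all, which is
+	 * why that check is repeated in the condition below.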
+ */ + if (un->un_f_power_condition_supported && + SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) && + level == SD_SPINDLE_ACTIVE && tursval == 0) { + sval = 0; + } else if (un->un_f_power_condition_supported) { char *pm_condition_name[] = {"STOPPED", "STANDBY", "IDLE", "ACTIVE"}; SD_TRACE(SD_LOG_IO_PM, un, @@ -7065,6 +7142,7 @@ sdpower(dev_info_t *devi, int component, int level) sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK); else sd_ssc_assessment(ssc, SD_FMT_IGNORE); + } /* Command failed, check for media present. */ @@ -14213,15 +14291,21 @@ sd_initpkt_for_uscsi(struct buf *bp, struct scsi_pkt **pktpp) /* * Allocate the scsi_pkt for the command. + * * Note: If PKT_DMA_PARTIAL flag is set, scsi_vhci binds a path * during scsi_init_pkt time and will continue to use the * same path as long as the same scsi_pkt is used without - * intervening scsi_dma_free(). Since uscsi command does + * intervening scsi_dmafree(). Since uscsi command does * not call scsi_dmafree() before retry failed command, it * is necessary to make sure PKT_DMA_PARTIAL flag is NOT * set such that scsi_vhci can use other available path for * retry. Besides, ucsci command does not allow DMA breakup, * so there is no need to set PKT_DMA_PARTIAL flag. + * + * More fundamentally, we can't support breaking up this DMA into + * multiple windows on x86. There is, in general, no guarantee + * that arbitrary SCSI commands are idempotent, which is required + * if we want to use multiple windows for a given command. */ if (uscmd->uscsi_rqlen > SENSE_LENGTH) { pktp = scsi_init_pkt(SD_ADDRESS(un), NULL, @@ -22680,6 +22764,7 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p) case MHIOCGRP_REGISTERANDIGNOREKEY: case CDROMCLOSETRAY: case USCSICMD: + case USCSIMAXXFER: goto skip_ready_valid; default: break; @@ -23116,6 +23201,23 @@ skip_ready_valid: } break; + case USCSIMAXXFER: + SD_TRACE(SD_LOG_IOCTL, un, "USCSIMAXXFER\n"); + cr = ddi_get_cred(); + if ((drv_priv(cred_p) != 0) && (drv_priv(cr) != 0)) { + err = EPERM; + } else { + const uscsi_xfer_t xfer = un->un_max_xfer_size; + + if (ddi_copyout(&xfer, (void *)arg, sizeof (xfer), + flag) != 0) { + err = EFAULT; + } else { + err = 0; + } + } + break; + case CDROMPAUSE: case CDROMRESUME: SD_TRACE(SD_LOG_IOCTL, un, "PAUSE-RESUME\n"); @@ -31297,7 +31399,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) { un->un_f_log_sense_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_INQUIRY(un)->inq_ansi == 6) { + SD_SCSI_VERS_IS_GE_SPC_4(un)) { un->un_f_power_condition_supported = TRUE; } } else { @@ -31315,7 +31417,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) /* SD_PM_CAPABLE_IS_TRUE case */ un->un_f_pm_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_PM_CAPABLE_IS_SPC_4(pm_cap)) { + (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) { un->un_f_power_condition_supported = TRUE; } diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c index 6416d6d45a..2a1cf25917 100644 --- a/usr/src/uts/common/io/signalfd.c +++ b/usr/src/uts/common/io/signalfd.c @@ -107,6 +107,7 @@ #include <sys/schedctl.h> #include <sys/id_space.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/disp.h> #include <sys/taskq_impl.h> @@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) lwp->lwp_extsig = 0; mutex_exit(&p->p_lock); + if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate) + BROP(p)->b_sigfd_translate(infop); + /* Convert 
k_siginfo into external, datamodel independent, struct. */ bzero(ssp, sizeof (*ssp)); ssp->ssi_signo = infop->si_signo; diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c index 727fbbad8e..9bfe2fe7cf 100644 --- a/usr/src/uts/common/io/simnet/simnet.c +++ b/usr/src/uts/common/io/simnet/simnet.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ /* @@ -795,12 +797,6 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) continue; } - /* Fix mblk checksum as the pkt dest is local */ - if ((mp = mac_fix_cksum(mp)) == NULL) { - sdev->sd_stats.xmit_errors++; - continue; - } - /* Hold reference for taskq receive processing per-pkt */ if (!simnet_thread_ref(sdev_rx)) { freemsg(mp); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index e9af19ca18..994ca8baa8 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1451,6 +1451,16 @@ copyb(mblk_t *bp) ndp = nbp->b_datap; /* + * Copy the various checksum information that came in + * originally. + */ + ndp->db_cksumstart = dp->db_cksumstart; + ndp->db_cksumend = dp->db_cksumend; + ndp->db_cksumstuff = dp->db_cksumstuff; + bcopy(dp->db_struioun.data, ndp->db_struioun.data, + sizeof (dp->db_struioun.data)); + + /* * Well, here is a potential issue. If we are trying to * trace a flow, and we copy the message, we might lose * information about where this message might have been. diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c index 3e2acab1ec..81a0cac18c 100644 --- a/usr/src/uts/common/io/tl.c +++ b/usr/src/uts/common/io/tl.c @@ -1419,8 +1419,9 @@ tl_closeok(tl_endpt_t *tep) static int tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) { - tl_endpt_t *tep; - minor_t minor = getminor(*devp); + tl_endpt_t *tep; + minor_t minor = getminor(*devp); + id_t inst_minor; /* * Driver is called directly. Both CLONEOPEN and MODOPEN @@ -1440,6 +1441,14 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) minor |= TL_SOCKET; } + /* + * Attempt to allocate a unique minor number for this instance. + * Avoid an uninterruptable sleep if none are available. + */ + if ((inst_minor = id_alloc_nosleep(tl_minors)) == -1) { + return (ENOMEM); + } + tep = kmem_cache_alloc(tl_cache, KM_SLEEP); tep->te_refcnt = 1; tep->te_cpid = curproc->p_pid; @@ -1451,9 +1460,7 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) tep->te_flag = minor & TL_MINOR_MASK; tep->te_transport = &tl_transports[minor]; - - /* Allocate a unique minor number for this instance. */ - tep->te_minor = (minor_t)id_alloc(tl_minors); + tep->te_minor = (minor_t)inst_minor; /* Reserve hash handle for bind(). */ (void) mod_hash_reserve(tep->te_addrhash, &tep->te_hash_hndl); diff --git a/usr/src/uts/common/io/usb/clients/hid/hid.c b/usr/src/uts/common/io/usb/clients/hid/hid.c index 084fa7fedc..eccd48bf08 100644 --- a/usr/src/uts/common/io/usb/clients/hid/hid.c +++ b/usr/src/uts/common/io/usb/clients/hid/hid.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. 
*/ @@ -139,6 +139,12 @@ static int hid_info(dev_info_t *, ddi_info_cmd_t, void *, void **); static int hid_attach(dev_info_t *, ddi_attach_cmd_t); static int hid_detach(dev_info_t *, ddi_detach_cmd_t); static int hid_power(dev_info_t *, int, int); +/* These are to enable ugen support: */ +static int hid_chropen(dev_t *, int, int, cred_t *); +static int hid_chrclose(dev_t, int, int, cred_t *); +static int hid_read(dev_t, struct uio *, cred_t *); +static int hid_write(dev_t, struct uio *, cred_t *); +static int hid_poll(dev_t, short, int, short *, struct pollhead **); /* * Warlock is not aware of the automatic locking mechanisms for @@ -198,18 +204,18 @@ struct streamtab hid_streamtab = { }; struct cb_ops hid_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ + hid_chropen, /* open */ + hid_chrclose, /* close */ nulldev, /* strategy */ nulldev, /* print */ nulldev, /* dump */ - nulldev, /* read */ - nulldev, /* write */ + hid_read, /* read */ + hid_write, /* write */ nulldev, /* ioctl */ nulldev, /* devmap */ nulldev, /* mmap */ nulldev, /* segmap */ - nochpoll, /* poll */ + hid_poll, /* poll */ ddi_prop_op, /* cb_prop_op */ &hid_streamtab, /* streamtab */ D_MP | D_MTPERQ @@ -349,6 +355,7 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_alt_if_data_t *altif_data; char minor_name[HID_MINOR_NAME_LEN]; usb_ep_data_t *ep_data; + usb_ugen_info_t usb_ugen_info; switch (cmd) { case DDI_ATTACH: @@ -491,6 +498,28 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_free_dev_data(dip, dev_data); hidp->hid_dev_data = NULL; + if (usb_owns_device(dip)) { + /* Get a ugen handle. */ + bzero(&usb_ugen_info, sizeof (usb_ugen_info)); + + usb_ugen_info.usb_ugen_flags = 0; + usb_ugen_info.usb_ugen_minor_node_ugen_bits_mask = + (dev_t)HID_MINOR_UGEN_BITS_MASK; + usb_ugen_info.usb_ugen_minor_node_instance_mask = + (dev_t)HID_MINOR_INSTANCE_MASK; + hidp->hid_ugen_hdl = usb_ugen_get_hdl(dip, &usb_ugen_info); + + if (usb_ugen_attach(hidp->hid_ugen_hdl, cmd) != + USB_SUCCESS) { + USB_DPRINTF_L2(PRINT_MASK_ATTA, + hidp->hid_log_handle, + "usb_ugen_attach failed"); + + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + hidp->hid_ugen_hdl = NULL; + } + } + /* * Don't get the report descriptor if parsing hid descriptor earlier * failed since device probably won't return valid report descriptor @@ -769,6 +798,149 @@ hid_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (rval); } +static int +hid_chropen(dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int rval; + minor_t minor = getminor(*devp); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + hid_pm_busy_component(hidp); + (void) pm_raise_power(hidp->hid_dip, 0, USB_DEV_OS_FULL_PWR); + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_open(hidp->hid_ugen_hdl, devp, flag, + sflag, credp); + + mutex_exit(&hidp->hid_mutex); + + if (rval != 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_chrclose(dev_t dev, int flag, int otyp, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_close(hidp->hid_ugen_hdl, dev, flag, + otyp, credp); + + 
mutex_exit(&hidp->hid_mutex); + + if (rval == 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_read(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_write(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_poll(hidp->hid_ugen_hdl, dev, events, anyyet, + reventsp, phpp); + + return (rval); +} + /* * hid_open : * Open entry point: Opens the interrupt pipe. Sets up queues. @@ -787,13 +959,21 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) hidp = ddi_get_soft_state(hid_statep, instance); if (hidp == NULL) { - return (ENXIO); } USB_DPRINTF_L4(PRINT_MASK_OPEN, hidp->hid_log_handle, "hid_open: Begin"); + /* + * If this is a ugen device, return ENOSTR (no streams). This will + * cause spec_open to try hid_chropen from our regular ops_cb instead + * (and thus treat us as a plain character device). + */ + if (HID_IS_UGEN_OPEN(minor)) { + return (ENOSTR); + } + if (sflag) { /* clone open NOT supported here */ return (ENXIO); @@ -803,6 +983,8 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (EIO); } + mutex_enter(&hidp->hid_mutex); + /* * This is a workaround: * Currently, if we open an already disconnected device, and send @@ -812,7 +994,6 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * The consconfig_dacf module need this interface to detect if the * device is already disconnnected. 
*/ - mutex_enter(&hidp->hid_mutex); if (HID_IS_INTERNAL_OPEN(minor) && (hidp->hid_dev_state == USB_DEV_DISCONNECTED)) { mutex_exit(&hidp->hid_mutex); @@ -1688,6 +1869,11 @@ hid_cpr_suspend(hid_state_t *hidp) } mutex_exit(&hidp->hid_mutex); + if ((retval == USB_SUCCESS) && hidp->hid_ugen_hdl != NULL) { + retval = usb_ugen_detach(hidp->hid_ugen_hdl, + DDI_SUSPEND); + } + return (retval); } @@ -1699,6 +1885,10 @@ hid_cpr_resume(hid_state_t *hidp) "hid_cpr_resume: dip=0x%p", (void *)hidp->hid_dip); hid_restore_device_state(hidp->hid_dip, hidp); + + if (hidp->hid_ugen_hdl != NULL) { + (void) usb_ugen_attach(hidp->hid_ugen_hdl, DDI_RESUME); + } } @@ -2136,6 +2326,12 @@ hid_detach_cleanup(dev_info_t *dip, hid_state_t *hidp) hidp->hid_pm = NULL; } + if (hidp->hid_ugen_hdl != NULL) { + rval = usb_ugen_detach(hidp->hid_ugen_hdl, DDI_DETACH); + VERIFY0(rval); + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + } + mutex_exit(&hidp->hid_mutex); if (hidp->hid_report_descr != NULL) { diff --git a/usr/src/uts/common/io/usb/hcd/xhci/xhci.c b/usr/src/uts/common/io/usb/hcd/xhci/xhci.c index 84bd1a13c9..d404ea1d18 100644 --- a/usr/src/uts/common/io/usb/hcd/xhci/xhci.c +++ b/usr/src/uts/common/io/usb/hcd/xhci/xhci.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2019, Joyent, Inc. */ /* @@ -86,39 +86,39 @@ * * There are four different kinds of endpoints: * - * BULK These transfers are large transfers of data to or from - * a device. The most common use for bulk transfers is for - * mass storage devices. Though they are often also used by - * network devices and more. Bulk endpoints do not have an - * explicit time component to them. They are always used - * for one-shot transfers. - * - * CONTROL These transfers are used to manipulate devices - * themselves and are used for USB protocol level - * operations (whether device-specific, class-specific, or - * generic across all of USB). Unlike other transfers, - * control transfers are always bi-directional and use - * different kinds of transfers. - * - * INTERRUPT Interrupt transfers are used for small transfers that - * happen infrequently, but need reasonable latency. A good - * example of interrupt transfers is to receive input from - * a USB keyboard. Interrupt-IN transfers are generally - * polled. Meaning that a client (device driver) opens up - * an interrupt-IN pipe to poll on it, and receives - * periodic updates whenever there is information - * available. However, Interrupt transfers can be used - * as one-shot transfers both going IN and OUT. - * - * ISOCHRONOUS These transfers are things that happen once per - * time-interval at a very regular rate. A good example of - * these transfers are for audio and video. A device may - * describe an interval as 10ms at which point it will read - * or write the next batch of data every 10ms and transform - * it for the user. There are no one-shot Isochronous-IN - * transfers. There are one-shot Isochronous-OUT transfers, - * but these are used by device drivers to always provide - * the system with sufficient data. + * BULK These transfers are large transfers of data to or from + * a device. The most common use for bulk transfers is for + * mass storage devices. Though they are often also used by + * network devices and more. Bulk endpoints do not have an + * explicit time component to them. They are always used + * for one-shot transfers. 
+ * + * CONTROL These transfers are used to manipulate devices + * themselves and are used for USB protocol level + * operations (whether device-specific, class-specific, or + * generic across all of USB). Unlike other transfers, + * control transfers are always bi-directional and use + * different kinds of transfers. + * + * INTERRUPT Interrupt transfers are used for small transfers that + * happen infrequently, but need reasonable latency. A good + * example of interrupt transfers is to receive input from + * a USB keyboard. Interrupt-IN transfers are generally + * polled. Meaning that a client (device driver) opens up + * an interrupt-IN pipe to poll on it, and receives + * periodic updates whenever there is information + * available. However, Interrupt transfers can be used + * as one-shot transfers both going IN and OUT. + * + * ISOCHRONOUS These transfers are things that happen once per + * time-interval at a very regular rate. A good example of + * these transfers are for audio and video. A device may + * describe an interval as 10ms at which point it will read + * or write the next batch of data every 10ms and transform + * it for the user. There are no one-shot Isochronous-IN + * transfers. There are one-shot Isochronous-OUT transfers, + * but these are used by device drivers to always provide + * the system with sufficient data. * * To find out information about the endpoints, USB devices have a series of * descriptors that cover different aspects of the device. For example, there @@ -220,67 +220,67 @@ * purpose of each of these files. * * xhci_command.c: This file contains the logic to issue commands to the - * controller as well as the actual functions that the - * other parts of the driver use to cause those commands. + * controller as well as the actual functions that the + * other parts of the driver use to cause those commands. * * xhci_context.c: This file manages various data structures used by the - * controller to manage the controller's and device's - * context data structures. See more in the xHCI Overview - * and General Design for more information. + * controller to manage the controller's and device's + * context data structures. See more in the xHCI Overview + * and General Design for more information. * * xhci_dma.c: This manages the allocation of DMA memory and DMA - * attributes for controller, whether memory is for a - * transfer or something else. This file also deals with - * all the logic of getting data in and out of DMA buffers. + * attributes for controller, whether memory is for a + * transfer or something else. This file also deals with + * all the logic of getting data in and out of DMA buffers. * * xhci_endpoint.c: This manages all of the logic of handling endpoints or - * pipes. It deals with endpoint configuration, I/O - * scheduling, timeouts, and callbacks to USBA. + * pipes. It deals with endpoint configuration, I/O + * scheduling, timeouts, and callbacks to USBA. * * xhci_event.c: This manages callbacks from the hardware to the driver. - * This covers command completion notifications and I/O - * notifications. + * This covers command completion notifications and I/O + * notifications. * * xhci_hub.c: This manages the virtual root-hub. It basically - * implements and translates all of the USB level requests - * into xhci specific implements. It also contains the - * functions to register this hub with USBA. + * implements and translates all of the USB level requests + * into xhci specific implements. 
It also contains the + * functions to register this hub with USBA. * * xhci_intr.c: This manages the underlying interrupt allocation, - * interrupt moderation, and interrupt routines. + * interrupt moderation, and interrupt routines. * * xhci_quirks.c: This manages information about buggy hardware that's - * been collected and experienced primarily from other - * systems. + * been collected and experienced primarily from other + * systems. * * xhci_ring.c: This manages the abstraction of a ring in xhci, which is - * the primary of communication between the driver and the - * hardware, whether for the controller or a device. + * the primary of communication between the driver and the + * hardware, whether for the controller or a device. * * xhci_usba.c: This implements all of the HCDI functions required by - * USBA. This is the main entry point that drivers and the - * kernel frameworks will reach to start any operation. - * Many functions here will end up in the command and - * endpoint code. + * USBA. This is the main entry point that drivers and the + * kernel frameworks will reach to start any operation. + * Many functions here will end up in the command and + * endpoint code. * * xhci.c: This provides the main kernel DDI interfaces and - * performs device initialization. + * performs device initialization. * * xhci.h: This is the primary header file which defines - * illumos-specific data structures and constants to manage - * the system. + * illumos-specific data structures and constants to manage + * the system. * * xhcireg.h: This header file defines all of the register offsets, - * masks, and related macros. It also contains all of the - * constants that are used in various structures as defined - * by the specification, such as command offsets, etc. + * masks, and related macros. It also contains all of the + * constants that are used in various structures as defined + * by the specification, such as command offsets, etc. * * xhci_ioctl.h: This contains a few private ioctls that are used by a - * private debugging command. These are private. + * private debugging command. These are private. * * cmd/xhci/xhci_portsc: This is a private utility that can be useful for - * debugging xhci state. It is the only consumer of - * xhci_ioctl.h and the private ioctls. + * debugging xhci state. It is the only consumer of + * xhci_ioctl.h and the private ioctls. * * ---------------------------------- * xHCI Overview and Structure Layout @@ -1444,12 +1444,15 @@ xhci_find_ext_cap(xhci_t *xhcip, uint32_t id, uint32_t init, uint32_t *outp) static boolean_t xhci_port_count(xhci_t *xhcip) { - uint_t nusb2 = 0, nusb3 = 0; + uint_t nusb2 = 0, fusb2 = 0; + uint_t nusb30 = 0, fusb30 = 0; + uint_t nusb31 = 0, fusb31 = 0; uint32_t off = UINT32_MAX; while (xhci_find_ext_cap(xhcip, XHCI_ID_PROTOCOLS, off, &off) == B_TRUE) { uint32_t rvers, rport; + uint8_t maj, min, count, first; /* * See xHCI 1.1 / 7.2 for the format of this. 
The first uint32_t @@ -1466,23 +1469,48 @@ xhci_port_count(xhci_t *xhcip) return (B_FALSE); } - rvers = XHCI_XECP_PROT_MAJOR(rvers); - rport = XHCI_XECP_PROT_PCOUNT(rport); - - if (rvers == 3) { - nusb3 += rport; - } else if (rvers <= 2) { - nusb2 += rport; + maj = XHCI_XECP_PROT_MAJOR(rvers); + min = XHCI_XECP_PROT_MINOR(rvers); + count = XHCI_XECP_PROT_PCOUNT(rport); + first = XHCI_XECP_PROT_FPORT(rport); + + if (maj == 3 && min == 1) { + nusb31 = count; + fusb31 = first; + } else if (maj == 3 && min == 0) { + nusb30 = count; + fusb30 = first; + } else if (maj <= 2) { + nusb2 = count; + fusb2 = first; } else { xhci_error(xhcip, "encountered port capabilities with " - "unknown major USB version: %d\n", rvers); + "unknown USB version: %x.%x\n", maj, min); } } - (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, - "usb2-capable-ports", nusb2); - (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, - "usb3-capable-ports", nusb3); + /* + * These properties are used by FMA and the USB topo module. + */ + if (nusb2 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb2.0-port-count", nusb2); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb2.0-first-port", fusb2); + } + if (nusb30 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.0-port-count", nusb30); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.0-first-port", fusb30); + } + + if (nusb31 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.1-port-count", nusb30); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.1-first-port", fusb31); + } return (B_TRUE); } @@ -2183,7 +2211,7 @@ static struct dev_ops xhci_dev_ops = { &xhci_cb_ops, /* devo_cb_ops */ &usba_hubdi_busops, /* devo_bus_ops */ usba_hubdi_root_hub_power, /* devo_power */ - ddi_quiesce_not_supported /* devo_quiesce */ + ddi_quiesce_not_supported /* devo_quiesce */ }; static struct modldrv xhci_modldrv = { diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c index d5dd1e8e39..3b6660c542 100644 --- a/usr/src/uts/common/io/vioif/vioif.c +++ b/usr/src/uts/common/io/vioif/vioif.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Nexenta Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright 2015 Joyent, Inc. */ @@ -909,16 +910,17 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) { /* We don't chain descriptors for tx, so don't expect any. */ - ASSERT(!ve->qe_next); + ASSERT(ve->qe_next == NULL); buf = &sc->sc_txbufs[ve->qe_index]; mp = buf->tb_mp; buf->tb_mp = NULL; if (mp != NULL) { - for (int i = 0; i < buf->tb_external_num; i++) + for (int i = 0; i < buf->tb_external_num; i++) { (void) ddi_dma_unbind_handle( buf->tb_external_mapping[i].vbm_dmah); + } } virtio_free_chain(ve); diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..198c14d4be --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Frame I/O utility functions + */ + +#include <sys/frameio.h> + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/inttypes.h> + +static kmem_cache_t *frameio_cache; + +int +frameio_init(void) +{ + frameio_cache = kmem_cache_create("frameio_cache", + sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX, + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (frameio_cache == NULL) + return (1); + + return (0); +} + +void +frameio_fini(void) +{ + if (frameio_cache != NULL) + kmem_cache_destroy(frameio_cache); +} + +frameio_t * +frameio_alloc(int kmflags) +{ + return (kmem_cache_alloc(frameio_cache, kmflags)); +} + +void +frameio_free(frameio_t *fio) +{ + kmem_cache_free(frameio_cache, fio); +} + +/* + * Ensure that we don't see any garbage in the framevecs that we're nominally + * supposed to work with. Specifically we want to make sure that the buflen and + * the address are not zero. + */ +static int +frameio_hdr_check_vecs(frameio_t *fio) +{ + int i; + for (i = 0; i < fio->fio_nvecs; i++) + if (fio->fio_vecs[i].fv_buf == NULL || + fio->fio_vecs[i].fv_buflen == 0) + return (EINVAL); + + return (0); +} + +/* + * We have to copy in framevec32_t's. To work around the data model issues and + * trying not to copy memory we first copy in the framevec32_t data into the + * standard fio_vec space. Next we work backwards copying a given framevec32_t + * to a temporaory framevec_t and then overwrite the frameio_t's data. Note that + * it is important that we do this in reverse so as to ensure that we don't + * clobber data as the framevec_t is larger than the framevec32_t. + */ +static int +frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr) +{ + framevec32_t *vec32p; + framevec_t fv; + int i; + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + + if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs, + 0) != 0) + return (EFAULT); + + for (i = fio->fio_nvecs - 1; i >= 0; i--) { + fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf; + fv.fv_buflen = vec32p[i].fv_buflen; + fv.fv_actlen = vec32p[i].fv_actlen; + fio->fio_vecs[i].fv_buf = fv.fv_buf; + fio->fio_vecs[i].fv_buflen = fv.fv_buflen; + fio->fio_vecs[i].fv_actlen = fv.fv_actlen; + } + + return (frameio_hdr_check_vecs(fio)); +} + +/* + * Copy in a frame io header into fio with space for up to nvecs. If the frameio + * contains more vectors than specified it will be ignored. mode should contain + * information about the datamodel. + */ +int +frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode) +{ + int model = ddi_model_convert_from(mode & FMODELS); + int cpf = mode & FKIOCTL ? FKIOCTL : 0; + size_t fsize = model == DDI_MODEL_ILP32 ? + sizeof (frameio32_t) : sizeof (frameio_t); + + /* + * The start of the header is the same in all data models for the + * current verison. 
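+	 * Its overall size differs (sizeof (frameio32_t) vs.
+	 * sizeof (frameio_t)), which is why fsize is chosen per model
+	 * above; the vectors that follow are copied separately, either
+	 * directly for the native model or widened in place by
+	 * frameio_hdr_copyin_ilp32() for ILP32 consumers.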
+ */ + if (ddi_copyin(addr, fio, fsize, cpf) != 0) + return (EFAULT); + + if (fio->fio_version != FRAMEIO_VERSION_ONE) + return (EINVAL); + + if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0) + return (EINVAL); + + if (fio->fio_nvpf == 0) + return (EINVAL); + + if (fio->fio_nvecs % fio->fio_nvpf != 0) + return (EINVAL); + + if (fio->fio_nvecs > max_vecs) + return (EOVERFLOW); + + addr = (void *)((uintptr_t)addr + fsize); + if (model == DDI_MODEL_ILP32) { + if (cpf != 0) + return (EINVAL); + return (frameio_hdr_copyin_ilp32(fio, addr)); + } + + if (ddi_copyin(addr, &fio->fio_vecs[0], + sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0) + return (EFAULT); + + return (frameio_hdr_check_vecs(fio)); +} + +static mblk_t * +frameio_allocb(size_t sz) +{ + mblk_t *mp; + + mp = allocb(sz, 0); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_DATA; + return (mp); +} + +static int +framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf) +{ + mblk_t *mp; + cpf = cpf != 0 ? FKIOCTL : 0; + + mp = frameio_allocb(fv->fv_buflen); + + if (mp == NULL) { + freemsg(mp); + return (EAGAIN); + } + + if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, + cpf) != 0) { + freemsg(mp); + return (EFAULT); + } + + mp->b_wptr += fv->fv_buflen; + *mpp = mp; + return (0); +} + +/* + * Read a set of frame vectors that make up a single message boundary and return + * that as a single message in *mpp that consists of multiple data parts. + */ +static int +frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf) +{ + int nparts = fio->fio_nvpf; + int part, error; + mblk_t *mp; + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + /* + * Construct the initial frame + */ + for (part = 0; part < nparts; part++) { + error = framevec_mblk_read(fv, &mp, cpf); + if (error != 0) { + freemsg(*mpp); + return (error); + } + + if (*mpp == NULL) + *mpp = mp; + else + linkb(*mpp, mp); + fv++; + } + + return (0); +} + +/* + * Read data from a series of frameio vectors into a message block chain. A + * given frameio request has a number of discrete messages divided into + * individual vectors based on fio->fio_nvcspframe. Each discrete message will + * be constructed into a message block chain pointed to by b_next. + * + * If we get an EAGAIN while trying to construct a given message block what we + * return depends on what else we've done so far. If we have succesfully + * completed at least one message then we free everything else we've done so + * far and return that. If no messages have been completed we return EAGAIN. If + * instead we encounter a different error, say EFAULT, then all of the fv_actlen + * entries values are undefined. + */ +int +frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf) +{ + int error = ENOTSUP; + int nframes = fio->fio_nvecs / fio->fio_nvpf; + int frame; + framevec_t *fv; + mblk_t *mp, *bmp = NULL; + + /* + * Protect against bogus kernel subsystems. + */ + VERIFY(fio->fio_nvecs > 0); + VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0); + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + fv = &fio->fio_vecs[0]; + for (frame = 0; frame < nframes; frame++) { + error = frameio_mblk_read(fio, fv, &mp, cpf); + if (error != 0) + goto failed; + + if (bmp != NULL) + bmp->b_next = mp; + else + *mpp = mp; + bmp = mp; + } + + *nvecs = nframes; + return (0); +failed: + /* + * On EAGAIN we've already taken care of making sure that we have no + * leftover messages, eg. they were never linked in. 
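+	 * When at least one frame was assembled, the error is converted to
+	 * success below and the caller is told how many frames were
+	 * completed.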
+ */ + if (error == EAGAIN) { + if (frame != 0) + error = 0; + if (*nvecs != NULL) + *nvecs = frame; + ASSERT(*mpp != NULL); + } else { + for (mp = *mpp; mp != NULL; mp = bmp) { + bmp = mp->b_next; + freemsg(mp); + } + if (nvecs != NULL) + *nvecs = 0; + *mpp = NULL; + } + return (error); +} + +size_t +frameio_frame_length(frameio_t *fio, framevec_t *fv) +{ + int i; + size_t len = 0; + + for (i = 0; i < fio->fio_nvpf; i++, fv++) + len += fv->fv_buflen; + + return (len); +} + +/* + * Write a portion of an mblk to the current. + */ +static int +framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff, + size_t foff, int cpf) +{ + ASSERT(len <= MBLKL(mp) - moff); + ASSERT(len <= fv->fv_buflen - fv->fv_actlen); + cpf = cpf != 0 ? FKIOCTL : 0; + + if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len, + cpf) != 0) + return (EFAULT); + fv->fv_actlen += len; + + return (0); +} + +/* + * Because copying this out to the user might fail we don't want to update the + * b_rptr in case we need to copy it out again. + */ +static int +framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf) +{ + int err; + size_t msize, blksize, len, moff, foff; + + msize = msgsize(mp); + if (msize > frameio_frame_length(fio, fv)) + return (EOVERFLOW); + + moff = 0; + foff = 0; + blksize = MBLKL(mp); + fv->fv_actlen = 0; + while (msize != 0) { + len = MIN(blksize, fv->fv_buflen - fv->fv_actlen); + err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf); + if (err != 0) + return (err); + + msize -= len; + blksize -= len; + moff += len; + foff += len; + + if (blksize == 0 && msize != 0) { + mp = mp->b_cont; + ASSERT(mp != NULL); + moff = 0; + blksize = MBLKL(mp); + } + + if (fv->fv_buflen == fv->fv_actlen && msize != 0) { + fv++; + fv->fv_actlen = 0; + foff = 0; + } + } + + return (0); +} + +int +frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map, + mblk_t *mp, int *nwrite, int cpf) +{ + int mcount = 0; + int ret = 0; + + if (map != MAP_BLK_FRAME) + return (EINVAL); + + while (mp != NULL && mcount < fio->fio_nvecs) { + ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf); + if (ret != 0) + break; + mcount += fio->fio_nvpf; + mp = mp->b_next; + } + + if (ret != 0 && mcount == 0) { + if (nwrite != NULL) + *nwrite = 0; + return (ret); + } + + if (nwrite != NULL) + *nwrite = mcount / fio->fio_nvpf; + + return (0); +} + +/* + * Copy out nframes worth of frameio header data back to userland. 
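+ * fio_nvecs is clamped to the vectors belonging to the consumed frames and
+ * each fv_actlen is copied back so userland can see how much of every
+ * buffer was used. For ILP32 consumers the framevec_t entries are repacked
+ * in place into framevec32_t form before the copyout.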
+ */ +int +frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode) +{ + int i; + int model = ddi_model_convert_from(mode & FMODELS); + framevec32_t *vec32p; + framevec32_t f; + + if (fio->fio_nvecs / fio->fio_nvpf < nframes) + return (EINVAL); + + fio->fio_nvecs = nframes * fio->fio_nvpf; + + if (model == DDI_MODEL_NONE) { + if (ddi_copyout(fio, addr, + sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); + } + + ASSERT(model == DDI_MODEL_ILP32); + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + for (i = 0; i < fio->fio_nvecs; i++) { + f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf; + if (fio->fio_vecs[i].fv_buflen > UINT_MAX || + fio->fio_vecs[i].fv_actlen > UINT_MAX) + return (EOVERFLOW); + f.fv_buflen = fio->fio_vecs[i].fv_buflen; + f.fv_actlen = fio->fio_vecs[i].fv_actlen; + vec32p[i].fv_buf = f.fv_buf; + vec32p[i].fv_buflen = f.fv_buflen; + vec32p[i].fv_actlen = f.fv_actlen; + } + + if (ddi_copyout(fio, addr, + sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +void +frameio_mark_consumed(frameio_t *fio, int nframes) +{ + int i; + + ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes); + for (i = 0; i < nframes * fio->fio_nvpf; i++) + fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen; +} diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c new file mode 100644 index 0000000000..d03c7ce4ec --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -0,0 +1,5857 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +/* + * vnd - virtual (machine) networking datapath + * + * vnd's purpose is to provide a highly performant data path for Layer 2 network + * traffic and exist side by side an active IP netstack, each servicing + * different datalinks. vnd provides many of the same capabilities as the + * current TCP/IP stack does and some specific to layer two. Specifically: + * + * o Use of the DLD fastpath + * o Packet capture hooks + * o Ability to use hardware capabilities + * o Useful interfaces for handling multiple frames + * + * The following image shows where vnd fits into today's networking stack: + * + * +---------+----------+----------+ + * | libdlpi | libvnd | libsocket| + * +---------+----------+----------+ + * | · · VFS | + * | VFS · VFS +----------+ + * | · | sockfs | + * +---------+----------+----------+ + * | | VND | IP | + * | +----------+----------+ + * | DLD/DLS | + * +-------------------------------+ + * | MAC | + * +-------------------------------+ + * | GLDv3 | + * +-------------------------------+ + * + * ----------------------------------------- + * A Tale of Two Devices - DDI Device Basics + * ----------------------------------------- + * + * vnd presents itself to userland as a character device; however, it also is a + * STREAMS device so that it can interface with dld and the rest of the + * networking stack. Users never interface with the STREAMs devices directly and + * they are purely an implementation detail of vnd. 
Opening the STREAMS device + * require kcred and as such userland cannot interact with it or push it onto + * the stream head. + * + * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every + * clone gets its own minor number; however, minor nodes are not created in the + * devices tree for these instances. In this state a user may do two different + * things. They may issue ioctls that affect global state or they may issue + * ioctls that try to attach it to a given datalink. Once a minor device has + * been attached to a datalink, all operations on it are scoped to that context, + * therefore subsequent global operations are not permitted. + * + * A given device can be linked into the /devices and /dev name space via a link + * ioctl. That ioctl causes a minor node to be created in /devices and then it + * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar + * to, but simpler than, IP's persistence mechanism. + * + * --------------------- + * Binding to a datalink + * --------------------- + * + * Datalinks are backed by the dld (datalink device) and dls (datalink services) + * drivers. These drivers provide a STREAMS device for datalinks on the system + * which are exposed through /dev/net. Userland generally manipulates datalinks + * through libdlpi. When an IP interface is being plumbed up what actually + * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink + * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may + * then can negotiate with dld and dls to obtain access to various capabilities + * and fast paths via a series of STREAMS messages. + * + * In vnd, we do the same thing, but we leave our STREAMS module as an + * implementation detail of the system. We don't want users to be able to + * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require + * kcred to manipulate it. Thus, when a user issues a request to attach a + * datalink to a minor instance of the character device, that vnd minor instance + * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. + * vnd does that open using the passed in credentials from the ioctl, not kcred. + * This ensures that users who doesn't have permissions to open the device + * cannot. Once that's been opened, we push on the vnd streams module. + * + * Once the vnd STREAMS instance has been created for this device, eg. the + * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl + * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. + * This association begins the STREAM device's initialization. We start up an + * asynchronous state machine that takes care of all the different aspects of + * plumbing up the device with dld and dls and enabling the MAC fast path. We + * need to guarantee to consumers of the character device that by the time their + * ioctl returns, the data path has been fully initialized. + * + * The state progression is fairly linear. There are two general steady states. + * The first is VND_S_ONLINE, which means that everything is jacked up and good + * to go. The alternative is VND_S_ZOMBIE, which means that the streams device + * encountered an error or we have finished tearing it down and the character + * device can clean it up. The following is our state progression and the + * meaning of each state: + * + * | + * | + * V + * +---------------+ + * | VNS_S_INITIAL | This is our initial state. Every + * +---------------+ vnd STREAMS device starts here. 
+ * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, eg. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. + * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DLPI_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of three promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the physical level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the second promiscuous + * | | mode request. + * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by enabling the final flag + * | | DL_PROMISC_FIXUPS. + * | v + * | +--------------------------+ + * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ enabled FIXUP promiscuous mode. + * | | We specifically do this as we need + * v | to ensure that traffic which is + * | | received by being looped back to us + * | | correctly has checksums fixed. We + * | | leave this state by requesting the + * | | dld/dls capabilities that we can + * v | process. 
+ * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * | | VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. + * | v + * | +---------------------+ + * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and + * | | flushing all data. At this point + * | | any additional data that we receive + * | | will be dropped. We leave this + * v | state by trying to remove multicast + * | | promiscuity. + * | | + * | v + * | +---------------------------------+ + * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------------+ successfully removed multicast + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. We leave this state by trying + * | | to disable SAP level promiscuous + * | | mode. + * | v + * | +---------------------------+ + * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------+ successfully removed SAP level + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. Note that we don't worry + * | | about either of + * | | DL_PROMISC_FIXUPS or + * | | DL_PROMISC_RX_ONLY. If these are + * | | the only two entries left, then we + * | | should have anything that MAC is + * | | doing for us at this point, + * | | therefore it's safe for us to + * | | proceed to unbind, which is how we + * | | leave this state via a + * | v DL_UNBIND_REQ. + * | +-------------------+ + * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind + * | +-------------------+ request went. Regardless of its + * | | success, we always transition to + * | | a zombie state. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finish being + * reaped. Because we have no more + * ways to receive data it should be + * safe to destroy all remaining data + * structures. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. 
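+ *
+ * As a rough sketch only (the real transition paths do more bookkeeping
+ * than this), advancing between the states above amounts to updating the
+ * vnd_str_t members described later in this file and waking anyone who is
+ * waiting for the state machine to settle, e.g.:
+ *
+ *	mutex_enter(&vsp->vns_lock);
+ *	vsp->vns_laststate = vsp->vns_state;
+ *	vsp->vns_state = VNS_S_INFO_SENT;	/* next state */
+ *	cv_broadcast(&vsp->vns_stcv);
+ *	mutex_exit(&vsp->vns_lock);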
+ * + * It's quite important that we end up sending the full series of STREAMS + * messages when tearing down. While it's tempting to say that we should just + * rely on the STREAMS device being closed to properly ensure that we have no + * more additional data, that's not sufficient due to our use of direct + * callbacks. DLS does not ensure that by the time we change the direct + * callback (vnd_mac_input) that all callers to it will have been quiesced. + * However, it does guarantee that if we disable promiscuous mode ourselves and + * we turn off the main data path via DL_UNBIND_REQ that it will work. + * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do + * it as part of tearing down the STREAMS device. This ensures that we'll + * quiesce all data before we destroy our data structures and thus we should + * eliminate the race in changing the data function. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * e.g. synchronization primitives are left out. + * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | nestackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. 
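+ *
+ * To make the data queue description above concrete, here is a minimal
+ * sketch of the append pattern, using the vnd_dq_* helpers defined later
+ * in this file and eliding reservation and poll notification details:
+ *
+ *	mutex_enter(&vqp->vdq_lock);
+ *	if (vnd_dq_push(vqp, mp, B_FALSE, vnd_drop_in) != 0)
+ *		cv_broadcast(&vqp->vdq_ready);	/* wake blocked consumers */
+ *	mutex_exit(&vqp->vdq_lock);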
+ * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . . . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targetting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. 
After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restorted our flow. + * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . * | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. 
There is an upper + * bound on the total amount of data and the total number of mblk_t chains. If + * we hit either limit, then we will schedule another drain in the gsqueue and + * go from there. + * + * It's worth taking some time to describe how we interact with gsqueues. vnd + * has a gsqueue_set_t for itself. It's important that it has its own set, as + * the profile of work that vnd does is different from other sub-systems in the + * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. + * Unlike TCP/IP which uses an gsqueue for per TCP connection, we end up + * maintaining one for a given device. Because of that, we want to use a + * pseudo-random one to try and spread out the load, and picking one at random + * is likely to be just as good as any fancy algorithm we might come up with, + * especially as any two devices could have radically different transmit + * profiles. + * + * While some of the write path may seem complicated, it does allow us to + * maintain an important property. Once we have acknowledged a write(9E) or + * frameio ioctl, we will not drop the packet, excepting something like ipf via + * the firewall hooks. + * + * There is one other source of flow control that can exist in the system which + * is in the form of a barrier. The barrier is an internal mechanism used for + * ensuring that an gsqueue is drained for a given device. We use this as part + * of tearing down. Specifically we disable the write path so nothing new can be + * inserted into the gsqueue and then insert a barrier block. Once the barrier + * block comes out of the gsqueue, then we know nothing else in the gsqueue that + * could refer to the vnd_str_t, being destroyed, exists. + * + * --------------------- + * vnd, zones, netstacks + * --------------------- + * + * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. + * Because of that, vnd is also a netstack module. It registers with the + * netstack sub-system and receives callbacks every time a netstack is created, + * being shutdown, and destroyed. The netstack callbacks drive the creation and + * destruction of the vnd_pnsd_t structures. + * + * Recall from the earlier architecture diagrams that every vnd device is scoped + * to a netstack and known about by a given vnd_pnsd_t. When that netstack is + * torn down, we also tear down any vnd devices that are hanging around. When + * the netstack is torn down, we know that any zones that are scoped to that + * netstack are being shut down and have no processes remaining. This is going + * to be the case whether they are shared or exclusive stack zones. We have to + * perform a careful dance. + * + * There are two different callbacks that happen on tear down, the first is a + * shutdown callback, the second is a destroy callback. When the shutdown + * callback is fired we need to prepare for the netstack to go away and ensure + * that nothing can continue to persist itself. + * + * More specifically, when we get notice of a stack being shutdown we first + * remove the netstack from the global netstack list to ensure that no one new + * can come in and find the netstack and get a reference to it. After that, we + * notify the neti hooks that they're going away. Once that's all done, we get + * to the heart of the matter. + * + * When shutting down there could be any number of outstanding contexts that + * have a reference on the vnd_pnsd_t and on the individual links. However, we + * know that no one new will be able to find the vnd_pnsd_t. 
To account for + * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with + * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device + * to the netstack's list. If this is set, then they must not append to it. + * Once this is set, we know that the netstack's list of devices can never grow, + * only shrink. + * + * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that + * the container for the device is being destroyed and that we should not allow + * additional references to the device to be created, whether via open, or + * linking. The presence of this bit also allows things like the list ioctl and + * sdev to know not to consider its existence. At the conclusion of this being + * set, we know that no one else should be able to obtain a new reference to the + * device. + * + * Once that has been set for all devices, we go through and remove any existing + * links that have been established in sdev. Because doing that may cause the + * final reference for the device to be dropped, which still has a reference to + * the netstack, we have to restart our walk due to dropped locks. We know that + * this walk will eventually complete because the device cannot be relinked and + * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. + * Once that's finished, the shutdown callback returns. + * + * When we reach the destroy callback, we simply wait for references on the + * netstack to disappear. Because the zone has been shut down, all processes in + * it that have open references have been terminated and reaped. Any threads + * that are newly trying to reference it will fail. However, there is one thing + * that can halt this that we have no control over, which is the global zone + * holding open a reference to the device. In this case the zone halt will hang + * in vnd_stack_destroy. Once the last references is dropped we finish destroy + * the netinfo hooks and free the vnd_pnsd_t. + * + * ---- + * sdev + * ---- + * + * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd + * for both the global and non-global zones. In any given zone we always supply + * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone + * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg. + * if a link was named net0, there would be a /dev/vnd/net0. The global zone can + * also see every link for every zone, ala /dev/net, under + * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device + * named net0, the global zone would have /dev/vnd/turin/net0. + * + * The sdev plugin has three interfaces that it supplies back to sdev. One is to + * validate that a given node is still valid. The next is a callback from sdev + * to say that it is no longer using the node. The third and final one is from + * sdev where it asks us to fill a directory. All of the heavy lifting is done + * in directory filling and in valiation. We opt not to maintain a reference on + * the device while there is an sdev node present. This makes the removal of + * nodes much simpler and most of the possible failure modes shouldn't cause any + * real problems. For example, the open path has to handle both dev_t's which no + * longer exist and which are no longer linked. + * + * ----- + * hooks + * ----- + * + * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd + * provides these for L3 IP and IPv6 traffic. 
Each netstack provides these hooks + * in a minimal fashion. While we will allow traffic to be filtered through the + * hooks, we do not provide means for packet injection or additional inspection + * at this time. There are a total of four different events created: + * + * o IPv4 physical in + * o IPv4 physical out + * o IPv6 physical in + * o IPv6 physical out + * + * --------------- + * Synchronization + * --------------- + * + * To make our synchronization simpler, we've put more effort into making the + * metadata/setup paths do more work. That work allows the data paths to make + * assumptions around synchronization that simplify the general case. Each major + * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is + * annotated with the protection that its members receives. The following + * annotations are used: + * + * A Atomics; these values are only modified using atomics values. + * Currently this only applies to kstat values. + * E Existence; no lock is needed to access this member, it does not + * change while the structure is valid. + * GL Global Lock; these members are protected by the global + * vnd_dev_lock. + * L Locked; access to the member is controlled by a lock that is in + * the structure. + * NSL netstack lock; this member is protected by the containing + * netstack. This only applies to the vnd_dev_t`vdd_nslink. + * X This member is special, and is discussed in this section. + * + * In addition to locking, we also have reference counts on the vnd_dev_t and + * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure. + * With rare exception, once a reference count is decremented, the consumer + * should not assume that the data is valid any more. The only exception to this + * is the case where we're removing an extant reference count from a link into + * /devices or /dev. Reference counts are obtained on these structures as a part + * of looking them up. + * + * # Global Lock Ordering + * ###################### + * + * The following is the order that you must take locks in vnd: + * + * 1) vnd`vnd_dev_lock + * 2) vnd_pnsd_t`vpnd_lock + * 3) vnd_dev_t`vnd_lock + * 4) vnd_str_t`vns_lock + * 5) vnd_data_queue_t`vdq_lock + * + * One must adhere to the following rules: + * + * o You must acquire a lower numbered lock before a high numbered lock. + * o It is NOT legal to hold two locks of the same level concurrently, eg. you + * can not hold two different vnd_dev_t's vnd_lock at the same time. + * o You may release locks in any order. + * o If you release a lock, you must honor the locking rules before acquiring + * it again. + * o You should not hold any locks when calling any of the rele functions. + * + * # Special Considerations + * ######################## + * + * While most of the locking is what's expected, it's worth going into the + * special nature that a few members hold. Today, only two structures have + * special considerations: the vnd_dev_t and the vnd_str_t. All members with + * special considerations have an additional annotation that describes how you + * should interact with it. + * + * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is + * attached or in the process of attaching. If the code path that goes through + * requires an attached vnd_dev_t, eg. the data path and tear down path, then it + * is always legal to dereference that member without a lock held. 
When they are + * added to the system, they should be done under the vdd_lock and done as part + * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the + * lifetime of the vnd_dev_t. + * + * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it + * always exists as it is a part of the structure. The only time that it's valid + * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag + * set or during tear down. Outside of those paths which are naturally + * serialized, there is no explicit locking around the member. + * + * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not + * initially set as part of creating the structure, but are set as part of + * responding to the association ioctl. Anything in the data path or metadata + * path that requires association may assume that they exist, as we do not kick + * off the state machine until they're set. + * + * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The + * members are designed to be used as part of various operations with the + * gsqueues. A lock isn't needed to use them, but to work with them, the + * appropriate flag in the vnd_str_t`vns_flags must have been set by the current + * thread. Otherwise, it is always fair game to refer to their addresses. Their + * contents are ignored by vnd, but some members are manipulated by the gsqueue + * subsystem. + */ + +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/ddi.h> +#include <sys/ethernet.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/ksynch.h> +#include <sys/taskq_impl.h> +#include <sys/sdt.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/dlpi.h> +#include <sys/cred.h> +#include <sys/id_space.h> +#include <sys/list.h> +#include <sys/ctype.h> +#include <sys/policy.h> +#include <sys/sunldi.h> +#include <sys/cred.h> +#include <sys/strsubr.h> +#include <sys/poll.h> +#include <sys/neti.h> +#include <sys/hook.h> +#include <sys/hook_event.h> +#include <sys/vlan.h> +#include <sys/dld.h> +#include <sys/mac_client.h> +#include <sys/netstack.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/kstat.h> +#include <sys/atomic.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/gsqueue.h> +#include <sys/ht.h> + +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/vnd.h> + +/* + * Globals + */ +static dev_info_t *vnd_dip; +static taskq_t *vnd_taskq; +static kmem_cache_t *vnd_str_cache; +static kmem_cache_t *vnd_dev_cache; +static kmem_cache_t *vnd_pnsd_cache; +static id_space_t *vnd_minors; +static int vnd_list_init = 0; +static sdev_plugin_hdl_t vnd_sdev_hdl; +static gsqueue_set_t *vnd_sqset; + +static kmutex_t vnd_dev_lock; +static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ +static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ + +/* + * STREAMs ioctls + * + * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such + * they aren't a part of the header file. + */ +#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) + +/* + * Private ioctl to associate a given streams instance with a minor instance of + * the character device. 
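+ * This is the VND_STRIOC_ASSOCIATE request described in the big theory
+ * statement: the character device issues it against the freshly pushed
+ * STREAMS module with the minor number and netstack id filled in, and the
+ * more detailed result comes back as a vnd_errno_t in vsa_errno.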
+ */ +#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) + +typedef struct vnd_strioc_associate { + minor_t vsa_minor; /* minor device node */ + netstackid_t vsa_nsid; /* netstack id */ + vnd_errno_t vsa_errno; /* errno */ +} vnd_strioc_associate_t; + +typedef enum vnd_strioc_state { + VSS_UNKNOWN = 0, + VSS_COPYIN = 1, + VSS_COPYOUT = 2, +} vnd_strioc_state_t; + +typedef struct vnd_strioc { + vnd_strioc_state_t vs_state; + caddr_t vs_addr; +} vnd_strioc_t; + +/* + * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though + * really, overlap is at the end of the day, inevitable. + */ +#define VND_SQUEUE_TAG_TX_DRAIN 0x42 +#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 +#define VND_SQUEUE_TAG_VND_WRITE 0x44 +#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 +#define VND_SQUEUE_TAG_STRBARRIER 0x46 + +/* + * vnd reserved names. These are names which are reserved by vnd and thus + * shouldn't be used by some external program. + */ +static char *vnd_reserved_names[] = { + "ctl", + "zone", + NULL +}; + +/* + * vnd's DTrace probe macros + * + * DTRACE_VND* are all for a stable provider. We also have an unstable internal + * set of probes for reference count manipulation. + */ +#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_VND_REFINC(vdp) \ + DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); +#define DTRACE_VND_REFDEC(vdp) \ + DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); + + +/* + * Tunables + */ +size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ +size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ + +/* + * These numbers are designed as per-device tunables that are applied when a new + * vnd device is attached. They're a rough stab at what may be a reasonable + * amount of work to do in one burst in an squeue. + */ +size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ +size_t vnd_flush_nburst = 10; /* 10 frames */ + +/* + * Constants related to our sdev plugins + */ +#define VND_SDEV_NAME "vnd" +#define VND_SDEV_ROOT "/dev/vnd" +#define VND_SDEV_ZROOT "/dev/vnd/zone" + +/* + * vnd relies on privileges, not mode bits to limit access. As such, device + * files are read-write to everyone. 
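+ * Actual access control happens at attach time, when the datalink is
+ * opened with the caller's credentials rather than kcred (see the
+ * "Binding to a datalink" section above).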
+ */ +#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \ + S_IROTH | S_IWOTH) + +/* + * Statistic macros + */ +#define VND_STAT_INC(vsp, field, val) \ + atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) +#define VND_LATENCY_1MS 1000000 +#define VND_LATENCY_10MS 10000000 +#define VND_LATENCY_100MS 100000000 +#define VND_LATENCY_1S 1000000000 +#define VND_LATENCY_10S 10000000000 + +/* + * Constants for vnd hooks + */ +static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; +#define IPV4_MCAST_LEN 3 +static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +#define IPV6_MCAST_LEN 2 +static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* + * vnd internal data structures and types + */ + +struct vnd_str; +struct vnd_dev; +struct vnd_pnsd; + +/* + * As part of opening the device stream we need to properly communicate with our + * underlying stream. This is a bit of an asynchronous dance and we need to + * properly work with dld to get everything set up. We have to initiate the + * conversation with dld and as such we keep track of our state here. + */ +typedef enum vnd_str_state { + VNS_S_INITIAL = 0, + VNS_S_INFO_SENT, + VNS_S_EXCLUSIVE_SENT, + VNS_S_ATTACH_SENT, + VNS_S_BIND_SENT, + VNS_S_SAP_PROMISC_SENT, + VNS_S_MULTI_PROMISC_SENT, + VNS_S_RX_ONLY_PROMISC_SENT, + VNS_S_FIXUP_PROMISC_SENT, + VNS_S_CAPAB_Q_SENT, + VNS_S_CAPAB_E_SENT, + VNS_S_ONLINE, + VNS_S_SHUTTING_DOWN, + VNS_S_MULTICAST_PROMISCOFF_SENT, + VNS_S_SAP_PROMISCOFF_SENT, + VNS_S_UNBIND_SENT, + VNS_S_ZOMBIE +} vnd_str_state_t; + +typedef enum vnd_str_flags { + VNS_F_NEED_ZONE = 0x1, + VNS_F_TASKQ_DISPATCHED = 0x2, + VNS_F_CONDEMNED = 0x4, + VNS_F_FLOW_CONTROLLED = 0x8, + VNS_F_DRAIN_SCHEDULED = 0x10, + VNS_F_BARRIER = 0x20, + VNS_F_BARRIER_DONE = 0x40 +} vnd_str_flags_t; + +typedef enum vnd_capab_flags { + VNS_C_HCKSUM = 0x1, + VNS_C_DLD = 0x2, + VNS_C_DIRECT = 0x4, + VNS_C_HCKSUM_BADVERS = 0x8 +} vnd_capab_flags_t; + +/* + * Definitions to interact with direct callbacks + */ +typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, + mac_header_info_t *); +typedef uintptr_t vnd_mac_cookie_t; +/* DLD Direct capability function */ +typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); +/* DLD Direct tx function */ +typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +/* DLD Direct function to set flow control callback */ +typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), + void *); +/* DLD Direct function to see if flow controlled still */ +typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); + +/* + * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. + */ +typedef struct vnd_str_capab { + vnd_capab_flags_t vsc_flags; + t_uscalar_t vsc_hcksum_opts; + vnd_dld_cap_t vsc_capab_f; + void *vsc_capab_hdl; + vnd_dld_tx_t vsc_tx_f; + void *vsc_tx_hdl; + vnd_dld_set_fcb_t vsc_set_fcb_f; + void *vsc_set_fcb_hdl; + vnd_dld_is_fc_t vsc_is_fc_f; + void *vsc_is_fc_hdl; + vnd_mac_cookie_t vsc_fc_cookie; + void *vsc_tx_fc_hdl; +} vnd_str_capab_t; + +/* + * The vnd_data_queue is a simple construct for storing a series of messages in + * a queue. + * + * See synchronization section of the big theory statement for member + * annotations. 
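+ *
+ * Note that vdq_cur accounts for both data sitting on the queue and space
+ * that has only been reserved via vnd_dq_reserve(), so it can be non-zero
+ * while the queue itself is empty.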
+ */ +typedef struct vnd_data_queue { + struct vnd_str *vdq_vns; /* E */ + kmutex_t vdq_lock; + kcondvar_t vdq_ready; /* Uses vdq_lock */ + ssize_t vdq_max; /* L */ + ssize_t vdq_cur; /* L */ + mblk_t *vdq_head; /* L */ + mblk_t *vdq_tail; /* L */ +} vnd_data_queue_t; + +typedef struct vnd_str_stat { + kstat_named_t vks_rbytes; + kstat_named_t vks_rpackets; + kstat_named_t vks_obytes; + kstat_named_t vks_opackets; + kstat_named_t vks_nhookindrops; + kstat_named_t vks_nhookoutdrops; + kstat_named_t vks_ndlpidrops; + kstat_named_t vks_ndataindrops; + kstat_named_t vks_ndataoutdrops; + kstat_named_t vks_tdrops; + kstat_named_t vks_linkname; + kstat_named_t vks_zonename; + kstat_named_t vks_nmacflow; + kstat_named_t vks_tmacflow; + kstat_named_t vks_mac_flow_1ms; + kstat_named_t vks_mac_flow_10ms; + kstat_named_t vks_mac_flow_100ms; + kstat_named_t vks_mac_flow_1s; + kstat_named_t vks_mac_flow_10s; +} vnd_str_stat_t; + +/* + * vnd stream structure + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_str { + kmutex_t vns_lock; + kcondvar_t vns_cancelcv; /* Uses vns_lock */ + kcondvar_t vns_barriercv; /* Uses vns_lock */ + kcondvar_t vns_stcv; /* Uses vns_lock */ + vnd_str_state_t vns_state; /* L */ + vnd_str_state_t vns_laststate; /* L */ + vnd_errno_t vns_errno; /* L */ + vnd_str_flags_t vns_flags; /* L */ + vnd_str_capab_t vns_caps; /* L */ + taskq_ent_t vns_tqe; /* L */ + vnd_data_queue_t vns_dq_read; /* E */ + vnd_data_queue_t vns_dq_write; /* E */ + mblk_t *vns_dlpi_inc; /* L */ + queue_t *vns_rq; /* E */ + queue_t *vns_wq; /* E */ + queue_t *vns_lrq; /* E */ + t_uscalar_t vns_dlpi_style; /* L */ + t_uscalar_t vns_minwrite; /* L */ + t_uscalar_t vns_maxwrite; /* L */ + hrtime_t vns_fclatch; /* L */ + hrtime_t vns_fcupdate; /* L */ + kstat_t *vns_kstat; /* E */ + gsqueue_t *vns_squeue; /* E */ + mblk_t vns_drainblk; /* E + X */ + mblk_t vns_barrierblk; /* E + X */ + vnd_str_stat_t vns_ksdata; /* A */ + size_t vns_nflush; /* L */ + size_t vns_bsize; /* L */ + struct vnd_dev *vns_dev; /* E + X */ + struct vnd_pnsd *vns_nsd; /* E + X */ +} vnd_str_t; + +typedef enum vnd_dev_flags { + VND_D_ATTACH_INFLIGHT = 0x001, + VND_D_ATTACHED = 0x002, + VND_D_LINK_INFLIGHT = 0x004, + VND_D_LINKED = 0x008, + VND_D_CONDEMNED = 0x010, + VND_D_ZONE_DYING = 0x020, + VND_D_OPENED = 0x040 +} vnd_dev_flags_t; + +/* + * This represents the data associated with a minor device instance. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_dev { + kmutex_t vdd_lock; + list_node_t vdd_link; /* GL */ + list_node_t vdd_nslink; /* NSL */ + int vdd_ref; /* L */ + vnd_dev_flags_t vdd_flags; /* L */ + minor_t vdd_minor; /* E */ + dev_t vdd_devid; /* E */ + ldi_ident_t vdd_ldiid; /* E */ + ldi_handle_t vdd_ldih; /* X */ + cred_t *vdd_cr; /* X */ + vnd_str_t *vdd_str; /* L */ + struct pollhead vdd_ph; /* E */ + struct vnd_pnsd *vdd_nsd; /* E + X */ + char vdd_datalink[VND_NAMELEN]; /* L */ + char vdd_lname[VND_NAMELEN]; /* L */ +} vnd_dev_t; + +typedef enum vnd_pnsd_flags { + VND_NS_CONDEMNED = 0x1 +} vnd_pnsd_flags_t; + +/* + * Per netstack data structure. + * + * See synchronization section of the big theory statement for member + * annotations. 
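+ *
+ * These are created and destroyed by the netstack callbacks described in
+ * the "vnd, zones, netstacks" section above and live on the global
+ * vnd_nsd_list, which is protected by vnd_dev_lock.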
+ */ +typedef struct vnd_pnsd { + list_node_t vpnd_link; /* protected by global dev lock */ + zoneid_t vpnd_zid; /* E */ + netstackid_t vpnd_nsid; /* E */ + boolean_t vpnd_hooked; /* E */ + net_handle_t vpnd_neti_v4; /* E */ + hook_family_t vpnd_family_v4; /* E */ + hook_event_t vpnd_event_in_v4; /* E */ + hook_event_t vpnd_event_out_v4; /* E */ + hook_event_token_t vpnd_token_in_v4; /* E */ + hook_event_token_t vpnd_token_out_v4; /* E */ + net_handle_t vpnd_neti_v6; /* E */ + hook_family_t vpnd_family_v6; /* E */ + hook_event_t vpnd_event_in_v6; /* E */ + hook_event_t vpnd_event_out_v6; /* E */ + hook_event_token_t vpnd_token_in_v6; /* E */ + hook_event_token_t vpnd_token_out_v6; /* E */ + kmutex_t vpnd_lock; /* Protects remaining members */ + kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ + int vpnd_ref; /* L */ + vnd_pnsd_flags_t vpnd_flags; /* L */ + list_t vpnd_dev_list; /* L */ +} vnd_pnsd_t; + +static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); + +/* + * Drop function signature. + */ +typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); + +static void +vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +/* ARGSUSED */ +static void +vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + panic("illegal vnd drop"); +} + +/* ARGSUSED */ +static void +vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + mblk_t *mp; + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp->b_next; + vnd_drop_hook_in(vsp, mp, "stream not associated"); + } +} + +static vnd_pnsd_t * +vnd_nsd_lookup(netstackid_t nsid) +{ + vnd_pnsd_t *nsp; + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + if (nsp->vpnd_nsid == nsid) { + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref >= 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zid(zoneid_t zid) +{ + netstack_t *ns; + vnd_pnsd_t *nsp; + ns = netstack_find_by_zoneid(zid); + 
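+	/*
+	 * If the zone has no netstack (e.g. it is being torn down), fail the
+	 * lookup. Otherwise translate the netstack id to our per-netstack
+	 * data and release the netstack hold; the vnd_pnsd_t reference taken
+	 * by vnd_nsd_lookup() is what the caller keeps.
+	 */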
if (ns == NULL) + return (NULL); + nsp = vnd_nsd_lookup(ns->netstack_stackid); + netstack_rele(ns); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zonename(char *zname) +{ + zone_t *zonep; + vnd_pnsd_t *nsp; + + zonep = zone_find_by_name(zname); + if (zonep == NULL) + return (NULL); + + nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); + zone_rele(zonep); + return (nsp); +} + +static void +vnd_nsd_ref(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + /* + * This can only be used on something that has been obtained through + * some other means. As such, the caller should already have a reference + * before adding another one. This function should not be used as a + * means of creating the initial reference. + */ + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static void +vnd_nsd_rele(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref--; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static vnd_dev_t * +vnd_dev_lookup(minor_t m) +{ + vnd_dev_t *vdp; + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + if (vdp->vdd_minor == m) { + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (vdp); +} + +static void +vnd_dev_free(vnd_dev_t *vdp) +{ + /* + * When the STREAM exists we need to go through and make sure + * communication gets torn down. As part of closing the stream, we + * guarantee that nothing else should be able to enter the stream layer + * at this point. That means no one should be able to call + * read(),write() or one of the frameio ioctls. + */ + if (vdp->vdd_flags & VND_D_ATTACHED) { + (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + + /* + * We have to remove ourselves from our parents list now. It is + * really quite important that we have already set the condemend + * flag here so that our containing netstack basically knows + * that we're on the way down and knows not to wait for us. It's + * also important that we do that before we put a rele on the + * the device as that is the point at which it will check again. + */ + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(vdp->vdd_nsd); + vdp->vdd_nsd = NULL; + } + ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); + id_free(vnd_minors, vdp->vdd_minor); + mutex_destroy(&vdp->vdd_lock); + kmem_cache_free(vnd_dev_cache, vdp); +} + +static void +vnd_dev_ref(vnd_dev_t *vdp) +{ + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); +} + +/* + * As part of releasing the hold on this we may tear down a given vnd_dev_t As + * such we need to make sure that we grab the list lock first before grabbing + * the vnd_dev_t's lock to ensure proper lock ordering. + */ +static void +vnd_dev_rele(vnd_dev_t *vdp) +{ + mutex_enter(&vnd_dev_lock); + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref--; + DTRACE_VND_REFDEC(vdp); + if (vdp->vdd_ref > 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return; + } + + /* + * Now that we've removed this from the list, we can go ahead and + * drop the list lock. 
No one else can find this device and reference + * it. As its reference count is zero, it by definition does not have + * any remaining entries in /devices that could lead someone back to + * this. + */ + vdp->vdd_flags |= VND_D_CONDEMNED; + list_remove(&vnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + vnd_dev_free(vdp); +} + +/* + * Insert a message block chain if there's space, otherwise drop it. Return one + * so someone who was waiting for data would now end up having found it, e.g. + * the caller should consider a broadcast. + */ +static int +vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, + vnd_dropper_f dropf) +{ + size_t msize; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + if (reserved == B_FALSE) { + msize = msgsize(mp); + if (vqp->vdq_cur + msize > vqp->vdq_max) { + dropf(vqp->vdq_vns, mp, "buffer full"); + return (0); + } + vqp->vdq_cur += msize; + } + + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + vqp->vdq_head = mp; + vqp->vdq_tail = mp; + } else { + vqp->vdq_tail->b_next = mp; + vqp->vdq_tail = mp; + } + + return (1); +} + +/* + * Remove a message block chain. If the amount of space in the buffer + * has changed we return 1. We have no way of knowing whether or not there is + * enough space overall for a given writer who is blocked, so we always end up + * having to return true and thus tell consumers that they should consider + * signalling. + */ +static int +vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) +{ + size_t msize; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(mpp != NULL); + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + *mpp = NULL; + return (0); + } + + mp = vqp->vdq_head; + msize = msgsize(mp); + + vqp->vdq_cur -= msize; + if (mp->b_next == NULL) { + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + /* + * We can't be certain that this is always going to be zero. + * Someone may have basically taken a reservation of space on + * the data queue, e.g. claimed space but not yet pushed it on. + */ + ASSERT(vqp->vdq_cur >= 0); + } else { + vqp->vdq_head = mp->b_next; + ASSERT(vqp->vdq_cur > 0); + } + mp->b_next = NULL; + *mpp = mp; + return (1); +} + +/* + * Reserve space in the queue. This will bump up the size of the queue and + * entitle the user to push something on later without bumping the space. + */ +static int +vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size >= 0); + + if (size == 0) + return (0); + + if (size + vqp->vdq_cur > vqp->vdq_max) + return (0); + + vqp->vdq_cur += size; + return (1); +} + +static void +vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size > 0); + ASSERT(size <= vqp->vdq_cur); + + vqp->vdq_cur -= size; +} + +static void +vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) +{ + mblk_t *mp, *next; + + mutex_enter(&vqp->vdq_lock); + for (mp = vqp->vdq_head; mp != NULL; mp = next) { + next = mp->b_next; + mp->b_next = NULL; + dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); + } + vqp->vdq_cur = 0; + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + mutex_exit(&vqp->vdq_lock); +} + +static boolean_t +vnd_dq_is_empty(vnd_data_queue_t *vqp) +{ + boolean_t ret; + + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head == NULL) + ret = B_TRUE; + else + ret = B_FALSE; + mutex_exit(&vqp->vdq_lock); + + return (ret); +} + +/* + * Get a network uint16_t from the message and translate it into something the + * host understands.
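+ *
+ * As an illustrative sketch (hypothetical caller, error handling
+ * abbreviated and not part of the driver), pulling the 16-bit
+ * ethertype out of a frame whose header may be split across mblks
+ * looks like:
+ *
+ *	uint16_t etype;
+ *
+ *	if (vnd_mbc_getu16(mp, 12, &etype) != 0)
+ *		return (1);	/* frame shorter than 14 bytes */
+ *
+ * The value handed back in etype is in host byte order, even when the
+ * two bytes straddle an mblk boundary.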
+ */ +static int +vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. + */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. + */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. + */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. 
+ */ + mblen = msgsize(*mpp); + if ((etype == ETHERTYPE_IP && + mblen - offset < IP_SIMPLE_HDR_LENGTH) || + (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { + ddrop(vsp, *mpp, "packet has invalid IP header"); + *mpp = NULL; + return (1); + } + + info.hpe_protocol = neti; + info.hpe_ifp = (phy_if_t)vsp; + info.hpe_ofp = (phy_if_t)vsp; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) + info.hpe_flags |= HPE_BROADCAST; + else if (etype == ETHERTYPE_IP && + bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + else if (etype == ETHERTYPE_IPV6 && + bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + + if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, + (uintptr_t *)&info.hpe_hdr) != 0) { + ddrop(vsp, *mpp, "packet too small -- " + "unable to find payload"); + *mpp = NULL; + return (1); + } + + if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { + hdrop(vsp, *mpp, "dropped by hooks"); + return (1); + } + + return (0); +} + +/* + * This should not be used for DL_INFO_REQ. + */ +static mblk_t * +vnd_dlpi_alloc(size_t len, t_uscalar_t prim) +{ + mblk_t *mp; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_PROTO; + mp->b_wptr = mp->b_rptr + len; + bzero(mp->b_rptr, len); + ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; + + return (mp); +} + +static void +vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) +{ + mblk_t **mpp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + ASSERT(mp->b_next == NULL); + mpp = &vsp->vns_dlpi_inc; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + *mpp = mp; +} + +static mblk_t * +vnd_dlpi_inc_pop(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vsp->vns_dlpi_inc; + if (mp != NULL) { + VERIFY(mp->b_next == NULL || mp->b_next != mp); + vsp->vns_dlpi_inc = mp->b_next; + mp->b_next = NULL; + } + return (mp); +} + +static int +vnd_st_sinfo(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_info_req_t *dlir; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + vsp->vns_state = VNS_S_INFO_SENT; + cv_broadcast(&vsp->vns_stcv); + + mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_info(vnd_str_t *vsp) +{ + dl_info_ack_t *dlia; + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + dlia = (dl_info_ack_t *)mp->b_rptr; + vsp->vns_dlpi_style = dlia->dl_provider_style; + vsp->vns_minwrite = dlia->dl_min_sdu; + vsp->vns_maxwrite = dlia->dl_max_sdu; + + /* + * At this time we only support DL_ETHER devices. + */ + if (dlia->dl_mac_type != DL_ETHER) { + freemsg(mp); + vsp->vns_errno = VND_E_NOTETHER; + return (1); + } + + /* + * Because vnd operates on entire packets, we need to manually account + * for the ethernet header information. We add the size of the + * ether_vlan_header to account for this, regardless of whether it is + * using vlans or not.
+ */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. + */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = 
vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +/* ARGSUSED */ +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path, we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. + */ + ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); + mp->b_rptr -= mhip->mhi_hdrsize; + vid = VLAN_ID(mhip->mhi_tci); + if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { + /* + * This is an overlapping copy. Do not use bcopy(9F). 
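+ *
+ * To sketch what the memmove below accomplishes: the 12 bytes of
+ * destination and source MAC address are slid forward over the
+ * 4-byte VLAN tag, and b_rptr is then advanced past the 4 stale
+ * leading bytes, so the frame presents an untagged ethernet header:
+ *
+ *	before:	[ dst(6) | src(6) | tpid/tci(4) | etype | ... ]
+ *	after:	         [ dst(6) | src(6)      | etype | ... ]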
+ */ + (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); + mp->b_rptr += 4; + } + + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, + nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, + nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) + continue; + + VND_STAT_INC(vsp, vks_rpackets, 1); + VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); + DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + mutex_enter(&vsp->vns_dq_read.vdq_lock); + signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, + vnd_drop_in); + mutex_exit(&vsp->vns_dq_read.vdq_lock); + } + + if (signal != 0) { + cv_broadcast(&vsp->vns_dq_read.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); + } + +} + +static void +vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) +{ + VND_STAT_INC(vsp, vks_nmacflow, 1); + VND_STAT_INC(vsp, vks_tmacflow, diff); + if (diff >= VND_LATENCY_1MS) + VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); + if (diff >= VND_LATENCY_10MS) + VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); + if (diff >= VND_LATENCY_100MS) + VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); + if (diff >= VND_LATENCY_1S) + VND_STAT_INC(vsp, vks_mac_flow_1s, 1); + if (diff >= VND_LATENCY_10S) + VND_STAT_INC(vsp, vks_mac_flow_10s, 1); +} + +/* + * This is a callback from MAC that indicates that we are allowed to send + * packets again. + */ +static void +vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) +{ + vnd_str_t *vsp = arg; + hrtime_t now; + + mutex_enter(&vsp->vns_lock); + now = gethrtime(); + + /* + * Check for the case that we beat vnd_squeue_tx_one to the punch. + * There's also an additional case here that we got notified because + * we're sharing a device that ran out of tx descriptors, even though it + * wasn't because of us. + */ + if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { + vsp->vns_fcupdate = now; + mutex_exit(&vsp->vns_lock); + return; + } + + ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); + ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); + vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = NULL; + vsp->vns_fclatch = 0; + DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, + vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); + /* + * If someone has asked to flush the squeue and thus inserted a barrier, + * then we shouldn't schedule a drain.
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. + */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = 
(dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. + */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } +} + +static boolean_t +vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + boolean_t ret = B_TRUE; + mblk_t *mp; + dl_promiscoff_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "promiscoff request"); + ret = B_FALSE; + goto next; + } + + dprp = (dl_promiscoff_req_t *)mp->b_rptr; + dprp->dl_level = type; + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_promiscoff(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. 
+ */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + return; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promiscoff"); + return; + } + + if (cprim != DL_PROMISCOFF_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promiscoff: Got ack/nack for wrong primitive"); + return; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to disable promiscuous mode during " + "vnd teardown"); + } +} + +static boolean_t +vnd_st_sunbind(vnd_str_t *vsp) +{ + mblk_t *mp; + boolean_t ret = B_TRUE; + + mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "unbind request"); + ret = B_FALSE; + goto next; + } + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = VNS_S_UNBIND_SENT; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_unbind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. + */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + goto next; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_unbind"); + goto next; + } + + if (cprim != DL_UNBIND_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_unbind: Got ack/nack for wrong primitive"); + goto next; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to unbind stream during vnd " + "teardown"); + } + +next: + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); +} + +/* + * Perform state transitions. This is a one-way shot down the flow chart + * described in the big theory statement. + */ +static void +vnd_str_state_transition(void *arg) +{ + boolean_t died = B_FALSE; + vnd_str_t *vsp = arg; + mblk_t *mp; + + mutex_enter(&vsp->vns_lock); + if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && + vsp->vns_state != VNS_S_SHUTTING_DOWN)) { + mutex_exit(&vsp->vns_lock); + return; + } + + /* + * When trying to shut down, or unwinding from a failed enabling, rather + * than immediately entering the ZOMBIE state, we may instead opt to try + * and enter the next state in the progression. This is especially + * important when trying to tear everything down.
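+ *
+ * As a sketch, the teardown progression that the switch below walks
+ * for a closing stream is:
+ *
+ *	VNS_S_SHUTTING_DOWN ->
+ *	VNS_S_MULTICAST_PROMISCOFF_SENT ->
+ *	VNS_S_SAP_PROMISCOFF_SENT ->
+ *	VNS_S_UNBIND_SENT ->
+ *	VNS_S_ZOMBIE
+ *
+ * where a failed allocation in one of the send routines simply
+ * advances the state and loops to process the next step immediately.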
+ */ +loop: + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, + VNS_S_FIXUP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_FIXUP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, + VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_MULTICAST_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_SAP_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_sunbind(vsp) == B_FALSE) + goto loop; + break; + case VNS_S_UNBIND_SENT: + vnd_st_unbind(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +/* ARGSUSED */ +static 
int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version 
= HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + 
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void +vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +/* ARGSUSED */ +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. + */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. 
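+ *
+ * Concretely, the dispatch below is roughly:
+ *
+ *	M_PROTO, M_PCPROTO	queue for the DLPI state machine via the
+ *				taskq, except DL_UNITDATA_*, which is
+ *				dropped
+ *	M_DATA			dropped; data reaches us through the MAC
+ *				direct callback rather than put(9E)
+ *	everything else		passed along via putnext(9F)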
+ */ +static int +vnd_s_rput(queue_t *q, mblk_t *mp) +{ + t_uscalar_t prim; + int dispatch = 0; + vnd_str_t *vsp = q->q_ptr; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + vnd_drop_ctl(vsp, mp, "PROTO message too short"); + break; + } + + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; + if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { + vnd_drop_ctl(vsp, mp, + "received an unsupported dlpi DATA req"); + break; + } + + /* + * Enqueue the entry and fire off a taskq dispatch. + */ + mutex_enter(&vsp->vns_lock); + vnd_dlpi_inc_push(vsp, mp); + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + dispatch = 1; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + } + mutex_exit(&vsp->vns_lock); + if (dispatch != 0) + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, + vsp, 0, &vsp->vns_tqe); + break; + case M_DATA: + vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); + break; + default: + putnext(vsp->vns_rq, mp); + } + return (0); +} + +/* ARGSUSED */ +static void +vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) +{ + int error; + vnd_strioc_t *visp; + + if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || + iocp->ioc_count != TRANSPARENT) { + error = EINVAL; + goto nak; + } + + /* + * All streams ioctls that we support must use kcred as a means to + * distinguish that this is a layered open by the kernel as opposed to + * one by a user who has done an I_PUSH of the module. + */ + if (iocp->ioc_cr != kcred) { + error = EPERM; + goto nak; + } + + if (mp->b_cont == NULL) { + error = EAGAIN; + goto nak; + } + + visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); + ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); + visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; + visp->vs_state = VSS_COPYIN; + + mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); + qreply(q, mp); + + return; + +nak: + if (mp->b_cont != NULL) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + + iocp->ioc_error = error; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); +} + +static void +vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + vnd_str_state_t state; + struct copyreq *crp; + vnd_strioc_associate_t *vss; + vnd_dev_t *vdp = NULL; + vnd_pnsd_t *nsp = NULL; + char iname[2*VND_NAMELEN]; + zone_t *zone; + vnd_strioc_t *visp; + + visp = (vnd_strioc_t *)csp->cp_private; + + /* If it's not ours, it's not our problem */ + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* Data is sitting for us in b_cont */ + if (mp->b_cont == NULL || + MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { + kmem_free(visp, sizeof (vnd_strioc_t)); + miocnak(q, mp, 0, EINVAL); + return; + } + + vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; + vdp = vnd_dev_lookup(vss->vsa_minor); + if (vdp == NULL) { + vss->vsa_errno = VND_E_NODEV; + goto nak; + } + + nsp = vnd_nsd_lookup(vss->vsa_nsid); + if (nsp == NULL) { + vss->vsa_errno = VND_E_NONETSTACK; + goto nak; + } + + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { + mutex_exit(&vsp->vns_lock); + vss->vsa_errno = 
VND_E_ASSOCIATED; + goto nak; + } + + vsp->vns_nsd = nsp; + vsp->vns_flags &= ~VNS_F_NEED_ZONE; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + mutex_exit(&vsp->vns_lock); + + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, + &vsp->vns_tqe); + + + /* At this point we need to wait until we have transitioned to ONLINE */ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + state = vsp->vns_state; + mutex_exit(&vsp->vns_lock); + + if (state == VNS_S_ZOMBIE) { + vss->vsa_errno = vsp->vns_errno; + goto nak; + } + + mutex_enter(&vdp->vdd_lock); + mutex_enter(&vsp->vns_lock); + VERIFY(vdp->vdd_str == NULL); + /* + * Now initialize the remaining kstat properties and let's go ahead and + * create it. + */ + (void) snprintf(iname, sizeof (iname), "z%d_%d", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); + vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", + KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (vsp->vns_kstat == NULL) { + vss->vsa_errno = VND_E_KSTATCREATE; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + vdp->vdd_str = vsp; + vsp->vns_dev = vdp; + + /* + * Now, it's time to do the last thing that can fail, changing out the + * input function. After this we know that we can receive data, so we + * should make sure that we're ready. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { + vss->vsa_errno = VND_E_DIRECTFAIL; + vdp->vdd_str = NULL; + vsp->vns_dev = NULL; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + + zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); + ASSERT(zone != NULL); + vsp->vns_kstat->ks_data = &vsp->vns_ksdata; + /* Account for zone name */ + vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; + /* Account for eventual link name */ + vsp->vns_kstat->ks_data_size += VND_NAMELEN; + kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + zone_rele(zone); + kstat_install(vsp->vns_kstat); + + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + + /* + * Note that the vnd_str_t does not keep a permanent hold on the + * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what + * the netstack goes through to take care of everything. + */ + vss->vsa_errno = VND_E_SUCCESS; +nak: + if (vdp != NULL) + vnd_dev_rele(vdp); + if (nsp != NULL) + vnd_nsd_rele(nsp); + /* + * Change the copyin request to a copyout. Note that we can't use + * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's + * okay, as the copyin vs. copyout is basically the same.
+ */ + DB_TYPE(mp) = M_COPYOUT; + visp->vs_state = VSS_COPYOUT; + crp = (struct copyreq *)mp->b_rptr; + crp->cq_private = (void *)visp; + crp->cq_addr = visp->vs_addr; + crp->cq_size = sizeof (vnd_strioc_associate_t); + qreply(q, mp); +} + +static void +vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + ASSERT(csp->cp_private != NULL); + kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); + return; + } + + /* Ack and let's be done with it all */ + miocack(q, mp, 0, 0); +} + +static int +vnd_s_wput(queue_t *q, mblk_t *mp) +{ + vnd_str_t *vsp = q->q_ptr; + struct copyresp *crp; + vnd_strioc_state_t vstate; + vnd_strioc_t *visp; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); + return (0); + case M_IOCDATA: + crp = (struct copyresp *)mp->b_rptr; + ASSERT(crp->cp_private != NULL); + visp = (vnd_strioc_t *)crp->cp_private; + vstate = visp->vs_state; + ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); + if (vstate == VSS_COPYIN) + vnd_striocdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + else + vnd_stroutdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + return (0); + default: + break; + } + if (q->q_next != NULL) + putnext(q, mp); + else + vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) +{ + vnd_str_t *vsp; + uint_t rand; + + if (q->q_ptr != NULL) + return (EINVAL); + + if (!(sflag & MODOPEN)) + return (ENXIO); + + if (credp != kcred) + return (EPERM); + + vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); + bzero(vsp, sizeof (*vsp)); + mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); + vsp->vns_state = VNS_S_INITIAL; + + mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&vnd_dev_lock); + vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_read.vdq_vns = vsp; + vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_write.vdq_vns = vsp; + mutex_exit(&vnd_dev_lock); + vsp->vns_rq = q; + vsp->vns_wq = WR(q); + q->q_ptr = WR(q)->q_ptr = vsp; + vsp->vns_flags = VNS_F_NEED_ZONE; + vsp->vns_nflush = vnd_flush_nburst; + vsp->vns_bsize = vnd_flush_burst_size; + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); + + /* + * We create our kstat and initialize all of its fields now, but we + * don't install it until we actually do the zone association so we can + * get everything.
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. + */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. 
+ * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
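+ *
+ * A sketch of the second case, with time increasing downward:
+ *
+ *	tx thread (here)		MAC callback thread
+ *	----------------		-------------------
+ *	txtime = gethrtime();
+ *	vsc_tx_f() returns a cookie
+ *					vnd_mac_flow_control()
+ *					vns_fcupdate = gethrtime();
+ *	mutex_enter(&vns_lock);
+ *	vns_fcupdate > txtime, so the
+ *	blockage has already lifted and
+ *	we only record the statistics.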
+ */ + if (vc != NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return (NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return (NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. + */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + ht_begin_unsafe(); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != NULL) { + blocked = B_TRUE; + break; + } + } + + ht_end_unsafe(); + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. + */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. 
Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. + */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. + */ +static int +vnd_validate_name(const char *buf, size_t buflen) +{ + int i, len; + + /* First make sure a null terminator exists */ + for (i = 0; i < buflen; i++) + if (buf[i] == '\0') + break; + len = i; + if (i == 0 || i == buflen) + return (0); + + for (i = 0; i < len; i++) + if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && + buf[i] != '_') + return (0); + + return (1); +} + +static int +vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) +{ + vnd_ioc_attach_t via; + vnd_strioc_associate_t vss; + vnd_pnsd_t *nsp; + zone_t *zonep; + zoneid_t zid; + char buf[2*VND_NAMELEN]; + int ret, rp; + + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) + return (EFAULT); + via.via_errno = VND_E_SUCCESS; + + if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + + /* + * Only the global zone can request to create a device in a different + * zone. + */ + zid = crgetzoneid(credp); + if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && + zid != via.via_zoneid) { + via.via_errno = VND_E_PERM; + ret = EIO; + goto errcopyout; + } + + if (via.via_zoneid == -1) + via.via_zoneid = zid; + + /* + * Establish the name we'll use now. We want to be extra paranoid about + * the device we're opening so check that now. + */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. 
+ */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. + */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. + */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. + */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. 
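+ * Concretely, those are the hold taken by the vnd_nsd_lookup_by_zid()
+ * call above and the additional hold we took with vnd_nsd_ref() before
+ * starting the ldi work.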
+ */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakingly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. + */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + ret = EFAULT; + return (ret); +} + +/* + * Common unlink function. 
This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. + */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, 
(void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +/* ARGSUSED */ +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + 
frameio_free(fio);
+ return (EWOULDBLOCK);
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ }
+
+ ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+ &nwrite, mode & FKIOCTL);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ while (nwrite > 0) {
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+ nwrite--;
+ }
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+
+ return (0);
+}
+
+static int
+vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ frameio_t *fio;
+ int ret, nonblock, nframes, i, nread;
+ size_t maxwrite, minwrite, total, flen;
+ mblk_t *mp_chain, *mp, *nmp;
+ vnd_data_queue_t *vqp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ /*
+ * Make sure no single frame is larger than we can accept.
+ */
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ minwrite = vdp->vdd_str->vns_minwrite;
+ maxwrite = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
+ nframes = fio->fio_nvecs / fio->fio_nvpf;
+ total = 0;
+ for (i = 0; i < nframes; i++) {
+ flen = frameio_frame_length(fio,
+ &fio->fio_vecs[i*fio->fio_nvpf]);
+ if (flen < minwrite || flen > maxwrite) {
+ frameio_free(fio);
+ return (ERANGE);
+ }
+ total += flen;
+ }
+
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, total) == 0) {
+ if (nonblock != 0) {
+ frameio_free(fio);
+ mutex_exit(&vqp->vdq_lock);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+ * We've reserved our space, let's copyin and go from here.
+ */
+ ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+ if (ret != 0) {
+ frameio_free(fio);
+ vnd_dq_unreserve(vqp, total);
+ cv_broadcast(&vqp->vdq_ready);
+ pollwakeup(&vdp->vdd_ph, POLLOUT);
+ return (ret);
+ }
+
+ for (mp = mp_chain; mp != NULL; mp = nmp) {
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+ }
+
+ /*
+ * Update the frameio structure to indicate that we wrote those frames.
+ */
+ frameio_mark_consumed(fio, nread);
+ ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+ frameio_free(fio);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+{
+ const char *link;
+ uint32_t vers = 1;
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ /*
+ * Copy all of the members out to userland.
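+ * Each member is pushed out with its own ddi_copyout() call: arg is a
+ * pointer into the caller's vnd_ioc_info_t, and passing mode & FKIOCTL
+ * lets the same code serve both user ioctl callers and kernel (FKIOCTL)
+ * callers.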
+ */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +/* ARGSUSED */ +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. 
+ */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
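+ * In practice that means the device must be linked, its zone must not
+ * be dying, and a caller outside the global zone must be opening a
+ * device that belongs to its own zone; anything else is reported as
+ * ENOENT below.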
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up new a state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. */ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio. 
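+ * Callers that want vectored, multi-frame I/O are expected to use the
+ * VND_IOC_FRAMEIO_READ and VND_IOC_FRAMEIO_WRITE ioctls instead.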
+ */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +/* ARGSUSED */ +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
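+ * In either case the frame is unusable, so we hand back the queue space
+ * we reserved, wake anything waiting on that space, and fail the write
+ * with ENOSR.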
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + short ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if ((ready == 0 && !anyyet) || (events & POLLET)) { + *phpp = &vdp->vdd_ph; + } + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); +} + +/* ARGSUSED */ +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. 
Note that as + * part of unlinking we end up doing a rele of the vnd_dev_t. If + * this is the final hold on the vnd_dev_t then it might try and + * remove itself. Our locking rules requires not to be holding + * any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + * it with the vnd_dev_t->vdd_lock held and will take care of it + * for us. Because we don't have a hold on it, we're done at + * this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +/* ARGSUSED */ +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename and + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack. 
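+ * For example, given a hypothetical VCHR node at
+ * /dev/vnd/zone/myzone/net0, we strip the trailing "/net0" component
+ * and then take the last remaining component, "myzone", as the name of
+ * the zone whose netstack we look up.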
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + vnd_dev_t *vdp; + minor_t minor; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + if (sdev_ctx_minor(ctx, &minor) != 0) + return (SDEV_VTOR_STALE); + + vdp = vnd_dev_lookup(minor); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +/* ARGSUSED */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, + VND_SDEV_MODE, vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. 
+ */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. + */ + ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
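+ * vnd_dip is the single global devinfo handle that the rest of the
+ * driver (minor node creation, the sdev plugin, vnd_info()) relies on,
+ * so a second attach would have nothing sensible to bind to.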
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +/* ARGSUSED */ +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); +} + +static int +vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + 
netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + &vnd_minfo, + NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. + */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index d671153967..e532a551e7 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -354,7 +354,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, rw_enter(&vnic_lock, RW_WRITER); - /* does a VNIC with the same id already exist? */ + /* Does a VNIC with the same id already exist? */ err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id), (mod_hash_val_t *)&vnic); if (err == 0) { @@ -370,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, bzero(vnic, sizeof (*vnic)); + vnic->vn_ls = LINK_STATE_UNKNOWN; vnic->vn_id = vnic_id; vnic->vn_link_id = linkid; vnic->vn_vrid = vrid; @@ -455,6 +456,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, } else { vnic->vn_hcksum_txflags = 0; } + + /* + * Check for LSO capabilities. LSO implementations + * depend on hardware checksumming, so the same + * requirement is enforced here. + */ + if (vnic->vn_hcksum_txflags != 0) { + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO, + &vnic->vn_cap_lso)) { + vnic->vn_cap_lso.lso_flags = 0; + } + } else { + vnic->vn_cap_lso.lso_flags = 0; + } } /* register with the MAC module */ @@ -580,11 +595,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, vnic->vn_enabled = B_TRUE; if (is_anchor) { - mac_link_update(vnic->vn_mh, LINK_STATE_UP); + vnic->vn_ls = LINK_STATE_UP; } else { - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); } + mac_link_update(vnic->vn_mh, vnic->vn_ls); rw_exit(&vnic_lock); @@ -824,6 +840,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_PARTIAL); break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (vnic->vn_cap_lso.lso_flags == 0) { + return (B_FALSE); + } + *cap_lso = vnic->vn_cap_lso; + break; + } case MAC_CAPAB_VNIC: { mac_capab_vnic_t *vnic_capab = cap_data; @@ -1092,6 +1117,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = vnic_set_secondary_macs(vn, &msa); break; } + case MAC_PROP_PRIVATE: { + long val, i; + const char *v; + + if (vn->vn_link_id != DATALINK_INVALID_LINKID || + strcmp(pr_name, "_linkstate") != 0) { + err = ENOTSUP; + break; + } + + for (v = pr_val, i = 0; i < pr_valsize; i++, v++) { + if (*v == '\0') + break; + } + if (i == pr_valsize) { + err = EINVAL; + break; + } + + (void) ddi_strtol(pr_val, (char **)NULL, 0, &val); + if (val != LINK_STATE_UP && val != LINK_STATE_DOWN) { + err = EINVAL; + break; + } + vn->vn_ls = val; + mac_link_update(vn->vn_mh, vn->vn_ls); + break; + } default: err = ENOTSUP; break; @@ -1117,6 +1170,18 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_SECONDARY_ADDRS: ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val); break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + ret = EINVAL; + break; + } + + if (strcmp(pr_name, "_linkstate") != 0) { + ret = EINVAL; + break; + } + (void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls); + break; default: ret = ENOTSUP; break; @@ -1126,7 +1191,8 @@ vnic_m_getprop(void 
*arg, const char *pr_name, mac_prop_id_t pr_num, } /* ARGSUSED */ -static void vnic_m_propinfo(void *m_driver, const char *pr_name, +static void +vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { vnic_t *vn = m_driver; @@ -1169,6 +1235,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_perim_exit(mph); } break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) + break; + + if (strcmp(pr_name, "_linkstate") == 0) { + char buf[16]; + + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + (void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls); + mac_prop_info_set_default_str(prh, buf); + } + break; } } @@ -1241,8 +1319,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type) break; case MAC_NOTE_LINK: - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); + mac_link_update(vnic->vn_mh, vnic->vn_ls); break; default: diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..2da310ab8d --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,1154 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. + * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. + * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + * + * The driver can also act as a multiplexer so that data written to the + * slave side within the zone is also redirected back to another zfd device + * inside the zone for consumption (i.e. it can be read). The intention is + * that a logging process within the zone can consume data that is being + * written by an application onto the primary stream. This is essentially + * a tee off of the primary stream into a log stream. This tee can also be + * configured to be flow controlled via an ioctl. 
Flow control happens on the + * primary stream and is used to ensure that the log stream receives all of + * the messages off the primary stream when consumption of the data off of + * the log stream gets behind. Configuring for flow control implies that the + * application writing to the primary stream will be blocked when the log + * consumer gets behind. Note that closing the log stream (e.g. when the zone + * halts) will cause the loss of all messages queued in the stream. + * + * The zone's zfd device configuration is driven by zoneadmd and a zone mode. + * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat + * of a misnomer since its purpose has evolved. The attribute can have a + * variety of values, but the lowest two positions are used to control how many + * zfd devices are created inside the zone and if the primary stream is a tty. + * + * Here is a summary of how the 4 modes control what zfd devices are created + * and how they're used: + * + * t-: 1 stdio zdev (0) configured as a tty + * --: 3 stdio zdevs (0, 1, 2), not configured as a tty + * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1) + * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4) + * + * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex + * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are + * autopushed onto the stream when the slave side is opened. There is only a + * single zfd dev (0) needed for the primary stream. + * + * When the 'n' flag is set, it is assumed that output logging will be done + * within the zone itself. In this configuration 1 or 2 additional zfd devices, + * depending on tty mode ('t' flag) are created within the zone. An application + * can then configure the zfd streams driver into a multiplexer. Output from + * the stdout/stderr zfd(s) will be teed into the correspond logging zfd(s) + * within the zone. + * + * The following is a diagram of how this works for a '-n' configuration: + * + * + * zoneadmd (for zlogin -I stdout) + * GZ: ^ + * | + * -------------------------- + * ^ + * NGZ: | + * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout) + * + * There would be a similar path for the app's stderr into zfd4 for the logger + * to consume stderr. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> +#include <sys/sdt.h> + +static kmutex_t zfd_mux_lock; + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static void zfd_wput(queue_t *, mblk_t *); +static void zfd_rsrv(queue_t *); +static void zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. 
The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. + */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. + */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + (int (*)()) zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + (int (*)()) zfd_wput, + (int (*)()) zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. + */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef enum { + ZFD_NO_MUX, + ZFD_PRIMARY_STREAM, + ZFD_LOG_STREAM +} zfd_mux_type_t; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; /* instance info */ + queue_t *zfd_master_rdq; /* GZ read queue */ + queue_t *zfd_slave_rdq; /* in-zone read queue */ + int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */ + int zfd_tty; /* ZFD_MAKETTY - strm mods will push */ + boolean_t zfd_is_flowcon; /* primary stream flow stopped */ + boolean_t zfd_allow_flowcon; /* use flow control */ + zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */ + struct zfd_state *zfd_inst_pri; /* log state's primary ptr */ + struct zfd_state *zfd_inst_log; /* primary state's log ptr */ +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of STREAMS modules that are autopushed onto a slave instance when its + * opened, but only if the ZFD_MAKETTY ioctl has first been received by the + * master. 
+ */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL); + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + mutex_destroy(&zfd_mux_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. + */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + zfds->zfd_muxt = ZFD_NO_MUX; + zfds->zfd_inst_log = NULL; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. + * e.g.: given the slave's write queue, return the master's write queue. 
+ */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + /* A log stream is read-only */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + (oflag & (FREAD | FWRITE)) != FREAD) + return (EINVAL); + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. 
A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. + */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. + */ +static int +zfd_open(queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + int instance = ZFD_INSTANCE(*devp); + int ret; + zfd_state_t *zfds; + + if (sflag != 0) + return (EINVAL); + + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (ENXIO); + + switch (ZFD_NODE(*devp)) { + case ZFD_MASTER_MINOR: + ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp); + break; + case ZFD_SLAVE_MINOR: + ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp); + /* + * If we just opened the log stream and flow control has + * been enabled, we want to make sure the primary stream can + * start flowing. + */ + if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + break; + default: + ret = ENXIO; + break; + } + + return (ret); +} + +/* + * close(9e) entrypoint. 
+ */ +/*ARGSUSED1*/ +static int +zfd_close(queue_t *rqp, int flag, cred_t *credp) +{ + queue_t *wqp; + mblk_t *bp; + zfd_state_t *zfds; + major_t major; + minor_t minor; + + zfds = (zfd_state_t *)rqp->q_ptr; + + if (rqp == zfds->zfd_master_rdq) { + DBG("Closing master side"); + + zfds->zfd_master_rdq = NULL; + zfds->zfd_state &= ~ZFD_STATE_MOPEN; + + /* + * qenable slave side write queue so that it can flush + * its messages as master's read queue is going away + */ + if (zfds->zfd_slave_rdq != NULL) { + qenable(WR(zfds->zfd_slave_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + } else if (rqp == zfds->zfd_slave_rdq) { + + DBG("Closing slave side"); + zfds->zfd_state &= ~ZFD_STATE_SOPEN; + zfds->zfd_slave_rdq = NULL; + + wqp = WR(rqp); + while ((bp = getq(wqp)) != NULL) { + if (zfds->zfd_master_rdq != NULL) + putnext(zfds->zfd_master_rdq, bp); + else if (bp->b_datap->db_type == M_IOCTL) + miocnak(wqp, bp, 0, 0); + else + freemsg(bp); + } + + /* + * Qenable master side write queue so that it can flush its + * messages as slaves's read queue is going away. + */ + if (zfds->zfd_master_rdq != NULL) + qenable(WR(zfds->zfd_master_rdq)); + + /* + * Qenable primary stream if necessary. + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + if (zfds->zfd_tty == 1) { + /* + * Clear the sad configuration so that reopening + * doesn't fail to set up sad configuration. + */ + major = ddi_driver_major(zfds->zfd_devinfo); + minor = ddi_get_instance(zfds->zfd_devinfo) << 1 | + ZFD_SLAVE_MINOR; + (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor, + NULL, NULL, NULL); + } + } + + return (0); +} + +static void +handle_mflush(queue_t *qp, mblk_t *mp) +{ + mblk_t *nmp; + DBG1("M_FLUSH on %s side", zfd_side(qp)); + + if (*mp->b_rptr & FLUSHW) { + DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp)); + flushq(qp, FLUSHDATA); + *mp->b_rptr &= ~FLUSHW; + if ((*mp->b_rptr & FLUSHR) == 0) { + /* + * FLUSHW only. Change to FLUSHR and putnext other side, + * then we are done. + */ + *mp->b_rptr |= FLUSHR; + if (zfd_switch(RD(qp)) != NULL) { + putnext(zfd_switch(RD(qp)), mp); + return; + } + } else if ((zfd_switch(RD(qp)) != NULL) && + (nmp = copyb(mp)) != NULL) { + /* + * It is a FLUSHRW; we copy the mblk and send + * it to the other side, since we still need to use + * the mblk in FLUSHR processing, below. + */ + putnext(zfd_switch(RD(qp)), nmp); + } + } + + if (*mp->b_rptr & FLUSHR) { + DBG("qreply(qp) turning FLUSHR around\n"); + qreply(qp, mp); + return; + } + freemsg(mp); +} + +/* + * Evaluate the various conditionals to determine if we're teeing into a log + * stream and if the primary stream should be flow controlled. This function + * can set the zfd_is_flowcon flag as a side effect. + * + * When teeing with flow control, we always queue the teed msg here and if + * the queue is getting full, we set zfd_is_flowcon. The primary stream will + * always queue when zfd_is_flowcon and will also not be served when + * zfd_is_flowcon is set. This causes backpressure on the primary stream + * until the teed queue can drain. 
+ */ +static void +zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp) +{ + queue_t *log_qp; + zfd_state_t *log_zfds; + mblk_t *lmp; + + if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM) + return; + + if (type != M_DATA) + return; + + log_zfds = zfds->zfd_inst_log; + if (log_zfds == NULL) + return; + + ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM); + + if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + /* The zfd_slave_rdq is null until the log dev is opened in the zone */ + log_qp = RD(log_zfds->zfd_slave_rdq); + DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds); + + if (!zfds->zfd_allow_flowcon) { + /* + * We're not supposed to tee with flow control and the tee is + * full so we skip teeing into the log stream. + */ + if ((log_qp->q_flag & QFULL) != 0) + return; + } + + /* + * Tee the message into the log stream. + */ + lmp = dupmsg(mp); + if (lmp == NULL) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) { + putnext(log_qp, lmp); + } else { + if (putq(log_qp, lmp) == 0) { + /* The logger queue is full, free the msg. */ + freemsg(lmp); + } + /* + * If we're supposed to tee with flow control and the tee is + * over the high water mark then we want the primary stream to + * stop flowing. We'll stop queueing the primary stream after + * the log stream has drained. + */ + if (zfds->zfd_allow_flowcon && + log_qp->q_count > log_qp->q_hiwat) { + zfds->zfd_is_flowcon = B_TRUE; + } + } +} + +/* + * wput(9E) is symmetric for master and slave sides, so this handles both + * without splitting the codepath. (The only exception to this is the + * processing of zfd ioctls, which is restricted to the master side.) + * + * zfd_wput() looks at the other side; if there is no process holding that + * side open, it frees the message. This prevents processes from hanging + * if no one is holding open the fd. Otherwise, it putnext's high + * priority messages, putnext's normal messages if possible, and otherwise + * enqueues the messages; in the case that something is enqueued, wsrv(9E) + * will take care of eventually shuttling I/O to the other side. + * + * When configured as a multiplexer, then anything written to the stream + * from inside the zone is also teed off to the corresponding log stream + * for consumption within the zone (i.e. the log stream can be read, but never + * written to, by an application inside the zone). + */ +static void +zfd_wput(queue_t *qp, mblk_t *mp) +{ + unsigned char type = mp->b_datap->db_type; + zfd_state_t *zfds; + struct iocblk *iocbp; + boolean_t must_queue = B_FALSE; + + ASSERT(qp->q_ptr); + + DBG1("entering zfd_wput, %s side", zfd_side(qp)); + + /* + * Process zfd ioctl messages if qp is the master side's write queue. + */ + zfds = (zfd_state_t *)qp->q_ptr; + + if (type == M_IOCTL) { + iocbp = (struct iocblk *)(void *)mp->b_rptr; + + switch (iocbp->ioc_cmd) { + case ZFD_MAKETTY: + zfds->zfd_tty = 1; + miocack(qp, mp, 0, 0); + return; + case ZFD_EOF: + if (zfds->zfd_slave_rdq != NULL) + (void) putnextctl(zfds->zfd_slave_rdq, + M_HANGUP); + miocack(qp, mp, 0, 0); + return; + case ZFD_HAS_SLAVE: + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + miocack(qp, mp, 0, 0); + } else { + miocack(qp, mp, 0, ENOTTY); + } + return; + case ZFD_MUX: { + /* + * Setup the multiplexer configuration for the two + * streams. 
+ * + * We expect to be called on the stream that will + * become the log stream and be passed one data block + * with the minor number of the slave side of the + * primary stream. + */ + int to; + int instance; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Get the primary slave minor device number */ + to = *(int *)mp->b_cont->b_rptr; + instance = ZFD_INSTANCE(to); + + if ((prim_zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Disallow changing primary/log once set. */ + mutex_enter(&zfd_mux_lock); + if (zfds->zfd_muxt != ZFD_NO_MUX || + prim_zfds->zfd_muxt != ZFD_NO_MUX) { + mutex_exit(&zfd_mux_lock); + miocack(qp, mp, 0, EINVAL); + return; + } + + zfds->zfd_muxt = ZFD_LOG_STREAM; + zfds->zfd_inst_pri = prim_zfds; + prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM; + prim_zfds->zfd_inst_log = zfds; + mutex_exit(&zfd_mux_lock); + DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds, + void *, zfds); + + miocack(qp, mp, 0, 0); + return; + } + case ZFD_MUX_FLOWCON: { + /* + * We expect this ioctl to be issued against the + * log stream. We don't use the primary stream since + * there can be other streams modules pushed onto that + * stream which would interfere with the ioctl. + */ + int val; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + if (zfds->zfd_muxt != ZFD_LOG_STREAM) { + miocack(qp, mp, 0, EINVAL); + return; + } + prim_zfds = zfds->zfd_inst_pri; + + /* Get the flow control setting */ + val = *(int *)mp->b_cont->b_rptr; + if (val != 0 && val != 1) { + miocack(qp, mp, 0, EINVAL); + return; + } + + prim_zfds->zfd_allow_flowcon = (boolean_t)val; + if (!prim_zfds->zfd_allow_flowcon) + prim_zfds->zfd_is_flowcon = B_FALSE; + + DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds); + miocack(qp, mp, 0, 0); + return; + } + default: + break; + } + } + + /* if on the write side, may need to tee */ + if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) { + /* tee output to any attached log stream */ + zfd_tee_handler(zfds, type, mp); + + /* high-priority msgs are not subject to flow control */ + if (zfds->zfd_is_flowcon && type == M_DATA) + must_queue = B_TRUE; + } + + if (zfd_switch(RD(qp)) == NULL) { + DBG1("wput to %s side (no one listening)", zfd_side(qp)); + switch (type) { + case M_FLUSH: + handle_mflush(qp, mp); + break; + case M_IOCTL: + miocnak(qp, mp, 0, 0); + break; + default: + freemsg(mp); + break; + } + return; + } + + if (type >= QPCTL) { + DBG1("(hipri) wput, %s side", zfd_side(qp)); + switch (type) { + case M_READ: /* supposedly from ldterm? */ + DBG("zfd_wput: tossing M_READ\n"); + freemsg(mp); + break; + case M_FLUSH: + handle_mflush(qp, mp); + break; + default: + /* + * Put this to the other side. + */ + ASSERT(zfd_switch(RD(qp)) != NULL); + putnext(zfd_switch(RD(qp)), mp); + break; + } + DBG1("done (hipri) wput, %s side", zfd_side(qp)); + return; + } + + /* + * If the primary stream has been stopped for flow control then + * enqueue the msg, otherwise only putnext if there isn't already + * something in the queue. If we don't do this then things would wind + * up out of order. + */ + if (!must_queue && qp->q_first == NULL && + bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + putnext(RD(zfd_switch(qp)), mp); + } else { + /* + * zfd_wsrv expects msgs queued on the primary queue. 
Those + * will be handled by zfd_wsrv after zfd_rsrv performs the + * qenable on the proper queue. + */ + (void) putq(qp, mp); + } + + DBG1("done wput, %s side", zfd_side(qp)); +} + +/* + * Read server + * + * For primary stream: + * Under normal execution rsrv(9E) is symmetric for master and slave, so + * zfd_rsrv() can handle both without splitting up the codepath. We do this by + * enabling the write side of the partner. This triggers the partner to send + * messages queued on its write side to this queue's read side. + * + * For log stream: + * Internally we've queued up the msgs that we've teed off to the log stream + * so when we're invoked we need to pass these along. + */ +static void +zfd_rsrv(queue_t *qp) +{ + zfd_state_t *zfds; + zfds = (zfd_state_t *)qp->q_ptr; + + /* + * log stream server + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) { + queue_t *log_qp; + mblk_t *mp; + + log_qp = RD(zfds->zfd_slave_rdq); + + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + zfd_state_t *pzfds = zfds->zfd_inst_pri; + + while ((mp = getq(qp)) != NULL) { + if (bcanputnext(log_qp, mp->b_band)) { + putnext(log_qp, mp); + } else { + (void) putbq(log_qp, mp); + break; + } + } + + if (log_qp->q_count < log_qp->q_lowat) { + DTRACE_PROBE(zfd__flow__on); + pzfds->zfd_is_flowcon = B_FALSE; + if (pzfds->zfd_master_rdq != NULL) + qenable(RD(pzfds->zfd_master_rdq)); + } + } else { + /* No longer open, drain the queue */ + while ((mp = getq(qp)) != NULL) { + freemsg(mp); + } + flushq(qp, FLUSHALL); + } + return; + } + + /* + * Care must be taken here, as either of the master or slave side + * qptr could be NULL. + */ + ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq); + if (zfd_switch(qp) == NULL) { + DBG("zfd_rsrv: other side isn't listening\n"); + return; + } + qenable(WR(zfd_switch(qp))); +} + +/* + * Write server + * + * This routine is symmetric for master and slave, so it handles both without + * splitting up the codepath. + * + * If there are messages on this queue that can be sent to the other, send + * them via putnext(). Else, if queued messages cannot be sent, leave them + * on this queue. + */ +static void +zfd_wsrv(queue_t *qp) +{ + queue_t *swq; + mblk_t *mp; + zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr; + + ASSERT(zfds != NULL); + + /* + * Partner has no read queue, so take the data, and throw it away. + */ + if (zfd_switch(RD(qp)) == NULL) { + DBG("zfd_wsrv: other side isn't listening"); + while ((mp = getq(qp)) != NULL) { + if (mp->b_datap->db_type == M_IOCTL) + miocnak(qp, mp, 0, 0); + else + freemsg(mp); + } + flushq(qp, FLUSHALL); + return; + } + + swq = RD(zfd_switch(qp)); + + /* + * while there are messages on this write queue... + */ + while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) { + /* + * Due to the way zfd_wput is implemented, we should never + * see a high priority control message here. + */ + ASSERT(mp->b_datap->db_type < QPCTL); + + if (bcanputnext(swq, mp->b_band)) { + putnext(swq, mp); + } else { + (void) putbq(qp, mp); + break; + } + } +} |
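
The LSO change in vnic_dev_create() above boils down to one rule: the VNIC only advertises MAC_CAPAB_LSO when the lower MAC offers hardware checksum offload, because LSO depends on it. The sketch below is a condensed restatement of that gating under invented names (sketch_lso_t, sketch_probe_lso); it is not the MAC framework's types or API.

typedef struct sketch_lso {
	unsigned int	lso_flags;	/* zero means "do not advertise LSO" */
	unsigned int	lso_max;	/* largest segment the lower MAC accepts */
} sketch_lso_t;

static void
sketch_probe_lso(unsigned int hcksum_txflags, int lower_has_lso,
    const sketch_lso_t *lower_lso, sketch_lso_t *out)
{
	if (hcksum_txflags != 0 && lower_has_lso) {
		*out = *lower_lso;	/* inherit the lower MAC's limits */
	} else {
		out->lso_flags = 0;	/* no hw checksum (or no LSO): hide the capability */
		out->lso_max = 0;
	}
}

With lso_flags left at zero, the capability callback above simply returns B_FALSE for MAC_CAPAB_LSO, so clients never see the capability on such a VNIC.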
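
The new private "_linkstate" property is only honored on anchor VNICs (vn_link_id equal to DATALINK_INVALID_LINKID) and only accepts the numeric link states for up and down. The user-level sketch below mirrors the validation done in vnic_m_setprop(): the value must be NUL-terminated within the supplied size and must decode to one of the two allowed states. The function name and the SKETCH_LINK_* constants are assumptions chosen to match those checks, not code from the driver.

#include <stdlib.h>
#include <errno.h>

enum { SKETCH_LINK_DOWN = 0, SKETCH_LINK_UP = 1 };	/* assumed to mirror link_state_t */

static int
sketch_validate_linkstate(const char *pr_val, size_t pr_valsize, long *out)
{
	size_t i;
	long val;

	/* Reject a value that is not NUL-terminated within the buffer. */
	for (i = 0; i < pr_valsize; i++) {
		if (pr_val[i] == '\0')
			break;
	}
	if (i == pr_valsize)
		return (EINVAL);

	val = strtol(pr_val, NULL, 0);
	if (val != SKETCH_LINK_UP && val != SKETCH_LINK_DOWN)
		return (EINVAL);

	*out = val;
	return (0);
}

A value that passes this check is stored in vn_ls and pushed to clients via mac_link_update(), which is how an anchor VNIC's link can now be forced up or down administratively.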
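
The zlog-mode table in the zfd block comment above ("t-", "--", "tn", "-n") maps onto a small amount of arithmetic: the 't' position decides whether stdio is muxed onto one tty stream or split across three devices, and the 'n' position adds the in-zone log devices. The sketch below restates that table; zfd_mode_t and sketch_parse_zlog_mode() are invented names, not zoneadmd code.

typedef struct zfd_mode {
	int	zm_tty;		/* primary stream is a tty */
	int	zm_stdio_devs;	/* zfd devices used for stdin/out/err */
	int	zm_extra_devs;	/* additional in-zone log devices */
} zfd_mode_t;

static void
sketch_parse_zlog_mode(const char *mode, zfd_mode_t *zm)
{
	zm->zm_tty = (mode[0] == 't');
	zm->zm_stdio_devs = zm->zm_tty ? 1 : 3;	/* tty muxes all three onto zfd0 */
	zm->zm_extra_devs = 0;
	if (mode[1] == 'n')			/* in-zone logging requested */
		zm->zm_extra_devs = zm->zm_tty ? 1 : 2;
}

For example, "-n" yields three stdio devices plus two log devices, matching the '-n' diagram in the comment above.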
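
The ZFD_INSTANCE() and ZFD_NODE() macros above pack the instance and the master/slave selector into the minor number: the low bit picks the side, the remaining bits carry the instance. A worked example in plain arithmetic (no DDI calls), under invented names:

static unsigned int
sketch_make_minor(unsigned int instance, unsigned int is_slave)
{
	return ((instance << 1) | (is_slave & 0x01));
}

static unsigned int
sketch_minor_instance(unsigned int minor)
{
	return (minor >> 1);		/* what ZFD_INSTANCE() computes */
}

static unsigned int
sketch_minor_node(unsigned int minor)
{
	return (minor & 0x01);		/* what ZFD_NODE() computes: 0 = master, 1 = slave */
}

So instance 3's slave node is minor 7, and minor 7 decodes back to instance 3, slave side, which is exactly how zfd_attach() builds its two minor nodes.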
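
zfd_tee_handler() above expresses the tee policy in terms of STREAMS queues and the QFULL flag. The sketch below models the same policy on plain counters so the outcomes are easier to see: without flow control a full log queue just means the message is not teed, while with flow control the message is always teed and the primary stream is throttled once the log queue passes its high water mark. Every name here is invented, and the counter comparison stands in for the real queue-full test.

typedef struct tee_decision {
	int	td_tee;		/* copy the message onto the log queue */
	int	td_throttle;	/* stop serving the primary stream */
} tee_decision_t;

static tee_decision_t
sketch_tee_policy(int allow_flowcon, int log_open,
    unsigned int log_qcount, unsigned int log_hiwat)
{
	tee_decision_t td = { 0, 0 };

	if (!log_open) {
		/* Nothing to tee into; with flow control on, hold the primary. */
		td.td_throttle = allow_flowcon;
		return (td);
	}

	if (!allow_flowcon) {
		/* Best effort: skip the tee when the log queue is full. */
		td.td_tee = (log_qcount < log_hiwat);
		return (td);
	}

	/* Flow controlled: always tee, throttle past the high water mark. */
	td.td_tee = 1;
	td.td_throttle = (log_qcount > log_hiwat);
	return (td);
}

Throttling here is what makes the "no lost log messages" guarantee possible: the application writing the primary stream blocks until zfd_rsrv() drains the log queue below its low water mark and re-enables the primary.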
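
The ZFD_MUX ioctl pairs a log stream with a primary stream exactly once, under zfd_mux_lock, and refuses to re-link either side. The sketch below captures that pairing rule; it uses pthreads and invented type names purely so it is self-contained, and is not the driver's code.

#include <pthread.h>
#include <errno.h>

typedef enum { MUX_NONE, MUX_PRIMARY, MUX_LOG } mux_type_t;

typedef struct sketch_zfd {
	mux_type_t		sz_muxt;
	struct sketch_zfd	*sz_peer;	/* primary <-> log cross pointers */
} sketch_zfd_t;

static pthread_mutex_t sketch_mux_lock = PTHREAD_MUTEX_INITIALIZER;

static int
sketch_mux_link(sketch_zfd_t *logside, sketch_zfd_t *primary)
{
	(void) pthread_mutex_lock(&sketch_mux_lock);
	if (logside->sz_muxt != MUX_NONE || primary->sz_muxt != MUX_NONE) {
		(void) pthread_mutex_unlock(&sketch_mux_lock);
		return (EINVAL);	/* changing an existing pairing is not allowed */
	}
	logside->sz_muxt = MUX_LOG;
	logside->sz_peer = primary;
	primary->sz_muxt = MUX_PRIMARY;
	primary->sz_peer = logside;
	(void) pthread_mutex_unlock(&sketch_mux_lock);
	return (0);
}

Making the link immutable keeps the tee path lock-free: once zfd_inst_log and zfd_inst_pri are set they never change, so zfd_tee_handler() can follow them without taking zfd_mux_lock.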
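
Finally, zfd_wput() only bypasses its queue when nothing is already queued and the receiving side can accept the message; anything else is enqueued so that messages can never pass one another, and a flow-controlled primary stream is always enqueued. A minimal restatement of that rule, with invented names:

typedef enum { FWD_PUTNEXT, FWD_ENQUEUE } fwd_action_t;

static fwd_action_t
sketch_forward_policy(int flow_controlled, int queue_empty, int peer_can_put)
{
	if (!flow_controlled && queue_empty && peer_can_put)
		return (FWD_PUTNEXT);	/* fast path: nothing queued, receiver has room */
	return (FWD_ENQUEUE);		/* zfd_wsrv()/zfd_rsrv() drain it later, in order */
}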