Diffstat (limited to 'usr/src/uts/sun4v/io/vnet.c')
-rw-r--r--  usr/src/uts/sun4v/io/vnet.c  1408
1 file changed, 1345 insertions(+), 63 deletions(-)
diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c
index 32b67b2588..884665b77f 100644
--- a/usr/src/uts/sun4v/io/vnet.c
+++ b/usr/src/uts/sun4v/io/vnet.c
@@ -40,6 +40,8 @@
#include <sys/dlpi.h>
#include <net/if.h>
#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
@@ -75,11 +77,38 @@ static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
#ifdef VNET_IOC_DEBUG
static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
#endif
+static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
+static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
+ const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
+static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
+ mac_group_info_t *infop, mac_group_handle_t handle);
+static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
+static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
+static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
+static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
+static int vnet_ring_enable_intr(void *arg);
+static int vnet_ring_disable_intr(void *arg);
+static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
+static int vnet_addmac(void *arg, const uint8_t *mac_addr);
+static int vnet_remmac(void *arg, const uint8_t *mac_addr);
/* vnet internal functions */
static int vnet_unattach(vnet_t *vnetp);
+static void vnet_ring_grp_init(vnet_t *vnetp);
+static void vnet_ring_grp_uninit(vnet_t *vnetp);
static int vnet_mac_register(vnet_t *);
static int vnet_read_mac_address(vnet_t *vnetp);
+static int vnet_bind_vgenring(vnet_res_t *vresp);
+static void vnet_unbind_vgenring(vnet_res_t *vresp);
+static int vnet_bind_hwrings(vnet_t *vnetp);
+static void vnet_unbind_hwrings(vnet_t *vnetp);
+static int vnet_bind_rings(vnet_res_t *vresp);
+static void vnet_unbind_rings(vnet_res_t *vresp);
+static int vnet_hio_stat(void *, uint_t, uint64_t *);
+static int vnet_hio_start(void *);
+static void vnet_hio_stop(void *);
+static void vnet_hio_notify_cb(void *arg, mac_notify_type_t type);
+mblk_t *vnet_hio_tx(void *, mblk_t *);
/* Forwarding database (FDB) routines */
static void vnet_fdb_create(vnet_t *vnetp);
@@ -98,6 +127,8 @@ static void vnet_stop_resources(vnet_t *vnetp);
static void vnet_dispatch_res_task(vnet_t *vnetp);
static void vnet_res_start_task(void *arg);
static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
+static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
+static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
/* Exported to vnet_gen */
int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
@@ -112,15 +143,21 @@ static void vnet_hio_destroy_kstats(kstat_t *ksp);
/* Exported to vnet_dds */
int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
+int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
+void vnet_hio_mac_cleanup(vnet_t *vnetp);
/* Externs that are imported from vnet_gen */
extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
const uint8_t *macaddr, void **vgenhdl);
+extern int vgen_init_mdeg(void *arg);
extern void vgen_uninit(void *arg);
extern int vgen_dds_tx(void *arg, void *dmsg);
extern void vgen_mod_init(void);
extern int vgen_mod_cleanup(void);
extern void vgen_mod_fini(void);
+extern int vgen_enable_intr(void *arg);
+extern int vgen_disable_intr(void *arg);
+extern mblk_t *vgen_poll(void *arg, int bytes_to_pickup);
/* Externs that are imported from vnet_dds */
extern void vdds_mod_init(void);
@@ -131,6 +168,9 @@ extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
extern void vdds_cleanup_hybrid_res(void *arg);
extern void vdds_cleanup_hio(vnet_t *vnetp);
+/* Externs imported from mac_impl */
+extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);
+
#define DRV_NAME "vnet"
#define VNET_FDBE_REFHOLD(p) \
{ \
@@ -145,9 +185,9 @@ extern void vdds_cleanup_hio(vnet_t *vnetp);
}
#ifdef VNET_IOC_DEBUG
-#define VNET_M_CALLBACK_FLAGS (MC_IOCTL)
+#define VNET_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
#else
-#define VNET_M_CALLBACK_FLAGS (0)
+#define VNET_M_CALLBACK_FLAGS (MC_GETCAPAB)
#endif
static mac_callbacks_t vnet_m_callbacks = {
@@ -157,9 +197,23 @@ static mac_callbacks_t vnet_m_callbacks = {
vnet_m_stop,
vnet_m_promisc,
vnet_m_multicst,
- vnet_m_unicst,
- vnet_m_tx,
+ NULL, /* m_unicst entry must be NULL while rx rings are exposed */
+ NULL, /* m_tx entry must be NULL while tx rings are exposed */
vnet_m_ioctl,
+ vnet_m_capab,
+ NULL
+};
+
+static mac_callbacks_t vnet_hio_res_callbacks = {
+ 0,
+ vnet_hio_stat,
+ vnet_hio_start,
+ vnet_hio_stop,
+ NULL,
+ NULL,
+ NULL,
+ vnet_hio_tx,
+ NULL,
NULL,
NULL
};
@@ -176,6 +230,9 @@ uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */
uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT; /* tx timeout in msec */
uint32_t vnet_ldc_mtu = VNET_LDC_MTU; /* ldc mtu */
+/* Configure tx serialization in mac layer for the vnet device */
+boolean_t vnet_mac_tx_serialize = B_TRUE;
+
/*
* Set this to non-zero to enable additional internal receive buffer pools
* based on the MTU of the device for better performance at the cost of more
@@ -206,6 +263,11 @@ static struct ether_addr etherbroadcastaddr = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
+/* mac_open() retry delay in usec */
+uint32_t vnet_mac_open_delay = 100; /* 0.1 ms */
+
+/* max # of mac_open() retries */
+uint32_t vnet_mac_open_retries = 100;
/*
* Property names
@@ -375,6 +437,9 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
attach_progress |= AST_vnet_alloc;
+ vnet_ring_grp_init(vnetp);
+ attach_progress |= AST_ring_init;
+
status = vdds_init(vnetp);
if (status != 0) {
goto vnet_attach_fail;
@@ -419,10 +484,19 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
attach_progress |= AST_vnet_list;
/*
- * Initialize the generic vnet plugin which provides
- * communication via sun4v LDC (logical domain channel) based
- * resources. It will register the LDC resources as and when
- * they become available.
+ * Initialize the generic vnet plugin which provides communication via
+ * sun4v LDC (logical domain channel) based resources. This involves 2
+ * steps; first, vgen_init() is invoked to read the various properties
+ * of the vnet device from its MD node (including its mtu which is
+ * needed to mac_register()) and obtain a handle to the vgen layer.
+ * After mac_register() is done and we have a mac handle, we then
+ * invoke vgen_init_mdeg() which registers with the MD event
+ * generator (mdeg) framework to allow LDC resource notifications.
+ * Note: this sequence also allows us to report the correct default #
+ * of pseudo rings (2 TX and 3 RX) in vnet_m_capab() which gets invoked
+ * in the context of mac_register(); and avoids conflicting with
+ * dynamic pseudo rx rings which get added/removed as a result of mdeg
+ * events in vgen.
*/
status = vgen_init(vnetp, reg, vnetp->dip,
(uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
@@ -432,15 +506,19 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
attach_progress |= AST_vgen_init;
- /* register with MAC layer */
status = vnet_mac_register(vnetp);
if (status != DDI_SUCCESS) {
goto vnet_attach_fail;
}
vnetp->link_state = LINK_STATE_UNKNOWN;
-
attach_progress |= AST_macreg;
+ status = vgen_init_mdeg(vnetp->vgenhdl);
+ if (status != DDI_SUCCESS) {
+ goto vnet_attach_fail;
+ }
+ attach_progress |= AST_init_mdeg;
+
vnetp->attach_progress = attach_progress;
DBG1(NULL, "instance(%d) exit\n", instance);
@@ -503,21 +581,25 @@ vnet_unattach(vnet_t *vnetp)
attach_progress = vnetp->attach_progress;
/*
- * Unregister from the gldv3 subsystem. This can fail, in particular
- * if there are still any open references to this mac device; in which
- * case we just return failure without continuing to detach further.
+ * Disable the mac device in the gldv3 subsystem. This can fail, in
+ * particular if there are still any open references to this mac
+ * device; in which case we just return failure without continuing to
+ * detach further.
+ * If it succeeds, we then invoke vgen_uninit() which should unregister
+ * any pseudo rings registered with the mac layer. Note we keep the
+ * AST_macreg flag on, so we can unregister with the mac layer at
+ * the end of this routine.
*/
if (attach_progress & AST_macreg) {
- if (mac_unregister(vnetp->mh) != 0) {
+ if (mac_disable(vnetp->mh) != 0) {
return (1);
}
- attach_progress &= ~AST_macreg;
}
/*
- * Now that we have unregistered from gldv3, we must finish all other
- * steps and successfully return from this function; otherwise we will
- * end up leaving the device in a broken/unusable state.
+ * Now that we have disabled the device, we must finish all other steps
+ * and successfully return from this function; otherwise we will end up
+ * leaving the device in a broken/unusable state.
*
* First, release any hybrid resources assigned to this vnet device.
*/
@@ -530,9 +612,10 @@ vnet_unattach(vnet_t *vnetp)
* Uninit vgen. This stops further mdeg callbacks to this vnet
* device and/or its ports; and detaches any existing ports.
*/
- if (attach_progress & AST_vgen_init) {
+ if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
vgen_uninit(vnetp->vgenhdl);
attach_progress &= ~AST_vgen_init;
+ attach_progress &= ~AST_init_mdeg;
}
/* Destroy the taskq. */
@@ -563,6 +646,17 @@ vnet_unattach(vnet_t *vnetp)
attach_progress &= ~AST_vnet_list;
}
+ if (attach_progress & AST_ring_init) {
+ vnet_ring_grp_uninit(vnetp);
+ attach_progress &= ~AST_ring_init;
+ }
+
+ if (attach_progress & AST_macreg) {
+ VERIFY(mac_unregister(vnetp->mh) == 0);
+ vnetp->mh = NULL;
+ attach_progress &= ~AST_macreg;
+ }
+
if (attach_progress & AST_vnet_alloc) {
rw_destroy(&vnetp->vrwlock);
rw_destroy(&vnetp->vsw_fp_rw);
@@ -683,8 +777,9 @@ vnet_m_promisc(void *arg, boolean_t on)
* external hosts.
*/
mblk_t *
-vnet_m_tx(void *arg, mblk_t *mp)
+vnet_tx_ring_send(void *arg, mblk_t *mp)
{
+ vnet_pseudo_tx_ring_t *tx_ringp;
vnet_t *vnetp;
vnet_res_t *vresp;
mblk_t *next;
@@ -694,8 +789,10 @@ vnet_m_tx(void *arg, mblk_t *mp)
boolean_t is_unicast;
boolean_t is_pvid; /* non-default pvid ? */
boolean_t hres; /* Hybrid resource ? */
+ void *tx_arg;
- vnetp = (vnet_t *)arg;
+ tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+ vnetp = (vnet_t *)tx_ringp->vnetp;
DBG1(vnetp, "enter\n");
ASSERT(mp != NULL);
@@ -790,10 +887,14 @@ vnet_m_tx(void *arg, mblk_t *mp)
}
}
- }
- macp = &vresp->macreg;
- resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);
+ macp = &vresp->macreg;
+ tx_arg = tx_ringp;
+ } else {
+ macp = &vresp->macreg;
+ tx_arg = macp->m_driver;
+ }
+ resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);
/* tx done; now release ref on fdb entry */
VNET_FDBE_REFRELE(vresp);
@@ -848,6 +949,124 @@ vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
return (0);
}
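+
+/*
+ * Initialize the pseudo tx and rx ring groups of the vnet device. The tx
+ * group gets a fixed set of shared pseudo tx rings; the rx group reserves
+ * the default pseudo rx rings (LDC service and Hybrid) and marks the rest
+ * of its entries free for dynamically added LDC resources.
+ */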
+static void
+vnet_ring_grp_init(vnet_t *vnetp)
+{
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ vnet_pseudo_tx_group_t *tx_grp;
+ vnet_pseudo_tx_ring_t *tx_ringp;
+ int i;
+
+ tx_grp = &vnetp->tx_grp[0];
+ tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
+ VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
+ for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
+ tx_ringp[i].state |= VNET_TXRING_SHARED;
+ }
+ tx_grp->rings = tx_ringp;
+ tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
+
+ rx_grp = &vnetp->rx_grp[0];
+ rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
+ rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
+ rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
+ rx_grp->max_ring_cnt, KM_SLEEP);
+
+ /*
+ * Setup the first 3 Pseudo RX Rings that are reserved;
+ * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
+ */
+ rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
+ rx_ringp[0].index = 0;
+ rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
+ rx_ringp[1].index = 1;
+ rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
+ rx_ringp[2].index = 2;
+
+ rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+ rx_grp->rings = rx_ringp;
+
+ for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+ i < rx_grp->max_ring_cnt; i++) {
+ rx_ringp = &rx_grp->rings[i];
+ rx_ringp->state = VNET_RXRING_FREE;
+ rx_ringp->index = i;
+ }
+}
+
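+/*
+ * Free the pseudo tx and rx ring tables allocated in vnet_ring_grp_init().
+ */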
+static void
+vnet_ring_grp_uninit(vnet_t *vnetp)
+{
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_tx_group_t *tx_grp;
+
+ tx_grp = &vnetp->tx_grp[0];
+ if (tx_grp->rings != NULL) {
+ ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
+ kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
+ tx_grp->ring_cnt);
+ tx_grp->rings = NULL;
+ }
+
+ rx_grp = &vnetp->rx_grp[0];
+ if (rx_grp->rings != NULL) {
+ ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
+ ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
+ kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
+ rx_grp->max_ring_cnt);
+ rx_grp->rings = NULL;
+ }
+}
+
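+/*
+ * Allocate a free pseudo rx ring from the rx group; returns NULL if all
+ * rings in the group are already in use.
+ */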
+static vnet_pseudo_rx_ring_t *
+vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
+{
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ int index;
+
+ rx_grp = &vnetp->rx_grp[0];
+ WRITE_ENTER(&rx_grp->lock);
+
+ if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
+ /* no rings available */
+ RW_EXIT(&rx_grp->lock);
+ return (NULL);
+ }
+
+ for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
+ index < rx_grp->max_ring_cnt; index++) {
+ rx_ringp = &rx_grp->rings[index];
+ if (rx_ringp->state == VNET_RXRING_FREE) {
+ rx_ringp->state |= VNET_RXRING_INUSE;
+ rx_grp->ring_cnt++;
+ break;
+ }
+ }
+
+ RW_EXIT(&rx_grp->lock);
+ return (rx_ringp);
+}
+
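+/*
+ * Return a dynamically allocated pseudo rx ring to the free pool of the
+ * rx group.
+ */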
+static void
+vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
+{
+ vnet_pseudo_rx_group_t *rx_grp;
+
+ ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
+ rx_grp = &vnetp->rx_grp[0];
+ WRITE_ENTER(&rx_grp->lock);
+
+ if (ringp->state != VNET_RXRING_FREE) {
+ ringp->state = VNET_RXRING_FREE;
+ ringp->handle = NULL;
+ rx_grp->ring_cnt--;
+ }
+
+ RW_EXIT(&rx_grp->lock);
+}
+
/* wrapper function for mac_register() */
static int
vnet_mac_register(vnet_t *vnetp)
@@ -867,6 +1086,15 @@ vnet_mac_register(vnet_t *vnetp)
macp->m_margin = VLAN_TAGSZ;
/*
+ * MAC_VIRT_SERIALIZE flag is needed while hybridIO is enabled to
+ * work around tx lock contention issues in nxge.
+ */
+ macp->m_v12n = MAC_VIRT_LEVEL1;
+ if (vnet_mac_tx_serialize == B_TRUE) {
+ macp->m_v12n |= MAC_VIRT_SERIALIZE;
+ }
+
+ /*
* Finally, we're ready to register ourselves with the MAC layer
* interface; if this succeeds, we're all ready to start()
*/
@@ -1116,42 +1344,57 @@ vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
static void
vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
{
- vnet_res_t *vresp = (vnet_res_t *)vrh;
- vnet_t *vnetp = vresp->vnetp;
+ vnet_res_t *vresp = (vnet_res_t *)vrh;
+ vnet_t *vnetp = vresp->vnetp;
+ vnet_pseudo_rx_ring_t *ringp;
if ((vnetp == NULL) || (vnetp->mh == 0)) {
freemsgchain(mp);
return;
}
- /*
- * Packets received over a hybrid resource need additional processing
- * to remove the tag, for the pvid case. The underlying resource is
- * not aware of the vnet's pvid and thus packets are received with the
- * vlan tag in the header; unlike packets that are received over a ldc
- * channel in which case the peer vnet/vsw would have already removed
- * the tag.
- */
- if (vresp->type == VIO_NET_RES_HYBRID &&
- vnetp->pvid != vnetp->default_vlan_id) {
-
- vnet_rx_frames_untag(vnetp->pvid, &mp);
- if (mp == NULL) {
- return;
- }
- }
-
- mac_rx(vnetp->mh, NULL, mp);
+ ringp = vresp->rx_ringp;
+ mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}
void
vnet_tx_update(vio_net_handle_t vrh)
{
- vnet_res_t *vresp = (vnet_res_t *)vrh;
- vnet_t *vnetp = vresp->vnetp;
+ vnet_res_t *vresp = (vnet_res_t *)vrh;
+ vnet_t *vnetp = vresp->vnetp;
+ vnet_pseudo_tx_ring_t *tx_ringp;
+ vnet_pseudo_tx_group_t *tx_grp;
+ int i;
+
+ if (vnetp == NULL || vnetp->mh == NULL) {
+ return;
+ }
- if ((vnetp != NULL) && (vnetp->mh != NULL)) {
- mac_tx_update(vnetp->mh);
+ /*
+ * Currently, the tx hwring API (used to access rings that belong to
+ * a Hybrid IO resource) does not provide us a per ring flow ctrl
+ * update; also the pseudo rings are shared by the ports/ldcs in the
+ * vgen layer. Thus we can't figure out which pseudo ring is being
+ * re-enabled for transmits. To work around this, when we get a tx
+ * restart notification from below, we simply propagate that to all
+ * the tx pseudo rings registered with the mac layer above.
+ *
+ * There are a couple of side effects with this approach, but they are
+ * not harmful, as outlined below:
+ *
+ * A) We might send an invalid ring_update() for a ring that is not
+ * really flow controlled. This will not have any effect in the mac
+ * layer and packets will continue to be transmitted on that ring.
+ *
+ * B) We might end up clearing the flow control in the mac layer for
+ * a ring that is still flow controlled in the underlying resource.
+ * This will result in the mac layer restarting transmit, only to be
+ * flow controlled again on that ring.
+ */
+ tx_grp = &vnetp->tx_grp[0];
+ for (i = 0; i < tx_grp->ring_cnt; i++) {
+ tx_ringp = &tx_grp->rings[i];
+ mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
}
}
@@ -1233,8 +1476,8 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
vio_net_callbacks_t *vcb)
{
- vnet_t *vnetp;
- vnet_res_t *vresp;
+ vnet_t *vnetp;
+ vnet_res_t *vresp;
vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
ether_copy(local_macaddr, vresp->local_macaddr);
@@ -1260,11 +1503,7 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
vnetp->instance);
}
}
-
- WRITE_ENTER(&vnetp->vrwlock);
- vresp->nextp = vnetp->vres_list;
- vnetp->vres_list = vresp;
- RW_EXIT(&vnetp->vrwlock);
+ vnet_add_resource(vnetp, vresp);
break;
}
vnetp = vnetp->nextp;
@@ -1281,6 +1520,14 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
vcb->vio_net_tx_update = vnet_tx_update;
vcb->vio_net_report_err = vnet_handle_res_err;
+ /* Bind the resource to pseudo ring(s) */
+ if (vnet_bind_rings(vresp) != 0) {
+ (void) vnet_rem_resource(vnetp, vresp);
+ vnet_hio_destroy_kstats(vresp->ksp);
+ KMEM_FREE(vresp);
+ return (1);
+ }
+
/* Dispatch a task to start resources */
vnet_dispatch_res_task(vnetp);
return (0);
@@ -1294,8 +1541,6 @@ vio_net_resource_unreg(vio_net_handle_t vhp)
{
vnet_res_t *vresp = (vnet_res_t *)vhp;
vnet_t *vnetp = vresp->vnetp;
- vnet_res_t *vrp;
- kstat_t *ksp = NULL;
DBG1(NULL, "Resource Registering hdl=0x%p", vhp);
@@ -1306,7 +1551,29 @@ vio_net_resource_unreg(vio_net_handle_t vhp)
*/
vnet_fdbe_del(vnetp, vresp);
+ vnet_unbind_rings(vresp);
+
/* Now remove the resource from the list */
+ (void) vnet_rem_resource(vnetp, vresp);
+
+ vnet_hio_destroy_kstats(vresp->ksp);
+ KMEM_FREE(vresp);
+}
+
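+/*
+ * Add the given resource to the head of the resource list of the vnet device.
+ */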
+static void
+vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
+{
+ WRITE_ENTER(&vnetp->vrwlock);
+ vresp->nextp = vnetp->vres_list;
+ vnetp->vres_list = vresp;
+ RW_EXIT(&vnetp->vrwlock);
+}
+
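+/*
+ * Remove the given resource from the resource list of the vnet device and
+ * return it to the caller.
+ */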
+static vnet_res_t *
+vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
+{
+ vnet_res_t *vrp;
+
WRITE_ENTER(&vnetp->vrwlock);
if (vresp == vnetp->vres_list) {
vnetp->vres_list = vresp->nextp;
@@ -1320,15 +1587,12 @@ vio_net_resource_unreg(vio_net_handle_t vhp)
vrp = vrp->nextp;
}
}
-
- ksp = vresp->ksp;
- vresp->ksp = NULL;
-
vresp->vnetp = NULL;
vresp->nextp = NULL;
+
RW_EXIT(&vnetp->vrwlock);
- vnet_hio_destroy_kstats(ksp);
- KMEM_FREE(vresp);
+
+ return (vresp);
}
/*
@@ -1710,6 +1974,1024 @@ vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
}
}
+static boolean_t
+vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
+{
+ vnet_t *vnetp = (vnet_t *)arg;
+
+ if (vnetp == NULL) {
+ return (0);
+ }
+
+ switch (cap) {
+
+ case MAC_CAPAB_RINGS: {
+
+ mac_capab_rings_t *cap_rings = cap_data;
+ /*
+ * Rings Capability Notes:
+ * We advertise rings to make use of the rings framework in
+ * gldv3 mac layer, to improve the performance. This is
+ * specifically needed when a Hybrid resource (with multiple
+ * tx/rx hardware rings) is assigned to a vnet device. We also
+ * leverage this for the normal case when no Hybrid resource is
+ * assigned.
+ *
+ * Ring Allocation:
+ * - TX path:
+ * We expose a pseudo ring group with 2 pseudo tx rings (as
+ * currently HybridIO exports only 2 rings). In the normal case,
+ * transmit traffic that comes down to the driver through the
+ * mri_tx (vnet_tx_ring_send()) entry point goes through the
+ * distributed switching algorithm in vnet and gets transmitted
+ * over a port/LDC in the vgen layer to either the vswitch or a
+ * peer vnet. If and when a Hybrid resource is assigned to the
+ * vnet, we obtain the tx ring information of the Hybrid device
+ * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
+ * Traffic being sent over the Hybrid resource by the mac layer
+ * gets spread across both hw rings, as they are mapped to the
+ * 2 pseudo tx rings in vnet.
+ *
+ * - RX path:
+ * We expose a pseudo ring group with 3 pseudo rx rings (static
+ * rings) initially. The first (default) pseudo rx ring is
+ * reserved for the resource that connects to the vswitch
+ * service. The next 2 rings are reserved for a Hybrid resource
+ * that may be assigned to the vnet device. If and when a
+ * Hybrid resource is assigned to the vnet, we obtain the rx
+ * ring information of the Hybrid device (nxge) and map these
+ * pseudo rings 1:1 to the 2 hw rx rings. For each additional
+ * resource that connects to a peer vnet, we dynamically
+ * allocate a pseudo rx ring and map it to that resource, when
+ * the resource gets added; and the pseudo rx ring is
+ * dynamically registered with the upper mac layer. We do the
+ * reverse and unregister the ring with the mac layer when
+ * the resource gets removed.
+ *
+ * Synchronization notes:
+ * We don't need any lock to protect members of ring structure,
+ * specifically ringp->hw_rh, in either the TX or the RX ring,
+ * as explained below.
+ * - TX ring:
+ * ring->hw_rh is initialized only when a Hybrid resource is
+ * associated; and gets referenced only in vnet_hio_tx(). The
+ * Hybrid resource itself is available in fdb only after tx
+ * hwrings are found and mapped; i.e, in vio_net_resource_reg()
+ * we call vnet_bind_rings() first and then call
+ * vnet_start_resources() which adds an entry to fdb. For
+ * traffic going over LDC resources, we don't reference
+ * ring->hw_rh at all.
+ * - RX ring:
+ * For rings mapped to Hybrid resource ring->hw_rh is
+ * initialized and only then do we add the rx callback for
+ * the underlying Hybrid resource; we disable callbacks before
+ * we unmap ring->hw_rh. For rings mapped to LDC resources, we
+ * stop the rx callbacks (in vgen) before we remove ring->hw_rh
+ * (vio_net_resource_unreg()).
+ */
+
+ if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+
+ /*
+ * The ring_cnt for rx grp is initialized in
+ * vnet_ring_grp_init(). Later, the ring_cnt gets
+ * updated dynamically whenever LDC resources are added
+ * or removed.
+ */
+ cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
+ cap_rings->mr_rget = vnet_get_ring;
+
+ cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
+ cap_rings->mr_gget = vnet_get_group;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
+ } else {
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+
+ /*
+ * The ring_cnt for tx grp is initialized in
+ * vnet_ring_grp_init() and remains constant, as we
+ * do not support dynamic tx rings for now.
+ */
+ cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
+ cap_rings->mr_rget = vnet_get_ring;
+
+ /*
+ * Transmit rings are not grouped; i.e, the number of
+ * transmit ring groups advertised should be set to 0.
+ */
+ cap_rings->mr_gnum = 0;
+
+ cap_rings->mr_gget = vnet_get_group;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
+ }
+ return (B_TRUE);
+
+ }
+
+ default:
+ break;
+
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Callback function for MAC layer to get ring information.
+ */
+static void
+vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
+ const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
+{
+ vnet_t *vnetp = arg;
+
+ switch (rtype) {
+
+ case MAC_RING_TYPE_RX: {
+
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ mac_intr_t *mintr;
+
+ /* We advertised only one RX group */
+ ASSERT(g_index == 0);
+ rx_grp = &vnetp->rx_grp[g_index];
+
+ /* Check the current # of rings in the rx group */
+ ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
+
+ /* Get the ring based on the index */
+ rx_ringp = &rx_grp->rings[r_index];
+
+ rx_ringp->handle = r_handle;
+ /*
+ * Note: we don't need to save the incoming r_index in rx_ring,
+ * as vnet_ring_grp_init() would have initialized the index for
+ * each ring in the array.
+ */
+ rx_ringp->grp = rx_grp;
+ rx_ringp->vnetp = vnetp;
+
+ mintr = &infop->mri_intr;
+ mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
+ mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
+ mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
+
+ infop->mri_driver = (mac_ring_driver_t)rx_ringp;
+ infop->mri_start = vnet_rx_ring_start;
+ infop->mri_stop = vnet_rx_ring_stop;
+
+ /* Set the poll function, as this is an rx ring */
+ infop->mri_poll = vnet_rx_poll;
+
+ break;
+ }
+
+ case MAC_RING_TYPE_TX: {
+ vnet_pseudo_tx_group_t *tx_grp;
+ vnet_pseudo_tx_ring_t *tx_ringp;
+
+ /*
+ * No need to check grp index; mac layer passes -1 for it.
+ */
+ tx_grp = &vnetp->tx_grp[0];
+
+ /* Check the # of rings in the tx group */
+ ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
+
+ /* Get the ring based on the index */
+ tx_ringp = &tx_grp->rings[r_index];
+
+ tx_ringp->handle = r_handle;
+ tx_ringp->index = r_index;
+ tx_ringp->grp = tx_grp;
+ tx_ringp->vnetp = vnetp;
+
+ infop->mri_driver = (mac_ring_driver_t)tx_ringp;
+ infop->mri_start = vnet_tx_ring_start;
+ infop->mri_stop = vnet_tx_ring_stop;
+
+ /* Set the transmit function, as this is a tx ring */
+ infop->mri_tx = vnet_tx_ring_send;
+
+ break;
+ }
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Callback function for MAC layer to get group information.
+ */
+static void
+vnet_get_group(void *arg, mac_ring_type_t type, const int index,
+ mac_group_info_t *infop, mac_group_handle_t handle)
+{
+ vnet_t *vnetp = (vnet_t *)arg;
+
+ switch (type) {
+
+ case MAC_RING_TYPE_RX:
+ {
+ vnet_pseudo_rx_group_t *rx_grp;
+
+ /* We advertised only one RX group */
+ ASSERT(index == 0);
+
+ rx_grp = &vnetp->rx_grp[index];
+ rx_grp->handle = handle;
+ rx_grp->index = index;
+ rx_grp->vnetp = vnetp;
+
+ infop->mgi_driver = (mac_group_driver_t)rx_grp;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = vnet_addmac;
+ infop->mgi_remmac = vnet_remmac;
+ infop->mgi_count = rx_grp->ring_cnt;
+
+ break;
+ }
+
+ case MAC_RING_TYPE_TX:
+ {
+ vnet_pseudo_tx_group_t *tx_grp;
+
+ /* We advertised only one TX group */
+ ASSERT(index == 0);
+
+ tx_grp = &vnetp->tx_grp[index];
+ tx_grp->handle = handle;
+ tx_grp->index = index;
+ tx_grp->vnetp = vnetp;
+
+ infop->mgi_driver = (mac_group_driver_t)tx_grp;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = NULL;
+ infop->mgi_remmac = NULL;
+ infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
+
+ break;
+ }
+
+ default:
+ break;
+
+ }
+}
+
+static int
+vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
+{
+ vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+ int err;
+
+ /*
+ * If this ring is mapped to a LDC resource, simply mark the state to
+ * indicate the ring is started and return.
+ */
+ if ((rx_ringp->state &
+ (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
+ rx_ringp->gen_num = mr_gen_num;
+ rx_ringp->state |= VNET_RXRING_STARTED;
+ return (0);
+ }
+
+ ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+
+ /*
+ * This must be a ring reserved for a hwring. If the hwring is not
+ * bound yet, simply mark the state to indicate the ring is started and
+ * return. If and when a hybrid resource is activated for this vnet
+ * device, we will bind the hwring and start it then. If a hwring is
+ * already bound, start it now.
+ */
+ if (rx_ringp->hw_rh == NULL) {
+ rx_ringp->gen_num = mr_gen_num;
+ rx_ringp->state |= VNET_RXRING_STARTED;
+ return (0);
+ }
+
+ err = mac_hwring_start(rx_ringp->hw_rh);
+ if (err == 0) {
+ rx_ringp->gen_num = mr_gen_num;
+ rx_ringp->state |= VNET_RXRING_STARTED;
+ } else {
+ err = ENXIO;
+ }
+
+ return (err);
+}
+
+static void
+vnet_rx_ring_stop(mac_ring_driver_t arg)
+{
+ vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+
+ /*
+ * If this ring is mapped to a LDC resource, simply mark the state to
+ * indicate the ring is now stopped and return.
+ */
+ if ((rx_ringp->state &
+ (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
+ rx_ringp->state &= ~VNET_RXRING_STARTED;
+ return;
+ }
+
+ ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+
+ /*
+ * This must be a ring reserved for a hwring. If the hwring is not
+ * bound yet, simply mark the state to indicate the ring is stopped and
+ * return. If a hwring is already bound, stop it now.
+ */
+ if (rx_ringp->hw_rh == NULL) {
+ rx_ringp->state &= ~VNET_RXRING_STARTED;
+ return;
+ }
+
+ mac_hwring_stop(rx_ringp->hw_rh);
+ rx_ringp->state &= ~VNET_RXRING_STARTED;
+}
+
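+/*
+ * The pseudo tx rings are shared by all resources; starting and stopping
+ * them only involves updating the ring state.
+ */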
+/* ARGSUSED */
+static int
+vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
+{
+ vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+
+ tx_ringp->state |= VNET_TXRING_STARTED;
+ return (0);
+}
+
+static void
+vnet_tx_ring_stop(mac_ring_driver_t arg)
+{
+ vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+
+ tx_ringp->state &= ~VNET_TXRING_STARTED;
+}
+
+/*
+ * Disable polling for a ring and enable its interrupt.
+ */
+static int
+vnet_ring_enable_intr(void *arg)
+{
+ vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+ vnet_res_t *vresp;
+
+ if (rx_ringp->hw_rh == NULL) {
+ /*
+ * Ring enable intr func is being invoked, but the ring is
+ * not bound to any underlying resource ? This must be a ring
+ * reserved for Hybrid resource and no such resource has been
+ * assigned to this vnet device yet. We simply return success.
+ */
+ ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+ return (0);
+ }
+
+ /*
+ * The rx ring has been bound to either a LDC or a Hybrid resource.
+ * Call the appropriate function to enable interrupts for the ring.
+ */
+ if (rx_ringp->state & VNET_RXRING_HYBRID) {
+ return (mac_hwring_enable_intr(rx_ringp->hw_rh));
+ } else {
+ vresp = (vnet_res_t *)rx_ringp->hw_rh;
+ return (vgen_enable_intr(vresp->macreg.m_driver));
+ }
+}
+
+/*
+ * Enable polling for a ring and disable its interrupt.
+ */
+static int
+vnet_ring_disable_intr(void *arg)
+{
+ vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+ vnet_res_t *vresp;
+
+ if (rx_ringp->hw_rh == NULL) {
+ /*
+ * Ring disable intr func is being invoked, but the ring is
+ * not bound to any underlying resource ? This must be a ring
+ * reserved for Hybrid resource and no such resource has been
+ * assigned to this vnet device yet. We simply return success.
+ */
+ ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
+ return (0);
+ }
+
+ /*
+ * The rx ring has been bound to either a LDC or a Hybrid resource.
+ * Call the appropriate function to disable interrupts for the ring.
+ */
+ if (rx_ringp->state & VNET_RXRING_HYBRID) {
+ return (mac_hwring_disable_intr(rx_ringp->hw_rh));
+ } else {
+ vresp = (vnet_res_t *)rx_ringp->hw_rh;
+ return (vgen_disable_intr(vresp->macreg.m_driver));
+ }
+}
+
+/*
+ * Poll 'bytes_to_pickup' bytes of message from the rx ring.
+ */
+static mblk_t *
+vnet_rx_poll(void *arg, int bytes_to_pickup)
+{
+ vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
+ mblk_t *mp = NULL;
+ vnet_res_t *vresp;
+ vnet_t *vnetp = rx_ringp->vnetp;
+
+ if (rx_ringp->hw_rh == NULL) {
+ return (NULL);
+ }
+
+ if (rx_ringp->state & VNET_RXRING_HYBRID) {
+ mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
+ /*
+ * Packets received over a hybrid resource need additional
+ * processing to remove the tag, for the pvid case. The
+ * underlying resource is not aware of the vnet's pvid and thus
+ * packets are received with the vlan tag in the header; unlike
+ * packets that are received over a ldc channel in which case
+ * the peer vnet/vsw would have already removed the tag.
+ */
+ if (vnetp->pvid != vnetp->default_vlan_id) {
+ vnet_rx_frames_untag(vnetp->pvid, &mp);
+ }
+ } else {
+ vresp = (vnet_res_t *)rx_ringp->hw_rh;
+ mp = vgen_poll(vresp->macreg.m_driver, bytes_to_pickup);
+ }
+ return (mp);
+}
+
+/* ARGSUSED */
+void
+vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
+{
+ vnet_t *vnetp = (vnet_t *)arg;
+ vnet_pseudo_rx_ring_t *ringp = (vnet_pseudo_rx_ring_t *)mrh;
+
+ /*
+ * Packets received over a hybrid resource need additional processing
+ * to remove the tag, for the pvid case. The underlying resource is
+ * not aware of the vnet's pvid and thus packets are received with the
+ * vlan tag in the header; unlike packets that are received over a ldc
+ * channel in which case the peer vnet/vsw would have already removed
+ * the tag.
+ */
+ if (vnetp->pvid != vnetp->default_vlan_id) {
+ vnet_rx_frames_untag(vnetp->pvid, &mp);
+ if (mp == NULL) {
+ return;
+ }
+ }
+ mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
+}
+
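+/*
+ * mgi_addmac entry point of the pseudo rx group. Only the vnet device's
+ * own unicast address is supported; adding any other address fails.
+ */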
+static int
+vnet_addmac(void *arg, const uint8_t *mac_addr)
+{
+ vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
+ vnet_t *vnetp;
+
+ vnetp = rx_grp->vnetp;
+
+ if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
+ return (0);
+ }
+
+ cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
+ vnetp->instance, __func__);
+ return (EINVAL);
+}
+
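+/*
+ * mgi_remmac entry point of the pseudo rx group. Only the vnet device's
+ * own unicast address is valid; removing any other address fails.
+ */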
+static int
+vnet_remmac(void *arg, const uint8_t *mac_addr)
+{
+ vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
+ vnet_t *vnetp;
+
+ vnetp = rx_grp->vnetp;
+
+ if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
+ return (0);
+ }
+
+ cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
+ vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
+ return (EINVAL);
+}
+
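+/*
+ * Open the Hybrid device (ifname) using the mac client interfaces, add the
+ * vnet's unicast address on it and register it with vnet as a Hybrid
+ * resource; also install the rx callback and the tx-update notify callback.
+ */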
+int
+vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
+{
+ mac_handle_t mh;
+ mac_client_handle_t mch = NULL;
+ mac_unicast_handle_t muh = NULL;
+ mac_diag_t diag;
+ mac_register_t *macp;
+ char client_name[MAXNAMELEN];
+ int rv;
+ uint16_t mac_flags = MAC_UNICAST_TAG_DISABLE |
+ MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
+ vio_net_callbacks_t vcb;
+ ether_addr_t rem_addr =
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ uint32_t retries = 0;
+
+ if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
+ return (EAGAIN);
+ }
+
+ do {
+ rv = mac_open_by_linkname(ifname, &mh);
+ if (rv == 0) {
+ break;
+ }
+ if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
+ mac_free(macp);
+ return (rv);
+ }
+ drv_usecwait(vnet_mac_open_delay);
+ } while (rv == ENOENT);
+
+ vnetp->hio_mh = mh;
+
+ (void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
+ ifname);
+ rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
+ if (rv != 0) {
+ goto fail;
+ }
+ vnetp->hio_mch = mch;
+
+ rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
+ &diag);
+ if (rv != 0) {
+ goto fail;
+ }
+ vnetp->hio_muh = muh;
+
+ macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ macp->m_driver = vnetp;
+ macp->m_dip = NULL;
+ macp->m_src_addr = NULL;
+ macp->m_callbacks = &vnet_hio_res_callbacks;
+ macp->m_min_sdu = 0;
+ macp->m_max_sdu = ETHERMTU;
+
+ rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
+ vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
+ if (rv != 0) {
+ goto fail;
+ }
+ mac_free(macp);
+
+ /* add the recv callback */
+ mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
+
+ /* add the notify callback - only tx updates for now */
+ vnetp->hio_mnh = mac_notify_add(vnetp->hio_mh, vnet_hio_notify_cb,
+ vnetp);
+
+ return (0);
+
+fail:
+ mac_free(macp);
+ vnet_hio_mac_cleanup(vnetp);
+ return (1);
+}
+
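+/*
+ * Undo the steps done in vnet_hio_mac_init(): remove the notify callback,
+ * unregister the Hybrid resource, remove the unicast address and close the
+ * mac client and the underlying mac device.
+ */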
+void
+vnet_hio_mac_cleanup(vnet_t *vnetp)
+{
+ if (vnetp->hio_mnh != NULL) {
+ (void) mac_notify_remove(vnetp->hio_mnh, B_TRUE);
+ vnetp->hio_mnh = NULL;
+ }
+
+ if (vnetp->hio_vhp != NULL) {
+ vio_net_resource_unreg(vnetp->hio_vhp);
+ vnetp->hio_vhp = NULL;
+ }
+
+ if (vnetp->hio_muh != NULL) {
+ mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
+ vnetp->hio_muh = NULL;
+ }
+
+ if (vnetp->hio_mch != NULL) {
+ mac_client_close(vnetp->hio_mch, 0);
+ vnetp->hio_mch = NULL;
+ }
+
+ if (vnetp->hio_mh != NULL) {
+ mac_close(vnetp->hio_mh);
+ vnetp->hio_mh = NULL;
+ }
+}
+
+/* Bind pseudo rings to hwrings */
+static int
+vnet_bind_hwrings(vnet_t *vnetp)
+{
+ mac_ring_handle_t hw_rh[VNET_NUM_HYBRID_RINGS];
+ mac_perim_handle_t mph1;
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ vnet_pseudo_tx_group_t *tx_grp;
+ vnet_pseudo_tx_ring_t *tx_ringp;
+ int hw_ring_cnt;
+ int i;
+ int rv;
+
+ mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
+
+ /* Get the list of the underlying RX rings. */
+ hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
+ MAC_RING_TYPE_RX);
+
+ /* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
+ if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
+ cmn_err(CE_WARN,
+ "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
+ vnetp->instance, hw_ring_cnt);
+ goto fail;
+ }
+
+ if (vnetp->rx_hwgh != NULL) {
+ /*
+ * Quiesce the HW ring and the mac srs on the ring. Note
+ * that the HW ring will be restarted when the pseudo ring
+ * is started. At that time all the packets will be
+ * directly passed up to the pseudo RX ring and handled
+ * by mac srs created over the pseudo RX ring.
+ */
+ mac_rx_client_quiesce(vnetp->hio_mch);
+ mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
+ }
+
+ /*
+ * Bind the pseudo rings to the hwrings and start the hwrings.
+ * Note we don't need to register these with the upper mac, as we have
+ * statically exported these pseudo rxrings which are reserved for
+ * rxrings of Hybrid resource.
+ */
+ rx_grp = &vnetp->rx_grp[0];
+ for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+ /* Pick the rxrings reserved for Hybrid resource */
+ rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
+
+ /* Store the hw ring handle */
+ rx_ringp->hw_rh = hw_rh[i];
+
+ /* Bind the pseudo ring to the underlying hwring */
+ mac_hwring_setup(rx_ringp->hw_rh,
+ (mac_resource_handle_t)rx_ringp);
+
+ /* Start the hwring if needed */
+ if (rx_ringp->state & VNET_RXRING_STARTED) {
+ rv = mac_hwring_start(rx_ringp->hw_rh);
+ if (rv != 0) {
+ mac_hwring_teardown(rx_ringp->hw_rh);
+ rx_ringp->hw_rh = NULL;
+ goto fail;
+ }
+ }
+ }
+
+ /* Get the list of the underlying TX rings. */
+ hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
+ MAC_RING_TYPE_TX);
+
+ /* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
+ if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
+ cmn_err(CE_WARN,
+ "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
+ vnetp->instance, hw_ring_cnt);
+ goto fail;
+ }
+
+ /*
+ * Now map the pseudo txrings to the hw txrings. Note we don't need
+ * to register these with the upper mac, as we have statically exported
+ * these rings. Note that these rings will continue to be used for LDC
+ * resources to peer vnets and vswitch (shared ring).
+ */
+ tx_grp = &vnetp->tx_grp[0];
+ for (i = 0; i < tx_grp->ring_cnt; i++) {
+ tx_ringp = &tx_grp->rings[i];
+ tx_ringp->hw_rh = hw_rh[i];
+ tx_ringp->state |= VNET_TXRING_HYBRID;
+ }
+
+ mac_perim_exit(mph1);
+ return (0);
+
+fail:
+ mac_perim_exit(mph1);
+ vnet_unbind_hwrings(vnetp);
+ return (1);
+}
+
+/* Unbind pseudo rings from hwrings */
+static void
+vnet_unbind_hwrings(vnet_t *vnetp)
+{
+ mac_perim_handle_t mph1;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_tx_group_t *tx_grp;
+ vnet_pseudo_tx_ring_t *tx_ringp;
+ int i;
+
+ mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
+
+ tx_grp = &vnetp->tx_grp[0];
+ for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+ tx_ringp = &tx_grp->rings[i];
+ if (tx_ringp->state & VNET_TXRING_HYBRID) {
+ tx_ringp->state &= ~VNET_TXRING_HYBRID;
+ tx_ringp->hw_rh = NULL;
+ }
+ }
+
+ rx_grp = &vnetp->rx_grp[0];
+ for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
+ rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
+ if (rx_ringp->hw_rh != NULL) {
+ /* Stop the hwring */
+ mac_hwring_stop(rx_ringp->hw_rh);
+
+ /* Teardown the hwring */
+ mac_hwring_teardown(rx_ringp->hw_rh);
+ rx_ringp->hw_rh = NULL;
+ }
+ }
+
+ if (vnetp->rx_hwgh != NULL) {
+ vnetp->rx_hwgh = NULL;
+ /*
+ * First clear the permanent-quiesced flag of the RX srs then
+ * restart the HW ring and the mac srs on the ring.
+ */
+ mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
+ mac_rx_client_restart(vnetp->hio_mch);
+ }
+
+ mac_perim_exit(mph1);
+}
+
+/* Bind pseudo ring to a LDC resource */
+static int
+vnet_bind_vgenring(vnet_res_t *vresp)
+{
+ vnet_t *vnetp;
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ mac_perim_handle_t mph1;
+ int rv;
+ int type;
+
+ vnetp = vresp->vnetp;
+ type = vresp->type;
+ rx_grp = &vnetp->rx_grp[0];
+
+ if (type == VIO_NET_RES_LDC_SERVICE) {
+ /*
+ * Ring Index 0 is the default ring in the group and is
+ * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
+ * is allocated statically and is reported to the mac layer
+ * in vnet_m_capab(). So, all we need to do here, is save a
+ * reference to the associated vresp.
+ */
+ rx_ringp = &rx_grp->rings[0];
+ rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
+ vresp->rx_ringp = (void *)rx_ringp;
+ return (0);
+ }
+ ASSERT(type == VIO_NET_RES_LDC_GUEST);
+
+ mac_perim_enter_by_mh(vnetp->mh, &mph1);
+
+ rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
+ if (rx_ringp == NULL) {
+ cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
+ vnetp->instance);
+ goto fail;
+ }
+
+ /* Store the LDC resource itself as the ring handle */
+ rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
+
+ /*
+ * Save a reference to the ring in the resource for lookup during
+ * unbind. Note this is only done for LDC resources. We don't need this
+ * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
+ * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
+ */
+ vresp->rx_ringp = (void *)rx_ringp;
+ rx_ringp->state |= VNET_RXRING_LDC_GUEST;
+
+ /* Register the pseudo ring with upper-mac */
+ rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
+ if (rv != 0) {
+ rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
+ rx_ringp->hw_rh = NULL;
+ vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
+ goto fail;
+ }
+
+ mac_perim_exit(mph1);
+ return (0);
+fail:
+ mac_perim_exit(mph1);
+ return (1);
+}
+
+/* Unbind pseudo ring from a LDC resource */
+static void
+vnet_unbind_vgenring(vnet_res_t *vresp)
+{
+ vnet_t *vnetp;
+ vnet_pseudo_rx_group_t *rx_grp;
+ vnet_pseudo_rx_ring_t *rx_ringp;
+ mac_perim_handle_t mph1;
+ int type;
+
+ vnetp = vresp->vnetp;
+ type = vresp->type;
+ rx_grp = &vnetp->rx_grp[0];
+
+ if (vresp->rx_ringp == NULL) {
+ return;
+ }
+
+ if (type == VIO_NET_RES_LDC_SERVICE) {
+ /*
+ * Ring Index 0 is the default ring in the group and is
+ * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
+ * is allocated statically and is reported to the mac layer
+ * in vnet_m_capab(). So, all we need to do here is remove its
+ * reference to the associated vresp.
+ */
+ rx_ringp = &rx_grp->rings[0];
+ rx_ringp->hw_rh = NULL;
+ vresp->rx_ringp = NULL;
+ return;
+ }
+ ASSERT(type == VIO_NET_RES_LDC_GUEST);
+
+ mac_perim_enter_by_mh(vnetp->mh, &mph1);
+
+ rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
+ vresp->rx_ringp = NULL;
+
+ if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
+ /* Unregister the pseudo ring with upper-mac */
+ mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
+
+ rx_ringp->hw_rh = NULL;
+ rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
+
+ /* Free the pseudo rx ring */
+ vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
+ }
+
+ mac_perim_exit(mph1);
+}
+
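+/*
+ * Unbind the pseudo ring(s) of the given resource, based on its type.
+ */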
+static void
+vnet_unbind_rings(vnet_res_t *vresp)
+{
+ switch (vresp->type) {
+
+ case VIO_NET_RES_LDC_SERVICE:
+ case VIO_NET_RES_LDC_GUEST:
+ vnet_unbind_vgenring(vresp);
+ break;
+
+ case VIO_NET_RES_HYBRID:
+ vnet_unbind_hwrings(vresp->vnetp);
+ break;
+
+ default:
+ break;
+
+ }
+}
+
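+/*
+ * Bind the given resource to pseudo ring(s): LDC resources are bound to a
+ * pseudo rx ring and a Hybrid resource is bound to the reserved pseudo
+ * rx/tx rings (hwrings).
+ */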
+static int
+vnet_bind_rings(vnet_res_t *vresp)
+{
+ int rv;
+
+ switch (vresp->type) {
+
+ case VIO_NET_RES_LDC_SERVICE:
+ case VIO_NET_RES_LDC_GUEST:
+ rv = vnet_bind_vgenring(vresp);
+ break;
+
+ case VIO_NET_RES_HYBRID:
+ rv = vnet_bind_hwrings(vresp->vnetp);
+ break;
+
+ default:
+ rv = 1;
+ break;
+
+ }
+
+ return (rv);
+}
+
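+/*
+ * Get the specified statistic of the underlying Hybrid device, on behalf of
+ * the Hybrid resource registered with vnet.
+ */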
+/* ARGSUSED */
+int
+vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
+{
+ vnet_t *vnetp = (vnet_t *)arg;
+
+ *val = mac_stat_get(vnetp->hio_mh, stat);
+ return (0);
+}
+
+/*
+ * The start() and stop() routines for the Hybrid resource below are just
+ * dummy functions. This is provided to avoid resource type specific code in
+ * vnet_start_resources() and vnet_stop_resources(). The starting and stopping
+ * of the Hybrid resource happens in the context of the mac_client interfaces
+ * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
+ */
+/* ARGSUSED */
+static int
+vnet_hio_start(void *arg)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vnet_hio_stop(void *arg)
+{
+}
+
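+/*
+ * Transmit a chain of packets over a hw tx ring of the Hybrid resource.
+ * Packets that cannot be sent (the ring is flow controlled) are returned
+ * back to the caller.
+ */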
+mblk_t *
+vnet_hio_tx(void *arg, mblk_t *mp)
+{
+ vnet_pseudo_tx_ring_t *tx_ringp;
+ mblk_t *nextp;
+ mblk_t *ret_mp;
+
+ tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
+ for (;;) {
+ nextp = mp->b_next;
+ mp->b_next = NULL;
+
+ ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
+ if (ret_mp != NULL) {
+ ret_mp->b_next = nextp;
+ mp = ret_mp;
+ break;
+ }
+
+ if ((mp = nextp) == NULL)
+ break;
+ }
+ return (mp);
+}
+
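+/*
+ * mac notify callback for the Hybrid device; only tx updates (MAC_NOTE_TX)
+ * are handled for now, by propagating the tx restart to the pseudo tx rings
+ * via vnet_tx_update().
+ */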
+static void
+vnet_hio_notify_cb(void *arg, mac_notify_type_t type)
+{
+ vnet_t *vnetp = (vnet_t *)arg;
+ mac_perim_handle_t mph;
+
+ mac_perim_enter_by_mh(vnetp->hio_mh, &mph);
+ switch (type) {
+ case MAC_NOTE_TX:
+ vnet_tx_update(vnetp->hio_vhp);
+ break;
+
+ default:
+ break;
+ }
+ mac_perim_exit(mph);
+}
+
#ifdef VNET_IOC_DEBUG
/*