-rw-r--r-- | usr/src/uts/common/io/aggr/aggr_grp.c | 6
-rw-r--r-- | usr/src/uts/common/io/mac/mac.c | 95
-rw-r--r-- | usr/src/uts/common/io/mac/mac_datapath_setup.c | 81
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_hio.c | 71
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_hio_guest.c | 116
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_main.c | 120
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_rxdma.c | 76
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_send.c | 72
-rw-r--r-- | usr/src/uts/common/io/nxge/nxge_virtual.c | 3
-rw-r--r-- | usr/src/uts/common/sys/mac_client_priv.h | 2
-rw-r--r-- | usr/src/uts/common/sys/mac_impl.h | 6
-rw-r--r-- | usr/src/uts/common/sys/mac_soft_ring.h | 3
-rw-r--r-- | usr/src/uts/common/sys/nxge/nxge_hio.h | 34
-rw-r--r-- | usr/src/uts/sun4v/io/vnet.c | 1408
-rw-r--r-- | usr/src/uts/sun4v/io/vnet_dds.c | 30
-rw-r--r-- | usr/src/uts/sun4v/io/vnet_gen.c | 236
-rw-r--r-- | usr/src/uts/sun4v/sys/vnet.h | 115
-rw-r--r-- | usr/src/uts/sun4v/sys/vnet_gen.h | 6 |
18 files changed, 1998 insertions, 482 deletions
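The interface change that threads through the whole diff below is the extra mac_ring_type_t argument to mac_hwrings_get() together with the renamed mac_hwring_tx() entry point, which let a MAC client discover the TX hardware rings backing an underlying device and push packet chains onto a specific ring (vnet uses this for its Hybrid I/O resource, aggr keeps using it for RX groups). The following sketch shows how a hypothetical caller could drive the new interface; it is illustrative only and not part of the changeset. The names example_bind_tx_hwrings, example_tx and EXAMPLE_MAX_TX_RINGS are invented for the example, and the extern declaration of mac_hwring_tx() mirrors what vnet.c does in this diff, since that symbol is declared in the private mac_impl.h.

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/stream.h>
#include <sys/mac_client.h>
#include <sys/mac_client_priv.h>	/* mac_hwrings_get() */

/* Declared in mac_impl.h; vnet.c declares it the same way. */
extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);

/* Assumed upper bound; mirrors MAX_RINGS_PER_GROUP used in mac.c/aggr_grp.c. */
#define	EXAMPLE_MAX_TX_RINGS	128

static mac_ring_handle_t	example_tx_hwrh[EXAMPLE_MAX_TX_RINGS];
static int			example_tx_hwring_cnt;

/*
 * Ask the MAC layer for the TX hardware rings of the client.  For
 * MAC_RING_TYPE_TX the group handle is not filled in (TX rings are not
 * grouped), so it is only a placeholder here.
 */
static int
example_bind_tx_hwrings(mac_client_handle_t mch)
{
	mac_group_handle_t ghandle = NULL;

	example_tx_hwring_cnt = mac_hwrings_get(mch, &ghandle,
	    example_tx_hwrh, MAC_RING_TYPE_TX);
	if (example_tx_hwring_cnt <= 0)
		return (ENXIO);
	return (0);
}

/*
 * Send a chain on one of the bound TX rings.  Any mblks handed back are
 * those the ring could not accept (flow controlled); the caller keeps
 * them and retries after a ring update notification.
 */
static mblk_t *
example_tx(mblk_t *mp, int ring_index)
{
	return (mac_hwring_tx(example_tx_hwrh[ring_index %
	    example_tx_hwring_cnt], mp));
}

In the changeset itself, vnet maps the handles returned this way 1:1 onto its pseudo TX rings and, lacking a per-ring flow-control notification from the Hybrid resource, propagates transmit restarts to all pseudo TX rings via mac_tx_ring_update(), as described in the vnet_tx_update() comment further down.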
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 8e080da083..c619144958 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -623,7 +623,8 @@ aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) /* * Get the list the the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh, + MAC_RING_TYPE_RX); if (port->lp_hwgh != NULL) { /* @@ -689,7 +690,8 @@ aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) goto done; ASSERT(rx_grp->arg_gh != NULL); - hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh, + MAC_RING_TYPE_RX); /* * If hw_rh_cnt is 0, it means that the underlying port does not diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 91d7aab331..21982219b9 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1426,35 +1426,54 @@ mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs, */ int mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, - mac_ring_handle_t *hwrh) + mac_ring_handle_t *hwrh, mac_ring_type_t rtype) { mac_client_impl_t *mcip = (mac_client_impl_t *)mch; - flow_entry_t *flent = mcip->mci_flent; - mac_group_t *grp = flent->fe_rx_ring_group; - mac_ring_t *ring; int cnt = 0; - /* - * The mac client did not reserve any RX group, return directly. - * This is probably because the underlying MAC does not support - * any RX groups. - */ - *hwgh = NULL; - if (grp == NULL) - return (0); + switch (rtype) { + case MAC_RING_TYPE_RX: { + flow_entry_t *flent = mcip->mci_flent; + mac_group_t *grp; + mac_ring_t *ring; - /* - * This RX group must be reserved by this mac client. - */ - ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && - (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp)))); + grp = flent->fe_rx_ring_group; + /* + * The mac client did not reserve any RX group, return directly. + * This is probably because the underlying MAC does not support + * any groups. + */ + *hwgh = NULL; + if (grp == NULL) + return (0); + /* + * This group must be reserved by this mac client. + */ + ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && + (mch == (mac_client_handle_t) + (MAC_RX_GROUP_ONLY_CLIENT(grp)))); + for (ring = grp->mrg_rings; + ring != NULL; ring = ring->mr_next, cnt++) { + ASSERT(cnt < MAX_RINGS_PER_GROUP); + hwrh[cnt] = (mac_ring_handle_t)ring; + } + *hwgh = (mac_group_handle_t)grp; + return (cnt); + } + case MAC_RING_TYPE_TX: { + mac_soft_ring_set_t *tx_srs; + mac_srs_tx_t *tx; - for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) { - ASSERT(cnt < MAX_RINGS_PER_GROUP); - hwrh[cnt++] = (mac_ring_handle_t)ring; + tx_srs = MCIP_TX_SRS(mcip); + tx = &tx_srs->srs_tx; + for (; cnt < tx->st_ring_count; cnt++) + hwrh[cnt] = tx->st_rings[cnt]; + return (cnt); + } + default: + ASSERT(B_FALSE); + return (-1); } - *hwgh = (mac_group_handle_t)grp; - return (cnt); } /* @@ -1524,6 +1543,22 @@ mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup) return (info->mri_poll(info->mri_driver, bytes_to_pickup)); } +/* + * Send packets through the selected tx ring. 
+ */ +mblk_t * +mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp) +{ + mac_ring_t *ring = (mac_ring_t *)rh; + mac_ring_info_t *info = &ring->mr_info; + + ASSERT(ring->mr_type == MAC_RING_TYPE_TX); + ASSERT(ring->mr_state >= MR_INUSE); + ASSERT(info->mri_tx != NULL); + + return (info->mri_tx(info->mri_driver, mp)); +} + int mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr) { @@ -3429,22 +3464,6 @@ mac_release_tx_ring(mac_ring_handle_t rh) } /* - * Send packets through a selected tx ring. - */ -mblk_t * -mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp) -{ - mac_ring_t *ring = (mac_ring_t *)rh; - mac_ring_info_t *info = &ring->mr_info; - - ASSERT(ring->mr_type == MAC_RING_TYPE_TX); - ASSERT(ring->mr_state >= MR_INUSE); - ASSERT(info->mri_tx != NULL); - - return (info->mri_tx(info->mri_driver, mp)); -} - -/* * Find a ring from its index. */ mac_ring_t * diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 7b8c4c6567..dc5b51cb80 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -2235,6 +2235,10 @@ mac_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, tx->st_group); tx->st_group = NULL; } + if (tx->st_ring_count != 0) { + kmem_free(tx->st_rings, + sizeof (mac_ring_handle_t) * tx->st_ring_count); + } if (tx->st_arg2 != NULL) { ASSERT(tx_srs->srs_type & SRST_TX); mac_release_tx_ring(tx->st_arg2); @@ -3203,7 +3207,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_impl_t *mip = mcip->mci_mip; mac_soft_ring_set_t *tx_srs; int i, tx_ring_count = 0, tx_rings_reserved = 0; - mac_ring_handle_t *tx_ring = NULL; + mac_ring_handle_t *tx_rings = NULL; uint32_t soft_ring_type; mac_group_t *grp = NULL; mac_ring_t *ring; @@ -3221,7 +3225,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } if (tx_ring_count != 0) { - tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) * + tx_rings = kmem_zalloc(sizeof (mac_ring_handle_t) * tx_ring_count, KM_SLEEP); } @@ -3231,8 +3235,12 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, * NIC's. */ if (srs_type == SRST_FLOW || - (mcip->mci_state_flags & MCIS_NO_HWRINGS) != 0) - goto use_default_ring; + (mcip->mci_state_flags & MCIS_NO_HWRINGS) != 0) { + /* use default ring */ + tx_rings[0] = (void *)mip->mi_default_tx_ring; + tx_rings_reserved++; + goto rings_assigned; + } if (mcip->mci_share != NULL) ring = grp->mrg_rings; @@ -3245,8 +3253,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, * then each Tx ring will have a Tx-side soft ring. All * these soft rings will be hang off Tx SRS. */ - for (i = 0, tx_rings_reserved = 0; - i < tx_ring_count; i++, tx_rings_reserved++) { + for (i = 0; i < tx_ring_count; i++) { if (mcip->mci_share != NULL) { /* * The ring was already chosen and associated @@ -3255,42 +3262,39 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, * between the share and non-share cases. */ ASSERT(ring != NULL); - tx_ring[i] = (mac_ring_handle_t)ring; + tx_rings[i] = (mac_ring_handle_t)ring; ring = ring->mr_next; } else { - tx_ring[i] = + tx_rings[i] = (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL); - if (tx_ring[i] == NULL) + if (tx_rings[i] == NULL) { + /* + * We have run out of Tx rings. So + * give the default ring too. 
+ */ + tx_rings[i] = (void *)mip->mi_default_tx_ring; + tx_rings_reserved++; break; + } } + tx_rings_reserved++; } + +rings_assigned: if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE)) serialize = B_TRUE; /* * Did we get the requested number of tx rings? - * There are 3 actions we can take depending upon the number + * There are 2 actions we can take depending upon the number * of tx_rings we got. - * 1) If we got none, then hook up the tx_srs with the - * default ring. - * 2) If we got one, then get the tx_ring from the soft ring, + * 1) If we got one, then get the tx_ring from the soft ring, * save it in SRS and free up the soft ring. - * 3) If we got more than 1, then do the tx fanout among the + * 2) If we got more than 1, then do the tx fanout among the * rings we obtained. */ - switch (tx_rings_reserved) { - case 1: - /* - * No need to allocate Tx soft rings. Tx-side soft - * rings are for Tx fanout case. Just use Tx SRS. - */ - /* FALLTHRU */ - - case 0: -use_default_ring: - if (tx_rings_reserved == 0) - tx->st_arg2 = (void *)mip->mi_default_tx_ring; - else - tx->st_arg2 = (void *)tx_ring[0]; + ASSERT(tx_rings_reserved != 0); + if (tx_rings_reserved == 1) { + tx->st_arg2 = (void *)tx_rings[0]; /* For ring_count of 0 or 1, set the tx_mode and return */ if (tx_srs->srs_type & SRST_BW_CONTROL) tx->st_mode = SRS_TX_BW; @@ -3298,18 +3302,9 @@ use_default_ring: tx->st_mode = SRS_TX_SERIALIZE; else tx->st_mode = SRS_TX_DEFAULT; - break; - - default: + } else { /* * We got multiple Tx rings for Tx fanout. - * - * cpuid of -1 is passed. This creates an unbound - * worker thread. Instead the code should get CPU - * binding information and pass that to - * mac_soft_ring_create(). This needs to be done - * in conjunction with Rx-side soft ring - * bindings. 
*/ soft_ring_type = ST_RING_OTH | ST_RING_TX; if (tx_srs->srs_type & SRST_BW_CONTROL) { @@ -3322,7 +3317,7 @@ use_default_ring: for (i = 0; i < tx_rings_reserved; i++) { (void) mac_soft_ring_create(i, 0, NULL, soft_ring_type, maxclsyspri, mcip, tx_srs, -1, NULL, mcip, - (mac_resource_handle_t)tx_ring[i]); + (mac_resource_handle_t)tx_rings[i]); } mac_srs_update_fanout_list(tx_srs); } @@ -3332,8 +3327,12 @@ use_default_ring: int, tx->st_mode, int, tx_srs->srs_oth_ring_count); if (tx_ring_count != 0) { - kmem_free(tx_ring, - sizeof (mac_ring_handle_t) * tx_ring_count); + tx->st_ring_count = tx_rings_reserved; + tx->st_rings = kmem_zalloc(sizeof (mac_ring_handle_t) * + tx_rings_reserved, KM_SLEEP); + for (i = 0; i < tx->st_ring_count; i++) + tx->st_rings[i] = tx_rings[i]; + kmem_free(tx_rings, sizeof (mac_ring_handle_t) * tx_ring_count); } } diff --git a/usr/src/uts/common/io/nxge/nxge_hio.c b/usr/src/uts/common/io/nxge/nxge_hio.c index 827553301c..b58acde5e8 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio.c +++ b/usr/src/uts/common/io/nxge/nxge_hio.c @@ -41,9 +41,6 @@ #include <sys/nxge/nxge_txdma.h> #include <sys/nxge/nxge_hio.h> -#define NXGE_HIO_SHARE_MIN_CHANNELS 2 -#define NXGE_HIO_SHARE_MAX_CHANNELS 2 - /* * External prototypes */ @@ -1057,23 +1054,6 @@ nxge_hio_init( NXGE_DEBUG_MSG((nxge, HIO_CTL, "Hybrid IO-capable service domain")); return (NXGE_OK); - } else { - /* - * isLDOMguest(nxge) == B_TRUE - */ - nx_vio_fp_t *vio; - nhd->type = NXGE_HIO_TYPE_GUEST; - - vio = &nhd->hio.vio; - vio->__register = (vio_net_resource_reg_t) - modgetsymvalue("vio_net_resource_reg", 0); - vio->unregister = (vio_net_resource_unreg_t) - modgetsymvalue("vio_net_resource_unreg", 0); - - if (vio->__register == 0 || vio->unregister == 0) { - NXGE_ERROR_MSG((nxge, VIR_CTL, "vio_net is absent!")); - return (NXGE_ERROR); - } } return (0); @@ -1144,12 +1124,16 @@ nxge_hio_clear_unicst(p_nxge_t nxgep, const uint8_t *mac_addr) static int nxge_hio_add_mac(void *arg, const uint8_t *mac_addr) { - nxge_ring_group_t *group = (nxge_ring_group_t *)arg; - p_nxge_t nxge = group->nxgep; - int rv; - nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_ring_group_t *group = (nxge_ring_group_t *)arg; + p_nxge_t nxge = group->nxgep; + int rv; + nxge_hio_vr_t *vr; /* The Virtualization Region */ ASSERT(group->type == MAC_RING_TYPE_RX); + ASSERT(group->nxgep != NULL); + + if (isLDOMguest(group->nxgep)) + return (0); mutex_enter(nxge->genlock); @@ -1174,8 +1158,7 @@ nxge_hio_add_mac(void *arg, const uint8_t *mac_addr) /* * Program the mac address for the group. 
*/ - if ((rv = nxge_hio_group_mac_add(nxge, group, - mac_addr)) != 0) { + if ((rv = nxge_hio_group_mac_add(nxge, group, mac_addr)) != 0) { return (rv); } @@ -1206,6 +1189,10 @@ nxge_hio_rem_mac(void *arg, const uint8_t *mac_addr) int rv, slot; ASSERT(group->type == MAC_RING_TYPE_RX); + ASSERT(group->nxgep != NULL); + + if (isLDOMguest(group->nxgep)) + return (0); mutex_enter(nxge->genlock); @@ -1253,14 +1240,16 @@ nxge_hio_group_start(mac_group_driver_t gdriver) int dev_gindex; ASSERT(group->type == MAC_RING_TYPE_RX); + ASSERT(group->nxgep != NULL); -#ifdef later ASSERT(group->nxgep->nxge_mac_state == NXGE_MAC_STARTED); -#endif if (group->nxgep->nxge_mac_state != NXGE_MAC_STARTED) return (ENXIO); mutex_enter(group->nxgep->genlock); + if (isLDOMguest(group->nxgep)) + goto nxge_hio_group_start_exit; + dev_gindex = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid + group->gindex; rdc_grp_p = &group->nxgep->pt_config.rdc_grps[dev_gindex]; @@ -1289,9 +1278,9 @@ nxge_hio_group_start(mac_group_driver_t gdriver) (void) nxge_init_fzc_rdc_tbl(group->nxgep, rdc_grp_p, rdctbl); +nxge_hio_group_start_exit: group->started = B_TRUE; mutex_exit(group->nxgep->genlock); - return (0); } @@ -1305,6 +1294,9 @@ nxge_hio_group_stop(mac_group_driver_t gdriver) mutex_enter(group->nxgep->genlock); group->started = B_FALSE; + if (isLDOMguest(group->nxgep)) + goto nxge_hio_group_stop_exit; + /* * Unbind the RDC table previously bound for this group. * @@ -1314,6 +1306,7 @@ nxge_hio_group_stop(mac_group_driver_t gdriver) if (group->gindex != 0) (void) nxge_fzc_rdc_tbl_unbind(group->nxgep, group->rdctbl); +nxge_hio_group_stop_exit: mutex_exit(group->nxgep->genlock); } @@ -1334,20 +1327,26 @@ nxge_hio_group_get(void *arg, mac_ring_type_t type, int groupid, group->gindex = groupid; group->sindex = 0; /* not yet bound to a share */ - dev_gindex = nxgep->pt_config.hw_config.def_mac_rxdma_grpid + - groupid; + if (!isLDOMguest(nxgep)) { + dev_gindex = + nxgep->pt_config.hw_config.def_mac_rxdma_grpid + + groupid; - if (nxgep->pt_config.hw_config.def_mac_rxdma_grpid == - dev_gindex) - group->port_default_grp = B_TRUE; + if (nxgep->pt_config.hw_config.def_mac_rxdma_grpid == + dev_gindex) + group->port_default_grp = B_TRUE; + + infop->mgi_count = + nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs; + } else { + infop->mgi_count = NXGE_HIO_SHARE_MAX_CHANNELS; + } infop->mgi_driver = (mac_group_driver_t)group; infop->mgi_start = nxge_hio_group_start; infop->mgi_stop = nxge_hio_group_stop; infop->mgi_addmac = nxge_hio_add_mac; infop->mgi_remmac = nxge_hio_rem_mac; - infop->mgi_count = - nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs; break; case MAC_RING_TYPE_TX: diff --git a/usr/src/uts/common/io/nxge/nxge_hio_guest.c b/usr/src/uts/common/io/nxge/nxge_hio_guest.c index eb05298299..78c1bb53a6 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio_guest.c +++ b/usr/src/uts/common/io/nxge/nxge_hio_guest.c @@ -35,46 +35,9 @@ #include <sys/nxge/nxge_fzc.h> #include <sys/nxge/nxge_rxdma.h> #include <sys/nxge/nxge_txdma.h> - #include <sys/nxge/nxge_hio.h> /* - * nxge_hio_unregister - * - * Unregister with the VNET module. - * - * Arguments: - * nxge - * - * Notes: - * We must uninitialize all DMA channels associated with the VR, too. - * - * We're assuming that the channels will be disabled & unassigned - * in the service domain, after we're done here. 
- * - * Context: - * Guest domain - */ -void -nxge_hio_unregister( - nxge_t *nxge) -{ - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - - if (nhd == 0) { - return; - } - -#if defined(sun4v) - /* Unregister with vNet. */ - if (nhd->hio.vio.unregister) { - if (nxge->hio_vr) - (*nhd->hio.vio.unregister)(nxge->hio_vr->vhp); - } -#endif -} - -/* * nxge_guest_regs_map * * Map in a guest domain's register set(s). @@ -95,8 +58,7 @@ static ddi_device_acc_attr_t nxge_guest_register_access_attributes = { }; int -nxge_guest_regs_map( - nxge_t *nxge) +nxge_guest_regs_map(nxge_t *nxge) { dev_regs_t *regs; off_t regsize; @@ -211,31 +173,22 @@ static void nxge_check_guest_state(nxge_hio_vr_t *); int nxge_hio_vr_add(nxge_t *nxge) { - extern mac_callbacks_t nxge_m_callbacks; - - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hio_vr_t *vr; - nxge_hio_dc_t *dc; - - int *reg_val; - uint_t reg_len; - uint8_t vr_index; - - nxhv_vr_fp_t *fp; - uint64_t vr_address, vr_size; - uint32_t cookie; + extern nxge_status_t nxge_mac_register(p_nxge_t); - nxhv_dc_fp_t *tx, *rx; - uint64_t tx_map, rx_map; - - uint64_t hv_rv; - - /* Variables needed to register with vnet. */ - mac_register_t *mac_info; - ether_addr_t mac_addr; - nx_vio_fp_t *vio; - - int i; + nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; + nxge_hio_vr_t *vr; + nxge_hio_dc_t *dc; + int *reg_val; + uint_t reg_len; + uint8_t vr_index; + nxhv_vr_fp_t *fp; + uint64_t vr_address, vr_size; + uint32_t cookie; + nxhv_dc_fp_t *tx, *rx; + uint64_t tx_map, rx_map; + uint64_t hv_rv; + int i; + nxge_status_t status; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_vr_add")); @@ -384,40 +337,13 @@ nxge_hio_vr_add(nxge_t *nxge) } } - /* - * Register with vnet. - */ - if ((mac_info = mac_alloc(MAC_VERSION)) == NULL) - return (NXGE_ERROR); - - mac_info->m_type_ident = MAC_PLUGIN_IDENT_ETHER; - mac_info->m_driver = nxge; - mac_info->m_dip = nxge->dip; - mac_info->m_src_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP); - mac_info->m_dst_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP); - (void) memset(mac_info->m_src_addr, 0xff, sizeof (MAXMACADDRLEN)); - mac_info->m_callbacks = &nxge_m_callbacks; - mac_info->m_min_sdu = 0; - mac_info->m_max_sdu = NXGE_MTU_DEFAULT_MAX - - sizeof (struct ether_header) - ETHERFCSL - 4; - - (void) memset(&mac_addr, 0xff, sizeof (mac_addr)); - - /* Register with vio_net. */ - vio = &nhd->hio.vio; - if ((*vio->__register)(mac_info, VIO_NET_RES_HYBRID, - nxge->hio_mac_addr, mac_addr, &vr->vhp, &vio->cb)) { - NXGE_DEBUG_MSG((nxge, HIO_CTL, "HIO registration() failed")); - KMEM_FREE(mac_info->m_src_addr, MAXMACADDRLEN); - KMEM_FREE(mac_info->m_dst_addr, MAXMACADDRLEN); - mac_free(mac_info); - return (NXGE_ERROR); + status = nxge_mac_register(nxge); + if (status != NXGE_OK) { + cmn_err(CE_WARN, "nxge(%d): nxge_mac_register failed\n", + nxge->instance); + return (status); } - KMEM_FREE(mac_info->m_src_addr, MAXMACADDRLEN); - KMEM_FREE(mac_info->m_dst_addr, MAXMACADDRLEN); - mac_free(mac_info); - nxge->hio_vr = vr; /* For faster lookups. 
*/ NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_vr_add")); diff --git a/usr/src/uts/common/io/nxge/nxge_main.c b/usr/src/uts/common/io/nxge/nxge_main.c index a1ab453851..c0020bdac4 100644 --- a/usr/src/uts/common/io/nxge/nxge_main.c +++ b/usr/src/uts/common/io/nxge/nxge_main.c @@ -272,14 +272,11 @@ static void nxge_m_stop(void *); static int nxge_m_multicst(void *, boolean_t, const uint8_t *); static int nxge_m_promisc(void *, boolean_t); static void nxge_m_ioctl(void *, queue_t *, mblk_t *); -static nxge_status_t nxge_mac_register(p_nxge_t); +nxge_status_t nxge_mac_register(p_nxge_t); static int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, int slot, int rdctbl, boolean_t usetbl); void nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory); -#if defined(sun4v) -extern mblk_t *nxge_m_tx(void *arg, mblk_t *mp); -#endif static void nxge_m_getfactaddr(void *, uint_t, uint8_t *); static boolean_t nxge_m_getcapab(void *, mac_capab_t, void *); @@ -630,11 +627,6 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) if (nxgep->niu_type != N2_NIU) { nxge_set_pci_replay_timeout(nxgep); } -#if defined(sun4v) - if (isLDOMguest(nxgep)) { - nxge_m_callbacks.mc_tx = nxge_m_tx; - } -#endif #if defined(sun4v) /* This is required by nxge_hio_init(), which follows. */ @@ -961,11 +953,7 @@ nxge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) (void) nxge_link_monitor(nxgep, LINK_MONITOR_STOP); - if (isLDOMguest(nxgep)) { - if (nxgep->nxge_mac_state == NXGE_MAC_STARTED) - nxge_m_stop((void *)nxgep); - nxge_hio_unregister(nxgep); - } else if (nxgep->mach && (status = mac_unregister(nxgep->mach)) != 0) { + if (nxgep->mach && (status = mac_unregister(nxgep->mach)) != 0) { NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_detach status = 0x%08X", status)); return (DDI_FAILURE); @@ -4294,10 +4282,13 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_CAPAB_MULTIFACTADDR: { mac_capab_multifactaddr_t *mfacp = cap_data; - mutex_enter(nxgep->genlock); - mfacp->mcm_naddr = nxgep->nxge_mmac_info.num_factory_mmac; - mfacp->mcm_getaddr = nxge_m_getfactaddr; - mutex_exit(nxgep->genlock); + if (!isLDOMguest(nxgep)) { + mutex_enter(nxgep->genlock); + mfacp->mcm_naddr = + nxgep->nxge_mmac_info.num_factory_mmac; + mfacp->mcm_getaddr = nxge_m_getfactaddr; + mutex_exit(nxgep->genlock); + } break; } @@ -4325,34 +4316,68 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) mutex_enter(nxgep->genlock); if (cap_rings->mr_type == MAC_RING_TYPE_RX) { - cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; - cap_rings->mr_rnum = p_cfgp->max_rdcs; - cap_rings->mr_rget = nxge_fill_ring; - cap_rings->mr_gnum = p_cfgp->max_rdc_grpids; - cap_rings->mr_gget = nxge_hio_group_get; - cap_rings->mr_gaddring = nxge_group_add_ring; - cap_rings->mr_gremring = nxge_group_rem_ring; + if (isLDOMguest(nxgep)) { + cap_rings->mr_group_type = + MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = + NXGE_HIO_SHARE_MAX_CHANNELS; + cap_rings->mr_rget = nxge_fill_ring; + cap_rings->mr_gnum = 1; + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + } else { + /* + * Service Domain. 
+ */ + cap_rings->mr_group_type = + MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->max_rdcs; + cap_rings->mr_rget = nxge_fill_ring; + cap_rings->mr_gnum = p_cfgp->max_rdc_grpids; + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; + } NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_m_getcapab: rx nrings[%d] ngroups[%d]", p_cfgp->max_rdcs, p_cfgp->max_rdc_grpids)); } else { - cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; - cap_rings->mr_rnum = p_cfgp->tdc.count; - cap_rings->mr_rget = nxge_fill_ring; - if (isLDOMservice(nxgep)) { - /* share capable */ - /* Do not report the default ring: hence -1 */ + /* + * TX Rings. + */ + if (isLDOMguest(nxgep)) { + cap_rings->mr_group_type = + MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = + NXGE_HIO_SHARE_MAX_CHANNELS; + cap_rings->mr_rget = nxge_fill_ring; + cap_rings->mr_gnum = 0; + cap_rings->mr_gget = NULL; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + } else { + /* + * Service Domain. + */ + cap_rings->mr_group_type = + MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->tdc.count; + cap_rings->mr_rget = nxge_fill_ring; + + /* + * Share capable. + * + * Do not report the default group: hence -1 + */ cap_rings->mr_gnum = NXGE_MAX_TDC_GROUPS / nxgep->nports - 1; - } else { - cap_rings->mr_gnum = 0; + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; } - cap_rings->mr_gget = nxge_hio_group_get; - cap_rings->mr_gaddring = nxge_group_add_ring; - cap_rings->mr_gremring = nxge_group_rem_ring; - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_getcapab: tx rings # of rings %d", p_cfgp->tdc.count)); @@ -6372,7 +6397,7 @@ nxge_intrs_disable(p_nxge_t nxgep) NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_intrs_disable")); } -static nxge_status_t +nxge_status_t nxge_mac_register(p_nxge_t nxgep) { mac_register_t *macp; @@ -6386,7 +6411,13 @@ nxge_mac_register(p_nxge_t nxgep) macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; macp->m_driver = nxgep; macp->m_dip = nxgep->dip; - macp->m_src_addr = nxgep->ouraddr.ether_addr_octet; + if (!isLDOMguest(nxgep)) { + macp->m_src_addr = nxgep->ouraddr.ether_addr_octet; + } else { + macp->m_src_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP); + macp->m_dst_addr = KMEM_ZALLOC(MAXMACADDRLEN, KM_SLEEP); + (void) memset(macp->m_src_addr, 0xff, sizeof (MAXMACADDRLEN)); + } macp->m_callbacks = &nxge_m_callbacks; macp->m_min_sdu = 0; nxgep->mac.default_mtu = nxgep->mac.maxframesize - @@ -6395,7 +6426,12 @@ nxge_mac_register(p_nxge_t nxgep) macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = nxge_priv_props; macp->m_priv_prop_count = NXGE_MAX_PRIV_PROPS; - macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE; + if (isLDOMguest(nxgep)) { + macp->m_v12n = MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE; + } else { + macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | \ + MAC_VIRT_SERIALIZE; + } NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==> nxge_mac_register: instance %d " @@ -6406,6 +6442,10 @@ nxge_mac_register(p_nxge_t nxgep) NXGE_EHEADER_VLAN_CRC)); status = mac_register(macp, &nxgep->mach); + if (isLDOMguest(nxgep)) { + KMEM_FREE(macp->m_src_addr, MAXMACADDRLEN); + KMEM_FREE(macp->m_dst_addr, MAXMACADDRLEN); + } mac_free(macp); if (status != 0) { diff --git a/usr/src/uts/common/io/nxge/nxge_rxdma.c b/usr/src/uts/common/io/nxge/nxge_rxdma.c index 313e76c8f0..4b427d1a8d 100644 --- a/usr/src/uts/common/io/nxge/nxge_rxdma.c +++ 
b/usr/src/uts/common/io/nxge/nxge_rxdma.c @@ -1756,7 +1756,7 @@ nxge_rx_intr(void *arg1, void *arg2) uint8_t channel; npi_handle_t handle; rx_dma_ctl_stat_t cs; - p_rx_rcr_ring_t rcr_ring; + p_rx_rcr_ring_t rcrp; mblk_t *mp = NULL; if (ldvp == NULL) { @@ -1789,7 +1789,7 @@ nxge_rx_intr(void *arg1, void *arg2) /* * Get the ring to enable us to process packets. */ - rcr_ring = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index]; + rcrp = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index]; /* * The RCR ring lock must be held when packets @@ -1799,7 +1799,7 @@ nxge_rx_intr(void *arg1, void *arg2) * (will cause fatal errors such as rcrincon bit set) * and the setting of the poll_flag. */ - MUTEX_ENTER(&rcr_ring->lock); + MUTEX_ENTER(&rcrp->lock); /* * Get the control and status for this channel. @@ -1840,12 +1840,12 @@ nxge_rx_intr(void *arg1, void *arg2) mgm.value); } } - MUTEX_EXIT(&rcr_ring->lock); + MUTEX_EXIT(&rcrp->lock); return (DDI_INTR_CLAIMED); } - ASSERT(rcr_ring->ldgp == ldgp); - ASSERT(rcr_ring->ldvp == ldvp); + ASSERT(rcrp->ldgp == ldgp); + ASSERT(rcrp->ldvp == ldvp); RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, channel, &cs.value); @@ -1856,8 +1856,8 @@ nxge_rx_intr(void *arg1, void *arg2) cs.bits.hdw.rcrto, cs.bits.hdw.rcrthres)); - if (rcr_ring->poll_flag == 0) { - mp = nxge_rx_pkts(nxgep, rcr_ring, cs, -1); + if (!rcrp->poll_flag) { + mp = nxge_rx_pkts(nxgep, rcrp, cs, -1); } /* error events. */ @@ -1873,27 +1873,34 @@ nxge_rx_intr(void *arg1, void *arg2) * these two edge triggered bits. */ cs.value &= RX_DMA_CTL_STAT_WR1C; - cs.bits.hdw.mex = rcr_ring->poll_flag ? 0 : 1; + cs.bits.hdw.mex = rcrp->poll_flag ? 0 : 1; RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel, cs.value); /* * If the polling mode is enabled, disable the interrupt. */ - if (rcr_ring->poll_flag) { + if (rcrp->poll_flag) { NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_intr: rdc %d ldgp $%p ldvp $%p " "(disabling interrupts)", channel, ldgp, ldvp)); + /* * Disarm this logical group if this is a single device * group. 
*/ if (ldgp->nldvs == 1) { - ldgimgm_t mgm; - mgm.value = 0; - mgm.bits.ldw.arm = 0; - NXGE_REG_WR64(handle, - LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + if (isLDOMguest(nxgep)) { + ldgp->arm = B_FALSE; + nxge_hio_ldgimgn(nxgep, ldgp); + } else { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 0; + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), + mgm.value); + } } } else { /* @@ -1920,24 +1927,11 @@ nxge_rx_intr(void *arg1, void *arg2) "==> nxge_rx_intr: rdc %d ldgp $%p " "exiting ISR (and call mac_rx_ring)", channel, ldgp)); } - MUTEX_EXIT(&rcr_ring->lock); + MUTEX_EXIT(&rcrp->lock); if (mp != NULL) { - if (!isLDOMguest(nxgep)) - mac_rx_ring(nxgep->mach, rcr_ring->rcr_mac_handle, mp, - rcr_ring->rcr_gen_num); -#if defined(sun4v) - else { /* isLDOMguest(nxgep) */ - nxge_hio_data_t *nhd = (nxge_hio_data_t *) - nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - if (vio->cb.vio_net_rx_cb) { - (*vio->cb.vio_net_rx_cb) - (nxgep->hio_vr->vhp, mp); - } - } -#endif + mac_rx_ring(nxgep->mach, rcrp->rcr_mac_handle, mp, + rcrp->rcr_gen_num); } NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: DDI_INTR_CLAIMED")); return (DDI_INTR_CLAIMED); @@ -2720,6 +2714,7 @@ nxge_enable_poll(void *arg) uint32_t channel; if (ring_handle == NULL) { + ASSERT(ring_handle != NULL); return (0); } @@ -2760,6 +2755,7 @@ nxge_disable_poll(void *arg) uint32_t channel; if (ring_handle == NULL) { + ASSERT(ring_handle != NULL); return (0); } @@ -2816,12 +2812,18 @@ nxge_disable_poll(void *arg) "==> nxge_disable_poll: rdc %d ldgp $%p (enable intr)", ringp->rdc, ldgp)); if (ldgp->nldvs == 1) { - ldgimgm_t mgm; - mgm.value = 0; - mgm.bits.ldw.arm = 1; - mgm.bits.ldw.timer = ldgp->ldg_timer; - NXGE_REG_WR64(handle, - LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + if (isLDOMguest(nxgep)) { + ldgp->arm = B_TRUE; + nxge_hio_ldgimgn(nxgep, ldgp); + } else { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 1; + mgm.bits.ldw.timer = ldgp->ldg_timer; + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), + mgm.value); + } } ringp->poll_flag = 0; } diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c index 16ce76ccad..4f7edf292a 100644 --- a/usr/src/uts/common/io/nxge/nxge_send.c +++ b/usr/src/uts/common/io/nxge/nxge_send.c @@ -66,20 +66,9 @@ nxge_tx_ring_task(void *arg) (void) nxge_txdma_reclaim(ring->nxgep, ring, 0); MUTEX_EXIT(&ring->lock); - if (!isLDOMguest(ring->nxgep) && !ring->tx_ring_offline) + if (!ring->tx_ring_offline) { mac_tx_ring_update(ring->nxgep->mach, ring->tx_ring_handle); -#if defined(sun4v) - else { - nxge_hio_data_t *nhd = - (nxge_hio_data_t *)ring->nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - /* Call back vnet. */ - if (vio->cb.vio_net_tx_update) { - (*vio->cb.vio_net_tx_update)(ring->nxgep->hio_vr->vhp); - } } -#endif } static void @@ -141,65 +130,6 @@ nxge_tx_ring_send(void *arg, mblk_t *mp) return ((mblk_t *)NULL); } -#if defined(sun4v) - -/* - * Hashing policy for load balancing over the set of TX rings - * available to the driver. - */ -static uint8_t nxge_tx_hash_policy = MAC_PKT_HASH_L4; - -/* - * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in - * the guest domain. See CR 6778758 for long term solution. - * - * The guest domain driver will for now hash the packet - * to pick a DMA channel from the only group it has group 0. 
- */ - -mblk_t * -nxge_m_tx(void *arg, mblk_t *mp) -{ - p_nxge_t nxgep = (p_nxge_t)arg; - mblk_t *next; - uint64_t rindex; - p_tx_ring_t tx_ring_p; - int status; - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx")); - - /* - * Hash to pick a ring from Group 0, the only TX group - * for a guest domain driver. - */ - rindex = mac_pkt_hash(DL_ETHER, mp, nxge_tx_hash_policy, B_TRUE); - rindex = rindex % nxgep->pt_config.tdc_grps[0].max_tdcs; - - /* - * Get the ring handle. - */ - tx_ring_p = nxgep->tx_rings->rings[rindex]; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - status = nxge_start(nxgep, tx_ring_p, mp); - if (status != 0) { - mp->b_next = next; - nxge_tx_ring_dispatch(tx_ring_p); - return (mp); - } - - mp = next; - } - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_m_tx")); - return ((mblk_t *)NULL); -} - -#endif - int nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp) { diff --git a/usr/src/uts/common/io/nxge/nxge_virtual.c b/usr/src/uts/common/io/nxge/nxge_virtual.c index ff78d828d6..c0468f8fed 100644 --- a/usr/src/uts/common/io/nxge/nxge_virtual.c +++ b/usr/src/uts/common/io/nxge/nxge_virtual.c @@ -3994,6 +3994,9 @@ nxge_get_rxring_index(p_nxge_t nxgep, int groupid, int ringidx) p_dma_cfgp = &nxgep->pt_config; p_cfgp = &p_dma_cfgp->hw_config; + if (isLDOMguest(nxgep)) + return (ringidx); + for (i = 0; i < groupid; i++) { rdc_grp_p = &p_dma_cfgp->rdc_grps[p_cfgp->def_mac_rxdma_grpid + i]; diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 4acc126a8f..6174dd1a72 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -120,7 +120,7 @@ extern void mac_rx_client_quiesce(mac_client_handle_t); extern void mac_rx_client_restart(mac_client_handle_t); extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, - mac_ring_handle_t *); + mac_ring_handle_t *, mac_ring_type_t); extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t); extern void mac_hwring_teardown(mac_ring_handle_t); extern int mac_hwring_disable_intr(mac_ring_handle_t); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 3c2e30f37e..a93335606f 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -262,7 +262,7 @@ struct mac_group_s { #define MAC_RING_TX_DEFAULT(mip, mp) \ ((mip->mi_default_tx_ring == NULL) ? 
\ mip->mi_tx(mip->mi_driver, mp) : \ - mac_ring_tx(mip->mi_default_tx_ring, mp)) + mac_hwring_tx(mip->mi_default_tx_ring, mp)) #define MAC_TX(mip, ring, mp, mcip) { \ /* \ @@ -275,7 +275,7 @@ struct mac_group_s { (ring == NULL)) \ mp = MAC_RING_TX_DEFAULT(mip, mp); \ else \ - mp = mac_ring_tx(ring, mp); \ + mp = mac_hwring_tx(ring, mp); \ } /* mci_tx_flag */ @@ -585,7 +585,7 @@ extern int mac_group_addmac(mac_group_t *, const uint8_t *); extern int mac_group_remmac(mac_group_t *, const uint8_t *); extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *, mac_group_t *); -extern mblk_t *mac_ring_tx(mac_ring_handle_t, mblk_t *); +extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); extern mac_ring_t *mac_reserve_tx_ring(mac_impl_t *, mac_ring_t *); extern void mac_release_tx_ring(mac_ring_handle_t); extern mac_group_t *mac_reserve_tx_group(mac_impl_t *, mac_share_handle_t); diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h index 4973b84215..4b07fb4e9f 100644 --- a/usr/src/uts/common/sys/mac_soft_ring.h +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -131,6 +131,9 @@ typedef struct mac_srs_tx_s { void *st_arg1; void *st_arg2; mac_group_t *st_group; /* TX group for share */ + uint32_t st_ring_count; /* no. of tx rings */ + mac_ring_handle_t *st_rings; + boolean_t st_woken_up; /* diff --git a/usr/src/uts/common/sys/nxge/nxge_hio.h b/usr/src/uts/common/sys/nxge/nxge_hio.h index d57a5424eb..b18f32e346 100644 --- a/usr/src/uts/common/sys/nxge/nxge_hio.h +++ b/usr/src/uts/common/sys/nxge/nxge_hio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,9 +35,6 @@ extern "C" { #include <nxge_ipp.h> #include <nxge_fflp.h> #include <sys/mac_provider.h> -#if defined(sun4v) -#include <sys/vnet_res.h> -#endif #define isLDOMservice(nxge) \ (nxge->environs == SOLARIS_SERVICE_DOMAIN) @@ -46,6 +43,9 @@ extern "C" { #define isLDOMs(nxge) \ (isLDOMservice(nxge) || isLDOMguest(nxge)) +#define NXGE_HIO_SHARE_MIN_CHANNELS 2 +#define NXGE_HIO_SHARE_MAX_CHANNELS 2 + /* ------------------------------------------------------------------ */ typedef uint8_t nx_rdc_t; typedef uint8_t nx_tdc_t; @@ -88,37 +88,19 @@ typedef struct { dc_getinfo getinfo; } nxhv_dc_fp_t; -#if defined(sun4v) -typedef struct { - vio_net_resource_reg_t __register; - vio_net_resource_unreg_t unregister; - - vio_net_callbacks_t cb; - -} nx_vio_fp_t; -#endif - typedef struct { boolean_t ldoms; - nxhv_vr_fp_t vr; nxhv_dc_fp_t tx; nxhv_dc_fp_t rx; - -#if defined(sun4v) - nx_vio_fp_t vio; -#endif - } nxhv_fp_t; /* ------------------------------------------------------------------ */ #define NXGE_VR_SR_MAX 8 /* There are 8 subregions (SR). */ typedef enum { - NXGE_HIO_TYPE_SERVICE, /* We are a service domain driver. */ NXGE_HIO_TYPE_GUEST /* We are a guest domain driver. 
*/ - } nxge_hio_type_t; typedef enum { @@ -130,7 +112,6 @@ typedef enum { FUNC2_VIR = 0x5000000, FUNC3_MNT = 0x6000000, FUNC3_VIR = 0x7000000 - } vr_base_address_t; #define VR_STEP 0x2000000 @@ -146,7 +127,6 @@ typedef enum { /* 0-8 */ FUNC3_VIR0, FUNC3_VIR1, FUNC_VIR_MAX - } vr_region_t; typedef enum { @@ -159,13 +139,11 @@ typedef enum { VP_CHANNEL_6, VP_CHANNEL_7, VP_CHANNEL_MAX - } vp_channel_t; typedef enum { VP_BOUND_TX = 1, VP_BOUND_RX - } vpc_type_t; #define VP_VC_OFFSET(channel) (channel << 10) @@ -254,9 +232,6 @@ typedef struct nxge_hio_vr { ether_addr_t altmac; /* The alternate MAC address. */ int slot; /* According to nxge_m_mmac_add(). */ -#if defined(sun4v) - vio_net_handle_t vhp; /* The handle given to us by the vnet. */ -#endif nxge_grp_t rx_group; nxge_grp_t tx_group; @@ -273,7 +248,6 @@ typedef struct { uint64_t map; /* Currently unused */ int vector; /* The DDI vector number (index) */ - } hio_ldg_t; /* diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c index 32b67b2588..884665b77f 100644 --- a/usr/src/uts/sun4v/io/vnet.c +++ b/usr/src/uts/sun4v/io/vnet.c @@ -40,6 +40,8 @@ #include <sys/dlpi.h> #include <net/if.h> #include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> @@ -75,11 +77,38 @@ static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp); #ifdef VNET_IOC_DEBUG static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp); #endif +static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data); +static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index, + const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle); +static void vnet_get_group(void *arg, mac_ring_type_t type, const int index, + mac_group_info_t *infop, mac_group_handle_t handle); +static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num); +static void vnet_rx_ring_stop(mac_ring_driver_t rdriver); +static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num); +static void vnet_tx_ring_stop(mac_ring_driver_t rdriver); +static int vnet_ring_enable_intr(void *arg); +static int vnet_ring_disable_intr(void *arg); +static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup); +static int vnet_addmac(void *arg, const uint8_t *mac_addr); +static int vnet_remmac(void *arg, const uint8_t *mac_addr); /* vnet internal functions */ static int vnet_unattach(vnet_t *vnetp); +static void vnet_ring_grp_init(vnet_t *vnetp); +static void vnet_ring_grp_uninit(vnet_t *vnetp); static int vnet_mac_register(vnet_t *); static int vnet_read_mac_address(vnet_t *vnetp); +static int vnet_bind_vgenring(vnet_res_t *vresp); +static void vnet_unbind_vgenring(vnet_res_t *vresp); +static int vnet_bind_hwrings(vnet_t *vnetp); +static void vnet_unbind_hwrings(vnet_t *vnetp); +static int vnet_bind_rings(vnet_res_t *vresp); +static void vnet_unbind_rings(vnet_res_t *vresp); +static int vnet_hio_stat(void *, uint_t, uint64_t *); +static int vnet_hio_start(void *); +static void vnet_hio_stop(void *); +static void vnet_hio_notify_cb(void *arg, mac_notify_type_t type); +mblk_t *vnet_hio_tx(void *, mblk_t *); /* Forwarding database (FDB) routines */ static void vnet_fdb_create(vnet_t *vnetp); @@ -98,6 +127,8 @@ static void vnet_stop_resources(vnet_t *vnetp); static void vnet_dispatch_res_task(vnet_t *vnetp); static void vnet_res_start_task(void *arg); static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err); 
+static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp); +static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp); /* Exported to vnet_gen */ int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu); @@ -112,15 +143,21 @@ static void vnet_hio_destroy_kstats(kstat_t *ksp); /* Exported to to vnet_dds */ int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg); +int vnet_hio_mac_init(vnet_t *vnetp, char *ifname); +void vnet_hio_mac_cleanup(vnet_t *vnetp); /* Externs that are imported from vnet_gen */ extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip, const uint8_t *macaddr, void **vgenhdl); +extern int vgen_init_mdeg(void *arg); extern void vgen_uninit(void *arg); extern int vgen_dds_tx(void *arg, void *dmsg); extern void vgen_mod_init(void); extern int vgen_mod_cleanup(void); extern void vgen_mod_fini(void); +extern int vgen_enable_intr(void *arg); +extern int vgen_disable_intr(void *arg); +extern mblk_t *vgen_poll(void *arg, int bytes_to_pickup); /* Externs that are imported from vnet_dds */ extern void vdds_mod_init(void); @@ -131,6 +168,9 @@ extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg); extern void vdds_cleanup_hybrid_res(void *arg); extern void vdds_cleanup_hio(vnet_t *vnetp); +/* Externs imported from mac_impl */ +extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); + #define DRV_NAME "vnet" #define VNET_FDBE_REFHOLD(p) \ { \ @@ -145,9 +185,9 @@ extern void vdds_cleanup_hio(vnet_t *vnetp); } #ifdef VNET_IOC_DEBUG -#define VNET_M_CALLBACK_FLAGS (MC_IOCTL) +#define VNET_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) #else -#define VNET_M_CALLBACK_FLAGS (0) +#define VNET_M_CALLBACK_FLAGS (MC_GETCAPAB) #endif static mac_callbacks_t vnet_m_callbacks = { @@ -157,9 +197,23 @@ static mac_callbacks_t vnet_m_callbacks = { vnet_m_stop, vnet_m_promisc, vnet_m_multicst, - vnet_m_unicst, - vnet_m_tx, + NULL, /* m_unicst entry must be NULL while rx rings are exposed */ + NULL, /* m_tx entry must be NULL while tx rings are exposed */ vnet_m_ioctl, + vnet_m_capab, + NULL +}; + +static mac_callbacks_t vnet_hio_res_callbacks = { + 0, + vnet_hio_stat, + vnet_hio_start, + vnet_hio_stop, + NULL, + NULL, + NULL, + vnet_hio_tx, + NULL, NULL, NULL }; @@ -176,6 +230,9 @@ uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */ uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT; /* tx timeout in msec */ uint32_t vnet_ldc_mtu = VNET_LDC_MTU; /* ldc mtu */ +/* Configure tx serialization in mac layer for the vnet device */ +boolean_t vnet_mac_tx_serialize = B_TRUE; + /* * Set this to non-zero to enable additional internal receive buffer pools * based on the MTU of the device for better performance at the cost of more @@ -206,6 +263,11 @@ static struct ether_addr etherbroadcastaddr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; +/* mac_open() retry delay in usec */ +uint32_t vnet_mac_open_delay = 100; /* 0.1 ms */ + +/* max # of mac_open() retries */ +uint32_t vnet_mac_open_retries = 100; /* * Property names @@ -375,6 +437,9 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL); attach_progress |= AST_vnet_alloc; + vnet_ring_grp_init(vnetp); + attach_progress |= AST_ring_init; + status = vdds_init(vnetp); if (status != 0) { goto vnet_attach_fail; @@ -419,10 +484,19 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) attach_progress |= AST_vnet_list; /* - * Initialize the generic vnet plugin which provides - * communication via sun4v LDC (logical domain channel) based - * resources. 
It will register the LDC resources as and when - * they become available. + * Initialize the generic vnet plugin which provides communication via + * sun4v LDC (logical domain channel) based resources. This involves 2 + * steps; first, vgen_init() is invoked to read the various properties + * of the vnet device from its MD node (including its mtu which is + * needed to mac_register()) and obtain a handle to the vgen layer. + * After mac_register() is done and we have a mac handle, we then + * invoke vgen_init_mdeg() which registers with the the MD event + * generator (mdeg) framework to allow LDC resource notifications. + * Note: this sequence also allows us to report the correct default # + * of pseudo rings (2TX and 3RX) in vnet_m_capab() which gets invoked + * in the context of mac_register(); and avoids conflicting with + * dynamic pseudo rx rings which get added/removed as a result of mdeg + * events in vgen. */ status = vgen_init(vnetp, reg, vnetp->dip, (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl); @@ -432,15 +506,19 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) } attach_progress |= AST_vgen_init; - /* register with MAC layer */ status = vnet_mac_register(vnetp); if (status != DDI_SUCCESS) { goto vnet_attach_fail; } vnetp->link_state = LINK_STATE_UNKNOWN; - attach_progress |= AST_macreg; + status = vgen_init_mdeg(vnetp->vgenhdl); + if (status != DDI_SUCCESS) { + goto vnet_attach_fail; + } + attach_progress |= AST_init_mdeg; + vnetp->attach_progress = attach_progress; DBG1(NULL, "instance(%d) exit\n", instance); @@ -503,21 +581,25 @@ vnet_unattach(vnet_t *vnetp) attach_progress = vnetp->attach_progress; /* - * Unregister from the gldv3 subsystem. This can fail, in particular - * if there are still any open references to this mac device; in which - * case we just return failure without continuing to detach further. + * Disable the mac device in the gldv3 subsystem. This can fail, in + * particular if there are still any open references to this mac + * device; in which case we just return failure without continuing to + * detach further. + * If it succeeds, we then invoke vgen_uninit() which should unregister + * any pseudo rings registered with the mac layer. Note we keep the + * AST_macreg flag on, so we can unregister with the mac layer at + * the end of this routine. */ if (attach_progress & AST_macreg) { - if (mac_unregister(vnetp->mh) != 0) { + if (mac_disable(vnetp->mh) != 0) { return (1); } - attach_progress &= ~AST_macreg; } /* - * Now that we have unregistered from gldv3, we must finish all other - * steps and successfully return from this function; otherwise we will - * end up leaving the device in a broken/unusable state. + * Now that we have disabled the device, we must finish all other steps + * and successfully return from this function; otherwise we will end up + * leaving the device in a broken/unusable state. * * First, release any hybrid resources assigned to this vnet device. */ @@ -530,9 +612,10 @@ vnet_unattach(vnet_t *vnetp) * Uninit vgen. This stops further mdeg callbacks to this vnet * device and/or its ports; and detaches any existing ports. */ - if (attach_progress & AST_vgen_init) { + if (attach_progress & (AST_vgen_init|AST_init_mdeg)) { vgen_uninit(vnetp->vgenhdl); attach_progress &= ~AST_vgen_init; + attach_progress &= ~AST_init_mdeg; } /* Destroy the taskq. 
*/ @@ -563,6 +646,17 @@ vnet_unattach(vnet_t *vnetp) attach_progress &= ~AST_vnet_list; } + if (attach_progress & AST_ring_init) { + vnet_ring_grp_uninit(vnetp); + attach_progress &= ~AST_ring_init; + } + + if (attach_progress & AST_macreg) { + VERIFY(mac_unregister(vnetp->mh) == 0); + vnetp->mh = NULL; + attach_progress &= ~AST_macreg; + } + if (attach_progress & AST_vnet_alloc) { rw_destroy(&vnetp->vrwlock); rw_destroy(&vnetp->vsw_fp_rw); @@ -683,8 +777,9 @@ vnet_m_promisc(void *arg, boolean_t on) * external hosts. */ mblk_t * -vnet_m_tx(void *arg, mblk_t *mp) +vnet_tx_ring_send(void *arg, mblk_t *mp) { + vnet_pseudo_tx_ring_t *tx_ringp; vnet_t *vnetp; vnet_res_t *vresp; mblk_t *next; @@ -694,8 +789,10 @@ vnet_m_tx(void *arg, mblk_t *mp) boolean_t is_unicast; boolean_t is_pvid; /* non-default pvid ? */ boolean_t hres; /* Hybrid resource ? */ + void *tx_arg; - vnetp = (vnet_t *)arg; + tx_ringp = (vnet_pseudo_tx_ring_t *)arg; + vnetp = (vnet_t *)tx_ringp->vnetp; DBG1(vnetp, "enter\n"); ASSERT(mp != NULL); @@ -790,10 +887,14 @@ vnet_m_tx(void *arg, mblk_t *mp) } } - } - macp = &vresp->macreg; - resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp); + macp = &vresp->macreg; + tx_arg = tx_ringp; + } else { + macp = &vresp->macreg; + tx_arg = macp->m_driver; + } + resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp); /* tx done; now release ref on fdb entry */ VNET_FDBE_REFRELE(vresp); @@ -848,6 +949,124 @@ vnet_m_stat(void *arg, uint_t stat, uint64_t *val) return (0); } +static void +vnet_ring_grp_init(vnet_t *vnetp) +{ + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + vnet_pseudo_tx_group_t *tx_grp; + vnet_pseudo_tx_ring_t *tx_ringp; + int i; + + tx_grp = &vnetp->tx_grp[0]; + tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) * + VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP); + for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) { + tx_ringp[i].state |= VNET_TXRING_SHARED; + } + tx_grp->rings = tx_ringp; + tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS; + + rx_grp = &vnetp->rx_grp[0]; + rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP; + rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL); + rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) * + rx_grp->max_ring_cnt, KM_SLEEP); + + /* + * Setup the first 3 Pseudo RX Rings that are reserved; + * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource. 
+ */ + rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE; + rx_ringp[0].index = 0; + rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID; + rx_ringp[1].index = 1; + rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID; + rx_ringp[2].index = 2; + + rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; + rx_grp->rings = rx_ringp; + + for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; + i < rx_grp->max_ring_cnt; i++) { + rx_ringp = &rx_grp->rings[i]; + rx_ringp->state = VNET_RXRING_FREE; + rx_ringp->index = i; + } +} + +static void +vnet_ring_grp_uninit(vnet_t *vnetp) +{ + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_tx_group_t *tx_grp; + + tx_grp = &vnetp->tx_grp[0]; + if (tx_grp->rings != NULL) { + ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS); + kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) * + tx_grp->ring_cnt); + tx_grp->rings = NULL; + } + + rx_grp = &vnetp->rx_grp[0]; + if (rx_grp->rings != NULL) { + ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP); + ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT); + kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) * + rx_grp->max_ring_cnt); + rx_grp->rings = NULL; + } +} + +static vnet_pseudo_rx_ring_t * +vnet_alloc_pseudo_rx_ring(vnet_t *vnetp) +{ + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + int index; + + rx_grp = &vnetp->rx_grp[0]; + WRITE_ENTER(&rx_grp->lock); + + if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) { + /* no rings available */ + RW_EXIT(&rx_grp->lock); + return (NULL); + } + + for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT; + index < rx_grp->max_ring_cnt; index++) { + rx_ringp = &rx_grp->rings[index]; + if (rx_ringp->state == VNET_RXRING_FREE) { + rx_ringp->state |= VNET_RXRING_INUSE; + rx_grp->ring_cnt++; + break; + } + } + + RW_EXIT(&rx_grp->lock); + return (rx_ringp); +} + +static void +vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp) +{ + vnet_pseudo_rx_group_t *rx_grp; + + ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT); + rx_grp = &vnetp->rx_grp[0]; + WRITE_ENTER(&rx_grp->lock); + + if (ringp->state != VNET_RXRING_FREE) { + ringp->state = VNET_RXRING_FREE; + ringp->handle = NULL; + rx_grp->ring_cnt--; + } + + RW_EXIT(&rx_grp->lock); +} + /* wrapper function for mac_register() */ static int vnet_mac_register(vnet_t *vnetp) @@ -867,6 +1086,15 @@ vnet_mac_register(vnet_t *vnetp) macp->m_margin = VLAN_TAGSZ; /* + * MAC_VIRT_SERIALIZE flag is needed while hybridIO is enabled to + * workaround tx lock contention issues in nxge. + */ + macp->m_v12n = MAC_VIRT_LEVEL1; + if (vnet_mac_tx_serialize == B_TRUE) { + macp->m_v12n |= MAC_VIRT_SERIALIZE; + } + + /* * Finally, we're ready to register ourselves with the MAC layer * interface; if this succeeds, we're all ready to start() */ @@ -1116,42 +1344,57 @@ vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp) static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp) { - vnet_res_t *vresp = (vnet_res_t *)vrh; - vnet_t *vnetp = vresp->vnetp; + vnet_res_t *vresp = (vnet_res_t *)vrh; + vnet_t *vnetp = vresp->vnetp; + vnet_pseudo_rx_ring_t *ringp; if ((vnetp == NULL) || (vnetp->mh == 0)) { freemsgchain(mp); return; } - /* - * Packets received over a hybrid resource need additional processing - * to remove the tag, for the pvid case. The underlying resource is - * not aware of the vnet's pvid and thus packets are received with the - * vlan tag in the header; unlike packets that are received over a ldc - * channel in which case the peer vnet/vsw would have already removed - * the tag. 
- */ - if (vresp->type == VIO_NET_RES_HYBRID && - vnetp->pvid != vnetp->default_vlan_id) { - - vnet_rx_frames_untag(vnetp->pvid, &mp); - if (mp == NULL) { - return; - } - } - - mac_rx(vnetp->mh, NULL, mp); + ringp = vresp->rx_ringp; + mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num); } void vnet_tx_update(vio_net_handle_t vrh) { - vnet_res_t *vresp = (vnet_res_t *)vrh; - vnet_t *vnetp = vresp->vnetp; + vnet_res_t *vresp = (vnet_res_t *)vrh; + vnet_t *vnetp = vresp->vnetp; + vnet_pseudo_tx_ring_t *tx_ringp; + vnet_pseudo_tx_group_t *tx_grp; + int i; + + if (vnetp == NULL || vnetp->mh == NULL) { + return; + } - if ((vnetp != NULL) && (vnetp->mh != NULL)) { - mac_tx_update(vnetp->mh); + /* + * Currently, the tx hwring API (used to access rings that belong to + * a Hybrid IO resource) does not provide us a per ring flow ctrl + * update; also the pseudo rings are shared by the ports/ldcs in the + * vgen layer. Thus we can't figure out which pseudo ring is being + * re-enabled for transmits. To work around this, when we get a tx + * restart notification from below, we simply propagate that to all + * the tx pseudo rings registered with the mac layer above. + * + * There are a couple of side effects with this approach, but they are + * not harmful, as outlined below: + * + * A) We might send an invalid ring_update() for a ring that is not + * really flow controlled. This will not have any effect in the mac + * layer and packets will continue to be transmitted on that ring. + * + * B) We might end up clearing the flow control in the mac layer for + * a ring that is still flow controlled in the underlying resource. + * This will result in the mac layer restarting transmit, only to be + * flow controlled again on that ring. + */ + tx_grp = &vnetp->tx_grp[0]; + for (i = 0; i < tx_grp->ring_cnt; i++) { + tx_ringp = &tx_grp->rings[i]; + mac_tx_ring_update(vnetp->mh, tx_ringp->handle); } } @@ -1233,8 +1476,8 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type, ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp, vio_net_callbacks_t *vcb) { - vnet_t *vnetp; - vnet_res_t *vresp; + vnet_t *vnetp; + vnet_res_t *vresp; vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP); ether_copy(local_macaddr, vresp->local_macaddr); @@ -1260,11 +1503,7 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type, vnetp->instance); } } - - WRITE_ENTER(&vnetp->vrwlock); - vresp->nextp = vnetp->vres_list; - vnetp->vres_list = vresp; - RW_EXIT(&vnetp->vrwlock); + vnet_add_resource(vnetp, vresp); break; } vnetp = vnetp->nextp; @@ -1281,6 +1520,14 @@ int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type, vcb->vio_net_tx_update = vnet_tx_update; vcb->vio_net_report_err = vnet_handle_res_err; + /* Bind the resource to pseudo ring(s) */ + if (vnet_bind_rings(vresp) != 0) { + (void) vnet_rem_resource(vnetp, vresp); + vnet_hio_destroy_kstats(vresp->ksp); + KMEM_FREE(vresp); + return (1); + } + /* Dispatch a task to start resources */ vnet_dispatch_res_task(vnetp); return (0); @@ -1294,8 +1541,6 @@ vio_net_resource_unreg(vio_net_handle_t vhp) { vnet_res_t *vresp = (vnet_res_t *)vhp; vnet_t *vnetp = vresp->vnetp; - vnet_res_t *vrp; - kstat_t *ksp = NULL; DBG1(NULL, "Resource Registerig hdl=0x%p", vhp); @@ -1306,7 +1551,29 @@ vio_net_resource_unreg(vio_net_handle_t vhp) */ vnet_fdbe_del(vnetp, vresp); + vnet_unbind_rings(vresp); + /* Now remove the resource from the list */ + (void) vnet_rem_resource(vnetp, vresp); + + 
vnet_hio_destroy_kstats(vresp->ksp); + KMEM_FREE(vresp); +} + +static void +vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp) +{ + WRITE_ENTER(&vnetp->vrwlock); + vresp->nextp = vnetp->vres_list; + vnetp->vres_list = vresp; + RW_EXIT(&vnetp->vrwlock); +} + +static vnet_res_t * +vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp) +{ + vnet_res_t *vrp; + WRITE_ENTER(&vnetp->vrwlock); if (vresp == vnetp->vres_list) { vnetp->vres_list = vresp->nextp; @@ -1320,15 +1587,12 @@ vio_net_resource_unreg(vio_net_handle_t vhp) vrp = vrp->nextp; } } - - ksp = vresp->ksp; - vresp->ksp = NULL; - vresp->vnetp = NULL; vresp->nextp = NULL; + RW_EXIT(&vnetp->vrwlock); - vnet_hio_destroy_kstats(ksp); - KMEM_FREE(vresp); + + return (vresp); } /* @@ -1710,6 +1974,1024 @@ vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp) } } +static boolean_t +vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data) +{ + vnet_t *vnetp = (vnet_t *)arg; + + if (vnetp == NULL) { + return (0); + } + + switch (cap) { + + case MAC_CAPAB_RINGS: { + + mac_capab_rings_t *cap_rings = cap_data; + /* + * Rings Capability Notes: + * We advertise rings to make use of the rings framework in + * gldv3 mac layer, to improve the performance. This is + * specifically needed when a Hybrid resource (with multiple + * tx/rx hardware rings) is assigned to a vnet device. We also + * leverage this for the normal case when no Hybrid resource is + * assigned. + * + * Ring Allocation: + * - TX path: + * We expose a pseudo ring group with 2 pseudo tx rings (as + * currently HybridIO exports only 2 rings) In the normal case, + * transmit traffic that comes down to the driver through the + * mri_tx (vnet_tx_ring_send()) entry point goes through the + * distributed switching algorithm in vnet and gets transmitted + * over a port/LDC in the vgen layer to either the vswitch or a + * peer vnet. If and when a Hybrid resource is assigned to the + * vnet, we obtain the tx ring information of the Hybrid device + * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings. + * Traffic being sent over the Hybrid resource by the mac layer + * gets spread across both hw rings, as they are mapped to the + * 2 pseudo tx rings in vnet. + * + * - RX path: + * We expose a pseudo ring group with 3 pseudo rx rings (static + * rings) initially. The first (default) pseudo rx ring is + * reserved for the resource that connects to the vswitch + * service. The next 2 rings are reserved for a Hybrid resource + * that may be assigned to the vnet device. If and when a + * Hybrid resource is assigned to the vnet, we obtain the rx + * ring information of the Hybrid device (nxge) and map these + * pseudo rings 1:1 to the 2 hw rx rings. For each additional + * resource that connects to a peer vnet, we dynamically + * allocate a pseudo rx ring and map it to that resource, when + * the resource gets added; and the pseudo rx ring is + * dynamically registered with the upper mac layer. We do the + * reverse and unregister the ring with the mac layer when + * the resource gets removed. + * + * Synchronization notes: + * We don't need any lock to protect members of ring structure, + * specifically ringp->hw_rh, in either the TX or the RX ring, + * as explained below. + * - TX ring: + * ring->hw_rh is initialized only when a Hybrid resource is + * associated; and gets referenced only in vnet_hio_tx(). 
The + * Hybrid resource itself is available in fdb only after tx + * hwrings are found and mapped; i.e, in vio_net_resource_reg() + * we call vnet_bind_rings() first and then call + * vnet_start_resources() which adds an entry to fdb. For + * traffic going over LDC resources, we don't reference + * ring->hw_rh at all. + * - RX ring: + * For rings mapped to Hybrid resource ring->hw_rh is + * initialized and only then do we add the rx callback for + * the underlying Hybrid resource; we disable callbacks before + * we unmap ring->hw_rh. For rings mapped to LDC resources, we + * stop the rx callbacks (in vgen) before we remove ring->hw_rh + * (vio_net_resource_unreg()). + */ + + if (cap_rings->mr_type == MAC_RING_TYPE_RX) { + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + + /* + * The ring_cnt for rx grp is initialized in + * vnet_ring_grp_init(). Later, the ring_cnt gets + * updated dynamically whenever LDC resources are added + * or removed. + */ + cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt; + cap_rings->mr_rget = vnet_get_ring; + + cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS; + cap_rings->mr_gget = vnet_get_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + } else { + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + + /* + * The ring_cnt for tx grp is initialized in + * vnet_ring_grp_init() and remains constant, as we + * do not support dymanic tx rings for now. + */ + cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt; + cap_rings->mr_rget = vnet_get_ring; + + /* + * Transmit rings are not grouped; i.e, the number of + * transmit ring groups advertised should be set to 0. + */ + cap_rings->mr_gnum = 0; + + cap_rings->mr_gget = vnet_get_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + } + return (B_TRUE); + + } + + default: + break; + + } + + return (B_FALSE); +} + +/* + * Callback funtion for MAC layer to get ring information. + */ +static void +vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index, + const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle) +{ + vnet_t *vnetp = arg; + + switch (rtype) { + + case MAC_RING_TYPE_RX: { + + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + mac_intr_t *mintr; + + /* We advertised only one RX group */ + ASSERT(g_index == 0); + rx_grp = &vnetp->rx_grp[g_index]; + + /* Check the current # of rings in the rx group */ + ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt)); + + /* Get the ring based on the index */ + rx_ringp = &rx_grp->rings[r_index]; + + rx_ringp->handle = r_handle; + /* + * Note: we don't need to save the incoming r_index in rx_ring, + * as vnet_ring_grp_init() would have initialized the index for + * each ring in the array. + */ + rx_ringp->grp = rx_grp; + rx_ringp->vnetp = vnetp; + + mintr = &infop->mri_intr; + mintr->mi_handle = (mac_intr_handle_t)rx_ringp; + mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr; + mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr; + + infop->mri_driver = (mac_ring_driver_t)rx_ringp; + infop->mri_start = vnet_rx_ring_start; + infop->mri_stop = vnet_rx_ring_stop; + + /* Set the poll function, as this is an rx ring */ + infop->mri_poll = vnet_rx_poll; + + break; + } + + case MAC_RING_TYPE_TX: { + vnet_pseudo_tx_group_t *tx_grp; + vnet_pseudo_tx_ring_t *tx_ringp; + + /* + * No need to check grp index; mac layer passes -1 for it. 
+ */ + tx_grp = &vnetp->tx_grp[0]; + + /* Check the # of rings in the tx group */ + ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt)); + + /* Get the ring based on the index */ + tx_ringp = &tx_grp->rings[r_index]; + + tx_ringp->handle = r_handle; + tx_ringp->index = r_index; + tx_ringp->grp = tx_grp; + tx_ringp->vnetp = vnetp; + + infop->mri_driver = (mac_ring_driver_t)tx_ringp; + infop->mri_start = vnet_tx_ring_start; + infop->mri_stop = vnet_tx_ring_stop; + + /* Set the transmit function, as this is a tx ring */ + infop->mri_tx = vnet_tx_ring_send; + + break; + } + + default: + break; + } +} + +/* + * Callback funtion for MAC layer to get group information. + */ +static void +vnet_get_group(void *arg, mac_ring_type_t type, const int index, + mac_group_info_t *infop, mac_group_handle_t handle) +{ + vnet_t *vnetp = (vnet_t *)arg; + + switch (type) { + + case MAC_RING_TYPE_RX: + { + vnet_pseudo_rx_group_t *rx_grp; + + /* We advertised only one RX group */ + ASSERT(index == 0); + + rx_grp = &vnetp->rx_grp[index]; + rx_grp->handle = handle; + rx_grp->index = index; + rx_grp->vnetp = vnetp; + + infop->mgi_driver = (mac_group_driver_t)rx_grp; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = vnet_addmac; + infop->mgi_remmac = vnet_remmac; + infop->mgi_count = rx_grp->ring_cnt; + + break; + } + + case MAC_RING_TYPE_TX: + { + vnet_pseudo_tx_group_t *tx_grp; + + /* We advertised only one TX group */ + ASSERT(index == 0); + + tx_grp = &vnetp->tx_grp[index]; + tx_grp->handle = handle; + tx_grp->index = index; + tx_grp->vnetp = vnetp; + + infop->mgi_driver = (mac_group_driver_t)tx_grp; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = NULL; + infop->mgi_remmac = NULL; + infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS; + + break; + } + + default: + break; + + } +} + +static int +vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num) +{ + vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; + int err; + + /* + * If this ring is mapped to a LDC resource, simply mark the state to + * indicate the ring is started and return. + */ + if ((rx_ringp->state & + (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) { + rx_ringp->gen_num = mr_gen_num; + rx_ringp->state |= VNET_RXRING_STARTED; + return (0); + } + + ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0); + + /* + * This must be a ring reserved for a hwring. If the hwring is not + * bound yet, simply mark the state to indicate the ring is started and + * return. If and when a hybrid resource is activated for this vnet + * device, we will bind the hwring and start it then. If a hwring is + * already bound, start it now. + */ + if (rx_ringp->hw_rh == NULL) { + rx_ringp->gen_num = mr_gen_num; + rx_ringp->state |= VNET_RXRING_STARTED; + return (0); + } + + err = mac_hwring_start(rx_ringp->hw_rh); + if (err == 0) { + rx_ringp->gen_num = mr_gen_num; + rx_ringp->state |= VNET_RXRING_STARTED; + } else { + err = ENXIO; + } + + return (err); +} + +static void +vnet_rx_ring_stop(mac_ring_driver_t arg) +{ + vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; + + /* + * If this ring is mapped to a LDC resource, simply mark the state to + * indicate the ring is now stopped and return. + */ + if ((rx_ringp->state & + (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) { + rx_ringp->state &= ~VNET_RXRING_STARTED; + } + + ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0); + + /* + * This must be a ring reserved for a hwring. 
If the hwring is not + * bound yet, simply mark the state to indicate the ring is stopped and + * return. If a hwring is already bound, stop it now. + */ + if (rx_ringp->hw_rh == NULL) { + rx_ringp->state &= ~VNET_RXRING_STARTED; + return; + } + + mac_hwring_stop(rx_ringp->hw_rh); + rx_ringp->state &= ~VNET_RXRING_STARTED; +} + +/* ARGSUSED */ +static int +vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num) +{ + vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg; + + tx_ringp->state |= VNET_TXRING_STARTED; + return (0); +} + +static void +vnet_tx_ring_stop(mac_ring_driver_t arg) +{ + vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg; + + tx_ringp->state &= ~VNET_TXRING_STARTED; +} + +/* + * Disable polling for a ring and enable its interrupt. + */ +static int +vnet_ring_enable_intr(void *arg) +{ + vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; + vnet_res_t *vresp; + + if (rx_ringp->hw_rh == NULL) { + /* + * Ring enable intr func is being invoked, but the ring is + * not bound to any underlying resource ? This must be a ring + * reserved for Hybrid resource and no such resource has been + * assigned to this vnet device yet. We simply return success. + */ + ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0); + return (0); + } + + /* + * The rx ring has been bound to either a LDC or a Hybrid resource. + * Call the appropriate function to enable interrupts for the ring. + */ + if (rx_ringp->state & VNET_RXRING_HYBRID) { + return (mac_hwring_enable_intr(rx_ringp->hw_rh)); + } else { + vresp = (vnet_res_t *)rx_ringp->hw_rh; + return (vgen_enable_intr(vresp->macreg.m_driver)); + } +} + +/* + * Enable polling for a ring and disable its interrupt. + */ +static int +vnet_ring_disable_intr(void *arg) +{ + vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; + vnet_res_t *vresp; + + if (rx_ringp->hw_rh == NULL) { + /* + * Ring disable intr func is being invoked, but the ring is + * not bound to any underlying resource ? This must be a ring + * reserved for Hybrid resource and no such resource has been + * assigned to this vnet device yet. We simply return success. + */ + ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0); + return (0); + } + + /* + * The rx ring has been bound to either a LDC or a Hybrid resource. + * Call the appropriate function to disable interrupts for the ring. + */ + if (rx_ringp->state & VNET_RXRING_HYBRID) { + return (mac_hwring_disable_intr(rx_ringp->hw_rh)); + } else { + vresp = (vnet_res_t *)rx_ringp->hw_rh; + return (vgen_disable_intr(vresp->macreg.m_driver)); + } +} + +/* + * Poll 'bytes_to_pickup' bytes of message from the rx ring. + */ +static mblk_t * +vnet_rx_poll(void *arg, int bytes_to_pickup) +{ + vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg; + mblk_t *mp = NULL; + vnet_res_t *vresp; + vnet_t *vnetp = rx_ringp->vnetp; + + if (rx_ringp->hw_rh == NULL) { + return (NULL); + } + + if (rx_ringp->state & VNET_RXRING_HYBRID) { + mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup); + /* + * Packets received over a hybrid resource need additional + * processing to remove the tag, for the pvid case. The + * underlying resource is not aware of the vnet's pvid and thus + * packets are received with the vlan tag in the header; unlike + * packets that are received over a ldc channel in which case + * the peer vnet/vsw would have already removed the tag. 
+ */ + if (vnetp->pvid != vnetp->default_vlan_id) { + vnet_rx_frames_untag(vnetp->pvid, &mp); + } + } else { + vresp = (vnet_res_t *)rx_ringp->hw_rh; + mp = vgen_poll(vresp->macreg.m_driver, bytes_to_pickup); + } + return (mp); +} + +/* ARGSUSED */ +void +vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + vnet_t *vnetp = (vnet_t *)arg; + vnet_pseudo_rx_ring_t *ringp = (vnet_pseudo_rx_ring_t *)mrh; + + /* + * Packets received over a hybrid resource need additional processing + * to remove the tag, for the pvid case. The underlying resource is + * not aware of the vnet's pvid and thus packets are received with the + * vlan tag in the header; unlike packets that are received over a ldc + * channel in which case the peer vnet/vsw would have already removed + * the tag. + */ + if (vnetp->pvid != vnetp->default_vlan_id) { + vnet_rx_frames_untag(vnetp->pvid, &mp); + if (mp == NULL) { + return; + } + } + mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num); +} + +static int +vnet_addmac(void *arg, const uint8_t *mac_addr) +{ + vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg; + vnet_t *vnetp; + + vnetp = rx_grp->vnetp; + + if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) { + return (0); + } + + cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n", + vnetp->instance, __func__); + return (EINVAL); +} + +static int +vnet_remmac(void *arg, const uint8_t *mac_addr) +{ + vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg; + vnet_t *vnetp; + + vnetp = rx_grp->vnetp; + + if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) { + return (0); + } + + cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n", + vnetp->instance, __func__, ether_sprintf((void *)mac_addr)); + return (EINVAL); +} + +int +vnet_hio_mac_init(vnet_t *vnetp, char *ifname) +{ + mac_handle_t mh; + mac_client_handle_t mch = NULL; + mac_unicast_handle_t muh = NULL; + mac_diag_t diag; + mac_register_t *macp; + char client_name[MAXNAMELEN]; + int rv; + uint16_t mac_flags = MAC_UNICAST_TAG_DISABLE | + MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY; + vio_net_callbacks_t vcb; + ether_addr_t rem_addr = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + uint32_t retries = 0; + + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + return (EAGAIN); + } + + do { + rv = mac_open_by_linkname(ifname, &mh); + if (rv == 0) { + break; + } + if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) { + mac_free(macp); + return (rv); + } + drv_usecwait(vnet_mac_open_delay); + } while (rv == ENOENT); + + vnetp->hio_mh = mh; + + (void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance, + ifname); + rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE); + if (rv != 0) { + goto fail; + } + vnetp->hio_mch = mch; + + rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0, + &diag); + if (rv != 0) { + goto fail; + } + vnetp->hio_muh = muh; + + macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + macp->m_driver = vnetp; + macp->m_dip = NULL; + macp->m_src_addr = NULL; + macp->m_callbacks = &vnet_hio_res_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = ETHERMTU; + + rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID, + vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb); + if (rv != 0) { + goto fail; + } + mac_free(macp); + + /* add the recv callback */ + mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp); + + /* add the notify callback - only tx updates for now */ + vnetp->hio_mnh = mac_notify_add(vnetp->hio_mh, vnet_hio_notify_cb, + vnetp); + 
+ return (0); + +fail: + mac_free(macp); + vnet_hio_mac_cleanup(vnetp); + return (1); +} + +void +vnet_hio_mac_cleanup(vnet_t *vnetp) +{ + if (vnetp->hio_mnh != NULL) { + (void) mac_notify_remove(vnetp->hio_mnh, B_TRUE); + vnetp->hio_mnh = NULL; + } + + if (vnetp->hio_vhp != NULL) { + vio_net_resource_unreg(vnetp->hio_vhp); + vnetp->hio_vhp = NULL; + } + + if (vnetp->hio_muh != NULL) { + mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh); + vnetp->hio_muh = NULL; + } + + if (vnetp->hio_mch != NULL) { + mac_client_close(vnetp->hio_mch, 0); + vnetp->hio_mch = NULL; + } + + if (vnetp->hio_mh != NULL) { + mac_close(vnetp->hio_mh); + vnetp->hio_mh = NULL; + } +} + +/* Bind pseudo rings to hwrings */ +static int +vnet_bind_hwrings(vnet_t *vnetp) +{ + mac_ring_handle_t hw_rh[VNET_NUM_HYBRID_RINGS]; + mac_perim_handle_t mph1; + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + vnet_pseudo_tx_group_t *tx_grp; + vnet_pseudo_tx_ring_t *tx_ringp; + int hw_ring_cnt; + int i; + int rv; + + mac_perim_enter_by_mh(vnetp->hio_mh, &mph1); + + /* Get the list of the underlying RX rings. */ + hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh, + MAC_RING_TYPE_RX); + + /* We expect the the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */ + if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) { + cmn_err(CE_WARN, + "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n", + vnetp->instance, hw_ring_cnt); + goto fail; + } + + if (vnetp->rx_hwgh != NULL) { + /* + * Quiesce the HW ring and the mac srs on the ring. Note + * that the HW ring will be restarted when the pseudo ring + * is started. At that time all the packets will be + * directly passed up to the pseudo RX ring and handled + * by mac srs created over the pseudo RX ring. + */ + mac_rx_client_quiesce(vnetp->hio_mch); + mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE); + } + + /* + * Bind the pseudo rings to the hwrings and start the hwrings. + * Note we don't need to register these with the upper mac, as we have + * statically exported these pseudo rxrings which are reserved for + * rxrings of Hybrid resource. + */ + rx_grp = &vnetp->rx_grp[0]; + for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { + /* Pick the rxrings reserved for Hybrid resource */ + rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX]; + + /* Store the hw ring handle */ + rx_ringp->hw_rh = hw_rh[i]; + + /* Bind the pseudo ring to the underlying hwring */ + mac_hwring_setup(rx_ringp->hw_rh, + (mac_resource_handle_t)rx_ringp); + + /* Start the hwring if needed */ + if (rx_ringp->state & VNET_RXRING_STARTED) { + rv = mac_hwring_start(rx_ringp->hw_rh); + if (rv != 0) { + mac_hwring_teardown(rx_ringp->hw_rh); + rx_ringp->hw_rh = NULL; + goto fail; + } + } + } + + /* Get the list of the underlying TX rings. */ + hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh, + MAC_RING_TYPE_TX); + + /* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */ + if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) { + cmn_err(CE_WARN, + "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n", + vnetp->instance, hw_ring_cnt); + goto fail; + } + + /* + * Now map the pseudo txrings to the hw txrings. Note we don't need + * to register these with the upper mac, as we have statically exported + * these rings. Note that these rings will continue to be used for LDC + * resources to peer vnets and vswitch (shared ring). 
+ */ + tx_grp = &vnetp->tx_grp[0]; + for (i = 0; i < tx_grp->ring_cnt; i++) { + tx_ringp = &tx_grp->rings[i]; + tx_ringp->hw_rh = hw_rh[i]; + tx_ringp->state |= VNET_TXRING_HYBRID; + } + + mac_perim_exit(mph1); + return (0); + +fail: + mac_perim_exit(mph1); + vnet_unbind_hwrings(vnetp); + return (1); +} + +/* Unbind pseudo rings from hwrings */ +static void +vnet_unbind_hwrings(vnet_t *vnetp) +{ + mac_perim_handle_t mph1; + vnet_pseudo_rx_ring_t *rx_ringp; + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_tx_group_t *tx_grp; + vnet_pseudo_tx_ring_t *tx_ringp; + int i; + + mac_perim_enter_by_mh(vnetp->hio_mh, &mph1); + + tx_grp = &vnetp->tx_grp[0]; + for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { + tx_ringp = &tx_grp->rings[i]; + if (tx_ringp->state & VNET_TXRING_HYBRID) { + tx_ringp->state &= ~VNET_TXRING_HYBRID; + tx_ringp->hw_rh = NULL; + } + } + + rx_grp = &vnetp->rx_grp[0]; + for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) { + rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX]; + if (rx_ringp->hw_rh != NULL) { + /* Stop the hwring */ + mac_hwring_stop(rx_ringp->hw_rh); + + /* Teardown the hwring */ + mac_hwring_teardown(rx_ringp->hw_rh); + rx_ringp->hw_rh = NULL; + } + } + + if (vnetp->rx_hwgh != NULL) { + vnetp->rx_hwgh = NULL; + /* + * First clear the permanent-quiesced flag of the RX srs then + * restart the HW ring and the mac srs on the ring. + */ + mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE); + mac_rx_client_restart(vnetp->hio_mch); + } + + mac_perim_exit(mph1); +} + +/* Bind pseudo ring to a LDC resource */ +static int +vnet_bind_vgenring(vnet_res_t *vresp) +{ + vnet_t *vnetp; + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + mac_perim_handle_t mph1; + int rv; + int type; + + vnetp = vresp->vnetp; + type = vresp->type; + rx_grp = &vnetp->rx_grp[0]; + + if (type == VIO_NET_RES_LDC_SERVICE) { + /* + * Ring Index 0 is the default ring in the group and is + * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring + * is allocated statically and is reported to the mac layer + * in vnet_m_capab(). So, all we need to do here, is save a + * reference to the associated vresp. + */ + rx_ringp = &rx_grp->rings[0]; + rx_ringp->hw_rh = (mac_ring_handle_t)vresp; + vresp->rx_ringp = (void *)rx_ringp; + return (0); + } + ASSERT(type == VIO_NET_RES_LDC_GUEST); + + mac_perim_enter_by_mh(vnetp->mh, &mph1); + + rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp); + if (rx_ringp == NULL) { + cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring", + vnetp->instance); + goto fail; + } + + /* Store the LDC resource itself as the ring handle */ + rx_ringp->hw_rh = (mac_ring_handle_t)vresp; + + /* + * Save a reference to the ring in the resource for lookup during + * unbind. Note this is only done for LDC resources. We don't need this + * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its + * rx rings are mapped to reserved pseudo rx rings (index 1 and 2). 
+ */ + vresp->rx_ringp = (void *)rx_ringp; + rx_ringp->state |= VNET_RXRING_LDC_GUEST; + + /* Register the pseudo ring with upper-mac */ + rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index); + if (rv != 0) { + rx_ringp->state &= ~VNET_RXRING_LDC_GUEST; + rx_ringp->hw_rh = NULL; + vnet_free_pseudo_rx_ring(vnetp, rx_ringp); + goto fail; + } + + mac_perim_exit(mph1); + return (0); +fail: + mac_perim_exit(mph1); + return (1); +} + +/* Unbind pseudo ring from a LDC resource */ +static void +vnet_unbind_vgenring(vnet_res_t *vresp) +{ + vnet_t *vnetp; + vnet_pseudo_rx_group_t *rx_grp; + vnet_pseudo_rx_ring_t *rx_ringp; + mac_perim_handle_t mph1; + int type; + + vnetp = vresp->vnetp; + type = vresp->type; + rx_grp = &vnetp->rx_grp[0]; + + if (vresp->rx_ringp == NULL) { + return; + } + + if (type == VIO_NET_RES_LDC_SERVICE) { + /* + * Ring Index 0 is the default ring in the group and is + * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring + * is allocated statically and is reported to the mac layer + * in vnet_m_capab(). So, all we need to do here, is remove its + * reference to the associated vresp. + */ + rx_ringp = &rx_grp->rings[0]; + rx_ringp->hw_rh = NULL; + vresp->rx_ringp = NULL; + return; + } + ASSERT(type == VIO_NET_RES_LDC_GUEST); + + mac_perim_enter_by_mh(vnetp->mh, &mph1); + + rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp; + vresp->rx_ringp = NULL; + + if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) { + /* Unregister the pseudo ring with upper-mac */ + mac_group_rem_ring(rx_grp->handle, rx_ringp->handle); + + rx_ringp->hw_rh = NULL; + rx_ringp->state &= ~VNET_RXRING_LDC_GUEST; + + /* Free the pseudo rx ring */ + vnet_free_pseudo_rx_ring(vnetp, rx_ringp); + } + + mac_perim_exit(mph1); +} + +static void +vnet_unbind_rings(vnet_res_t *vresp) +{ + switch (vresp->type) { + + case VIO_NET_RES_LDC_SERVICE: + case VIO_NET_RES_LDC_GUEST: + vnet_unbind_vgenring(vresp); + break; + + case VIO_NET_RES_HYBRID: + vnet_unbind_hwrings(vresp->vnetp); + break; + + default: + break; + + } +} + +static int +vnet_bind_rings(vnet_res_t *vresp) +{ + int rv; + + switch (vresp->type) { + + case VIO_NET_RES_LDC_SERVICE: + case VIO_NET_RES_LDC_GUEST: + rv = vnet_bind_vgenring(vresp); + break; + + case VIO_NET_RES_HYBRID: + rv = vnet_bind_hwrings(vresp->vnetp); + break; + + default: + rv = 1; + break; + + } + + return (rv); +} + +/* ARGSUSED */ +int +vnet_hio_stat(void *arg, uint_t stat, uint64_t *val) +{ + vnet_t *vnetp = (vnet_t *)arg; + + *val = mac_stat_get(vnetp->hio_mh, stat); + return (0); +} + +/* + * The start() and stop() routines for the Hybrid resource below, are just + * dummy functions. This is provided to avoid resource type specific code in + * vnet_start_resources() and vnet_stop_resources(). The starting and stopping + * of the Hybrid resource happens in the context of the mac_client interfaces + * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup(). 
+ */ +/* ARGSUSED */ +static int +vnet_hio_start(void *arg) +{ + return (0); +} + +/* ARGSUSED */ +static void +vnet_hio_stop(void *arg) +{ +} + +mblk_t * +vnet_hio_tx(void *arg, mblk_t *mp) +{ + vnet_pseudo_tx_ring_t *tx_ringp; + mblk_t *nextp; + mblk_t *ret_mp; + + tx_ringp = (vnet_pseudo_tx_ring_t *)arg; + for (;;) { + nextp = mp->b_next; + mp->b_next = NULL; + + ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp); + if (ret_mp != NULL) { + ret_mp->b_next = nextp; + mp = ret_mp; + break; + } + + if ((mp = nextp) == NULL) + break; + } + return (mp); +} + +static void +vnet_hio_notify_cb(void *arg, mac_notify_type_t type) +{ + vnet_t *vnetp = (vnet_t *)arg; + mac_perim_handle_t mph; + + mac_perim_enter_by_mh(vnetp->hio_mh, &mph); + switch (type) { + case MAC_NOTE_TX: + vnet_tx_update(vnetp->hio_vhp); + break; + + default: + break; + } + mac_perim_exit(mph); +} + #ifdef VNET_IOC_DEBUG /* diff --git a/usr/src/uts/sun4v/io/vnet_dds.c b/usr/src/uts/sun4v/io/vnet_dds.c index c3548db771..b6b6cbea13 100644 --- a/usr/src/uts/sun4v/io/vnet_dds.c +++ b/usr/src/uts/sun4v/io/vnet_dds.c @@ -113,6 +113,8 @@ static void vdds_release_range_prop(dev_info_t *nexus_dip, uint64_t cookie); /* Functions imported from other files */ extern int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg); +extern int vnet_hio_mac_init(vnet_t *vnetp, char *ifname); +extern void vnet_hio_mac_cleanup(vnet_t *vnetp); /* HV functions that are used in this file */ extern uint64_t vdds_hv_niu_vr_getinfo(uint32_t hvcookie, @@ -412,7 +414,31 @@ vdds_process_dds_msg_task(void *arg) } else { vdds->hio_dip = dip; vdds->hio_cookie = hio_cookie; - (void) vdds_send_dds_resp_msg(vnetp, dmsg, B_TRUE); + sprintf(vdds->hio_ifname, "%s%d", ddi_driver_name(dip), + ddi_get_instance(dip)); + + rv = vnet_hio_mac_init(vnetp, vdds->hio_ifname); + if (rv != 0) { + /* failed - cleanup, send failed DDS message */ + DERR(vdds, "HIO mac init failed, cleaning up"); + rv = vdds_destroy_niu_node(dip, hio_cookie); + if (rv == 0) { + /* use DERR to print by default */ + DERR(vdds, "Successfully destroyed" + " Hybrid node"); + } else { + cmn_err(CE_WARN, "vnet%d:Failed to " + "destroy Hybrid node", + vnetp->instance); + } + vdds->hio_dip = NULL; + vdds->hio_cookie = 0; + (void) vdds_send_dds_resp_msg(vnetp, + dmsg, B_FALSE); + } else { + (void) vdds_send_dds_resp_msg(vnetp, + dmsg, B_TRUE); + } /* DERR used only print by default */ DERR(vdds, "Successfully created HIO node"); } @@ -424,6 +450,7 @@ vdds_process_dds_msg_task(void *arg) DBG2(vdds, "NACK: No HIO device destroy"); (void) vdds_send_dds_resp_msg(vnetp, dmsg, B_FALSE); } else { + vnet_hio_mac_cleanup(vnetp); rv = vdds_destroy_niu_node(vnetp->vdds_info.hio_dip, vdds->hio_cookie); if (rv == 0) { @@ -444,6 +471,7 @@ vdds_process_dds_msg_task(void *arg) case VNET_DDS_TASK_REL_SHARE: DBG2(vdds, "REL_SHARE task..."); if (vnetp->vdds_info.hio_dip != NULL) { + vnet_hio_mac_cleanup(vnetp); rv = vdds_destroy_niu_node(vnetp->vdds_info.hio_dip, vdds->hio_cookie); if (rv == 0) { diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c index bbf5e32cd3..f83c3a13d0 100644 --- a/usr/src/uts/sun4v/io/vnet_gen.c +++ b/usr/src/uts/sun4v/io/vnet_gen.c @@ -73,11 +73,15 @@ /* vgen proxy entry points */ int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip, const uint8_t *macaddr, void **vgenhdl); +int vgen_init_mdeg(void *arg); void vgen_uninit(void *arg); int vgen_dds_tx(void *arg, void *dmsg); void vgen_mod_init(void); int vgen_mod_cleanup(void); void vgen_mod_fini(void); +int vgen_enable_intr(void 
*arg); +int vgen_disable_intr(void *arg); +mblk_t *vgen_poll(void *arg, int bytes_to_pickup); static int vgen_start(void *arg); static void vgen_stop(void *arg); static mblk_t *vgen_tx(void *arg, mblk_t *mp); @@ -151,6 +155,7 @@ static int vgen_num_txpending(vgen_ldc_t *ldcp); static int vgen_tx_dring_full(vgen_ldc_t *ldcp); static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp); static void vgen_ldc_watchdog(void *arg); +static mblk_t *vgen_ldc_poll(vgen_ldc_t *ldcp, int bytes_to_pickup); /* vgen handshake functions */ static vgen_ldc_t *vh_nextphase(vgen_ldc_t *ldcp); @@ -200,7 +205,7 @@ static void vgen_stop_rcv_thread(vgen_ldc_t *ldcp); static void vgen_drain_rcv_thread(vgen_ldc_t *ldcp); static void vgen_ldc_rcv_worker(void *arg); static void vgen_handle_evt_read(vgen_ldc_t *ldcp); -static void vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp); +static void vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp, mblk_t *bpt); static void vgen_set_vnet_proto_ops(vgen_ldc_t *ldcp); static void vgen_reset_vnet_proto_ops(vgen_ldc_t *ldcp); static void vgen_link_update(vgen_t *vgenp, link_state_t link_state); @@ -536,13 +541,6 @@ vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip, if (rv != 0) { goto vgen_init_fail; } - - /* register with MD event generator */ - rv = vgen_mdeg_reg(vgenp); - if (rv != DDI_SUCCESS) { - goto vgen_init_fail; - } - *vgenhdl = (void *)vgenp; DBG1(NULL, NULL, "vnet(%d): exit\n", instance); @@ -562,6 +560,15 @@ vgen_init_fail: return (DDI_FAILURE); } +int +vgen_init_mdeg(void *arg) +{ + vgen_t *vgenp = (vgen_t *)arg; + + /* register with MD event generator */ + return (vgen_mdeg_reg(vgenp)); +} + /* * Called by vnet to undo the initializations done by vgen_init(). * The handle provided by generic transport during vgen_init() is the argument. @@ -2094,13 +2101,21 @@ mdeg_reg_fail: static void vgen_mdeg_unreg(vgen_t *vgenp) { - (void) mdeg_unregister(vgenp->mdeg_dev_hdl); - (void) mdeg_unregister(vgenp->mdeg_port_hdl); - kmem_free(vgenp->mdeg_parentp->specp, sizeof (vgen_prop_template)); - KMEM_FREE(vgenp->mdeg_parentp); - vgenp->mdeg_parentp = NULL; - vgenp->mdeg_dev_hdl = NULL; - vgenp->mdeg_port_hdl = NULL; + if (vgenp->mdeg_dev_hdl != NULL) { + (void) mdeg_unregister(vgenp->mdeg_dev_hdl); + vgenp->mdeg_dev_hdl = NULL; + } + if (vgenp->mdeg_port_hdl != NULL) { + (void) mdeg_unregister(vgenp->mdeg_port_hdl); + vgenp->mdeg_port_hdl = NULL; + } + + if (vgenp->mdeg_parentp != NULL) { + kmem_free(vgenp->mdeg_parentp->specp, + sizeof (vgen_prop_template)); + KMEM_FREE(vgenp->mdeg_parentp); + vgenp->mdeg_parentp = NULL; + } } /* mdeg callback function for the port node */ @@ -2907,6 +2922,7 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id) mutex_init(&ldcp->tclock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->wrlock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->rxlock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->pollq_lock, NULL, MUTEX_DRIVER, NULL); attach_state |= AST_mutex_init; @@ -3032,6 +3048,7 @@ ldc_attach_failed: mutex_destroy(&ldcp->cblock); mutex_destroy(&ldcp->wrlock); mutex_destroy(&ldcp->rxlock); + mutex_destroy(&ldcp->pollq_lock); } if (attach_state & AST_ldc_alloc) { KMEM_FREE(ldcp); @@ -3100,6 +3117,7 @@ vgen_ldc_detach(vgen_ldc_t *ldcp) mutex_destroy(&ldcp->cblock); mutex_destroy(&ldcp->wrlock); mutex_destroy(&ldcp->rxlock); + mutex_destroy(&ldcp->pollq_lock); /* unlink it from the list */ *prev_ldcp = ldcp->nextp; @@ -6278,7 +6296,7 @@ vgen_recv_retry: */ if (bp != NULL) { DTRACE_PROBE1(vgen_rcv_msgs, int, count); - vgen_rx(ldcp, bp); + vgen_rx(ldcp, bp, bpt); count = 
0; bp = bpt = NULL; } @@ -6459,7 +6477,7 @@ vgen_recv_retry: if (count++ > vgen_chain_len) { DTRACE_PROBE1(vgen_rcv_msgs, int, count); - vgen_rx(ldcp, bp); + vgen_rx(ldcp, bp, bpt); count = 0; bp = bpt = NULL; } @@ -6512,7 +6530,7 @@ error_ret: /* send up packets received so far */ if (bp != NULL) { DTRACE_PROBE1(vgen_rcv_msgs, int, count); - vgen_rx(ldcp, bp); + vgen_rx(ldcp, bp, bpt); bp = bpt = NULL; } DBG1(vgenp, ldcp, "exit rv(%d)\n", rv); @@ -6996,18 +7014,57 @@ vgen_print_ldcinfo(vgen_ldc_t *ldcp) * Send received packets up the stack. */ static void -vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp) +vgen_rx(vgen_ldc_t *ldcp, mblk_t *bp, mblk_t *bpt) { vio_net_rx_cb_t vrx_cb = ldcp->portp->vcb.vio_net_rx_cb; + vgen_t *vgenp = LDC_TO_VGEN(ldcp); if (ldcp->rcv_thread != NULL) { ASSERT(MUTEX_HELD(&ldcp->rxlock)); - mutex_exit(&ldcp->rxlock); } else { ASSERT(MUTEX_HELD(&ldcp->cblock)); + } + + mutex_enter(&ldcp->pollq_lock); + + if (ldcp->polling_on == B_TRUE) { + /* + * If we are in polling mode, simply queue + * the packets onto the poll queue and return. + */ + if (ldcp->pollq_headp == NULL) { + ldcp->pollq_headp = bp; + ldcp->pollq_tailp = bpt; + } else { + ldcp->pollq_tailp->b_next = bp; + ldcp->pollq_tailp = bpt; + } + + mutex_exit(&ldcp->pollq_lock); + return; + } + + /* + * Prepend any pending mblks in the poll queue, now that we + * are in interrupt mode, before sending up the chain of pkts. + */ + if (ldcp->pollq_headp != NULL) { + DBG2(vgenp, ldcp, "vgen_rx(%lx), pending pollq_headp\n", + (uintptr_t)ldcp); + ldcp->pollq_tailp->b_next = bp; + bp = ldcp->pollq_headp; + ldcp->pollq_headp = ldcp->pollq_tailp = NULL; + } + + mutex_exit(&ldcp->pollq_lock); + + if (ldcp->rcv_thread != NULL) { + mutex_exit(&ldcp->rxlock); + } else { mutex_exit(&ldcp->cblock); } + /* Send up the packets */ vrx_cb(ldcp->portp->vhp, bp); if (ldcp->rcv_thread != NULL) { @@ -7233,6 +7290,145 @@ vgen_ldc_reset(vgen_ldc_t *ldcp) vgen_handshake_retry(ldcp); } +int +vgen_enable_intr(void *arg) +{ + vgen_port_t *portp = (vgen_port_t *)arg; + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp; + + ldclp = &portp->ldclist; + READ_ENTER(&ldclp->rwlock); + /* + * NOTE: for now, we will assume we have a single channel. + */ + if (ldclp->headp == NULL) { + RW_EXIT(&ldclp->rwlock); + return (1); + } + ldcp = ldclp->headp; + + mutex_enter(&ldcp->pollq_lock); + ldcp->polling_on = B_FALSE; + mutex_exit(&ldcp->pollq_lock); + + RW_EXIT(&ldclp->rwlock); + + return (0); +} + +int +vgen_disable_intr(void *arg) +{ + vgen_port_t *portp = (vgen_port_t *)arg; + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp; + + ldclp = &portp->ldclist; + READ_ENTER(&ldclp->rwlock); + /* + * NOTE: for now, we will assume we have a single channel. + */ + if (ldclp->headp == NULL) { + RW_EXIT(&ldclp->rwlock); + return (1); + } + ldcp = ldclp->headp; + + + mutex_enter(&ldcp->pollq_lock); + ldcp->polling_on = B_TRUE; + mutex_exit(&ldcp->pollq_lock); + + RW_EXIT(&ldclp->rwlock); + + return (0); +} + +mblk_t * +vgen_poll(void *arg, int bytes_to_pickup) +{ + vgen_port_t *portp = (vgen_port_t *)arg; + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp; + mblk_t *mp = NULL; + + ldclp = &portp->ldclist; + READ_ENTER(&ldclp->rwlock); + /* + * NOTE: for now, we will assume we have a single channel. 
+ */ + if (ldclp->headp == NULL) { + RW_EXIT(&ldclp->rwlock); + return (NULL); + } + ldcp = ldclp->headp; + + mp = vgen_ldc_poll(ldcp, bytes_to_pickup); + + RW_EXIT(&ldclp->rwlock); + return (mp); +} + +static mblk_t * +vgen_ldc_poll(vgen_ldc_t *ldcp, int bytes_to_pickup) +{ + mblk_t *bp = NULL; + mblk_t *bpt = NULL; + mblk_t *mp = NULL; + size_t mblk_sz = 0; + size_t sz = 0; + uint_t count = 0; + + mutex_enter(&ldcp->pollq_lock); + + bp = ldcp->pollq_headp; + while (bp != NULL) { + /* get the size of this packet */ + mblk_sz = msgdsize(bp); + + /* if adding this pkt, exceeds the size limit, we are done. */ + if (sz + mblk_sz > bytes_to_pickup) { + break; + } + + /* we have room for this packet */ + sz += mblk_sz; + + /* increment the # of packets being sent up */ + count++; + + /* track the last processed pkt */ + bpt = bp; + + /* get the next pkt */ + bp = bp->b_next; + } + + if (count != 0) { + /* + * picked up some packets; save the head of pkts to be sent up. + */ + mp = ldcp->pollq_headp; + + /* move the pollq_headp to skip over the pkts being sent up */ + ldcp->pollq_headp = bp; + + /* picked up all pending pkts in the queue; reset tail also */ + if (ldcp->pollq_headp == NULL) { + ldcp->pollq_tailp = NULL; + } + + /* terminate the tail of pkts to be sent up */ + bpt->b_next = NULL; + } + + mutex_exit(&ldcp->pollq_lock); + + DTRACE_PROBE1(vgen_poll_pkts, uint_t, count); + return (mp); +} + #if DEBUG /* diff --git a/usr/src/uts/sun4v/sys/vnet.h b/usr/src/uts/sun4v/sys/vnet.h index 1e2f88aeb8..21fb92852b 100644 --- a/usr/src/uts/sun4v/sys/vnet.h +++ b/usr/src/uts/sun4v/sys/vnet.h @@ -34,6 +34,8 @@ extern "C" { #include <sys/vnet_res.h> #include <sys/vnet_mailbox.h> #include <sys/modhash.h> +#include <net/if.h> +#include <sys/mac_client.h> #define VNET_SUCCESS (0) /* successful return */ #define VNET_FAILURE (-1) /* unsuccessful return */ @@ -117,6 +119,7 @@ typedef struct vnet_res { uint32_t refcnt; /* reference count */ struct vnet *vnetp; /* back pointer to vnet */ kstat_t *ksp; /* hio kstats */ + void *rx_ringp; /* assoc pseudo rx ring */ } vnet_res_t; #define VNET_DDS_TASK_ADD_SHARE 0x01 @@ -131,6 +134,7 @@ typedef struct vnet_dds_info { vio_dds_msg_t dmsg; /* Pending DDS message */ dev_info_t *hio_dip; /* Hybrid device's dip */ uint64_t hio_cookie; /* Hybrid device's cookie */ + char hio_ifname[LIFNAMSIZ]; /* Hybrid interface name */ ddi_taskq_t *dds_taskqp; /* Taskq's used for DDS */ struct vnet *vnetp; /* Back pointer to vnetp */ } vnet_dds_info_t; @@ -155,12 +159,103 @@ typedef struct vnet_dds_info { typedef enum { AST_init = 0x0, AST_vnet_alloc = 0x1, - AST_mac_alloc = 0x2, AST_read_macaddr = 0x4, - AST_vgen_init = 0x8, AST_fdbh_alloc = 0x10, - AST_vdds_init = 0x20, AST_taskq_create = 0x40, - AST_vnet_list = 0x80, AST_macreg = 0x100 + AST_ring_init = 0x2, AST_vdds_init = 0x4, + AST_read_macaddr = 0x8, AST_fdbh_alloc = 0x10, + AST_taskq_create = 0x20, AST_vnet_list = 0x40, + AST_vgen_init = 0x80, AST_macreg = 0x100, + AST_init_mdeg = 0x200 } vnet_attach_progress_t; +#define VNET_NUM_PSEUDO_GROUPS 1 /* # of pseudo ring grps */ +#define VNET_NUM_HYBRID_RINGS 2 /* # of Hybrid tx/rx rings */ +#define VNET_HYBRID_RXRING_INDEX 1 /* Hybrid rx ring start index */ + +/* + * # of Pseudo TX Rings is defined based on the possible + * # of TX Hardware Rings from a Hybrid resource. + */ +#define VNET_NUM_PSEUDO_TXRINGS VNET_NUM_HYBRID_RINGS + +/* + * # of Pseudo RX Rings that are reserved and exposed by default. + * 1 for LDC resource to vsw + 2 for RX rings of Hybrid resource. 
+ */ +#define VNET_NUM_PSEUDO_RXRINGS_DEFAULT (VNET_NUM_HYBRID_RINGS + 1) + +/* Pseudo RX Ring States */ +typedef enum { + VNET_RXRING_FREE = 0x0, /* Free */ + VNET_RXRING_INUSE = 0x1, /* In use */ + VNET_RXRING_LDC_SERVICE = 0x2, /* Mapped to vswitch */ + VNET_RXRING_LDC_GUEST = 0x4, /* Mapped to a peer vnet */ + VNET_RXRING_HYBRID = 0x8, /* Mapped to Hybrid resource */ + VNET_RXRING_STARTED = 0x10 /* Started */ +} vnet_rxring_state_t; + +/* Pseudo TX Ring States */ +typedef enum { + VNET_TXRING_FREE = 0x0, /* Free */ + VNET_TXRING_INUSE = 0x1, /* In use */ + VNET_TXRING_SHARED = 0x2, /* Shared among LDCs */ + VNET_TXRING_HYBRID = 0x4, /* Shared among LDCs, Hybrid resource */ + VNET_TXRING_STARTED = 0x8 /* Started */ +} vnet_txring_state_t; + +/* + * Psuedo TX Ring + */ +typedef struct vnet_pseudo_tx_ring { + uint_t index; /* ring index */ + vnet_txring_state_t state; /* ring state */ + void *grp; /* grp associated */ + void *vnetp; /* vnet associated */ + mac_ring_handle_t handle; /* ring handle in mac layer */ + mac_ring_handle_t hw_rh; /* Resource type dependent, internal */ + /* ring handle. Hybrid res: ring hdl */ + /* of hardware rx ring; LDC res: hdl */ + /* to the res itself (vnet_res_t) */ +} vnet_pseudo_tx_ring_t; + +/* + * Psuedo RX Ring + */ +typedef struct vnet_pseudo_rx_ring { + uint_t index; /* ring index */ + vnet_rxring_state_t state; /* ring state */ + void *grp; /* grp associated */ + void *vnetp; /* vnet associated */ + mac_ring_handle_t handle; /* ring handle in mac layer */ + mac_ring_handle_t hw_rh; /* Resource type dependent, internal */ + /* ring handle. Hybrid res: ring hdl */ + /* of hardware tx ring; otherwise */ + /* NULL */ + uint64_t gen_num; /* Mac layer gen_num */ +} vnet_pseudo_rx_ring_t; + +/* + * Psuedo TX Ring Group + */ +typedef struct vnet_pseudo_tx_group { + uint_t index; /* group index */ + void *vnetp; /* vnet associated */ + mac_group_handle_t handle; /* grp handle in mac layer */ + uint_t ring_cnt; /* total # of rings in grp */ + vnet_pseudo_tx_ring_t *rings; /* array of rings */ +} vnet_pseudo_tx_group_t; + +/* + * Psuedo RX Ring Group + */ +typedef struct vnet_pseudo_rx_group { + krwlock_t lock; /* sync rings access in grp */ + int index; /* group index */ + void *vnetp; /* vnet this grp belongs to */ + mac_group_handle_t handle; /* grp handle in mac layer */ + uint_t max_ring_cnt; /* total # of rings in grp */ + uint_t ring_cnt; /* # of rings in use */ + vnet_pseudo_rx_ring_t *rings; /* array of rings */ +} vnet_pseudo_rx_group_t; + /* * vnet instance state information */ @@ -194,6 +289,18 @@ typedef struct vnet { vnet_dds_info_t vdds_info; /* DDS related info */ krwlock_t vrwlock; /* Resource list lock */ ddi_taskq_t *taskqp; /* Resource taskq */ + + /* pseudo ring groups */ + vnet_pseudo_rx_group_t rx_grp[VNET_NUM_PSEUDO_GROUPS]; + vnet_pseudo_tx_group_t tx_grp[VNET_NUM_PSEUDO_GROUPS]; + + vio_net_handle_t hio_vhp; /* HIO resource hdl */ + mac_handle_t hio_mh; /* HIO mac hdl */ + mac_client_handle_t hio_mch; /* HIO mac client hdl */ + mac_unicast_handle_t hio_muh; /* HIO mac unicst hdl */ + mac_notify_handle_t hio_mnh; /* HIO notify cb hdl */ + mac_group_handle_t rx_hwgh; /* HIO rx ring-group hdl */ + mac_group_handle_t tx_hwgh; /* HIO tx ring-group hdl */ } vnet_t; #ifdef DEBUG diff --git a/usr/src/uts/sun4v/sys/vnet_gen.h b/usr/src/uts/sun4v/sys/vnet_gen.h index 6c04c3cfe0..577667762b 100644 --- a/usr/src/uts/sun4v/sys/vnet_gen.h +++ b/usr/src/uts/sun4v/sys/vnet_gen.h @@ -180,6 +180,7 @@ typedef struct vgen_ldc { kmutex_t tclock; /* tx 
reclaim lock */ kmutex_t wrlock; /* sync transmits */ kmutex_t rxlock; /* sync reception */ + kmutex_t pollq_lock; /* sync polling and rxworker */ /* channel info from ldc layer */ uint64_t ldc_id; /* channel number */ @@ -248,6 +249,11 @@ typedef struct vgen_ldc { kmutex_t rcv_thr_lock; /* lock for receive thread */ kcondvar_t rcv_thr_cv; /* cond.var for recv thread */ + /* receive polling fields */ + boolean_t polling_on; /* polling enabled ? */ + mblk_t *pollq_headp; /* head of pkts in pollq */ + mblk_t *pollq_tailp; /* tail of pkts in pollq */ + /* channel statistics */ vgen_stats_t stats; /* channel statistics */ kstat_t *ksp; /* channel kstats */ |
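A minimal consumer-side sketch of the reworked mac_hwrings_get() interface, modeled on vnet_bind_hwrings() and vnet_hio_tx() in the patch above: it fetches the Hybrid resource's TX hardware rings with MAC_RING_TYPE_TX, maps them onto the pseudo tx rings, and transmits one chain. The names prefixed with my_ are hypothetical and error handling is reduced to a single failure return; only the mac_* calls and vnet structure members used here appear in the patch itself.

/*
 * Sketch only: bind the Hybrid TX hwrings to the pseudo tx rings and
 * send one chain. "my_" names are illustrative, not part of the patch.
 */
static int
my_bind_tx_hwrings(vnet_t *vnetp)
{
	mac_ring_handle_t	hw_rh[VNET_NUM_HYBRID_RINGS];
	mac_group_handle_t	hwgh;
	vnet_pseudo_tx_group_t	*tx_grp = &vnetp->tx_grp[0];
	int			hw_ring_cnt;
	int			i;

	/* TX flavor of the ring query; the RX path passes MAC_RING_TYPE_RX */
	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &hwgh, hw_rh,
	    MAC_RING_TYPE_TX);
	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS)
		return (1);

	/* map the pseudo tx rings 1:1 onto the hw tx rings */
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_grp->rings[i].hw_rh = hw_rh[i];
		tx_grp->rings[i].state |= VNET_TXRING_HYBRID;
	}
	return (0);
}

static mblk_t *
my_tx_one(vnet_pseudo_tx_ring_t *tx_ringp, mblk_t *mp)
{
	/* mac_hwring_tx() hands back the mblk when the hwring flow controls */
	return (mac_hwring_tx(tx_ringp->hw_rh, mp));
}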
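Similarly, a condensed sketch of the interrupt/poll handoff supported by the new rx ring entry points. In the kernel the mac soft ring set drives this sequence; my_poll_cycle() below is hypothetical and only illustrates the ordering: disable the ring interrupt so packets queue on the pollq (LDC) or remain on the hwring (Hybrid), drain through the poll entry point, then re-enable interrupt delivery.

/*
 * Sketch only: the order in which the mac layer exercises the
 * mi_disable/mri_poll/mi_enable entry points registered in
 * vnet_get_ring(). my_poll_cycle() is illustrative, not in the patch.
 */
static void
my_poll_cycle(vnet_pseudo_rx_ring_t *rx_ringp, int bytes)
{
	mblk_t	*mp;

	/* switch to polling: vgen queues pkts on pollq_headp/pollq_tailp */
	(void) vnet_ring_disable_intr(rx_ringp);

	/* drain up to 'bytes' worth per call until nothing is pending */
	while ((mp = vnet_rx_poll(rx_ringp, bytes)) != NULL) {
		freemsg(mp);	/* stands in for real packet consumption */
	}

	/* back to interrupt mode; pending pkts are prepended in vgen_rx() */
	(void) vnet_ring_enable_intr(rx_ringp);
}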