diff options
author | Ryan Zezeski <ryan@zinascii.com> | 2020-01-22 11:52:39 -0700 |
---|---|---|
committer | Robert Mustacchi <rm@fingolfin.org> | 2020-03-02 14:43:17 +0000 |
commit | 45948e49c407e4fc264fdd289ed632d6639e009d (patch) | |
tree | 49af0edae1f628d8fb2b3486d79a648b6f62ab9f | |
parent | b69c34dad3717624ff6b4f32b71014ee05b6a678 (diff) | |
download | illumos-joyent-45948e49c407e4fc264fdd289ed632d6639e009d.tar.gz |
11493 aggr needs support for multiple pseudo rx groups
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Paul Winder <paul@winders.demon.co.uk>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
-rw-r--r-- | usr/src/cmd/mdb/common/modules/mac/mac.c | 3 | ||||
-rw-r--r-- | usr/src/uts/common/io/aggr/aggr_grp.c | 426 | ||||
-rw-r--r-- | usr/src/uts/common/io/aggr/aggr_port.c | 135 | ||||
-rw-r--r-- | usr/src/uts/common/io/aggr/aggr_recv.c | 28 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/mac.c | 237 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/mac_client.c | 13 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/mac_datapath_setup.c | 98 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/mac_provider.c | 38 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/mac_stat.c | 22 | ||||
-rw-r--r-- | usr/src/uts/common/sys/aggr_impl.h | 56 | ||||
-rw-r--r-- | usr/src/uts/common/sys/mac_client_priv.h | 13 | ||||
-rw-r--r-- | usr/src/uts/common/sys/mac_impl.h | 13 | ||||
-rw-r--r-- | usr/src/uts/common/sys/mac_provider.h | 54 | ||||
-rw-r--r-- | usr/src/uts/sun4v/io/vnet.c | 19 |
14 files changed, 800 insertions, 355 deletions
diff --git a/usr/src/cmd/mdb/common/modules/mac/mac.c b/usr/src/cmd/mdb/common/modules/mac/mac.c index d42066a37d..1f80e11096 100644 --- a/usr/src/cmd/mdb/common/modules/mac/mac.c +++ b/usr/src/cmd/mdb/common/modules/mac/mac.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/mdb_modapi.h> @@ -967,6 +968,8 @@ mac_ring_classify2str(mac_classify_type_t classify) return ("sw"); case MAC_HW_CLASSIFIER: return ("hw"); + case MAC_PASSTHRU_CLASSIFIER: + return ("pass"); } return ("--"); } diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 9932c2cb58..48cdc241d6 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ /* @@ -32,39 +32,69 @@ * module. The hash key is the linkid associated with the link * aggregation group. * - * A set of MAC ports are associated with each association group. + * Each aggregation contains a set of ports. The port is represented + * by the aggr_port_t structure. A port consists of a single MAC + * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying + * MAC. This client is used by the aggr to send and receive LACP + * traffic. Each port client takes on the same MAC unicast address -- + * the address of the aggregation itself (taken from the first port by + * default). * - * Aggr pseudo TX rings - * -------------------- - * The underlying ports (NICs) in an aggregation can have TX rings. To - * enhance aggr's performance, these TX rings are made available to the - * aggr layer as pseudo TX rings. The concept of pseudo rings are not new. - * They are already present and implemented on the RX side. It is called - * as pseudo RX rings. 
The same concept is extended to the TX side where - * each TX ring of an underlying port is reflected in aggr as a pseudo - * TX ring. Thus each pseudo TX ring will map to a specific hardware TX - * ring. Even in the case of a NIC that does not have a TX ring, a pseudo - * TX ring is given to the aggregation layer. + * The MAC client that hangs off each aggr port is not your typical + * MAC client. Not only does it have exclusive control of the MAC, but + * it also has no Tx or Rx SRSes. An SRS is designed to queue and + * fanout traffic among L4 protocols; but the aggr is an intermediary, + * not a consumer. Instead of using SRSes, the aggr puts the + * underlying hardware rings into passthru mode and ships packets up + * via a direct call to aggr_recv_cb(). This allows aggr to enforce + * LACP while passing all other traffic up to clients of the aggr. + * + * Pseudo Rx Groups and Rings + * -------------------------- + * + * It is imperative for client performance that the aggr provide as + * many MAC groups as possible. In order to use the underlying HW + * resources, aggr creates pseudo groups to aggregate the underlying + * HW groups. Every HW group gets mapped to a pseudo group; and every + * HW ring in that group gets mapped to a pseudo ring. The pseudo + * group at index 0 combines all the HW groups at index 0 from each + * port, etc. The aggr's MAC then creates normal MAC groups and rings + * out of these pseudo groups and rings to present to the aggr's + * clients. To the clients, the aggr's groups and rings are absolutely + * no different than a NIC's groups or rings. + * + * Pseudo Tx Rings + * --------------- + * + * The underlying ports (NICs) in an aggregation can have Tx rings. To + * enhance aggr's performance, these Tx rings are made available to + * the aggr layer as pseudo Tx rings. The concept of pseudo rings are + * not new. They are already present and implemented on the Rx side. 
+ * The same concept is extended to the Tx side where each Tx ring of + * an underlying port is reflected in aggr as a pseudo Tx ring. Thus + * each pseudo Tx ring will map to a specific hardware Tx ring. Even + * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring + * is given to the aggregation layer. * * With this change, the outgoing stack depth looks much better: * * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() * - * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings: + * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: * SRS_TX_AGGR and SRS_TX_BW_AGGR. * * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine - * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX + * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx * ring belonging to a port on which the packet has to be sent. * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 - * policy and then uses the fanout_hint passed to it to pick a TX ring from + * policy and then uses the fanout_hint passed to it to pick a Tx ring from * the selected port. * * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where * bandwidth limit is applied first on the outgoing packet and the packets * allowed to go out would call mac_tx_aggr_mode() to send the packet on a - * particular TX ring. + * particular Tx ring. 
*/ #include <sys/types.h> @@ -121,7 +151,8 @@ static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static int aggr_pseudo_disable_intr(mac_intr_handle_t); static int aggr_pseudo_enable_intr(mac_intr_handle_t); -static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); +static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); static int aggr_addmac(void *, const uint8_t *); static int aggr_remmac(void *, const uint8_t *); static int aggr_addvlan(mac_group_driver_t, uint16_t); @@ -366,9 +397,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) aggr_grp_multicst_port(port, B_TRUE); /* - * Set port's receive callback + * The port client doesn't have an Rx SRS; instead of calling + * mac_rx_set() we set the client's flow callback directly. + * This datapath is used only when the port's driver doesn't + * support MAC_CAPAB_RINGS. Drivers with ring support will + * deliver traffic to the aggr via ring passthru. */ - mac_rx_set(port->lp_mch, aggr_recv_cb, port); + mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -398,7 +433,7 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_clear(port->lp_mch); + mac_client_clear_flow_cb(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); @@ -537,26 +572,27 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, zoneid_t port_zoneid = ALL_ZONES; int err; - /* The port must be int the same zone as the aggregation. */ + /* The port must be in the same zone as the aggregation. 
*/ if (zone_check_datalink(&port_zoneid, port_linkid) != 0) port_zoneid = GLOBAL_ZONEID; if (grp->lg_zoneid != port_zoneid) return (EBUSY); /* - * lg_mh could be NULL when the function is called during the creation - * of the aggregation. + * If we are creating the aggr, then there is no MAC handle + * and thus no perimeter to hold. If we are adding a port to + * an existing aggr, then the perimiter of the aggr's MAC must + * be held. */ ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); - /* create new port */ err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); mac_perim_enter_by_mh(port->lp_mh, &mph); - /* add port to list of group constituent ports */ + /* Add the new port to the end of the list. */ cport = &grp->lg_ports; while (*cport != NULL) cport = &((*cport)->lp_next); @@ -638,6 +674,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; ring->arr_port = port; + ring->arr_grp = rx_grp; rx_grp->arg_ring_cnt++; /* @@ -648,10 +685,15 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; } else { - mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, - mac_find_ring(rx_grp->arg_gh, j)); + /* + * This must run after the MAC is registered. 
+ */ + ASSERT3P(ring->arr_rh, !=, NULL); + mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, + (void *)port, (mac_resource_handle_t)ring); } return (err); } @@ -662,11 +704,9 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, static void aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) { - aggr_pseudo_rx_ring_t *ring; - int j; + for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { + aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; - for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { - ring = rx_grp->arg_rings + j; if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || ring->arr_hw_rh != hw_rh) { continue; @@ -677,8 +717,9 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; - mac_hwring_teardown(hw_rh); + mac_hwring_clear_passthru(hw_rh); break; } } @@ -695,52 +736,41 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) static int aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr, *a; mac_perim_handle_t pmph; aggr_vlan_t *avp; - int hw_rh_cnt, i = 0, j; + uint_t hw_rh_cnt, i; int err = 0; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * This function must be called after the aggr registers its MAC - * and its Rx group has been initialized. + * This function must be called after the aggr registers its + * MAC and its Rx groups have been initialized. */ ASSERT(rx_grp->arg_gh != NULL); /* * Get the list of the underlying HW rings. 
*/ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); - - if (port->lp_hwgh != NULL) { - /* - * Quiesce the HW ring and the MAC SRS on the ring. Note - * that the HW ring will be restarted when the pseudo ring - * is started. At that time all the packets will be - * directly passed up to the pseudo Rx ring and handled - * by MAC SRS created over the pseudo Rx ring. - */ - mac_rx_client_quiesce(port->lp_mch); - mac_srs_perm_quiesce(port->lp_mch, B_TRUE); - } + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, + &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); /* * Add existing VLAN and unicast address filters to the port. */ for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; avp = list_next(&rx_grp->arg_vlans, avp)) { - if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0) + if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) goto err; } for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { - if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) + if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) goto err; } @@ -750,18 +780,17 @@ aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) goto err; } - port->lp_rx_grp_added = B_TRUE; mac_perim_exit(pmph); return (0); err: ASSERT(err != 0); - for (j = 0; j < i; j++) + for (uint_t j = 0; j < i; j++) aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) - aggr_port_remmac(port, a->aua_addr); + aggr_port_remmac(port, g_idx, a->aua_addr); if (avp != NULL) avp = list_prev(&rx_grp->arg_vlans, avp); @@ -769,19 +798,14 @@ err: for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { int err2; - if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) { + if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" ": errno %d.", avp->av_vid, mac_client_name(port->lp_mch), err2); } } - if (port->lp_hwgh != 
NULL) { - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); - port->lp_hwgh = NULL; - } - + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); return (err); } @@ -795,55 +819,38 @@ err: static void aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr; - mac_group_handle_t hwgh; mac_perim_handle_t pmph; - int hw_rh_cnt, i; + uint_t hw_rh_cnt; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); + ASSERT3P(rx_grp->arg_gh, !=, NULL); mac_perim_enter_by_mh(port->lp_mh, &pmph); - if (!port->lp_rx_grp_added) - goto done; - - ASSERT(rx_grp->arg_gh != NULL); - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &hwgh, hw_rh, MAC_RING_TYPE_RX); + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, + MAC_RING_TYPE_RX); - for (i = 0; i < hw_rh_cnt; i++) + for (uint_t i = 0; i < hw_rh_cnt; i++) aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) - aggr_port_remmac(port, addr->aua_addr); + aggr_port_remmac(port, g_idx, addr->aua_addr); for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; avp = list_next(&rx_grp->arg_vlans, avp)) { int err; - if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) { + if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" ": errno %d.", avp->av_vid, mac_client_name(port->lp_mch), err); } } - if (port->lp_hwgh != NULL) { - port->lp_hwgh = NULL; - - /* - * First clear the permanent-quiesced flag of the RX srs then - * restart the HW ring and the mac srs on the ring. Note that - * the HW ring and associated SRS will soon been removed when - * the port is removed from the aggr. 
- */ - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); - } - - port->lp_rx_grp_added = B_FALSE; -done: + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); } @@ -947,8 +954,8 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) /* * Get the list the the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - NULL, hw_rh, MAC_RING_TYPE_TX); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, + MAC_RING_TYPE_TX); /* * Even if the underlying NIC does not have TX rings, we @@ -1054,21 +1061,45 @@ aggr_pseudo_enable_intr(mac_intr_handle_t ih) } /* - * Here we need to start the pseudo-ring. As MAC already ensures that the - * underlying device is set up, all we need to do is save the ring generation. - * - * Note, we don't end up wanting to use the underlying mac_hwring_start/stop - * functions here as those don't actually stop and start the ring, they just - * quiesce the ring. Regardless of whether the aggr is logically up or not, we - * want to make sure that we can receive traffic for LACP. + * Start the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to start the underlying HW + * ring. */ static int -aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) +aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) { + int err; aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; + err = mac_hwring_start(rr_ring->arr_hw_rh); + + if (err != 0) + return (err); + rr_ring->arr_gen = mr_gen; - return (0); + return (err); +} + +/* + * Stop the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to stop the underlying HW + * ring. + */ +static void +aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) +{ + aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; + + /* + * The rings underlying the default group must stay up to + * continue receiving LACP traffic. 
We would normally never + * stop the default Rx rings because of the primary MAC + * client; but aggr's primary MAC client doesn't call + * mac_unicast_add() and thus mi_active is 0 when the last + * non-primary client is deleted. + */ + if (rr_ring->arr_grp->arg_index != 0) + mac_hwring_stop(rr_ring->arr_hw_rh); } /* @@ -1078,13 +1109,15 @@ int aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, laioc_port_t *ports) { - int rc, i, nadded = 0; + int rc; + uint_t port_added = 0; + uint_t grp_added; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; mac_perim_handle_t mph, pmph; - /* get group corresponding to linkid */ + /* Get the aggr corresponding to linkid. */ rw_enter(&aggr_grp_lock, RW_READER); if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), (mod_hash_val_t *)&grp) != 0) { @@ -1094,20 +1127,22 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, AGGR_GRP_REFHOLD(grp); /* - * Hold the perimeter so that the aggregation won't be destroyed. + * Hold the perimeter so that the aggregation can't be destroyed. */ mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); - /* add the specified ports to group */ - for (i = 0; i < nports; i++) { - /* add port to group */ + /* Add the specified ports to the aggr. 
*/ + for (uint_t i = 0; i < nports; i++) { + grp_added = 0; + if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port)) != 0) { goto bail; } + ASSERT(port != NULL); - nadded++; + port_added++; /* check capabilities */ if (!aggr_grp_capab_check(grp, port) || @@ -1124,9 +1159,16 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); if (rc != 0) goto bail; - rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); - if (rc != 0) - goto bail; + + for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { + rc = aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + + if (rc != 0) + goto bail; + + grp_added++; + } mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -1144,7 +1186,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, /* * Turn on the promiscuous mode over the port when it * is requested to be turned on to receive the - * non-primary address over a port, or the promiscous + * non-primary address over a port, or the promiscuous * mode is enabled over the aggr. */ if (grp->lg_promisc || port->lp_prom_addr != NULL) { @@ -1179,17 +1221,33 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded; i++) { + for (uint_t i = 0; i < port_added; i++) { + uint_t grp_remove; + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); + if (grp->lg_started) { mac_perim_enter_by_mh(port->lp_mh, &pmph); (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); mac_perim_exit(pmph); } + aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + + /* + * Only the last port could have a partial set + * of groups added. + */ + grp_remove = (i + 1 == port_added) ? 
grp_added : + grp->lg_rx_group_count; + + for (uint_t j = 0; j < grp_remove; j++) { + aggr_rem_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + } + (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } @@ -1351,14 +1409,11 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP), KM_SLEEP); grp->lg_tx_blocked_cnt = 0; - bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); + bzero(&grp->lg_rx_groups, + sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); aggr_lacp_init_grp(grp); - grp->lg_rx_group.arg_untagged = 0; - list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t), - offsetof(aggr_vlan_t, av_link)); - /* add MAC ports to group */ grp->lg_ports = NULL; grp->lg_nports = 0; @@ -1380,6 +1435,42 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, goto bail; } + grp->lg_rx_group_count = 1; + + for (port = grp->lg_ports; port != NULL; port = port->lp_next) { + uint_t num_rgroups; + + mac_perim_enter_by_mh(port->lp_mh, &mph); + num_rgroups = mac_get_num_rx_groups(port->lp_mh); + mac_perim_exit(mph); + + /* + * Utilize all the groups in a port. If some ports + * have less groups than others, then traffic destined + * for the same unicast address may be HW classified + * on some ports but SW classified by aggr when + * arriving on other ports. + */ + grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, + num_rgroups); + } + + /* + * There could be cases where the hardware provides more + * groups than aggr can support. Make sure we never go above + * the max aggr can support. 
+ */ + grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, + MAX_GROUPS_PER_PORT); + + ASSERT3U(grp->lg_rx_group_count, >, 0); + for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { + grp->lg_rx_groups[i].arg_index = i; + grp->lg_rx_groups[i].arg_untagged = 0; + list_create(&(grp->lg_rx_groups[i].arg_vlans), + sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); + } + /* * If no explicit MAC address was specified by the administrator, * set it to the MAC address of the first port. @@ -1397,7 +1488,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* set the initial group capabilities */ + /* Set the initial group capabilities. */ aggr_grp_capab_set(grp); if ((mac = mac_alloc(MAC_VERSION)) == NULL) { @@ -1432,14 +1523,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, * Update the MAC address of the constituent ports. * None of the port is attached at this time, the link state of the * aggregation will not change. + * + * All ports take on the primary MAC address of the aggr + * (lg_aggr). At this point, none of the ports are attached; + * thus the link state of the aggregation will not change. */ link_state_changed = aggr_grp_update_ports_mac(grp); ASSERT(!link_state_changed); - /* update outbound load balancing policy */ + /* Update outbound load balancing policy. */ aggr_send_update_policy(grp, policy); - /* set LACP mode */ + /* Set LACP mode. */ aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* @@ -1447,12 +1542,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { /* - * Create the pseudo ring for each HW ring of the underlying - * port. Note that this is done after the aggr registers the - * mac. + * Create the pseudo ring for each HW ring of the + * underlying port. Note that this is done after the + * aggr registers its MAC. 
*/ - VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); - VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); + VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), + ==, 0); + + for (i = 0; i < grp->lg_rx_group_count; i++) { + VERIFY3S(aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[i]), ==, 0); + } + if (aggr_port_notify_link(grp, port)) link_state_changed = B_TRUE; @@ -1734,7 +1835,8 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) * aggr_find_tx_ring() will not return any rings * belonging to it. */ - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, @@ -1839,7 +1941,8 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) (void) aggr_grp_detach_port(grp, port); mac_perim_exit(pmph); aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); aggr_port_delete(port); port = cport; } @@ -1858,7 +1961,9 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) VERIFY(mac_unregister(grp->lg_mh) == 0); grp->lg_mh = NULL; - list_destroy(&(grp->lg_rx_group.arg_vlans)); + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { + list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); + } AGGR_GRP_REFRELE(grp); return (0); @@ -2224,17 +2329,15 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) return (!grp->lg_zcopy); case MAC_CAPAB_RINGS: { mac_capab_rings_t *cap_rings = cap_data; + uint_t ring_cnt = 0; + + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; if (cap_rings->mr_type == MAC_RING_TYPE_RX) { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; - - /* - * An 
aggregation advertises only one (pseudo) RX - * group, which virtualizes the main/primary group of - * the underlying devices. - */ - cap_rings->mr_gnum = 1; + cap_rings->mr_rnum = ring_cnt; + cap_rings->mr_gnum = grp->lg_rx_group_count; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; } else { @@ -2273,12 +2376,10 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { aggr_grp_t *grp = arg; - aggr_pseudo_rx_group_t *rx_group; - aggr_pseudo_tx_group_t *tx_group; - ASSERT(index == 0); if (rtype == MAC_RING_TYPE_RX) { - rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; + rx_group->arg_gh = gh; rx_group->arg_grp = grp; @@ -2297,7 +2398,9 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_addvlan = aggr_addvlan; infop->mgi_remvlan = aggr_remvlan; } else { - tx_group = &grp->lg_tx_group; + aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; + + ASSERT3S(index, ==, 0); tx_group->atg_gh = gh; } } @@ -2313,13 +2416,13 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, switch (rtype) { case MAC_RING_TYPE_RX: { - aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group; aggr_pseudo_rx_ring_t *rx_ring; mac_intr_t aggr_mac_intr; - ASSERT(rg_index == 0); - - ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_group = &grp->lg_rx_groups[rg_index]; + ASSERT3S(index, >=, 0); + ASSERT3S(index, <, rx_group->arg_ring_cnt); rx_ring = rx_group->arg_rings + index; rx_ring->arr_rh = rh; @@ -2333,8 +2436,8 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, aggr_mac_intr.mi_ddi_handle = NULL; infop->mri_driver = (mac_ring_driver_t)rx_ring; - infop->mri_start = aggr_pseudo_start_ring; - infop->mri_stop = NULL; + infop->mri_start = aggr_pseudo_start_rx_ring; + infop->mri_stop = aggr_pseudo_stop_rx_ring; infop->mri_intr = aggr_mac_intr; infop->mri_poll = 
aggr_rx_poll; @@ -2421,6 +2524,7 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(grp->lg_mh, &mph); @@ -2447,12 +2551,12 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) *pprev = addr; for (port = grp->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_addmac(port, mac_addr)) != 0) + if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) break; if (err != 0) { for (p = grp->lg_ports; p != port; p = p->lp_next) - aggr_port_remmac(p, mac_addr); + aggr_port_remmac(p, idx, mac_addr); *pprev = NULL; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2497,7 +2601,7 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } for (port = grp->lg_ports; port != NULL; port = port->lp_next) - aggr_port_remmac(port, mac_addr); + aggr_port_remmac(port, rx_group->arg_index, mac_addr); *pprev = addr->aua_next; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2533,12 +2637,13 @@ aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) static int aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) { - aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; aggr_grp_t *aggr = rx_group->arg_grp; aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(aggr->lg_mh, &mph); @@ -2568,7 +2673,7 @@ aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) update_ports: for (port = aggr->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_addvlan(port, vid)) != 0) + if ((err = aggr_port_addvlan(port, idx, vid)) != 0) break; if (err != 0) { @@ -2581,7 +2686,7 @@ update_ports: for (p = aggr->lg_ports; p != port; p = p->lp_next) { int err2; - if ((err2 = aggr_port_remvlan(p, vid)) != 0) { + if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { cmn_err(CE_WARN, "Failed to 
remove VLAN %u" " from port %s: errno %d.", vid, mac_client_name(p->lp_mch), err2); @@ -2612,12 +2717,13 @@ done: static int aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) { - aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; aggr_grp_t *aggr = rx_group->arg_grp; aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(aggr->lg_mh, &mph); @@ -2648,7 +2754,7 @@ aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) update_ports: for (port = aggr->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_remvlan(port, vid)) != 0) + if ((err = aggr_port_remvlan(port, idx, vid)) != 0) break; /* @@ -2659,7 +2765,7 @@ update_ports: for (p = aggr->lg_ports; p != port; p = p->lp_next) { int err2; - if ((err2 = aggr_port_addvlan(p, vid)) != 0) { + if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { cmn_err(CE_WARN, "Failed to add VLAN %u" " to port %s: errno %d.", vid, mac_client_name(p->lp_mch), err2); diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 9d2edd4f97..e764dd104e 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -71,10 +71,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - ASSERT(port->lp_mnh == NULL); - ASSERT(port->lp_mphp == NULL); - ASSERT(!port->lp_rx_grp_added && !port->lp_tx_grp_added); - ASSERT(port->lp_hwgh == NULL); + ASSERT3P(port->lp_mnh, ==, NULL); + ASSERT(!port->lp_tx_grp_added); + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) + ASSERT3P(port->lp_hwghs[i], ==, NULL); } void @@ -128,7 +128,6 @@ aggr_port_init_callbacks(aggr_port_t *port) aggr_grp_port_hold(port); } -/* ARGSUSED */ int aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) @@ -197,9 +196,9 @@ aggr_port_create(aggr_grp_t 
*grp, const datalink_id_t linkid, boolean_t force, } /* - * As the underlying mac's current margin size is used to determine + * As the underlying MAC's current margin size is used to determine * the margin size of the aggregation itself, request the underlying - * mac not to change to a smaller size. + * MAC not to change to a smaller size. */ if ((err = mac_margin_add(mh, &margin, B_TRUE)) != 0) { id_free(aggr_portids, portid); @@ -208,7 +207,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, if ((err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, &mah, 0, &diag)) != 0) { - VERIFY(mac_margin_remove(mh, margin) == 0); + VERIFY3S(mac_margin_remove(mh, margin), ==, 0); id_free(aggr_portids, portid); goto fail; } @@ -263,6 +262,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, fail: if (mch != NULL) mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); + mac_close(mh); return (err); } @@ -272,13 +272,11 @@ aggr_port_delete(aggr_port_t *port) { aggr_lacp_port_t *pl = &port->lp_lacp; - ASSERT(port->lp_mphp == NULL); ASSERT(!port->lp_promisc_on); - port->lp_closing = B_TRUE; + VERIFY0(mac_margin_remove(port->lp_mh, port->lp_margin)); + mac_client_clear_flow_cb(port->lp_mch); - VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_clear(port->lp_mch); /* * If the notification callback is already in process and waiting for * the aggr grp's mac perimeter, don't wait (otherwise there would be @@ -309,8 +307,10 @@ aggr_port_delete(aggr_port_t *port) * port's MAC_NOTE_UNICST notify callback function being called. 
*/ (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + if (port->lp_mah != NULL) (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); @@ -521,6 +521,10 @@ aggr_port_stop(aggr_port_t *port) port->lp_started = B_FALSE; } +/* + * Set the promisc mode of the port. If the port is already in the + * requested mode then do nothing. + */ int aggr_port_promisc(aggr_port_t *port, boolean_t on) { @@ -529,34 +533,14 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) - /* already in desired promiscous mode */ return (0); - if (on) { - mac_rx_clear(port->lp_mch); + rc = mac_set_promisc(port->lp_mh, on); - /* - * We use the promisc callback because without hardware - * rings, we deliver through flows that will cause duplicate - * delivery of packets when we've flipped into this mode - * to compensate for the lack of hardware MAC matching - */ - rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_promisc_cb, port, &port->lp_mphp, - MAC_PROMISC_FLAGS_NO_TX_LOOP); - if (rc != 0) { - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - return (rc); - } - } else { - mac_promisc_remove(port->lp_mphp); - port->lp_mphp = NULL; - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - } - - port->lp_promisc_on = on; + if (rc == 0) + port->lp_promisc_on = on; - return (0); + return (rc); } /* @@ -596,35 +580,45 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) } /* - * Add a non-primary unicast address to the underlying port. If the port - * supports HW Rx group, try to add the address into the HW Rx group of - * the port first. If that fails, or if the port does not support HW Rx - * group, enable the port's promiscous mode. + * Add a non-primary unicast address to the underlying port. If the + * port supports HW Rx groups, then try to add the address filter to + * the HW group first. 
If that fails, or if the port does not support + * RINGS capab, then enable the port's promiscous mode. */ int -aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_addmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * If the underlying port support HW Rx group, add the mac to its - * RX group directly. + * If the port doesn't have a HW group to back the aggr's + * pseudo group, then try using the port's default group and + * let the aggr SW classify its traffic. This scenario happens + * when mixing ports with a different number of HW groups. */ - if ((port->lp_hwgh != NULL) && - ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * If there is an underlying HW Rx group, then try adding this + * unicast address to it. + */ + if ((port->lp_hwghs[idx] != NULL) && + ((mac_hwgroup_addmac(port->lp_hwghs[idx], mac_addr)) == 0)) { mac_perim_exit(pmph); return (0); } /* - * If that fails, or if the port does not support HW Rx group, enable - * the port's promiscous mode. (Note that we turn on the promiscous - * mode only if the port is already started. + * If the port doesn't have HW groups, or we failed to add the + * HW filter, then enable the port's promiscuous mode. We + * enable promiscuous mode only if the port is already started. */ if (port->lp_started && ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { @@ -656,13 +650,14 @@ aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) * promiscous mode. 
*/ void -aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_remmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_grp_t *grp = port->lp_grp; aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* @@ -675,6 +670,7 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) break; pprev = &addr->aua_next; } + if (addr != NULL) { /* * This unicast address put the port into the promiscous mode, @@ -687,52 +683,65 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) if (port->lp_prom_addr == NULL && !grp->lg_promisc) (void) aggr_port_promisc(port, B_FALSE); } else { - ASSERT(port->lp_hwgh != NULL); - (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + ASSERT3P(port->lp_hwghs[idx], !=, NULL); + (void) mac_hwgroup_remmac(port->lp_hwghs[idx], mac_addr); } + mac_perim_exit(pmph); } int -aggr_port_addvlan(aggr_port_t *port, uint16_t vid) +aggr_port_addvlan(aggr_port_t *port, uint_t idx, uint16_t vid) { mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + /* * Add the VLAN filter to the HW group if the port has a HW * group. If the port doesn't have a HW group, then it will * implicitly allow tagged traffic to pass and there is * nothing to do. 
*/ - if (port->lp_hwgh == NULL) { - mac_perim_exit(pmph); - return (0); - } + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_addvlan(port->lp_hwghs[idx], vid); - err = mac_hwgroup_addvlan(port->lp_hwgh, vid); mac_perim_exit(pmph); return (err); } int -aggr_port_remvlan(aggr_port_t *port, uint16_t vid) +aggr_port_remvlan(aggr_port_t *port, uint_t idx, uint16_t vid) { mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); - if (port->lp_hwgh == NULL) { - mac_perim_exit(pmph); - return (0); - } + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_remvlan(port->lp_hwghs[idx], vid); - err = mac_hwgroup_remvlan(port->lp_hwgh, vid); mac_perim_exit(pmph); return (err); } diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 33a060da48..b6b3e6de1f 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -22,6 +22,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -56,7 +57,7 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; - /* in promiscuous mode, send copy of packet up */ + /* In promiscuous mode, pass copy of packet up. 
*/ if (grp->lg_promisc) { mblk_t *nmp = copymsg(mp); @@ -74,23 +75,11 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* ARGSUSED */ static void aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback, boolean_t promisc_path) + boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; - /* - * In the case where lp_promisc_on has been turned on to - * compensate for insufficient hardware MAC matching and - * hardware rings are not in use we will fall back to - * using flows for delivery which can result in duplicates - * pushed up the stack. Only respect the chosen path. - */ - if (port->lp_promisc_on != promisc_path) { - freemsgchain(mp); - return; - } - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { aggr_mac_rx(grp->lg_mh, mrh, mp); } else { @@ -175,18 +164,9 @@ aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } -/* ARGSUSED */ void aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) { - aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE); -} - -/* ARGSUSED */ -void -aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) -{ - aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE); + aggr_recv_path_cb(arg, mrh, mp, loopback); } diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 4d450a539b..79a518a164 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1457,7 +1457,7 @@ mac_rx_group_unmark(mac_group_t *grp, uint_t flag) * used by the aggr driver to access and control the underlying HW Rx group * and rings. In this case, the aggr driver has exclusive control of the * underlying HW Rx group/rings, it calls the following functions to - * start/stop the HW Rx rings, disable/enable polling, add/remove mac' + * start/stop the HW Rx rings, disable/enable polling, add/remove MAC * addresses, or set up the Rx callback. 
*/ /* ARGSUSED */ @@ -1502,8 +1502,9 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, ASSERT(B_FALSE); return (-1); } + /* - * The mac client did not reserve any RX group, return directly. + * The MAC client did not reserve an Rx group, return directly. * This is probably because the underlying MAC does not support * any groups. */ @@ -1512,7 +1513,7 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, if (grp == NULL) return (0); /* - * This group must be reserved by this mac client. + * This group must be reserved by this MAC client. */ ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && (mcip == MAC_GROUP_ONLY_CLIENT(grp))); @@ -1528,6 +1529,78 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, } /* + * Get the HW ring handles of the given group index. If the MAC + * doesn't have a group at this index, or any groups at all, then 0 is + * returned and hwgh is set to NULL. This is a private client API. The + * MAC perimeter must be held when calling this function. + * + * mh: A handle to the MAC that owns the group. + * + * idx: The index of the HW group to be read. + * + * hwgh: If non-NULL, contains a handle to the HW group on return. + * + * hwrh: An array of ring handles pointing to the HW rings in the + * group. The array must be large enough to hold a handle to each ring + * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP. + * + * rtype: Used to determine if we are fetching Rx or Tx rings. + * + * Returns the number of rings in the group. + */ +uint_t +mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh, mac_ring_type_t rtype) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_group_t *grp; + mac_ring_t *ring; + uint_t cnt = 0; + + /* + * The MAC perimeter must be held when accessing the + * mi_{rx,tx}_groups fields. 
+ */ + ASSERT(MAC_PERIM_HELD(mh)); + ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX); + + if (rtype == MAC_RING_TYPE_RX) { + grp = mip->mi_rx_groups; + } else { + ASSERT(rtype == MAC_RING_TYPE_TX); + grp = mip->mi_tx_groups; + } + + while (grp != NULL && grp->mrg_index != idx) + grp = grp->mrg_next; + + /* + * If the MAC doesn't have a group at this index or doesn't + * impelement RINGS capab, then set hwgh to NULL and return 0. + */ + if (hwgh != NULL) + *hwgh = NULL; + + if (grp == NULL) + return (0); + + ASSERT3U(idx, ==, grp->mrg_index); + + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) { + ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP); + hwrh[cnt] = (mac_ring_handle_t)ring; + } + + /* A group should always have at least one ring. */ + ASSERT3U(cnt, >, 0); + + if (hwgh != NULL) + *hwgh = (mac_group_handle_t)grp; + + return (cnt); +} + +/* * This function is called to get info about Tx/Rx rings. * * Return value: returns uint_t which will have various bits set @@ -1543,6 +1616,69 @@ mac_hwring_getinfo(mac_ring_handle_t rh) } /* + * Set the passthru callback on the hardware ring. + */ +void +mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1, + mac_resource_handle_t arg2) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER; + + hwring->mr_pt_fn = fn; + hwring->mr_pt_arg1 = arg1; + hwring->mr_pt_arg2 = arg2; +} + +/* + * Clear the passthru callback on the hardware ring. 
+ */ +void +mac_hwring_clear_passthru(mac_ring_handle_t hwrh) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_NO_CLASSIFIER; + + hwring->mr_pt_fn = NULL; + hwring->mr_pt_arg1 = NULL; + hwring->mr_pt_arg2 = NULL; +} + +void +mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)func; + flent->fe_cb_arg1 = arg1; + flent->fe_cb_arg2 = NULL; + flent->fe_flags &= ~FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +void +mac_client_clear_flow_cb(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + flent->fe_flags |= FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +/* * Export ddi interrupt handles from the HW ring to the pseudo ring and * setup the RX callback of the mac client which exclusively controls * HW ring. @@ -1614,17 +1750,56 @@ mac_hwring_enable_intr(mac_ring_handle_t rh) return (intr->mi_enable(intr->mi_handle)); } +/* + * Start the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ int mac_hwring_start(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; + int rv = 0; + + if (rr_ring->mr_state != MR_INUSE) + rv = mac_start_ring(rr_ring); + + return (rv); +} + +/* + * Stop the HW ring pointed to by rh. Also see mac_hwring_start(). 
+ */ +void +mac_hwring_stop(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + + if (rr_ring->mr_state != MR_FREE) + mac_stop_ring(rr_ring); +} + +/* + * Remove the quiesced flag from the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ +int +mac_hwring_activate(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; MAC_RING_UNMARK(rr_ring, MR_QUIESCE); return (0); } +/* + * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate(). + */ void -mac_hwring_stop(mac_ring_handle_t rh) +mac_hwring_quiesce(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; @@ -1772,6 +1947,27 @@ mac_has_hw_vlan(mac_handle_t mh) } /* + * Get the number of Rx HW groups on this MAC. + */ +uint_t +mac_get_num_rx_groups(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (mip->mi_rx_group_count); +} + +int +mac_set_promisc(mac_handle_t mh, boolean_t value) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (i_mac_promisc_set(mip, value)); +} + +/* * Set the RX group to be shared/reserved. Note that the group must be * started/stopped outside of this function. */ @@ -2465,19 +2661,6 @@ mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) uint_t flags = FLOW_INBOUND; int err; - /* - * If the MAC is a port of an aggregation, pass FLOW_IGNORE_VLAN - * to mac_flow_lookup() so that the VLAN packets can be successfully - * passed to the non-VLAN aggregation flows. - * - * Note that there is possibly a race between this and - * mac_unicast_remove/add() and VLAN packets could be incorrectly - * classified to non-VLAN flows of non-aggregation MAC clients. These - * VLAN packets will be then filtered out by the MAC module. 
- */ - if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) - flags |= FLOW_IGNORE_VLAN; - err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); if (err != 0) { /* no registered receive function */ @@ -3811,9 +3994,27 @@ mac_start_group_and_rings(mac_group_t *group) for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) goto error; - ring->mr_classify_type = MAC_SW_CLASSIFIER; + + /* + * When aggr_set_port_sdu() is called, it will remove + * the port client's unicast address. This will cause + * MAC to stop the default group's rings on the port + * MAC. After it modifies the SDU, it will then re-add + * the unicast address. At which time, this function is + * called to start the default group's rings. Normally + * this function would set the classify type to + * MAC_SW_CLASSIFIER; but that will break aggr which + * relies on the passthru classify mode being set for + * correct delivery (see mac_rx_common()). To avoid + * that, we check for a passthru callback and set the + * classify type to MAC_PASSTHRU_CLASSIFIER; as it was + * before the rings were stopped. + */ + ring->mr_classify_type = (ring->mr_pt_fn != NULL) ? + MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER; } return (0); diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index b918bf4aca..c39e3fa12f 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -1436,6 +1436,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_flent = flent; FLOW_MARK(flent, FE_MC_NO_DATAPATH); flent->fe_mcip = mcip; + /* * Place initial creation reference on the flow. This reference * is released in the corresponding delete action viz. 
@@ -2437,7 +2438,17 @@ done_setup: if (flent->fe_rx_ring_group != NULL) mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); FLOW_UNMARK(flent, FE_INCIPIENT); - FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + + /* + * If this is an aggr port client, don't enable the flow's + * datapath at this stage. Otherwise, bcast traffic could + * arrive while the aggr port is in the process of + * initializing. Instead, the flow's datapath is started later + * when mac_client_set_flow_cb() is called. + */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); return (0); bail: diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index a3fc2529b9..e3b660c3b3 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -1975,8 +1975,6 @@ no_softrings: } /* - * mac_fanout_setup: - * * Calls mac_srs_fanout_init() or modify() depending upon whether * the SRS is getting initialized or re-initialized. */ @@ -1989,14 +1987,14 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, int i, rx_srs_cnt; ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + /* - * This is an aggregation port. Fanout will be setup - * over the aggregation itself. + * Aggr ports do not have SRSes. This function should never be + * called on an aggr port. */ - if (mcip->mci_state_flags & MCIS_EXCLUSIVE) - return; - + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); mac_rx_srs = flent->fe_rx_srs[0]; + /* * Set up the fanout on the tx side only once, with the * first rx SRS. The CPU binding, fanout, and bandwidth @@ -2052,8 +2050,6 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * mac_srs_create: - * * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side * processing is created. 
@@ -2355,6 +2351,10 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_rx_srs_group_setup(mcip, flent, link_type); mac_tx_srs_group_setup(mcip, flent, link_type); + /* Aggr ports don't have SRSes; thus there is no soft ring fanout. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + return; + pool_lock(); cpupart = mac_pset_find(mrp, &use_default); mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), @@ -2381,6 +2381,29 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_group_t *rx_group = flent->fe_rx_ring_group; boolean_t no_unicast; + /* + * If this is an aggr port, then don't setup Rx SRS and Rx + * soft rings as they won't be used. However, we still need to + * start the rings to receive data on them. + */ + if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) { + if (rx_group == NULL) + return; + + for (ring = rx_group->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) + (void) mac_start_ring(ring); + } + + return; + } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); + fanout_type = mac_find_fanout(flent, link_type); no_unicast = (mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR) != 0; @@ -2469,38 +2492,40 @@ void mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) { - int cnt; - int ringcnt; - mac_ring_t *ring; - mac_group_t *grp; - /* - * If we are opened exclusively (like aggr does for aggr_ports), - * don't set up Tx SRS and Tx soft rings as they won't be used. - * The same thing has to be done for Rx side also. See bug: - * 6880080 + * If this is an exclusive client (e.g. an aggr port), then + * don't setup Tx SRS and Tx soft rings as they won't be used. + * However, we still need to start the rings to send data + * across them. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) { - /* - * If we have rings, start them here. 
- */ - if (flent->fe_tx_ring_group == NULL) - return; + mac_ring_t *ring; + mac_group_t *grp; + grp = (mac_group_t *)flent->fe_tx_ring_group; - ringcnt = grp->mrg_cur_count; - ring = grp->mrg_rings; - for (cnt = 0; cnt < ringcnt; cnt++) { - if (ring->mr_state != MR_INUSE) { + + if (grp == NULL) + return; + + for (ring = grp->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) (void) mac_start_ring(ring); - } - ring = ring->mr_next; } + return; } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); + if (flent->fe_tx_srs == NULL) { (void) mac_srs_create(mcip, flent, SRST_TX | link_type, NULL, mcip, NULL, NULL); } + mac_tx_srs_setup(mcip, flent); } @@ -3168,12 +3193,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); mac_flow_wait(flent, FLOW_DRIVER_UPCALL); - /* Now quiesce and destroy all SRS and soft rings */ + /* Quiesce and destroy all the SRSes. */ mac_rx_srs_group_teardown(flent, B_FALSE); mac_tx_srs_group_teardown(mcip, flent, SRST_LINK); - ASSERT((mcip->mci_flent == flent) && - (flent->fe_next == NULL)); + ASSERT3P(mcip->mci_flent, ==, flent); + ASSERT3P(flent->fe_next, ==, NULL); /* * Release our hold on the group as well. We need @@ -4022,8 +4047,8 @@ mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart) } /* - * Walk through the list of mac clients for the MAC. - * For each active mac client, recompute the number of soft rings + * Walk through the list of MAC clients for the MAC. + * For each active MAC client, recompute the number of soft rings * associated with every client, only if current speed is different * from the speed that was previously used for soft ring computation. 
* If the cable is disconnected while the NIC is started, we would get @@ -4046,6 +4071,10 @@ mac_fanout_recompute(mac_impl_t *mip) for (mcip = mip->mi_clients_list; mcip != NULL; mcip = mcip->mci_client_next) { + /* Aggr port clients don't have SRSes. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + continue; + if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 || !MCIP_DATAPATH_SETUP(mcip)) continue; @@ -4058,6 +4087,7 @@ mac_fanout_recompute(mac_impl_t *mip) mac_set_pool_effective(use_default, cpupart, mrp, emrp); pool_unlock(); } + i_mac_perim_exit(mip); } diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 26f501668e..a3f0ca89ed 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -699,7 +699,6 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) mac_ring_t *mr = (mac_ring_t *)mrh; mac_soft_ring_set_t *mac_srs; mblk_t *bp = mp_chain; - boolean_t hw_classified = B_FALSE; /* * If there are any promiscuous mode callbacks defined for @@ -711,7 +710,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) if (mr != NULL) { /* * If the SRS teardown has started, just return. The 'mr' - * continues to be valid until the driver unregisters the mac. + * continues to be valid until the driver unregisters the MAC. * Hardware classified packets will not make their way up * beyond this point once the teardown has started. The driver * is never passed a pointer to a flow entry or SRS or any @@ -724,11 +723,25 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) freemsgchain(mp_chain); return; } - if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { - hw_classified = B_TRUE; + + /* + * The ring is in passthru mode; pass the chain up to + * the pseudo ring. 
+ */ + if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, + B_FALSE); + MR_REFRELE(mr); + return; } - mutex_exit(&mr->mr_lock); + + /* + * The passthru callback should only be set when in + * MAC_PASSTHRU_CLASSIFIER mode. + */ + ASSERT3P(mr->mr_pt_fn, ==, NULL); /* * We check if an SRS is controlling this ring. @@ -736,19 +749,24 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * routine otherwise we need to go through mac_rx_classify * to reach the right place. */ - if (hw_classified) { + if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { + MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + ASSERT3P(mr->mr_srs, !=, NULL); mac_srs = mr->mr_srs; + /* - * This is supposed to be the fast path. - * All packets received though here were steered by - * the hardware classifier, and share the same - * MAC header info. + * This is the fast path. All packets received + * on this ring are hardware classified and + * share the same MAC header info. */ mac_srs->srs_rx.sr_lower_proc(mh, (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); MR_REFRELE(mr); return; } + + mutex_exit(&mr->mr_lock); /* We'll fall through to software classification */ } else { flow_entry_t *flent; diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..dbb5c0a914 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -261,7 +262,7 @@ static stat_info_t rx_srs_stats_list[] = { {RX_SRS_STAT_OFF(mrs_chaincntover50)}, {RX_SRS_STAT_OFF(mrs_ierrors)} }; -#define RX_SRS_STAT_SIZE \ +#define RX_SRS_STAT_SIZE \ (sizeof (rx_srs_stats_list) / sizeof (stat_info_t)) #define TX_SOFTRING_STAT_OFF(f) (offsetof(mac_tx_stats_t, f)) @@ -273,14 +274,14 @@ static stat_info_t tx_softring_stats_list[] = { {TX_SOFTRING_STAT_OFF(mts_unblockcnt)}, {TX_SOFTRING_STAT_OFF(mts_sdrops)}, }; -#define TX_SOFTRING_STAT_SIZE \ +#define TX_SOFTRING_STAT_SIZE \ (sizeof (tx_softring_stats_list) / sizeof (stat_info_t)) static void i_mac_add_stats(void *sum, void *op1, void *op2, stat_info_t stats_list[], uint_t size) { - int i; + int i; for (i = 0; i < size; i++) { uint64_t *op1_val = (uint64_t *) @@ -678,8 +679,8 @@ i_mac_rx_hwlane_stat_create(mac_soft_ring_set_t *mac_srs, const char *modname, static uint64_t i_mac_misc_stat_get(void *handle, uint_t stat) { - flow_entry_t *flent = handle; - mac_client_impl_t *mcip = flent->fe_mcip; + flow_entry_t *flent = handle; + mac_client_impl_t *mcip = flent->fe_mcip; mac_misc_stats_t *mac_misc_stat = &mcip->mci_misc_stat; mac_rx_stats_t *mac_rx_stat; mac_tx_stats_t *mac_tx_stat; @@ -870,9 +871,9 @@ i_mac_tx_hwlane_stat_create(mac_soft_ring_t *ringp, const char *modname, static uint64_t i_mac_rx_fanout_stat_get(void *handle, uint_t stat) { - mac_soft_ring_t *tcp_ringp = (mac_soft_ring_t *)handle; + mac_soft_ring_t *tcp_ringp = (mac_soft_ring_t *)handle; mac_soft_ring_t *udp_ringp = NULL, *oth_ringp = NULL; - mac_soft_ring_set_t *mac_srs = tcp_ringp->s_ring_set; + mac_soft_ring_set_t *mac_srs = tcp_ringp->s_ring_set; int index; uint64_t val; @@ -1003,6 +1004,7 @@ void mac_ring_stat_create(mac_ring_t *ring) { mac_impl_t *mip = ring->mr_mip; + mac_group_t *grp = (mac_group_t *)ring->mr_gh; char statname[MAXNAMELEN]; char modname[MAXNAMELEN]; @@ -1014,8 +1016,8 @@ mac_ring_stat_create(mac_ring_t *ring) switch (ring->mr_type) { case MAC_RING_TYPE_RX: - (void) 
snprintf(statname, sizeof (statname), "mac_rx_ring%d", - ring->mr_index); + (void) snprintf(statname, sizeof (statname), + "mac_rx_ring_%d_%d", grp->mrg_index, ring->mr_index); i_mac_rx_ring_stat_create(ring, modname, statname); break; @@ -1035,7 +1037,7 @@ void mac_srs_stat_create(mac_soft_ring_set_t *mac_srs) { flow_entry_t *flent = mac_srs->srs_flent; - char statname[MAXNAMELEN]; + char statname[MAXNAMELEN]; boolean_t is_tx_srs; /* No hardware/software lanes for user defined flows */ diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 415e176ef3..80733aa31e 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -56,6 +56,8 @@ extern "C" { */ #define MAC_PSEUDO_RING_INUSE 0x01 +#define MAX_GROUPS_PER_PORT 128 + /* * VLAN filters placed on the Rx pseudo group. */ @@ -71,14 +73,23 @@ typedef struct aggr_unicst_addr_s { } aggr_unicst_addr_t; typedef struct aggr_pseudo_rx_ring_s { - mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */ - struct aggr_port_s *arr_port; - mac_ring_handle_t arr_hw_rh; - uint_t arr_flags; - uint64_t arr_gen; + mac_ring_handle_t arr_rh; /* set by aggr_fill_ring() */ + struct aggr_port_s *arr_port; + struct aggr_pseudo_rx_group_s *arr_grp; + mac_ring_handle_t arr_hw_rh; + uint_t arr_flags; + uint64_t arr_gen; } aggr_pseudo_rx_ring_t; +/* + * An aggr pseudo group abstracts the underlying ports' HW groups. For + * example, if each port has 8 groups (mac_group_t), then the aggr + * will create 8 pseudo groups. Each pseudo group represents a + * collection of HW groups: one group from each port. If you have + * three ports then the pseudo group stands in for three HW groups. 
+ */ typedef struct aggr_pseudo_rx_group_s { + uint_t arg_index; struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */ mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */ aggr_unicst_addr_t *arg_macaddr; @@ -119,12 +130,13 @@ typedef struct aggr_port_s { lp_collector_enabled : 1, lp_promisc_on : 1, lp_no_link_update : 1, - lp_rx_grp_added : 1, lp_tx_grp_added : 1, lp_closing : 1, - lp_pad_bits : 24; + lp_pad_bits : 25; mac_handle_t lp_mh; - mac_client_handle_t lp_mch; + + mac_client_handle_t lp_mch; + const mac_info_t *lp_mip; mac_notify_handle_t lp_mnh; uint_t lp_tx_idx; /* idx in group's tx array */ @@ -136,13 +148,19 @@ typedef struct aggr_port_s { aggr_lacp_port_t lp_lacp; /* LACP state */ lacp_stats_t lp_lacp_stats; uint32_t lp_margin; - mac_promisc_handle_t lp_mphp; + mac_unicast_handle_t lp_mah; /* List of non-primary addresses that require promiscuous mode set */ aggr_unicst_addr_t *lp_prom_addr; - /* handle of the underlying HW RX group */ - mac_group_handle_t lp_hwgh; + + /* + * References to the underlying HW Rx groups of this port. + * Used by aggr to program HW classification for the pseudo + * groups. 
+ */ + mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT]; + int lp_tx_ring_cnt; /* handles of the underlying HW TX rings */ mac_ring_handle_t *lp_tx_rings; @@ -189,7 +207,7 @@ typedef struct aggr_grp_s { lg_lso : 1, lg_pad_bits : 8; aggr_port_t *lg_ports; /* list of configured ports */ - aggr_port_t *lg_mac_addr_port; + aggr_port_t *lg_mac_addr_port; /* using address of this port */ mac_handle_t lg_mh; zoneid_t lg_zoneid; uint_t lg_nattached_ports; @@ -233,7 +251,9 @@ typedef struct aggr_grp_s { kthread_t *lg_lacp_rx_thread; boolean_t lg_lacp_done; - aggr_pseudo_rx_group_t lg_rx_group; + uint_t lg_rx_group_count; + aggr_pseudo_rx_group_t lg_rx_groups[MAX_GROUPS_PER_PORT]; + aggr_pseudo_tx_group_t lg_tx_group; kmutex_t lg_tx_flowctl_lock; @@ -328,8 +348,6 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); -extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *, - boolean_t); extern void aggr_tx_ring_update(void *, uintptr_t); extern void aggr_tx_notify_thread(void *); @@ -357,11 +375,11 @@ extern void aggr_grp_port_hold(aggr_port_t *); extern void aggr_grp_port_rele(aggr_port_t *); extern void aggr_grp_port_wait(aggr_grp_t *); -extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); -extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); +extern int aggr_port_addmac(aggr_port_t *, uint_t, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, uint_t, const uint8_t *); -extern int aggr_port_addvlan(aggr_port_t *, uint16_t); -extern int aggr_port_remvlan(aggr_port_t *, uint16_t); +extern int aggr_port_addvlan(aggr_port_t *, uint_t, uint16_t); +extern int aggr_port_remvlan(aggr_port_t *, uint_t, uint16_t); extern mblk_t *aggr_ring_tx(void *, mblk_t *); extern mblk_t *aggr_find_tx_ring(void *, mblk_t *, diff --git a/usr/src/uts/common/sys/mac_client_priv.h 
b/usr/src/uts/common/sys/mac_client_priv.h index 77475b339e..965dca263c 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -121,9 +121,17 @@ extern void mac_tx_client_quiesce(mac_client_handle_t); extern void mac_tx_client_condemn(mac_client_handle_t); extern void mac_tx_client_restart(mac_client_handle_t); extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern uint_t mac_hwrings_idx_get(mac_handle_t, uint_t, mac_group_handle_t *, + mac_ring_handle_t *, mac_ring_type_t); extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, mac_ring_handle_t *, mac_ring_type_t); extern uint_t mac_hwring_getinfo(mac_ring_handle_t); +extern void mac_hwring_set_passthru(mac_ring_handle_t, mac_rx_t, void *, + mac_resource_handle_t); +extern void mac_hwring_clear_passthru(mac_ring_handle_t); +extern void mac_client_set_flow_cb(mac_client_handle_t, mac_rx_t, void *); +extern void mac_client_clear_flow_cb(mac_client_handle_t); + extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t, mac_ring_handle_t); extern void mac_hwring_teardown(mac_ring_handle_t); @@ -131,6 +139,8 @@ extern int mac_hwring_disable_intr(mac_ring_handle_t); extern int mac_hwring_enable_intr(mac_ring_handle_t); extern int mac_hwring_start(mac_ring_handle_t); extern void mac_hwring_stop(mac_ring_handle_t); +extern int mac_hwring_activate(mac_ring_handle_t); +extern void mac_hwring_quiesce(mac_ring_handle_t); extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int); extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); extern int mac_hwring_getstat(mac_ring_handle_t, uint_t, uint64_t *); @@ -149,6 +159,9 @@ extern int mac_hwgroup_remvlan(mac_group_handle_t, uint16_t); extern boolean_t mac_has_hw_vlan(mac_handle_t); +extern uint_t mac_get_num_rx_groups(mac_handle_t); +extern int mac_set_promisc(mac_handle_t, boolean_t); + extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t, mac_resource_props_t *); diff 
--git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index eebbde37de..df03a76715 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -208,9 +208,18 @@ struct mac_ring_s { mac_ring_t *mr_next; /* next ring in the chain */ mac_group_handle_t mr_gh; /* reference to group */ - mac_classify_type_t mr_classify_type; /* HW vs SW */ + mac_classify_type_t mr_classify_type; struct mac_soft_ring_set_s *mr_srs; /* associated SRS */ - mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + + /* + * Ring passthru callback and arguments. See the + * MAC_PASSTHRU_CLASSIFIER comment in mac_provider.h. + */ + mac_rx_t mr_pt_fn; + void *mr_pt_arg1; + mac_resource_handle_t mr_pt_arg2; + uint_t mr_refcnt; /* Ring references */ /* ring generation no. to guard against drivers using stale rings */ uint64_t mr_gen_num; diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 301bc9a058..8e00dfced6 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -242,16 +242,59 @@ typedef struct mac_callbacks_s { /* * Virtualization Capabilities */ + /* - * The ordering of entries below is important. MAC_HW_CLASSIFIER - * is the cutoff below which are entries which don't depend on - * H/W. MAC_HW_CLASSIFIER and entries after that are cases where - * H/W has been updated through add/modify/delete APIs. + * The type of ring classification. This is used by MAC to determine + * what, if any, processing it has to do upon receiving traffic on a + * particular Rx ring. + * + * MAC_NO_CLASSIFIER + * + * No classification has been set. No traffic should cross an Rx + * ring in this state. + * + * MAC_SW_CLASSIFIER + * + * The driver delivers traffic for multiple clients to this ring. + * All traffic must be software classified by MAC to guarantee + * delivery to the correct client. 
This classification type may + * be chosen for several reasons. + * + * o The driver provides only one group and there are multiple + * clients using the MAC. + * + * o The driver provides some hardware filtering but not enough + * to fully classify the traffic. E.g., a VLAN VNIC requires L2 + * unicast address filtering as well as VLAN filtering, but + * some drivers may only support the former. + * + * o The ring belongs to the default group. The default group + * acts as a spillover for all clients that can't reserve an + * exclusive group. It also handles multicast traffic for all + * clients. For these reasons, the default group's rings are + * always software classified. + * + * MAC_HW_CLASSIFIER + * + * The driver delivers traffic for a single MAC client across + * this ring. With this guarantee, MAC can simply pass the + * traffic up the stack or even allow polling of the ring. + * + * MAC_PASSTHRU_CLASSIFIER + * + * The ring is in "passthru" mode. In this mode we bypass all of + * the typical MAC processing and pass the traffic directly to + * the mr_pt_fn callback, see mac_rx_common(). This is used in + * cases where there is another module acting as MAC provider on + * behalf of the driver. E.g., link aggregations use this mode to + * take full control of the port's rings; allowing it to enforce + * LACP protocols and aggregate rings across discrete drivers. 
*/ typedef enum { MAC_NO_CLASSIFIER = 0, MAC_SW_CLASSIFIER, - MAC_HW_CLASSIFIER + MAC_HW_CLASSIFIER, + MAC_PASSTHRU_CLASSIFIER } mac_classify_type_t; typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *, @@ -364,6 +407,7 @@ typedef struct mac_ring_info_s { mac_ring_poll_t poll; } mrfunion; mac_ring_stat_t mri_stat; + /* * mri_flags will have some bits set to indicate some special * property/feature of a ring like serialization needed for a diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c index c9e378f89e..96fb04175d 100644 --- a/usr/src/uts/sun4v/io/vnet.c +++ b/usr/src/uts/sun4v/io/vnet.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -1132,9 +1133,9 @@ vnet_mac_register(vnet_t *vnetp) static int vnet_read_mac_address(vnet_t *vnetp) { - uchar_t *macaddr; - uint32_t size; - int rv; + uchar_t *macaddr; + uint32_t size; + int rv; rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip, DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size); @@ -2317,7 +2318,7 @@ vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index, */ static void vnet_get_group(void *arg, mac_ring_type_t type, const int index, - mac_group_info_t *infop, mac_group_handle_t handle) + mac_group_info_t *infop, mac_group_handle_t handle) { vnet_t *vnetp = (vnet_t *)arg; @@ -2405,7 +2406,7 @@ vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num) return (0); } - err = mac_hwring_start(rx_ringp->hw_rh); + err = mac_hwring_activate(rx_ringp->hw_rh); if (err == 0) { rx_ringp->gen_num = mr_gen_num; rx_ringp->state |= VNET_RXRING_STARTED; @@ -2443,7 +2444,7 @@ vnet_rx_ring_stop(mac_ring_driver_t arg) return; } - mac_hwring_stop(rx_ringp->hw_rh); + mac_hwring_quiesce(rx_ringp->hw_rh); rx_ringp->state &= ~VNET_RXRING_STARTED; } @@ -2630,7 +2631,7 @@ vnet_rx_poll(void *arg, int bytes_to_pickup) /* ARGSUSED */ void 
vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) + boolean_t loopback) { vnet_t *vnetp = (vnet_t *)arg; vnet_pseudo_rx_ring_t *ringp = (vnet_pseudo_rx_ring_t *)mrh; @@ -2846,7 +2847,7 @@ vnet_bind_hwrings(vnet_t *vnetp) /* Start the hwring if needed */ if (rx_ringp->state & VNET_RXRING_STARTED) { - rv = mac_hwring_start(rx_ringp->hw_rh); + rv = mac_hwring_activate(rx_ringp->hw_rh); if (rv != 0) { mac_hwring_teardown(rx_ringp->hw_rh); rx_ringp->hw_rh = NULL; @@ -2920,7 +2921,7 @@ vnet_unbind_hwrings(vnet_t *vnetp) rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX]; if (rx_ringp->hw_rh != NULL) { /* Stop the hwring */ - mac_hwring_stop(rx_ringp->hw_rh); + mac_hwring_quiesce(rx_ringp->hw_rh); /* Teardown the hwring */ mac_hwring_teardown(rx_ringp->hw_rh); |