diff options
author | Paul Winder <paul@winder.uk.net> | 2020-10-12 12:01:29 +0100 |
---|---|---|
committer | Paul Winder <paul@winder.uk.net> | 2020-12-11 07:51:49 +0000 |
commit | 13810335a5a8384eed97a8661536eb5352f0c933 (patch) | |
tree | 7afb12a82fa8bc973c6cc683e02148b2d0d9afee | |
parent | fe17aa88307d9cacf6677bbbe955585b11920199 (diff) | |
download | illumos-joyent-13810335a5a8384eed97a8661536eb5352f0c933.tar.gz |
13208 Create aggr fails when underlying links have more than 128 Tx rings
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r-- | usr/src/uts/common/io/aggr/aggr_grp.c | 117 |
-rw-r--r-- | usr/src/uts/common/sys/aggr_impl.h | 5 |
2 files changed, 108 insertions, 14 deletions
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 9a4a936450..a4cfdad51e 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -607,6 +607,8 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, port->lp_grp = grp; AGGR_GRP_REFHOLD(grp); grp->lg_nports++; + if (grp->lg_nports > grp->lg_nports_high) + grp->lg_nports_high = grp->lg_nports; aggr_lacp_init_port(port); mac_perim_exit(mph); @@ -675,7 +677,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, * No slot for this new RX ring. */ if (j == MAX_RINGS_PER_GROUP) - return (EIO); + return (ENOSPC); ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; @@ -884,7 +886,7 @@ aggr_add_pseudo_tx_ring(aggr_port_t *port, * No slot for this new TX ring. */ if (i == MAX_RINGS_PER_GROUP) - return (EIO); + return (ENOSPC); /* * The following 4 statements needs to be done before * calling mac_group_add_ring(). Otherwise it will @@ -948,7 +950,8 @@ aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, * rings of the aggr and the hardware rings of the underlying port. 
*/ static int -aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) +aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp, + uint_t limit) { aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; @@ -956,6 +959,9 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) int hw_rh_cnt, i = 0, j; int err = 0; + if (limit == 0) + return (ENOSPC); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -973,12 +979,13 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) if (hw_rh_cnt == 0) port->lp_tx_ring_cnt = 1; else - port->lp_tx_ring_cnt = hw_rh_cnt; + port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit); + port->lp_tx_ring_alloc = port->lp_tx_ring_cnt; port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * - port->lp_tx_ring_cnt), KM_SLEEP); + port->lp_tx_ring_alloc), KM_SLEEP); port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * - port->lp_tx_ring_cnt), KM_SLEEP); + port->lp_tx_ring_alloc), KM_SLEEP); if (hw_rh_cnt == 0) { if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, @@ -987,7 +994,7 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) port->lp_pseudo_tx_rings[0] = pseudo_rh; } } else { - for (i = 0; err == 0 && i < hw_rh_cnt; i++) { + for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) { err = aggr_add_pseudo_tx_ring(port, tx_grp, hw_rh[i], &pseudo_rh); if (err != 0) @@ -1005,10 +1012,11 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) } } kmem_free(port->lp_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); kmem_free(port->lp_pseudo_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); port->lp_tx_ring_cnt = 0; + port->lp_tx_ring_alloc = 0; } else { port->lp_tx_grp_added = 
B_TRUE; port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, @@ -1042,9 +1050,9 @@ aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); kmem_free(port->lp_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); kmem_free(port->lp_pseudo_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); port->lp_tx_ring_cnt = 0; (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); @@ -1111,6 +1119,48 @@ aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) } /* + * Trim each port in a group to ensure it uses no more than tx_ring_limit + * rings. + */ +static void +aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit) +{ + aggr_port_t *port; + mac_perim_handle_t mph; + uint_t i, tx_ring_cnt; + + ASSERT(tx_ring_limit > 0); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + + for (port = grp->lg_ports; port != NULL; port = port->lp_next) { + mac_perim_enter_by_mh(port->lp_mh, &mph); + + /* + * Reduce the Tx ring count first to prevent rings being + * used as they are removed. + */ + rw_enter(&grp->lg_tx_lock, RW_WRITER); + if (port->lp_tx_ring_cnt <= tx_ring_limit) { + rw_exit(&grp->lg_tx_lock); + mac_perim_exit(mph); + continue; + } + + tx_ring_cnt = port->lp_tx_ring_cnt; + port->lp_tx_ring_cnt = tx_ring_limit; + rw_exit(&grp->lg_tx_lock); + + for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) { + aggr_rem_pseudo_tx_ring(&grp->lg_tx_group, + port->lp_pseudo_tx_rings[i]); + + } + + mac_perim_exit(mph); + } +} + +/* * Add one or more ports to an existing link aggregation group. 
*/ int @@ -1120,6 +1170,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, int rc; uint_t port_added = 0; uint_t grp_added; + uint_t nports_high, tx_ring_limit; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; @@ -1140,6 +1191,24 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); + /* + * Limit the number of Tx rings per port. When determining the + * number of ports take into consideration the existing high + * value, and what the new high value may be after this request. + */ + nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports); + tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high; + + if (tx_ring_limit == 0) { + rc = ENOSPC; + goto bail; + } + + /* + * Balance the Tx rings so each port has a fair share of rings. + */ + aggr_grp_balance_tx(grp, tx_ring_limit); + /* Add the specified ports to the aggr. */ for (uint_t i = 0; i < nports; i++) { grp_added = 0; @@ -1164,7 +1233,8 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, * Create the pseudo ring for each HW ring of the underlying * port. */ - rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); + rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, + tx_ring_limit); if (rc != 0) goto bail; @@ -1380,6 +1450,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, mac_perim_handle_t mph, pmph; datalink_id_t tempid; boolean_t mac_registered = B_FALSE; + uint_t tx_ring_limit; int err; int i, j; kt_did_t tid = 0; @@ -1551,6 +1622,25 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* + * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP + * rings, when all the Tx rings of all the ports are accumulated + * it is conceivable this limit is exceeded. 
We try and prevent + * this by limiting the number of rings an individual port will use. + * + * - When an aggr is first created, we will not let an + * individual port use more than MAX_RINGS_PER_GROUP/nports + * rings. + * - As ports are added to an existing aggr, each of the + * ports will not use more than MAX_RINGS_PER_GROUP/nports_high. + * Where nports_high is the highest number of ports the aggr has + * held (including any ports being added). This may involve + * trimming rings from existing ports. + */ + + /* Leave room for 4 ports */ + tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports); + + /* * Attach each port if necessary. */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { @@ -1559,7 +1649,8 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, * underlying port. Note that this is done after the * aggr registers its MAC. */ - err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); + err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, + tx_ring_limit); if (err != 0) { mac_perim_exit(mph); diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 80733aa31e..de4162bc61 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -23,6 +23,7 @@ * Use is subject to license terms. * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ #ifndef _SYS_AGGR_IMPL_H @@ -161,7 +162,8 @@ typedef struct aggr_port_s { */ mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT]; - int lp_tx_ring_cnt; + uint_t lp_tx_ring_alloc; + uint_t lp_tx_ring_cnt; /* handles of the underlying HW TX rings */ mac_ring_handle_t *lp_tx_rings; /* @@ -195,6 +197,7 @@ typedef struct aggr_grp_s { uint16_t lg_key; /* key (group port number) */ uint32_t lg_refs; /* refcount */ uint16_t lg_nports; /* number of MAC ports */ + uint16_t lg_nports_high; /* highest no. 
of MAC ports */ uint8_t lg_addr[ETHERADDRL]; /* group MAC address */ uint16_t lg_closing : 1, |