author    Venugopal Iyer <Venu.Iyer@Sun.COM>  2009-02-17 01:31:30 -0800
committer Venugopal Iyer <Venu.Iyer@Sun.COM>  2009-02-17 01:31:30 -0800
commit    ae6aa22afeb444ae208c287e7227a4a7c877f17a (patch)
tree      744dffd8856e6a2a16544575ca8773771051dada /usr/src/uts/common/io
parent    d02310705313ee2fcefee164a4b26d1fa85e9d22 (diff)
PSARC/2009/099 dladm show-usage modifications
6726676 flow should not be seen by flowadm show-usage after the flow has been removed by flowadm remove-flow
6766669 "dladm show-vnic -o" can't accept MACADDRESS
6773854 Per Tx ring flow control for UDP
6777547 mac_tx() should compute the hash if the passed hint is zero
6778557 nxge m_tx() should fanout to multiple rings for vnet scalability
6779356 sometimes packets are not classified to the correct flow
6783011 pre-existing subflows not initialized on a non-dls client when brought up
6786734 acctadm dladm_start_usagelog() calls need some work
6789760 mac perimeter deadlock due to dls_devnet_stat_update()
6789883 dladm show-link -s is adrift again.
6791099 mac_tx() frees the message but returns non-NULL cookie which causes panic
6791109 maxbw set on a link should not apply if this link is the underlying port of an aggregation
6791118 panic in mac_bcast_delete() unplumbing an IP interface
6791456 deleting last vnic interface causes bge interface to stop working
6791678 xvm guests don't communicate through vnics configured on vlan
6792164 race between mac_tx_is_flow_blocked() and mac_srs_group_teardown() could cause panic
6792546 panicked in bge_ring_tx()/freemsg() due to mp->b_next == NULL && mp->b_prev == NULL
6792555 panicked in mac_flow_walk_nolock() due to assertion failed: cnt == ft->ft_flow_count
6792871 multiple VLANs per MAC client cause hang in mac_flow_wait()
6792942 60% regression for Guest-to-Guest network throughput on snv106
6793278 the multicast addresses are not added to the aggregation port in certain scenarios
6793436 panic in mac_fini_macaddr() on mac_register() failure
6796850 SUNWcnetr postinstall script spews errors due to bad interface matching
6803378 need support for dls_bypass and rx fanout on non-ethernet media
Diffstat (limited to 'usr/src/uts/common/io')
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_grp.c           |  16
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_port.c          |   9
-rw-r--r--  usr/src/uts/common/io/aggr/aggr_send.c          | 231
-rw-r--r--  usr/src/uts/common/io/dld/dld_proto.c           |  21
-rw-r--r--  usr/src/uts/common/io/dls/dls.c                 |  24
-rw-r--r--  usr/src/uts/common/io/dls/dls_link.c            |   4
-rw-r--r--  usr/src/uts/common/io/dls/dls_mgmt.c            | 106
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_main.c      | 177
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_rx.c        |   9
-rw-r--r--  usr/src/uts/common/io/e1000g/e1000g_sw.h        |   7
-rw-r--r--  usr/src/uts/common/io/mac/mac.c                 |  22
-rw-r--r--  usr/src/uts/common/io/mac/mac_bcast.c           | 148
-rw-r--r--  usr/src/uts/common/io/mac/mac_client.c          | 179
-rw-r--r--  usr/src/uts/common/io/mac/mac_datapath_setup.c  |  67
-rw-r--r--  usr/src/uts/common/io/mac/mac_flow.c            |  50
-rw-r--r--  usr/src/uts/common/io/mac/mac_provider.c        |  20
-rw-r--r--  usr/src/uts/common/io/mac/mac_sched.c           | 552
-rw-r--r--  usr/src/uts/common/io/mac/mac_soft_ring.c       |   4
-rw-r--r--  usr/src/uts/common/io/mac/mac_util.c            | 195
-rw-r--r--  usr/src/uts/common/io/nxge/nxge_send.c          |  27
-rw-r--r--  usr/src/uts/common/io/softmac/softmac_main.c    |  16
21 files changed, 1015 insertions, 869 deletions
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c
index 04dc974198..59eb75f9d5 100644
--- a/usr/src/uts/common/io/aggr/aggr_grp.c
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c
@@ -313,13 +313,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
link_state_changed = B_TRUE;
}
- aggr_grp_multicst_port(port, B_TRUE);
-
/*
* Update port's state.
*/
port->lp_state = AGGR_PORT_STATE_ATTACHED;
+ aggr_grp_multicst_port(port, B_TRUE);
+
/*
* Set port's receive callback
*/
@@ -2028,8 +2028,10 @@ aggr_remmac(void *arg, const uint8_t *mac_addr)
/*
* Add or remove the multicast addresses that are defined for the group
* to or from the specified port.
- * This function is called before stopping a port, before a port
- * is detached from a group, and when attaching a port to a group.
+ *
+ * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
+ * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
+ * called when the port is either stopped or detached.
*/
void
aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
@@ -2039,7 +2041,7 @@ aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
ASSERT(MAC_PERIM_HELD(port->lp_mh));
ASSERT(MAC_PERIM_HELD(grp->lg_mh));
- if (!port->lp_started)
+ if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
return;
mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
@@ -2055,8 +2057,10 @@ aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
mac_perim_enter_by_mh(grp->lg_mh, &mph);
for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
+ if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
+ !port->lp_started) {
continue;
+ }
cerr = aggr_port_multicst(port, add, addrp);
if (cerr != 0 && err == 0)
err = cerr;
diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c
index 0323b622f1..4097ba335e 100644
--- a/usr/src/uts/common/io/aggr/aggr_port.c
+++ b/usr/src/uts/common/io/aggr/aggr_port.c
@@ -493,9 +493,11 @@ aggr_port_start(aggr_port_t *port)
{
ASSERT(MAC_PERIM_HELD(port->lp_mh));
- if (!port->lp_started)
- port->lp_started = B_TRUE;
+ if (port->lp_started)
+ return (0);
+ port->lp_started = B_TRUE;
+ aggr_grp_multicst_port(port, B_TRUE);
return (0);
}
@@ -507,8 +509,7 @@ aggr_port_stop(aggr_port_t *port)
if (!port->lp_started)
return;
- if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
- aggr_grp_multicst_port(port, B_FALSE);
+ aggr_grp_multicst_port(port, B_FALSE);
/* update the port state */
port->lp_started = B_FALSE;
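
The two files above pair the group multicast join/leave with port start/stop, and aggr_grp_multicst_port() now refreshes multicast state only when a port is both started and attached (the root of 6793278). A minimal sketch of that invariant, using simplified stand-in types rather than the kernel's aggr structures:

#include <stdbool.h>

typedef enum { PORT_STANDBY, PORT_ATTACHED } port_state_t;

typedef struct port {
	bool		started;	/* models lp_started */
	port_state_t	state;		/* models lp_state */
} port_t;

/* Mirrors the guard added to aggr_grp_multicst_port(). */
static bool
port_wants_multicast(const port_t *p)
{
	return (p->started && p->state == PORT_ATTACHED);
}

Because start/attach and stop/detach each flip exactly one of the two conditions, the join and leave paths now balance regardless of the order in which a port is started and attached.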
diff --git a/usr/src/uts/common/io/aggr/aggr_send.c b/usr/src/uts/common/io/aggr/aggr_send.c
index 9b4ad24621..bc0a19368d 100644
--- a/usr/src/uts/common/io/aggr/aggr_send.c
+++ b/usr/src/uts/common/io/aggr/aggr_send.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <sys/vlan.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
+#include <sys/dlpi.h>
#include <inet/common.h>
#include <inet/led.h>
@@ -42,184 +43,29 @@
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <netinet/udp.h>
-#include <inet/ipsec_impl.h>
-#include <inet/sadb.h>
-#include <inet/ipsecesp.h>
-#include <inet/ipsecah.h>
#include <sys/aggr.h>
#include <sys/aggr_impl.h>
-#define HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
-#define HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
-
-static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
-
-static uint64_t
-aggr_send_hash(aggr_grp_t *grp, mblk_t *mp)
-{
- struct ether_header *ehp;
- uint16_t sap;
- uint_t skip_len;
- uint8_t proto;
- uint32_t policy = grp->lg_tx_policy;
- uint64_t hash = 0;
-
- ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
- ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
- ASSERT(RW_READ_HELD(&grp->lg_tx_lock));
-
- /* compute MAC hash */
-
- ehp = (struct ether_header *)mp->b_rptr;
-
- if (policy & AGGR_POLICY_L2) {
- uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
- uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
- hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
- policy &= ~AGGR_POLICY_L2;
- }
-
- if (policy == 0)
- goto done;
-
- /* skip ethernet header */
-
- if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
- struct ether_vlan_header *evhp;
- mblk_t *newmp = NULL;
-
- skip_len = sizeof (struct ether_vlan_header);
- if (MBLKL(mp) < skip_len) {
- /* the vlan tag is the payload, pull up first */
- newmp = msgpullup(mp, -1);
- if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
- goto done;
- }
- evhp = (struct ether_vlan_header *)newmp->b_rptr;
- } else {
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- }
-
- sap = ntohs(evhp->ether_type);
- freemsg(newmp);
- } else {
- sap = ntohs(ehp->ether_type);
- skip_len = sizeof (struct ether_header);
- }
-
- /* if ethernet header is in its own mblk, skip it */
- if (MBLKL(mp) <= skip_len) {
- skip_len -= MBLKL(mp);
- mp = mp->b_cont;
- }
-
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
-
- /* compute IP src/dst addresses hash and skip IPv{4,6} header */
-
- switch (sap) {
- case ETHERTYPE_IP: {
- ipha_t *iphp;
-
- if (MBLKL(mp) < (skip_len + sizeof (ipha_t)))
- goto done;
-
- iphp = (ipha_t *)(mp->b_rptr + skip_len);
- proto = iphp->ipha_protocol;
- skip_len += IPH_HDR_LENGTH(iphp);
-
- if (policy & AGGR_POLICY_L3) {
- uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
- uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
-
- hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
- policy &= ~AGGR_POLICY_L3;
- }
- break;
- }
- case ETHERTYPE_IPV6: {
- ip6_t *ip6hp;
-
- /*
- * if ipv6 packet has options, the proto will not be one of the
- * ones handled by the ULP processor below, and will return 0
- * as the index
- */
- if (MBLKL(mp) < (skip_len + sizeof (ip6_t)))
- goto done;
-
- ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
- proto = ip6hp->ip6_nxt;
- skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
-
- if (policy & AGGR_POLICY_L3) {
- uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
- uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
-
- hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
- policy &= ~AGGR_POLICY_L3;
- }
- break;
- }
- default:
- goto done;
- }
-
- if (!(policy & AGGR_POLICY_L4))
- goto done;
-
- /* if ip header is in its own mblk, skip it */
- if (MBLKL(mp) <= skip_len) {
- skip_len -= MBLKL(mp);
- mp = mp->b_cont;
- }
-
- /* parse ULP header */
-again:
- switch (proto) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_ESP:
- case IPPROTO_SCTP:
- /*
- * These Internet Protocols are intentionally designed
- * for hashing from the git-go. Port numbers are in the first
- * word for transports, SPI is first for ESP.
- */
- hash ^= HASH_4BYTES((mp->b_rptr + skip_len));
- break;
-
- case IPPROTO_AH: {
- ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
-
- uint_t ah_length = AH_TOTAL_LEN(ah);
- proto = ah->ah_nexthdr;
- skip_len += ah_length;
-
- /* if ip header is in its own mblk, skip it */
- if (MBLKL(mp) <= skip_len) {
- skip_len -= MBLKL(mp);
- mp = mp->b_cont;
- }
-
- goto again;
- }
- }
-
-done:
- return (hash);
-}
-
/*
* Update the TX load balancing policy of the specified group.
*/
void
aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
{
+ uint8_t mac_policy = 0;
+
ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ if ((policy & AGGR_POLICY_L2) != 0)
+ mac_policy |= MAC_PKT_HASH_L2;
+ if ((policy & AGGR_POLICY_L3) != 0)
+ mac_policy |= MAC_PKT_HASH_L3;
+ if ((policy & AGGR_POLICY_L4) != 0)
+ mac_policy |= MAC_PKT_HASH_L4;
+
grp->lg_tx_policy = policy;
+ grp->lg_mac_tx_policy = mac_policy;
}
/*
@@ -250,7 +96,8 @@ aggr_m_tx(void *arg, mblk_t *mp)
nextp = mp->b_next;
mp->b_next = NULL;
- hash = aggr_send_hash(grp, mp);
+ hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy,
+ B_TRUE);
port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
/*
@@ -266,7 +113,7 @@ aggr_m_tx(void *arg, mblk_t *mp)
*/
freemsg(mp);
} else {
- mblk_t *ret_mp;
+ mblk_t *ret_mp = NULL;
/*
* It is fine that the port state changes now.
@@ -385,51 +232,3 @@ aggr_send_port_disable(aggr_port_t *port)
port->lp_tx_enabled = B_FALSE;
}
-
-static uint16_t
-aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
-{
- uint16_t length;
- uint_t ehdrlen;
- uint8_t *nexthdrp;
- uint8_t *whereptr;
- uint8_t *endptr;
- ip6_dest_t *desthdr;
- ip6_rthdr_t *rthdr;
- ip6_frag_t *fraghdr;
-
- length = IPV6_HDR_LEN;
- whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
- endptr = mp->b_wptr;
-
- nexthdrp = &ip6h->ip6_nxt;
- while (whereptr < endptr) {
- switch (*nexthdrp) {
- case IPPROTO_HOPOPTS:
- case IPPROTO_DSTOPTS:
- /* Assumes the headers are identical for hbh and dst */
- desthdr = (ip6_dest_t *)whereptr;
- ehdrlen = 8 * (desthdr->ip6d_len + 1);
- nexthdrp = &desthdr->ip6d_nxt;
- break;
- case IPPROTO_ROUTING:
- rthdr = (ip6_rthdr_t *)whereptr;
- ehdrlen = 8 * (rthdr->ip6r_len + 1);
- nexthdrp = &rthdr->ip6r_nxt;
- break;
- case IPPROTO_FRAGMENT:
- fraghdr = (ip6_frag_t *)whereptr;
- ehdrlen = sizeof (ip6_frag_t);
- nexthdrp = &fraghdr->ip6f_nxt;
- break;
- case IPPROTO_NONE:
- /* No next header means we're finished */
- default:
- return (length);
- }
- length += ehdrlen;
- whereptr += ehdrlen;
- }
-
- return (length);
-}
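
aggr_send.c drops its private ~200-line hash (including the IPv6 extension-header walk) in favor of the shared mac_pkt_hash() (6777547/6778557), translating the aggr policy bits into MAC-layer hash bits once, at policy-set time. A self-contained sketch of that translation and the resulting port pick; the flag values here are assumed for illustration, the real ones live in the mac and aggr headers:

#include <stdint.h>

#define AGGR_POLICY_L2	0x01	/* assumed values, for illustration */
#define AGGR_POLICY_L3	0x02
#define AGGR_POLICY_L4	0x04

#define MAC_PKT_HASH_L2	0x01
#define MAC_PKT_HASH_L3	0x02
#define MAC_PKT_HASH_L4	0x04

/* One-time translation, as in the new aggr_send_update_policy(). */
static uint8_t
aggr_policy_to_mac_policy(uint32_t policy)
{
	uint8_t mac_policy = 0;

	if (policy & AGGR_POLICY_L2)
		mac_policy |= MAC_PKT_HASH_L2;
	if (policy & AGGR_POLICY_L3)
		mac_policy |= MAC_PKT_HASH_L3;
	if (policy & AGGR_POLICY_L4)
		mac_policy |= MAC_PKT_HASH_L4;
	return (mac_policy);
}

/* The Tx path then reduces to hash-modulo-ports, as in aggr_m_tx(). */
static unsigned
aggr_pick_port(uint64_t hash, unsigned ntx_ports)
{
	return ((unsigned)(hash % ntx_ports));
}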
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 2c3d0f7ecb..5533b582a0 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1334,25 +1334,14 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
case DLD_ENABLE:
dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
direct->di_rx_ch);
- /*
- * TODO: XXXGopi
- *
- * Direct pointer to functions in the MAC layer
- * should be passed here:
- *
- * 1) pass mac_tx() and mac_client_handle instead
- * of str_mdata_fastpath_put() and dld_str_t. But
- * not done presently because of some VLAN
- * processing stuff in str_mdata_fastpath_put().
- *
- * 2) pass a MAC layer callback instead of
- * dld_flow_ctl_callb().
- */
+
direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
direct->di_tx_dh = dsp;
-
direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
direct->di_tx_cb_dh = dsp->ds_mch;
+ direct->di_tx_fctl_df = (uintptr_t)mac_tx_is_flow_blocked;
+ direct->di_tx_fctl_dh = dsp->ds_mch;
+
dsp->ds_direct = B_TRUE;
return (0);
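
With the TODO resolved, the direct-call capability now also exports a flow-control query: di_tx_fctl_df/di_tx_fctl_dh point at mac_tx_is_flow_blocked() and the MAC client handle. A hedged sketch of how a hypothetical upper-layer consumer might use such a downcall before retrying a blocked transmit (the names here merely model the di_tx_fctl_* fields, not a real DLD API):

#include <stdbool.h>
#include <stdint.h>

typedef uintptr_t tx_cookie_t;
typedef bool (*tx_fctl_fn_t)(void *handle, tx_cookie_t cookie);

/*
 * Returns true if the caller should keep its packets queued and wait
 * for the Tx-notify callback instead of calling the Tx downcall again.
 */
static bool
tx_should_wait(tx_fctl_fn_t tx_fctl_df, void *tx_fctl_dh,
    tx_cookie_t cookie)
{
	return (tx_fctl_df(tx_fctl_dh, cookie));
}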
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 064217c8f2..53450a45d1 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -607,14 +607,6 @@ dls_mac_active_set(dls_link_t *dlp)
* Set the function to start receiving packets.
*/
mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
-
- /*
- * We've got a MAC client for this link now.
- * Push down the flows that were defined on this link
- * hitherto. The flows are added to the active flow table
- * and SRS, softrings etc. are created as needed.
- */
- mac_link_init_flows(dlp->dl_mch);
}
dlp->dl_nactive++;
return (0);
@@ -625,20 +617,6 @@ dls_mac_active_clear(dls_link_t *dlp)
{
if (--dlp->dl_nactive == 0) {
ASSERT(dlp->dl_mah != NULL);
- /*
- * We would have initialized subflows etc. only if we
- * brought up the primary client and set the unicast
- * unicast address etc. Deactivate the flows. The flow
- * entry will be removed from the active flow tables,
- * and the associated SRS, softrings etc will be
- * deleted. But the flow entry itself won't be
- * destroyed, instead it will continue to be
- * archived off the the global flow hash list, for a
- * possible future activation when say
- * IP is plumbed again
- */
-
- mac_link_release_flows(dlp->dl_mch);
(void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah);
dlp->dl_mah = NULL;
mac_rx_clear(dlp->dl_mch);
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 852b87d24b..85aee7fe86 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,7 +36,7 @@
#include <sys/atomic.h>
static kmem_cache_t *i_dls_link_cachep;
-static mod_hash_t *i_dls_link_hash;
+mod_hash_t *i_dls_link_hash;
static uint_t i_dls_link_count;
#define LINK_HASHSZ 67 /* prime */
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index bb922423b3..576e13ac2c 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,10 +60,15 @@ boolean_t devnet_need_rebuild;
/* Upcall door handle */
static door_handle_t dls_mgmt_dh = NULL;
-#define DD_CONDEMNED 0x1
+#define DD_CONDEMNED 0x1
+#define DD_KSTAT_CHANGING 0x2
/*
* This structure is used to keep the <linkid, macname> mapping.
+ * This structure itself is not protected by the mac perimeter, but is
+ * protected by the dd_mutex and i_dls_devnet_lock. Thus most of the
+ * functions manipulating this structure such as dls_devnet_set/unset etc.
+ * may be called while not holding the mac perimeter.
*/
typedef struct dls_devnet_s {
datalink_id_t dd_linkid;
@@ -614,6 +619,11 @@ dls_devnet_rele_link(dls_dl_handle_t dlh, dls_link_t *dlp)
/*
* Query the "link" kstats.
+ *
+ * We may be called from the kstat subsystem in an arbitrary context.
+ * If the caller is the stack, the context could be an upcall data
+ * thread. Hence we can't acquire the mac perimeter in this function
+ * for fear of deadlock.
*/
static int
dls_devnet_stat_update(kstat_t *ksp, int rw)
@@ -621,21 +631,34 @@ dls_devnet_stat_update(kstat_t *ksp, int rw)
dls_devnet_t *ddp = ksp->ks_private;
dls_link_t *dlp;
int err;
- mac_perim_handle_t mph;
- err = mac_perim_enter_by_macname(ddp->dd_mac, &mph);
- if (err != 0)
- return (err);
+ /*
+ * Check whether the link is being renamed or is going away
+ * before incrementing dd_tref, which in turn prevents the link
+ * from being renamed or deleted until we finish.
+ */
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_flags & (DD_CONDEMNED | DD_KSTAT_CHANGING)) {
+ mutex_exit(&ddp->dd_mutex);
+ return (ENOENT);
+ }
+ ddp->dd_tref++;
+ mutex_exit(&ddp->dd_mutex);
- err = dls_link_hold(ddp->dd_mac, &dlp);
- if (err != 0) {
- mac_perim_exit(mph);
- return (err);
+ /*
+ * If a device detach happens at this time, it will block in
+ * dls_devnet_unset since the dd_tref has been bumped up above. So the
+ * access to 'dlp' is safe even though we don't hold the mac perimeter.
+ */
+ if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)ddp->dd_mac,
+ (mod_hash_val_t *)&dlp) != 0) {
+ dls_devnet_rele_tmp(ddp);
+ return (ENOENT);
}
err = dls_stat_update(ksp, dlp, rw);
- dls_link_rele(dlp);
- mac_perim_exit(mph);
+
+ dls_devnet_rele_tmp(ddp);
return (err);
}
@@ -707,6 +730,7 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, dls_devnet_t **ddpp)
dls_devnet_t *ddp = NULL;
datalink_class_t class;
int err;
+ boolean_t stat_create = B_FALSE;
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_hash,
@@ -748,8 +772,7 @@ newphys:
(mod_hash_key_t)(uintptr_t)linkid,
(mod_hash_val_t)ddp) == 0);
devnet_need_rebuild = B_TRUE;
- dls_devnet_stat_create(ddp);
-
+ stat_create = B_TRUE;
mutex_enter(&ddp->dd_mutex);
if (!ddp->dd_prop_loaded && (ddp->dd_prop_taskid == NULL)) {
ddp->dd_prop_taskid = taskq_dispatch(system_taskq,
@@ -761,6 +784,20 @@ newphys:
err = 0;
done:
rw_exit(&i_dls_devnet_lock);
+ /*
+ * It is safe to drop the i_dls_devnet_lock at this point. In the case
+ * of physical devices, the softmac framework will fail the device
+ * detach based on the smac_state or smac_hold_cnt. Other cases like
+ * vnic and aggr use their own scheme to serialize creates and deletes
+ * and ensure that *ddp is valid.
+ *
+ * The kstat subsystem holds its own locks (rather perimeter) before
+ * calling the ks_update (dls_devnet_stat_update) entry point which
+ * in turn grabs the i_dls_devnet_lock. So the lock hierarchy is
+ * kstat locks -> i_dls_devnet_lock.
+ */
+ if (stat_create)
+ dls_devnet_stat_create(ddp);
if (err == 0 && ddpp != NULL)
*ddpp = ddp;
return (err);
@@ -815,7 +852,6 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
VERIFY(mod_hash_remove(i_dls_devnet_id_hash,
(mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0);
- dls_devnet_stat_destroy(ddp);
devnet_need_rebuild = B_TRUE;
}
rw_exit(&i_dls_devnet_lock);
@@ -830,6 +866,9 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
}
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ dls_devnet_stat_destroy(ddp);
+
ddp->dd_prop_loaded = B_FALSE;
ddp->dd_linkid = DATALINK_INVALID_LINKID;
ddp->dd_zid = GLOBAL_ZONEID;
@@ -1112,6 +1151,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
mac_perim_handle_t mph = NULL;
mac_handle_t mh;
mod_hash_val_t val;
+ boolean_t clear_dd_flag = B_FALSE;
/*
* In the second case, id2 must be a REMOVED physical link.
@@ -1134,8 +1174,10 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
* mac perimeter, hence enter the perimeter first. This also waits
* for the property loading to finish.
*/
- if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0)
- goto done;
+ if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0) {
+ softmac_rele_device(ddh);
+ return (err);
+ }
rw_enter(&i_dls_devnet_lock, RW_WRITER);
if ((err = mod_hash_find(i_dls_devnet_id_hash,
@@ -1146,13 +1188,22 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
}
/*
- * Return EBUSY if any applications have this link open.
+ * Return EBUSY if any applications have this link open or if any
+ * thread is currently accessing the link kstats. Then set the
+ * DD_KSTAT_CHANGING flag to prevent any access to the kstats
+ * while we delete and recreate kstats below.
*/
+ mutex_enter(&ddp->dd_mutex);
if (ddp->dd_ref > 1) {
+ mutex_exit(&ddp->dd_mutex);
err = EBUSY;
goto done;
}
+ ddp->dd_flags |= DD_KSTAT_CHANGING;
+ clear_dd_flag = B_TRUE;
+ mutex_exit(&ddp->dd_mutex);
+
if (id2 == DATALINK_INVALID_LINKID) {
(void) strlcpy(linkname, link, sizeof (linkname));
@@ -1225,11 +1276,21 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
done:
/*
* Change the name of the kstat based on the new link name.
+ * We can't hold the i_dls_devnet_lock across calls to the kstat
+ * subsystem. Instead the DD_KSTAT_CHANGING flag set above in this
+ * function prevents any access to the dd_ksp while we delete and
+ * recreate it below.
*/
+ rw_exit(&i_dls_devnet_lock);
if (err == 0)
dls_devnet_stat_rename(ddp, linkname);
- rw_exit(&i_dls_devnet_lock);
+ if (clear_dd_flag) {
+ mutex_enter(&ddp->dd_mutex);
+ ddp->dd_flags &= ~DD_KSTAT_CHANGING;
+ mutex_exit(&ddp->dd_mutex);
+ }
+
if (mph != NULL)
mac_perim_exit(mph);
softmac_rele_device(ddh);
@@ -1388,6 +1449,11 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid)
int err;
mac_perim_handle_t mph;
+ /*
+ * Holding the mac perimeter ensures that the downcall from the
+ * dlmgmt daemon which does the property loading does not proceed
+ * until we relinquish the perimeter.
+ */
mac_perim_enter_by_mh(mh, &mph);
/*
@@ -1400,8 +1466,8 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid)
return (err);
}
if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) {
- (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
mac_perim_exit(mph);
+ (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
return (err);
}
mac_perim_exit(mph);
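
The dls_mgmt.c changes fix the perimeter deadlock (6789760) by keeping the kstat ks_update path off the mac perimeter entirely: dls_devnet_stat_update() takes dd_mutex, bails out if DD_CONDEMNED or DD_KSTAT_CHANGING is set, and otherwise bumps dd_tref, which blocks rename/delete until the reference is released. A simplified user-level model of that guard, using pthread mutexes in place of kernel mutexes (the wakeup the kernel code issues when dd_tref drops to zero is omitted):

#include <pthread.h>

#define DD_CONDEMNED		0x1
#define DD_KSTAT_CHANGING	0x2

typedef struct devnet {
	pthread_mutex_t	dd_mutex;
	unsigned	dd_flags;
	unsigned	dd_tref;	/* temporary references */
} devnet_t;

/* Modeled on the check at the top of dls_devnet_stat_update(). */
static int
devnet_hold_tmp(devnet_t *ddp)
{
	pthread_mutex_lock(&ddp->dd_mutex);
	if (ddp->dd_flags & (DD_CONDEMNED | DD_KSTAT_CHANGING)) {
		pthread_mutex_unlock(&ddp->dd_mutex);
		return (-1);		/* ENOENT in the kernel code */
	}
	ddp->dd_tref++;
	pthread_mutex_unlock(&ddp->dd_mutex);
	return (0);
}

static void
devnet_rele_tmp(devnet_t *ddp)
{
	pthread_mutex_lock(&ddp->dd_mutex);
	ddp->dd_tref--;	/* kernel code also signals waiters at zero */
	pthread_mutex_unlock(&ddp->dd_mutex);
}

This is also why dls_devnet_stat_create() moved outside i_dls_devnet_lock: the lock order must stay kstat locks -> i_dls_devnet_lock, never the reverse.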
diff --git a/usr/src/uts/common/io/e1000g/e1000g_main.c b/usr/src/uts/common/io/e1000g/e1000g_main.c
index 44a73391e1..5272a26fb1 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_main.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_main.c
@@ -1618,7 +1618,6 @@ static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup)
e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)arg;
mblk_t *mp = NULL;
mblk_t *tail;
- uint_t sz = 0;
struct e1000g *adapter;
adapter = rx_ring->adapter;
@@ -1631,68 +1630,7 @@ static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup)
}
mutex_enter(&rx_ring->rx_lock);
- ASSERT(rx_ring->poll_flag);
-
- /*
- * Get any packets that have arrived. Works only if we
- * actually disable the physical adapter/rx_ring interrupt.
- * (e1000g_poll_mode == 1). In case e1000g_poll_mode == 0,
- * packets will have already been added to the poll list
- * by the interrupt (see e1000g_intr_work()).
- */
- if (adapter->poll_mode) {
- mp = e1000g_receive(rx_ring, &tail, &sz);
- if (mp != NULL) {
- if (rx_ring->poll_list_head == NULL)
- rx_ring->poll_list_head = mp;
- else
- rx_ring->poll_list_tail->b_next = mp;
- rx_ring->poll_list_tail = tail;
- rx_ring->poll_list_sz += sz;
- }
- }
-
- mp = rx_ring->poll_list_head;
- if (mp == NULL) {
- mutex_exit(&rx_ring->rx_lock);
- rw_exit(&adapter->chip_lock);
- return (NULL);
- }
-
- /* Check if we can sendup the entire chain */
- if (bytes_to_pickup >= rx_ring->poll_list_sz) {
- mp = rx_ring->poll_list_head;
- rx_ring->poll_list_head = NULL;
- rx_ring->poll_list_tail = NULL;
- rx_ring->poll_list_sz = 0;
- mutex_exit(&rx_ring->rx_lock);
- rw_exit(&adapter->chip_lock);
- return (mp);
- }
-
- /*
- * We need to find out how much chain we can send up. We
- * are guaranteed that atleast one packet will go up since
- * we already checked that.
- */
- tail = mp;
- sz = 0;
- while (mp != NULL) {
- sz += MBLKL(mp);
- if (sz > bytes_to_pickup) {
- sz -= MBLKL(mp);
- break;
- }
- tail = mp;
- mp = mp->b_next;
- }
-
- mp = rx_ring->poll_list_head;
- rx_ring->poll_list_head = tail->b_next;
- if (rx_ring->poll_list_head == NULL)
- rx_ring->poll_list_tail = NULL;
- rx_ring->poll_list_sz -= sz;
- tail->b_next = NULL;
+ mp = e1000g_receive(rx_ring, &tail, bytes_to_pickup);
mutex_exit(&rx_ring->rx_lock);
rw_exit(&adapter->chip_lock);
return (mp);
@@ -2118,79 +2056,26 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
}
if (icr & E1000_ICR_RXT0) {
- mblk_t *mp;
- uint_t sz = 0;
- mblk_t *tmp, *tail = NULL;
+ mblk_t *mp = NULL;
+ mblk_t *tail = NULL;
e1000g_rx_ring_t *rx_ring;
rx_ring = Adapter->rx_ring;
mutex_enter(&rx_ring->rx_lock);
-
/*
- * If the real interrupt for the Rx ring was
- * not disabled (e1000g_poll_mode == 0), then
- * we still pick up the packets and queue them
- * on Rx ring if we were in polling mode. this
- * enables the polling thread to pick up packets
- * really fast in polling mode and helps improve
- * latency.
+ * Sometimes with legacy interrupts, it is possible that
+ * a single interrupt covers both Rx and Tx. In that
+ * case, if the poll flag is set, we shouldn't really
+ * be doing Rx processing.
*/
- mp = e1000g_receive(rx_ring, &tail, &sz);
+ if (!rx_ring->poll_flag)
+ mp = e1000g_receive(rx_ring, &tail,
+ E1000G_CHAIN_NO_LIMIT);
+ mutex_exit(&rx_ring->rx_lock);
rw_exit(&Adapter->chip_lock);
-
- if (mp != NULL) {
- ASSERT(tail != NULL);
- if (!rx_ring->poll_flag) {
- /*
- * If not polling, see if something was
- * already queued. Take care not to
- * reorder packets.
- */
- if (rx_ring->poll_list_head == NULL) {
- mutex_exit(&rx_ring->rx_lock);
- mac_rx_ring(Adapter->mh, rx_ring->mrh,
- mp, rx_ring->ring_gen_num);
- } else {
- tmp = rx_ring->poll_list_head;
- rx_ring->poll_list_head = NULL;
- rx_ring->poll_list_tail->b_next = mp;
- rx_ring->poll_list_tail = NULL;
- rx_ring->poll_list_sz = 0;
- mutex_exit(&rx_ring->rx_lock);
- mac_rx_ring(Adapter->mh, rx_ring->mrh,
- tmp, rx_ring->ring_gen_num);
- }
- } else {
- /*
- * We are in a polling mode. Put the
- * processed packets on the poll list.
- */
- if (rx_ring->poll_list_head == NULL)
- rx_ring->poll_list_head = mp;
- else
- rx_ring->poll_list_tail->b_next = mp;
- rx_ring->poll_list_tail = tail;
- rx_ring->poll_list_sz += sz;
- mutex_exit(&rx_ring->rx_lock);
- }
- } else if (!rx_ring->poll_flag &&
- rx_ring->poll_list_head != NULL) {
- /*
- * Nothing new has arrived (then why
- * was the interrupt raised??). Check
- * if something queued from the last
- * time.
- */
- tmp = rx_ring->poll_list_head;
- rx_ring->poll_list_head = NULL;
- rx_ring->poll_list_tail = NULL;
- rx_ring->poll_list_sz = 0;
- mutex_exit(&rx_ring->rx_lock);
+ if (mp != NULL)
mac_rx_ring(Adapter->mh, rx_ring->mrh,
- tmp, rx_ring->ring_gen_num);
- } else {
- mutex_exit(&rx_ring->rx_lock);
- }
+ mp, rx_ring->ring_gen_num);
} else
rw_exit(&Adapter->chip_lock);
@@ -2698,7 +2583,6 @@ e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh)
struct e1000g *adapter = rx_ring->adapter;
struct e1000_hw *hw = &adapter->shared;
uint32_t intr_mask;
- boolean_t poll_mode;
rw_enter(&adapter->chip_lock, RW_READER);
@@ -2709,20 +2593,17 @@ e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh)
mutex_enter(&rx_ring->rx_lock);
rx_ring->poll_flag = 0;
- poll_mode = adapter->poll_mode;
mutex_exit(&rx_ring->rx_lock);
- if (poll_mode) {
- /* Rx interrupt enabling for MSI and legacy */
- intr_mask = E1000_READ_REG(hw, E1000_IMS);
- intr_mask |= E1000_IMS_RXT0;
- E1000_WRITE_REG(hw, E1000_IMS, intr_mask);
- E1000_WRITE_FLUSH(hw);
+ /* Rx interrupt enabling for MSI and legacy */
+ intr_mask = E1000_READ_REG(hw, E1000_IMS);
+ intr_mask |= E1000_IMS_RXT0;
+ E1000_WRITE_REG(hw, E1000_IMS, intr_mask);
+ E1000_WRITE_FLUSH(hw);
- /* Trigger a Rx interrupt to check Rx ring */
- E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0);
- E1000_WRITE_FLUSH(hw);
- }
+ /* Trigger a Rx interrupt to check Rx ring */
+ E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0);
+ E1000_WRITE_FLUSH(hw);
rw_exit(&adapter->chip_lock);
return (0);
@@ -2734,7 +2615,6 @@ e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh)
e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh;
struct e1000g *adapter = rx_ring->adapter;
struct e1000_hw *hw = &adapter->shared;
- boolean_t poll_mode;
rw_enter(&adapter->chip_lock, RW_READER);
@@ -2742,22 +2622,13 @@ e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh)
rw_exit(&adapter->chip_lock);
return (0);
}
-
- /*
- * Once the adapter can support per Rx ring interrupt,
- * we should disable the real interrupt instead of just setting
- * the flag.
- */
mutex_enter(&rx_ring->rx_lock);
rx_ring->poll_flag = 1;
- poll_mode = adapter->poll_mode;
mutex_exit(&rx_ring->rx_lock);
- if (poll_mode) {
- /* Rx interrupt disabling for MSI and legacy */
- E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
- E1000_WRITE_FLUSH(hw);
- }
+ /* Rx interrupt disabling for MSI and legacy */
+ E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
+ E1000_WRITE_FLUSH(hw);
rw_exit(&adapter->chip_lock);
return (0);
diff --git a/usr/src/uts/common/io/e1000g/e1000g_rx.c b/usr/src/uts/common/io/e1000g/e1000g_rx.c
index 5876cb51b3..b1ac40145c 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_rx.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c
@@ -452,7 +452,7 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
* This routine will process packets received in an interrupt
*/
mblk_t *
-e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
+e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz)
{
struct e1000_hw *hw;
mblk_t *nmp;
@@ -471,13 +471,13 @@ e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
struct e1000g *Adapter;
dma_buffer_t *rx_buf;
uint16_t cksumflags;
+ uint_t chain_sz = 0;
ret_mp = NULL;
ret_nmp = NULL;
pkt_count = 0;
desc_count = 0;
cksumflags = 0;
- *sz = 0;
Adapter = rx_ring->adapter;
hw = &Adapter->shared;
@@ -505,7 +505,8 @@ e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
* descriptor owned by the hardware that begins a packet.
*/
while ((current_desc->status & E1000_RXD_STAT_DD) &&
- (pkt_count < Adapter->rx_limit_onintr)) {
+ (pkt_count < Adapter->rx_limit_onintr) &&
+ ((sz == E1000G_CHAIN_NO_LIMIT) || (chain_sz <= sz))) {
desc_count++;
/*
@@ -832,7 +833,7 @@ rx_end_of_packet:
}
ret_nmp->b_next = NULL;
*tail = ret_nmp;
- *sz += length;
+ chain_sz += length;
rx_ring->rx_mblk = NULL;
rx_ring->rx_mblk_tail = NULL;
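
The poll-list bookkeeping moves out of e1000g_poll_ring() and into e1000g_receive() itself: the sz parameter is now a byte budget, with E1000G_CHAIN_NO_LIMIT (0) meaning "take everything ready". A minimal stand-alone sketch of the budgeted pickup loop (simplified descriptor/packet types, not the driver's):

#include <stddef.h>

#define CHAIN_NO_LIMIT	0	/* models E1000G_CHAIN_NO_LIMIT */

struct pkt {
	struct pkt	*next;
	unsigned	len;
	int		ready;	/* models E1000_RXD_STAT_DD */
};

/*
 * Take ready packets while the accumulated size is within budget;
 * like the driver loop, the check happens before adding a packet,
 * so the chain may overshoot the budget by at most one packet.
 */
static struct pkt *
receive_budgeted(struct pkt *ring, unsigned sz, struct pkt **tail)
{
	struct pkt *head = NULL, *last = NULL;
	unsigned chain_sz = 0;

	for (struct pkt *p = ring;
	    p != NULL && p->ready &&
	    (sz == CHAIN_NO_LIMIT || chain_sz <= sz);
	    p = p->next) {
		chain_sz += p->len;
		if (head == NULL)
			head = p;
		last = p;
	}
	*tail = last;
	return (head);
}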
diff --git a/usr/src/uts/common/io/e1000g/e1000g_sw.h b/usr/src/uts/common/io/e1000g/e1000g_sw.h
index 277ba680a0..d0d465d666 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_sw.h
+++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h
@@ -198,6 +198,8 @@ extern "C" {
#define E1000G_RX_SW_STOP 0x2
#define E1000G_RX_SW_DETACH 0x3
+#define E1000G_CHAIN_NO_LIMIT 0
+
/*
* definitions for smartspeed workaround
*/
@@ -786,9 +788,6 @@ typedef struct _e1000g_rx_ring {
mac_ring_handle_t mrh;
mac_ring_handle_t mrh_init;
uint64_t ring_gen_num;
- mblk_t *poll_list_head;
- mblk_t *poll_list_tail;
- uint_t poll_list_sz;
boolean_t poll_flag;
/*
@@ -998,7 +997,7 @@ void e1000g_free_tx_swpkt(p_tx_sw_packet_t packet);
void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring);
uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2);
mblk_t *e1000g_m_tx(void *arg, mblk_t *mp);
-mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz);
+mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz);
void e1000g_rxfree_func(p_rx_sw_packet_t packet);
int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val);
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index a8b411f994..be8518b523 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -504,6 +504,7 @@ i_mac_destructor(void *buf, void *arg)
ASSERT(mip->mi_kstat_count == 0);
ASSERT(mip->mi_nclients == 0);
ASSERT(mip->mi_nactiveclients == 0);
+ ASSERT(mip->mi_single_active_client == NULL);
ASSERT(mip->mi_state_flags == 0);
ASSERT(mip->mi_factory_addr == NULL);
ASSERT(mip->mi_factory_addr_num == 0);
@@ -1712,6 +1713,12 @@ mac_tx_client_unblock(mac_client_impl_t *mcip)
mac_tx_lock_all(mcip);
mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
mac_tx_unlock_all(mcip);
+ /*
+ * We may fail to disable flow control for the last MAC_NOTE_TX
+ * notification because the MAC client is quiesced. Send the
+ * notification again.
+ */
+ i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
}
/*
@@ -2350,10 +2357,8 @@ i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
cclient = cclient->mci_client_next) {
if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
mac_tx_srs_wakeup(mac_srs, ring);
- if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) {
- (void) mac_flow_walk_nolock(cclient->mci_subflow_tab,
- mac_tx_flow_srs_wakeup, ring);
- }
+ (void) mac_flow_walk(cclient->mci_subflow_tab,
+ mac_tx_flow_srs_wakeup, ring);
}
rw_exit(&mip->mi_rw_lock);
rw_exit(&i_mac_impl_lock);
@@ -4107,8 +4112,13 @@ mac_fini_macaddr(mac_impl_t *mip)
{
mac_address_t *map = mip->mi_addresses;
- /* there should be exactly one entry left on the list */
- ASSERT(map != NULL);
+ if (map == NULL)
+ return;
+
+ /*
+ * If mi_addresses is initialized, there should be exactly one
+ * entry left on the list with no users.
+ */
ASSERT(map->ma_nusers == 0);
ASSERT(map->ma_next == NULL);
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
index 31a9d0ed7d..eaac168aaf 100644
--- a/usr/src/uts/common/io/mac/mac_bcast.c
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -124,14 +124,6 @@ mac_bcast_grp_free(void *bcast_grp)
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
- if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
- /*
- * The address is a multicast address, have the
- * underlying NIC leave the multicast group.
- */
- (void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr);
- }
-
ASSERT(grp->mbg_addr != NULL);
kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length);
kmem_free(grp->mbg_clients,
@@ -271,15 +263,69 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
size_t addr_len = mip->mi_type->mt_addr_length;
int rc = 0;
int i, index = -1;
- mac_mcast_addrs_t *mci_maddr = NULL;
- mac_mcast_addrs_t *mi_maddr = NULL;
- mac_mcast_addrs_t **last_maddr;
+ mac_mcast_addrs_t **prev_mi_addr = NULL;
+ mac_mcast_addrs_t **prev_mci_addr = NULL;
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
addrtype == MAC_ADDRTYPE_BROADCAST);
+ /*
+ * Add the MAC client to the list of MAC clients associated
+ * with the group.
+ */
+ if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+ mac_mcast_addrs_t *maddr;
+
+ /*
+ * In case of a driver (say aggr), we need this information
+ * on a per MAC instance basis.
+ */
+ prev_mi_addr = &mip->mi_mcast_addrs;
+ for (maddr = *prev_mi_addr; maddr != NULL;
+ prev_mi_addr = &maddr->mma_next, maddr = maddr->mma_next) {
+ if (bcmp(maddr->mma_addr, addr, addr_len) == 0)
+ break;
+ }
+ if (maddr == NULL) {
+ /*
+ * For multicast addresses, have the underlying MAC
+ * join the corresponding multicast group.
+ */
+ rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
+ if (rc != 0)
+ return (rc);
+ maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+ KM_SLEEP);
+ bcopy(addr, maddr->mma_addr, addr_len);
+ *prev_mi_addr = maddr;
+ } else {
+ prev_mi_addr = NULL;
+ }
+ maddr->mma_ref++;
+
+ /*
+ * We maintain a separate list for each MAC client. Get
+ * the entry, or add one if it is not present.
+ */
+ prev_mci_addr = &mcip->mci_mcast_addrs;
+ for (maddr = *prev_mci_addr; maddr != NULL;
+ prev_mci_addr = &maddr->mma_next, maddr = maddr->mma_next) {
+ if (bcmp(maddr->mma_addr, addr, addr_len) == 0)
+ break;
+ }
+ if (maddr == NULL) {
+ maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+ KM_SLEEP);
+ bcopy(addr, maddr->mma_addr, addr_len);
+ *prev_mci_addr = maddr;
+ } else {
+ prev_mci_addr = NULL;
+ }
+ maddr->mma_ref++;
+ }
+
/* The list is protected by the perimeter */
last_grp = &mip->mi_bcast_grp;
for (grp = *last_grp; grp != NULL;
@@ -331,7 +377,7 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
if (rc != 0) {
kmem_free(grp->mbg_addr, addr_len);
kmem_cache_free(mac_bcast_grp_cache, grp);
- return (rc);
+ goto fail;
}
grp->mbg_flow_ent->fe_mbg = grp;
mip->mi_bcast_ngrps++;
@@ -366,23 +412,7 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent);
if (rc != 0) {
FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
- return (rc);
- }
-
- /*
- * For multicast addresses, have the underlying MAC
- * join the corresponsing multicast group.
- */
- if (addrtype == MAC_ADDRTYPE_MULTICAST) {
- rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
- if (rc != 0) {
- mac_flow_remove(mip->mi_flow_tab,
- grp->mbg_flow_ent, B_FALSE);
- mac_flow_wait(grp->mbg_flow_ent,
- FLOW_DRIVER_UPCALL);
- FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
- return (rc);
- }
+ goto fail;
}
*last_grp = grp;
@@ -395,45 +425,6 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
* with the group.
*/
rw_enter(&mip->mi_rw_lock, RW_WRITER);
- if (addrtype == MAC_ADDRTYPE_MULTICAST) {
- /*
- * We maintain a separate list for each MAC client. Get
- * the entry or add, if it is not present.
- */
- last_maddr = &mcip->mci_mcast_addrs;
- for (mci_maddr = *last_maddr; mci_maddr != NULL;
- last_maddr = &mci_maddr->mma_next,
- mci_maddr = mci_maddr->mma_next) {
- if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0)
- break;
- }
- if (mci_maddr == NULL) {
- mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
- KM_SLEEP);
- bcopy(addr, mci_maddr->mma_addr, addr_len);
- *last_maddr = mci_maddr;
- }
- mci_maddr->mma_ref++;
-
- /*
- * In case of a driver (say aggr), we also need this
- * information on a per MAC instance basis.
- */
- last_maddr = &mip->mi_mcast_addrs;
- for (mi_maddr = *last_maddr; mi_maddr != NULL;
- last_maddr = &mi_maddr->mma_next,
- mi_maddr = mi_maddr->mma_next) {
- if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0)
- break;
- }
- if (mi_maddr == NULL) {
- mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
- KM_SLEEP);
- bcopy(addr, mi_maddr->mma_addr, addr_len);
- *last_maddr = mi_maddr;
- }
- mi_maddr->mma_ref++;
- }
for (i = 0; i < grp->mbg_nclients_alloc; i++) {
/*
* The MAC client was already added, say when we have
@@ -442,7 +433,8 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
*/
if (grp->mbg_clients[i].mgb_client == mcip) {
grp->mbg_clients[i].mgb_client_ref++;
- goto add_done;
+ rw_exit(&mip->mi_rw_lock);
+ return (0);
} else if (grp->mbg_clients[i].mgb_client == NULL &&
index == -1) {
index = i;
@@ -478,10 +470,20 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
* to detect that condition after re-acquiring the lock.
*/
grp->mbg_clients_gen++;
-add_done:
rw_exit(&mip->mi_rw_lock);
-
return (0);
+
+fail:
+ if (prev_mi_addr != NULL) {
+ kmem_free(*prev_mi_addr, sizeof (mac_mcast_addrs_t));
+ *prev_mi_addr = NULL;
+ (void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr);
+ }
+ if (prev_mci_addr != NULL) {
+ kmem_free(*prev_mci_addr, sizeof (mac_mcast_addrs_t));
+ *prev_mci_addr = NULL;
+ }
+ return (rc);
}
/*
@@ -559,6 +561,8 @@ mac_bcast_delete(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid)
*prev = grp->mbg_next;
}
update_maddr:
+ rw_exit(&mip->mi_rw_lock);
+
if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
mprev = &mcip->mci_mcast_addrs;
for (maddr = mcip->mci_mcast_addrs; maddr != NULL;
@@ -583,12 +587,12 @@ update_maddr:
}
ASSERT(maddr != NULL);
if (--maddr->mma_ref == 0) {
+ (void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr);
*mprev = maddr->mma_next;
maddr->mma_next = NULL;
kmem_free(maddr, sizeof (mac_mcast_addrs_t));
}
}
- rw_exit(&mip->mi_rw_lock);
/*
* If the group itself is being removed, remove the
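
The reworked mac_bcast_add() does its multicast bookkeeping up front with a pointer-to-pointer list walk, so that on failure it can free exactly the node it appended (see the new fail: label). A small self-contained sketch of that find-or-append idiom; the six-byte address and user-level allocator are stand-ins for the kernel's addr_len and kmem_zalloc():

#include <stdlib.h>
#include <string.h>

typedef struct maddr {
	struct maddr	*next;
	unsigned char	addr[6];
	unsigned	ref;
} maddr_t;

/*
 * Find or append an entry. On append, *prevp is left pointing at the
 * list link that was updated, so a caller can unwind a later failure
 * by freeing *prev and clearing the link, as the fail: path does.
 */
static maddr_t *
maddr_find_or_append(maddr_t **head, const unsigned char addr[6],
    maddr_t ***prevp)
{
	maddr_t **prev = head;
	maddr_t *m;

	*prevp = NULL;
	for (m = *prev; m != NULL; prev = &m->next, m = m->next) {
		if (memcmp(m->addr, addr, 6) == 0)
			break;
	}
	if (m == NULL) {
		if ((m = calloc(1, sizeof (*m))) == NULL)
			return (NULL);
		memcpy(m->addr, addr, 6);
		*prev = m;
		*prevp = prev;	/* caller frees *prev to unwind */
	}
	m->ref++;
	return (m);
}

Moving the mi_multicst(..., B_FALSE) driver leave under the mma_ref-drops-to-zero case in mac_bcast_delete() is what keeps a multicast address programmed while other clients still reference it.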
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 84e302ad9f..cf4a8f4421 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -1159,18 +1159,6 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
*/
mcip = mac_vnic_lower(mip);
- /*
- * If there are multiple MAC clients of the VNIC, they
- * all share the same underlying MAC client handle.
- */
- if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0)
- mcip->mci_state_flags |= MCIS_TAG_DISABLE;
-
- if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0)
- mcip->mci_state_flags |= MCIS_STRIP_DISABLE;
-
- if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0)
- mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK;
/*
* Note that multiple mac clients share the same mcip in
@@ -1328,13 +1316,6 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags)
* when the VNIC is deleted.
*/
- /*
- * Clear the flags set when the upper client initiated
- * open.
- */
- mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE |
- MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK);
-
i_mac_perim_exit(mip);
return;
}
@@ -1377,12 +1358,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1)
ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
/*
- * If the mac_client is a VLAN or native media is non ethernet, we
- * should not do DLS bypass and instead let the packets go via the
- * default mac_rx_deliver route so vlan header can be stripped etc.
+ * If the mac_client is a VLAN, we should not do DLS bypass and
+ * instead let the packets come up via mac_rx_deliver so the vlan
+ * header can be stripped.
*/
- if (mcip->mci_nvids > 0 ||
- mip->mi_info.mi_nativemedia != DL_ETHER)
+ if (mcip->mci_nvids > 0)
return (B_FALSE);
/*
@@ -1606,6 +1586,37 @@ mac_client_update_mcast(void *arg, boolean_t add, const uint8_t *addrp)
}
}
+static void
+mac_update_single_active_client(mac_impl_t *mip)
+{
+ mac_client_impl_t *client = NULL;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ if (mip->mi_nactiveclients == 1) {
+ /*
+ * Find the one active MAC client from the list of MAC
+ * clients. The active MAC client has at least one
+ * unicast address.
+ */
+ for (client = mip->mi_clients_list; client != NULL;
+ client = client->mci_client_next) {
+ if (client->mci_unicast_list != NULL)
+ break;
+ }
+ ASSERT(client != NULL);
+ }
+
+ /*
+ * mi_single_active_client is protected by the MAC impl's read/writer
+ * lock, which allows mac_rx() to check the value of that pointer
+ * as a reader.
+ */
+ mip->mi_single_active_client = client;
+ rw_exit(&mip->mi_rw_lock);
+}
+
/*
* Add a new unicast address to the MAC client.
*
@@ -1712,11 +1723,13 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
mip->mi_state_flags |= MIS_EXCLUSIVE;
bzero(&mrp, sizeof (mac_resource_props_t));
- if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) {
+ if (is_primary && !(mcip->mci_state_flags & (MCIS_IS_VNIC |
+ MCIS_IS_AGGR_PORT))) {
/*
* Apply the property cached in the mac_impl_t to the primary
- * mac client. If the mac client is a VNIC, its property were
- * already set in the mcip when the VNIC was created.
+ * mac client. If the mac client is a VNIC or an aggregation
+ * port, its property should be set in the mcip when the
+ * VNIC/aggr was created.
*/
mac_get_resources((mac_handle_t)mip, &mrp);
(void) mac_client_set_resources(mch, &mrp);
@@ -1781,8 +1794,13 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
goto bail;
bcast_added = B_TRUE;
}
- flent = mcip->mci_flent;
- ASSERT(flent != NULL);
+
+ /*
+ * If this is the first unicast address addition for this
+ * client, reuse the pre-allocated larval flow entry associated with
+ * the MAC client.
+ */
+ flent = (mcip->mci_nflents == 0) ? mcip->mci_flent : NULL;
/* We are configuring the unicast flow now */
if (!MCIP_DATAPATH_SETUP(mcip)) {
@@ -1806,6 +1824,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
mip->mi_nactiveclients++;
nactiveclients_added = B_TRUE;
+
/*
* This will allocate the RX ring group if possible for the
* flow and program the software classifier as needed.
@@ -1817,6 +1836,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
* The unicast MAC address must have been added successfully.
*/
ASSERT(mcip->mci_unicast != NULL);
+ /*
+ * Push down the sub-flows that were defined on this link
+ * hitherto. The flows are added to the active flow table
+ * and SRS, softrings etc. are created as needed.
+ */
+ mac_link_init_flows(mch);
} else {
mac_address_t *map = mcip->mci_unicast;
@@ -1871,6 +1896,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags,
mcip->mci_unicast_list = muip;
rw_exit(&mcip->mci_rw_lock);
+ if (nactiveclients_added)
+ mac_update_single_active_client(mip);
+
*mah = (mac_unicast_handle_t)muip;
/* add it to the flow list of this mcip */
@@ -1906,8 +1934,11 @@ bail:
if (mac_started)
mac_stop(mip);
- if (nactiveclients_added)
+ if (nactiveclients_added) {
mip->mi_nactiveclients--;
+ mac_update_single_active_client(mip);
+ }
+
if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
mip->mi_state_flags &= ~MIS_EXCLUSIVE;
kmem_free(muip, sizeof (mac_unicast_impl_t));
@@ -1983,9 +2014,9 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
* Remove the VID from the list of client's VIDs.
*/
pre = mcip->mci_unicast_list;
- if (muip == pre)
+ if (muip == pre) {
mcip->mci_unicast_list = muip->mui_next;
- else {
+ } else {
while ((pre->mui_next != NULL) && (pre->mui_next != muip))
pre = pre->mui_next;
ASSERT(pre->mui_next == muip);
@@ -1997,14 +2028,16 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0)
mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY;
- /*
- * This MAC client is shared, so we will just remove the flent
- * corresponding to the address being removed. We don't invoke
- * mac_rx_classify_flow_rem() since the additional flow is
- * not associated with its own separate set of SRS and rings,
- * and these constructs are still needed for the remaining flows.
- */
if (!mac_client_single_rcvr(mcip)) {
+ /*
+ * This MAC client is shared by more than one unicast
+ * address, so we will just remove the flent
+ * corresponding to the address being removed. We don't invoke
+ * mac_rx_classify_flow_rem() since the additional flow is
+ * not associated with its own separate set of SRS and rings,
+ * and these constructs are still needed for the remaining
+ * flows.
+ */
flent = mac_client_get_flow(mcip, muip);
ASSERT(flent != NULL);
@@ -2037,7 +2070,20 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah)
return (0);
}
+ /*
+ * We would have initialized subflows etc. only if we brought up
+ * the primary client and set the unicast address etc.
+ * Deactivate the flows. The flow entry will be removed from the
+ * active flow tables, and the associated SRS, softrings etc. will
+ * be deleted. But the flow entry itself won't be destroyed; instead
+ * it will continue to be archived off the global flow hash
+ * list, for a possible future activation when, say, IP is plumbed
+ * again.
+ */
+ mac_link_release_flows(mch);
+
mip->mi_nactiveclients--;
+ mac_update_single_active_client(mip);
/* Tear down the Data path */
mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK);
@@ -2252,6 +2298,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
mpip->mpi_mcip = mcip;
mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0);
mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0);
+ mpip->mpi_strip_vlan_tag =
+ ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0);
mcbi = &mip->mi_promisc_cb_info;
mutex_enter(mcbi->mcbi_lockp);
@@ -2503,44 +2551,65 @@ done:
* mac_tx_is_blocked
*
* Given a cookie, it returns if the ring identified by the cookie is
- * flow-controlled or not (this is not implemented yet). If NULL is
- * passed in place of a cookie, then it finds out if any of the
- * underlying rings belonging to the SRS is flow controlled or not
- * and returns that status.
+ * flow-controlled or not. If NULL is passed in place of a cookie,
+ * then it finds out if any of the underlying rings belonging to the
+ * SRS is flow controlled or not and returns that status.
*/
/* ARGSUSED */
boolean_t
mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie)
{
mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
- mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip);
+ mac_soft_ring_set_t *mac_srs;
mac_soft_ring_t *sringp;
boolean_t blocked = B_FALSE;
+ mac_tx_percpu_t *mytx;
+ int err;
int i;
/*
- * On etherstubs, there won't be a Tx SRS or an Rx
- * SRS. Infact there won't even be a flow_entry.
+ * Bump the reference count so that mac_srs won't be deleted.
+ * If the client is currently quiesced and we failed to bump
+ * the reference, return B_TRUE so that flow control stays
+ * as enabled.
+ *
+ * Flow control will then be disabled once the client is no
+ * longer quiesced.
*/
- if (mac_srs == NULL)
+ MAC_TX_TRY_HOLD(mcip, mytx, err);
+ if (err != 0)
+ return (B_TRUE);
+
+ if ((mac_srs = MCIP_TX_SRS(mcip)) == NULL) {
+ MAC_TX_RELE(mcip, mytx);
return (B_FALSE);
+ }
mutex_enter(&mac_srs->srs_lock);
if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) {
- for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
- sringp = mac_srs->srs_oth_soft_rings[i];
+ if (cookie != NULL) {
+ sringp = (mac_soft_ring_t *)cookie;
mutex_enter(&sringp->s_ring_lock);
- if (sringp->s_ring_state & S_RING_TX_HIWAT) {
+ if (sringp->s_ring_state & S_RING_TX_HIWAT)
blocked = B_TRUE;
+ mutex_exit(&sringp->s_ring_lock);
+ } else {
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ sringp = mac_srs->srs_oth_soft_rings[i];
+ mutex_enter(&sringp->s_ring_lock);
+ if (sringp->s_ring_state & S_RING_TX_HIWAT) {
+ blocked = B_TRUE;
+ mutex_exit(&sringp->s_ring_lock);
+ break;
+ }
mutex_exit(&sringp->s_ring_lock);
- break;
}
- mutex_exit(&sringp->s_ring_lock);
}
} else {
blocked = (mac_srs->srs_state & SRS_TX_HIWAT);
}
mutex_exit(&mac_srs->srs_lock);
+ MAC_TX_RELE(mcip, mytx);
return (blocked);
}
@@ -2846,6 +2915,10 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
return;
mp_copy->b_next = NULL;
+ if (mpip->mpi_strip_vlan_tag) {
+ if ((mp_copy = mac_strip_vlan_tag_chain(mp_copy)) == NULL)
+ return;
+ }
mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
}
@@ -3218,7 +3291,7 @@ i_mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp)
*/
bcopy(mrp, &tmrp, sizeof (mac_resource_props_t));
mcip = mac_primary_client_handle(mip);
- if (mcip != NULL) {
+ if (mcip != NULL && (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) {
err =
mac_client_set_resources((mac_client_handle_t)mcip, &tmrp);
}
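
mac_tx_is_flow_blocked() closes the race with mac_srs_group_teardown() (6792164) by taking a hold on the client before dereferencing its Tx SRS, and by reporting "blocked" while the client is quiesced so flow control stays on until the next MAC_NOTE_TX (which mac_tx_client_unblock() now re-sends). A simplified user-level model of the try-hold/release pattern; MAC_TX_TRY_HOLD/MAC_TX_RELE are per-CPU counters in the kernel, collapsed to a single mutex here:

#include <pthread.h>
#include <stdbool.h>

typedef struct client {
	pthread_mutex_t	lock;
	bool		quiesced;	/* models MCI_TX_QUIESCE */
	unsigned	holds;		/* models the per-CPU Tx refcnt */
	bool		tx_hiwat;	/* models S_RING_TX_HIWAT/SRS_TX_HIWAT */
} client_t;

static bool
tx_is_flow_blocked(client_t *c)
{
	bool blocked;

	pthread_mutex_lock(&c->lock);
	if (c->quiesced) {
		/* Hold failed: leave flow control enabled. */
		pthread_mutex_unlock(&c->lock);
		return (true);
	}
	c->holds++;	/* keeps the SRS from being torn down under us */
	pthread_mutex_unlock(&c->lock);

	blocked = c->tx_hiwat;	/* kernel code checks under srs/ring locks */

	pthread_mutex_lock(&c->lock);
	c->holds--;
	pthread_mutex_unlock(&c->lock);
	return (blocked);
}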
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index c93fe0ca8f..9c316911d4 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,16 +73,24 @@ kmem_cache_t *mac_soft_ring_cache;
* The duration in msec we wait before signalling the soft ring
* worker thread in case packets get queued.
*/
-static uint32_t mac_soft_ring_worker_wait = 0;
+uint32_t mac_soft_ring_worker_wait = 0;
+
+/*
+ * A global tunable for turning polling on/off. By default, dynamic
+ * polling is always on and is always very beneficial. It should be
+ * turned off with absolute care and for the rare workload (very
+ * low latency sensitive traffic).
+ */
+int mac_poll_enable = B_TRUE;
/*
* Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
* Large values could end up in consuming lot of system memory and cause
* system hang.
*/
-static int mac_soft_ring_max_q_cnt = 1024;
-static int mac_soft_ring_min_q_cnt = 256;
-static int mac_soft_ring_poll_thres = 16;
+int mac_soft_ring_max_q_cnt = 1024;
+int mac_soft_ring_min_q_cnt = 256;
+int mac_soft_ring_poll_thres = 16;
/*
* Default value of number of TX rings to be assigned to a MAC client.
@@ -91,8 +99,8 @@ static int mac_soft_ring_poll_thres = 16;
* If no TX rings are available, then MAC client(s) will be assigned the
* default Tx ring. Default Tx ring can be shared among multiple MAC clients.
*/
-static uint32_t mac_tx_ring_count = 8;
-static boolean_t mac_tx_serialize = B_FALSE;
+uint32_t mac_tx_ring_count = 8;
+boolean_t mac_tx_serialize = B_FALSE;
/*
* mac_tx_srs_hiwat is the queue depth threshold at which callers of
@@ -105,8 +113,8 @@ static boolean_t mac_tx_serialize = B_FALSE;
* Note that mac_tx_srs_hiwat must always be less than
* mac_tx_srs_max_q_cnt.
*/
-static uint32_t mac_tx_srs_max_q_cnt = 100000;
-static uint32_t mac_tx_srs_hiwat = 1000;
+uint32_t mac_tx_srs_max_q_cnt = 100000;
+uint32_t mac_tx_srs_hiwat = 1000;
/*
* mac_rx_soft_ring_count, mac_soft_ring_10gig_count:
@@ -131,8 +139,8 @@ static uint32_t mac_tx_srs_hiwat = 1000;
* rings is based on specified bandwidth, CPU speed and number of CPUs in
* the system.
*/
-static uint_t mac_rx_soft_ring_count = 8;
-static uint_t mac_rx_soft_ring_10gig_count = 8;
+uint_t mac_rx_soft_ring_count = 8;
+uint_t mac_rx_soft_ring_10gig_count = 8;
/*
* Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
@@ -146,18 +154,12 @@ static krwlock_t mac_srs_g_lock;
/*
* Whether the SRS threads should be bound, or not.
*/
-static boolean_t mac_srs_thread_bind = B_TRUE;
+boolean_t mac_srs_thread_bind = B_TRUE;
/*
* CPU to fallback to, used by mac_next_bind_cpu().
*/
-static processorid_t srs_bind_cpu = 0;
-
-/*
- * Possible setting for soft_ring_process_flag is
- * 0 or ST_RING_WORKER_ONLY.
- */
-static int soft_ring_process_flag = ST_RING_WORKER_ONLY;
+processorid_t srs_bind_cpu = 0;
/*
* If cpu bindings are specified by user, then Tx SRS and its soft
@@ -503,7 +505,7 @@ mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs,
(ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
if (turn_off_poll_capab)
mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
- else
+ else if (mac_poll_enable)
mac_srs->srs_state |= SRS_POLLING_CAPAB;
}
srs_rx->sr_lower_proc = rx_func;
@@ -1498,7 +1500,7 @@ mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_soft_ring_set_t *mac_tx_srs)
{
mac_soft_ring_t *softring;
- uint32_t soft_ring_flag = soft_ring_process_flag;
+ uint32_t soft_ring_flag = 0;
processorid_t cpuid = -1;
boolean_t user_specified;
int i, srings_present, new_fanout_cnt;
@@ -1606,7 +1608,7 @@ mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent,
{
int i;
processorid_t cpuid, worker_cpuid, poll_cpuid;
- uint32_t soft_ring_flag = soft_ring_process_flag;
+ uint32_t soft_ring_flag = 0;
int soft_ring_cnt;
boolean_t user_specified = B_FALSE;
mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
@@ -1917,7 +1919,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
(srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
(srs_rx->sr_lowat >> 1);
if (mac_latency_optimize)
- mac_srs->srs_state |= SRS_LATENCY_OPT;
+ mac_srs->srs_state |=
+ (SRS_LATENCY_OPT|SRS_SOFTRING_QUEUE);
}
mac_srs->srs_worker = thread_create(NULL, 0,
@@ -1956,12 +1959,21 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
ring->mr_classify_type = MAC_HW_CLASSIFIER;
ring->mr_flag |= MR_INCIPIENT;
- if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && mac_poll_enable)
mac_srs->srs_state |= SRS_POLLING_CAPAB;
mac_srs->srs_poll_thr = thread_create(NULL, 0,
mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
mac_srs->srs_pri);
+ /*
+ * Some drivers require serialization and don't send
+ * packet chains in interrupt context. For such
+	 * drivers, we should always queue in the soft ring so
+	 * that we get a chance to switch into polling mode
+	 * under backlog.
+ */
+ if (mcip->mci_mip->mi_v12n_level & MAC_VIRT_SERIALIZE)
+ mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
}
return (mac_srs);
}
@@ -2131,10 +2143,6 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_srs = mac_srs_create(mcip, flent,
fanout_type | link_type,
mac_rx_deliver, mcip, NULL, ring);
- if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) {
- mac_srs->srs_rx.sr_enqueue_always =
- B_TRUE;
- }
break;
default:
cmn_err(CE_PANIC, "srs_setup: mcip = %p "
@@ -2706,6 +2714,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
mac_srs_group_setup(grp_only_mcip,
grp_only_mcip->mci_flent,
default_group, SRST_LINK);
+ mac_rx_group_unmark(default_group, MR_INCIPIENT);
}
}
}
@@ -3173,7 +3182,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
{
mac_impl_t *mip = mcip->mci_mip;
mac_soft_ring_set_t *tx_srs;
- int i, tx_ring_count = 0, tx_rings_reserved;
+ int i, tx_ring_count = 0, tx_rings_reserved = 0;
mac_ring_handle_t *tx_ring = NULL;
uint32_t soft_ring_type;
mac_group_t *grp = NULL;
diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c
index 6dc3a8a7b4..cb6560b1f7 100644
--- a/usr/src/uts/common/io/mac/mac_flow.c
+++ b/usr/src/uts/common/io/mac/mac_flow.c
@@ -479,8 +479,8 @@ mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
int i, err;
s.fs_flags = flags;
- s.fs_mp = mp;
retry:
+ s.fs_mp = mp;
/*
* Walk the list of predeclared accept functions.
@@ -489,6 +489,8 @@ retry:
*/
for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
+ mblk_t *last;
+
/*
* ENOBUFS indicates that the mp could be too short
* and may need a pullup.
@@ -497,11 +499,13 @@ retry:
return (err);
/*
- * Don't modify the mblk if there are references to it.
- * Also, there is no point pulling up if b_cont is NULL.
+ * The pullup is done on the last processed mblk, not
+	 * the starting one. No pullup is done if the mblk
+ * has references or if b_cont is NULL.
*/
- if (DB_REF(mp) > 1 || mp->b_cont == NULL ||
- pullupmsg(mp, -1) == 0)
+ last = s.fs_mp;
+ if (DB_REF(last) > 1 || last->b_cont == NULL ||
+ pullupmsg(last, -1) == 0)
return (EINVAL);
retried = B_TRUE;
@@ -1209,10 +1213,11 @@ mac_link_flow_add(datalink_id_t linkid, char *flow_name,
/*
* Add the subflow to the subflow table. Also instantiate the flow
- * in the mac if there is an active DLS user. The dl_mah is set when
- * dls_active_set() is called, typically during interface plumb.
+ * in the mac if there is an active user (we check if the MAC client's
+	 * datapath has been set up).
*/
- err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
+ err = mac_flow_add_subflow(dlp->dl_mch, flent,
+ MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
if (err != 0)
goto bail;
@@ -1514,6 +1519,17 @@ mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
#define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
+#define CHECK_AND_ADJUST_START_PTR(s, start) { \
+ if ((s)->fs_mp->b_wptr == (start)) { \
+ mblk_t *next = (s)->fs_mp->b_cont; \
+ if (next == NULL) \
+ return (EINVAL); \
+ \
+ (s)->fs_mp = next; \
+ (start) = next->b_rptr; \
+ } \
+}
+
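For reference, the effective expansion of CHECK_AND_ADJUST_START_PTR(s, l3_start) at a call site such as flow_ip_accept() below is sketched here; note that the hidden return (EINVAL) executes in the calling accept function:

	if (s->fs_mp->b_wptr == l3_start) {
		mblk_t *next = s->fs_mp->b_cont;

		if (next == NULL)
			return (EINVAL);	/* returns from the caller */
		s->fs_mp = next;
		l3_start = next->b_rptr;
	}
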
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
@@ -1830,7 +1846,14 @@ flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
uint16_t sap = l2info->l2_sap;
uchar_t *l3_start;
- l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
+ l3_start = l2info->l2_start + l2info->l2_hdrsize;
+
+ /*
+ * Adjust start pointer if we're at the end of an mblk.
+ */
+ CHECK_AND_ADJUST_START_PTR(s, l3_start);
+
+ l3info->l3_start = l3_start;
if (!OK_32PTR(l3_start))
return (EINVAL);
@@ -2193,7 +2216,14 @@ flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
uint8_t proto = l3info->l3_protocol;
uchar_t *l4_start;
- l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
+ l4_start = l3info->l3_start + l3info->l3_hdrsize;
+
+ /*
+ * Adjust start pointer if we're at the end of an mblk.
+ */
+ CHECK_AND_ADJUST_START_PTR(s, l4_start);
+
+ l4info->l4_start = l4_start;
if (!OK_32PTR(l4_start))
return (EINVAL);
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index 714fb79afb..4d9d590457 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -668,6 +668,24 @@ mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
return;
}
/* We'll fall through to software classification */
+ } else {
+ flow_entry_t *flent;
+ int err;
+
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ if (mip->mi_single_active_client != NULL) {
+ flent = mip->mi_single_active_client->mci_flent_list;
+ FLOW_TRY_REFHOLD(flent, err);
+ rw_exit(&mip->mi_rw_lock);
+ if (err == 0) {
+ (flent->fe_cb_fn)(flent->fe_cb_arg1,
+ flent->fe_cb_arg2, mp_chain, B_FALSE);
+ FLOW_REFRELE(flent);
+ return;
+ }
+ } else {
+ rw_exit(&mip->mi_rw_lock);
+ }
}
if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index 290366f5d2..927e3842d3 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -515,25 +515,27 @@ static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
struct ether_header *ehp;
- uint16_t etype;
+ struct ether_vlan_header *evhp;
+ uint32_t sap;
ipha_t *ipha;
- mac_soft_ring_t *softring;
- size_t ether_hlen;
+ uint8_t *dstaddr;
+ size_t hdrsize;
mblk_t *mp;
mblk_t *headmp[MAX_SR_TYPES];
mblk_t *tailmp[MAX_SR_TYPES];
int cnt[MAX_SR_TYPES];
size_t sz[MAX_SR_TYPES];
size_t sz1;
- boolean_t bw_ctl = B_FALSE;
+ boolean_t bw_ctl;
boolean_t hw_classified;
- boolean_t dls_bypass = B_TRUE;
- enum pkt_type type;
+ boolean_t dls_bypass;
+ boolean_t is_ether;
+ boolean_t is_unicast;
+ enum pkt_type type;
mac_client_impl_t *mcip = mac_srs->srs_mcip;
- struct ether_vlan_header *evhp;
- if (mac_srs->srs_type & SRST_BW_CONTROL)
- bw_ctl = B_TRUE;
+ is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
+ bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
/*
* If we don't have a Rx ring, S/W classification would have done
@@ -550,8 +552,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* processing in the Rx path. SRST_DLS_BYPASS will be clear for
* such SRSs.
*/
- if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
- dls_bypass = B_FALSE;
+ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);
bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
@@ -570,68 +571,62 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mp->b_next = NULL;
type = OTH;
- sz1 = msgdsize(mp);
-
- if (!dls_bypass) {
- mac_impl_t *mip = mcip->mci_mip;
+ sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
+ if (is_ether) {
+ /*
+ * At this point we can be sure the packet at least
+ * has an ether header.
+ */
+ if (sz1 < sizeof (struct ether_header)) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
ehp = (struct ether_header *)mp->b_rptr;
/*
- * For VLAN packets, if the VLAN id doesn't belong
- * to this client, we drop the packet.
+ * Determine if this is a VLAN or non-VLAN packet.
*/
- if (mip->mi_info.mi_nativemedia == DL_ETHER &&
- ntohs(ehp->ether_type) == VLAN_TPID) {
+ if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ sap = ntohs(evhp->ether_type);
+ hdrsize = sizeof (struct ether_vlan_header);
/*
- * LINTED: cast may result in improper
- * alignment
+ * Check if the VID of the packet, if any,
+ * belongs to this client.
*/
- evhp = (struct ether_vlan_header *)ehp;
if (!mac_client_check_flow_vid(mcip,
VLAN_ID(ntohs(evhp->ether_tci)))) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
+ } else {
+ hdrsize = sizeof (struct ether_header);
}
- FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
- cnt[type], bw_ctl, sz[type], sz1, mp);
- continue;
- }
-
- /*
- * At this point we can be sure the packet at least
- * has an ether header.
- */
- if (sz1 < sizeof (struct ether_header)) {
- mac_rx_drop_pkt(mac_srs, mp);
- continue;
- }
- /* LINTED: cast may result in improper alignment */
- ehp = (struct ether_header *)mp->b_rptr;
+ is_unicast =
+ ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
+ dstaddr = (uint8_t *)&ehp->ether_dhost;
+ } else {
+ mac_header_info_t mhi;
- /*
- * Determine if this is a VLAN or non-VLAN packet.
- */
- if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
- /* LINTED: cast may result in improper alignment */
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- etype = ntohs(evhp->ether_type);
- ether_hlen = sizeof (struct ether_vlan_header);
- /*
- * Check if the VID of the packet, if any, belongs
- * to this client.
- */
- if (!mac_client_check_flow_vid(mcip,
- VLAN_ID(ntohs(evhp->ether_tci)))) {
+ if (mac_header_info((mac_handle_t)mcip->mci_mip,
+ mp, &mhi) != 0) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
- } else {
- ether_hlen = sizeof (struct ether_header);
+ hdrsize = mhi.mhi_hdrsize;
+ sap = mhi.mhi_bindsap;
+ is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
+ dstaddr = (uint8_t *)mhi.mhi_daddr;
}
- if (etype == ETHERTYPE_IP) {
+ if (!dls_bypass) {
+ FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
+ cnt[type], bw_ctl, sz[type], sz1, mp);
+ continue;
+ }
+
+ if (sap == ETHERTYPE_IP) {
/*
* If we are H/W classified, but we have promisc
* on, then we need to check for the unicast address.
@@ -641,12 +636,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
rw_enter(&mcip->mci_rw_lock, RW_READER);
map = mcip->mci_unicast;
- if (bcmp(&ehp->ether_dhost, map->ma_addr,
+ if (bcmp(dstaddr, map->ma_addr,
map->ma_len) == 0)
type = UNDEF;
rw_exit(&mcip->mci_rw_lock);
- } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
- 0x01) == 0)) {
+ } else if (is_unicast) {
type = UNDEF;
}
}
@@ -665,8 +659,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* the 'OTH' type path without DLS bypass.
*/
- /* LINTED: cast may result in improper alignment */
- ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+ ipha = (ipha_t *)(mp->b_rptr + hdrsize);
if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
type = OTH;
@@ -686,25 +679,25 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
type = V4_TCP;
- mp->b_rptr += ether_hlen;
+ mp->b_rptr += hdrsize;
break;
case IPPROTO_UDP:
type = V4_UDP;
- mp->b_rptr += ether_hlen;
+ mp->b_rptr += hdrsize;
break;
default:
type = OTH;
break;
}
- ASSERT(type != UNDEF);
-
FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
bw_ctl, sz[type], sz1, mp);
}
for (type = V4_TCP; type < UNDEF; type++) {
if (headmp[type] != NULL) {
+ mac_soft_ring_t *softring;
+
ASSERT(tailmp[type]->b_next == NULL);
switch (type) {
case V4_TCP:
@@ -716,7 +709,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
case OTH:
softring = mac_srs->srs_oth_soft_rings[0];
}
- mac_rx_soft_ring_process(mac_srs->srs_mcip, softring,
+ mac_rx_soft_ring_process(mcip, softring,
headmp[type], tailmp[type], cnt[type], sz[type]);
}
}
@@ -731,7 +724,7 @@ int fanout_unalligned = 0;
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
- uint16_t etype, enum pkt_type *type, uint_t *indx)
+ uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
uint8_t *whereptr;
@@ -740,18 +733,18 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint8_t nexthdr;
uint16_t hdr_len;
- if (etype == ETHERTYPE_IPV6) {
+ if (sap == ETHERTYPE_IPV6) {
boolean_t modifiable = B_TRUE;
- ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
+ ASSERT(MBLKL(mp) >= hdrsize);
- ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header));
+ ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
if ((unsigned char *)ip6h == mp->b_wptr) {
/*
- * The first mblk_t only includes the ethernet header.
+ * The first mblk_t only includes the mac header.
* Note that it is safe to change the mp pointer here,
* as the subsequent operation does not assume mp
- * points to the start of the ethernet header.
+ * points to the start of the mac header.
*/
mp = mp->b_cont;
@@ -900,32 +893,32 @@ static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
struct ether_header *ehp;
- uint16_t etype;
+ struct ether_vlan_header *evhp;
+ uint32_t sap;
ipha_t *ipha;
+ uint8_t *dstaddr;
uint_t indx;
- int ports_offset = -1;
- int ipha_len;
+ size_t ports_offset;
+ size_t ipha_len;
+ size_t hdrsize;
uint_t hash;
- mac_soft_ring_t *softring;
- size_t ether_hlen;
- uint16_t frag_offset_flags;
mblk_t *mp;
mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
size_t sz1;
- boolean_t bw_ctl = B_FALSE;
+ boolean_t bw_ctl;
boolean_t hw_classified;
- boolean_t dls_bypass = B_TRUE;
- int i;
+ boolean_t dls_bypass;
+ boolean_t is_ether;
+ boolean_t is_unicast;
int fanout_cnt;
- enum pkt_type type;
+ enum pkt_type type;
mac_client_impl_t *mcip = mac_srs->srs_mcip;
- struct ether_vlan_header *evhp;
- if (mac_srs->srs_type & SRST_BW_CONTROL)
- bw_ctl = B_TRUE;
+ is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
+ bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
/*
* If we don't have a Rx ring, S/W classification would have done
@@ -942,8 +935,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* processing in the Rx path. SRST_DLS_BYPASS will be clear for
* such SRSs.
*/
- if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
- dls_bypass = B_FALSE;
+ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);
/*
* Since the softrings are never destroyed and we always
@@ -972,77 +964,66 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mp->b_next = NULL;
type = OTH;
- sz1 = msgdsize(mp);
+ sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
- if (!dls_bypass) {
- mac_impl_t *mip = mcip->mci_mip;
+ if (is_ether) {
+ /*
+ * At this point we can be sure the packet at least
+ * has an ether header.
+ */
+ if (sz1 < sizeof (struct ether_header)) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ ehp = (struct ether_header *)mp->b_rptr;
- indx = 0;
- if (mip->mi_info.mi_nativemedia == DL_ETHER) {
- ehp = (struct ether_header *)mp->b_rptr;
- etype = ntohs(ehp->ether_type);
+ /*
+ * Determine if this is a VLAN or non-VLAN packet.
+ */
+ if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ sap = ntohs(evhp->ether_type);
+ hdrsize = sizeof (struct ether_vlan_header);
/*
- * For VLAN packets, if the VLAN id doesn't
- * belong to this client, we drop the packet.
+ * Check if the VID of the packet, if any,
+ * belongs to this client.
*/
- if (etype == VLAN_TPID) {
- /*
- * LINTED: cast may result in improper
- * alignment
- */
- evhp = (struct ether_vlan_header *)
- mp->b_rptr;
- if (!mac_client_check_flow_vid(mcip,
- VLAN_ID(ntohs(evhp->ether_tci)))) {
- mac_rx_drop_pkt(mac_srs, mp);
- continue;
- }
- }
- if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
- &type, &indx) == -1) {
+ if (!mac_client_check_flow_vid(mcip,
+ VLAN_ID(ntohs(evhp->ether_tci)))) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
+ } else {
+ hdrsize = sizeof (struct ether_header);
}
+ is_unicast =
+ ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
+ dstaddr = (uint8_t *)&ehp->ether_dhost;
+ } else {
+ mac_header_info_t mhi;
- FANOUT_ENQUEUE_MP(headmp[type][indx],
- tailmp[type][indx], cnt[type][indx], bw_ctl,
- sz[type][indx], sz1, mp);
- continue;
- }
-
- /*
- * At this point we can be sure the packet at least
- * has an ether header. On the outbound side, GLD/stack
- * ensure this. On the inbound side, the driver needs
- * to ensure this.
- */
- if (sz1 < sizeof (struct ether_header)) {
- mac_rx_drop_pkt(mac_srs, mp);
- continue;
+ if (mac_header_info((mac_handle_t)mcip->mci_mip,
+ mp, &mhi) != 0) {
+ mac_rx_drop_pkt(mac_srs, mp);
+ continue;
+ }
+ hdrsize = mhi.mhi_hdrsize;
+ sap = mhi.mhi_bindsap;
+ is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
+ dstaddr = (uint8_t *)mhi.mhi_daddr;
}
- /* LINTED: cast may result in improper alignment */
- ehp = (struct ether_header *)mp->b_rptr;
- /*
- * Determine if this is a VLAN or non-VLAN packet.
- */
- if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
- /* LINTED: cast may result in improper alignment */
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- etype = ntohs(evhp->ether_type);
- ether_hlen = sizeof (struct ether_vlan_header);
- /*
- * Check if the VID of the packet, if any, belongs
- * to this client.
- */
- if (!mac_client_check_flow_vid(mcip,
- VLAN_ID(ntohs(evhp->ether_tci)))) {
+ if (!dls_bypass) {
+ if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
+ hdrsize, &type, &indx) == -1) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
- } else {
- ether_hlen = sizeof (struct ether_header);
+
+ FANOUT_ENQUEUE_MP(headmp[type][indx],
+ tailmp[type][indx], cnt[type][indx], bw_ctl,
+ sz[type][indx], sz1, mp);
+ continue;
}
@@ -1051,7 +1032,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* classification has not happened, we need to verify if
* this unicast packet really belongs to us.
*/
- if (etype == ETHERTYPE_IP) {
+ if (sap == ETHERTYPE_IP) {
/*
* If we are H/W classified, but we have promisc
* on, then we need to check for the unicast address.
@@ -1061,12 +1042,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
rw_enter(&mcip->mci_rw_lock, RW_READER);
map = mcip->mci_unicast;
- if (bcmp(&ehp->ether_dhost, map->ma_addr,
+ if (bcmp(dstaddr, map->ma_addr,
map->ma_len) == 0)
type = UNDEF;
rw_exit(&mcip->mci_rw_lock);
- } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
- 0x01) == 0)) {
+ } else if (is_unicast) {
type = UNDEF;
}
}
@@ -1076,14 +1056,15 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
* the fast path.
*/
- /* LINTED: cast may result in improper alignment */
- ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+ ipha = (ipha_t *)(mp->b_rptr + hdrsize);
if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
type = OTH;
fanout_oth1++;
}
if (type != OTH) {
+ uint16_t frag_offset_flags;
+
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
case IPPROTO_UDP:
@@ -1103,7 +1084,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
fanout_oth3++;
break;
}
- ports_offset = ether_hlen + ipha_len;
+ ports_offset = hdrsize + ipha_len;
break;
default:
type = OTH;
@@ -1113,8 +1094,8 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
}
if (type == OTH) {
- if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
- &type, &indx) == -1) {
+ if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
+ hdrsize, &type, &indx) == -1) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
@@ -1146,7 +1127,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
*(uint32_t *)(mp->b_rptr + ports_offset));
indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
type = V4_TCP;
- mp->b_rptr += ether_hlen;
+ mp->b_rptr += hdrsize;
break;
case IPPROTO_UDP:
case IPPROTO_SCTP:
@@ -1162,19 +1143,24 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mac_srs->srs_ind++;
}
type = V4_UDP;
- mp->b_rptr += ether_hlen;
+ mp->b_rptr += hdrsize;
break;
+ default:
+ indx = 0;
+ type = OTH;
}
- ASSERT(type != UNDEF);
-
FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
}
for (type = V4_TCP; type < UNDEF; type++) {
+ int i;
+
for (i = 0; i < fanout_cnt; i++) {
if (headmp[type][i] != NULL) {
+ mac_soft_ring_t *softring;
+
ASSERT(tailmp[type][i]->b_next == NULL);
switch (type) {
case V4_TCP:
@@ -1190,7 +1176,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
mac_srs->srs_oth_soft_rings[i];
break;
}
- mac_rx_soft_ring_process(mac_srs->srs_mcip,
+ mac_rx_soft_ring_process(mcip,
softring, headmp[type][i], tailmp[type][i],
cnt[type][i], sz[type][i]);
}
@@ -1373,46 +1359,39 @@ check_again:
(mac_srs->srs_first != NULL)) {
/*
* We have packets to process and worker thread
- * is not running. Check to see if poll thread is
- * allowed to process. Let it do processing only if it
- * picked up some packets from the NIC otherwise
- * wakeup the worker thread.
+			 * is not running. Check to see if the poll
+			 * thread is allowed to process.
*/
- if ((mac_srs->srs_state & SRS_LATENCY_OPT) &&
- (head != NULL)) {
+ if (mac_srs->srs_state & SRS_LATENCY_OPT) {
mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
if (srs_rx->sr_poll_pkt_cnt <=
srs_rx->sr_lowat) {
srs_rx->sr_poll_again++;
goto check_again;
- } else {
- /*
- * We are already above low water mark
- * so stay in the polling mode but no
- * need to poll. Once we dip below
- * the polling threshold, the processing
- * thread (soft ring) will signal us
- * to poll again (MAC_UPDATE_SRS_COUNT)
- */
- srs_rx->sr_poll_drain_no_poll++;
- mac_srs->srs_state &=
- ~(SRS_PROC|SRS_GET_PKTS);
- /*
- * In B/W control case, its possible
- * that the backlog built up due to
- * B/W limit being reached and packets
- * are queued only in SRS. In this case,
- * we should schedule worker thread
- * since no one else will wake us up.
- */
- if ((mac_srs->srs_type &
- SRST_BW_CONTROL) &&
- (mac_srs->srs_tid == NULL)) {
- mac_srs->srs_tid =
- timeout(mac_srs_fire,
- mac_srs, 1);
- srs_rx->sr_poll_worker_wakeup++;
- }
+ }
+ /*
+			 * We are already above the low water mark,
+			 * so stay in polling mode but don't poll.
+			 * Once we dip below the polling threshold,
+			 * the processing thread (soft ring) will
+			 * signal us to poll again
+			 * (MAC_UPDATE_SRS_COUNT).
+ */
+ srs_rx->sr_poll_drain_no_poll++;
+ mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
+ /*
+			 * In the B/W control case, it's possible
+			 * that the backlog built up because the
+			 * B/W limit was reached and packets are
+			 * queued only in the SRS. In this case,
+			 * we should schedule the worker thread
+			 * since no one else will wake us up.
+ */
+ if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
+ (mac_srs->srs_tid == NULL)) {
+ mac_srs->srs_tid =
+ timeout(mac_srs_fire, mac_srs, 1);
+ srs_rx->sr_poll_worker_wakeup++;
}
} else {
/*
@@ -1598,7 +1577,7 @@ mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
-again:
+
/* If we are blanked i.e. can't do upcalls, then we are done */
if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
@@ -1609,6 +1588,26 @@ again:
if (mac_srs->srs_first == NULL)
goto out;
+ if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
+ (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
+ /*
+ * In the normal case, the SRS worker thread does no
+ * work and we wait for a backlog to build up before
+ * we switch into polling mode. In case we are
+ * optimizing for throughput, we use the worker thread
+	 * as well. The goal is to let the worker thread
+	 * process the queue while the poll thread feeds
+	 * packets into it. As such, we should signal the
+	 * poll thread to try and get more packets.
+	 *
+	 * We could have folded this check into the
+	 * MAC_SRS_POLL_RING macro itself, but keeping it
+	 * explicit here makes the architecture easier
+	 * to understand.
+ */
+ MAC_SRS_POLL_RING(mac_srs);
+ }
+
+again:
head = mac_srs->srs_first;
mac_srs->srs_first = NULL;
tail = mac_srs->srs_last;
@@ -1624,10 +1623,7 @@ again:
mac_srs->srs_state |= (SRS_PROC|proc_type);
- /* Switch to polling mode */
- MAC_SRS_WORKER_POLLING_ON(mac_srs);
- if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
- MAC_SRS_POLL_RING(mac_srs);
+
/*
* mcip is NULL for broadcast and multicast flows. The promisc
* callbacks for broadcast and multicast packets are delivered from
@@ -1696,37 +1692,27 @@ again:
mutex_enter(&mac_srs->srs_lock);
}
- /*
- * Send the poll thread to pick up any packets arrived
- * so far. This also serves as the last check in case
- * nothing else is queued in the SRS. The poll thread
- * is signalled only in the case the drain was done
- * by the worker thread and SRS_WORKER is set. The
- * worker thread can run in parallel as long as the
- * SRS_WORKER flag is set. We we have nothing else to
- * process, we can exit while leaving SRS_PROC set
- * which gives the poll thread control to process and
- * cleanup once it returns from the NIC.
- *
- * If we have nothing else to process, we need to
- * ensure that we keep holding the srs_lock till
- * all the checks below are done and control is
- * handed to the poll thread if it was running.
- */
- if (mac_srs->srs_first != NULL) {
- if (proc_type == SRS_WORKER) {
- if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
- MAC_SRS_POLL_RING(mac_srs);
+ if (!(mac_srs->srs_state & (SRS_LATENCY_OPT|SRS_BLANK|SRS_PAUSE))) {
+ /*
+ * In case we are optimizing for throughput, we
+ * should try and keep the worker thread running
+ * as much as possible. Send the poll thread down
+ * to check one more time if something else
+		 * arrived. Meanwhile, if the poll thread has
+		 * collected something due to an earlier signal,
+		 * process it now.
+ */
+ if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
+ srs_rx->sr_drain_poll_sig++;
+ MAC_SRS_POLL_RING(mac_srs);
+ }
+ if (mac_srs->srs_first != NULL) {
srs_rx->sr_drain_again++;
goto again;
- } else {
- srs_rx->sr_drain_worker_sig++;
- cv_signal(&mac_srs->srs_async);
}
}
out:
-
if (mac_srs->srs_state & SRS_GET_PKTS) {
/*
* Poll thread is already running. Leave the
@@ -1885,12 +1871,6 @@ again:
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
}
- /*
- * We can continue processing the queue.
- * We need to figure out if there is a fanout needed or
- * we can just process this here.
- */
-
if ((tid = mac_srs->srs_tid) != 0)
mac_srs->srs_tid = 0;
@@ -2405,8 +2385,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
* optimizing for latency, we should signal the
* worker thread.
*/
- if (loopback || ((count > 1) &&
- !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
+ if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
/*
* For loopback, We need to let the worker take
* over as we don't want to continue in the same
@@ -2502,6 +2481,12 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
mblk_t *tail;
boolean_t wakeup_worker = B_TRUE;
+ /*
+	 * Ignore the fanout hint if we don't have multiple Tx rings.
+ */
+ if (!TX_MULTI_RING_MODE(mac_srs))
+ fanout_hint = 0;
+
if (mac_srs->srs_first != NULL)
wakeup_worker = B_FALSE;
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
@@ -2753,18 +2738,89 @@ mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
* the soft ring associated with that Tx ring. The srs itself will not
* queue any packets.
*/
+
+#define MAC_TX_SOFT_RING_PROCESS(chain) { \
+ index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \
+ softring = mac_srs->srs_oth_soft_rings[index]; \
+ cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
+ DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \
+}
+
static mac_tx_cookie_t
mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
mac_soft_ring_t *softring;
- uint_t indx, hash;
+ uint64_t hash;
+ uint_t index;
+ mac_tx_cookie_t cookie = NULL;
ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
- hash = HASH_HINT(fanout_hint);
- indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
- softring = mac_srs->srs_oth_soft_rings[indx];
- return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp));
+ if (fanout_hint != 0) {
+ /*
+		 * The hint is specified by the caller; simply pass the
+ * whole chain to the soft ring.
+ */
+ hash = HASH_HINT(fanout_hint);
+ MAC_TX_SOFT_RING_PROCESS(mp_chain);
+ } else {
+ mblk_t *last_mp, *cur_mp, *sub_chain;
+ uint64_t last_hash = 0;
+ uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
+
+ /*
+ * Compute the hash from the contents (headers) of the
+		 * packets in the mblk chain. Split the chain into
+		 * subchains belonging to the same conversation.
+		 *
+		 * Since more than one ring may be used for
+		 * subchains of the same call, and since the caller
+		 * does not maintain per-conversation state (it
+		 * passed a zero hint), unsent subchains will be
+ * dropped.
+ */
+
+ flag |= MAC_DROP_ON_NO_DESC;
+ ret_mp = NULL;
+
+ ASSERT(ret_mp == NULL);
+
+ sub_chain = NULL;
+ last_mp = NULL;
+
+ for (cur_mp = mp_chain; cur_mp != NULL;
+ cur_mp = cur_mp->b_next) {
+ hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
+ B_TRUE);
+ if (last_hash != 0 && hash != last_hash) {
+ /*
+ * Starting a different subchain, send current
+ * chain out.
+ */
+ ASSERT(last_mp != NULL);
+ last_mp->b_next = NULL;
+ MAC_TX_SOFT_RING_PROCESS(sub_chain);
+ sub_chain = NULL;
+ }
+
+ /* add packet to subchain */
+ if (sub_chain == NULL)
+ sub_chain = cur_mp;
+ last_mp = cur_mp;
+ last_hash = hash;
+ }
+
+ if (sub_chain != NULL) {
+ /* send last subchain */
+ ASSERT(last_mp != NULL);
+ last_mp->b_next = NULL;
+ MAC_TX_SOFT_RING_PROCESS(sub_chain);
+ }
+
+ cookie = NULL;
+ }
+
+ return (cookie);
}
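
A hypothetical caller sketch (the srs and chain names are assumed) for the zero-hint path: each packet is hashed, the chain is split into per-conversation subchains, and because MAC_DROP_ON_NO_DESC is forced the returned cookie is NULL and no unsent packets come back:

	mac_tx_cookie_t cookie;

	cookie = mac_tx_fanout_mode(srs, chain, 0, 0, NULL);
	ASSERT(cookie == NULL);	/* unsent subchains were dropped */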
/*
@@ -2788,8 +2844,17 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
mutex_enter(&mac_srs->srs_lock);
if (mac_srs->srs_bw->mac_bw_limit == 0) {
- /* zero bandwidth: drop all */
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ /*
+		 * Zero bandwidth: no traffic is sent. Drop the packets,
+ * or return the whole chain if the caller requests all
+ * unsent packets back.
+ */
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ cookie = (mac_tx_cookie_t)mac_srs;
+ *ret_mp = mp_chain;
+ } else {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ }
mutex_exit(&mac_srs->srs_lock);
return (cookie);
} else if ((mac_srs->srs_first != NULL) ||
@@ -3223,9 +3288,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
DTRACE_PROBE3(slowpath, mac_client_impl_t *,
src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
- if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp_chain, src_mcip);
-
mp = mp_chain;
while (mp != NULL) {
flow_entry_t *dst_flow_ent;
@@ -3241,6 +3303,12 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
CHECK_VID_AND_ADD_TAG(mp);
/*
+ * Check if there are promiscuous mode callbacks defined.
+ */
+ if (mip->mi_promisc_list != NULL)
+ mac_promisc_dispatch(mip, mp, src_mcip);
+
+ /*
* Find the destination.
*/
dst_flow_ent = mac_tx_classify(mip, mp);
@@ -3516,9 +3584,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
mutex_enter(&ringp->s_ring_lock);
ringp->s_ring_total_inpkt += cnt;
- if ((ringp->s_ring_type & ST_RING_ANY) ||
- ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
- !mac_srs->srs_rx.sr_enqueue_always)) {
+ if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
+ !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
/* If on processor or blanking on, then enqueue and return */
if (ringp->s_ring_state & S_RING_BLANK ||
ringp->s_ring_state & S_RING_PROC) {
@@ -3526,7 +3593,6 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
mutex_exit(&ringp->s_ring_lock);
return;
}
-
proc = ringp->s_ring_rx_func;
arg1 = ringp->s_ring_rx_arg1;
arg2 = ringp->s_ring_rx_arg2;
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
index b216e23ff9..a9816e045e 100644
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -207,6 +207,8 @@ mac_soft_ring_create(int id, clock_t wait, void *flent, uint16_t type,
ringp->s_ring_rx_func = rx_func;
ringp->s_ring_rx_arg1 = x_arg1;
ringp->s_ring_rx_arg2 = x_arg2;
+ if (mac_srs->srs_state & SRS_SOFTRING_QUEUE)
+ ringp->s_ring_type |= ST_RING_WORKER_ONLY;
}
if (cpuid != -1)
(void) mac_soft_ring_bind(ringp, cpuid);
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
index 1615060736..8b87c25b19 100644
--- a/usr/src/uts/common/io/mac/mac_util.c
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,6 +44,10 @@
#include <sys/vtrace.h>
#include <sys/dlpi.h>
#include <sys/sunndi.h>
+#include <inet/ipsec_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
/*
* Copy an mblk, preserving its hardware checksum flags.
@@ -821,3 +825,192 @@ mac_get_devinfo(mac_handle_t mh)
return ((void *)mip->mi_dip);
}
+
+#define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
+#define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
+
+uint64_t
+mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
+{
+ struct ether_header *ehp;
+ uint64_t hash = 0;
+ uint16_t sap;
+ uint_t skip_len;
+ uint8_t proto;
+
+ /*
+ * We may want to have one of these per MAC type plugin in the
+	 * future. For now, only Ethernet is supported.
+ */
+ if (media != DL_ETHER)
+ return (0L);
+
+ /* for now we support only outbound packets */
+ ASSERT(is_outbound);
+ ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
+ ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
+
+ /* compute L2 hash */
+
+ ehp = (struct ether_header *)mp->b_rptr;
+
+ if ((policy & MAC_PKT_HASH_L2) != 0) {
+ uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
+ uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
+ hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
+ policy &= ~MAC_PKT_HASH_L2;
+ }
+
+ if (policy == 0)
+ goto done;
+
+ /* skip ethernet header */
+
+ sap = ntohs(ehp->ether_type);
+ if (sap == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *evhp;
+ mblk_t *newmp = NULL;
+
+ skip_len = sizeof (struct ether_vlan_header);
+ if (MBLKL(mp) < skip_len) {
+			/* the VLAN tag spills into a later mblk; pull up first */
+ newmp = msgpullup(mp, -1);
+ if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
+ goto done;
+ }
+ evhp = (struct ether_vlan_header *)newmp->b_rptr;
+ } else {
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ }
+
+ sap = ntohs(evhp->ether_type);
+ freemsg(newmp);
+ } else {
+ skip_len = sizeof (struct ether_header);
+ }
+
+ /* if ethernet header is in its own mblk, skip it */
+ if (MBLKL(mp) <= skip_len) {
+ skip_len -= MBLKL(mp);
+ mp = mp->b_cont;
+ if (mp == NULL)
+ goto done;
+ }
+
+ sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+
+ /* compute IP src/dst addresses hash and skip IPv{4,6} header */
+
+ switch (sap) {
+ case ETHERTYPE_IP: {
+ ipha_t *iphp;
+
+ /*
+ * If the header is not aligned or the header doesn't fit
+	 * in the mblk, bail now. Note that this may cause packet
+	 * reordering.
+ */
+ iphp = (ipha_t *)(mp->b_rptr + skip_len);
+ if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
+ !OK_32PTR((char *)iphp))
+ goto done;
+
+ proto = iphp->ipha_protocol;
+ skip_len += IPH_HDR_LENGTH(iphp);
+
+ if ((policy & MAC_PKT_HASH_L3) != 0) {
+ uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
+ uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
+
+ hash ^= (PKT_HASH_4BYTES(ip_src) ^
+ PKT_HASH_4BYTES(ip_dst));
+ policy &= ~MAC_PKT_HASH_L3;
+ }
+ break;
+ }
+ case ETHERTYPE_IPV6: {
+ ip6_t *ip6hp;
+ uint16_t hdr_length;
+
+ /*
+ * If the header is not aligned or the header doesn't fit
+	 * in the mblk, bail now. Note that this may cause packet
+	 * reordering.
+ */
+
+ ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
+ if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
+ !OK_32PTR((char *)ip6hp))
+ goto done;
+
+ if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_length, &proto))
+ goto done;
+ skip_len += hdr_length;
+
+ if ((policy & MAC_PKT_HASH_L3) != 0) {
+ uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
+ uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
+
+ hash ^= (PKT_HASH_4BYTES(ip_src) ^
+ PKT_HASH_4BYTES(ip_dst));
+ policy &= ~MAC_PKT_HASH_L3;
+ }
+ break;
+ }
+ default:
+ goto done;
+ }
+
+ if (policy == 0)
+ goto done;
+
+ /* if ip header is in its own mblk, skip it */
+ if (MBLKL(mp) <= skip_len) {
+ skip_len -= MBLKL(mp);
+ mp = mp->b_cont;
+ if (mp == NULL)
+ goto done;
+ }
+
+ /* parse ULP header */
+again:
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_ESP:
+ case IPPROTO_SCTP:
+ /*
+ * These Internet Protocols are intentionally designed
+	 * for hashing from the get-go. Port numbers are in the first
+	 * word for transports; the SPI is first for ESP.
+ */
+ if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
+ goto done;
+ hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
+ break;
+
+ case IPPROTO_AH: {
+ ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
+ uint_t ah_length = AH_TOTAL_LEN(ah);
+
+ if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
+ goto done;
+
+ proto = ah->ah_nexthdr;
+ skip_len += ah_length;
+
+ /* if AH header is in its own mblk, skip it */
+ if (MBLKL(mp) <= skip_len) {
+ skip_len -= MBLKL(mp);
+ mp = mp->b_cont;
+ if (mp == NULL)
+ goto done;
+ }
+
+ goto again;
+ }
+ }
+
+done:
+ return (hash);
+}
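
A worked example of the folding (arithmetic only): under MAC_PKT_HASH_L4 the first four bytes of the TCP header are the ports, so a segment with source port 0x1234 and destination port 0x0050 contributes 0x12 ^ 0x34 ^ 0x00 ^ 0x50 = 0x76 to the hash:

	uint8_t ports[4] = { 0x12, 0x34, 0x00, 0x50 };	/* sport, dport */
	uint64_t h = PKT_HASH_4BYTES(ports);		/* h == 0x76 */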
diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c
index 0bb35ef423..97603172be 100644
--- a/usr/src/uts/common/io/nxge/nxge_send.c
+++ b/usr/src/uts/common/io/nxge/nxge_send.c
@@ -19,10 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+#include <sys/mac_provider.h>
#include <sys/nxge/nxge_impl.h>
#include <sys/nxge/nxge_hio.h>
#include <npi_tx_wr64.h>
@@ -32,6 +33,9 @@
#include <inet/ip_impl.h>
#include <inet/tcp.h>
+extern uint64_t mac_pkt_hash(uint_t, mblk_t *mp, uint8_t policy,
+ boolean_t is_outbound);
+
static mblk_t *nxge_lso_eliminate(mblk_t *);
static mblk_t *nxge_do_softlso(mblk_t *mp, uint32_t mss);
static void nxge_lso_info_get(mblk_t *, uint32_t *, uint32_t *);
@@ -121,8 +125,17 @@ nxge_tx_ring_send(void *arg, mblk_t *mp)
#if defined(sun4v)
/*
+ * Hashing policy for load balancing over the set of TX rings
+ * available to the driver.
+ */
+static uint8_t nxge_tx_hash_policy = MAC_PKT_HASH_L4;
+
+/*
* nxge_m_tx() is needed for Hybrid I/O operation of the vnet in
 * the guest domain. See CR 6778758 for the long-term solution.
+ *
+ * The guest domain driver will, for now, hash the packet
+ * to pick a DMA channel from the only group it has, group 0.
*/
mblk_t *
@@ -130,15 +143,23 @@ nxge_m_tx(void *arg, mblk_t *mp)
{
p_nxge_t nxgep = (p_nxge_t)arg;
mblk_t *next;
+ uint64_t rindex;
p_tx_ring_t tx_ring_p;
int status;
NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx"));
/*
- * Get the default ring handle.
+ * Hash to pick a ring from Group 0, the only TX group
+ * for a guest domain driver.
+ */
+ rindex = mac_pkt_hash(DL_ETHER, mp, nxge_tx_hash_policy, B_TRUE);
+ rindex = rindex % nxgep->pt_config.tdc_grps[0].max_tdcs;
+
+ /*
+ * Get the ring handle.
*/
- tx_ring_p = nxgep->tx_rings->rings[0];
+ tx_ring_p = nxgep->tx_rings->rings[rindex];
while (mp != NULL) {
next = mp->b_next;
diff --git a/usr/src/uts/common/io/softmac/softmac_main.c b/usr/src/uts/common/io/softmac/softmac_main.c
index c940794d72..a44856c849 100644
--- a/usr/src/uts/common/io/softmac/softmac_main.c
+++ b/usr/src/uts/common/io/softmac/softmac_main.c
@@ -1042,17 +1042,26 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
return (MH_WALK_CONTINUE);
}
+ /*
+ * Bumping up the smac_hold_cnt allows us to drop the lock. It also
+ * makes softmac_destroy() return failure on an attempted device detach.
+ * We don't want to hold the lock across calls to other subsystems
+	 * like kstats, which will happen in the call to dls_devnet_recreate().
+ */
+ softmac->smac_hold_cnt++;
+ mutex_exit(&softmac->smac_mutex);
+
if (dls_mgmt_create(softmac->smac_devname,
makedevice(softmac->smac_umajor, softmac->smac_uppa + 1),
DATALINK_CLASS_PHYS, softmac->smac_media, B_TRUE, &linkid) != 0) {
- mutex_exit(&softmac->smac_mutex);
+ softmac_rele_device((dls_dev_handle_t)softmac);
return (MH_WALK_CONTINUE);
}
if ((err = softmac_update_info(softmac, &linkid)) != 0) {
cmn_err(CE_WARN, "softmac: softmac_update_info() for %s "
"failed (%d)", softmac->smac_devname, err);
- mutex_exit(&softmac->smac_mutex);
+ softmac_rele_device((dls_dev_handle_t)softmac);
return (MH_WALK_CONTINUE);
}
@@ -1069,7 +1078,10 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
}
}
+ mutex_enter(&softmac->smac_mutex);
softmac->smac_flags &= ~SOFTMAC_NEED_RECREATE;
+ ASSERT(softmac->smac_hold_cnt != 0);
+ softmac->smac_hold_cnt--;
mutex_exit(&softmac->smac_mutex);
return (MH_WALK_CONTINUE);
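
The hold/drop/reacquire sequence above is the usual pattern for calling out of a subsystem without holding its lock; a generic sketch (illustrative names, not the softmac implementation):

	mutex_enter(&obj->lock);
	obj->hold_cnt++;		/* pin: blocks destroy/detach */
	mutex_exit(&obj->lock);

	call_other_subsystems(obj);	/* e.g. kstat creation */

	mutex_enter(&obj->lock);
	ASSERT(obj->hold_cnt != 0);
	obj->hold_cnt--;		/* unpin */
	mutex_exit(&obj->lock);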